├── DCNv2 ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── dcn_v2.py ├── make.sh ├── setup.py ├── src │ ├── cpu │ │ ├── dcn_v2_cpu.cpp │ │ └── vision.h │ ├── cuda │ │ ├── dcn_v2_cuda.cu │ │ ├── dcn_v2_im2col_cuda.cu │ │ ├── dcn_v2_im2col_cuda.h │ │ ├── dcn_v2_psroi_pooling_cuda.cu │ │ └── vision.h │ ├── dcn_v2.h │ └── vision.cpp └── test.py ├── LICENSE ├── README.md ├── dcn_cpp_plugin ├── CMakeLists.txt ├── README.md ├── dcn_v2.h ├── dcn_v2_cuda.cu ├── dcn_v2_im2col_cuda.cu ├── dcn_v2_im2col_cuda.h ├── vision.cpp └── vision.h ├── demo.py ├── model.py └── pose_dla_dcn.py /DCNv2/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | *.so 4 | *.o 5 | *pyc 6 | _ext 7 | build 8 | DCNv2.egg-info 9 | dist -------------------------------------------------------------------------------- /DCNv2/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Charles Shang 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /DCNv2/README.md: -------------------------------------------------------------------------------- 1 | ## Deformable Convolutional Networks V2 with PyTorch 1.0 2 | 3 | ### Build 4 | ```bash 5 | ./make.sh # build 6 | python test.py # run examples and gradient check 7 | ``` 8 | 9 | ### An Example 10 | - deformable conv 11 | ```python 12 | from dcn_v2 import DCN 13 | input = torch.randn(2, 64, 128, 128).cuda() 14 | # wrap all things (offset and mask) in DCN 15 | dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda() 16 | output = dcn(input) 17 | print(output.shape) 18 | ``` 19 | - deformable RoI pooling 20 | ```python 21 | from dcn_v2 import DCNPooling 22 | input = torch.randn(2, 32, 64, 64).cuda() 23 | batch_inds = torch.randint(2, (20, 1)).cuda().float() 24 | x = torch.randint(256, (20, 1)).cuda().float() 25 | y = torch.randint(256, (20, 1)).cuda().float() 26 | w = torch.randint(64, (20, 1)).cuda().float() 27 | h = torch.randint(64, (20, 1)).cuda().float() 28 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 29 | 30 | # modulated deformable pooling (V2) 31 | # wrap all things (offset and mask) in DCNPooling 32 | dpooling = DCNPooling(spatial_scale=1.0 / 4, 33 | pooled_size=7, 34 | output_dim=32, 35 | no_trans=False, 36 | group_size=1, 37 | trans_std=0.1).cuda() 38 | 39 | dout = dpooling(input, rois) 40 | ``` 41 | ### Note 42 | The master branch now targets PyTorch 1.0 (the new ATen API); you can switch back to PyTorch 0.4 with: 43 | ```bash 44 | git checkout pytorch_0.4 45 | ``` 46 | 47 | ### Known Issues: 48 | 49 | - [x] Gradient check w.r.t. offset (solved) 50 | - [ ] Backward is not reentrant (minor) 51 | 52 | This is an adaptation of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op). 53 | 54 | I have run the gradient check many times with DOUBLE type. Every tensor **except offset** passes. 55 | However, when I set the offset to 0.5, it passes. I'm still wondering what causes this problem. Is it because of some 56 | non-differentiable points? 57 | 58 | Update: all gradient checks pass with double precision. 59 | 60 | Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small (`<1e-7` for 61 | float, `<1e-15` for double), 62 | so it may not be a serious problem (?) 63 | 64 | Please post an issue or PR if you have any comments.
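For reference, the lower-level `DCNv2` module defined in `dcn_v2.py` takes the offset and mask as explicit inputs instead of predicting them internally. Below is a minimal sketch of calling it directly, assuming the extension has been built with `./make.sh` and a CUDA device is available; the channel counts follow the (commented-out) shape asserts in `DCNv2.forward`: `2 * deformable_groups * kh * kw` for the offset and `deformable_groups * kh * kw` for the mask.

```python
import torch
from dcn_v2 import DCNv2

# minimal sketch: drive DCNv2 with explicitly constructed offset/mask tensors
N, C, H, W = 2, 64, 128, 128
kh, kw, deformable_groups = 3, 3, 2
input = torch.randn(N, C, H, W).cuda()
dcn_v2 = DCNv2(C, 64, kernel_size=(kh, kw), stride=1, padding=1,
               deformable_groups=deformable_groups).cuda()

# offset: two values (dy, dx) per deformable group and kernel position,
# sampled at every output location (stride=1, padding=1 keeps H x W)
offset = torch.randn(N, 2 * deformable_groups * kh * kw, H, W).cuda()
# mask: one modulation scalar per deformable group and kernel position, in (0, 1)
mask = torch.sigmoid(torch.randn(N, deformable_groups * kh * kw, H, W)).cuda()

output = dcn_v2(input, offset, mask)
print(output.shape)  # torch.Size([2, 64, 128, 128])
```

The `DCN` wrapper in the first example above produces the same offset and mask from the input through its `conv_offset_mask` convolution (with a sigmoid applied to the mask), which is why it only takes `input`.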
65 | -------------------------------------------------------------------------------- /DCNv2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xi11xi19/CenterNet2TorchScript/69d7241139ebb2aad095cf17901d3945ac705626/DCNv2/__init__.py -------------------------------------------------------------------------------- /DCNv2/dcn_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import math 7 | import torch 8 | from torch import nn 9 | from torch.autograd import Function 10 | from torch.nn.modules.utils import _pair 11 | from torch.autograd.function import once_differentiable 12 | from torch.functional import F 13 | 14 | 15 | import _ext as _backend 16 | 17 | 18 | class DCNv2(nn.Module): 19 | 20 | def __init__(self, in_channels, out_channels, 21 | kernel_size, stride, padding, dilation=1, deformable_groups=1): 22 | super(DCNv2, self).__init__() 23 | self.in_channels = in_channels 24 | self.out_channels = out_channels 25 | self.kernel_size = _pair(kernel_size) 26 | self.stride = _pair(stride) 27 | self.padding = _pair(padding) 28 | self.dilation = _pair(dilation) 29 | self.deformable_groups = deformable_groups 30 | 31 | self.weight = nn.Parameter(torch.Tensor( 32 | out_channels, in_channels, *self.kernel_size)) 33 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 34 | self.reset_parameters() 35 | 36 | def reset_parameters(self): 37 | n = self.in_channels 38 | for k in self.kernel_size: 39 | n *= k 40 | stdv = 1. / math.sqrt(n) 41 | self.weight.data.uniform_(-stdv, stdv) 42 | self.bias.data.zero_() 43 | 44 | 45 | def forward(self, input, offset, mask): 46 | # assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 47 | # offset.shape[1] 48 | # assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 49 | # mask.shape[1] 50 | output = _backend.dcn_v2_forward(input, self.weight, self.bias, 51 | offset, mask, 52 | self.weight.shape[2], self.weight.shape[3], 53 | self.stride[0], self.stride[1], 54 | self.padding[0], self.padding[1], 55 | self.dilation[0], self.dilation[1], 56 | self.deformable_groups) 57 | return output 58 | 59 | 60 | class DCN(DCNv2): 61 | 62 | def __init__(self, in_channels, out_channels, 63 | kernel_size, stride, padding, 64 | dilation=1, deformable_groups=1): 65 | super(DCN, self).__init__(in_channels, out_channels, 66 | kernel_size, stride, padding, dilation, deformable_groups) 67 | 68 | channels_ = self.deformable_groups * 3 * \ 69 | self.kernel_size[0] * self.kernel_size[1] 70 | self.conv_offset_mask = nn.Conv2d(self.in_channels, 71 | channels_, 72 | kernel_size=self.kernel_size, 73 | stride=self.stride, 74 | padding=self.padding, 75 | bias=True) 76 | self.init_offset() 77 | 78 | def init_offset(self): 79 | self.conv_offset_mask.weight.data.zero_() 80 | self.conv_offset_mask.bias.data.zero_() 81 | 82 | def forward(self, input): 83 | out = self.conv_offset_mask(input) 84 | o1, o2, mask = torch.chunk(out, 3, dim=1) 85 | offset = torch.cat((o1, o2), dim=1) 86 | mask = torch.sigmoid(mask) 87 | output = torch.ops.my_ops.dcn_v2_cuda_forward_v2(input, self.weight, self.bias, 88 | offset, mask, 89 | self.kernel_size[0], self.kernel_size[1], 90 | self.stride[0], self.stride[1], 91 | self.padding[0], self.padding[1], 92 | self.dilation[0], self.dilation[1], 93 | 
self.deformable_groups) 94 | 95 | return output 96 | 97 | 98 | class _DCNv2Pooling(Function): 99 | @staticmethod 100 | def forward(ctx, input, rois, offset, 101 | spatial_scale, 102 | pooled_size, 103 | output_dim, 104 | no_trans, 105 | group_size=1, 106 | part_size=None, 107 | sample_per_part=4, 108 | trans_std=.0): 109 | ctx.spatial_scale = spatial_scale 110 | ctx.no_trans = int(no_trans) 111 | ctx.output_dim = output_dim 112 | ctx.group_size = group_size 113 | ctx.pooled_size = pooled_size 114 | ctx.part_size = pooled_size if part_size is None else part_size 115 | ctx.sample_per_part = sample_per_part 116 | ctx.trans_std = trans_std 117 | 118 | output, output_count = \ 119 | _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, 120 | ctx.no_trans, ctx.spatial_scale, 121 | ctx.output_dim, ctx.group_size, 122 | ctx.pooled_size, ctx.part_size, 123 | ctx.sample_per_part, ctx.trans_std) 124 | ctx.save_for_backward(input, rois, offset, output_count) 125 | return output 126 | 127 | @staticmethod 128 | @once_differentiable 129 | def backward(ctx, grad_output): 130 | input, rois, offset, output_count = ctx.saved_tensors 131 | grad_input, grad_offset = \ 132 | _backend.dcn_v2_psroi_pooling_backward(grad_output, 133 | input, 134 | rois, 135 | offset, 136 | output_count, 137 | ctx.no_trans, 138 | ctx.spatial_scale, 139 | ctx.output_dim, 140 | ctx.group_size, 141 | ctx.pooled_size, 142 | ctx.part_size, 143 | ctx.sample_per_part, 144 | ctx.trans_std) 145 | 146 | return grad_input, None, grad_offset, \ 147 | None, None, None, None, None, None, None, None 148 | 149 | 150 | dcn_v2_pooling = _DCNv2Pooling.apply 151 | 152 | 153 | class DCNv2Pooling(nn.Module): 154 | 155 | def __init__(self, 156 | spatial_scale, 157 | pooled_size, 158 | output_dim, 159 | no_trans, 160 | group_size=1, 161 | part_size=None, 162 | sample_per_part=4, 163 | trans_std=.0): 164 | super(DCNv2Pooling, self).__init__() 165 | self.spatial_scale = spatial_scale 166 | self.pooled_size = pooled_size 167 | self.output_dim = output_dim 168 | self.no_trans = no_trans 169 | self.group_size = group_size 170 | self.part_size = pooled_size if part_size is None else part_size 171 | self.sample_per_part = sample_per_part 172 | self.trans_std = trans_std 173 | 174 | def forward(self, input, rois, offset): 175 | assert input.shape[1] == self.output_dim 176 | if self.no_trans: 177 | offset = input.new() 178 | return dcn_v2_pooling(input, rois, offset, 179 | self.spatial_scale, 180 | self.pooled_size, 181 | self.output_dim, 182 | self.no_trans, 183 | self.group_size, 184 | self.part_size, 185 | self.sample_per_part, 186 | self.trans_std) 187 | 188 | 189 | class DCNPooling(DCNv2Pooling): 190 | 191 | def __init__(self, 192 | spatial_scale, 193 | pooled_size, 194 | output_dim, 195 | no_trans, 196 | group_size=1, 197 | part_size=None, 198 | sample_per_part=4, 199 | trans_std=.0, 200 | deform_fc_dim=1024): 201 | super(DCNPooling, self).__init__(spatial_scale, 202 | pooled_size, 203 | output_dim, 204 | no_trans, 205 | group_size, 206 | part_size, 207 | sample_per_part, 208 | trans_std) 209 | 210 | self.deform_fc_dim = deform_fc_dim 211 | 212 | if not no_trans: 213 | self.offset_mask_fc = nn.Sequential( 214 | nn.Linear(self.pooled_size * self.pooled_size * 215 | self.output_dim, self.deform_fc_dim), 216 | nn.ReLU(inplace=True), 217 | nn.Linear(self.deform_fc_dim, self.deform_fc_dim), 218 | nn.ReLU(inplace=True), 219 | nn.Linear(self.deform_fc_dim, self.pooled_size * 220 | self.pooled_size * 3) 221 | ) 222 | self.offset_mask_fc[4].weight.data.zero_() 
223 | self.offset_mask_fc[4].bias.data.zero_() 224 | 225 | def forward(self, input, rois): 226 | offset = input.new() 227 | 228 | if not self.no_trans: 229 | 230 | # do roi_align first 231 | n = rois.shape[0] 232 | roi = dcn_v2_pooling(input, rois, offset, 233 | self.spatial_scale, 234 | self.pooled_size, 235 | self.output_dim, 236 | True, # no trans 237 | self.group_size, 238 | self.part_size, 239 | self.sample_per_part, 240 | self.trans_std) 241 | 242 | # build mask and offset 243 | offset_mask = self.offset_mask_fc(roi.view(n, -1)) 244 | offset_mask = offset_mask.view( 245 | n, 3, self.pooled_size, self.pooled_size) 246 | o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) 247 | offset = torch.cat((o1, o2), dim=1) 248 | mask = torch.sigmoid(mask) 249 | 250 | # do pooling with offset and mask 251 | return dcn_v2_pooling(input, rois, offset, 252 | self.spatial_scale, 253 | self.pooled_size, 254 | self.output_dim, 255 | self.no_trans, 256 | self.group_size, 257 | self.part_size, 258 | self.sample_per_part, 259 | self.trans_std) * mask 260 | # only roi_align 261 | return dcn_v2_pooling(input, rois, offset, 262 | self.spatial_scale, 263 | self.pooled_size, 264 | self.output_dim, 265 | self.no_trans, 266 | self.group_size, 267 | self.part_size, 268 | self.sample_per_part, 269 | self.trans_std) 270 | -------------------------------------------------------------------------------- /DCNv2/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python3 setup.py build develop 3 | -------------------------------------------------------------------------------- /DCNv2/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import glob 5 | 6 | import torch 7 | 8 | from torch.utils.cpp_extension import CUDA_HOME 9 | from torch.utils.cpp_extension import CppExtension 10 | from torch.utils.cpp_extension import CUDAExtension 11 | 12 | from setuptools import find_packages 13 | from setuptools import setup 14 | 15 | requirements = ["torch", "torchvision"] 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "src") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | extra_compile_args = {"cxx": []} 28 | define_macros = [] 29 | 30 | if torch.cuda.is_available() and CUDA_HOME is not None: 31 | extension = CUDAExtension 32 | sources += source_cuda 33 | define_macros += [("WITH_CUDA", None)] 34 | extra_compile_args["nvcc"] = [ 35 | "-DCUDA_HAS_FP16=1", 36 | "-D__CUDA_NO_HALF_OPERATORS__", 37 | "-D__CUDA_NO_HALF_CONVERSIONS__", 38 | "-D__CUDA_NO_HALF2_OPERATORS__", 39 | ] 40 | else: 41 | raise NotImplementedError('Cuda is not availabel') 42 | 43 | sources = [os.path.join(extensions_dir, s) for s in sources] 44 | include_dirs = [extensions_dir] 45 | ext_modules = [ 46 | extension( 47 | "_ext", 48 | sources, 49 | include_dirs=include_dirs, 50 | define_macros=define_macros, 51 | extra_compile_args=extra_compile_args, 52 | ) 53 | ] 54 | return ext_modules 55 | 56 | setup( 57 | name="DCNv2", 58 | version="0.1", 59 | author="charlesshang", 60 | url="https://github.com/charlesshang/DCNv2", 61 | description="deformable convolutional networks", 62 | 
packages=find_packages(exclude=("configs", "tests",)), 63 | # install_requires=requirements, 64 | ext_modules=get_extensions(), 65 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 66 | ) -------------------------------------------------------------------------------- /DCNv2/src/cpu/dcn_v2_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | at::Tensor 8 | dcn_v2_cpu_forward(const at::Tensor &input, 9 | const at::Tensor &weight, 10 | const at::Tensor &bias, 11 | const at::Tensor &offset, 12 | const at::Tensor &mask, 13 | const int kernel_h, 14 | const int kernel_w, 15 | const int stride_h, 16 | const int stride_w, 17 | const int pad_h, 18 | const int pad_w, 19 | const int dilation_h, 20 | const int dilation_w, 21 | const int deformable_group) 22 | { 23 | AT_ERROR("Not implement on cpu"); 24 | } 25 | 26 | std::vector 27 | dcn_v2_cpu_backward(const at::Tensor &input, 28 | const at::Tensor &weight, 29 | const at::Tensor &bias, 30 | const at::Tensor &offset, 31 | const at::Tensor &mask, 32 | const at::Tensor &grad_output, 33 | int kernel_h, int kernel_w, 34 | int stride_h, int stride_w, 35 | int pad_h, int pad_w, 36 | int dilation_h, int dilation_w, 37 | int deformable_group) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | std::tuple 43 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 44 | const at::Tensor &bbox, 45 | const at::Tensor &trans, 46 | const int no_trans, 47 | const float spatial_scale, 48 | const int output_dim, 49 | const int group_size, 50 | const int pooled_size, 51 | const int part_size, 52 | const int sample_per_part, 53 | const float trans_std) 54 | { 55 | AT_ERROR("Not implement on cpu"); 56 | } 57 | 58 | std::tuple 59 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 60 | const at::Tensor &input, 61 | const at::Tensor &bbox, 62 | const at::Tensor &trans, 63 | const at::Tensor &top_count, 64 | const int no_trans, 65 | const float spatial_scale, 66 | const int output_dim, 67 | const int group_size, 68 | const int pooled_size, 69 | const int part_size, 70 | const int sample_per_part, 71 | const float trans_std) 72 | { 73 | AT_ERROR("Not implement on cpu"); 74 | } -------------------------------------------------------------------------------- /DCNv2/src/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cpu_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const 
int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda/dcn_v2_im2col_cuda.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | extern THCState *state; 11 | 12 | // author: Charles Shang 13 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 14 | 15 | // [batch gemm] 16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu 17 | 18 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, 19 | float **columns_b, const float **ones_b, 20 | const float **weight_b, const float **bias_b, 21 | float *input, float *output, 22 | float *columns, float *ones, 23 | float *weight, float *bias, 24 | const int input_stride, const int output_stride, 25 | const int columns_stride, const int ones_stride, 26 | const int num_batches) 27 | { 28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (idx < num_batches) 30 | { 31 | input_b[idx] = input + idx * input_stride; 32 | output_b[idx] = output + idx * output_stride; 33 | columns_b[idx] = columns + idx * columns_stride; 34 | ones_b[idx] = ones + idx * ones_stride; 35 | // share weights and bias within a Mini-Batch 36 | weight_b[idx] = weight; 37 | bias_b[idx] = bias; 38 | } 39 | } 40 | 41 | at::Tensor 42 | dcn_v2_cuda_forward(const at::Tensor &input, 43 | const at::Tensor &weight, 44 | const at::Tensor &bias, 45 | const at::Tensor &offset, 46 | const at::Tensor &mask, 47 | const int kernel_h, 48 | const int kernel_w, 49 | const int stride_h, 50 | const int stride_w, 51 | const int pad_h, 52 | const int pad_w, 53 | const int dilation_h, 54 | const int dilation_w, 55 | const int deformable_group) 56 | { 57 | using scalar_t = float; 58 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 59 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 60 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 61 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 62 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 63 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 64 | 65 | const int batch = input.size(0); 66 | const int channels = input.size(1); 67 | const int height = input.size(2); 68 | const int width = input.size(3); 69 | 70 | const int channels_out = weight.size(0); 71 | const int channels_kernel = weight.size(1); 72 | const int kernel_h_ = weight.size(2); 73 | const int kernel_w_ = weight.size(3); 74 | 75 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 76 | // printf("Channels: %d %d\n", channels, channels_kernel); 77 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 78 | 79 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 80 | "Input shape and 
kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 81 | 82 | AT_ASSERTM(channels == channels_kernel, 83 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 84 | 85 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 86 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 87 | 88 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 89 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 90 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 91 | 92 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing 93 | // when batch size is large. 94 | // launch batch threads 95 | int matrices_size = batch * sizeof(float *); 96 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 97 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 98 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 99 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 100 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 101 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 102 | 103 | const int block = 128; 104 | const int grid = (batch + block - 1) / block; 105 | 106 | createBatchGemmBuffer<<>>( 107 | input_b, output_b, 108 | columns_b, ones_b, 109 | weight_b, bias_b, 110 | input.data(), 111 | output.data(), 112 | columns.data(), 113 | ones.data(), 114 | weight.data(), 115 | bias.data(), 116 | channels * width * height, 117 | channels_out * width_out * height_out, 118 | channels * kernel_h * kernel_w * height_out * width_out, 119 | height_out * width_out, 120 | batch); 121 | 122 | long m_ = channels_out; 123 | long n_ = height_out * width_out; 124 | long k_ = 1; 125 | THCudaBlas_SgemmBatched(state, 126 | 't', 127 | 'n', 128 | n_, 129 | m_, 130 | k_, 131 | 1.0f, 132 | ones_b, k_, 133 | bias_b, k_, 134 | 0.0f, 135 | output_b, n_, 136 | batch); 137 | 138 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 139 | input.data(), 140 | offset.data(), 141 | mask.data(), 142 | batch, channels, height, width, 143 | height_out, width_out, kernel_h, kernel_w, 144 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 145 | deformable_group, 146 | columns.data()); 147 | 148 | long m = channels_out; 149 | long n = height_out * width_out; 150 | long k = channels * kernel_h * kernel_w; 151 | THCudaBlas_SgemmBatched(state, 152 | 'n', 153 | 'n', 154 | n, 155 | m, 156 | k, 157 | 1.0f, 158 | (const float **)columns_b, n, 159 | weight_b, k, 160 | 1.0f, 161 | output_b, n, 162 | batch); 163 | 164 | THCudaFree(state, input_b); 165 | THCudaFree(state, output_b); 166 | THCudaFree(state, columns_b); 167 | THCudaFree(state, ones_b); 168 | THCudaFree(state, weight_b); 169 | THCudaFree(state, bias_b); 170 | return output; 171 | } 172 | 173 | at::Tensor 174 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 175 | const at::Tensor &weight, 176 | const at::Tensor &bias, 177 | const at::Tensor &offset, 178 | const at::Tensor &mask, 179 | const int64_t kernel_h, 180 | const int64_t kernel_w, 181 | const int64_t stride_h, 182 | const int64_t stride_w, 183 | const int64_t pad_h, 184 | const int64_t pad_w, 185 | const int64_t dilation_h, 186 | const int64_t dilation_w, 187 | const int64_t deformable_group) 188 | { 189 | using 
scalar_t = float; 190 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 191 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 192 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 193 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 194 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 195 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 196 | 197 | const int batch = input.size(0); 198 | const int channels = input.size(1); 199 | const int height = input.size(2); 200 | const int width = input.size(3); 201 | 202 | const int channels_out = weight.size(0); 203 | const int channels_kernel = weight.size(1); 204 | const int kernel_h_ = weight.size(2); 205 | const int kernel_w_ = weight.size(3); 206 | 207 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 208 | // printf("Channels: %d %d\n", channels, channels_kernel); 209 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 210 | 211 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 212 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 213 | 214 | AT_ASSERTM(channels == channels_kernel, 215 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 216 | 217 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 218 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 219 | 220 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 221 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 222 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 223 | 224 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing 225 | // when batch size is large. 
226 | // launch batch threads 227 | int matrices_size = batch * sizeof(float *); 228 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 229 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 230 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 231 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 232 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 233 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 234 | 235 | const int block = 128; 236 | const int grid = (batch + block - 1) / block; 237 | 238 | createBatchGemmBuffer<<>>( 239 | input_b, output_b, 240 | columns_b, ones_b, 241 | weight_b, bias_b, 242 | input.data(), 243 | output.data(), 244 | columns.data(), 245 | ones.data(), 246 | weight.data(), 247 | bias.data(), 248 | channels * width * height, 249 | channels_out * width_out * height_out, 250 | channels * kernel_h * kernel_w * height_out * width_out, 251 | height_out * width_out, 252 | batch); 253 | 254 | long m_ = channels_out; 255 | long n_ = height_out * width_out; 256 | long k_ = 1; 257 | THCudaBlas_SgemmBatched(state, 258 | 't', 259 | 'n', 260 | n_, 261 | m_, 262 | k_, 263 | 1.0f, 264 | ones_b, k_, 265 | bias_b, k_, 266 | 0.0f, 267 | output_b, n_, 268 | batch); 269 | 270 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 271 | input.data(), 272 | offset.data(), 273 | mask.data(), 274 | batch, channels, height, width, 275 | height_out, width_out, kernel_h, kernel_w, 276 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 277 | deformable_group, 278 | columns.data()); 279 | 280 | long m = channels_out; 281 | long n = height_out * width_out; 282 | long k = channels * kernel_h * kernel_w; 283 | THCudaBlas_SgemmBatched(state, 284 | 'n', 285 | 'n', 286 | n, 287 | m, 288 | k, 289 | 1.0f, 290 | (const float **)columns_b, n, 291 | weight_b, k, 292 | 1.0f, 293 | output_b, n, 294 | batch); 295 | 296 | THCudaFree(state, input_b); 297 | THCudaFree(state, output_b); 298 | THCudaFree(state, columns_b); 299 | THCudaFree(state, ones_b); 300 | THCudaFree(state, weight_b); 301 | THCudaFree(state, bias_b); 302 | return output; 303 | } 304 | 305 | __global__ void createBatchGemmBufferBackward( 306 | float **grad_output_b, 307 | float **columns_b, 308 | float **ones_b, 309 | float **weight_b, 310 | float **grad_weight_b, 311 | float **grad_bias_b, 312 | float *grad_output, 313 | float *columns, 314 | float *ones, 315 | float *weight, 316 | float *grad_weight, 317 | float *grad_bias, 318 | const int grad_output_stride, 319 | const int columns_stride, 320 | const int ones_stride, 321 | const int num_batches) 322 | { 323 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 324 | if (idx < num_batches) 325 | { 326 | grad_output_b[idx] = grad_output + idx * grad_output_stride; 327 | columns_b[idx] = columns + idx * columns_stride; 328 | ones_b[idx] = ones + idx * ones_stride; 329 | 330 | // share weights and bias within a Mini-Batch 331 | weight_b[idx] = weight; 332 | grad_weight_b[idx] = grad_weight; 333 | grad_bias_b[idx] = grad_bias; 334 | } 335 | } 336 | 337 | std::vector dcn_v2_cuda_backward(const at::Tensor &input, 338 | const at::Tensor &weight, 339 | const at::Tensor &bias, 340 | const at::Tensor &offset, 341 | const at::Tensor &mask, 342 | const at::Tensor &grad_output, 343 | int kernel_h, int kernel_w, 344 | int stride_h, int stride_w, 345 | int pad_h, int pad_w, 346 | int dilation_h, int dilation_w, 347 | int deformable_group) 348 | { 349 | 350 | 
THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); 351 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); 352 | 353 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 354 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 355 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 356 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 357 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 358 | 359 | const int batch = input.size(0); 360 | const int channels = input.size(1); 361 | const int height = input.size(2); 362 | const int width = input.size(3); 363 | 364 | const int channels_out = weight.size(0); 365 | const int channels_kernel = weight.size(1); 366 | const int kernel_h_ = weight.size(2); 367 | const int kernel_w_ = weight.size(3); 368 | 369 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 370 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 371 | 372 | AT_ASSERTM(channels == channels_kernel, 373 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 374 | 375 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 376 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 377 | 378 | auto ones = at::ones({height_out, width_out}, input.options()); 379 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 380 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 381 | 382 | auto grad_input = at::zeros_like(input); 383 | auto grad_weight = at::zeros_like(weight); 384 | auto grad_bias = at::zeros_like(bias); 385 | auto grad_offset = at::zeros_like(offset); 386 | auto grad_mask = at::zeros_like(mask); 387 | 388 | using scalar_t = float; 389 | 390 | for (int b = 0; b < batch; b++) 391 | { 392 | auto input_n = input.select(0, b); 393 | auto offset_n = offset.select(0, b); 394 | auto mask_n = mask.select(0, b); 395 | auto grad_output_n = grad_output.select(0, b); 396 | auto grad_input_n = grad_input.select(0, b); 397 | auto grad_offset_n = grad_offset.select(0, b); 398 | auto grad_mask_n = grad_mask.select(0, b); 399 | 400 | long m = channels * kernel_h * kernel_w; 401 | long n = height_out * width_out; 402 | long k = channels_out; 403 | 404 | THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, 405 | grad_output_n.data(), n, 406 | weight.data(), m, 0.0f, 407 | columns.data(), n); 408 | 409 | // gradient w.r.t. input coordinate data 410 | modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), 411 | columns.data(), 412 | input_n.data(), 413 | offset_n.data(), 414 | mask_n.data(), 415 | 1, channels, height, width, 416 | height_out, width_out, kernel_h, kernel_w, 417 | pad_h, pad_w, stride_h, stride_w, 418 | dilation_h, dilation_w, deformable_group, 419 | grad_offset_n.data(), 420 | grad_mask_n.data()); 421 | // gradient w.r.t. input data 422 | modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), 423 | columns.data(), 424 | offset_n.data(), 425 | mask_n.data(), 426 | 1, channels, height, width, 427 | height_out, width_out, kernel_h, kernel_w, 428 | pad_h, pad_w, stride_h, stride_w, 429 | dilation_h, dilation_w, deformable_group, 430 | grad_input_n.data()); 431 | 432 | // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group 433 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 434 | input_n.data(), 435 | offset_n.data(), 436 | mask_n.data(), 437 | 1, channels, height, width, 438 | height_out, width_out, kernel_h, kernel_w, 439 | pad_h, pad_w, stride_h, stride_w, 440 | dilation_h, dilation_w, deformable_group, 441 | columns.data()); 442 | 443 | long m_ = channels_out; 444 | long n_ = channels * kernel_h * kernel_w; 445 | long k_ = height_out * width_out; 446 | 447 | THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, 448 | columns.data(), k_, 449 | grad_output_n.data(), k_, 1.0f, 450 | grad_weight.data(), n_); 451 | 452 | // gradient w.r.t. bias 453 | // long m_ = channels_out; 454 | // long k__ = height_out * width_out; 455 | THCudaBlas_Sgemv(state, 456 | 't', 457 | k_, m_, 1.0f, 458 | grad_output_n.data(), k_, 459 | ones.data(), 1, 1.0f, 460 | grad_bias.data(), 1); 461 | } 462 | 463 | return { 464 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias 465 | }; 466 | } -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_im2col_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define CUDA_KERNEL_LOOP(i, n) \ 14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 15 | i < (n); \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | const int CUDA_NUM_THREADS = 1024; 19 | inline int GET_BLOCKS(const int N) 20 | { 21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 22 | } 23 | 24 | 25 | __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, 26 | const int height, const int width, float h, float w) 27 | { 28 | int h_low = floor(h); 29 | int w_low = floor(w); 30 | int h_high = h_low + 1; 31 | int w_high = w_low + 1; 32 | 33 | float lh = h - h_low; 34 | float lw = w - w_low; 35 | float hh = 1 - lh, hw = 1 - lw; 36 | 37 | float v1 = 0; 38 | if (h_low >= 0 && w_low >= 0) 39 | v1 = bottom_data[h_low * data_width + w_low]; 40 | float v2 = 0; 41 | if (h_low >= 0 && w_high <= width - 1) 42 | v2 = bottom_data[h_low * data_width + w_high]; 43 | float v3 = 0; 44 | if (h_high <= height - 1 && w_low >= 0) 45 | v3 = bottom_data[h_high * data_width + w_low]; 46 | float v4 = 0; 47 | if (h_high <= height - 1 && w_high <= width - 1) 48 | v4 = bottom_data[h_high * data_width + w_high]; 49 | 50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 51 | 52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 53 | return val; 54 | } 55 | 56 | __device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w, 57 | const int h, const int w, const int height, const int width) 58 | { 59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 60 | { 61 | //empty 62 | return 0; 63 | } 64 | 65 | int argmax_h_low = floor(argmax_h); 66 | int argmax_w_low = floor(argmax_w); 67 | int argmax_h_high = argmax_h_low + 1; 68 | int argmax_w_high = argmax_w_low + 1; 69 | 70 | float weight = 0; 71 | if (h == argmax_h_low && w == argmax_w_low) 72 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 73 | if (h == argmax_h_low && w == argmax_w_high) 74 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 75 | if (h == argmax_h_high && w == argmax_w_low) 76 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 77 | if (h == argmax_h_high && w == 
argmax_w_high) 78 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 79 | return weight; 80 | } 81 | 82 | __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, 83 | const int height, const int width, const float *im_data, 84 | const int data_width, const int bp_dir) 85 | { 86 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 87 | { 88 | //empty 89 | return 0; 90 | } 91 | 92 | int argmax_h_low = floor(argmax_h); 93 | int argmax_w_low = floor(argmax_w); 94 | int argmax_h_high = argmax_h_low + 1; 95 | int argmax_w_high = argmax_w_low + 1; 96 | 97 | float weight = 0; 98 | 99 | if (bp_dir == 0) 100 | { 101 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 102 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 103 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 104 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 105 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 106 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 107 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 108 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 109 | } 110 | else if (bp_dir == 1) 111 | { 112 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 113 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 114 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 115 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 116 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 117 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 118 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 119 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 120 | } 121 | 122 | return weight; 123 | } 124 | 125 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n, 126 | const float *data_im, const float *data_offset, const float *data_mask, 127 | const int height, const int width, const int kernel_h, const int kernel_w, 128 | const int pad_h, const int pad_w, 129 | const int stride_h, const int stride_w, 130 | const int dilation_h, const int dilation_w, 131 | const int channel_per_deformable_group, 132 | const int batch_size, const int num_channels, const int deformable_group, 133 | const int height_col, const int width_col, 134 | float *data_col) 135 | { 136 | // launch channels * batch_size * height_col * width_col cores 137 | CUDA_KERNEL_LOOP(index, n) 138 | { 139 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) 140 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis 141 | 142 | // index index of output matrix 143 | const int w_col = index % width_col; 144 | const int h_col = (index / width_col) % height_col; 145 | // const int b_col = (index / width_col / height_col) % batch_size; 146 | const int b_col = (index / width_col / height_col / num_channels) % batch_size; 147 | // const int c_im = (index / width_col / height_col) / batch_size; 148 | const int c_im = (index / width_col / height_col) % num_channels; 149 | // const int c_col = c_im * kernel_h * kernel_w; 150 | const int c_col = c_im * kernel_h * kernel_w; 151 | 152 | // compute deformable group index 153 | const int 
deformable_group_index = c_im / channel_per_deformable_group; 154 | 155 | const int h_in = h_col * stride_h - pad_h; 156 | const int w_in = w_col * stride_w - pad_w; 157 | 158 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; 159 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; 160 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; 161 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; 162 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 163 | 164 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 165 | 166 | for (int i = 0; i < kernel_h; ++i) 167 | { 168 | for (int j = 0; j < kernel_w; ++j) 169 | { 170 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 171 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 172 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; 173 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 174 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 175 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 176 | float val = static_cast(0); 177 | const float h_im = h_in + i * dilation_h + offset_h; 178 | const float w_im = w_in + j * dilation_w + offset_w; 179 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 180 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 181 | { 182 | //const float map_h = i * dilation_h + offset_h; 183 | //const float map_w = j * dilation_w + offset_w; 184 | //const int cur_height = height - h_in; 185 | //const int cur_width = width - w_in; 186 | //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 187 | val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); 188 | } 189 | *data_col_ptr = val * mask; 190 | // data_col_ptr += batch_size * height_col * width_col; 191 | data_col_ptr += height_col * width_col; 192 | } 193 | } 194 | } 195 | } 196 | 197 | __global__ void modulated_deformable_col2im_gpu_kernel(const int n, 198 | const float *data_col, const float *data_offset, const float *data_mask, 199 | const int channels, const int height, const int width, 200 | const int kernel_h, const int kernel_w, 201 | const int pad_h, const int pad_w, 202 | const int stride_h, const int stride_w, 203 | const int dilation_h, const int dilation_w, 204 | const int channel_per_deformable_group, 205 | const int batch_size, const int deformable_group, 206 | const int height_col, const int width_col, 207 | float *grad_im) 208 | { 209 | CUDA_KERNEL_LOOP(index, n) 210 | { 211 | const int j = (index / width_col / height_col / batch_size) % kernel_w; 212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; 213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; 214 | // compute the start and end of the output 215 | 216 | const int deformable_group_index = c / channel_per_deformable_group; 217 | 218 | int w_out = index % width_col; 219 | int h_out = (index / width_col) % height_col; 220 | int b = (index / width_col / height_col) % batch_size; 
221 | int w_in = w_out * stride_w - pad_w; 222 | int h_in = h_out * stride_h - pad_h; 223 | 224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 225 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; 229 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 230 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 231 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h; 233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w; 234 | 235 | const float cur_top_grad = data_col[index] * mask; 236 | const int cur_h = (int)cur_inv_h_data; 237 | const int cur_w = (int)cur_inv_w_data; 238 | for (int dy = -2; dy <= 2; dy++) 239 | { 240 | for (int dx = -2; dx <= 2; dx++) 241 | { 242 | if (cur_h + dy >= 0 && cur_h + dy < height && 243 | cur_w + dx >= 0 && cur_w + dx < width && 244 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 245 | abs(cur_inv_w_data - (cur_w + dx)) < 1) 246 | { 247 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 248 | float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 249 | atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 250 | } 251 | } 252 | } 253 | } 254 | } 255 | 256 | __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, 257 | const float *data_col, const float *data_im, 258 | const float *data_offset, const float *data_mask, 259 | const int channels, const int height, const int width, 260 | const int kernel_h, const int kernel_w, 261 | const int pad_h, const int pad_w, 262 | const int stride_h, const int stride_w, 263 | const int dilation_h, const int dilation_w, 264 | const int channel_per_deformable_group, 265 | const int batch_size, const int offset_channels, const int deformable_group, 266 | const int height_col, const int width_col, 267 | float *grad_offset, float *grad_mask) 268 | { 269 | CUDA_KERNEL_LOOP(index, n) 270 | { 271 | float val = 0, mval = 0; 272 | int w = index % width_col; 273 | int h = (index / width_col) % height_col; 274 | int c = (index / width_col / height_col) % offset_channels; 275 | int b = (index / width_col / height_col) / offset_channels; 276 | // compute the start and end of the output 277 | 278 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 279 | const int col_step = kernel_h * kernel_w; 280 | int cnt = 0; 281 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; 282 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; 283 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 284 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 285 | 286 | const 
int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 287 | 288 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) 289 | { 290 | const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; 291 | const int bp_dir = offset_c % 2; 292 | 293 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w; 294 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; 295 | int w_out = col_pos % width_col; 296 | int h_out = (col_pos / width_col) % height_col; 297 | int w_in = w_out * stride_w - pad_w; 298 | int h_in = h_out * stride_h - pad_h; 299 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 300 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 301 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); 302 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 303 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 304 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 305 | float inv_h = h_in + i * dilation_h + offset_h; 306 | float inv_w = w_in + j * dilation_w + offset_w; 307 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) 308 | { 309 | inv_h = inv_w = -2; 310 | } 311 | else 312 | { 313 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); 314 | } 315 | const float weight = dmcn_get_coordinate_weight( 316 | inv_h, inv_w, 317 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 318 | val += weight * data_col_ptr[col_pos] * mask; 319 | cnt += 1; 320 | } 321 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val); 322 | grad_offset[index] = val; 323 | if (offset_c % 2 == 0) 324 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); 325 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; 326 | } 327 | } 328 | 329 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 330 | const float* data_im, const float* data_offset, const float* data_mask, 331 | const int batch_size, const int channels, const int height_im, const int width_im, 332 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 333 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 334 | const int dilation_h, const int dilation_w, 335 | const int deformable_group, float* data_col) { 336 | // num_axes should be smaller than block size 337 | const int channel_per_deformable_group = channels / deformable_group; 338 | const int num_kernels = channels * batch_size * height_col * width_col; 339 | modulated_deformable_im2col_gpu_kernel 340 | <<>>( 342 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 343 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 344 | batch_size, channels, deformable_group, height_col, width_col, data_col); 345 | 346 | cudaError_t err = cudaGetLastError(); 347 | if (err != cudaSuccess) 348 | { 349 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 350 | } 351 | 352 | } 353 | 354 | void modulated_deformable_col2im_cuda(cudaStream_t stream, 355 | const float* 
data_col, const float* data_offset, const float* data_mask, 356 | const int batch_size, const int channels, const int height_im, const int width_im, 357 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 358 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 359 | const int dilation_h, const int dilation_w, 360 | const int deformable_group, float* grad_im){ 361 | 362 | const int channel_per_deformable_group = channels / deformable_group; 363 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; 364 | modulated_deformable_col2im_gpu_kernel 365 | <<>>( 367 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, 368 | kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, 369 | dilation_h, dilation_w, channel_per_deformable_group, 370 | batch_size, deformable_group, height_col, width_col, grad_im); 371 | cudaError_t err = cudaGetLastError(); 372 | if (err != cudaSuccess) 373 | { 374 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); 375 | } 376 | 377 | } 378 | 379 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 380 | const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, 381 | const int batch_size, const int channels, const int height_im, const int width_im, 382 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 383 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 384 | const int dilation_h, const int dilation_w, 385 | const int deformable_group, 386 | float* grad_offset, float* grad_mask) { 387 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; 388 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; 389 | modulated_deformable_col2im_coord_gpu_kernel 390 | <<>>( 392 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, 393 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 394 | dilation_h, dilation_w, channel_per_deformable_group, 395 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, 396 | grad_offset, grad_mask); 397 | cudaError_t err = cudaGetLastError(); 398 | if (err != cudaSuccess) 399 | { 400 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); 401 | } 402 | } -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_im2col_cuda.h: -------------------------------------------------------------------------------- 1 | 2 | /*! 3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 4 | * 5 | * COPYRIGHT 6 | * 7 | * All contributions by the University of California: 8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 9 | * All rights reserved. 10 | * 11 | * All other contributions: 12 | * Copyright (c) 2014-2017, the respective contributors 13 | * All rights reserved. 14 | * 15 | * Caffe uses a shared copyright model: each contributor holds copyright over 16 | * their contributions to Caffe. The project versioning records all such 17 | * contribution and copyright details. If a contributor wants to further mark 18 | * their specific copyright on a particular contribution, they should indicate 19 | * their copyright solely in the commit message of the change when it is 20 | * committed. 
21 | * 22 | * LICENSE 23 | * 24 | * Redistribution and use in source and binary forms, with or without 25 | * modification, are permitted provided that the following conditions are met: 26 | * 27 | * 1. Redistributions of source code must retain the above copyright notice, this 28 | * list of conditions and the following disclaimer. 29 | * 2. Redistributions in binary form must reproduce the above copyright notice, 30 | * this list of conditions and the following disclaimer in the documentation 31 | * and/or other materials provided with the distribution. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | * 44 | * CONTRIBUTION AGREEMENT 45 | * 46 | * By contributing to the BVLC/caffe repository through pull-request, comment, 47 | * or otherwise, the contributor releases their content to the 48 | * license and copyright terms herein. 49 | * 50 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 51 | * 52 | * Copyright (c) 2018 Microsoft 53 | * Licensed under The MIT License [see LICENSE for details] 54 | * \file modulated_deformable_im2col.h 55 | * \brief Function definitions of converting an image to 56 | * column matrix based on kernel, padding, dilation, and offset. 57 | * These functions are mainly used in deformable convolution operators. 
58 | * \ref: https://arxiv.org/abs/1811.11168 59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu 60 | */ 61 | 62 | /***************** Adapted by Charles Shang *********************/ 63 | 64 | #ifndef DCN_V2_IM2COL_CUDA 65 | #define DCN_V2_IM2COL_CUDA 66 | 67 | #ifdef __cplusplus 68 | extern "C" 69 | { 70 | #endif 71 | 72 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 73 | const float *data_im, const float *data_offset, const float *data_mask, 74 | const int batch_size, const int channels, const int height_im, const int width_im, 75 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 77 | const int dilation_h, const int dilation_w, 78 | const int deformable_group, float *data_col); 79 | 80 | void modulated_deformable_col2im_cuda(cudaStream_t stream, 81 | const float *data_col, const float *data_offset, const float *data_mask, 82 | const int batch_size, const int channels, const int height_im, const int width_im, 83 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 84 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 85 | const int dilation_h, const int dilation_w, 86 | const int deformable_group, float *grad_im); 87 | 88 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 89 | const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, 90 | const int batch_size, const int channels, const int height_im, const int width_im, 91 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 92 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 93 | const int dilation_h, const int dilation_w, 94 | const int deformable_group, 95 | float *grad_offset, float *grad_mask); 96 | 97 | #ifdef __cplusplus 98 | } 99 | #endif 100 | 101 | #endif -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cu 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | /***************** Adapted by Charles Shang *********************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #define CUDA_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 24 | i < (n); \ 25 | i += blockDim.x * gridDim.x) 26 | 27 | const int CUDA_NUM_THREADS = 1024; 28 | inline int GET_BLOCKS(const int N) 29 | { 30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 31 | } 32 | 33 | template 34 | __device__ T bilinear_interp( 35 | const T *data, 36 | const T x, 37 | const T y, 38 | const int width, 39 | const int height) 40 | { 41 | int x1 = floor(x); 42 | int x2 = ceil(x); 43 | int y1 = floor(y); 44 | int y2 = ceil(y); 45 | T dist_x = static_cast(x - x1); 46 | T dist_y = static_cast(y - y1); 47 | T value11 = data[y1 * width + x1]; 48 | T value12 = data[y2 * width + x1]; 49 | T value21 = data[y1 * width + x2]; 50 | T value22 = data[y2 * width + x2]; 51 | T value = (1 - dist_x) * (1 - dist_y) * value11 + 52 | (1 - dist_x) * dist_y * value12 + 53 | dist_x * (1 - dist_y) * value21 + 54 | dist_x * dist_y * value22; 55 | return value; 56 | } 57 | 58 | template 59 | __global__ void DeformablePSROIPoolForwardKernel( 60 | const int count, 61 | const T *bottom_data, 62 | const T spatial_scale, 63 | const int channels, 64 | const int height, const int width, 65 | const int pooled_height, const int pooled_width, 66 | const T *bottom_rois, const T *bottom_trans, 67 | const int no_trans, 68 | const T trans_std, 69 | const int sample_per_part, 70 | const int output_dim, 71 | const int group_size, 72 | const int part_size, 73 | const int num_classes, 74 | const int channels_each_class, 75 | T *top_data, 76 | T *top_count) 77 | { 78 | CUDA_KERNEL_LOOP(index, count) 79 | { 80 | // The output is in order (n, ctop, ph, pw) 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int ctop = (index / pooled_width / pooled_height) % output_dim; 84 | int n = index / pooled_width / pooled_height / output_dim; 85 | 86 | // [start, end) interval for spatial sampling 87 | const T *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 90 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 91 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 92 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 93 | 94 | // Force too small ROIs to be 1x1 95 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 96 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 97 | 98 | // Compute w and h at bottom 99 | T bin_size_h = roi_height / static_cast(pooled_height); 100 | T bin_size_w = roi_width / static_cast(pooled_width); 101 | 102 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 103 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 104 | 105 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 106 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 107 | int class_id = ctop / channels_each_class; 108 | T trans_x = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 109 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 110 | 111 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 112 | wstart += trans_x * roi_width; 113 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 114 | hstart += trans_y * roi_height; 115 | 116 | T sum = 0; 117 | int count = 0; 118 | int gw = floor(static_cast(pw) * group_size / pooled_width); 119 | int gh = floor(static_cast(ph) * group_size / pooled_height); 120 | gw = min(max(gw, 0), group_size - 1); 121 | gh = min(max(gh, 0), group_size - 1); 122 | 123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 124 | for (int ih = 0; ih < sample_per_part; ih++) 125 | { 126 | for (int iw = 0; iw < sample_per_part; iw++) 127 | { 128 | T w = wstart + iw * sub_bin_size_w; 129 | T h = hstart + ih * sub_bin_size_h; 130 | // bilinear interpolation 131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 132 | { 133 | continue; 134 | } 135 | w = min(max(w, 0.), width - 1.); 136 | h = min(max(h, 0.), height - 1.); 137 | int c = (ctop * group_size + gh) * group_size + gw; 138 | T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); 139 | sum += val; 140 | count++; 141 | } 142 | } 143 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 144 | top_count[index] = count; 145 | } 146 | } 147 | 148 | template 149 | __global__ void DeformablePSROIPoolBackwardAccKernel( 150 | const int count, 151 | const T *top_diff, 152 | const T *top_count, 153 | const int num_rois, 154 | const T spatial_scale, 155 | const int channels, 156 | const int height, const int width, 157 | const int pooled_height, const int pooled_width, 158 | const int output_dim, 159 | T *bottom_data_diff, T *bottom_trans_diff, 160 | const T *bottom_data, 161 | const T *bottom_rois, 162 | const T *bottom_trans, 163 | const int no_trans, 164 | const T trans_std, 165 | const int sample_per_part, 166 | const int group_size, 167 | const int part_size, 168 | const int num_classes, 169 | const int channels_each_class) 170 | { 171 | CUDA_KERNEL_LOOP(index, count) 172 | { 173 | // The output is in order (n, ctop, ph, pw) 174 | int pw = index % pooled_width; 175 | int ph = (index / pooled_width) % pooled_height; 176 | int ctop = (index / pooled_width / pooled_height) % output_dim; 177 | int n = index / pooled_width / pooled_height / output_dim; 178 | 179 | // [start, end) interval for spatial sampling 180 | const T *offset_bottom_rois = bottom_rois + n * 5; 181 | int roi_batch_ind = offset_bottom_rois[0]; 182 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 183 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 184 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 185 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 186 | 187 | // Force too small ROIs to be 1x1 188 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 189 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 190 | 191 | // Compute w and h at bottom 192 | T bin_size_h = roi_height / static_cast(pooled_height); 193 | T bin_size_w = roi_width / static_cast(pooled_width); 194 | 195 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 196 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 197 | 198 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 199 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 200 | int class_id = ctop / channels_each_class; 201 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 202 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 203 | 204 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 205 | wstart += trans_x * roi_width; 206 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 207 | hstart += trans_y * roi_height; 208 | 209 | if (top_count[index] <= 0) 210 | { 211 | continue; 212 | } 213 | T diff_val = top_diff[index] / top_count[index]; 214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 216 | int gw = floor(static_cast(pw) * group_size / pooled_width); 217 | int gh = floor(static_cast(ph) * group_size / pooled_height); 218 | gw = min(max(gw, 0), group_size - 1); 219 | gh = min(max(gh, 0), group_size - 1); 220 | 221 | for (int ih = 0; ih < sample_per_part; ih++) 222 | { 223 | for (int iw = 0; iw < sample_per_part; iw++) 224 | { 225 | T w = wstart + iw * sub_bin_size_w; 226 | T h = hstart + ih * sub_bin_size_h; 227 | // bilinear interpolation 228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 229 | { 230 | continue; 231 | } 232 | w = min(max(w, 0.), width - 1.); 233 | h = min(max(h, 0.), height - 1.); 234 | int c = (ctop * group_size + gh) * group_size + gw; 235 | // backward on feature 236 | int x0 = floor(w); 237 | int x1 = ceil(w); 238 | int y0 = floor(h); 239 | int y1 = ceil(h); 240 | T dist_x = w - x0, dist_y = h - y0; 241 | T q00 = (1 - dist_x) * (1 - dist_y); 242 | T q01 = (1 - dist_x) * dist_y; 243 | T q10 = dist_x * (1 - dist_y); 244 | T q11 = dist_x * dist_y; 245 | int bottom_index_base = c * height * width; 246 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); 247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); 248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); 249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); 250 | 251 | if (no_trans) 252 | { 253 | continue; 254 | } 255 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; 256 | T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; 257 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; 258 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; 259 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; 260 | diff_x *= roi_width; 261 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std 
* diff_val; 262 | diff_y *= roi_height; 263 | 264 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); 265 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); 266 | } 267 | } 268 | } 269 | } 270 | 271 | std::tuple 272 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 273 | const at::Tensor &bbox, 274 | const at::Tensor &trans, 275 | const int no_trans, 276 | const float spatial_scale, 277 | const int output_dim, 278 | const int group_size, 279 | const int pooled_size, 280 | const int part_size, 281 | const int sample_per_part, 282 | const float trans_std) 283 | { 284 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 285 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); 286 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 287 | 288 | const int batch = input.size(0); 289 | const int channels = input.size(1); 290 | const int height = input.size(2); 291 | const int width = input.size(3); 292 | const int channels_trans = no_trans ? 2 : trans.size(1); 293 | const int num_bbox = bbox.size(0); 294 | 295 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 296 | auto pooled_height = pooled_size; 297 | auto pooled_width = pooled_size; 298 | 299 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 300 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 301 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 302 | 303 | const int num_classes = no_trans ? 1 : channels_trans / 2; 304 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 305 | 306 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 307 | 308 | if (out.numel() == 0) 309 | { 310 | THCudaCheck(cudaGetLastError()); 311 | return std::make_tuple(out, top_count); 312 | } 313 | 314 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 315 | dim3 block(512); 316 | 317 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { 318 | DeformablePSROIPoolForwardKernel<<>>( 319 | out_size, 320 | input.contiguous().data(), 321 | spatial_scale, 322 | channels, 323 | height, width, 324 | pooled_height, 325 | pooled_width, 326 | bbox.contiguous().data(), 327 | trans.contiguous().data(), 328 | no_trans, 329 | trans_std, 330 | sample_per_part, 331 | output_dim, 332 | group_size, 333 | part_size, 334 | num_classes, 335 | channels_each_class, 336 | out.data(), 337 | top_count.data()); 338 | }); 339 | THCudaCheck(cudaGetLastError()); 340 | return std::make_tuple(out, top_count); 341 | } 342 | 343 | std::tuple 344 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 345 | const at::Tensor &input, 346 | const at::Tensor &bbox, 347 | const at::Tensor &trans, 348 | const at::Tensor &top_count, 349 | const int no_trans, 350 | const float spatial_scale, 351 | const int output_dim, 352 | const int group_size, 353 | const int pooled_size, 354 | const int part_size, 355 | const int sample_per_part, 356 | const float trans_std) 357 | { 358 | AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); 359 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 360 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); 361 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 362 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); 363 | 364 | const int batch = input.size(0); 365 | const int channels = input.size(1); 366 | const int height = input.size(2); 367 | const int width = input.size(3); 368 | const int channels_trans = no_trans ? 2 : trans.size(1); 369 | const int num_bbox = bbox.size(0); 370 | 371 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 372 | auto pooled_height = pooled_size; 373 | auto pooled_width = pooled_size; 374 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 375 | const int num_classes = no_trans ? 1 : channels_trans / 2; 376 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 377 | 378 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); 379 | auto trans_grad = at::zeros_like(trans); 380 | 381 | if (input_grad.numel() == 0) 382 | { 383 | THCudaCheck(cudaGetLastError()); 384 | return std::make_tuple(input_grad, trans_grad); 385 | } 386 | 387 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 388 | dim3 block(512); 389 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 390 | 391 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { 392 | DeformablePSROIPoolBackwardAccKernel<<>>( 393 | out_size, 394 | out_grad.contiguous().data(), 395 | top_count.contiguous().data(), 396 | num_bbox, 397 | spatial_scale, 398 | channels, 399 | height, 400 | width, 401 | pooled_height, 402 | pooled_width, 403 | output_dim, 404 | input_grad.contiguous().data(), 405 | trans_grad.contiguous().data(), 406 | input.contiguous().data(), 407 | bbox.contiguous().data(), 408 | trans.contiguous().data(), 409 | no_trans, 410 | trans_std, 411 | sample_per_part, 412 | group_size, 413 | part_size, 414 | num_classes, 415 | channels_each_class); 416 | }); 417 | THCudaCheck(cudaGetLastError()); 418 | return std::make_tuple(input_grad, trans_grad); 419 | } -------------------------------------------------------------------------------- /DCNv2/src/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cuda_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /DCNv2/src/dcn_v2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | at::Tensor 10 | dcn_v2_forward(const at::Tensor &input, 11 | const at::Tensor &weight, 12 | const at::Tensor &bias, 13 | const 
at::Tensor &offset, 14 | const at::Tensor &mask, 15 | const int kernel_h, 16 | const int kernel_w, 17 | const int stride_h, 18 | const int stride_w, 19 | const int pad_h, 20 | const int pad_w, 21 | const int dilation_h, 22 | const int dilation_w, 23 | const int deformable_group) 24 | { 25 | if (input.type().is_cuda()) 26 | { 27 | #ifdef WITH_CUDA 28 | return dcn_v2_cuda_forward(input, weight, bias, offset, mask, 29 | kernel_h, kernel_w, 30 | stride_h, stride_w, 31 | pad_h, pad_w, 32 | dilation_h, dilation_w, 33 | deformable_group); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | dcn_v2_backward(const at::Tensor &input, 43 | const at::Tensor &weight, 44 | const at::Tensor &bias, 45 | const at::Tensor &offset, 46 | const at::Tensor &mask, 47 | const at::Tensor &grad_output, 48 | int kernel_h, int kernel_w, 49 | int stride_h, int stride_w, 50 | int pad_h, int pad_w, 51 | int dilation_h, int dilation_w, 52 | int deformable_group) 53 | { 54 | if (input.type().is_cuda()) 55 | { 56 | #ifdef WITH_CUDA 57 | return dcn_v2_cuda_backward(input, 58 | weight, 59 | bias, 60 | offset, 61 | mask, 62 | grad_output, 63 | kernel_h, kernel_w, 64 | stride_h, stride_w, 65 | pad_h, pad_w, 66 | dilation_h, dilation_w, 67 | deformable_group); 68 | #else 69 | AT_ERROR("Not compiled with GPU support"); 70 | #endif 71 | } 72 | AT_ERROR("Not implemented on the CPU"); 73 | } 74 | 75 | std::tuple 76 | dcn_v2_psroi_pooling_forward(const at::Tensor &input, 77 | const at::Tensor &bbox, 78 | const at::Tensor &trans, 79 | const int no_trans, 80 | const float spatial_scale, 81 | const int output_dim, 82 | const int group_size, 83 | const int pooled_size, 84 | const int part_size, 85 | const int sample_per_part, 86 | const float trans_std) 87 | { 88 | if (input.type().is_cuda()) 89 | { 90 | #ifdef WITH_CUDA 91 | return dcn_v2_psroi_pooling_cuda_forward(input, 92 | bbox, 93 | trans, 94 | no_trans, 95 | spatial_scale, 96 | output_dim, 97 | group_size, 98 | pooled_size, 99 | part_size, 100 | sample_per_part, 101 | trans_std); 102 | #else 103 | AT_ERROR("Not compiled with GPU support"); 104 | #endif 105 | } 106 | AT_ERROR("Not implemented on the CPU"); 107 | } 108 | 109 | std::tuple 110 | dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, 111 | const at::Tensor &input, 112 | const at::Tensor &bbox, 113 | const at::Tensor &trans, 114 | const at::Tensor &top_count, 115 | const int no_trans, 116 | const float spatial_scale, 117 | const int output_dim, 118 | const int group_size, 119 | const int pooled_size, 120 | const int part_size, 121 | const int sample_per_part, 122 | const float trans_std) 123 | { 124 | if (input.type().is_cuda()) 125 | { 126 | #ifdef WITH_CUDA 127 | return dcn_v2_psroi_pooling_cuda_backward(out_grad, 128 | input, 129 | bbox, 130 | trans, 131 | top_count, 132 | no_trans, 133 | spatial_scale, 134 | output_dim, 135 | group_size, 136 | pooled_size, 137 | part_size, 138 | sample_per_part, 139 | trans_std); 140 | #else 141 | AT_ERROR("Not compiled with GPU support"); 142 | #endif 143 | } 144 | AT_ERROR("Not implemented on the CPU"); 145 | } 146 | 147 | at::Tensor 148 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 149 | const at::Tensor &weight, 150 | const at::Tensor &bias, 151 | const at::Tensor &offset, 152 | const at::Tensor &mask, 153 | const int64_t kernel_h, 154 | const int64_t kernel_w, 155 | const int64_t stride_h, 156 | const int64_t stride_w, 157 | const int64_t pad_h, 158 | const int64_t 
pad_w, 159 | const int64_t dilation_h, 160 | const int64_t dilation_w, 161 | const int64_t deformable_group); -------------------------------------------------------------------------------- /DCNv2/src/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dcn_v2.h" 3 | #include 4 | 5 | // static auto registry = 6 | // torch::jit::RegisterOperators("my_ops::dcn_v2_forward", &dcn_v2_forward) 7 | // .op("my_ops::dcn_v2_backward", &dcn_v2_backward) 8 | // .op("my_ops::dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward) 9 | // .op("my_ops::dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward); 10 | 11 | static auto registry = 12 | torch::jit::RegisterOperators("my_ops::dcn_v2_cuda_forward_v2", &dcn_v2_cuda_forward_v2); 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 15 | { 16 | m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); 17 | m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); 18 | m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); 19 | m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); 20 | } 21 | -------------------------------------------------------------------------------- /DCNv2/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import time 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import gradcheck 10 | 11 | from dcn_v2 import dcn_v2_conv, DCNv2, DCN 12 | from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling 13 | 14 | deformable_groups = 1 15 | N, inC, inH, inW = 2, 2, 4, 4 16 | outC = 2 17 | kH, kW = 3, 3 18 | 19 | 20 | def conv_identify(weight, bias): 21 | weight.data.zero_() 22 | bias.data.zero_() 23 | o, i, h, w = weight.shape 24 | y = h//2 25 | x = w//2 26 | for p in range(i): 27 | for q in range(o): 28 | if p == q: 29 | weight.data[q, p, y, x] = 1.0 30 | 31 | 32 | def check_zero_offset(): 33 | conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, 34 | kernel_size=(kH, kW), 35 | stride=(1, 1), 36 | padding=(1, 1), 37 | bias=True).cuda() 38 | 39 | conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, 40 | kernel_size=(kH, kW), 41 | stride=(1, 1), 42 | padding=(1, 1), 43 | bias=True).cuda() 44 | 45 | dcn_v2 = DCNv2(inC, outC, (kH, kW), 46 | stride=1, padding=1, dilation=1, 47 | deformable_groups=deformable_groups).cuda() 48 | 49 | conv_offset.weight.data.zero_() 50 | conv_offset.bias.data.zero_() 51 | conv_mask.weight.data.zero_() 52 | conv_mask.bias.data.zero_() 53 | conv_identify(dcn_v2.weight, dcn_v2.bias) 54 | 55 | input = torch.randn(N, inC, inH, inW).cuda() 56 | offset = conv_offset(input) 57 | mask = conv_mask(input) 58 | mask = torch.sigmoid(mask) 59 | output = dcn_v2(input, offset, mask) 60 | output *= 2 61 | d = (input - output).abs().max() 62 | if d < 1e-10: 63 | print('Zero offset passed') 64 | else: 65 | print('Zero offset failed') 66 | print(input) 67 | print(output) 68 | 69 | def check_gradient_dconv(): 70 | 71 | input = torch.rand(N, inC, inH, inW).cuda() * 0.01 72 | input.requires_grad = True 73 | 74 | offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 75 | # offset.data.zero_() 76 | # offset.data -= 0.5 77 | offset.requires_grad = True 78 | 79 | mask = torch.rand(N, deformable_groups * 1 * kW * kH, 
inH, inW).cuda() 80 | # mask.data.zero_() 81 | mask.requires_grad = True 82 | mask = torch.sigmoid(mask) 83 | 84 | weight = torch.randn(outC, inC, kH, kW).cuda() 85 | weight.requires_grad = True 86 | 87 | bias = torch.rand(outC).cuda() 88 | bias.requires_grad = True 89 | 90 | stride = 1 91 | padding = 1 92 | dilation = 1 93 | 94 | print('check_gradient_dconv: ', 95 | gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, 96 | stride, padding, dilation, deformable_groups), 97 | eps=1e-3, atol=1e-4, rtol=1e-2)) 98 | 99 | 100 | def check_pooling_zero_offset(): 101 | 102 | input = torch.randn(2, 16, 64, 64).cuda().zero_() 103 | input[0, :, 16:26, 16:26] = 1. 104 | input[1, :, 10:20, 20:30] = 2. 105 | rois = torch.tensor([ 106 | [0, 65, 65, 103, 103], 107 | [1, 81, 41, 119, 79], 108 | ]).cuda().float() 109 | pooling = DCNv2Pooling(spatial_scale=1.0 / 4, 110 | pooled_size=7, 111 | output_dim=16, 112 | no_trans=True, 113 | group_size=1, 114 | trans_std=0.0).cuda() 115 | 116 | out = pooling(input, rois, input.new()) 117 | s = ', '.join(['%f' % out[i, :, :, :].mean().item() 118 | for i in range(rois.shape[0])]) 119 | print(s) 120 | 121 | dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, 122 | pooled_size=7, 123 | output_dim=16, 124 | no_trans=False, 125 | group_size=1, 126 | trans_std=0.0).cuda() 127 | offset = torch.randn(20, 2, 7, 7).cuda().zero_() 128 | dout = dpooling(input, rois, offset) 129 | s = ', '.join(['%f' % dout[i, :, :, :].mean().item() 130 | for i in range(rois.shape[0])]) 131 | print(s) 132 | 133 | 134 | def check_gradient_dpooling(): 135 | input = torch.randn(2, 3, 5, 5).cuda() * 0.01 136 | N = 4 137 | batch_inds = torch.randint(2, (N, 1)).cuda().float() 138 | x = torch.rand((N, 1)).cuda().float() * 15 139 | y = torch.rand((N, 1)).cuda().float() * 15 140 | w = torch.rand((N, 1)).cuda().float() * 10 141 | h = torch.rand((N, 1)).cuda().float() * 10 142 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 143 | offset = torch.randn(N, 2, 3, 3).cuda() 144 | input.requires_grad = True 145 | offset.requires_grad = True 146 | 147 | spatial_scale = 1.0 / 4 148 | pooled_size = 3 149 | output_dim = 3 150 | no_trans = 0 151 | group_size = 1 152 | trans_std = 0.0 153 | sample_per_part = 4 154 | part_size = pooled_size 155 | 156 | print('check_gradient_dpooling:', 157 | gradcheck(dcn_v2_pooling, (input, rois, offset, 158 | spatial_scale, 159 | pooled_size, 160 | output_dim, 161 | no_trans, 162 | group_size, 163 | part_size, 164 | sample_per_part, 165 | trans_std), 166 | eps=1e-4)) 167 | 168 | 169 | def example_dconv(): 170 | input = torch.randn(2, 64, 128, 128).cuda() 171 | # wrap all things (offset and mask) in DCN 172 | dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, 173 | padding=1, deformable_groups=2).cuda() 174 | # print(dcn.weight.shape, input.shape) 175 | output = dcn(input) 176 | targert = output.new(*output.size()) 177 | targert.data.uniform_(-0.01, 0.01) 178 | error = (targert - output).mean() 179 | error.backward() 180 | print(output.shape) 181 | 182 | 183 | def example_dpooling(): 184 | input = torch.randn(2, 32, 64, 64).cuda() 185 | batch_inds = torch.randint(2, (20, 1)).cuda().float() 186 | x = torch.randint(256, (20, 1)).cuda().float() 187 | y = torch.randint(256, (20, 1)).cuda().float() 188 | w = torch.randint(64, (20, 1)).cuda().float() 189 | h = torch.randint(64, (20, 1)).cuda().float() 190 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 191 | offset = torch.randn(20, 2, 7, 7).cuda() 192 | input.requires_grad = True 193 | offset.requires_grad = True 194 | 
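    # Each row of `rois` built above is [batch_index, x1, y1, x2, y2] in input-image
    # coordinates; the CUDA kernel scales the box by spatial_scale (1/4 here) onto the
    # 64x64 feature map. With no_trans=True the offset tensor is ignored (trans_x and
    # trans_y are forced to 0 in the kernel), so `pooling` behaves like plain
    # position-sensitive ROI pooling, while `dpooling` additionally shifts each bin by
    # the trans_std-scaled offsets. Both calls return tensors of shape
    # (num_rois, output_dim, pooled_size, pooled_size), i.e. (20, 32, 7, 7) here.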
195 | # normal roi_align 196 | pooling = DCNv2Pooling(spatial_scale=1.0 / 4, 197 | pooled_size=7, 198 | output_dim=32, 199 | no_trans=True, 200 | group_size=1, 201 | trans_std=0.1).cuda() 202 | 203 | # deformable pooling 204 | dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, 205 | pooled_size=7, 206 | output_dim=32, 207 | no_trans=False, 208 | group_size=1, 209 | trans_std=0.1).cuda() 210 | 211 | out = pooling(input, rois, offset) 212 | dout = dpooling(input, rois, offset) 213 | print(out.shape) 214 | print(dout.shape) 215 | 216 | target_out = out.new(*out.size()) 217 | target_out.data.uniform_(-0.01, 0.01) 218 | target_dout = dout.new(*dout.size()) 219 | target_dout.data.uniform_(-0.01, 0.01) 220 | e = (target_out - out).mean() 221 | e.backward() 222 | e = (target_dout - dout).mean() 223 | e.backward() 224 | 225 | 226 | def example_mdpooling(): 227 | input = torch.randn(2, 32, 64, 64).cuda() 228 | input.requires_grad = True 229 | batch_inds = torch.randint(2, (20, 1)).cuda().float() 230 | x = torch.randint(256, (20, 1)).cuda().float() 231 | y = torch.randint(256, (20, 1)).cuda().float() 232 | w = torch.randint(64, (20, 1)).cuda().float() 233 | h = torch.randint(64, (20, 1)).cuda().float() 234 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 235 | 236 | # mdformable pooling (V2) 237 | dpooling = DCNPooling(spatial_scale=1.0 / 4, 238 | pooled_size=7, 239 | output_dim=32, 240 | no_trans=False, 241 | group_size=1, 242 | trans_std=0.1, 243 | deform_fc_dim=1024).cuda() 244 | 245 | dout = dpooling(input, rois) 246 | target = dout.new(*dout.size()) 247 | target.data.uniform_(-0.1, 0.1) 248 | error = (target - dout).mean() 249 | error.backward() 250 | print(dout.shape) 251 | 252 | 253 | if __name__ == '__main__': 254 | 255 | example_dconv() 256 | example_dpooling() 257 | example_mdpooling() 258 | 259 | check_pooling_zero_offset() 260 | # zero offset check 261 | if inC == outC: 262 | check_zero_offset() 263 | 264 | check_gradient_dpooling() 265 | check_gradient_dconv() 266 | # """ 267 | # ****** Note: backward is not reentrant error may not be a serious problem, 268 | # ****** since the max error is less than 1e-7, 269 | # ****** Still looking for what trigger this problem 270 | # """ 271 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Charles Shang 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Convert CenterNet pytorch model to Torch Script for LibTorch 2 | can convert dla34 official model 3 | 4 | ## C PLUS PLUS CALL 5 | refer to the dcn_cpp_plugin -------------------------------------------------------------------------------- /dcn_cpp_plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1 FATAL_ERROR) 2 | project(dcn_v2_cuda_forward_v2) 3 | 4 | #add_compile_options(-std=c++11) 5 | 6 | #add_definitions(-D WITH_CUDA) 7 | 8 | set(Torch_DIR /usr/local/libtorch/share/cmake/Torch) 9 | find_package(Torch REQUIRED) 10 | 11 | include_directories(/usr/include/python3.5m) 12 | include_directories(/usr/include/python2.7/) 13 | 14 | 15 | #include_directories(/usr/local/cuda/include) 16 | #link_directories(/usr/local/cuda/lib64) 17 | 18 | set(CUDA_HOST_COMPILATION_CPP ON) 19 | #set(TORCH_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__") 20 | 21 | #set(CUDA_NVCC_FLAGS -std=c++11 22 | #-DCUDA_HAS_FP16=1 23 | #-D__CUDA_NO_HALF_OPERATORS__ 24 | #-D__CUDA_NO_HALF_CONVERSIONS__ 25 | #-D__CUDA_NO_HALF2_OPERATORS__) 26 | 27 | set(CUDA_NVCC_FLAGS -std=c++11 28 | -D__CUDA_NO_HALF_OPERATORS__ ) 29 | 30 | #set(CUDA_NVCC_FLAGS -std=c++11 -Xcompiler -fexceptions -Xcompiler -fPIC 31 | #-gencode arch=compute_30,code=sm_30 32 | #-gencode arch=compute_35,code=sm_35 33 | #-gencode arch=compute_50,code=sm_50 34 | #-gencode arch=compute_60,code=sm_60 35 | #-gencode arch=compute_60,code=compute_60) 36 | 37 | 38 | cuda_add_library(${PROJECT_NAME} SHARED 39 | vision.cpp 40 | dcn_v2_cuda.cu 41 | dcn_v2_im2col_cuda.cu 42 | ) 43 | 44 | # Enable C++11 45 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_range_for) 46 | # Link against LibTorch 47 | target_link_libraries(${PROJECT_NAME} "${TORCH_LIBRARIES}") 48 | 49 | install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION lib) 50 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/README.md: -------------------------------------------------------------------------------- 1 | # DCN C PLUS CPLUS PLUGIN 2 | 3 | ## usage 4 | void handle = dlopen("libdcn_v2_cuda_forward_v2.so", RTLD_LAZY); 5 | 6 | int gpu_id = 0; 7 | torch::jit::script::Module module = 8 | torch::jit::load("centernet.pt", torch::Device(torch::DeviceType::CUDA, gpu_id)); 9 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef DCN_V2_H 3 | #define DCN_V2_H 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" 8 | { 9 | #endif 10 | 11 | 12 | at::Tensor 13 | dcn_v2_cuda_forward(const at::Tensor &input, 14 | const at::Tensor &weight, 15 | const at::Tensor &bias, 16 | const at::Tensor &offset, 17 | const at::Tensor &mask, 18 | 
const int64_t kernel_h, 19 | const int64_t kernel_w, 20 | const int64_t stride_h, 21 | const int64_t stride_w, 22 | const int64_t pad_h, 23 | const int64_t pad_w, 24 | const int64_t dilation_h, 25 | const int64_t dilation_w, 26 | const int64_t deformable_group); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include "dcn_v2.h" 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | //extern THCState *state; 12 | 13 | //THCState *state; 14 | 15 | // THCState *state = at::globalContext().thc_state; 16 | 17 | THCState *state = at::globalContext().lazyInitCUDA(); 18 | //THCState *state = at::globalContext().getTHCState(); 19 | 20 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, 21 | float **columns_b, const float **ones_b, 22 | const float **weight_b, const float **bias_b, 23 | float *input, float *output, 24 | float *columns, float *ones, 25 | float *weight, float *bias, 26 | const int input_stride, const int output_stride, 27 | const int columns_stride, const int ones_stride, 28 | const int num_batches) 29 | { 30 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 31 | if (idx < num_batches) 32 | { 33 | input_b[idx] = input + idx * input_stride; 34 | output_b[idx] = output + idx * output_stride; 35 | columns_b[idx] = columns + idx * columns_stride; 36 | ones_b[idx] = ones + idx * ones_stride; 37 | // share weights and bias within a Mini-Batch 38 | weight_b[idx] = weight; 39 | bias_b[idx] = bias; 40 | } 41 | } 42 | 43 | at::Tensor 44 | dcn_v2_cuda_forward(const at::Tensor &input, 45 | const at::Tensor &weight, 46 | const at::Tensor &bias, 47 | const at::Tensor &offset, 48 | const at::Tensor &mask, 49 | const int64_t kernel_h, 50 | const int64_t kernel_w, 51 | const int64_t stride_h, 52 | const int64_t stride_w, 53 | const int64_t pad_h, 54 | const int64_t pad_w, 55 | const int64_t dilation_h, 56 | const int64_t dilation_w, 57 | const int64_t deformable_group) 58 | { 59 | using scalar_t = float; 60 | //THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 61 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 62 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 63 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 64 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 65 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 66 | const int batch = input.size(0); 67 | const int channels = input.size(1); 68 | const int height = input.size(2); 69 | const int width = input.size(3); 70 | 71 | const int channels_out = weight.size(0); 72 | const int channels_kernel = weight.size(1); 73 | const int kernel_h_ = weight.size(2); 74 | const int kernel_w_ = weight.size(3); 75 | 76 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 77 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 78 | 79 | AT_ASSERTM(channels == channels_kernel, 80 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 81 | 82 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 83 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) 
+ 1)) / stride_w + 1; 84 | 85 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 86 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 87 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 88 | 89 | int matrices_size = batch * sizeof(float *); 90 | 91 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 92 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 93 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 94 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 95 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 96 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 97 | 98 | const int block = 128; 99 | const int grid = (batch + block - 1) / block; 100 | 101 | createBatchGemmBuffer<<>>( 102 | (const float**)input_b, output_b, 103 | columns_b, ones_b, 104 | weight_b, bias_b, 105 | input.data(), 106 | output.data(), 107 | columns.data(), 108 | ones.data(), 109 | weight.data(), 110 | bias.data(), 111 | channels * width * height, 112 | channels_out * width_out * height_out, 113 | channels * kernel_h * kernel_w * height_out * width_out, 114 | height_out * width_out, 115 | batch); 116 | 117 | long m_ = channels_out; 118 | long n_ = height_out * width_out; 119 | long k_ = 1; 120 | THCudaBlas_SgemmBatched(state, 121 | 't', 122 | 'n', 123 | n_, 124 | m_, 125 | k_, 126 | 1.0f, 127 | ones_b, k_, 128 | bias_b, k_, 129 | 0.0f, 130 | output_b, n_, 131 | batch); 132 | 133 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 134 | input.data(), 135 | offset.data(), 136 | mask.data(), 137 | batch, channels, height, width, 138 | height_out, width_out, kernel_h, kernel_w, 139 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 140 | deformable_group, 141 | columns.data()); 142 | 143 | long m = channels_out; 144 | long n = height_out * width_out; 145 | long k = channels * kernel_h * kernel_w; 146 | THCudaBlas_SgemmBatched(state, 147 | 'n', 148 | 'n', 149 | n, 150 | m, 151 | k, 152 | 1.0f, 153 | (const float **)columns_b, n, 154 | weight_b, k, 155 | 1.0f, 156 | output_b, n, 157 | batch); 158 | 159 | THCudaFree(state, input_b); 160 | THCudaFree(state, output_b); 161 | THCudaFree(state, columns_b); 162 | THCudaFree(state, ones_b); 163 | THCudaFree(state, weight_b); 164 | THCudaFree(state, bias_b); 165 | return output; 166 | } 167 | 168 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2_im2col_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define CUDA_KERNEL_LOOP(i, n) \ 14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 15 | i < (n); \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | const int CUDA_NUM_THREADS = 1024; 19 | inline int GET_BLOCKS(const int N) 20 | { 21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 22 | } 23 | 24 | 25 | __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, 26 | const int height, const int width, float h, float w) 27 | { 28 | int h_low = floor(h); 29 | int w_low = floor(w); 30 | int h_high = h_low + 1; 31 | int w_high = w_low + 1; 32 | 33 | float lh = h - h_low; 34 | float lw = w - w_low; 35 | float hh = 1 - lh, hw = 1 - lw; 36 | 37 | float v1 = 0; 
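// v1..v4 are the four neighbouring pixels (top-left, top-right, bottom-left, bottom-right);
// samples that fall outside the feature map keep their initial value of 0.
// The returned value is the bilinear blend w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 with
// weights hh*hw, hh*lw, lh*hw, lh*lw. For example, h = 2.3, w = 4.6 gives lh = 0.3,
// lw = 0.6, so the top-left neighbour (h_low = 2, w_low = 4) is weighted by 0.7 * 0.4 = 0.28.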
38 | if (h_low >= 0 && w_low >= 0) 39 | v1 = bottom_data[h_low * data_width + w_low]; 40 | float v2 = 0; 41 | if (h_low >= 0 && w_high <= width - 1) 42 | v2 = bottom_data[h_low * data_width + w_high]; 43 | float v3 = 0; 44 | if (h_high <= height - 1 && w_low >= 0) 45 | v3 = bottom_data[h_high * data_width + w_low]; 46 | float v4 = 0; 47 | if (h_high <= height - 1 && w_high <= width - 1) 48 | v4 = bottom_data[h_high * data_width + w_high]; 49 | 50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 51 | 52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 53 | return val; 54 | } 55 | __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, 56 | const int height, const int width, const float *im_data, 57 | const int data_width, const int bp_dir) 58 | { 59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 60 | { 61 | //empty 62 | return 0; 63 | } 64 | 65 | int argmax_h_low = floor(argmax_h); 66 | int argmax_w_low = floor(argmax_w); 67 | int argmax_h_high = argmax_h_low + 1; 68 | int argmax_w_high = argmax_w_low + 1; 69 | 70 | float weight = 0; 71 | 72 | if (bp_dir == 0) 73 | { 74 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 75 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 76 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 77 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 78 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 79 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 80 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 81 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 82 | } 83 | else if (bp_dir == 1) 84 | { 85 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 86 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 87 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 88 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 89 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 90 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 91 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 92 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 93 | } 94 | 95 | return weight; 96 | } 97 | 98 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n, 99 | const float *data_im, const float *data_offset, const float *data_mask, 100 | const int height, const int width, const int kernel_h, const int kernel_w, 101 | const int pad_h, const int pad_w, 102 | const int stride_h, const int stride_w, 103 | const int dilation_h, const int dilation_w, 104 | const int channel_per_deformable_group, 105 | const int batch_size, const int num_channels, const int deformable_group, 106 | const int height_col, const int width_col, 107 | float *data_col) 108 | { 109 | // launch channels * batch_size * height_col * width_col cores 110 | CUDA_KERNEL_LOOP(index, n) 111 | { 112 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) 113 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis 114 | 115 | // index index of output matrix 116 | const int w_col = index % width_col; 117 | const int h_col = (index / width_col) % height_col; 118 | // const int 
b_col = (index / width_col / height_col) % batch_size; 119 | const int b_col = (index / width_col / height_col / num_channels) % batch_size; 120 | // const int c_im = (index / width_col / height_col) / batch_size; 121 | const int c_im = (index / width_col / height_col) % num_channels; 122 | // const int c_col = c_im * kernel_h * kernel_w; 123 | const int c_col = c_im * kernel_h * kernel_w; 124 | 125 | // compute deformable group index 126 | const int deformable_group_index = c_im / channel_per_deformable_group; 127 | 128 | const int h_in = h_col * stride_h - pad_h; 129 | const int w_in = w_col * stride_w - pad_w; 130 | 131 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; 132 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; 133 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; 134 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; 135 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 136 | 137 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 138 | 139 | for (int i = 0; i < kernel_h; ++i) 140 | { 141 | for (int j = 0; j < kernel_w; ++j) 142 | { 143 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 144 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 145 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; 146 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 147 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 148 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 149 | float val = static_cast(0); 150 | const float h_im = h_in + i * dilation_h + offset_h; 151 | const float w_im = w_in + j * dilation_w + offset_w; 152 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 153 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 154 | { 155 | //const float map_h = i * dilation_h + offset_h; 156 | //const float map_w = j * dilation_w + offset_w; 157 | //const int cur_height = height - h_in; 158 | //const int cur_width = width - w_in; 159 | //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 160 | val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); 161 | } 162 | *data_col_ptr = val * mask; 163 | // data_col_ptr += batch_size * height_col * width_col; 164 | data_col_ptr += height_col * width_col; 165 | } 166 | } 167 | } 168 | } 169 | 170 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 171 | const float* data_im, const float* data_offset, const float* data_mask, 172 | const int batch_size, const int channels, const int height_im, const int width_im, 173 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 174 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 175 | const int dilation_h, const int dilation_w, 176 | const int deformable_group, float* data_col) { 177 | // num_axes should be smaller than block size 178 | const int channel_per_deformable_group = channels / deformable_group; 179 | const int num_kernels = channels * batch_size * height_col * 
width_col; 180 | modulated_deformable_im2col_gpu_kernel 181 | <<>>( 183 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 184 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 185 | batch_size, channels, deformable_group, height_col, width_col, data_col); 186 | 187 | cudaError_t err = cudaGetLastError(); 188 | if (err != cudaSuccess) 189 | { 190 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 191 | } 192 | 193 | } -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2_im2col_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef DCN_V2_IM2COL_CUDA 2 | #define DCN_V2_IM2COL_CUDA 3 | 4 | #ifdef __cplusplus 5 | extern "C" 6 | { 7 | #endif 8 | 9 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 10 | const float *data_im, const float *data_offset, const float *data_mask, 11 | const int batch_size, const int channels, const int height_im, const int width_im, 12 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 13 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 14 | const int dilation_h, const int dilation_w, 15 | const int deformable_group, float *data_col); 16 | 17 | #ifdef __cplusplus 18 | } 19 | #endif 20 | 21 | #endif -------------------------------------------------------------------------------- /dcn_cpp_plugin/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dcn_v2.h" 3 | #include 4 | #include "dcn_v2.h" 5 | 6 | at::Tensor 7 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 8 | const at::Tensor &weight, 9 | const at::Tensor &bias, 10 | const at::Tensor &offset, 11 | const at::Tensor &mask, 12 | const int64_t kernel_h, 13 | const int64_t kernel_w, 14 | const int64_t stride_h, 15 | const int64_t stride_w, 16 | const int64_t pad_h, 17 | const int64_t pad_w, 18 | const int64_t dilation_h, 19 | const int64_t dilation_w, 20 | const int64_t deformable_group) 21 | { 22 | return dcn_v2_cuda_forward(input, 23 | weight, 24 | bias, 25 | offset, 26 | mask, 27 | kernel_h, 28 | kernel_w, 29 | stride_h, 30 | stride_w, 31 | pad_h, 32 | pad_w, 33 | dilation_h, 34 | dilation_w, 35 | deformable_group); 36 | } 37 | 38 | static auto registry = 39 | torch::jit::RegisterOperators("my_ops::dcn_v2_cuda_forward_v2", &dcn_v2_cuda_forward_v2); 40 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/vision.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include 4 | 5 | at::Tensor 6 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 7 | const at::Tensor &weight, 8 | const at::Tensor &bias, 9 | const at::Tensor &offset, 10 | const at::Tensor &mask, 11 | const int64_t kernel_h, 12 | const int64_t kernel_w, 13 | const int64_t stride_h, 14 | const int64_t stride_w, 15 | const int64_t pad_h, 16 | const int64_t pad_w, 17 | const int64_t dilation_h, 18 | const int64_t dilation_w, 19 | const int64_t deformable_group); 20 | 21 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from model import create_model, load_model 4 | import torch 5 | 6 | if __name__ == '__main__': 7 | num_classes = 80 8 | head_conv = 256 9 | heads = {'hm': num_classes, 10 | 
'wh': 2 , 11 | 'reg': 2} 12 | 13 | load_model_path = 'ctdet_coco_dla_2x.pth' 14 | save_script_pt = 'centernet.pt' 15 | device = 0 16 | 17 | model = create_model('dla_34', heads, head_conv) 18 | model = load_model(model, load_model_path) 19 | model = model.to(device) 20 | model.eval() 21 | 22 | input_var = torch.zeros([1, 3, 512, 512], dtype=torch.float32).cuda() 23 | 24 | traced_script_module = torch.jit.trace(model, input_var) 25 | traced_script_module.save(save_script_pt) 26 | traced_script_module = torch.jit.load(save_script_pt) 27 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from pose_dla_dcn import get_pose_net as get_model 4 | 5 | 6 | def create_model(arch, heads, head_conv): 7 | num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0 8 | arch = arch[:arch.find('_')] if '_' in arch else arch 9 | model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv) 10 | return model 11 | 12 | 13 | def load_model(model, model_path, optimizer=None, resume=False, 14 | lr=None, lr_step=None): 15 | start_epoch = 0 16 | checkpoint = torch.load( 17 | model_path, map_location=lambda storage, loc: storage) 18 | print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) 19 | state_dict_ = checkpoint['state_dict'] 20 | state_dict = {} 21 | 22 | # convert data_parallal to model 23 | for k in state_dict_: 24 | if k.startswith('module') and not k.startswith('module_list'): 25 | state_dict[k[7:]] = state_dict_[k] 26 | else: 27 | state_dict[k] = state_dict_[k] 28 | model_state_dict = model.state_dict() 29 | 30 | # check loaded parameters and created model parameters 31 | for k in state_dict: 32 | if k in model_state_dict: 33 | if state_dict[k].shape != model_state_dict[k].shape: 34 | print('Skip loading parameter {}, required shape{}, ' 35 | 'loaded shape{}.'.format( 36 | k, model_state_dict[k].shape, state_dict[k].shape)) 37 | state_dict[k] = model_state_dict[k] 38 | else: 39 | print('Drop parameter {}.'.format(k)) 40 | for k in model_state_dict: 41 | if not (k in state_dict): 42 | print('No param {}.'.format(k)) 43 | state_dict[k] = model_state_dict[k] 44 | model.load_state_dict(state_dict, strict=False) 45 | 46 | # resume optimizer parameters 47 | if optimizer is not None and resume: 48 | if 'optimizer' in checkpoint: 49 | optimizer.load_state_dict(checkpoint['optimizer']) 50 | start_epoch = checkpoint['epoch'] 51 | start_lr = lr 52 | for step in lr_step: 53 | if start_epoch >= step: 54 | start_lr *= 0.1 55 | for param_group in optimizer.param_groups: 56 | param_group['lr'] = start_lr 57 | print('Resumed optimizer with start lr', start_lr) 58 | else: 59 | print('No optimizer parameters in checkpoint.') 60 | if optimizer is not None: 61 | return model, optimizer, start_epoch 62 | else: 63 | return model 64 | -------------------------------------------------------------------------------- /pose_dla_dcn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import math 7 | import logging 8 | import numpy as np 9 | from os.path import join 10 | 11 | import torch 12 | from torch import nn 13 | import torch.nn.functional as F 14 | import torch.utils.model_zoo as model_zoo 15 | 16 | from DCNv2.dcn_v2 import DCN 17 | 18 | BN_MOMENTUM = 0.1 19 | logger = 
logging.getLogger(__name__) 20 | 21 | def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): 22 | return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) 23 | 24 | 25 | def conv3x3(in_planes, out_planes, stride=1): 26 | "3x3 convolution with padding" 27 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 28 | padding=1, bias=False) 29 | 30 | 31 | class BasicBlock(nn.Module): 32 | def __init__(self, inplanes, planes, stride=1, dilation=1): 33 | super(BasicBlock, self).__init__() 34 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, 35 | stride=stride, padding=dilation, 36 | bias=False, dilation=dilation) 37 | self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 38 | self.relu = nn.ReLU(inplace=True) 39 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, 40 | stride=1, padding=dilation, 41 | bias=False, dilation=dilation) 42 | self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 43 | self.stride = stride 44 | 45 | def forward(self, x, residual=None): 46 | if residual is None: 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 2 64 | 65 | def __init__(self, inplanes, planes, stride=1, dilation=1): 66 | super(Bottleneck, self).__init__() 67 | expansion = Bottleneck.expansion 68 | bottle_planes = planes // expansion 69 | self.conv1 = nn.Conv2d(inplanes, bottle_planes, 70 | kernel_size=1, bias=False) 71 | self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 72 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, 73 | stride=stride, padding=dilation, 74 | bias=False, dilation=dilation) 75 | self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 76 | self.conv3 = nn.Conv2d(bottle_planes, planes, 77 | kernel_size=1, bias=False) 78 | self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.stride = stride 81 | 82 | def forward(self, x, residual=None): 83 | if residual is None: 84 | residual = x 85 | 86 | out = self.conv1(x) 87 | out = self.bn1(out) 88 | out = self.relu(out) 89 | 90 | out = self.conv2(out) 91 | out = self.bn2(out) 92 | out = self.relu(out) 93 | 94 | out = self.conv3(out) 95 | out = self.bn3(out) 96 | 97 | out += residual 98 | out = self.relu(out) 99 | 100 | return out 101 | 102 | 103 | class BottleneckX(nn.Module): 104 | expansion = 2 105 | cardinality = 32 106 | 107 | def __init__(self, inplanes, planes, stride=1, dilation=1): 108 | super(BottleneckX, self).__init__() 109 | cardinality = BottleneckX.cardinality 110 | # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) 111 | # bottle_planes = dim * cardinality 112 | bottle_planes = planes * cardinality // 32 113 | self.conv1 = nn.Conv2d(inplanes, bottle_planes, 114 | kernel_size=1, bias=False) 115 | self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 116 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, 117 | stride=stride, padding=dilation, bias=False, 118 | dilation=dilation, groups=cardinality) 119 | self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 120 | self.conv3 = nn.Conv2d(bottle_planes, planes, 121 | kernel_size=1, bias=False) 122 | self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 123 | self.relu = nn.ReLU(inplace=True) 124 | self.stride = stride 125 | 126 | def 
forward(self, x, residual=None): 127 | if residual is None: 128 | residual = x 129 | 130 | out = self.conv1(x) 131 | out = self.bn1(out) 132 | out = self.relu(out) 133 | 134 | out = self.conv2(out) 135 | out = self.bn2(out) 136 | out = self.relu(out) 137 | 138 | out = self.conv3(out) 139 | out = self.bn3(out) 140 | 141 | out += residual 142 | out = self.relu(out) 143 | 144 | return out 145 | 146 | 147 | class Root(nn.Module): 148 | def __init__(self, in_channels, out_channels, kernel_size, residual): 149 | super(Root, self).__init__() 150 | self.conv = nn.Conv2d( 151 | in_channels, out_channels, 1, 152 | stride=1, bias=False, padding=(kernel_size - 1) // 2) 153 | self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) 154 | self.relu = nn.ReLU(inplace=True) 155 | self.residual = residual 156 | 157 | def forward(self, *x): 158 | children = x 159 | x = self.conv(torch.cat(x, 1)) 160 | x = self.bn(x) 161 | if self.residual: 162 | x += children[0] 163 | x = self.relu(x) 164 | 165 | return x 166 | 167 | 168 | class Tree(nn.Module): 169 | def __init__(self, levels, block, in_channels, out_channels, stride=1, 170 | level_root=False, root_dim=0, root_kernel_size=1, 171 | dilation=1, root_residual=False): 172 | super(Tree, self).__init__() 173 | if root_dim == 0: 174 | root_dim = 2 * out_channels 175 | if level_root: 176 | root_dim += in_channels 177 | if levels == 1: 178 | self.tree1 = block(in_channels, out_channels, stride, 179 | dilation=dilation) 180 | self.tree2 = block(out_channels, out_channels, 1, 181 | dilation=dilation) 182 | else: 183 | self.tree1 = Tree(levels - 1, block, in_channels, out_channels, 184 | stride, root_dim=0, 185 | root_kernel_size=root_kernel_size, 186 | dilation=dilation, root_residual=root_residual) 187 | self.tree2 = Tree(levels - 1, block, out_channels, out_channels, 188 | root_dim=root_dim + out_channels, 189 | root_kernel_size=root_kernel_size, 190 | dilation=dilation, root_residual=root_residual) 191 | if levels == 1: 192 | self.root = Root(root_dim, out_channels, root_kernel_size, 193 | root_residual) 194 | self.level_root = level_root 195 | self.root_dim = root_dim 196 | self.downsample = None 197 | self.project = None 198 | self.levels = levels 199 | if stride > 1: 200 | self.downsample = nn.MaxPool2d(stride, stride=stride) 201 | if in_channels != out_channels: 202 | self.project = nn.Sequential( 203 | nn.Conv2d(in_channels, out_channels, 204 | kernel_size=1, stride=1, bias=False), 205 | nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) 206 | ) 207 | 208 | def forward(self, x, residual=None, children=None): 209 | children = [] if children is None else children 210 | bottom = self.downsample(x) if self.downsample else x 211 | residual = self.project(bottom) if self.project else bottom 212 | if self.level_root: 213 | children.append(bottom) 214 | x1 = self.tree1(x, residual) 215 | if self.levels == 1: 216 | x2 = self.tree2(x1) 217 | x = self.root(x2, x1, *children) 218 | else: 219 | children.append(x1) 220 | x = self.tree2(x1, children=children) 221 | return x 222 | 223 | 224 | class DLA(nn.Module): 225 | def __init__(self, levels, channels, num_classes=1000, 226 | block=BasicBlock, residual_root=False, linear_root=False): 227 | super(DLA, self).__init__() 228 | self.channels = channels 229 | self.num_classes = num_classes 230 | self.base_layer = nn.Sequential( 231 | nn.Conv2d(3, channels[0], kernel_size=7, stride=1, 232 | padding=3, bias=False), 233 | nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM), 234 | nn.ReLU(inplace=True)) 235 | self.level0 = 
self._make_conv_level( 236 | channels[0], channels[0], levels[0]) 237 | self.level1 = self._make_conv_level( 238 | channels[0], channels[1], levels[1], stride=2) 239 | self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, 240 | level_root=False, 241 | root_residual=residual_root) 242 | self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, 243 | level_root=True, root_residual=residual_root) 244 | self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, 245 | level_root=True, root_residual=residual_root) 246 | self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, 247 | level_root=True, root_residual=residual_root) 248 | 249 | # for m in self.modules(): 250 | # if isinstance(m, nn.Conv2d): 251 | # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 252 | # m.weight.data.normal_(0, math.sqrt(2. / n)) 253 | # elif isinstance(m, nn.BatchNorm2d): 254 | # m.weight.data.fill_(1) 255 | # m.bias.data.zero_() 256 | 257 | def _make_level(self, block, inplanes, planes, blocks, stride=1): 258 | downsample = None 259 | if stride != 1 or inplanes != planes: 260 | downsample = nn.Sequential( 261 | nn.MaxPool2d(stride, stride=stride), 262 | nn.Conv2d(inplanes, planes, 263 | kernel_size=1, stride=1, bias=False), 264 | nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), 265 | ) 266 | 267 | layers = [] 268 | layers.append(block(inplanes, planes, stride, downsample=downsample)) 269 | for i in range(1, blocks): 270 | layers.append(block(inplanes, planes)) 271 | 272 | return nn.Sequential(*layers) 273 | 274 | def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): 275 | modules = [] 276 | for i in range(convs): 277 | modules.extend([ 278 | nn.Conv2d(inplanes, planes, kernel_size=3, 279 | stride=stride if i == 0 else 1, 280 | padding=dilation, bias=False, dilation=dilation), 281 | nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), 282 | nn.ReLU(inplace=True)]) 283 | inplanes = planes 284 | return nn.Sequential(*modules) 285 | 286 | def forward(self, x): 287 | y = [] 288 | x = self.base_layer(x) 289 | for i in range(6): 290 | x = getattr(self, 'level{}'.format(i))(x) 291 | y.append(x) 292 | return (y[0],y[1],y[2],y[3],y[4],y[5]) 293 | # return y 294 | 295 | def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): 296 | # fc = self.fc 297 | if name.endswith('.pth'): 298 | model_weights = torch.load(data + name) 299 | else: 300 | model_url = get_model_url(data, name, hash) 301 | model_weights = model_zoo.load_url(model_url) 302 | num_classes = len(model_weights[list(model_weights.keys())[-1]]) 303 | self.fc = nn.Conv2d( 304 | self.channels[-1], num_classes, 305 | kernel_size=1, stride=1, padding=0, bias=True) 306 | self.load_state_dict(model_weights) 307 | # self.fc = fc 308 | 309 | 310 | def dla34(pretrained=True, **kwargs): # DLA-34 311 | model = DLA([1, 1, 1, 2, 2, 1], 312 | [16, 32, 64, 128, 256, 512], 313 | block=BasicBlock, **kwargs) 314 | if pretrained: 315 | model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') 316 | return model 317 | 318 | class Identity(nn.Module): 319 | 320 | def __init__(self): 321 | super(Identity, self).__init__() 322 | 323 | def forward(self, x): 324 | return x 325 | 326 | 327 | def fill_fc_weights(layers): 328 | for m in layers.modules(): 329 | if isinstance(m, nn.Conv2d): 330 | if m.bias is not None: 331 | nn.init.constant_(m.bias, 0) 332 | 333 | 334 | def fill_up_weights(up): 335 | w = up.weight.data 336 | f = math.ceil(w.size(2) / 2) 337 | c = (2 * f - 1 - f % 2) / (2. 
* f) 338 | for i in range(w.size(2)): 339 | for j in range(w.size(3)): 340 | w[0, 0, i, j] = \ 341 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 342 | for c in range(1, w.size(0)): 343 | w[c, 0, :, :] = w[0, 0, :, :] 344 | 345 | 346 | class DeformConv(nn.Module): 347 | def __init__(self, chi, cho): 348 | super(DeformConv, self).__init__() 349 | self.actf = nn.Sequential( 350 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), 351 | nn.ReLU(inplace=True) 352 | ) 353 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) 354 | 355 | def forward(self, x): 356 | x = self.conv(x) 357 | x = self.actf(x) 358 | return x 359 | 360 | 361 | class IDAUp(nn.Module): 362 | 363 | def __init__(self, o, channels, up_f): 364 | super(IDAUp, self).__init__() 365 | for i in range(1, len(channels)): 366 | c = channels[i] 367 | f = int(up_f[i]) 368 | proj = DeformConv(c, o) 369 | node = DeformConv(o, o) 370 | 371 | up = nn.ConvTranspose2d(o, o, f * 2, stride=f, 372 | padding=f // 2, output_padding=0, 373 | groups=o, bias=False) 374 | fill_up_weights(up) 375 | 376 | setattr(self, 'proj_' + str(i), proj) 377 | setattr(self, 'up_' + str(i), up) 378 | setattr(self, 'node_' + str(i), node) 379 | 380 | 381 | def forward(self, layers, startp, endp): 382 | for i in range(startp + 1, endp): 383 | upsample = getattr(self, 'up_' + str(i - startp)) 384 | project = getattr(self, 'proj_' + str(i - startp)) 385 | # layers[i] = upsample(project(layers[i])) 386 | upsample_layers = upsample(project(layers[i])) 387 | node = getattr(self, 'node_' + str(i - startp)) 388 | # layers[i] = node(layers[i] + layers[i - 1]) 389 | layers[i] = node(upsample_layers + layers[i - 1]) 390 | # node_tmp = node(upsample_layers + layers[i - 1]) 391 | return layers[-1] 392 | 393 | 394 | 395 | class DLAUp(nn.Module): 396 | def __init__(self, startp, channels, scales, in_channels=None): 397 | super(DLAUp, self).__init__() 398 | self.startp = startp 399 | if in_channels is None: 400 | in_channels = channels 401 | self.channels = channels 402 | channels = list(channels) 403 | scales = np.array(scales, dtype=int) 404 | for i in range(len(channels) - 1): 405 | j = -i - 2 406 | setattr(self, 'ida_{}'.format(i), 407 | IDAUp(channels[j], in_channels[j:], 408 | scales[j:] // scales[j])) 409 | scales[j + 1:] = scales[j] 410 | in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] 411 | 412 | def forward(self, layers): 413 | out = [layers[-1]] # start with 32 414 | for i in range(len(layers) - self.startp - 1): 415 | ida = getattr(self, 'ida_{}'.format(i)) 416 | ida_out = ida(layers, len(layers) -i - 2, len(layers)) 417 | # out.insert(0, layers[-1]) 418 | out.insert(0, ida_out) 419 | return out 420 | 421 | 422 | class Interpolate(nn.Module): 423 | def __init__(self, scale, mode): 424 | super(Interpolate, self).__init__() 425 | self.scale = scale 426 | self.mode = mode 427 | 428 | def forward(self, x): 429 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) 430 | return x 431 | 432 | 433 | class DLASeg(nn.Module): 434 | def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel, 435 | last_level, head_conv, out_channel=0): 436 | super(DLASeg, self).__init__() 437 | assert down_ratio in [2, 4, 8, 16] 438 | self.first_level = int(np.log2(down_ratio)) 439 | self.last_level = last_level 440 | self.base = globals()[base_name](pretrained=pretrained) 441 | 442 | channels = self.base.channels 443 | scales = [2 ** i for i in range(len(channels[self.first_level:]))] 
444 | self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales) 445 | if out_channel == 0: 446 | out_channel = channels[self.first_level] 447 | 448 | self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level], 449 | [2 ** i for i in range(self.last_level - self.first_level)]) 450 | self.heads = heads 451 | for head in self.heads: 452 | classes = self.heads[head] 453 | if head_conv > 0: 454 | fc = nn.Sequential( 455 | nn.Conv2d(channels[self.first_level], head_conv, 456 | kernel_size=3, padding=1, bias=True), 457 | nn.ReLU(inplace=True), 458 | nn.Conv2d(head_conv, classes, 459 | kernel_size=final_kernel, stride=1, 460 | padding=final_kernel // 2, bias=True)) 461 | if 'hm' in head: 462 | fc[-1].bias.data.fill_(-2.19) 463 | else: 464 | fill_fc_weights(fc) 465 | else: 466 | fc = nn.Conv2d(channels[self.first_level], classes, 467 | kernel_size=final_kernel, stride=1, 468 | padding=final_kernel // 2, bias=True) 469 | if 'hm' in head: 470 | fc.bias.data.fill_(-2.19) 471 | else: 472 | fill_fc_weights(fc) 473 | self.__setattr__(head, fc) 474 | 475 | def forward(self, x): 476 | x = self.base(x) 477 | 478 | x = list(x) 479 | x = self.dla_up(x) 480 | 481 | y = [] 482 | for i in range(self.last_level - self.first_level): 483 | y.append(x[i].clone()) 484 | ida_out =self.ida_up(y, 0, len(y)) 485 | 486 | # z = {} 487 | # for head in self.heads: 488 | # z[head] = self.__getattr__(head)(y[-1]) 489 | # for head in self.heads: 490 | # z[head] = self.__getattr__(head)(ida_out) 491 | z = list() 492 | z.append(self.__getattr__('hm')(y[-1])) 493 | z.append(self.__getattr__('wh')(y[-1])) 494 | z.append(self.__getattr__('reg')(y[-1])) 495 | ret = (z[0],z[1],z[2]) 496 | return ret 497 | # return [z] 498 | 499 | 500 | def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): 501 | model = DLASeg('dla{}'.format(num_layers), heads, 502 | pretrained=True, 503 | down_ratio=down_ratio, 504 | final_kernel=1, 505 | last_level=5, 506 | head_conv=head_conv) 507 | return model 508 | 509 | --------------------------------------------------------------------------------
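A minimal usage sketch (editorial addition, not a file from the repository): once `demo.py` has written `centernet.pt`, the traced model can be reloaded and run in a fresh process. The plugin library path in the comment is an assumption; substitute whatever shared object the `dcn_cpp_plugin` CMake build actually produces, and skip that call if the traced graph only relies on the Python `DCNv2` extension.

```python
import torch

# If the traced graph calls the registered custom op
# (my_ops::dcn_v2_cuda_forward_v2), load the plugin library first.
# The path below is hypothetical -- use the .so built from dcn_cpp_plugin.
# torch.ops.load_library('dcn_cpp_plugin/build/libdcn_v2_plugin.so')

model = torch.jit.load('centernet.pt').cuda().eval()

with torch.no_grad():
    img = torch.zeros(1, 3, 512, 512, device='cuda')  # same shape as the trace input in demo.py
    hm, wh, reg = model(img)                           # DLASeg.forward returns (hm, wh, reg)
    print(hm.shape, wh.shape, reg.shape)
```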