├── DCNv2
│   ├── .gitignore
│   ├── LICENSE
│   ├── README.md
│   ├── __init__.py
│   ├── dcn_v2.py
│   ├── make.sh
│   ├── setup.py
│   ├── src
│   │   ├── cpu
│   │   │   ├── dcn_v2_cpu.cpp
│   │   │   └── vision.h
│   │   ├── cuda
│   │   │   ├── dcn_v2_cuda.cu
│   │   │   ├── dcn_v2_im2col_cuda.cu
│   │   │   ├── dcn_v2_im2col_cuda.h
│   │   │   ├── dcn_v2_psroi_pooling_cuda.cu
│   │   │   └── vision.h
│   │   ├── dcn_v2.h
│   │   └── vision.cpp
│   └── test.py
├── LICENSE
├── README.md
├── dcn_cpp_plugin
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── dcn_v2.h
│   ├── dcn_v2_cuda.cu
│   ├── dcn_v2_im2col_cuda.cu
│   ├── dcn_v2_im2col_cuda.h
│   ├── vision.cpp
│   └── vision.h
├── demo.py
├── model.py
└── pose_dla_dcn.py
/DCNv2/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .idea
3 | *.so
4 | *.o
5 | *pyc
6 | _ext
7 | build
8 | DCNv2.egg-info
9 | dist
--------------------------------------------------------------------------------
/DCNv2/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Charles Shang
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/DCNv2/README.md:
--------------------------------------------------------------------------------
1 | ## Deformable Convolutional Networks V2 with PyTorch 1.0
2 |
3 | ### Build
4 | ```bash
5 | ./make.sh # build
6 | python test.py # run examples and gradient check
7 | ```
8 |
9 | ### An Example
10 | - deformable conv
11 | ```python
12 | from dcn_v2 import DCN
13 | input = torch.randn(2, 64, 128, 128).cuda()
14 | # wrap all things (offset and mask) in DCN
15 | dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda()
16 | output = dcn(input)
17 | print(output.shape)
18 | ```
19 | - deformable roi pooling
20 | ```python
21 | from dcn_v2 import DCNPooling
22 | input = torch.randn(2, 32, 64, 64).cuda()
23 | batch_inds = torch.randint(2, (20, 1)).cuda().float()
24 | x = torch.randint(256, (20, 1)).cuda().float()
25 | y = torch.randint(256, (20, 1)).cuda().float()
26 | w = torch.randint(64, (20, 1)).cuda().float()
27 | h = torch.randint(64, (20, 1)).cuda().float()
28 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
29 |
30 | # modulated deformable pooling (V2)
31 | # wrap all things (offset and mask) in DCNPooling
32 | dpooling = DCNPooling(spatial_scale=1.0 / 4,
33 | pooled_size=7,
34 | output_dim=32,
35 | no_trans=False,
36 | group_size=1,
37 | trans_std=0.1).cuda()
38 |
39 | dout = dpooling(input, rois)
40 | ```
41 | ### Note
42 | The master branch now targets PyTorch 1.0 (the new ATen API); to switch back to PyTorch 0.4, run:
43 | ```bash
44 | git checkout pytorch_0.4
45 | ```
46 |
47 | ### Known Issues:
48 |
49 | - [x] Gradient check w.r.t offset (solved)
50 | - [ ] Backward is not reentrant (minor)
51 |
52 | This is an adaptation of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op).
53 |
54 | I have run the gradient check many times with DOUBLE type. Every tensor **except offset** passes.
55 | However, when I set the offset to 0.5, it passes. I'm still wondering what causes this problem. Is it because of some
56 | non-differentiable points?
57 |
58 | Update: all gradient check passes with double precision.
59 |
60 | Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small
61 | (`<1e-7` for float, `<1e-15` for double),
62 | so it may not be a serious problem.
63 |
64 | Please post an issue or PR if you have any comments.
65 |
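66 | For reference, here is a minimal double-precision `gradcheck` sketch for the pooling op (assuming the `_ext` extension has been built and that the pooling kernels accept double tensors; shapes and pooling parameters follow the example above):
67 | ```python
68 | import torch
69 | from torch.autograd import gradcheck
70 | from dcn_v2 import dcn_v2_pooling
71 |
72 | input = torch.randn(2, 32, 64, 64, dtype=torch.double, device='cuda', requires_grad=True)
73 | # rois: [batch_index, x1, y1, x2, y2] in input-image coordinates (spatial_scale = 1/4)
74 | rois = torch.tensor([[0, 4, 4, 32, 32],
75 |                      [1, 8, 8, 40, 40]], dtype=torch.double, device='cuda')
76 | offset = torch.randn(2, 2, 7, 7, dtype=torch.double, device='cuda', requires_grad=True)
77 |
78 | print(gradcheck(lambda x, o: dcn_v2_pooling(x, rois, o,
79 |                                             1.0 / 4,  # spatial_scale
80 |                                             7,        # pooled_size
81 |                                             32,       # output_dim
82 |                                             False,    # no_trans
83 |                                             1,        # group_size
84 |                                             7,        # part_size
85 |                                             4,        # sample_per_part
86 |                                             0.1),     # trans_std
87 |                 (input, offset)))
88 | ```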
--------------------------------------------------------------------------------
/DCNv2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xi11xi19/CenterNet2TorchScript/69d7241139ebb2aad095cf17901d3945ac705626/DCNv2/__init__.py
--------------------------------------------------------------------------------
/DCNv2/dcn_v2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import division
5 |
6 | import math
7 | import torch
8 | from torch import nn
9 | from torch.autograd import Function
10 | from torch.nn.modules.utils import _pair
11 | from torch.autograd.function import once_differentiable
12 | from torch.nn import functional as F
13 |
14 |
15 | import _ext as _backend
16 |
17 |
18 | class DCNv2(nn.Module):
19 |
20 | def __init__(self, in_channels, out_channels,
21 | kernel_size, stride, padding, dilation=1, deformable_groups=1):
22 | super(DCNv2, self).__init__()
23 | self.in_channels = in_channels
24 | self.out_channels = out_channels
25 | self.kernel_size = _pair(kernel_size)
26 | self.stride = _pair(stride)
27 | self.padding = _pair(padding)
28 | self.dilation = _pair(dilation)
29 | self.deformable_groups = deformable_groups
30 |
31 | self.weight = nn.Parameter(torch.Tensor(
32 | out_channels, in_channels, *self.kernel_size))
33 | self.bias = nn.Parameter(torch.Tensor(out_channels))
34 | self.reset_parameters()
35 |
36 | def reset_parameters(self):
37 | n = self.in_channels
38 | for k in self.kernel_size:
39 | n *= k
40 | stdv = 1. / math.sqrt(n)
41 | self.weight.data.uniform_(-stdv, stdv)
42 | self.bias.data.zero_()
43 |
44 |
45 | def forward(self, input, offset, mask):
46 | # assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \
47 | # offset.shape[1]
48 | # assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \
49 | # mask.shape[1]
50 | output = _backend.dcn_v2_forward(input, self.weight, self.bias,
51 | offset, mask,
52 | self.weight.shape[2], self.weight.shape[3],
53 | self.stride[0], self.stride[1],
54 | self.padding[0], self.padding[1],
55 | self.dilation[0], self.dilation[1],
56 | self.deformable_groups)
57 | return output
58 |
59 |
60 | class DCN(DCNv2):
61 |
62 | def __init__(self, in_channels, out_channels,
63 | kernel_size, stride, padding,
64 | dilation=1, deformable_groups=1):
65 | super(DCN, self).__init__(in_channels, out_channels,
66 | kernel_size, stride, padding, dilation, deformable_groups)
67 |
68 | channels_ = self.deformable_groups * 3 * \
69 | self.kernel_size[0] * self.kernel_size[1]
70 | self.conv_offset_mask = nn.Conv2d(self.in_channels,
71 | channels_,
72 | kernel_size=self.kernel_size,
73 | stride=self.stride,
74 | padding=self.padding,
75 | bias=True)
76 | self.init_offset()
77 |
78 | def init_offset(self):
79 | self.conv_offset_mask.weight.data.zero_()
80 | self.conv_offset_mask.bias.data.zero_()
81 |
82 | def forward(self, input):
83 | out = self.conv_offset_mask(input)
84 | o1, o2, mask = torch.chunk(out, 3, dim=1)
85 | offset = torch.cat((o1, o2), dim=1)
86 | mask = torch.sigmoid(mask)
87 | output = torch.ops.my_ops.dcn_v2_cuda_forward_v2(input, self.weight, self.bias,
88 | offset, mask,
89 | self.kernel_size[0], self.kernel_size[1],
90 | self.stride[0], self.stride[1],
91 | self.padding[0], self.padding[1],
92 | self.dilation[0], self.dilation[1],
93 | self.deformable_groups)
94 |
95 | return output
96 |
97 |
98 | class _DCNv2Pooling(Function):
99 | @staticmethod
100 | def forward(ctx, input, rois, offset,
101 | spatial_scale,
102 | pooled_size,
103 | output_dim,
104 | no_trans,
105 | group_size=1,
106 | part_size=None,
107 | sample_per_part=4,
108 | trans_std=.0):
109 | ctx.spatial_scale = spatial_scale
110 | ctx.no_trans = int(no_trans)
111 | ctx.output_dim = output_dim
112 | ctx.group_size = group_size
113 | ctx.pooled_size = pooled_size
114 | ctx.part_size = pooled_size if part_size is None else part_size
115 | ctx.sample_per_part = sample_per_part
116 | ctx.trans_std = trans_std
117 |
118 | output, output_count = \
119 | _backend.dcn_v2_psroi_pooling_forward(input, rois, offset,
120 | ctx.no_trans, ctx.spatial_scale,
121 | ctx.output_dim, ctx.group_size,
122 | ctx.pooled_size, ctx.part_size,
123 | ctx.sample_per_part, ctx.trans_std)
124 | ctx.save_for_backward(input, rois, offset, output_count)
125 | return output
126 |
127 | @staticmethod
128 | @once_differentiable
129 | def backward(ctx, grad_output):
130 | input, rois, offset, output_count = ctx.saved_tensors
131 | grad_input, grad_offset = \
132 | _backend.dcn_v2_psroi_pooling_backward(grad_output,
133 | input,
134 | rois,
135 | offset,
136 | output_count,
137 | ctx.no_trans,
138 | ctx.spatial_scale,
139 | ctx.output_dim,
140 | ctx.group_size,
141 | ctx.pooled_size,
142 | ctx.part_size,
143 | ctx.sample_per_part,
144 | ctx.trans_std)
145 |
146 | return grad_input, None, grad_offset, \
147 | None, None, None, None, None, None, None, None
148 |
149 |
150 | dcn_v2_pooling = _DCNv2Pooling.apply
151 |
152 |
153 | class DCNv2Pooling(nn.Module):
154 |
155 | def __init__(self,
156 | spatial_scale,
157 | pooled_size,
158 | output_dim,
159 | no_trans,
160 | group_size=1,
161 | part_size=None,
162 | sample_per_part=4,
163 | trans_std=.0):
164 | super(DCNv2Pooling, self).__init__()
165 | self.spatial_scale = spatial_scale
166 | self.pooled_size = pooled_size
167 | self.output_dim = output_dim
168 | self.no_trans = no_trans
169 | self.group_size = group_size
170 | self.part_size = pooled_size if part_size is None else part_size
171 | self.sample_per_part = sample_per_part
172 | self.trans_std = trans_std
173 |
174 | def forward(self, input, rois, offset):
175 | assert input.shape[1] == self.output_dim
176 | if self.no_trans:
177 | offset = input.new()
178 | return dcn_v2_pooling(input, rois, offset,
179 | self.spatial_scale,
180 | self.pooled_size,
181 | self.output_dim,
182 | self.no_trans,
183 | self.group_size,
184 | self.part_size,
185 | self.sample_per_part,
186 | self.trans_std)
187 |
188 |
189 | class DCNPooling(DCNv2Pooling):
190 |
191 | def __init__(self,
192 | spatial_scale,
193 | pooled_size,
194 | output_dim,
195 | no_trans,
196 | group_size=1,
197 | part_size=None,
198 | sample_per_part=4,
199 | trans_std=.0,
200 | deform_fc_dim=1024):
201 | super(DCNPooling, self).__init__(spatial_scale,
202 | pooled_size,
203 | output_dim,
204 | no_trans,
205 | group_size,
206 | part_size,
207 | sample_per_part,
208 | trans_std)
209 |
210 | self.deform_fc_dim = deform_fc_dim
211 |
212 | if not no_trans:
213 | self.offset_mask_fc = nn.Sequential(
214 | nn.Linear(self.pooled_size * self.pooled_size *
215 | self.output_dim, self.deform_fc_dim),
216 | nn.ReLU(inplace=True),
217 | nn.Linear(self.deform_fc_dim, self.deform_fc_dim),
218 | nn.ReLU(inplace=True),
219 | nn.Linear(self.deform_fc_dim, self.pooled_size *
220 | self.pooled_size * 3)
221 | )
222 | self.offset_mask_fc[4].weight.data.zero_()
223 | self.offset_mask_fc[4].bias.data.zero_()
224 |
225 | def forward(self, input, rois):
226 | offset = input.new()
227 |
228 | if not self.no_trans:
229 |
230 | # do roi_align first
231 | n = rois.shape[0]
232 | roi = dcn_v2_pooling(input, rois, offset,
233 | self.spatial_scale,
234 | self.pooled_size,
235 | self.output_dim,
236 | True, # no trans
237 | self.group_size,
238 | self.part_size,
239 | self.sample_per_part,
240 | self.trans_std)
241 |
242 | # build mask and offset
243 | offset_mask = self.offset_mask_fc(roi.view(n, -1))
244 | offset_mask = offset_mask.view(
245 | n, 3, self.pooled_size, self.pooled_size)
246 | o1, o2, mask = torch.chunk(offset_mask, 3, dim=1)
247 | offset = torch.cat((o1, o2), dim=1)
248 | mask = torch.sigmoid(mask)
249 |
250 | # do pooling with offset and mask
251 | return dcn_v2_pooling(input, rois, offset,
252 | self.spatial_scale,
253 | self.pooled_size,
254 | self.output_dim,
255 | self.no_trans,
256 | self.group_size,
257 | self.part_size,
258 | self.sample_per_part,
259 | self.trans_std) * mask
260 | # only roi_align
261 | return dcn_v2_pooling(input, rois, offset,
262 | self.spatial_scale,
263 | self.pooled_size,
264 | self.output_dim,
265 | self.no_trans,
266 | self.group_size,
267 | self.part_size,
268 | self.sample_per_part,
269 | self.trans_std)
270 |
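271 | # Usage note: `DCN.forward` above calls the custom op
272 | # `torch.ops.my_ops.dcn_v2_cuda_forward_v2`, so the compiled library that registers
273 | # `my_ops` (e.g. the dcn_cpp_plugin build) must be loaded before a DCN module is run
274 | # or scripted, along the lines of (the path/filename is a placeholder and depends on
275 | # the build):
276 | #
277 | #     torch.ops.load_library("path/to/the/compiled/dcn/plugin.so")
278 | #
279 | # `DCNv2`, `DCNv2Pooling` and `DCNPooling` go through the `_ext` extension built by
280 | # setup.py (`./make.sh`) instead.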
--------------------------------------------------------------------------------
/DCNv2/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python3 setup.py build develop
3 |
--------------------------------------------------------------------------------
/DCNv2/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import glob
5 |
6 | import torch
7 |
8 | from torch.utils.cpp_extension import CUDA_HOME
9 | from torch.utils.cpp_extension import CppExtension
10 | from torch.utils.cpp_extension import CUDAExtension
11 |
12 | from setuptools import find_packages
13 | from setuptools import setup
14 |
15 | requirements = ["torch", "torchvision"]
16 |
17 | def get_extensions():
18 | this_dir = os.path.dirname(os.path.abspath(__file__))
19 | extensions_dir = os.path.join(this_dir, "src")
20 |
21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
24 |
25 | sources = main_file + source_cpu
26 | extension = CppExtension
27 | extra_compile_args = {"cxx": []}
28 | define_macros = []
29 |
30 | if torch.cuda.is_available() and CUDA_HOME is not None:
31 | extension = CUDAExtension
32 | sources += source_cuda
33 | define_macros += [("WITH_CUDA", None)]
34 | extra_compile_args["nvcc"] = [
35 | "-DCUDA_HAS_FP16=1",
36 | "-D__CUDA_NO_HALF_OPERATORS__",
37 | "-D__CUDA_NO_HALF_CONVERSIONS__",
38 | "-D__CUDA_NO_HALF2_OPERATORS__",
39 | ]
40 | else:
41 | raise NotImplementedError('CUDA is not available')
42 |
43 | sources = [os.path.join(extensions_dir, s) for s in sources]
44 | include_dirs = [extensions_dir]
45 | ext_modules = [
46 | extension(
47 | "_ext",
48 | sources,
49 | include_dirs=include_dirs,
50 | define_macros=define_macros,
51 | extra_compile_args=extra_compile_args,
52 | )
53 | ]
54 | return ext_modules
55 |
56 | setup(
57 | name="DCNv2",
58 | version="0.1",
59 | author="charlesshang",
60 | url="https://github.com/charlesshang/DCNv2",
61 | description="deformable convolutional networks",
62 | packages=find_packages(exclude=("configs", "tests",)),
63 | # install_requires=requirements,
64 | ext_modules=get_extensions(),
65 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
66 | )
--------------------------------------------------------------------------------
/DCNv2/src/cpu/dcn_v2_cpu.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 |
3 | #include <ATen/ATen.h>
4 | #include <ATen/cuda/CUDAContext.h>
5 |
6 |
7 | at::Tensor
8 | dcn_v2_cpu_forward(const at::Tensor &input,
9 | const at::Tensor &weight,
10 | const at::Tensor &bias,
11 | const at::Tensor &offset,
12 | const at::Tensor &mask,
13 | const int kernel_h,
14 | const int kernel_w,
15 | const int stride_h,
16 | const int stride_w,
17 | const int pad_h,
18 | const int pad_w,
19 | const int dilation_h,
20 | const int dilation_w,
21 | const int deformable_group)
22 | {
23 | AT_ERROR("Not implemented on the CPU");
24 | }
25 |
26 | std::vector<at::Tensor>
27 | dcn_v2_cpu_backward(const at::Tensor &input,
28 | const at::Tensor &weight,
29 | const at::Tensor &bias,
30 | const at::Tensor &offset,
31 | const at::Tensor &mask,
32 | const at::Tensor &grad_output,
33 | int kernel_h, int kernel_w,
34 | int stride_h, int stride_w,
35 | int pad_h, int pad_w,
36 | int dilation_h, int dilation_w,
37 | int deformable_group)
38 | {
39 | AT_ERROR("Not implemented on the CPU");
40 | }
41 |
42 | std::tuple<at::Tensor, at::Tensor>
43 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
44 | const at::Tensor &bbox,
45 | const at::Tensor &trans,
46 | const int no_trans,
47 | const float spatial_scale,
48 | const int output_dim,
49 | const int group_size,
50 | const int pooled_size,
51 | const int part_size,
52 | const int sample_per_part,
53 | const float trans_std)
54 | {
55 | AT_ERROR("Not implemented on the CPU");
56 | }
57 |
58 | std::tuple<at::Tensor, at::Tensor>
59 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
60 | const at::Tensor &input,
61 | const at::Tensor &bbox,
62 | const at::Tensor &trans,
63 | const at::Tensor &top_count,
64 | const int no_trans,
65 | const float spatial_scale,
66 | const int output_dim,
67 | const int group_size,
68 | const int pooled_size,
69 | const int part_size,
70 | const int sample_per_part,
71 | const float trans_std)
72 | {
73 | AT_ERROR("Not implemented on the CPU");
74 | }
--------------------------------------------------------------------------------
/DCNv2/src/cpu/vision.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | at::Tensor
5 | dcn_v2_cpu_forward(const at::Tensor &input,
6 | const at::Tensor &weight,
7 | const at::Tensor &bias,
8 | const at::Tensor &offset,
9 | const at::Tensor &mask,
10 | const int kernel_h,
11 | const int kernel_w,
12 | const int stride_h,
13 | const int stride_w,
14 | const int pad_h,
15 | const int pad_w,
16 | const int dilation_h,
17 | const int dilation_w,
18 | const int deformable_group);
19 |
20 | std::vector<at::Tensor>
21 | dcn_v2_cpu_backward(const at::Tensor &input,
22 | const at::Tensor &weight,
23 | const at::Tensor &bias,
24 | const at::Tensor &offset,
25 | const at::Tensor &mask,
26 | const at::Tensor &grad_output,
27 | int kernel_h, int kernel_w,
28 | int stride_h, int stride_w,
29 | int pad_h, int pad_w,
30 | int dilation_h, int dilation_w,
31 | int deformable_group);
32 |
33 |
34 | std::tuple<at::Tensor, at::Tensor>
35 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
36 | const at::Tensor &bbox,
37 | const at::Tensor &trans,
38 | const int no_trans,
39 | const float spatial_scale,
40 | const int output_dim,
41 | const int group_size,
42 | const int pooled_size,
43 | const int part_size,
44 | const int sample_per_part,
45 | const float trans_std);
46 |
47 | std::tuple<at::Tensor, at::Tensor>
48 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
49 | const at::Tensor &input,
50 | const at::Tensor &bbox,
51 | const at::Tensor &trans,
52 | const at::Tensor &top_count,
53 | const int no_trans,
54 | const float spatial_scale,
55 | const int output_dim,
56 | const int group_size,
57 | const int pooled_size,
58 | const int part_size,
59 | const int sample_per_part,
60 | const float trans_std);
--------------------------------------------------------------------------------
/DCNv2/src/cuda/dcn_v2_cuda.cu:
--------------------------------------------------------------------------------
1 | #include <vector>
2 | #include "cuda/dcn_v2_im2col_cuda.h"
3 |
4 | #include <ATen/ATen.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 |
7 | #include <THC/THC.h>
8 | #include <THC/THCAtomics.cuh>
9 | #include <THC/THCDeviceUtils.cuh>
10 | extern THCState *state;
11 |
12 | // author: Charles Shang
13 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
14 |
15 | // [batch gemm]
16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu
17 |
18 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b,
19 | float **columns_b, const float **ones_b,
20 | const float **weight_b, const float **bias_b,
21 | float *input, float *output,
22 | float *columns, float *ones,
23 | float *weight, float *bias,
24 | const int input_stride, const int output_stride,
25 | const int columns_stride, const int ones_stride,
26 | const int num_batches)
27 | {
28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
29 | if (idx < num_batches)
30 | {
31 | input_b[idx] = input + idx * input_stride;
32 | output_b[idx] = output + idx * output_stride;
33 | columns_b[idx] = columns + idx * columns_stride;
34 | ones_b[idx] = ones + idx * ones_stride;
35 | // share weights and bias within a Mini-Batch
36 | weight_b[idx] = weight;
37 | bias_b[idx] = bias;
38 | }
39 | }
40 |
41 | at::Tensor
42 | dcn_v2_cuda_forward(const at::Tensor &input,
43 | const at::Tensor &weight,
44 | const at::Tensor &bias,
45 | const at::Tensor &offset,
46 | const at::Tensor &mask,
47 | const int kernel_h,
48 | const int kernel_w,
49 | const int stride_h,
50 | const int stride_w,
51 | const int pad_h,
52 | const int pad_w,
53 | const int dilation_h,
54 | const int dilation_w,
55 | const int deformable_group)
56 | {
57 | using scalar_t = float;
58 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
59 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
60 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
61 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
62 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
63 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
64 |
65 | const int batch = input.size(0);
66 | const int channels = input.size(1);
67 | const int height = input.size(2);
68 | const int width = input.size(3);
69 |
70 | const int channels_out = weight.size(0);
71 | const int channels_kernel = weight.size(1);
72 | const int kernel_h_ = weight.size(2);
73 | const int kernel_w_ = weight.size(3);
74 |
75 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h);
76 | // printf("Channels: %d %d\n", channels, channels_kernel);
77 | // printf("Channels: %d %d\n", channels_out, channels_kernel);
78 |
79 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
80 | "Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);
81 |
82 | AT_ASSERTM(channels == channels_kernel,
83 | "Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel);
84 |
85 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
86 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
87 |
88 | auto ones = at::ones({batch, height_out, width_out}, input.options());
89 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
90 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
91 |
92 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing
93 | // when batch size is large.
94 | // launch batch threads
95 | int matrices_size = batch * sizeof(float *);
96 | auto input_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
97 | auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
98 | auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
99 | auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
100 | auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
101 | auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
102 |
103 | const int block = 128;
104 | const int grid = (batch + block - 1) / block;
105 |
106 | createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
107 | input_b, output_b,
108 | columns_b, ones_b,
109 | weight_b, bias_b,
110 | input.data<scalar_t>(),
111 | output.data<scalar_t>(),
112 | columns.data<scalar_t>(),
113 | ones.data<scalar_t>(),
114 | weight.data<scalar_t>(),
115 | bias.data<scalar_t>(),
116 | channels * width * height,
117 | channels_out * width_out * height_out,
118 | channels * kernel_h * kernel_w * height_out * width_out,
119 | height_out * width_out,
120 | batch);
121 |
122 | long m_ = channels_out;
123 | long n_ = height_out * width_out;
124 | long k_ = 1;
125 | THCudaBlas_SgemmBatched(state,
126 | 't',
127 | 'n',
128 | n_,
129 | m_,
130 | k_,
131 | 1.0f,
132 | ones_b, k_,
133 | bias_b, k_,
134 | 0.0f,
135 | output_b, n_,
136 | batch);
137 |
138 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
139 | input.data<scalar_t>(),
140 | offset.data<scalar_t>(),
141 | mask.data<scalar_t>(),
142 | batch, channels, height, width,
143 | height_out, width_out, kernel_h, kernel_w,
144 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
145 | deformable_group,
146 | columns.data<scalar_t>());
147 |
148 | long m = channels_out;
149 | long n = height_out * width_out;
150 | long k = channels * kernel_h * kernel_w;
151 | THCudaBlas_SgemmBatched(state,
152 | 'n',
153 | 'n',
154 | n,
155 | m,
156 | k,
157 | 1.0f,
158 | (const float **)columns_b, n,
159 | weight_b, k,
160 | 1.0f,
161 | output_b, n,
162 | batch);
163 |
164 | THCudaFree(state, input_b);
165 | THCudaFree(state, output_b);
166 | THCudaFree(state, columns_b);
167 | THCudaFree(state, ones_b);
168 | THCudaFree(state, weight_b);
169 | THCudaFree(state, bias_b);
170 | return output;
171 | }
172 |
173 | at::Tensor
174 | dcn_v2_cuda_forward_v2(const at::Tensor &input,
175 | const at::Tensor &weight,
176 | const at::Tensor &bias,
177 | const at::Tensor &offset,
178 | const at::Tensor &mask,
179 | const int64_t kernel_h,
180 | const int64_t kernel_w,
181 | const int64_t stride_h,
182 | const int64_t stride_w,
183 | const int64_t pad_h,
184 | const int64_t pad_w,
185 | const int64_t dilation_h,
186 | const int64_t dilation_w,
187 | const int64_t deformable_group)
188 | {
189 | using scalar_t = float;
190 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
191 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
192 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
193 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
194 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
195 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
196 |
197 | const int batch = input.size(0);
198 | const int channels = input.size(1);
199 | const int height = input.size(2);
200 | const int width = input.size(3);
201 |
202 | const int channels_out = weight.size(0);
203 | const int channels_kernel = weight.size(1);
204 | const int kernel_h_ = weight.size(2);
205 | const int kernel_w_ = weight.size(3);
206 |
207 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h);
208 | // printf("Channels: %d %d\n", channels, channels_kernel);
209 | // printf("Channels: %d %d\n", channels_out, channels_kernel);
210 |
211 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
212 | "Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);
213 |
214 | AT_ASSERTM(channels == channels_kernel,
215 | "Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel);
216 |
217 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
218 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
219 |
220 | auto ones = at::ones({batch, height_out, width_out}, input.options());
221 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
222 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
223 |
224 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing
225 | // when batch size is large.
226 | // launch batch threads
227 | int matrices_size = batch * sizeof(float *);
228 | auto input_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
229 | auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
230 | auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
231 | auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
232 | auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
233 | auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
234 |
235 | const int block = 128;
236 | const int grid = (batch + block - 1) / block;
237 |
238 | createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
239 | input_b, output_b,
240 | columns_b, ones_b,
241 | weight_b, bias_b,
242 | input.data<scalar_t>(),
243 | output.data<scalar_t>(),
244 | columns.data<scalar_t>(),
245 | ones.data<scalar_t>(),
246 | weight.data<scalar_t>(),
247 | bias.data<scalar_t>(),
248 | channels * width * height,
249 | channels_out * width_out * height_out,
250 | channels * kernel_h * kernel_w * height_out * width_out,
251 | height_out * width_out,
252 | batch);
253 |
254 | long m_ = channels_out;
255 | long n_ = height_out * width_out;
256 | long k_ = 1;
257 | THCudaBlas_SgemmBatched(state,
258 | 't',
259 | 'n',
260 | n_,
261 | m_,
262 | k_,
263 | 1.0f,
264 | ones_b, k_,
265 | bias_b, k_,
266 | 0.0f,
267 | output_b, n_,
268 | batch);
269 |
270 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
271 | input.data<scalar_t>(),
272 | offset.data<scalar_t>(),
273 | mask.data<scalar_t>(),
274 | batch, channels, height, width,
275 | height_out, width_out, kernel_h, kernel_w,
276 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
277 | deformable_group,
278 | columns.data<scalar_t>());
279 |
280 | long m = channels_out;
281 | long n = height_out * width_out;
282 | long k = channels * kernel_h * kernel_w;
283 | THCudaBlas_SgemmBatched(state,
284 | 'n',
285 | 'n',
286 | n,
287 | m,
288 | k,
289 | 1.0f,
290 | (const float **)columns_b, n,
291 | weight_b, k,
292 | 1.0f,
293 | output_b, n,
294 | batch);
295 |
296 | THCudaFree(state, input_b);
297 | THCudaFree(state, output_b);
298 | THCudaFree(state, columns_b);
299 | THCudaFree(state, ones_b);
300 | THCudaFree(state, weight_b);
301 | THCudaFree(state, bias_b);
302 | return output;
303 | }
304 |
305 | __global__ void createBatchGemmBufferBackward(
306 | float **grad_output_b,
307 | float **columns_b,
308 | float **ones_b,
309 | float **weight_b,
310 | float **grad_weight_b,
311 | float **grad_bias_b,
312 | float *grad_output,
313 | float *columns,
314 | float *ones,
315 | float *weight,
316 | float *grad_weight,
317 | float *grad_bias,
318 | const int grad_output_stride,
319 | const int columns_stride,
320 | const int ones_stride,
321 | const int num_batches)
322 | {
323 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
324 | if (idx < num_batches)
325 | {
326 | grad_output_b[idx] = grad_output + idx * grad_output_stride;
327 | columns_b[idx] = columns + idx * columns_stride;
328 | ones_b[idx] = ones + idx * ones_stride;
329 |
330 | // share weights and bias within a Mini-Batch
331 | weight_b[idx] = weight;
332 | grad_weight_b[idx] = grad_weight;
333 | grad_bias_b[idx] = grad_bias;
334 | }
335 | }
336 |
337 | std::vector dcn_v2_cuda_backward(const at::Tensor &input,
338 | const at::Tensor &weight,
339 | const at::Tensor &bias,
340 | const at::Tensor &offset,
341 | const at::Tensor &mask,
342 | const at::Tensor &grad_output,
343 | int kernel_h, int kernel_w,
344 | int stride_h, int stride_w,
345 | int pad_h, int pad_w,
346 | int dilation_h, int dilation_w,
347 | int deformable_group)
348 | {
349 |
350 | THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous");
351 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous");
352 |
353 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
354 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
355 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
356 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
357 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
358 |
359 | const int batch = input.size(0);
360 | const int channels = input.size(1);
361 | const int height = input.size(2);
362 | const int width = input.size(3);
363 |
364 | const int channels_out = weight.size(0);
365 | const int channels_kernel = weight.size(1);
366 | const int kernel_h_ = weight.size(2);
367 | const int kernel_w_ = weight.size(3);
368 |
369 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
370 | "Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);
371 |
372 | AT_ASSERTM(channels == channels_kernel,
373 | "Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel);
374 |
375 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
376 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
377 |
378 | auto ones = at::ones({height_out, width_out}, input.options());
379 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
380 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
381 |
382 | auto grad_input = at::zeros_like(input);
383 | auto grad_weight = at::zeros_like(weight);
384 | auto grad_bias = at::zeros_like(bias);
385 | auto grad_offset = at::zeros_like(offset);
386 | auto grad_mask = at::zeros_like(mask);
387 |
388 | using scalar_t = float;
389 |
390 | for (int b = 0; b < batch; b++)
391 | {
392 | auto input_n = input.select(0, b);
393 | auto offset_n = offset.select(0, b);
394 | auto mask_n = mask.select(0, b);
395 | auto grad_output_n = grad_output.select(0, b);
396 | auto grad_input_n = grad_input.select(0, b);
397 | auto grad_offset_n = grad_offset.select(0, b);
398 | auto grad_mask_n = grad_mask.select(0, b);
399 |
400 | long m = channels * kernel_h * kernel_w;
401 | long n = height_out * width_out;
402 | long k = channels_out;
403 |
404 | THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f,
405 | grad_output_n.data<scalar_t>(), n,
406 | weight.data<scalar_t>(), m, 0.0f,
407 | columns.data<scalar_t>(), n);
408 |
409 | // gradient w.r.t. input coordinate data
410 | modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state),
411 | columns.data<scalar_t>(),
412 | input_n.data<scalar_t>(),
413 | offset_n.data<scalar_t>(),
414 | mask_n.data<scalar_t>(),
415 | 1, channels, height, width,
416 | height_out, width_out, kernel_h, kernel_w,
417 | pad_h, pad_w, stride_h, stride_w,
418 | dilation_h, dilation_w, deformable_group,
419 | grad_offset_n.data<scalar_t>(),
420 | grad_mask_n.data<scalar_t>());
421 | // gradient w.r.t. input data
422 | modulated_deformable_col2im_cuda(THCState_getCurrentStream(state),
423 | columns.data<scalar_t>(),
424 | offset_n.data<scalar_t>(),
425 | mask_n.data<scalar_t>(),
426 | 1, channels, height, width,
427 | height_out, width_out, kernel_h, kernel_w,
428 | pad_h, pad_w, stride_h, stride_w,
429 | dilation_h, dilation_w, deformable_group,
430 | grad_input_n.data<scalar_t>());
431 |
432 | // gradient w.r.t. weight, dWeight should accumulate across the batch and group
433 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
434 | input_n.data<scalar_t>(),
435 | offset_n.data<scalar_t>(),
436 | mask_n.data<scalar_t>(),
437 | 1, channels, height, width,
438 | height_out, width_out, kernel_h, kernel_w,
439 | pad_h, pad_w, stride_h, stride_w,
440 | dilation_h, dilation_w, deformable_group,
441 | columns.data<scalar_t>());
442 |
443 | long m_ = channels_out;
444 | long n_ = channels * kernel_h * kernel_w;
445 | long k_ = height_out * width_out;
446 |
447 | THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f,
448 | columns.data<scalar_t>(), k_,
449 | grad_output_n.data<scalar_t>(), k_, 1.0f,
450 | grad_weight.data<scalar_t>(), n_);
451 |
452 | // gradient w.r.t. bias
453 | // long m_ = channels_out;
454 | // long k__ = height_out * width_out;
455 | THCudaBlas_Sgemv(state,
456 | 't',
457 | k_, m_, 1.0f,
458 | grad_output_n.data<scalar_t>(), k_,
459 | ones.data<scalar_t>(), 1, 1.0f,
460 | grad_bias.data<scalar_t>(), 1);
461 | }
462 |
463 | return {
464 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias
465 | };
466 | }
--------------------------------------------------------------------------------
/DCNv2/src/cuda/dcn_v2_im2col_cuda.cu:
--------------------------------------------------------------------------------
1 | #include "dcn_v2_im2col_cuda.h"
2 | #include <cstdio>
3 | #include <algorithm>
4 | #include <cstring>
5 |
6 | #include <ATen/ATen.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 |
9 | #include <THC/THC.h>
10 | #include <THC/THCAtomics.cuh>
11 | #include <THC/THCDeviceUtils.cuh>
12 |
13 | #define CUDA_KERNEL_LOOP(i, n) \
14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
15 | i < (n); \
16 | i += blockDim.x * gridDim.x)
17 |
18 | const int CUDA_NUM_THREADS = 1024;
19 | inline int GET_BLOCKS(const int N)
20 | {
21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
22 | }
23 |
24 |
25 | __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width,
26 | const int height, const int width, float h, float w)
27 | {
28 | int h_low = floor(h);
29 | int w_low = floor(w);
30 | int h_high = h_low + 1;
31 | int w_high = w_low + 1;
32 |
33 | float lh = h - h_low;
34 | float lw = w - w_low;
35 | float hh = 1 - lh, hw = 1 - lw;
36 |
37 | float v1 = 0;
38 | if (h_low >= 0 && w_low >= 0)
39 | v1 = bottom_data[h_low * data_width + w_low];
40 | float v2 = 0;
41 | if (h_low >= 0 && w_high <= width - 1)
42 | v2 = bottom_data[h_low * data_width + w_high];
43 | float v3 = 0;
44 | if (h_high <= height - 1 && w_low >= 0)
45 | v3 = bottom_data[h_high * data_width + w_low];
46 | float v4 = 0;
47 | if (h_high <= height - 1 && w_high <= width - 1)
48 | v4 = bottom_data[h_high * data_width + w_high];
49 |
50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
51 |
52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
53 | return val;
54 | }
55 |
56 | __device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w,
57 | const int h, const int w, const int height, const int width)
58 | {
59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
60 | {
61 | //empty
62 | return 0;
63 | }
64 |
65 | int argmax_h_low = floor(argmax_h);
66 | int argmax_w_low = floor(argmax_w);
67 | int argmax_h_high = argmax_h_low + 1;
68 | int argmax_w_high = argmax_w_low + 1;
69 |
70 | float weight = 0;
71 | if (h == argmax_h_low && w == argmax_w_low)
72 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
73 | if (h == argmax_h_low && w == argmax_w_high)
74 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
75 | if (h == argmax_h_high && w == argmax_w_low)
76 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
77 | if (h == argmax_h_high && w == argmax_w_high)
78 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
79 | return weight;
80 | }
81 |
82 | __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w,
83 | const int height, const int width, const float *im_data,
84 | const int data_width, const int bp_dir)
85 | {
86 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
87 | {
88 | //empty
89 | return 0;
90 | }
91 |
92 | int argmax_h_low = floor(argmax_h);
93 | int argmax_w_low = floor(argmax_w);
94 | int argmax_h_high = argmax_h_low + 1;
95 | int argmax_w_high = argmax_w_low + 1;
96 |
97 | float weight = 0;
98 |
99 | if (bp_dir == 0)
100 | {
101 | if (argmax_h_low >= 0 && argmax_w_low >= 0)
102 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
103 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
104 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
105 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
106 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
107 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
108 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
109 | }
110 | else if (bp_dir == 1)
111 | {
112 | if (argmax_h_low >= 0 && argmax_w_low >= 0)
113 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
114 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
115 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
116 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
117 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
118 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
119 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
120 | }
121 |
122 | return weight;
123 | }
124 |
125 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n,
126 | const float *data_im, const float *data_offset, const float *data_mask,
127 | const int height, const int width, const int kernel_h, const int kernel_w,
128 | const int pad_h, const int pad_w,
129 | const int stride_h, const int stride_w,
130 | const int dilation_h, const int dilation_w,
131 | const int channel_per_deformable_group,
132 | const int batch_size, const int num_channels, const int deformable_group,
133 | const int height_col, const int width_col,
134 | float *data_col)
135 | {
136 | // launch channels * batch_size * height_col * width_col cores
137 | CUDA_KERNEL_LOOP(index, n)
138 | {
139 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow)
140 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis
141 |
142 | // index index of output matrix
143 | const int w_col = index % width_col;
144 | const int h_col = (index / width_col) % height_col;
145 | // const int b_col = (index / width_col / height_col) % batch_size;
146 | const int b_col = (index / width_col / height_col / num_channels) % batch_size;
147 | // const int c_im = (index / width_col / height_col) / batch_size;
148 | const int c_im = (index / width_col / height_col) % num_channels;
149 | // const int c_col = c_im * kernel_h * kernel_w;
150 | const int c_col = c_im * kernel_h * kernel_w;
151 |
152 | // compute deformable group index
153 | const int deformable_group_index = c_im / channel_per_deformable_group;
154 |
155 | const int h_in = h_col * stride_h - pad_h;
156 | const int w_in = w_col * stride_w - pad_w;
157 |
158 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
159 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
160 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
161 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
162 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
163 |
164 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
165 |
166 | for (int i = 0; i < kernel_h; ++i)
167 | {
168 | for (int j = 0; j < kernel_w; ++j)
169 | {
170 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
171 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
172 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
173 | const float offset_h = data_offset_ptr[data_offset_h_ptr];
174 | const float offset_w = data_offset_ptr[data_offset_w_ptr];
175 | const float mask = data_mask_ptr[data_mask_hw_ptr];
176 | float val = static_cast<float>(0);
177 | const float h_im = h_in + i * dilation_h + offset_h;
178 | const float w_im = w_in + j * dilation_w + offset_w;
179 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
180 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
181 | {
182 | //const float map_h = i * dilation_h + offset_h;
183 | //const float map_w = j * dilation_w + offset_w;
184 | //const int cur_height = height - h_in;
185 | //const int cur_width = width - w_in;
186 | //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
187 | val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
188 | }
189 | *data_col_ptr = val * mask;
190 | // data_col_ptr += batch_size * height_col * width_col;
191 | data_col_ptr += height_col * width_col;
192 | }
193 | }
194 | }
195 | }
196 |
197 | __global__ void modulated_deformable_col2im_gpu_kernel(const int n,
198 | const float *data_col, const float *data_offset, const float *data_mask,
199 | const int channels, const int height, const int width,
200 | const int kernel_h, const int kernel_w,
201 | const int pad_h, const int pad_w,
202 | const int stride_h, const int stride_w,
203 | const int dilation_h, const int dilation_w,
204 | const int channel_per_deformable_group,
205 | const int batch_size, const int deformable_group,
206 | const int height_col, const int width_col,
207 | float *grad_im)
208 | {
209 | CUDA_KERNEL_LOOP(index, n)
210 | {
211 | const int j = (index / width_col / height_col / batch_size) % kernel_w;
212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
214 | // compute the start and end of the output
215 |
216 | const int deformable_group_index = c / channel_per_deformable_group;
217 |
218 | int w_out = index % width_col;
219 | int h_out = (index / width_col) % height_col;
220 | int b = (index / width_col / height_col) % batch_size;
221 | int w_in = w_out * stride_w - pad_w;
222 | int h_in = h_out * stride_h - pad_h;
223 |
224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
225 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
229 | const float offset_h = data_offset_ptr[data_offset_h_ptr];
230 | const float offset_w = data_offset_ptr[data_offset_w_ptr];
231 | const float mask = data_mask_ptr[data_mask_hw_ptr];
232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h;
233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w;
234 |
235 | const float cur_top_grad = data_col[index] * mask;
236 | const int cur_h = (int)cur_inv_h_data;
237 | const int cur_w = (int)cur_inv_w_data;
238 | for (int dy = -2; dy <= 2; dy++)
239 | {
240 | for (int dx = -2; dx <= 2; dx++)
241 | {
242 | if (cur_h + dy >= 0 && cur_h + dy < height &&
243 | cur_w + dx >= 0 && cur_w + dx < width &&
244 | abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
245 | abs(cur_inv_w_data - (cur_w + dx)) < 1)
246 | {
247 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
248 | float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
249 | atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
250 | }
251 | }
252 | }
253 | }
254 | }
255 |
256 | __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
257 | const float *data_col, const float *data_im,
258 | const float *data_offset, const float *data_mask,
259 | const int channels, const int height, const int width,
260 | const int kernel_h, const int kernel_w,
261 | const int pad_h, const int pad_w,
262 | const int stride_h, const int stride_w,
263 | const int dilation_h, const int dilation_w,
264 | const int channel_per_deformable_group,
265 | const int batch_size, const int offset_channels, const int deformable_group,
266 | const int height_col, const int width_col,
267 | float *grad_offset, float *grad_mask)
268 | {
269 | CUDA_KERNEL_LOOP(index, n)
270 | {
271 | float val = 0, mval = 0;
272 | int w = index % width_col;
273 | int h = (index / width_col) % height_col;
274 | int c = (index / width_col / height_col) % offset_channels;
275 | int b = (index / width_col / height_col) / offset_channels;
276 | // compute the start and end of the output
277 |
278 | const int deformable_group_index = c / (2 * kernel_h * kernel_w);
279 | const int col_step = kernel_h * kernel_w;
280 | int cnt = 0;
281 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
282 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
283 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
284 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
285 |
286 | const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
287 |
288 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
289 | {
290 | const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
291 | const int bp_dir = offset_c % 2;
292 |
293 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
294 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
295 | int w_out = col_pos % width_col;
296 | int h_out = (col_pos / width_col) % height_col;
297 | int w_in = w_out * stride_w - pad_w;
298 | int h_in = h_out * stride_h - pad_h;
299 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
300 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
301 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
302 | const float offset_h = data_offset_ptr[data_offset_h_ptr];
303 | const float offset_w = data_offset_ptr[data_offset_w_ptr];
304 | const float mask = data_mask_ptr[data_mask_hw_ptr];
305 | float inv_h = h_in + i * dilation_h + offset_h;
306 | float inv_w = w_in + j * dilation_w + offset_w;
307 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
308 | {
309 | inv_h = inv_w = -2;
310 | }
311 | else
312 | {
313 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
314 | }
315 | const float weight = dmcn_get_coordinate_weight(
316 | inv_h, inv_w,
317 | height, width, data_im_ptr + cnt * height * width, width, bp_dir);
318 | val += weight * data_col_ptr[col_pos] * mask;
319 | cnt += 1;
320 | }
321 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
322 | grad_offset[index] = val;
323 | if (offset_c % 2 == 0)
324 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
325 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
326 | }
327 | }
328 |
329 | void modulated_deformable_im2col_cuda(cudaStream_t stream,
330 | const float* data_im, const float* data_offset, const float* data_mask,
331 | const int batch_size, const int channels, const int height_im, const int width_im,
332 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
333 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
334 | const int dilation_h, const int dilation_w,
335 | const int deformable_group, float* data_col) {
336 | // num_axes should be smaller than block size
337 | const int channel_per_deformable_group = channels / deformable_group;
338 | const int num_kernels = channels * batch_size * height_col * width_col;
339 | modulated_deformable_im2col_gpu_kernel
340 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
341 | 0, stream>>>(
342 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
343 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
344 | batch_size, channels, deformable_group, height_col, width_col, data_col);
345 |
346 | cudaError_t err = cudaGetLastError();
347 | if (err != cudaSuccess)
348 | {
349 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
350 | }
351 |
352 | }
353 |
354 | void modulated_deformable_col2im_cuda(cudaStream_t stream,
355 | const float* data_col, const float* data_offset, const float* data_mask,
356 | const int batch_size, const int channels, const int height_im, const int width_im,
357 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
358 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
359 | const int dilation_h, const int dilation_w,
360 | const int deformable_group, float* grad_im){
361 |
362 | const int channel_per_deformable_group = channels / deformable_group;
363 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
364 | modulated_deformable_col2im_gpu_kernel
365 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
366 | 0, stream>>>(
367 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im,
368 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
369 | dilation_h, dilation_w, channel_per_deformable_group,
370 | batch_size, deformable_group, height_col, width_col, grad_im);
371 | cudaError_t err = cudaGetLastError();
372 | if (err != cudaSuccess)
373 | {
374 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
375 | }
376 |
377 | }
378 |
379 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
380 | const float* data_col, const float* data_im, const float* data_offset, const float* data_mask,
381 | const int batch_size, const int channels, const int height_im, const int width_im,
382 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
383 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
384 | const int dilation_h, const int dilation_w,
385 | const int deformable_group,
386 | float* grad_offset, float* grad_mask) {
387 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
388 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
389 | modulated_deformable_col2im_coord_gpu_kernel
390 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
391 | 0, stream>>>(
392 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im,
393 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
394 | dilation_h, dilation_w, channel_per_deformable_group,
395 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
396 | grad_offset, grad_mask);
397 | cudaError_t err = cudaGetLastError();
398 | if (err != cudaSuccess)
399 | {
400 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
401 | }
402 | }
--------------------------------------------------------------------------------
/DCNv2/src/cuda/dcn_v2_im2col_cuda.h:
--------------------------------------------------------------------------------
1 |
2 | /*!
3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
4 | *
5 | * COPYRIGHT
6 | *
7 | * All contributions by the University of California:
8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
9 | * All rights reserved.
10 | *
11 | * All other contributions:
12 | * Copyright (c) 2014-2017, the respective contributors
13 | * All rights reserved.
14 | *
15 | * Caffe uses a shared copyright model: each contributor holds copyright over
16 | * their contributions to Caffe. The project versioning records all such
17 | * contribution and copyright details. If a contributor wants to further mark
18 | * their specific copyright on a particular contribution, they should indicate
19 | * their copyright solely in the commit message of the change when it is
20 | * committed.
21 | *
22 | * LICENSE
23 | *
24 | * Redistribution and use in source and binary forms, with or without
25 | * modification, are permitted provided that the following conditions are met:
26 | *
27 | * 1. Redistributions of source code must retain the above copyright notice, this
28 | * list of conditions and the following disclaimer.
29 | * 2. Redistributions in binary form must reproduce the above copyright notice,
30 | * this list of conditions and the following disclaimer in the documentation
31 | * and/or other materials provided with the distribution.
32 | *
33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 | *
44 | * CONTRIBUTION AGREEMENT
45 | *
46 | * By contributing to the BVLC/caffe repository through pull-request, comment,
47 | * or otherwise, the contributor releases their content to the
48 | * license and copyright terms herein.
49 | *
50 | ***************** END Caffe Copyright Notice and Disclaimer ********************
51 | *
52 | * Copyright (c) 2018 Microsoft
53 | * Licensed under The MIT License [see LICENSE for details]
54 | * \file modulated_deformable_im2col.h
55 | * \brief Function definitions of converting an image to
56 | * column matrix based on kernel, padding, dilation, and offset.
57 | * These functions are mainly used in deformable convolution operators.
58 | * \ref: https://arxiv.org/abs/1811.11168
59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
60 | */
61 |
62 | /***************** Adapted by Charles Shang *********************/
63 |
64 | #ifndef DCN_V2_IM2COL_CUDA
65 | #define DCN_V2_IM2COL_CUDA
66 |
67 | #ifdef __cplusplus
68 | extern "C"
69 | {
70 | #endif
71 |
72 | void modulated_deformable_im2col_cuda(cudaStream_t stream,
73 | const float *data_im, const float *data_offset, const float *data_mask,
74 | const int batch_size, const int channels, const int height_im, const int width_im,
75 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
77 | const int dilation_h, const int dilation_w,
78 | const int deformable_group, float *data_col);
79 |
80 | void modulated_deformable_col2im_cuda(cudaStream_t stream,
81 | const float *data_col, const float *data_offset, const float *data_mask,
82 | const int batch_size, const int channels, const int height_im, const int width_im,
83 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
84 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
85 | const int dilation_h, const int dilation_w,
86 | const int deformable_group, float *grad_im);
87 |
88 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
89 | const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
90 | const int batch_size, const int channels, const int height_im, const int width_im,
91 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
92 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
93 | const int dilation_h, const int dilation_w,
94 | const int deformable_group,
95 | float *grad_offset, float *grad_mask);
96 |
97 | #ifdef __cplusplus
98 | }
99 | #endif
100 |
101 | #endif
--------------------------------------------------------------------------------
/DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu:
--------------------------------------------------------------------------------
1 | /*!
2 | * Copyright (c) 2017 Microsoft
3 | * Licensed under The MIT License [see LICENSE for details]
4 | * \file deformable_psroi_pooling.cu
5 | * \brief
6 | * \author Yi Li, Guodong Zhang, Jifeng Dai
7 | */
8 | /***************** Adapted by Charles Shang *********************/
9 |
10 | #include <cstdio>
11 | #include <algorithm>
12 | #include <cstring>
13 | #include <vector>
14 |
15 | #include <ATen/ATen.h>
16 | #include <ATen/cuda/CUDAContext.h>
17 |
18 | #include <THC/THC.h>
19 | #include <THC/THCAtomics.cuh>
20 | #include <THC/THCDeviceUtils.cuh>
21 |
22 | #define CUDA_KERNEL_LOOP(i, n) \
23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
24 | i < (n); \
25 | i += blockDim.x * gridDim.x)
26 |
27 | const int CUDA_NUM_THREADS = 1024;
28 | inline int GET_BLOCKS(const int N)
29 | {
30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
31 | }
32 |
33 | template <typename T>
34 | __device__ T bilinear_interp(
35 | const T *data,
36 | const T x,
37 | const T y,
38 | const int width,
39 | const int height)
40 | {
41 | int x1 = floor(x);
42 | int x2 = ceil(x);
43 | int y1 = floor(y);
44 | int y2 = ceil(y);
45 | T dist_x = static_cast<T>(x - x1);
46 | T dist_y = static_cast<T>(y - y1);
47 | T value11 = data[y1 * width + x1];
48 | T value12 = data[y2 * width + x1];
49 | T value21 = data[y1 * width + x2];
50 | T value22 = data[y2 * width + x2];
51 | T value = (1 - dist_x) * (1 - dist_y) * value11 +
52 | (1 - dist_x) * dist_y * value12 +
53 | dist_x * (1 - dist_y) * value21 +
54 | dist_x * dist_y * value22;
55 | return value;
56 | }
57 |
58 | template <typename T>
59 | __global__ void DeformablePSROIPoolForwardKernel(
60 | const int count,
61 | const T *bottom_data,
62 | const T spatial_scale,
63 | const int channels,
64 | const int height, const int width,
65 | const int pooled_height, const int pooled_width,
66 | const T *bottom_rois, const T *bottom_trans,
67 | const int no_trans,
68 | const T trans_std,
69 | const int sample_per_part,
70 | const int output_dim,
71 | const int group_size,
72 | const int part_size,
73 | const int num_classes,
74 | const int channels_each_class,
75 | T *top_data,
76 | T *top_count)
77 | {
78 | CUDA_KERNEL_LOOP(index, count)
79 | {
80 | // The output is in order (n, ctop, ph, pw)
81 | int pw = index % pooled_width;
82 | int ph = (index / pooled_width) % pooled_height;
83 | int ctop = (index / pooled_width / pooled_height) % output_dim;
84 | int n = index / pooled_width / pooled_height / output_dim;
85 |
86 | // [start, end) interval for spatial sampling
87 | const T *offset_bottom_rois = bottom_rois + n * 5;
88 | int roi_batch_ind = offset_bottom_rois[0];
89 | T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
90 | T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
91 | T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
92 | T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
93 |
94 | // Force too small ROIs to be 1x1
95 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
96 | T roi_height = max(roi_end_h - roi_start_h, 0.1);
97 |
98 | // Compute w and h at bottom
99 | T bin_size_h = roi_height / static_cast<T>(pooled_height);
100 | T bin_size_w = roi_width / static_cast<T>(pooled_width);
101 |
102 | T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
103 | T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
104 |
105 | int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
106 | int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
107 | int class_id = ctop / channels_each_class;
108 | T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
109 | T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
110 |
111 | T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
112 | wstart += trans_x * roi_width;
113 | T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
114 | hstart += trans_y * roi_height;
115 |
116 | T sum = 0;
117 | int count = 0;
118 | int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
119 | int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
120 | gw = min(max(gw, 0), group_size - 1);
121 | gh = min(max(gh, 0), group_size - 1);
122 |
123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
124 | for (int ih = 0; ih < sample_per_part; ih++)
125 | {
126 | for (int iw = 0; iw < sample_per_part; iw++)
127 | {
128 | T w = wstart + iw * sub_bin_size_w;
129 | T h = hstart + ih * sub_bin_size_h;
130 | // bilinear interpolation
131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
132 | {
133 | continue;
134 | }
135 | w = min(max(w, 0.), width - 1.);
136 | h = min(max(h, 0.), height - 1.);
137 | int c = (ctop * group_size + gh) * group_size + gw;
138 | T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
139 | sum += val;
140 | count++;
141 | }
142 | }
143 | top_data[index] = count == 0 ? static_cast<T>(0) : sum / count;
144 | top_count[index] = count;
145 | }
146 | }
147 |
148 | template <typename T>
149 | __global__ void DeformablePSROIPoolBackwardAccKernel(
150 | const int count,
151 | const T *top_diff,
152 | const T *top_count,
153 | const int num_rois,
154 | const T spatial_scale,
155 | const int channels,
156 | const int height, const int width,
157 | const int pooled_height, const int pooled_width,
158 | const int output_dim,
159 | T *bottom_data_diff, T *bottom_trans_diff,
160 | const T *bottom_data,
161 | const T *bottom_rois,
162 | const T *bottom_trans,
163 | const int no_trans,
164 | const T trans_std,
165 | const int sample_per_part,
166 | const int group_size,
167 | const int part_size,
168 | const int num_classes,
169 | const int channels_each_class)
170 | {
171 | CUDA_KERNEL_LOOP(index, count)
172 | {
173 | // The output is in order (n, ctop, ph, pw)
174 | int pw = index % pooled_width;
175 | int ph = (index / pooled_width) % pooled_height;
176 | int ctop = (index / pooled_width / pooled_height) % output_dim;
177 | int n = index / pooled_width / pooled_height / output_dim;
178 |
179 | // [start, end) interval for spatial sampling
180 | const T *offset_bottom_rois = bottom_rois + n * 5;
181 | int roi_batch_ind = offset_bottom_rois[0];
182 | T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
183 | T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
184 | T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
185 | T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
186 |
187 | // Force too small ROIs to be 1x1
188 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
189 | T roi_height = max(roi_end_h - roi_start_h, 0.1);
190 |
191 | // Compute w and h at bottom
192 | T bin_size_h = roi_height / static_cast<T>(pooled_height);
193 | T bin_size_w = roi_width / static_cast<T>(pooled_width);
194 |
195 | T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
196 | T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
197 |
198 | int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
199 | int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
200 | int class_id = ctop / channels_each_class;
201 | T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
202 | T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
203 |
204 | T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
205 | wstart += trans_x * roi_width;
206 | T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
207 | hstart += trans_y * roi_height;
208 |
209 | if (top_count[index] <= 0)
210 | {
211 | continue;
212 | }
213 | T diff_val = top_diff[index] / top_count[index];
214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
216 | int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
217 | int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
218 | gw = min(max(gw, 0), group_size - 1);
219 | gh = min(max(gh, 0), group_size - 1);
220 |
221 | for (int ih = 0; ih < sample_per_part; ih++)
222 | {
223 | for (int iw = 0; iw < sample_per_part; iw++)
224 | {
225 | T w = wstart + iw * sub_bin_size_w;
226 | T h = hstart + ih * sub_bin_size_h;
227 | // bilinear interpolation
228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
229 | {
230 | continue;
231 | }
232 | w = min(max(w, 0.), width - 1.);
233 | h = min(max(h, 0.), height - 1.);
234 | int c = (ctop * group_size + gh) * group_size + gw;
235 | // backward on feature
236 | int x0 = floor(w);
237 | int x1 = ceil(w);
238 | int y0 = floor(h);
239 | int y1 = ceil(h);
240 | T dist_x = w - x0, dist_y = h - y0;
241 | T q00 = (1 - dist_x) * (1 - dist_y);
242 | T q01 = (1 - dist_x) * dist_y;
243 | T q10 = dist_x * (1 - dist_y);
244 | T q11 = dist_x * dist_y;
245 | int bottom_index_base = c * height * width;
246 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
250 |
251 | if (no_trans)
252 | {
253 | continue;
254 | }
255 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
256 | T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
257 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
258 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
259 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
260 | diff_x *= roi_width;
261 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
262 | diff_y *= roi_height;
263 |
264 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
265 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
266 | }
267 | }
268 | }
269 | }
270 |
271 | std::tuple<at::Tensor, at::Tensor>
272 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
273 | const at::Tensor &bbox,
274 | const at::Tensor &trans,
275 | const int no_trans,
276 | const float spatial_scale,
277 | const int output_dim,
278 | const int group_size,
279 | const int pooled_size,
280 | const int part_size,
281 | const int sample_per_part,
282 | const float trans_std)
283 | {
284 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
285 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor");
286 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
287 |
288 | const int batch = input.size(0);
289 | const int channels = input.size(1);
290 | const int height = input.size(2);
291 | const int width = input.size(3);
292 | const int channels_trans = no_trans ? 2 : trans.size(1);
293 | const int num_bbox = bbox.size(0);
294 |
295 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
296 | auto pooled_height = pooled_size;
297 | auto pooled_width = pooled_size;
298 |
299 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
300 | long out_size = num_bbox * output_dim * pooled_height * pooled_width;
301 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options());
302 |
303 | const int num_classes = no_trans ? 1 : channels_trans / 2;
304 | const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
305 |
306 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
307 |
308 | if (out.numel() == 0)
309 | {
310 | THCudaCheck(cudaGetLastError());
311 | return std::make_tuple(out, top_count);
312 | }
313 |
314 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
315 | dim3 block(512);
316 |
317 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] {
318 | DeformablePSROIPoolForwardKernel<scalar_t><<<grid, block, 0, stream>>>(
319 | out_size,
320 | input.contiguous().data<scalar_t>(),
321 | spatial_scale,
322 | channels,
323 | height, width,
324 | pooled_height,
325 | pooled_width,
326 | bbox.contiguous().data<scalar_t>(),
327 | trans.contiguous().data<scalar_t>(),
328 | no_trans,
329 | trans_std,
330 | sample_per_part,
331 | output_dim,
332 | group_size,
333 | part_size,
334 | num_classes,
335 | channels_each_class,
336 | out.data<scalar_t>(),
337 | top_count.data<scalar_t>());
338 | });
339 | THCudaCheck(cudaGetLastError());
340 | return std::make_tuple(out, top_count);
341 | }
342 |
343 | std::tuple<at::Tensor, at::Tensor>
344 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
345 | const at::Tensor &input,
346 | const at::Tensor &bbox,
347 | const at::Tensor &trans,
348 | const at::Tensor &top_count,
349 | const int no_trans,
350 | const float spatial_scale,
351 | const int output_dim,
352 | const int group_size,
353 | const int pooled_size,
354 | const int part_size,
355 | const int sample_per_part,
356 | const float trans_std)
357 | {
358 | AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor");
359 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
360 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor");
361 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor");
362 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor");
363 |
364 | const int batch = input.size(0);
365 | const int channels = input.size(1);
366 | const int height = input.size(2);
367 | const int width = input.size(3);
368 | const int channels_trans = no_trans ? 2 : trans.size(1);
369 | const int num_bbox = bbox.size(0);
370 |
371 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal");
372 | auto pooled_height = pooled_size;
373 | auto pooled_width = pooled_size;
374 | long out_size = num_bbox * output_dim * pooled_height * pooled_width;
375 | const int num_classes = no_trans ? 1 : channels_trans / 2;
376 | const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
377 |
378 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options());
379 | auto trans_grad = at::zeros_like(trans);
380 |
381 | if (input_grad.numel() == 0)
382 | {
383 | THCudaCheck(cudaGetLastError());
384 | return std::make_tuple(input_grad, trans_grad);
385 | }
386 |
387 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L));
388 | dim3 block(512);
389 | cudaStream_t stream = at::cuda::getCurrentCUDAStream();
390 |
391 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] {
392 | DeformablePSROIPoolBackwardAccKernel<scalar_t><<<grid, block, 0, stream>>>(
393 | out_size,
394 | out_grad.contiguous().data<scalar_t>(),
395 | top_count.contiguous().data<scalar_t>(),
396 | num_bbox,
397 | spatial_scale,
398 | channels,
399 | height,
400 | width,
401 | pooled_height,
402 | pooled_width,
403 | output_dim,
404 | input_grad.contiguous().data<scalar_t>(),
405 | trans_grad.contiguous().data<scalar_t>(),
406 | input.contiguous().data<scalar_t>(),
407 | bbox.contiguous().data<scalar_t>(),
408 | trans.contiguous().data<scalar_t>(),
409 | no_trans,
410 | trans_std,
411 | sample_per_part,
412 | group_size,
413 | part_size,
414 | num_classes,
415 | channels_each_class);
416 | });
417 | THCudaCheck(cudaGetLastError());
418 | return std::make_tuple(input_grad, trans_grad);
419 | }
--------------------------------------------------------------------------------
/DCNv2/src/cuda/vision.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | at::Tensor
5 | dcn_v2_cuda_forward(const at::Tensor &input,
6 | const at::Tensor &weight,
7 | const at::Tensor &bias,
8 | const at::Tensor &offset,
9 | const at::Tensor &mask,
10 | const int kernel_h,
11 | const int kernel_w,
12 | const int stride_h,
13 | const int stride_w,
14 | const int pad_h,
15 | const int pad_w,
16 | const int dilation_h,
17 | const int dilation_w,
18 | const int deformable_group);
19 |
20 | std::vector<at::Tensor>
21 | dcn_v2_cuda_backward(const at::Tensor &input,
22 | const at::Tensor &weight,
23 | const at::Tensor &bias,
24 | const at::Tensor &offset,
25 | const at::Tensor &mask,
26 | const at::Tensor &grad_output,
27 | int kernel_h, int kernel_w,
28 | int stride_h, int stride_w,
29 | int pad_h, int pad_w,
30 | int dilation_h, int dilation_w,
31 | int deformable_group);
32 |
33 |
34 | std::tuple<at::Tensor, at::Tensor>
35 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
36 | const at::Tensor &bbox,
37 | const at::Tensor &trans,
38 | const int no_trans,
39 | const float spatial_scale,
40 | const int output_dim,
41 | const int group_size,
42 | const int pooled_size,
43 | const int part_size,
44 | const int sample_per_part,
45 | const float trans_std);
46 |
47 | std::tuple<at::Tensor, at::Tensor>
48 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
49 | const at::Tensor &input,
50 | const at::Tensor &bbox,
51 | const at::Tensor &trans,
52 | const at::Tensor &top_count,
53 | const int no_trans,
54 | const float spatial_scale,
55 | const int output_dim,
56 | const int group_size,
57 | const int pooled_size,
58 | const int part_size,
59 | const int sample_per_part,
60 | const float trans_std);
--------------------------------------------------------------------------------
/DCNv2/src/dcn_v2.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "cpu/vision.h"
4 |
5 | #ifdef WITH_CUDA
6 | #include "cuda/vision.h"
7 | #endif
8 |
9 | at::Tensor
10 | dcn_v2_forward(const at::Tensor &input,
11 | const at::Tensor &weight,
12 | const at::Tensor &bias,
13 | const at::Tensor &offset,
14 | const at::Tensor &mask,
15 | const int kernel_h,
16 | const int kernel_w,
17 | const int stride_h,
18 | const int stride_w,
19 | const int pad_h,
20 | const int pad_w,
21 | const int dilation_h,
22 | const int dilation_w,
23 | const int deformable_group)
24 | {
25 | if (input.type().is_cuda())
26 | {
27 | #ifdef WITH_CUDA
28 | return dcn_v2_cuda_forward(input, weight, bias, offset, mask,
29 | kernel_h, kernel_w,
30 | stride_h, stride_w,
31 | pad_h, pad_w,
32 | dilation_h, dilation_w,
33 | deformable_group);
34 | #else
35 | AT_ERROR("Not compiled with GPU support");
36 | #endif
37 | }
38 | AT_ERROR("Not implemented on the CPU");
39 | }
40 |
41 | std::vector<at::Tensor>
42 | dcn_v2_backward(const at::Tensor &input,
43 | const at::Tensor &weight,
44 | const at::Tensor &bias,
45 | const at::Tensor &offset,
46 | const at::Tensor &mask,
47 | const at::Tensor &grad_output,
48 | int kernel_h, int kernel_w,
49 | int stride_h, int stride_w,
50 | int pad_h, int pad_w,
51 | int dilation_h, int dilation_w,
52 | int deformable_group)
53 | {
54 | if (input.type().is_cuda())
55 | {
56 | #ifdef WITH_CUDA
57 | return dcn_v2_cuda_backward(input,
58 | weight,
59 | bias,
60 | offset,
61 | mask,
62 | grad_output,
63 | kernel_h, kernel_w,
64 | stride_h, stride_w,
65 | pad_h, pad_w,
66 | dilation_h, dilation_w,
67 | deformable_group);
68 | #else
69 | AT_ERROR("Not compiled with GPU support");
70 | #endif
71 | }
72 | AT_ERROR("Not implemented on the CPU");
73 | }
74 |
75 | std::tuple<at::Tensor, at::Tensor>
76 | dcn_v2_psroi_pooling_forward(const at::Tensor &input,
77 | const at::Tensor &bbox,
78 | const at::Tensor &trans,
79 | const int no_trans,
80 | const float spatial_scale,
81 | const int output_dim,
82 | const int group_size,
83 | const int pooled_size,
84 | const int part_size,
85 | const int sample_per_part,
86 | const float trans_std)
87 | {
88 | if (input.type().is_cuda())
89 | {
90 | #ifdef WITH_CUDA
91 | return dcn_v2_psroi_pooling_cuda_forward(input,
92 | bbox,
93 | trans,
94 | no_trans,
95 | spatial_scale,
96 | output_dim,
97 | group_size,
98 | pooled_size,
99 | part_size,
100 | sample_per_part,
101 | trans_std);
102 | #else
103 | AT_ERROR("Not compiled with GPU support");
104 | #endif
105 | }
106 | AT_ERROR("Not implemented on the CPU");
107 | }
108 |
109 | std::tuple<at::Tensor, at::Tensor>
110 | dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad,
111 | const at::Tensor &input,
112 | const at::Tensor &bbox,
113 | const at::Tensor &trans,
114 | const at::Tensor &top_count,
115 | const int no_trans,
116 | const float spatial_scale,
117 | const int output_dim,
118 | const int group_size,
119 | const int pooled_size,
120 | const int part_size,
121 | const int sample_per_part,
122 | const float trans_std)
123 | {
124 | if (input.type().is_cuda())
125 | {
126 | #ifdef WITH_CUDA
127 | return dcn_v2_psroi_pooling_cuda_backward(out_grad,
128 | input,
129 | bbox,
130 | trans,
131 | top_count,
132 | no_trans,
133 | spatial_scale,
134 | output_dim,
135 | group_size,
136 | pooled_size,
137 | part_size,
138 | sample_per_part,
139 | trans_std);
140 | #else
141 | AT_ERROR("Not compiled with GPU support");
142 | #endif
143 | }
144 | AT_ERROR("Not implemented on the CPU");
145 | }
146 |
147 | at::Tensor
148 | dcn_v2_cuda_forward_v2(const at::Tensor &input,
149 | const at::Tensor &weight,
150 | const at::Tensor &bias,
151 | const at::Tensor &offset,
152 | const at::Tensor &mask,
153 | const int64_t kernel_h,
154 | const int64_t kernel_w,
155 | const int64_t stride_h,
156 | const int64_t stride_w,
157 | const int64_t pad_h,
158 | const int64_t pad_w,
159 | const int64_t dilation_h,
160 | const int64_t dilation_w,
161 | const int64_t deformable_group);
--------------------------------------------------------------------------------
/DCNv2/src/vision.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dcn_v2.h"
3 | #include <torch/script.h>
4 |
5 | // static auto registry =
6 | // torch::jit::RegisterOperators("my_ops::dcn_v2_forward", &dcn_v2_forward)
7 | // .op("my_ops::dcn_v2_backward", &dcn_v2_backward)
8 | // .op("my_ops::dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward)
9 | // .op("my_ops::dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward);
10 |
11 | static auto registry =
12 | torch::jit::RegisterOperators("my_ops::dcn_v2_cuda_forward_v2", &dcn_v2_cuda_forward_v2);
13 |
14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
15 | {
16 | m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward");
17 | m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward");
18 | m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward");
19 | m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward");
20 | }
21 |
--------------------------------------------------------------------------------
/DCNv2/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from __future__ import absolute_import
3 | from __future__ import print_function
4 | from __future__ import division
5 |
6 | import time
7 | import torch
8 | import torch.nn as nn
9 | from torch.autograd import gradcheck
10 |
11 | from dcn_v2 import dcn_v2_conv, DCNv2, DCN
12 | from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling
13 |
14 | deformable_groups = 1
15 | N, inC, inH, inW = 2, 2, 4, 4
16 | outC = 2
17 | kH, kW = 3, 3
18 |
19 |
20 | def conv_identify(weight, bias):
21 | weight.data.zero_()
22 | bias.data.zero_()
23 | o, i, h, w = weight.shape
24 | y = h//2
25 | x = w//2
26 | for p in range(i):
27 | for q in range(o):
28 | if p == q:
29 | weight.data[q, p, y, x] = 1.0
30 |
31 |
32 | def check_zero_offset():
33 | conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW,
34 | kernel_size=(kH, kW),
35 | stride=(1, 1),
36 | padding=(1, 1),
37 | bias=True).cuda()
38 |
39 | conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW,
40 | kernel_size=(kH, kW),
41 | stride=(1, 1),
42 | padding=(1, 1),
43 | bias=True).cuda()
44 |
45 | dcn_v2 = DCNv2(inC, outC, (kH, kW),
46 | stride=1, padding=1, dilation=1,
47 | deformable_groups=deformable_groups).cuda()
48 |
49 | conv_offset.weight.data.zero_()
50 | conv_offset.bias.data.zero_()
51 | conv_mask.weight.data.zero_()
52 | conv_mask.bias.data.zero_()
53 | conv_identify(dcn_v2.weight, dcn_v2.bias)
54 |
55 | input = torch.randn(N, inC, inH, inW).cuda()
56 | offset = conv_offset(input)
57 | mask = conv_mask(input)
58 | mask = torch.sigmoid(mask)
59 | output = dcn_v2(input, offset, mask)
60 | output *= 2
61 | d = (input - output).abs().max()
62 | if d < 1e-10:
63 | print('Zero offset passed')
64 | else:
65 | print('Zero offset failed')
66 | print(input)
67 | print(output)
68 |
69 | def check_gradient_dconv():
70 |
71 | input = torch.rand(N, inC, inH, inW).cuda() * 0.01
72 | input.requires_grad = True
73 |
74 | offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2
75 | # offset.data.zero_()
76 | # offset.data -= 0.5
77 | offset.requires_grad = True
78 |
79 | mask = torch.rand(N, deformable_groups * 1 * kW * kH, inH, inW).cuda()
80 | # mask.data.zero_()
81 | mask.requires_grad = True
82 | mask = torch.sigmoid(mask)
83 |
84 | weight = torch.randn(outC, inC, kH, kW).cuda()
85 | weight.requires_grad = True
86 |
87 | bias = torch.rand(outC).cuda()
88 | bias.requires_grad = True
89 |
90 | stride = 1
91 | padding = 1
92 | dilation = 1
93 |
94 | print('check_gradient_dconv: ',
95 | gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias,
96 | stride, padding, dilation, deformable_groups),
97 | eps=1e-3, atol=1e-4, rtol=1e-2))
98 |
99 |
100 | def check_pooling_zero_offset():
101 |
102 | input = torch.randn(2, 16, 64, 64).cuda().zero_()
103 | input[0, :, 16:26, 16:26] = 1.
104 | input[1, :, 10:20, 20:30] = 2.
105 | rois = torch.tensor([
106 | [0, 65, 65, 103, 103],
107 | [1, 81, 41, 119, 79],
108 | ]).cuda().float()
109 | pooling = DCNv2Pooling(spatial_scale=1.0 / 4,
110 | pooled_size=7,
111 | output_dim=16,
112 | no_trans=True,
113 | group_size=1,
114 | trans_std=0.0).cuda()
115 |
116 | out = pooling(input, rois, input.new())
117 | s = ', '.join(['%f' % out[i, :, :, :].mean().item()
118 | for i in range(rois.shape[0])])
119 | print(s)
120 |
121 | dpooling = DCNv2Pooling(spatial_scale=1.0 / 4,
122 | pooled_size=7,
123 | output_dim=16,
124 | no_trans=False,
125 | group_size=1,
126 | trans_std=0.0).cuda()
127 | offset = torch.randn(20, 2, 7, 7).cuda().zero_()
128 | dout = dpooling(input, rois, offset)
129 | s = ', '.join(['%f' % dout[i, :, :, :].mean().item()
130 | for i in range(rois.shape[0])])
131 | print(s)
132 |
133 |
134 | def check_gradient_dpooling():
135 | input = torch.randn(2, 3, 5, 5).cuda() * 0.01
136 | N = 4
137 | batch_inds = torch.randint(2, (N, 1)).cuda().float()
138 | x = torch.rand((N, 1)).cuda().float() * 15
139 | y = torch.rand((N, 1)).cuda().float() * 15
140 | w = torch.rand((N, 1)).cuda().float() * 10
141 | h = torch.rand((N, 1)).cuda().float() * 10
142 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
143 | offset = torch.randn(N, 2, 3, 3).cuda()
144 | input.requires_grad = True
145 | offset.requires_grad = True
146 |
147 | spatial_scale = 1.0 / 4
148 | pooled_size = 3
149 | output_dim = 3
150 | no_trans = 0
151 | group_size = 1
152 | trans_std = 0.0
153 | sample_per_part = 4
154 | part_size = pooled_size
155 |
156 | print('check_gradient_dpooling:',
157 | gradcheck(dcn_v2_pooling, (input, rois, offset,
158 | spatial_scale,
159 | pooled_size,
160 | output_dim,
161 | no_trans,
162 | group_size,
163 | part_size,
164 | sample_per_part,
165 | trans_std),
166 | eps=1e-4))
167 |
168 |
169 | def example_dconv():
170 | input = torch.randn(2, 64, 128, 128).cuda()
171 | # wrap all things (offset and mask) in DCN
172 | dcn = DCN(64, 64, kernel_size=(3, 3), stride=1,
173 | padding=1, deformable_groups=2).cuda()
174 | # print(dcn.weight.shape, input.shape)
175 | output = dcn(input)
176 | target = output.new(*output.size())
177 | target.data.uniform_(-0.01, 0.01)
178 | error = (target - output).mean()
179 | error.backward()
180 | print(output.shape)
181 |
182 |
183 | def example_dpooling():
184 | input = torch.randn(2, 32, 64, 64).cuda()
185 | batch_inds = torch.randint(2, (20, 1)).cuda().float()
186 | x = torch.randint(256, (20, 1)).cuda().float()
187 | y = torch.randint(256, (20, 1)).cuda().float()
188 | w = torch.randint(64, (20, 1)).cuda().float()
189 | h = torch.randint(64, (20, 1)).cuda().float()
190 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
191 | offset = torch.randn(20, 2, 7, 7).cuda()
192 | input.requires_grad = True
193 | offset.requires_grad = True
194 |
195 | # normal roi_align
196 | pooling = DCNv2Pooling(spatial_scale=1.0 / 4,
197 | pooled_size=7,
198 | output_dim=32,
199 | no_trans=True,
200 | group_size=1,
201 | trans_std=0.1).cuda()
202 |
203 | # deformable pooling
204 | dpooling = DCNv2Pooling(spatial_scale=1.0 / 4,
205 | pooled_size=7,
206 | output_dim=32,
207 | no_trans=False,
208 | group_size=1,
209 | trans_std=0.1).cuda()
210 |
211 | out = pooling(input, rois, offset)
212 | dout = dpooling(input, rois, offset)
213 | print(out.shape)
214 | print(dout.shape)
215 |
216 | target_out = out.new(*out.size())
217 | target_out.data.uniform_(-0.01, 0.01)
218 | target_dout = dout.new(*dout.size())
219 | target_dout.data.uniform_(-0.01, 0.01)
220 | e = (target_out - out).mean()
221 | e.backward()
222 | e = (target_dout - dout).mean()
223 | e.backward()
224 |
225 |
226 | def example_mdpooling():
227 | input = torch.randn(2, 32, 64, 64).cuda()
228 | input.requires_grad = True
229 | batch_inds = torch.randint(2, (20, 1)).cuda().float()
230 | x = torch.randint(256, (20, 1)).cuda().float()
231 | y = torch.randint(256, (20, 1)).cuda().float()
232 | w = torch.randint(64, (20, 1)).cuda().float()
233 | h = torch.randint(64, (20, 1)).cuda().float()
234 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
235 |
236 | # mdformable pooling (V2)
237 | dpooling = DCNPooling(spatial_scale=1.0 / 4,
238 | pooled_size=7,
239 | output_dim=32,
240 | no_trans=False,
241 | group_size=1,
242 | trans_std=0.1,
243 | deform_fc_dim=1024).cuda()
244 |
245 | dout = dpooling(input, rois)
246 | target = dout.new(*dout.size())
247 | target.data.uniform_(-0.1, 0.1)
248 | error = (target - dout).mean()
249 | error.backward()
250 | print(dout.shape)
251 |
252 |
253 | if __name__ == '__main__':
254 |
255 | example_dconv()
256 | example_dpooling()
257 | example_mdpooling()
258 |
259 | check_pooling_zero_offset()
260 | # zero offset check
261 | if inC == outC:
262 | check_zero_offset()
263 |
264 | check_gradient_dpooling()
265 | check_gradient_dconv()
266 | # """
267 | # ****** Note: the "backward is not reentrant" error may not be a serious problem,
268 | # ****** since the max error is less than 1e-7.
269 | # ****** Still looking for what triggers this problem.
270 | # """
271 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Charles Shang
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Convert CenterNet PyTorch models to TorchScript for LibTorch
2 | Converts the official DLA-34 model (see `demo.py` for the export script).
3 |
4 | ## C++ call
5 | Refer to the `dcn_cpp_plugin` directory.
--------------------------------------------------------------------------------
/dcn_cpp_plugin/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
2 | project(dcn_v2_cuda_forward_v2)
3 |
4 | #add_compile_options(-std=c++11)
5 |
6 | #add_definitions(-D WITH_CUDA)
7 |
8 | set(Torch_DIR /usr/local/libtorch/share/cmake/Torch)
9 | find_package(Torch REQUIRED)
10 |
11 | include_directories(/usr/include/python3.5m)
12 | include_directories(/usr/include/python2.7/)
13 |
14 |
15 | #include_directories(/usr/local/cuda/include)
16 | #link_directories(/usr/local/cuda/lib64)
17 |
18 | set(CUDA_HOST_COMPILATION_CPP ON)
19 | #set(TORCH_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__")
20 |
21 | #set(CUDA_NVCC_FLAGS -std=c++11
22 | #-DCUDA_HAS_FP16=1
23 | #-D__CUDA_NO_HALF_OPERATORS__
24 | #-D__CUDA_NO_HALF_CONVERSIONS__
25 | #-D__CUDA_NO_HALF2_OPERATORS__)
26 |
27 | set(CUDA_NVCC_FLAGS -std=c++11
28 | -D__CUDA_NO_HALF_OPERATORS__ )
29 |
30 | #set(CUDA_NVCC_FLAGS -std=c++11 -Xcompiler -fexceptions -Xcompiler -fPIC
31 | #-gencode arch=compute_30,code=sm_30
32 | #-gencode arch=compute_35,code=sm_35
33 | #-gencode arch=compute_50,code=sm_50
34 | #-gencode arch=compute_60,code=sm_60
35 | #-gencode arch=compute_60,code=compute_60)
36 |
37 |
38 | cuda_add_library(${PROJECT_NAME} SHARED
39 | vision.cpp
40 | dcn_v2_cuda.cu
41 | dcn_v2_im2col_cuda.cu
42 | )
43 |
44 | # Enable C++11
45 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_range_for)
46 | # Link against LibTorch
47 | target_link_libraries(${PROJECT_NAME} "${TORCH_LIBRARIES}")
48 |
49 | install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION lib)
50 |
--------------------------------------------------------------------------------
/dcn_cpp_plugin/README.md:
--------------------------------------------------------------------------------
1 | # DCN C++ plugin
2 |
3 | ## Usage
4 | void *handle = dlopen("libdcn_v2_cuda_forward_v2.so", RTLD_LAZY);
5 |
6 | int gpu_id = 0;
7 | torch::jit::script::Module module =
8 | torch::jit::load("centernet.pt", torch::Device(torch::DeviceType::CUDA, gpu_id));
9 |
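10 | A minimal end-to-end sketch of the same flow is shown below. The library name, model path, GPU id, and the 1x3x512x512 input (the shape traced in `demo.py`) are assumptions to adjust for your setup; it also assumes a LibTorch version where `torch::jit::load` returns a `Module` by value, as in the snippet above.
11 |
12 | ```cpp
13 | #include <dlfcn.h>
14 | #include <torch/script.h>
15 |
16 | #include <iostream>
17 | #include <vector>
18 |
19 | int main() {
20 |     // Load the plugin first so my_ops::dcn_v2_cuda_forward_v2 is registered
21 |     // before the traced graph that calls it is deserialized.
22 |     void *handle = dlopen("libdcn_v2_cuda_forward_v2.so", RTLD_LAZY);
23 |     if (!handle) {
24 |         std::cerr << "dlopen failed: " << dlerror() << std::endl;
25 |         return 1;
26 |     }
27 |
28 |     int gpu_id = 0;
29 |     torch::Device device(torch::DeviceType::CUDA, gpu_id);
30 |     torch::jit::script::Module module = torch::jit::load("centernet.pt", device);
31 |
32 |     // Run one forward pass on a dummy image of the traced size.
33 |     std::vector<torch::jit::IValue> inputs;
34 |     inputs.push_back(torch::zeros({1, 3, 512, 512},
35 |                                   torch::TensorOptions().dtype(torch::kFloat32).device(device)));
36 |     auto output = module.forward(inputs);
37 |
38 |     dlclose(handle);
39 |     return 0;
40 | }
41 | ```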
--------------------------------------------------------------------------------
/dcn_cpp_plugin/dcn_v2.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef DCN_V2_H
3 | #define DCN_V2_H
4 | #include <ATen/ATen.h>
5 |
6 | #ifdef __cplusplus
7 | extern "C"
8 | {
9 | #endif
10 |
11 |
12 | at::Tensor
13 | dcn_v2_cuda_forward(const at::Tensor &input,
14 | const at::Tensor &weight,
15 | const at::Tensor &bias,
16 | const at::Tensor &offset,
17 | const at::Tensor &mask,
18 | const int64_t kernel_h,
19 | const int64_t kernel_w,
20 | const int64_t stride_h,
21 | const int64_t stride_w,
22 | const int64_t pad_h,
23 | const int64_t pad_w,
24 | const int64_t dilation_h,
25 | const int64_t dilation_w,
26 | const int64_t deformable_group);
27 |
28 | #ifdef __cplusplus
29 | }
30 | #endif
31 |
32 | #endif
--------------------------------------------------------------------------------
/dcn_cpp_plugin/dcn_v2_cuda.cu:
--------------------------------------------------------------------------------
1 | #include "dcn_v2_im2col_cuda.h"
2 | #include "dcn_v2.h"
3 | #include <vector>
4 | #include <cmath>
5 |
6 | #include <ATen/ATen.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 | #include <THC/THC.h>
9 | #include <THC/THCAtomics.cuh>
10 |
11 | //extern THCState *state;
12 |
13 | //THCState *state;
14 |
15 | // THCState *state = at::globalContext().thc_state;
16 |
17 | THCState *state = at::globalContext().lazyInitCUDA();
18 | //THCState *state = at::globalContext().getTHCState();
19 |
20 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b,
21 | float **columns_b, const float **ones_b,
22 | const float **weight_b, const float **bias_b,
23 | float *input, float *output,
24 | float *columns, float *ones,
25 | float *weight, float *bias,
26 | const int input_stride, const int output_stride,
27 | const int columns_stride, const int ones_stride,
28 | const int num_batches)
29 | {
30 | const int idx = blockIdx.x * blockDim.x + threadIdx.x;
31 | if (idx < num_batches)
32 | {
33 | input_b[idx] = input + idx * input_stride;
34 | output_b[idx] = output + idx * output_stride;
35 | columns_b[idx] = columns + idx * columns_stride;
36 | ones_b[idx] = ones + idx * ones_stride;
37 | // share weights and bias within a Mini-Batch
38 | weight_b[idx] = weight;
39 | bias_b[idx] = bias;
40 | }
41 | }
42 |
43 | at::Tensor
44 | dcn_v2_cuda_forward(const at::Tensor &input,
45 | const at::Tensor &weight,
46 | const at::Tensor &bias,
47 | const at::Tensor &offset,
48 | const at::Tensor &mask,
49 | const int64_t kernel_h,
50 | const int64_t kernel_w,
51 | const int64_t stride_h,
52 | const int64_t stride_w,
53 | const int64_t pad_h,
54 | const int64_t pad_w,
55 | const int64_t dilation_h,
56 | const int64_t dilation_w,
57 | const int64_t deformable_group)
58 | {
59 | using scalar_t = float;
60 | //THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask));
61 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
62 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor");
63 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor");
64 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor");
65 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor");
66 | const int batch = input.size(0);
67 | const int channels = input.size(1);
68 | const int height = input.size(2);
69 | const int width = input.size(3);
70 |
71 | const int channels_out = weight.size(0);
72 | const int channels_kernel = weight.size(1);
73 | const int kernel_h_ = weight.size(2);
74 | const int kernel_w_ = weight.size(3);
75 |
76 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
77 | "Input shape and kernel shape won't match: (%d x %d vs %d x %d).", kernel_h, kernel_w, kernel_h_, kernel_w_);
78 |
79 | AT_ASSERTM(channels == channels_kernel,
80 | "Input shape and kernel channels won't match: (%d vs %d).", channels, channels_kernel);
81 |
82 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
83 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
84 |
85 | auto ones = at::ones({batch, height_out, width_out}, input.options());
86 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options());
87 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options());
88 |
89 | int matrices_size = batch * sizeof(float *);
90 |
91 | auto input_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
92 | auto output_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
93 | auto columns_b = static_cast<float **>(THCudaMalloc(state, matrices_size));
94 | auto ones_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
95 | auto weight_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
96 | auto bias_b = static_cast<const float **>(THCudaMalloc(state, matrices_size));
97 |
98 | const int block = 128;
99 | const int grid = (batch + block - 1) / block;
100 |
101 | createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
102 | (const float**)input_b, output_b,
103 | columns_b, ones_b,
104 | weight_b, bias_b,
105 | input.data<scalar_t>(),
106 | output.data<scalar_t>(),
107 | columns.data<scalar_t>(),
108 | ones.data<scalar_t>(),
109 | weight.data<scalar_t>(),
110 | bias.data<scalar_t>(),
111 | channels * width * height,
112 | channels_out * width_out * height_out,
113 | channels * kernel_h * kernel_w * height_out * width_out,
114 | height_out * width_out,
115 | batch);
116 |
117 | long m_ = channels_out;
118 | long n_ = height_out * width_out;
119 | long k_ = 1;
120 | THCudaBlas_SgemmBatched(state,
121 | 't',
122 | 'n',
123 | n_,
124 | m_,
125 | k_,
126 | 1.0f,
127 | ones_b, k_,
128 | bias_b, k_,
129 | 0.0f,
130 | output_b, n_,
131 | batch);
132 |
133 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
134 | input.data<scalar_t>(),
135 | offset.data<scalar_t>(),
136 | mask.data<scalar_t>(),
137 | batch, channels, height, width,
138 | height_out, width_out, kernel_h, kernel_w,
139 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
140 | deformable_group,
141 | columns.data<scalar_t>());
142 |
143 | long m = channels_out;
144 | long n = height_out * width_out;
145 | long k = channels * kernel_h * kernel_w;
146 | THCudaBlas_SgemmBatched(state,
147 | 'n',
148 | 'n',
149 | n,
150 | m,
151 | k,
152 | 1.0f,
153 | (const float **)columns_b, n,
154 | weight_b, k,
155 | 1.0f,
156 | output_b, n,
157 | batch);
158 |
159 | THCudaFree(state, input_b);
160 | THCudaFree(state, output_b);
161 | THCudaFree(state, columns_b);
162 | THCudaFree(state, ones_b);
163 | THCudaFree(state, weight_b);
164 | THCudaFree(state, bias_b);
165 | return output;
166 | }
167 |
168 |
--------------------------------------------------------------------------------
/dcn_cpp_plugin/dcn_v2_im2col_cuda.cu:
--------------------------------------------------------------------------------
1 | #include "dcn_v2_im2col_cuda.h"
2 | #include <cstdio>
3 | #include <algorithm>
4 | #include <cstring>
5 |
6 | #include <ATen/ATen.h>
7 | #include <ATen/cuda/CUDAContext.h>
8 |
9 | #include <THC/THC.h>
10 | #include <THC/THCAtomics.cuh>
11 | #include <THC/THCDeviceUtils.cuh>
12 |
13 | #define CUDA_KERNEL_LOOP(i, n) \
14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
15 | i < (n); \
16 | i += blockDim.x * gridDim.x)
17 |
18 | const int CUDA_NUM_THREADS = 1024;
19 | inline int GET_BLOCKS(const int N)
20 | {
21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
22 | }
23 |
24 |
25 | __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width,
26 | const int height, const int width, float h, float w)
27 | {
28 | int h_low = floor(h);
29 | int w_low = floor(w);
30 | int h_high = h_low + 1;
31 | int w_high = w_low + 1;
32 |
33 | float lh = h - h_low;
34 | float lw = w - w_low;
35 | float hh = 1 - lh, hw = 1 - lw;
36 |
37 | float v1 = 0;
38 | if (h_low >= 0 && w_low >= 0)
39 | v1 = bottom_data[h_low * data_width + w_low];
40 | float v2 = 0;
41 | if (h_low >= 0 && w_high <= width - 1)
42 | v2 = bottom_data[h_low * data_width + w_high];
43 | float v3 = 0;
44 | if (h_high <= height - 1 && w_low >= 0)
45 | v3 = bottom_data[h_high * data_width + w_low];
46 | float v4 = 0;
47 | if (h_high <= height - 1 && w_high <= width - 1)
48 | v4 = bottom_data[h_high * data_width + w_high];
49 |
50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
51 |
52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
53 | return val;
54 | }
55 | __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w,
56 | const int height, const int width, const float *im_data,
57 | const int data_width, const int bp_dir)
58 | {
59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
60 | {
61 | //empty
62 | return 0;
63 | }
64 |
65 | int argmax_h_low = floor(argmax_h);
66 | int argmax_w_low = floor(argmax_w);
67 | int argmax_h_high = argmax_h_low + 1;
68 | int argmax_w_high = argmax_w_low + 1;
69 |
70 | float weight = 0;
71 |
72 | if (bp_dir == 0)
73 | {
74 | if (argmax_h_low >= 0 && argmax_w_low >= 0)
75 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
76 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
77 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
78 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
79 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
80 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
81 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
82 | }
83 | else if (bp_dir == 1)
84 | {
85 | if (argmax_h_low >= 0 && argmax_w_low >= 0)
86 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
87 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
88 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
89 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
90 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
91 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
92 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
93 | }
94 |
95 | return weight;
96 | }
97 |
98 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n,
99 | const float *data_im, const float *data_offset, const float *data_mask,
100 | const int height, const int width, const int kernel_h, const int kernel_w,
101 | const int pad_h, const int pad_w,
102 | const int stride_h, const int stride_w,
103 | const int dilation_h, const int dilation_w,
104 | const int channel_per_deformable_group,
105 | const int batch_size, const int num_channels, const int deformable_group,
106 | const int height_col, const int width_col,
107 | float *data_col)
108 | {
109 | // launch channels * batch_size * height_col * width_col cores
110 | CUDA_KERNEL_LOOP(index, n)
111 | {
112 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow)
113 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis
114 |
115 | // index index of output matrix
116 | const int w_col = index % width_col;
117 | const int h_col = (index / width_col) % height_col;
118 | // const int b_col = (index / width_col / height_col) % batch_size;
119 | const int b_col = (index / width_col / height_col / num_channels) % batch_size;
120 | // const int c_im = (index / width_col / height_col) / batch_size;
121 | const int c_im = (index / width_col / height_col) % num_channels;
122 | // const int c_col = c_im * kernel_h * kernel_w;
123 | const int c_col = c_im * kernel_h * kernel_w;
124 |
125 | // compute deformable group index
126 | const int deformable_group_index = c_im / channel_per_deformable_group;
127 |
128 | const int h_in = h_col * stride_h - pad_h;
129 | const int w_in = w_col * stride_w - pad_w;
130 |
131 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
132 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
133 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
134 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
135 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
136 |
137 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
138 |
139 | for (int i = 0; i < kernel_h; ++i)
140 | {
141 | for (int j = 0; j < kernel_w; ++j)
142 | {
143 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
144 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
145 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
146 | const float offset_h = data_offset_ptr[data_offset_h_ptr];
147 | const float offset_w = data_offset_ptr[data_offset_w_ptr];
148 | const float mask = data_mask_ptr[data_mask_hw_ptr];
149 | float val = static_cast<float>(0);
150 | const float h_im = h_in + i * dilation_h + offset_h;
151 | const float w_im = w_in + j * dilation_w + offset_w;
152 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
153 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
154 | {
155 | //const float map_h = i * dilation_h + offset_h;
156 | //const float map_w = j * dilation_w + offset_w;
157 | //const int cur_height = height - h_in;
158 | //const int cur_width = width - w_in;
159 | //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
160 | val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
161 | }
162 | *data_col_ptr = val * mask;
163 | // data_col_ptr += batch_size * height_col * width_col;
164 | data_col_ptr += height_col * width_col;
165 | }
166 | }
167 | }
168 | }
169 |
170 | void modulated_deformable_im2col_cuda(cudaStream_t stream,
171 | const float* data_im, const float* data_offset, const float* data_mask,
172 | const int batch_size, const int channels, const int height_im, const int width_im,
173 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
174 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
175 | const int dilation_h, const int dilation_w,
176 | const int deformable_group, float* data_col) {
177 | // num_axes should be smaller than block size
178 | const int channel_per_deformable_group = channels / deformable_group;
179 | const int num_kernels = channels * batch_size * height_col * width_col;
180 | modulated_deformable_im2col_gpu_kernel
181 | <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS,
182 | 0, stream>>>(
183 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w,
184 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
185 | batch_size, channels, deformable_group, height_col, width_col, data_col);
186 |
187 | cudaError_t err = cudaGetLastError();
188 | if (err != cudaSuccess)
189 | {
190 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
191 | }
192 |
193 | }
--------------------------------------------------------------------------------
/dcn_cpp_plugin/dcn_v2_im2col_cuda.h:
--------------------------------------------------------------------------------
1 | #ifndef DCN_V2_IM2COL_CUDA
2 | #define DCN_V2_IM2COL_CUDA
3 |
4 | #ifdef __cplusplus
5 | extern "C"
6 | {
7 | #endif
8 |
9 | void modulated_deformable_im2col_cuda(cudaStream_t stream,
10 | const float *data_im, const float *data_offset, const float *data_mask,
11 | const int batch_size, const int channels, const int height_im, const int width_im,
12 | const int height_col, const int width_col, const int kernel_h, const int kernel_w,
13 | const int pad_h, const int pad_w, const int stride_h, const int stride_w,
14 | const int dilation_h, const int dilation_w,
15 | const int deformable_group, float *data_col);
16 |
17 | #ifdef __cplusplus
18 | }
19 | #endif
20 |
21 | #endif
--------------------------------------------------------------------------------
/dcn_cpp_plugin/vision.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dcn_v2.h"
3 | #include <torch/script.h>
4 |
5 |
6 | at::Tensor
7 | dcn_v2_cuda_forward_v2(const at::Tensor &input,
8 | const at::Tensor &weight,
9 | const at::Tensor &bias,
10 | const at::Tensor &offset,
11 | const at::Tensor &mask,
12 | const int64_t kernel_h,
13 | const int64_t kernel_w,
14 | const int64_t stride_h,
15 | const int64_t stride_w,
16 | const int64_t pad_h,
17 | const int64_t pad_w,
18 | const int64_t dilation_h,
19 | const int64_t dilation_w,
20 | const int64_t deformable_group)
21 | {
22 | return dcn_v2_cuda_forward(input,
23 | weight,
24 | bias,
25 | offset,
26 | mask,
27 | kernel_h,
28 | kernel_w,
29 | stride_h,
30 | stride_w,
31 | pad_h,
32 | pad_w,
33 | dilation_h,
34 | dilation_w,
35 | deformable_group);
36 | }
37 |
38 | static auto registry =
39 | torch::jit::RegisterOperators("my_ops::dcn_v2_cuda_forward_v2", &dcn_v2_cuda_forward_v2);
40 |
--------------------------------------------------------------------------------
/dcn_cpp_plugin/vision.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 | #include <ATen/ATen.h>
4 |
5 | at::Tensor
6 | dcn_v2_cuda_forward_v2(const at::Tensor &input,
7 | const at::Tensor &weight,
8 | const at::Tensor &bias,
9 | const at::Tensor &offset,
10 | const at::Tensor &mask,
11 | const int64_t kernel_h,
12 | const int64_t kernel_w,
13 | const int64_t stride_h,
14 | const int64_t stride_w,
15 | const int64_t pad_h,
16 | const int64_t pad_w,
17 | const int64_t dilation_h,
18 | const int64_t dilation_w,
19 | const int64_t deformable_group);
20 |
21 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from model import create_model, load_model
4 | import torch
5 |
6 | if __name__ == '__main__':
7 | num_classes = 80
8 | head_conv = 256
9 | heads = {'hm': num_classes,
10 | 'wh': 2,
11 | 'reg': 2}
12 |
13 | load_model_path = 'ctdet_coco_dla_2x.pth'
14 | save_script_pt = 'centernet.pt'
15 | device = 0
16 |
17 | model = create_model('dla_34', heads, head_conv)
18 | model = load_model(model, load_model_path)
19 | model = model.to(device)
20 | model.eval()
21 |
22 | input_var = torch.zeros([1, 3, 512, 512], dtype=torch.float32).cuda()
23 |
24 | traced_script_module = torch.jit.trace(model, input_var)
25 | traced_script_module.save(save_script_pt)
26 | traced_script_module = torch.jit.load(save_script_pt)
27 |
--------------------------------------------------------------------------------
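`demo.py` above traces the DLA-34 CenterNet model and writes `centernet.pt`. The traced module keeps the output contract of `DLASeg.forward` in `pose_dla_dcn.py`: a `(hm, wh, reg)` tuple. A minimal consumption sketch, assuming the defaults shown in `demo.py`:

```python
import torch

model = torch.jit.load('centernet.pt').cuda().eval()  # file produced by demo.py

with torch.no_grad():
    img = torch.zeros(1, 3, 512, 512, device='cuda')  # same shape used for tracing
    hm, wh, reg = model(img)                          # DLASeg.forward returns (hm, wh, reg)

# With down_ratio=4 the heads are predicted on a 512/4 = 128 grid.
print(hm.shape, wh.shape, reg.shape)  # (1, 80, 128, 128) (1, 2, 128, 128) (1, 2, 128, 128)
```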
/model.py:
--------------------------------------------------------------------------------
1 |
2 | import torch
3 | from pose_dla_dcn import get_pose_net as get_model
4 |
5 |
6 | def create_model(arch, heads, head_conv):
7 | num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0
8 | arch = arch[:arch.find('_')] if '_' in arch else arch
9 | model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv)
10 | return model
11 |
12 |
13 | def load_model(model, model_path, optimizer=None, resume=False,
14 | lr=None, lr_step=None):
15 | start_epoch = 0
16 | checkpoint = torch.load(
17 | model_path, map_location=lambda storage, loc: storage)
18 | print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch']))
19 | state_dict_ = checkpoint['state_dict']
20 | state_dict = {}
21 |
22 | # convert a DataParallel state_dict (keys prefixed with 'module.') to plain model keys
23 | for k in state_dict_:
24 | if k.startswith('module') and not k.startswith('module_list'):
25 | state_dict[k[7:]] = state_dict_[k]
26 | else:
27 | state_dict[k] = state_dict_[k]
28 | model_state_dict = model.state_dict()
29 |
30 | # check loaded parameters and created model parameters
31 | for k in state_dict:
32 | if k in model_state_dict:
33 | if state_dict[k].shape != model_state_dict[k].shape:
34 | print('Skip loading parameter {}, required shape {}, '
35 | 'loaded shape {}.'.format(
36 | k, model_state_dict[k].shape, state_dict[k].shape))
37 | state_dict[k] = model_state_dict[k]
38 | else:
39 | print('Drop parameter {}.'.format(k))
40 | for k in model_state_dict:
41 | if not (k in state_dict):
42 | print('No param {}.'.format(k))
43 | state_dict[k] = model_state_dict[k]
44 | model.load_state_dict(state_dict, strict=False)
45 |
46 | # resume optimizer parameters
47 | if optimizer is not None and resume:
48 | if 'optimizer' in checkpoint:
49 | optimizer.load_state_dict(checkpoint['optimizer'])
50 | start_epoch = checkpoint['epoch']
51 | start_lr = lr
52 | for step in lr_step:
53 | if start_epoch >= step:
54 | start_lr *= 0.1
55 | for param_group in optimizer.param_groups:
56 | param_group['lr'] = start_lr
57 | print('Resumed optimizer with start lr', start_lr)
58 | else:
59 | print('No optimizer parameters in checkpoint.')
60 | if optimizer is not None:
61 | return model, optimizer, start_epoch
62 | else:
63 | return model
64 |
--------------------------------------------------------------------------------
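`load_model` above also supports resuming an optimizer when the checkpoint stores one; `demo.py` only exercises the weight-loading path. A hedged sketch of the resume branch; the learning rate and `lr_step` values here are placeholders, not defaults taken from this repo:

```python
import torch
from model import create_model, load_model

heads = {'hm': 80, 'wh': 2, 'reg': 2}
model = create_model('dla_34', heads, head_conv=256)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Returns (model, optimizer, start_epoch) because an optimizer was passed in;
# the lr is decayed by 0.1 for every lr_step boundary already passed.
model, optimizer, start_epoch = load_model(
    model, 'ctdet_coco_dla_2x.pth', optimizer=optimizer,
    resume=True, lr=1e-4, lr_step=[90, 120])
```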
/pose_dla_dcn.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import math
7 | import logging
8 | import numpy as np
9 | from os.path import join
10 |
11 | import torch
12 | from torch import nn
13 | import torch.nn.functional as F
14 | import torch.utils.model_zoo as model_zoo
15 |
16 | from DCNv2.dcn_v2 import DCN
17 |
18 | BN_MOMENTUM = 0.1
19 | logger = logging.getLogger(__name__)
20 |
21 | def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
22 | return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
23 |
24 |
25 | def conv3x3(in_planes, out_planes, stride=1):
26 | "3x3 convolution with padding"
27 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
28 | padding=1, bias=False)
29 |
30 |
31 | class BasicBlock(nn.Module):
32 | def __init__(self, inplanes, planes, stride=1, dilation=1):
33 | super(BasicBlock, self).__init__()
34 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
35 | stride=stride, padding=dilation,
36 | bias=False, dilation=dilation)
37 | self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
38 | self.relu = nn.ReLU(inplace=True)
39 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
40 | stride=1, padding=dilation,
41 | bias=False, dilation=dilation)
42 | self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
43 | self.stride = stride
44 |
45 | def forward(self, x, residual=None):
46 | if residual is None:
47 | residual = x
48 |
49 | out = self.conv1(x)
50 | out = self.bn1(out)
51 | out = self.relu(out)
52 |
53 | out = self.conv2(out)
54 | out = self.bn2(out)
55 |
56 | out += residual
57 | out = self.relu(out)
58 |
59 | return out
60 |
61 |
62 | class Bottleneck(nn.Module):
63 | expansion = 2
64 |
65 | def __init__(self, inplanes, planes, stride=1, dilation=1):
66 | super(Bottleneck, self).__init__()
67 | expansion = Bottleneck.expansion
68 | bottle_planes = planes // expansion
69 | self.conv1 = nn.Conv2d(inplanes, bottle_planes,
70 | kernel_size=1, bias=False)
71 | self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
72 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
73 | stride=stride, padding=dilation,
74 | bias=False, dilation=dilation)
75 | self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
76 | self.conv3 = nn.Conv2d(bottle_planes, planes,
77 | kernel_size=1, bias=False)
78 | self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
79 | self.relu = nn.ReLU(inplace=True)
80 | self.stride = stride
81 |
82 | def forward(self, x, residual=None):
83 | if residual is None:
84 | residual = x
85 |
86 | out = self.conv1(x)
87 | out = self.bn1(out)
88 | out = self.relu(out)
89 |
90 | out = self.conv2(out)
91 | out = self.bn2(out)
92 | out = self.relu(out)
93 |
94 | out = self.conv3(out)
95 | out = self.bn3(out)
96 |
97 | out += residual
98 | out = self.relu(out)
99 |
100 | return out
101 |
102 |
103 | class BottleneckX(nn.Module):
104 | expansion = 2
105 | cardinality = 32
106 |
107 | def __init__(self, inplanes, planes, stride=1, dilation=1):
108 | super(BottleneckX, self).__init__()
109 | cardinality = BottleneckX.cardinality
110 | # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
111 | # bottle_planes = dim * cardinality
112 | bottle_planes = planes * cardinality // 32
113 | self.conv1 = nn.Conv2d(inplanes, bottle_planes,
114 | kernel_size=1, bias=False)
115 | self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
116 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
117 | stride=stride, padding=dilation, bias=False,
118 | dilation=dilation, groups=cardinality)
119 | self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
120 | self.conv3 = nn.Conv2d(bottle_planes, planes,
121 | kernel_size=1, bias=False)
122 | self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
123 | self.relu = nn.ReLU(inplace=True)
124 | self.stride = stride
125 |
126 | def forward(self, x, residual=None):
127 | if residual is None:
128 | residual = x
129 |
130 | out = self.conv1(x)
131 | out = self.bn1(out)
132 | out = self.relu(out)
133 |
134 | out = self.conv2(out)
135 | out = self.bn2(out)
136 | out = self.relu(out)
137 |
138 | out = self.conv3(out)
139 | out = self.bn3(out)
140 |
141 | out += residual
142 | out = self.relu(out)
143 |
144 | return out
145 |
146 |
147 | class Root(nn.Module):
148 | def __init__(self, in_channels, out_channels, kernel_size, residual):
149 | super(Root, self).__init__()
150 | self.conv = nn.Conv2d(
151 | in_channels, out_channels, 1,
152 | stride=1, bias=False, padding=(kernel_size - 1) // 2)
153 | self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
154 | self.relu = nn.ReLU(inplace=True)
155 | self.residual = residual
156 |
157 | def forward(self, *x):
158 | children = x
159 | x = self.conv(torch.cat(x, 1))
160 | x = self.bn(x)
161 | if self.residual:
162 | x += children[0]
163 | x = self.relu(x)
164 |
165 | return x
166 |
167 |
168 | class Tree(nn.Module):
169 | def __init__(self, levels, block, in_channels, out_channels, stride=1,
170 | level_root=False, root_dim=0, root_kernel_size=1,
171 | dilation=1, root_residual=False):
172 | super(Tree, self).__init__()
173 | if root_dim == 0:
174 | root_dim = 2 * out_channels
175 | if level_root:
176 | root_dim += in_channels
177 | if levels == 1:
178 | self.tree1 = block(in_channels, out_channels, stride,
179 | dilation=dilation)
180 | self.tree2 = block(out_channels, out_channels, 1,
181 | dilation=dilation)
182 | else:
183 | self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
184 | stride, root_dim=0,
185 | root_kernel_size=root_kernel_size,
186 | dilation=dilation, root_residual=root_residual)
187 | self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
188 | root_dim=root_dim + out_channels,
189 | root_kernel_size=root_kernel_size,
190 | dilation=dilation, root_residual=root_residual)
191 | if levels == 1:
192 | self.root = Root(root_dim, out_channels, root_kernel_size,
193 | root_residual)
194 | self.level_root = level_root
195 | self.root_dim = root_dim
196 | self.downsample = None
197 | self.project = None
198 | self.levels = levels
199 | if stride > 1:
200 | self.downsample = nn.MaxPool2d(stride, stride=stride)
201 | if in_channels != out_channels:
202 | self.project = nn.Sequential(
203 | nn.Conv2d(in_channels, out_channels,
204 | kernel_size=1, stride=1, bias=False),
205 | nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
206 | )
207 |
208 | def forward(self, x, residual=None, children=None):
209 | children = [] if children is None else children
210 | bottom = self.downsample(x) if self.downsample else x
211 | residual = self.project(bottom) if self.project else bottom
212 | if self.level_root:
213 | children.append(bottom)
214 | x1 = self.tree1(x, residual)
215 | if self.levels == 1:
216 | x2 = self.tree2(x1)
217 | x = self.root(x2, x1, *children)
218 | else:
219 | children.append(x1)
220 | x = self.tree2(x1, children=children)
221 | return x
222 |
223 |
224 | class DLA(nn.Module):
225 | def __init__(self, levels, channels, num_classes=1000,
226 | block=BasicBlock, residual_root=False, linear_root=False):
227 | super(DLA, self).__init__()
228 | self.channels = channels
229 | self.num_classes = num_classes
230 | self.base_layer = nn.Sequential(
231 | nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
232 | padding=3, bias=False),
233 | nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
234 | nn.ReLU(inplace=True))
235 | self.level0 = self._make_conv_level(
236 | channels[0], channels[0], levels[0])
237 | self.level1 = self._make_conv_level(
238 | channels[0], channels[1], levels[1], stride=2)
239 | self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
240 | level_root=False,
241 | root_residual=residual_root)
242 | self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
243 | level_root=True, root_residual=residual_root)
244 | self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
245 | level_root=True, root_residual=residual_root)
246 | self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
247 | level_root=True, root_residual=residual_root)
248 |
249 | # for m in self.modules():
250 | # if isinstance(m, nn.Conv2d):
251 | # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
252 | # m.weight.data.normal_(0, math.sqrt(2. / n))
253 | # elif isinstance(m, nn.BatchNorm2d):
254 | # m.weight.data.fill_(1)
255 | # m.bias.data.zero_()
256 |
257 | def _make_level(self, block, inplanes, planes, blocks, stride=1):
258 | downsample = None
259 | if stride != 1 or inplanes != planes:
260 | downsample = nn.Sequential(
261 | nn.MaxPool2d(stride, stride=stride),
262 | nn.Conv2d(inplanes, planes,
263 | kernel_size=1, stride=1, bias=False),
264 | nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
265 | )
266 |
267 | layers = []
268 | layers.append(block(inplanes, planes, stride, downsample=downsample))
269 | for i in range(1, blocks):
270 | layers.append(block(inplanes, planes))
271 |
272 | return nn.Sequential(*layers)
273 |
274 | def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
275 | modules = []
276 | for i in range(convs):
277 | modules.extend([
278 | nn.Conv2d(inplanes, planes, kernel_size=3,
279 | stride=stride if i == 0 else 1,
280 | padding=dilation, bias=False, dilation=dilation),
281 | nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
282 | nn.ReLU(inplace=True)])
283 | inplanes = planes
284 | return nn.Sequential(*modules)
285 |
286 | def forward(self, x):
287 | y = []
288 | x = self.base_layer(x)
289 | for i in range(6):
290 | x = getattr(self, 'level{}'.format(i))(x)
291 | y.append(x)
292 | return (y[0],y[1],y[2],y[3],y[4],y[5])
293 | # return y
294 |
295 | def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
296 | # fc = self.fc
297 | if name.endswith('.pth'):
298 | model_weights = torch.load(data + name)
299 | else:
300 | model_url = get_model_url(data, name, hash)
301 | model_weights = model_zoo.load_url(model_url)
302 | num_classes = len(model_weights[list(model_weights.keys())[-1]])
303 | self.fc = nn.Conv2d(
304 | self.channels[-1], num_classes,
305 | kernel_size=1, stride=1, padding=0, bias=True)
306 | self.load_state_dict(model_weights)
307 | # self.fc = fc
308 |
309 |
310 | def dla34(pretrained=True, **kwargs): # DLA-34
311 | model = DLA([1, 1, 1, 2, 2, 1],
312 | [16, 32, 64, 128, 256, 512],
313 | block=BasicBlock, **kwargs)
314 | if pretrained:
315 | model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
316 | return model
317 |
318 | class Identity(nn.Module):
319 |
320 | def __init__(self):
321 | super(Identity, self).__init__()
322 |
323 | def forward(self, x):
324 | return x
325 |
326 |
327 | def fill_fc_weights(layers):
328 | for m in layers.modules():
329 | if isinstance(m, nn.Conv2d):
330 | if m.bias is not None:
331 | nn.init.constant_(m.bias, 0)
332 |
333 |
334 | def fill_up_weights(up):  # init the grouped ConvTranspose2d as a fixed bilinear upsampling kernel
335 | w = up.weight.data
336 | f = math.ceil(w.size(2) / 2)
337 | c = (2 * f - 1 - f % 2) / (2. * f)
338 | for i in range(w.size(2)):
339 | for j in range(w.size(3)):
340 | w[0, 0, i, j] = \
341 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
342 | for c in range(1, w.size(0)):
343 | w[c, 0, :, :] = w[0, 0, :, :]
344 |
345 |
346 | class DeformConv(nn.Module):
347 | def __init__(self, chi, cho):
348 | super(DeformConv, self).__init__()
349 | self.actf = nn.Sequential(
350 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM),
351 | nn.ReLU(inplace=True)
352 | )
353 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1)
354 |
355 | def forward(self, x):
356 | x = self.conv(x)
357 | x = self.actf(x)
358 | return x
359 |
360 |
361 | class IDAUp(nn.Module):
362 |
363 | def __init__(self, o, channels, up_f):
364 | super(IDAUp, self).__init__()
365 | for i in range(1, len(channels)):
366 | c = channels[i]
367 | f = int(up_f[i])
368 | proj = DeformConv(c, o)
369 | node = DeformConv(o, o)
370 |
371 | up = nn.ConvTranspose2d(o, o, f * 2, stride=f,
372 | padding=f // 2, output_padding=0,
373 | groups=o, bias=False)
374 | fill_up_weights(up)
375 |
376 | setattr(self, 'proj_' + str(i), proj)
377 | setattr(self, 'up_' + str(i), up)
378 | setattr(self, 'node_' + str(i), node)
379 |
380 |
381 | def forward(self, layers, startp, endp):
382 | for i in range(startp + 1, endp):
383 | upsample = getattr(self, 'up_' + str(i - startp))
384 | project = getattr(self, 'proj_' + str(i - startp))
385 | # layers[i] = upsample(project(layers[i]))
386 | upsample_layers = upsample(project(layers[i]))
387 | node = getattr(self, 'node_' + str(i - startp))
388 | # layers[i] = node(layers[i] + layers[i - 1])
389 | layers[i] = node(upsample_layers + layers[i - 1])
390 | # node_tmp = node(upsample_layers + layers[i - 1])
391 | return layers[-1]
392 |
393 |
394 |
395 | class DLAUp(nn.Module):
396 | def __init__(self, startp, channels, scales, in_channels=None):
397 | super(DLAUp, self).__init__()
398 | self.startp = startp
399 | if in_channels is None:
400 | in_channels = channels
401 | self.channels = channels
402 | channels = list(channels)
403 | scales = np.array(scales, dtype=int)
404 | for i in range(len(channels) - 1):
405 | j = -i - 2
406 | setattr(self, 'ida_{}'.format(i),
407 | IDAUp(channels[j], in_channels[j:],
408 | scales[j:] // scales[j]))
409 | scales[j + 1:] = scales[j]
410 | in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
411 |
412 | def forward(self, layers):
413 | out = [layers[-1]] # start with 32
414 | for i in range(len(layers) - self.startp - 1):
415 | ida = getattr(self, 'ida_{}'.format(i))
416 | ida_out = ida(layers, len(layers) - i - 2, len(layers))
417 | # out.insert(0, layers[-1])
418 | out.insert(0, ida_out)
419 | return out
420 |
421 |
422 | class Interpolate(nn.Module):
423 | def __init__(self, scale, mode):
424 | super(Interpolate, self).__init__()
425 | self.scale = scale
426 | self.mode = mode
427 |
428 | def forward(self, x):
429 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False)
430 | return x
431 |
432 |
433 | class DLASeg(nn.Module):
434 | def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel,
435 | last_level, head_conv, out_channel=0):
436 | super(DLASeg, self).__init__()
437 | assert down_ratio in [2, 4, 8, 16]
438 | self.first_level = int(np.log2(down_ratio))
439 | self.last_level = last_level
440 | self.base = globals()[base_name](pretrained=pretrained)
441 |
442 | channels = self.base.channels
443 | scales = [2 ** i for i in range(len(channels[self.first_level:]))]
444 | self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales)
445 | if out_channel == 0:
446 | out_channel = channels[self.first_level]
447 |
448 | self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level],
449 | [2 ** i for i in range(self.last_level - self.first_level)])
450 | self.heads = heads
451 | for head in self.heads:
452 | classes = self.heads[head]
453 | if head_conv > 0:
454 | fc = nn.Sequential(
455 | nn.Conv2d(channels[self.first_level], head_conv,
456 | kernel_size=3, padding=1, bias=True),
457 | nn.ReLU(inplace=True),
458 | nn.Conv2d(head_conv, classes,
459 | kernel_size=final_kernel, stride=1,
460 | padding=final_kernel // 2, bias=True))
461 | if 'hm' in head:
462 | fc[-1].bias.data.fill_(-2.19)  # heatmap prior: sigmoid(-2.19) is roughly 0.1
463 | else:
464 | fill_fc_weights(fc)
465 | else:
466 | fc = nn.Conv2d(channels[self.first_level], classes,
467 | kernel_size=final_kernel, stride=1,
468 | padding=final_kernel // 2, bias=True)
469 | if 'hm' in head:
470 | fc.bias.data.fill_(-2.19)
471 | else:
472 | fill_fc_weights(fc)
473 | self.__setattr__(head, fc)
474 |
475 | def forward(self, x):
476 | x = self.base(x)
477 |
478 | x = list(x)
479 | x = self.dla_up(x)
480 |
481 | y = []
482 | for i in range(self.last_level - self.first_level):
483 | y.append(x[i].clone())
484 | ida_out = self.ida_up(y, 0, len(y))
485 |
486 | # z = {}
487 | # for head in self.heads:
488 | # z[head] = self.__getattr__(head)(y[-1])
489 | # for head in self.heads:
490 | # z[head] = self.__getattr__(head)(ida_out)
491 | z = list()
492 | z.append(self.__getattr__('hm')(y[-1]))
493 | z.append(self.__getattr__('wh')(y[-1]))
494 | z.append(self.__getattr__('reg')(y[-1]))
495 | ret = (z[0],z[1],z[2])
496 | return ret
497 | # return [z]
498 |
499 |
500 | def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4):
501 | model = DLASeg('dla{}'.format(num_layers), heads,
502 | pretrained=True,
503 | down_ratio=down_ratio,
504 | final_kernel=1,
505 | last_level=5,
506 | head_conv=head_conv)
507 | return model
508 |
509 |
--------------------------------------------------------------------------------
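`get_pose_net` is the factory that `model.py` calls. Note that `DLASeg.forward` hard-codes the `'hm'`, `'wh'` and `'reg'` heads, so the `heads` dict must provide at least those three. A short construction sketch under the same assumptions as `demo.py` (pretrained DLA-34 weights are fetched by `dla34(pretrained=True)`, and the built DCNv2 extension must be importable):

```python
import torch
from pose_dla_dcn import get_pose_net

heads = {'hm': 80, 'wh': 2, 'reg': 2}
net = get_pose_net(num_layers=34, heads=heads, head_conv=256, down_ratio=4).cuda().eval()

with torch.no_grad():
    hm, wh, reg = net(torch.zeros(1, 3, 512, 512, device='cuda'))

# first_level = log2(down_ratio) = 2, so the heads run at 1/4 input resolution.
print(hm.shape)  # torch.Size([1, 80, 128, 128])
```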