├── DCNv2 ├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── dcn_v2.py ├── make.sh ├── setup.py ├── src │ ├── cpu │ │ ├── dcn_v2_cpu.cpp │ │ └── vision.h │ ├── cuda │ │ ├── dcn_v2_cuda.cu │ │ ├── dcn_v2_im2col_cuda.cu │ │ ├── dcn_v2_im2col_cuda.h │ │ ├── dcn_v2_psroi_pooling_cuda.cu │ │ └── vision.h │ ├── dcn_v2.h │ └── vision.cpp └── test.py ├── LICENSE ├── README.md ├── dcn_cpp_plugin ├── CMakeLists.txt ├── README.md ├── dcn_v2.h ├── dcn_v2_cuda.cu ├── dcn_v2_im2col_cuda.cu ├── dcn_v2_im2col_cuda.h ├── vision.cpp └── vision.h ├── demo.py ├── model.py └── pose_dla_dcn.py /DCNv2/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | *.so 4 | *.o 5 | *pyc 6 | _ext 7 | build 8 | DCNv2.egg-info 9 | dist -------------------------------------------------------------------------------- /DCNv2/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Charles Shang 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /DCNv2/README.md: -------------------------------------------------------------------------------- 1 | ## Deformable Convolutional Networks V2 with PyTorch 1.0 2 | 3 | ### Build 4 | ```bash 5 | ./make.sh # build 6 | python test.py # run examples and gradient check 7 | ``` 8 | 9 | ### An Example 10 | - deformable conv 11 | ```python 12 | from dcn_v2 import DCN 13 | input = torch.randn(2, 64, 128, 128).cuda() 14 | # wrap all things (offset and mask) in DCN 15 | dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda() 16 | output = dcn(input) 17 | print(output.shape) 18 | ``` 19 | - deformable RoI pooling 20 | ```python 21 | from dcn_v2 import DCNPooling 22 | input = torch.randn(2, 32, 64, 64).cuda() 23 | batch_inds = torch.randint(2, (20, 1)).cuda().float() 24 | x = torch.randint(256, (20, 1)).cuda().float() 25 | y = torch.randint(256, (20, 1)).cuda().float() 26 | w = torch.randint(64, (20, 1)).cuda().float() 27 | h = torch.randint(64, (20, 1)).cuda().float() 28 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 29 | 30 | # modulated deformable pooling (V2) 31 | # wrap all things (offset and mask) in DCNPooling 32 | dpooling = DCNPooling(spatial_scale=1.0 / 4, 33 | pooled_size=7, 34 | output_dim=32, 35 | no_trans=False, 36 | group_size=1, 37 | trans_std=0.1).cuda() 38 | 39 | dout = dpooling(input, rois) 40 | ``` 41 | ### Note 42 | The master branch now targets PyTorch 1.0 (the new ATen API); you can switch back to PyTorch 0.4 with: 43 | ```bash 44 | git checkout pytorch_0.4 45 | ``` 46 | 47 | ### Known Issues: 48 | 49 | - [x] Gradient check w.r.t. offset (solved) 50 | - [ ] Backward is not reentrant (minor) 51 | 52 | This is an adaptation of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op). 53 | 54 | I have run the gradient check many times with DOUBLE type. Every tensor **except offset** passes. 55 | However, when I set the offset to 0.5, it passes. I'm still wondering what causes this problem. Is it because of some 56 | non-differentiable points? 57 | 58 | Update: all gradient checks pass with double precision. 59 | 60 | Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small (`<1e-7` for 61 | float, `<1e-15` for double), 62 | so it may not be a serious problem (?) 63 | 64 | Please post an issue or PR if you have any comments.
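For reference, the lower-level `DCNv2` module defined in `dcn_v2.py` takes the offset and mask as explicit inputs instead of predicting them internally. Below is a minimal sketch of calling it directly, assuming the extension has been built with `./make.sh` and a CUDA device is available; the channel counts follow the (commented-out) shape asserts in `DCNv2.forward`: `2 * deformable_groups * kh * kw` for the offset and `deformable_groups * kh * kw` for the mask.

```python
import torch
from dcn_v2 import DCNv2

# minimal sketch: drive DCNv2 with explicitly constructed offset/mask tensors
N, C, H, W = 2, 64, 128, 128
kh, kw, deformable_groups = 3, 3, 2
input = torch.randn(N, C, H, W).cuda()
dcn_v2 = DCNv2(C, 64, kernel_size=(kh, kw), stride=1, padding=1,
               deformable_groups=deformable_groups).cuda()

# offset: two values (dy, dx) per deformable group and kernel position,
# sampled at every output location (stride=1, padding=1 keeps H x W)
offset = torch.randn(N, 2 * deformable_groups * kh * kw, H, W).cuda()
# mask: one modulation scalar per deformable group and kernel position, in (0, 1)
mask = torch.sigmoid(torch.randn(N, deformable_groups * kh * kw, H, W)).cuda()

output = dcn_v2(input, offset, mask)
print(output.shape)  # torch.Size([2, 64, 128, 128])
```

The `DCN` wrapper in the first example above produces the same offset and mask from the input through its `conv_offset_mask` convolution (with a sigmoid applied to the mask), which is why it only takes `input`.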
65 | -------------------------------------------------------------------------------- /DCNv2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xi11xi19/CenterNet2TorchScript/69d7241139ebb2aad095cf17901d3945ac705626/DCNv2/__init__.py -------------------------------------------------------------------------------- /DCNv2/dcn_v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import math 7 | import torch 8 | from torch import nn 9 | from torch.autograd import Function 10 | from torch.nn.modules.utils import _pair 11 | from torch.autograd.function import once_differentiable 12 | from torch.functional import F 13 | 14 | 15 | import _ext as _backend 16 | 17 | 18 | class DCNv2(nn.Module): 19 | 20 | def __init__(self, in_channels, out_channels, 21 | kernel_size, stride, padding, dilation=1, deformable_groups=1): 22 | super(DCNv2, self).__init__() 23 | self.in_channels = in_channels 24 | self.out_channels = out_channels 25 | self.kernel_size = _pair(kernel_size) 26 | self.stride = _pair(stride) 27 | self.padding = _pair(padding) 28 | self.dilation = _pair(dilation) 29 | self.deformable_groups = deformable_groups 30 | 31 | self.weight = nn.Parameter(torch.Tensor( 32 | out_channels, in_channels, *self.kernel_size)) 33 | self.bias = nn.Parameter(torch.Tensor(out_channels)) 34 | self.reset_parameters() 35 | 36 | def reset_parameters(self): 37 | n = self.in_channels 38 | for k in self.kernel_size: 39 | n *= k 40 | stdv = 1. / math.sqrt(n) 41 | self.weight.data.uniform_(-stdv, stdv) 42 | self.bias.data.zero_() 43 | 44 | 45 | def forward(self, input, offset, mask): 46 | # assert 2 * self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 47 | # offset.shape[1] 48 | # assert self.deformable_groups * self.kernel_size[0] * self.kernel_size[1] == \ 49 | # mask.shape[1] 50 | output = _backend.dcn_v2_forward(input, self.weight, self.bias, 51 | offset, mask, 52 | self.weight.shape[2], self.weight.shape[3], 53 | self.stride[0], self.stride[1], 54 | self.padding[0], self.padding[1], 55 | self.dilation[0], self.dilation[1], 56 | self.deformable_groups) 57 | return output 58 | 59 | 60 | class DCN(DCNv2): 61 | 62 | def __init__(self, in_channels, out_channels, 63 | kernel_size, stride, padding, 64 | dilation=1, deformable_groups=1): 65 | super(DCN, self).__init__(in_channels, out_channels, 66 | kernel_size, stride, padding, dilation, deformable_groups) 67 | 68 | channels_ = self.deformable_groups * 3 * \ 69 | self.kernel_size[0] * self.kernel_size[1] 70 | self.conv_offset_mask = nn.Conv2d(self.in_channels, 71 | channels_, 72 | kernel_size=self.kernel_size, 73 | stride=self.stride, 74 | padding=self.padding, 75 | bias=True) 76 | self.init_offset() 77 | 78 | def init_offset(self): 79 | self.conv_offset_mask.weight.data.zero_() 80 | self.conv_offset_mask.bias.data.zero_() 81 | 82 | def forward(self, input): 83 | out = self.conv_offset_mask(input) 84 | o1, o2, mask = torch.chunk(out, 3, dim=1) 85 | offset = torch.cat((o1, o2), dim=1) 86 | mask = torch.sigmoid(mask) 87 | output = torch.ops.my_ops.dcn_v2_cuda_forward_v2(input, self.weight, self.bias, 88 | offset, mask, 89 | self.kernel_size[0], self.kernel_size[1], 90 | self.stride[0], self.stride[1], 91 | self.padding[0], self.padding[1], 92 | self.dilation[0], self.dilation[1], 93 | 
self.deformable_groups) 94 | 95 | return output 96 | 97 | 98 | class _DCNv2Pooling(Function): 99 | @staticmethod 100 | def forward(ctx, input, rois, offset, 101 | spatial_scale, 102 | pooled_size, 103 | output_dim, 104 | no_trans, 105 | group_size=1, 106 | part_size=None, 107 | sample_per_part=4, 108 | trans_std=.0): 109 | ctx.spatial_scale = spatial_scale 110 | ctx.no_trans = int(no_trans) 111 | ctx.output_dim = output_dim 112 | ctx.group_size = group_size 113 | ctx.pooled_size = pooled_size 114 | ctx.part_size = pooled_size if part_size is None else part_size 115 | ctx.sample_per_part = sample_per_part 116 | ctx.trans_std = trans_std 117 | 118 | output, output_count = \ 119 | _backend.dcn_v2_psroi_pooling_forward(input, rois, offset, 120 | ctx.no_trans, ctx.spatial_scale, 121 | ctx.output_dim, ctx.group_size, 122 | ctx.pooled_size, ctx.part_size, 123 | ctx.sample_per_part, ctx.trans_std) 124 | ctx.save_for_backward(input, rois, offset, output_count) 125 | return output 126 | 127 | @staticmethod 128 | @once_differentiable 129 | def backward(ctx, grad_output): 130 | input, rois, offset, output_count = ctx.saved_tensors 131 | grad_input, grad_offset = \ 132 | _backend.dcn_v2_psroi_pooling_backward(grad_output, 133 | input, 134 | rois, 135 | offset, 136 | output_count, 137 | ctx.no_trans, 138 | ctx.spatial_scale, 139 | ctx.output_dim, 140 | ctx.group_size, 141 | ctx.pooled_size, 142 | ctx.part_size, 143 | ctx.sample_per_part, 144 | ctx.trans_std) 145 | 146 | return grad_input, None, grad_offset, \ 147 | None, None, None, None, None, None, None, None 148 | 149 | 150 | dcn_v2_pooling = _DCNv2Pooling.apply 151 | 152 | 153 | class DCNv2Pooling(nn.Module): 154 | 155 | def __init__(self, 156 | spatial_scale, 157 | pooled_size, 158 | output_dim, 159 | no_trans, 160 | group_size=1, 161 | part_size=None, 162 | sample_per_part=4, 163 | trans_std=.0): 164 | super(DCNv2Pooling, self).__init__() 165 | self.spatial_scale = spatial_scale 166 | self.pooled_size = pooled_size 167 | self.output_dim = output_dim 168 | self.no_trans = no_trans 169 | self.group_size = group_size 170 | self.part_size = pooled_size if part_size is None else part_size 171 | self.sample_per_part = sample_per_part 172 | self.trans_std = trans_std 173 | 174 | def forward(self, input, rois, offset): 175 | assert input.shape[1] == self.output_dim 176 | if self.no_trans: 177 | offset = input.new() 178 | return dcn_v2_pooling(input, rois, offset, 179 | self.spatial_scale, 180 | self.pooled_size, 181 | self.output_dim, 182 | self.no_trans, 183 | self.group_size, 184 | self.part_size, 185 | self.sample_per_part, 186 | self.trans_std) 187 | 188 | 189 | class DCNPooling(DCNv2Pooling): 190 | 191 | def __init__(self, 192 | spatial_scale, 193 | pooled_size, 194 | output_dim, 195 | no_trans, 196 | group_size=1, 197 | part_size=None, 198 | sample_per_part=4, 199 | trans_std=.0, 200 | deform_fc_dim=1024): 201 | super(DCNPooling, self).__init__(spatial_scale, 202 | pooled_size, 203 | output_dim, 204 | no_trans, 205 | group_size, 206 | part_size, 207 | sample_per_part, 208 | trans_std) 209 | 210 | self.deform_fc_dim = deform_fc_dim 211 | 212 | if not no_trans: 213 | self.offset_mask_fc = nn.Sequential( 214 | nn.Linear(self.pooled_size * self.pooled_size * 215 | self.output_dim, self.deform_fc_dim), 216 | nn.ReLU(inplace=True), 217 | nn.Linear(self.deform_fc_dim, self.deform_fc_dim), 218 | nn.ReLU(inplace=True), 219 | nn.Linear(self.deform_fc_dim, self.pooled_size * 220 | self.pooled_size * 3) 221 | ) 222 | self.offset_mask_fc[4].weight.data.zero_() 
223 | self.offset_mask_fc[4].bias.data.zero_() 224 | 225 | def forward(self, input, rois): 226 | offset = input.new() 227 | 228 | if not self.no_trans: 229 | 230 | # do roi_align first 231 | n = rois.shape[0] 232 | roi = dcn_v2_pooling(input, rois, offset, 233 | self.spatial_scale, 234 | self.pooled_size, 235 | self.output_dim, 236 | True, # no trans 237 | self.group_size, 238 | self.part_size, 239 | self.sample_per_part, 240 | self.trans_std) 241 | 242 | # build mask and offset 243 | offset_mask = self.offset_mask_fc(roi.view(n, -1)) 244 | offset_mask = offset_mask.view( 245 | n, 3, self.pooled_size, self.pooled_size) 246 | o1, o2, mask = torch.chunk(offset_mask, 3, dim=1) 247 | offset = torch.cat((o1, o2), dim=1) 248 | mask = torch.sigmoid(mask) 249 | 250 | # do pooling with offset and mask 251 | return dcn_v2_pooling(input, rois, offset, 252 | self.spatial_scale, 253 | self.pooled_size, 254 | self.output_dim, 255 | self.no_trans, 256 | self.group_size, 257 | self.part_size, 258 | self.sample_per_part, 259 | self.trans_std) * mask 260 | # only roi_align 261 | return dcn_v2_pooling(input, rois, offset, 262 | self.spatial_scale, 263 | self.pooled_size, 264 | self.output_dim, 265 | self.no_trans, 266 | self.group_size, 267 | self.part_size, 268 | self.sample_per_part, 269 | self.trans_std) 270 | -------------------------------------------------------------------------------- /DCNv2/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python3 setup.py build develop 3 | -------------------------------------------------------------------------------- /DCNv2/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import glob 5 | 6 | import torch 7 | 8 | from torch.utils.cpp_extension import CUDA_HOME 9 | from torch.utils.cpp_extension import CppExtension 10 | from torch.utils.cpp_extension import CUDAExtension 11 | 12 | from setuptools import find_packages 13 | from setuptools import setup 14 | 15 | requirements = ["torch", "torchvision"] 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "src") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | extra_compile_args = {"cxx": []} 28 | define_macros = [] 29 | 30 | if torch.cuda.is_available() and CUDA_HOME is not None: 31 | extension = CUDAExtension 32 | sources += source_cuda 33 | define_macros += [("WITH_CUDA", None)] 34 | extra_compile_args["nvcc"] = [ 35 | "-DCUDA_HAS_FP16=1", 36 | "-D__CUDA_NO_HALF_OPERATORS__", 37 | "-D__CUDA_NO_HALF_CONVERSIONS__", 38 | "-D__CUDA_NO_HALF2_OPERATORS__", 39 | ] 40 | else: 41 | raise NotImplementedError('Cuda is not availabel') 42 | 43 | sources = [os.path.join(extensions_dir, s) for s in sources] 44 | include_dirs = [extensions_dir] 45 | ext_modules = [ 46 | extension( 47 | "_ext", 48 | sources, 49 | include_dirs=include_dirs, 50 | define_macros=define_macros, 51 | extra_compile_args=extra_compile_args, 52 | ) 53 | ] 54 | return ext_modules 55 | 56 | setup( 57 | name="DCNv2", 58 | version="0.1", 59 | author="charlesshang", 60 | url="https://github.com/charlesshang/DCNv2", 61 | description="deformable convolutional networks", 62 | 
packages=find_packages(exclude=("configs", "tests",)), 63 | # install_requires=requirements, 64 | ext_modules=get_extensions(), 65 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 66 | ) -------------------------------------------------------------------------------- /DCNv2/src/cpu/dcn_v2_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | at::Tensor 8 | dcn_v2_cpu_forward(const at::Tensor &input, 9 | const at::Tensor &weight, 10 | const at::Tensor &bias, 11 | const at::Tensor &offset, 12 | const at::Tensor &mask, 13 | const int kernel_h, 14 | const int kernel_w, 15 | const int stride_h, 16 | const int stride_w, 17 | const int pad_h, 18 | const int pad_w, 19 | const int dilation_h, 20 | const int dilation_w, 21 | const int deformable_group) 22 | { 23 | AT_ERROR("Not implement on cpu"); 24 | } 25 | 26 | std::vector 27 | dcn_v2_cpu_backward(const at::Tensor &input, 28 | const at::Tensor &weight, 29 | const at::Tensor &bias, 30 | const at::Tensor &offset, 31 | const at::Tensor &mask, 32 | const at::Tensor &grad_output, 33 | int kernel_h, int kernel_w, 34 | int stride_h, int stride_w, 35 | int pad_h, int pad_w, 36 | int dilation_h, int dilation_w, 37 | int deformable_group) 38 | { 39 | AT_ERROR("Not implement on cpu"); 40 | } 41 | 42 | std::tuple 43 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 44 | const at::Tensor &bbox, 45 | const at::Tensor &trans, 46 | const int no_trans, 47 | const float spatial_scale, 48 | const int output_dim, 49 | const int group_size, 50 | const int pooled_size, 51 | const int part_size, 52 | const int sample_per_part, 53 | const float trans_std) 54 | { 55 | AT_ERROR("Not implement on cpu"); 56 | } 57 | 58 | std::tuple 59 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 60 | const at::Tensor &input, 61 | const at::Tensor &bbox, 62 | const at::Tensor &trans, 63 | const at::Tensor &top_count, 64 | const int no_trans, 65 | const float spatial_scale, 66 | const int output_dim, 67 | const int group_size, 68 | const int pooled_size, 69 | const int part_size, 70 | const int sample_per_part, 71 | const float trans_std) 72 | { 73 | AT_ERROR("Not implement on cpu"); 74 | } -------------------------------------------------------------------------------- /DCNv2/src/cpu/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cpu_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cpu_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const 
int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cuda/dcn_v2_im2col_cuda.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | extern THCState *state; 11 | 12 | // author: Charles Shang 13 | // https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu 14 | 15 | // [batch gemm] 16 | // https://github.com/pytorch/pytorch/blob/master/aten/src/THC/generic/THCTensorMathBlas.cu 17 | 18 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, 19 | float **columns_b, const float **ones_b, 20 | const float **weight_b, const float **bias_b, 21 | float *input, float *output, 22 | float *columns, float *ones, 23 | float *weight, float *bias, 24 | const int input_stride, const int output_stride, 25 | const int columns_stride, const int ones_stride, 26 | const int num_batches) 27 | { 28 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 29 | if (idx < num_batches) 30 | { 31 | input_b[idx] = input + idx * input_stride; 32 | output_b[idx] = output + idx * output_stride; 33 | columns_b[idx] = columns + idx * columns_stride; 34 | ones_b[idx] = ones + idx * ones_stride; 35 | // share weights and bias within a Mini-Batch 36 | weight_b[idx] = weight; 37 | bias_b[idx] = bias; 38 | } 39 | } 40 | 41 | at::Tensor 42 | dcn_v2_cuda_forward(const at::Tensor &input, 43 | const at::Tensor &weight, 44 | const at::Tensor &bias, 45 | const at::Tensor &offset, 46 | const at::Tensor &mask, 47 | const int kernel_h, 48 | const int kernel_w, 49 | const int stride_h, 50 | const int stride_w, 51 | const int pad_h, 52 | const int pad_w, 53 | const int dilation_h, 54 | const int dilation_w, 55 | const int deformable_group) 56 | { 57 | using scalar_t = float; 58 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 59 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 60 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 61 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 62 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 63 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 64 | 65 | const int batch = input.size(0); 66 | const int channels = input.size(1); 67 | const int height = input.size(2); 68 | const int width = input.size(3); 69 | 70 | const int channels_out = weight.size(0); 71 | const int channels_kernel = weight.size(1); 72 | const int kernel_h_ = weight.size(2); 73 | const int kernel_w_ = weight.size(3); 74 | 75 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 76 | // printf("Channels: %d %d\n", channels, channels_kernel); 77 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 78 | 79 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 80 | "Input shape and 
kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 81 | 82 | AT_ASSERTM(channels == channels_kernel, 83 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 84 | 85 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 86 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 87 | 88 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 89 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 90 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 91 | 92 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing 93 | // when batch size is large. 94 | // launch batch threads 95 | int matrices_size = batch * sizeof(float *); 96 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 97 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 98 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 99 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 100 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 101 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 102 | 103 | const int block = 128; 104 | const int grid = (batch + block - 1) / block; 105 | 106 | createBatchGemmBuffer<<>>( 107 | input_b, output_b, 108 | columns_b, ones_b, 109 | weight_b, bias_b, 110 | input.data(), 111 | output.data(), 112 | columns.data(), 113 | ones.data(), 114 | weight.data(), 115 | bias.data(), 116 | channels * width * height, 117 | channels_out * width_out * height_out, 118 | channels * kernel_h * kernel_w * height_out * width_out, 119 | height_out * width_out, 120 | batch); 121 | 122 | long m_ = channels_out; 123 | long n_ = height_out * width_out; 124 | long k_ = 1; 125 | THCudaBlas_SgemmBatched(state, 126 | 't', 127 | 'n', 128 | n_, 129 | m_, 130 | k_, 131 | 1.0f, 132 | ones_b, k_, 133 | bias_b, k_, 134 | 0.0f, 135 | output_b, n_, 136 | batch); 137 | 138 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 139 | input.data(), 140 | offset.data(), 141 | mask.data(), 142 | batch, channels, height, width, 143 | height_out, width_out, kernel_h, kernel_w, 144 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 145 | deformable_group, 146 | columns.data()); 147 | 148 | long m = channels_out; 149 | long n = height_out * width_out; 150 | long k = channels * kernel_h * kernel_w; 151 | THCudaBlas_SgemmBatched(state, 152 | 'n', 153 | 'n', 154 | n, 155 | m, 156 | k, 157 | 1.0f, 158 | (const float **)columns_b, n, 159 | weight_b, k, 160 | 1.0f, 161 | output_b, n, 162 | batch); 163 | 164 | THCudaFree(state, input_b); 165 | THCudaFree(state, output_b); 166 | THCudaFree(state, columns_b); 167 | THCudaFree(state, ones_b); 168 | THCudaFree(state, weight_b); 169 | THCudaFree(state, bias_b); 170 | return output; 171 | } 172 | 173 | at::Tensor 174 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 175 | const at::Tensor &weight, 176 | const at::Tensor &bias, 177 | const at::Tensor &offset, 178 | const at::Tensor &mask, 179 | const int64_t kernel_h, 180 | const int64_t kernel_w, 181 | const int64_t stride_h, 182 | const int64_t stride_w, 183 | const int64_t pad_h, 184 | const int64_t pad_w, 185 | const int64_t dilation_h, 186 | const int64_t dilation_w, 187 | const int64_t deformable_group) 188 | { 189 | using 
scalar_t = float; 190 | // THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 191 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 192 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 193 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 194 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 195 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 196 | 197 | const int batch = input.size(0); 198 | const int channels = input.size(1); 199 | const int height = input.size(2); 200 | const int width = input.size(3); 201 | 202 | const int channels_out = weight.size(0); 203 | const int channels_kernel = weight.size(1); 204 | const int kernel_h_ = weight.size(2); 205 | const int kernel_w_ = weight.size(3); 206 | 207 | // printf("Kernels: %d %d %d %d\n", kernel_h_, kernel_w_, kernel_w, kernel_h); 208 | // printf("Channels: %d %d\n", channels, channels_kernel); 209 | // printf("Channels: %d %d\n", channels_out, channels_kernel); 210 | 211 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 212 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 213 | 214 | AT_ASSERTM(channels == channels_kernel, 215 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 216 | 217 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 218 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 219 | 220 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 221 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 222 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 223 | 224 | // prepare for batch-wise computing, which is significantly faster than instance-wise computing 225 | // when batch size is large. 
226 | // launch batch threads 227 | int matrices_size = batch * sizeof(float *); 228 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 229 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 230 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 231 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 232 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 233 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 234 | 235 | const int block = 128; 236 | const int grid = (batch + block - 1) / block; 237 | 238 | createBatchGemmBuffer<<>>( 239 | input_b, output_b, 240 | columns_b, ones_b, 241 | weight_b, bias_b, 242 | input.data(), 243 | output.data(), 244 | columns.data(), 245 | ones.data(), 246 | weight.data(), 247 | bias.data(), 248 | channels * width * height, 249 | channels_out * width_out * height_out, 250 | channels * kernel_h * kernel_w * height_out * width_out, 251 | height_out * width_out, 252 | batch); 253 | 254 | long m_ = channels_out; 255 | long n_ = height_out * width_out; 256 | long k_ = 1; 257 | THCudaBlas_SgemmBatched(state, 258 | 't', 259 | 'n', 260 | n_, 261 | m_, 262 | k_, 263 | 1.0f, 264 | ones_b, k_, 265 | bias_b, k_, 266 | 0.0f, 267 | output_b, n_, 268 | batch); 269 | 270 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 271 | input.data(), 272 | offset.data(), 273 | mask.data(), 274 | batch, channels, height, width, 275 | height_out, width_out, kernel_h, kernel_w, 276 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 277 | deformable_group, 278 | columns.data()); 279 | 280 | long m = channels_out; 281 | long n = height_out * width_out; 282 | long k = channels * kernel_h * kernel_w; 283 | THCudaBlas_SgemmBatched(state, 284 | 'n', 285 | 'n', 286 | n, 287 | m, 288 | k, 289 | 1.0f, 290 | (const float **)columns_b, n, 291 | weight_b, k, 292 | 1.0f, 293 | output_b, n, 294 | batch); 295 | 296 | THCudaFree(state, input_b); 297 | THCudaFree(state, output_b); 298 | THCudaFree(state, columns_b); 299 | THCudaFree(state, ones_b); 300 | THCudaFree(state, weight_b); 301 | THCudaFree(state, bias_b); 302 | return output; 303 | } 304 | 305 | __global__ void createBatchGemmBufferBackward( 306 | float **grad_output_b, 307 | float **columns_b, 308 | float **ones_b, 309 | float **weight_b, 310 | float **grad_weight_b, 311 | float **grad_bias_b, 312 | float *grad_output, 313 | float *columns, 314 | float *ones, 315 | float *weight, 316 | float *grad_weight, 317 | float *grad_bias, 318 | const int grad_output_stride, 319 | const int columns_stride, 320 | const int ones_stride, 321 | const int num_batches) 322 | { 323 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 324 | if (idx < num_batches) 325 | { 326 | grad_output_b[idx] = grad_output + idx * grad_output_stride; 327 | columns_b[idx] = columns + idx * columns_stride; 328 | ones_b[idx] = ones + idx * ones_stride; 329 | 330 | // share weights and bias within a Mini-Batch 331 | weight_b[idx] = weight; 332 | grad_weight_b[idx] = grad_weight; 333 | grad_bias_b[idx] = grad_bias; 334 | } 335 | } 336 | 337 | std::vector dcn_v2_cuda_backward(const at::Tensor &input, 338 | const at::Tensor &weight, 339 | const at::Tensor &bias, 340 | const at::Tensor &offset, 341 | const at::Tensor &mask, 342 | const at::Tensor &grad_output, 343 | int kernel_h, int kernel_w, 344 | int stride_h, int stride_w, 345 | int pad_h, int pad_w, 346 | int dilation_h, int dilation_w, 347 | int deformable_group) 348 | { 349 | 350 | 
THArgCheck(input.is_contiguous(), 1, "input tensor has to be contiguous"); 351 | THArgCheck(weight.is_contiguous(), 2, "weight tensor has to be contiguous"); 352 | 353 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 354 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 355 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 356 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 357 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 358 | 359 | const int batch = input.size(0); 360 | const int channels = input.size(1); 361 | const int height = input.size(2); 362 | const int width = input.size(3); 363 | 364 | const int channels_out = weight.size(0); 365 | const int channels_kernel = weight.size(1); 366 | const int kernel_h_ = weight.size(2); 367 | const int kernel_w_ = weight.size(3); 368 | 369 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 370 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 371 | 372 | AT_ASSERTM(channels == channels_kernel, 373 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 374 | 375 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 376 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; 377 | 378 | auto ones = at::ones({height_out, width_out}, input.options()); 379 | auto columns = at::empty({channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 380 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 381 | 382 | auto grad_input = at::zeros_like(input); 383 | auto grad_weight = at::zeros_like(weight); 384 | auto grad_bias = at::zeros_like(bias); 385 | auto grad_offset = at::zeros_like(offset); 386 | auto grad_mask = at::zeros_like(mask); 387 | 388 | using scalar_t = float; 389 | 390 | for (int b = 0; b < batch; b++) 391 | { 392 | auto input_n = input.select(0, b); 393 | auto offset_n = offset.select(0, b); 394 | auto mask_n = mask.select(0, b); 395 | auto grad_output_n = grad_output.select(0, b); 396 | auto grad_input_n = grad_input.select(0, b); 397 | auto grad_offset_n = grad_offset.select(0, b); 398 | auto grad_mask_n = grad_mask.select(0, b); 399 | 400 | long m = channels * kernel_h * kernel_w; 401 | long n = height_out * width_out; 402 | long k = channels_out; 403 | 404 | THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f, 405 | grad_output_n.data(), n, 406 | weight.data(), m, 0.0f, 407 | columns.data(), n); 408 | 409 | // gradient w.r.t. input coordinate data 410 | modulated_deformable_col2im_coord_cuda(THCState_getCurrentStream(state), 411 | columns.data(), 412 | input_n.data(), 413 | offset_n.data(), 414 | mask_n.data(), 415 | 1, channels, height, width, 416 | height_out, width_out, kernel_h, kernel_w, 417 | pad_h, pad_w, stride_h, stride_w, 418 | dilation_h, dilation_w, deformable_group, 419 | grad_offset_n.data(), 420 | grad_mask_n.data()); 421 | // gradient w.r.t. input data 422 | modulated_deformable_col2im_cuda(THCState_getCurrentStream(state), 423 | columns.data(), 424 | offset_n.data(), 425 | mask_n.data(), 426 | 1, channels, height, width, 427 | height_out, width_out, kernel_h, kernel_w, 428 | pad_h, pad_w, stride_h, stride_w, 429 | dilation_h, dilation_w, deformable_group, 430 | grad_input_n.data()); 431 | 432 | // gradient w.r.t. 
weight, dWeight should accumulate across the batch and group 433 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 434 | input_n.data(), 435 | offset_n.data(), 436 | mask_n.data(), 437 | 1, channels, height, width, 438 | height_out, width_out, kernel_h, kernel_w, 439 | pad_h, pad_w, stride_h, stride_w, 440 | dilation_h, dilation_w, deformable_group, 441 | columns.data()); 442 | 443 | long m_ = channels_out; 444 | long n_ = channels * kernel_h * kernel_w; 445 | long k_ = height_out * width_out; 446 | 447 | THCudaBlas_Sgemm(state, 't', 'n', n_, m_, k_, 1.0f, 448 | columns.data(), k_, 449 | grad_output_n.data(), k_, 1.0f, 450 | grad_weight.data(), n_); 451 | 452 | // gradient w.r.t. bias 453 | // long m_ = channels_out; 454 | // long k__ = height_out * width_out; 455 | THCudaBlas_Sgemv(state, 456 | 't', 457 | k_, m_, 1.0f, 458 | grad_output_n.data(), k_, 459 | ones.data(), 1, 1.0f, 460 | grad_bias.data(), 1); 461 | } 462 | 463 | return { 464 | grad_input, grad_offset, grad_mask, grad_weight, grad_bias 465 | }; 466 | } -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_im2col_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define CUDA_KERNEL_LOOP(i, n) \ 14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 15 | i < (n); \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | const int CUDA_NUM_THREADS = 1024; 19 | inline int GET_BLOCKS(const int N) 20 | { 21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 22 | } 23 | 24 | 25 | __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, 26 | const int height, const int width, float h, float w) 27 | { 28 | int h_low = floor(h); 29 | int w_low = floor(w); 30 | int h_high = h_low + 1; 31 | int w_high = w_low + 1; 32 | 33 | float lh = h - h_low; 34 | float lw = w - w_low; 35 | float hh = 1 - lh, hw = 1 - lw; 36 | 37 | float v1 = 0; 38 | if (h_low >= 0 && w_low >= 0) 39 | v1 = bottom_data[h_low * data_width + w_low]; 40 | float v2 = 0; 41 | if (h_low >= 0 && w_high <= width - 1) 42 | v2 = bottom_data[h_low * data_width + w_high]; 43 | float v3 = 0; 44 | if (h_high <= height - 1 && w_low >= 0) 45 | v3 = bottom_data[h_high * data_width + w_low]; 46 | float v4 = 0; 47 | if (h_high <= height - 1 && w_high <= width - 1) 48 | v4 = bottom_data[h_high * data_width + w_high]; 49 | 50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 51 | 52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 53 | return val; 54 | } 55 | 56 | __device__ float dmcn_get_gradient_weight(float argmax_h, float argmax_w, 57 | const int h, const int w, const int height, const int width) 58 | { 59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 60 | { 61 | //empty 62 | return 0; 63 | } 64 | 65 | int argmax_h_low = floor(argmax_h); 66 | int argmax_w_low = floor(argmax_w); 67 | int argmax_h_high = argmax_h_low + 1; 68 | int argmax_w_high = argmax_w_low + 1; 69 | 70 | float weight = 0; 71 | if (h == argmax_h_low && w == argmax_w_low) 72 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 73 | if (h == argmax_h_low && w == argmax_w_high) 74 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 75 | if (h == argmax_h_high && w == argmax_w_low) 76 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 77 | if (h == argmax_h_high && w == 
argmax_w_high) 78 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 79 | return weight; 80 | } 81 | 82 | __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, 83 | const int height, const int width, const float *im_data, 84 | const int data_width, const int bp_dir) 85 | { 86 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 87 | { 88 | //empty 89 | return 0; 90 | } 91 | 92 | int argmax_h_low = floor(argmax_h); 93 | int argmax_w_low = floor(argmax_w); 94 | int argmax_h_high = argmax_h_low + 1; 95 | int argmax_w_high = argmax_w_low + 1; 96 | 97 | float weight = 0; 98 | 99 | if (bp_dir == 0) 100 | { 101 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 102 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 103 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 104 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 105 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 106 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 107 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 108 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 109 | } 110 | else if (bp_dir == 1) 111 | { 112 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 113 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 114 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 115 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 116 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 117 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 118 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 119 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 120 | } 121 | 122 | return weight; 123 | } 124 | 125 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n, 126 | const float *data_im, const float *data_offset, const float *data_mask, 127 | const int height, const int width, const int kernel_h, const int kernel_w, 128 | const int pad_h, const int pad_w, 129 | const int stride_h, const int stride_w, 130 | const int dilation_h, const int dilation_w, 131 | const int channel_per_deformable_group, 132 | const int batch_size, const int num_channels, const int deformable_group, 133 | const int height_col, const int width_col, 134 | float *data_col) 135 | { 136 | // launch channels * batch_size * height_col * width_col cores 137 | CUDA_KERNEL_LOOP(index, n) 138 | { 139 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) 140 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis 141 | 142 | // index index of output matrix 143 | const int w_col = index % width_col; 144 | const int h_col = (index / width_col) % height_col; 145 | // const int b_col = (index / width_col / height_col) % batch_size; 146 | const int b_col = (index / width_col / height_col / num_channels) % batch_size; 147 | // const int c_im = (index / width_col / height_col) / batch_size; 148 | const int c_im = (index / width_col / height_col) % num_channels; 149 | // const int c_col = c_im * kernel_h * kernel_w; 150 | const int c_col = c_im * kernel_h * kernel_w; 151 | 152 | // compute deformable group index 153 | const int 
deformable_group_index = c_im / channel_per_deformable_group; 154 | 155 | const int h_in = h_col * stride_h - pad_h; 156 | const int w_in = w_col * stride_w - pad_w; 157 | 158 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; 159 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; 160 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; 161 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; 162 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 163 | 164 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 165 | 166 | for (int i = 0; i < kernel_h; ++i) 167 | { 168 | for (int j = 0; j < kernel_w; ++j) 169 | { 170 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 171 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 172 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; 173 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 174 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 175 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 176 | float val = static_cast(0); 177 | const float h_im = h_in + i * dilation_h + offset_h; 178 | const float w_im = w_in + j * dilation_w + offset_w; 179 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 180 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 181 | { 182 | //const float map_h = i * dilation_h + offset_h; 183 | //const float map_w = j * dilation_w + offset_w; 184 | //const int cur_height = height - h_in; 185 | //const int cur_width = width - w_in; 186 | //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 187 | val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); 188 | } 189 | *data_col_ptr = val * mask; 190 | // data_col_ptr += batch_size * height_col * width_col; 191 | data_col_ptr += height_col * width_col; 192 | } 193 | } 194 | } 195 | } 196 | 197 | __global__ void modulated_deformable_col2im_gpu_kernel(const int n, 198 | const float *data_col, const float *data_offset, const float *data_mask, 199 | const int channels, const int height, const int width, 200 | const int kernel_h, const int kernel_w, 201 | const int pad_h, const int pad_w, 202 | const int stride_h, const int stride_w, 203 | const int dilation_h, const int dilation_w, 204 | const int channel_per_deformable_group, 205 | const int batch_size, const int deformable_group, 206 | const int height_col, const int width_col, 207 | float *grad_im) 208 | { 209 | CUDA_KERNEL_LOOP(index, n) 210 | { 211 | const int j = (index / width_col / height_col / batch_size) % kernel_w; 212 | const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; 213 | const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; 214 | // compute the start and end of the output 215 | 216 | const int deformable_group_index = c / channel_per_deformable_group; 217 | 218 | int w_out = index % width_col; 219 | int h_out = (index / width_col) % height_col; 220 | int b = (index / width_col / height_col) % batch_size; 
221 | int w_in = w_out * stride_w - pad_w; 222 | int h_in = h_out * stride_h - pad_h; 223 | 224 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 225 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 226 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 227 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 228 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; 229 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 230 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 231 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 232 | const float cur_inv_h_data = h_in + i * dilation_h + offset_h; 233 | const float cur_inv_w_data = w_in + j * dilation_w + offset_w; 234 | 235 | const float cur_top_grad = data_col[index] * mask; 236 | const int cur_h = (int)cur_inv_h_data; 237 | const int cur_w = (int)cur_inv_w_data; 238 | for (int dy = -2; dy <= 2; dy++) 239 | { 240 | for (int dx = -2; dx <= 2; dx++) 241 | { 242 | if (cur_h + dy >= 0 && cur_h + dy < height && 243 | cur_w + dx >= 0 && cur_w + dx < width && 244 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 245 | abs(cur_inv_w_data - (cur_w + dx)) < 1) 246 | { 247 | int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; 248 | float weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 249 | atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 250 | } 251 | } 252 | } 253 | } 254 | } 255 | 256 | __global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, 257 | const float *data_col, const float *data_im, 258 | const float *data_offset, const float *data_mask, 259 | const int channels, const int height, const int width, 260 | const int kernel_h, const int kernel_w, 261 | const int pad_h, const int pad_w, 262 | const int stride_h, const int stride_w, 263 | const int dilation_h, const int dilation_w, 264 | const int channel_per_deformable_group, 265 | const int batch_size, const int offset_channels, const int deformable_group, 266 | const int height_col, const int width_col, 267 | float *grad_offset, float *grad_mask) 268 | { 269 | CUDA_KERNEL_LOOP(index, n) 270 | { 271 | float val = 0, mval = 0; 272 | int w = index % width_col; 273 | int h = (index / width_col) % height_col; 274 | int c = (index / width_col / height_col) % offset_channels; 275 | int b = (index / width_col / height_col) / offset_channels; 276 | // compute the start and end of the output 277 | 278 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 279 | const int col_step = kernel_h * kernel_w; 280 | int cnt = 0; 281 | const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; 282 | const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; 283 | const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 284 | const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 285 | 286 | const 
int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 287 | 288 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) 289 | { 290 | const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; 291 | const int bp_dir = offset_c % 2; 292 | 293 | int j = (col_pos / width_col / height_col / batch_size) % kernel_w; 294 | int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; 295 | int w_out = col_pos % width_col; 296 | int h_out = (col_pos / width_col) % height_col; 297 | int w_in = w_out * stride_w - pad_w; 298 | int h_in = h_out * stride_h - pad_h; 299 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 300 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 301 | const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); 302 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 303 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 304 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 305 | float inv_h = h_in + i * dilation_h + offset_h; 306 | float inv_w = w_in + j * dilation_w + offset_w; 307 | if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) 308 | { 309 | inv_h = inv_w = -2; 310 | } 311 | else 312 | { 313 | mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); 314 | } 315 | const float weight = dmcn_get_coordinate_weight( 316 | inv_h, inv_w, 317 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 318 | val += weight * data_col_ptr[col_pos] * mask; 319 | cnt += 1; 320 | } 321 | // KERNEL_ASSIGN(grad_offset[index], offset_req, val); 322 | grad_offset[index] = val; 323 | if (offset_c % 2 == 0) 324 | // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); 325 | grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; 326 | } 327 | } 328 | 329 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 330 | const float* data_im, const float* data_offset, const float* data_mask, 331 | const int batch_size, const int channels, const int height_im, const int width_im, 332 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 333 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 334 | const int dilation_h, const int dilation_w, 335 | const int deformable_group, float* data_col) { 336 | // num_axes should be smaller than block size 337 | const int channel_per_deformable_group = channels / deformable_group; 338 | const int num_kernels = channels * batch_size * height_col * width_col; 339 | modulated_deformable_im2col_gpu_kernel 340 | <<>>( 342 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 343 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 344 | batch_size, channels, deformable_group, height_col, width_col, data_col); 345 | 346 | cudaError_t err = cudaGetLastError(); 347 | if (err != cudaSuccess) 348 | { 349 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 350 | } 351 | 352 | } 353 | 354 | void modulated_deformable_col2im_cuda(cudaStream_t stream, 355 | const float* 
data_col, const float* data_offset, const float* data_mask, 356 | const int batch_size, const int channels, const int height_im, const int width_im, 357 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 358 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 359 | const int dilation_h, const int dilation_w, 360 | const int deformable_group, float* grad_im){ 361 | 362 | const int channel_per_deformable_group = channels / deformable_group; 363 | const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; 364 | modulated_deformable_col2im_gpu_kernel 365 | <<>>( 367 | num_kernels, data_col, data_offset, data_mask, channels, height_im, width_im, 368 | kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, 369 | dilation_h, dilation_w, channel_per_deformable_group, 370 | batch_size, deformable_group, height_col, width_col, grad_im); 371 | cudaError_t err = cudaGetLastError(); 372 | if (err != cudaSuccess) 373 | { 374 | printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); 375 | } 376 | 377 | } 378 | 379 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 380 | const float* data_col, const float* data_im, const float* data_offset, const float* data_mask, 381 | const int batch_size, const int channels, const int height_im, const int width_im, 382 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 383 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 384 | const int dilation_h, const int dilation_w, 385 | const int deformable_group, 386 | float* grad_offset, float* grad_mask) { 387 | const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; 388 | const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; 389 | modulated_deformable_col2im_coord_gpu_kernel 390 | <<>>( 392 | num_kernels, data_col, data_im, data_offset, data_mask, channels, height_im, width_im, 393 | kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, 394 | dilation_h, dilation_w, channel_per_deformable_group, 395 | batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, 396 | grad_offset, grad_mask); 397 | cudaError_t err = cudaGetLastError(); 398 | if (err != cudaSuccess) 399 | { 400 | printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); 401 | } 402 | } -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_im2col_cuda.h: -------------------------------------------------------------------------------- 1 | 2 | /*! 3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 4 | * 5 | * COPYRIGHT 6 | * 7 | * All contributions by the University of California: 8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 9 | * All rights reserved. 10 | * 11 | * All other contributions: 12 | * Copyright (c) 2014-2017, the respective contributors 13 | * All rights reserved. 14 | * 15 | * Caffe uses a shared copyright model: each contributor holds copyright over 16 | * their contributions to Caffe. The project versioning records all such 17 | * contribution and copyright details. If a contributor wants to further mark 18 | * their specific copyright on a particular contribution, they should indicate 19 | * their copyright solely in the commit message of the change when it is 20 | * committed. 
21 | * 22 | * LICENSE 23 | * 24 | * Redistribution and use in source and binary forms, with or without 25 | * modification, are permitted provided that the following conditions are met: 26 | * 27 | * 1. Redistributions of source code must retain the above copyright notice, this 28 | * list of conditions and the following disclaimer. 29 | * 2. Redistributions in binary form must reproduce the above copyright notice, 30 | * this list of conditions and the following disclaimer in the documentation 31 | * and/or other materials provided with the distribution. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 | * 44 | * CONTRIBUTION AGREEMENT 45 | * 46 | * By contributing to the BVLC/caffe repository through pull-request, comment, 47 | * or otherwise, the contributor releases their content to the 48 | * license and copyright terms herein. 49 | * 50 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 51 | * 52 | * Copyright (c) 2018 Microsoft 53 | * Licensed under The MIT License [see LICENSE for details] 54 | * \file modulated_deformable_im2col.h 55 | * \brief Function definitions of converting an image to 56 | * column matrix based on kernel, padding, dilation, and offset. 57 | * These functions are mainly used in deformable convolution operators. 
58 | * \ref: https://arxiv.org/abs/1811.11168 59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu 60 | */ 61 | 62 | /***************** Adapted by Charles Shang *********************/ 63 | 64 | #ifndef DCN_V2_IM2COL_CUDA 65 | #define DCN_V2_IM2COL_CUDA 66 | 67 | #ifdef __cplusplus 68 | extern "C" 69 | { 70 | #endif 71 | 72 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 73 | const float *data_im, const float *data_offset, const float *data_mask, 74 | const int batch_size, const int channels, const int height_im, const int width_im, 75 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 76 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 77 | const int dilation_h, const int dilation_w, 78 | const int deformable_group, float *data_col); 79 | 80 | void modulated_deformable_col2im_cuda(cudaStream_t stream, 81 | const float *data_col, const float *data_offset, const float *data_mask, 82 | const int batch_size, const int channels, const int height_im, const int width_im, 83 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 84 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 85 | const int dilation_h, const int dilation_w, 86 | const int deformable_group, float *grad_im); 87 | 88 | void modulated_deformable_col2im_coord_cuda(cudaStream_t stream, 89 | const float *data_col, const float *data_im, const float *data_offset, const float *data_mask, 90 | const int batch_size, const int channels, const int height_im, const int width_im, 91 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 92 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 93 | const int dilation_h, const int dilation_w, 94 | const int deformable_group, 95 | float *grad_offset, float *grad_mask); 96 | 97 | #ifdef __cplusplus 98 | } 99 | #endif 100 | 101 | #endif -------------------------------------------------------------------------------- /DCNv2/src/cuda/dcn_v2_psroi_pooling_cuda.cu: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 Microsoft 3 | * Licensed under The MIT License [see LICENSE for details] 4 | * \file deformable_psroi_pooling.cu 5 | * \brief 6 | * \author Yi Li, Guodong Zhang, Jifeng Dai 7 | */ 8 | /***************** Adapted by Charles Shang *********************/ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #define CUDA_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 24 | i < (n); \ 25 | i += blockDim.x * gridDim.x) 26 | 27 | const int CUDA_NUM_THREADS = 1024; 28 | inline int GET_BLOCKS(const int N) 29 | { 30 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 31 | } 32 | 33 | template 34 | __device__ T bilinear_interp( 35 | const T *data, 36 | const T x, 37 | const T y, 38 | const int width, 39 | const int height) 40 | { 41 | int x1 = floor(x); 42 | int x2 = ceil(x); 43 | int y1 = floor(y); 44 | int y2 = ceil(y); 45 | T dist_x = static_cast(x - x1); 46 | T dist_y = static_cast(y - y1); 47 | T value11 = data[y1 * width + x1]; 48 | T value12 = data[y2 * width + x1]; 49 | T value21 = data[y1 * width + x2]; 50 | T value22 = data[y2 * width + x2]; 51 | T value = (1 - dist_x) * (1 - dist_y) * value11 + 52 | (1 - dist_x) * dist_y * value12 + 53 | dist_x * (1 - dist_y) * value21 + 54 | dist_x * dist_y * value22; 55 | return value; 56 | } 57 | 58 | template 59 | __global__ void DeformablePSROIPoolForwardKernel( 60 | const int count, 61 | const T *bottom_data, 62 | const T spatial_scale, 63 | const int channels, 64 | const int height, const int width, 65 | const int pooled_height, const int pooled_width, 66 | const T *bottom_rois, const T *bottom_trans, 67 | const int no_trans, 68 | const T trans_std, 69 | const int sample_per_part, 70 | const int output_dim, 71 | const int group_size, 72 | const int part_size, 73 | const int num_classes, 74 | const int channels_each_class, 75 | T *top_data, 76 | T *top_count) 77 | { 78 | CUDA_KERNEL_LOOP(index, count) 79 | { 80 | // The output is in order (n, ctop, ph, pw) 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int ctop = (index / pooled_width / pooled_height) % output_dim; 84 | int n = index / pooled_width / pooled_height / output_dim; 85 | 86 | // [start, end) interval for spatial sampling 87 | const T *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 90 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 91 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 92 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 93 | 94 | // Force too small ROIs to be 1x1 95 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 96 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 97 | 98 | // Compute w and h at bottom 99 | T bin_size_h = roi_height / static_cast(pooled_height); 100 | T bin_size_w = roi_width / static_cast(pooled_width); 101 | 102 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 103 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 104 | 105 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 106 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 107 | int class_id = ctop / channels_each_class; 108 | T trans_x = no_trans ? 
static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 109 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 110 | 111 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 112 | wstart += trans_x * roi_width; 113 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 114 | hstart += trans_y * roi_height; 115 | 116 | T sum = 0; 117 | int count = 0; 118 | int gw = floor(static_cast(pw) * group_size / pooled_width); 119 | int gh = floor(static_cast(ph) * group_size / pooled_height); 120 | gw = min(max(gw, 0), group_size - 1); 121 | gh = min(max(gh, 0), group_size - 1); 122 | 123 | const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 124 | for (int ih = 0; ih < sample_per_part; ih++) 125 | { 126 | for (int iw = 0; iw < sample_per_part; iw++) 127 | { 128 | T w = wstart + iw * sub_bin_size_w; 129 | T h = hstart + ih * sub_bin_size_h; 130 | // bilinear interpolation 131 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 132 | { 133 | continue; 134 | } 135 | w = min(max(w, 0.), width - 1.); 136 | h = min(max(h, 0.), height - 1.); 137 | int c = (ctop * group_size + gh) * group_size + gw; 138 | T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); 139 | sum += val; 140 | count++; 141 | } 142 | } 143 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 144 | top_count[index] = count; 145 | } 146 | } 147 | 148 | template 149 | __global__ void DeformablePSROIPoolBackwardAccKernel( 150 | const int count, 151 | const T *top_diff, 152 | const T *top_count, 153 | const int num_rois, 154 | const T spatial_scale, 155 | const int channels, 156 | const int height, const int width, 157 | const int pooled_height, const int pooled_width, 158 | const int output_dim, 159 | T *bottom_data_diff, T *bottom_trans_diff, 160 | const T *bottom_data, 161 | const T *bottom_rois, 162 | const T *bottom_trans, 163 | const int no_trans, 164 | const T trans_std, 165 | const int sample_per_part, 166 | const int group_size, 167 | const int part_size, 168 | const int num_classes, 169 | const int channels_each_class) 170 | { 171 | CUDA_KERNEL_LOOP(index, count) 172 | { 173 | // The output is in order (n, ctop, ph, pw) 174 | int pw = index % pooled_width; 175 | int ph = (index / pooled_width) % pooled_height; 176 | int ctop = (index / pooled_width / pooled_height) % output_dim; 177 | int n = index / pooled_width / pooled_height / output_dim; 178 | 179 | // [start, end) interval for spatial sampling 180 | const T *offset_bottom_rois = bottom_rois + n * 5; 181 | int roi_batch_ind = offset_bottom_rois[0]; 182 | T roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 183 | T roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 184 | T roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 185 | T roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 186 | 187 | // Force too small ROIs to be 1x1 188 | T roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 189 | T roi_height = max(roi_end_h - roi_start_h, 0.1); 190 | 191 | // Compute w and h at bottom 192 | T bin_size_h = roi_height / static_cast(pooled_height); 193 | T bin_size_w = roi_width / static_cast(pooled_width); 194 | 195 | T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 196 | T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 197 | 198 | int part_h = floor(static_cast(ph) / pooled_height * part_size); 199 | int part_w = floor(static_cast(pw) / pooled_width * part_size); 200 | int class_id = ctop / channels_each_class; 201 | T trans_x = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std; 202 | T trans_y = no_trans ? static_cast(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std; 203 | 204 | T wstart = static_cast(pw) * bin_size_w + roi_start_w; 205 | wstart += trans_x * roi_width; 206 | T hstart = static_cast(ph) * bin_size_h + roi_start_h; 207 | hstart += trans_y * roi_height; 208 | 209 | if (top_count[index] <= 0) 210 | { 211 | continue; 212 | } 213 | T diff_val = top_diff[index] / top_count[index]; 214 | const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 215 | T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 216 | int gw = floor(static_cast(pw) * group_size / pooled_width); 217 | int gh = floor(static_cast(ph) * group_size / pooled_height); 218 | gw = min(max(gw, 0), group_size - 1); 219 | gh = min(max(gh, 0), group_size - 1); 220 | 221 | for (int ih = 0; ih < sample_per_part; ih++) 222 | { 223 | for (int iw = 0; iw < sample_per_part; iw++) 224 | { 225 | T w = wstart + iw * sub_bin_size_w; 226 | T h = hstart + ih * sub_bin_size_h; 227 | // bilinear interpolation 228 | if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) 229 | { 230 | continue; 231 | } 232 | w = min(max(w, 0.), width - 1.); 233 | h = min(max(h, 0.), height - 1.); 234 | int c = (ctop * group_size + gh) * group_size + gw; 235 | // backward on feature 236 | int x0 = floor(w); 237 | int x1 = ceil(w); 238 | int y0 = floor(h); 239 | int y1 = ceil(h); 240 | T dist_x = w - x0, dist_y = h - y0; 241 | T q00 = (1 - dist_x) * (1 - dist_y); 242 | T q01 = (1 - dist_x) * dist_y; 243 | T q10 = dist_x * (1 - dist_y); 244 | T q11 = dist_x * dist_y; 245 | int bottom_index_base = c * height * width; 246 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); 247 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); 248 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); 249 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); 250 | 251 | if (no_trans) 252 | { 253 | continue; 254 | } 255 | T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; 256 | T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; 257 | T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; 258 | T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; 259 | T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; 260 | diff_x *= roi_width; 261 | T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std 
* diff_val; 262 | diff_y *= roi_height; 263 | 264 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x); 265 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); 266 | } 267 | } 268 | } 269 | } 270 | 271 | std::tuple 272 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 273 | const at::Tensor &bbox, 274 | const at::Tensor &trans, 275 | const int no_trans, 276 | const float spatial_scale, 277 | const int output_dim, 278 | const int group_size, 279 | const int pooled_size, 280 | const int part_size, 281 | const int sample_per_part, 282 | const float trans_std) 283 | { 284 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 285 | AT_ASSERTM(bbox.type().is_cuda(), "rois must be a CUDA tensor"); 286 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 287 | 288 | const int batch = input.size(0); 289 | const int channels = input.size(1); 290 | const int height = input.size(2); 291 | const int width = input.size(3); 292 | const int channels_trans = no_trans ? 2 : trans.size(1); 293 | const int num_bbox = bbox.size(0); 294 | 295 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 296 | auto pooled_height = pooled_size; 297 | auto pooled_width = pooled_size; 298 | 299 | auto out = at::empty({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 300 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 301 | auto top_count = at::zeros({num_bbox, output_dim, pooled_height, pooled_width}, input.options()); 302 | 303 | const int num_classes = no_trans ? 1 : channels_trans / 2; 304 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 305 | 306 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 307 | 308 | if (out.numel() == 0) 309 | { 310 | THCudaCheck(cudaGetLastError()); 311 | return std::make_tuple(out, top_count); 312 | } 313 | 314 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 315 | dim3 block(512); 316 | 317 | AT_DISPATCH_FLOATING_TYPES(input.type(), "dcn_v2_psroi_pooling_cuda_forward", [&] { 318 | DeformablePSROIPoolForwardKernel<<>>( 319 | out_size, 320 | input.contiguous().data(), 321 | spatial_scale, 322 | channels, 323 | height, width, 324 | pooled_height, 325 | pooled_width, 326 | bbox.contiguous().data(), 327 | trans.contiguous().data(), 328 | no_trans, 329 | trans_std, 330 | sample_per_part, 331 | output_dim, 332 | group_size, 333 | part_size, 334 | num_classes, 335 | channels_each_class, 336 | out.data(), 337 | top_count.data()); 338 | }); 339 | THCudaCheck(cudaGetLastError()); 340 | return std::make_tuple(out, top_count); 341 | } 342 | 343 | std::tuple 344 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 345 | const at::Tensor &input, 346 | const at::Tensor &bbox, 347 | const at::Tensor &trans, 348 | const at::Tensor &top_count, 349 | const int no_trans, 350 | const float spatial_scale, 351 | const int output_dim, 352 | const int group_size, 353 | const int pooled_size, 354 | const int part_size, 355 | const int sample_per_part, 356 | const float trans_std) 357 | { 358 | AT_ASSERTM(out_grad.type().is_cuda(), "out_grad must be a CUDA tensor"); 359 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 360 | AT_ASSERTM(bbox.type().is_cuda(), "bbox must be a CUDA tensor"); 361 | AT_ASSERTM(trans.type().is_cuda(), "trans must be a CUDA tensor"); 362 | AT_ASSERTM(top_count.type().is_cuda(), "top_count must be a CUDA tensor"); 363 | 364 | const int batch = input.size(0); 365 | const int channels = input.size(1); 366 | const int height = input.size(2); 367 | const int width = input.size(3); 368 | const int channels_trans = no_trans ? 2 : trans.size(1); 369 | const int num_bbox = bbox.size(0); 370 | 371 | AT_ASSERTM(channels == output_dim, "input channels and output channels must equal"); 372 | auto pooled_height = pooled_size; 373 | auto pooled_width = pooled_size; 374 | long out_size = num_bbox * output_dim * pooled_height * pooled_width; 375 | const int num_classes = no_trans ? 1 : channels_trans / 2; 376 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 377 | 378 | auto input_grad = at::zeros({batch, channels, height, width}, out_grad.options()); 379 | auto trans_grad = at::zeros_like(trans); 380 | 381 | if (input_grad.numel() == 0) 382 | { 383 | THCudaCheck(cudaGetLastError()); 384 | return std::make_tuple(input_grad, trans_grad); 385 | } 386 | 387 | dim3 grid(std::min(THCCeilDiv(out_size, 512L), 4096L)); 388 | dim3 block(512); 389 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 390 | 391 | AT_DISPATCH_FLOATING_TYPES(out_grad.type(), "dcn_v2_psroi_pooling_cuda_backward", [&] { 392 | DeformablePSROIPoolBackwardAccKernel<<>>( 393 | out_size, 394 | out_grad.contiguous().data(), 395 | top_count.contiguous().data(), 396 | num_bbox, 397 | spatial_scale, 398 | channels, 399 | height, 400 | width, 401 | pooled_height, 402 | pooled_width, 403 | output_dim, 404 | input_grad.contiguous().data(), 405 | trans_grad.contiguous().data(), 406 | input.contiguous().data(), 407 | bbox.contiguous().data(), 408 | trans.contiguous().data(), 409 | no_trans, 410 | trans_std, 411 | sample_per_part, 412 | group_size, 413 | part_size, 414 | num_classes, 415 | channels_each_class); 416 | }); 417 | THCudaCheck(cudaGetLastError()); 418 | return std::make_tuple(input_grad, trans_grad); 419 | } -------------------------------------------------------------------------------- /DCNv2/src/cuda/vision.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | at::Tensor 5 | dcn_v2_cuda_forward(const at::Tensor &input, 6 | const at::Tensor &weight, 7 | const at::Tensor &bias, 8 | const at::Tensor &offset, 9 | const at::Tensor &mask, 10 | const int kernel_h, 11 | const int kernel_w, 12 | const int stride_h, 13 | const int stride_w, 14 | const int pad_h, 15 | const int pad_w, 16 | const int dilation_h, 17 | const int dilation_w, 18 | const int deformable_group); 19 | 20 | std::vector 21 | dcn_v2_cuda_backward(const at::Tensor &input, 22 | const at::Tensor &weight, 23 | const at::Tensor &bias, 24 | const at::Tensor &offset, 25 | const at::Tensor &mask, 26 | const at::Tensor &grad_output, 27 | int kernel_h, int kernel_w, 28 | int stride_h, int stride_w, 29 | int pad_h, int pad_w, 30 | int dilation_h, int dilation_w, 31 | int deformable_group); 32 | 33 | 34 | std::tuple 35 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input, 36 | const at::Tensor &bbox, 37 | const at::Tensor &trans, 38 | const int no_trans, 39 | const float spatial_scale, 40 | const int output_dim, 41 | const int group_size, 42 | const int pooled_size, 43 | const int part_size, 44 | const int sample_per_part, 45 | const float trans_std); 46 | 47 | std::tuple 48 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad, 49 | const at::Tensor &input, 50 | const at::Tensor &bbox, 51 | const at::Tensor &trans, 52 | const at::Tensor &top_count, 53 | const int no_trans, 54 | const float spatial_scale, 55 | const int output_dim, 56 | const int group_size, 57 | const int pooled_size, 58 | const int part_size, 59 | const int sample_per_part, 60 | const float trans_std); -------------------------------------------------------------------------------- /DCNv2/src/dcn_v2.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | at::Tensor 10 | dcn_v2_forward(const at::Tensor &input, 11 | const at::Tensor &weight, 12 | const at::Tensor &bias, 13 | const 
at::Tensor &offset, 14 | const at::Tensor &mask, 15 | const int kernel_h, 16 | const int kernel_w, 17 | const int stride_h, 18 | const int stride_w, 19 | const int pad_h, 20 | const int pad_w, 21 | const int dilation_h, 22 | const int dilation_w, 23 | const int deformable_group) 24 | { 25 | if (input.type().is_cuda()) 26 | { 27 | #ifdef WITH_CUDA 28 | return dcn_v2_cuda_forward(input, weight, bias, offset, mask, 29 | kernel_h, kernel_w, 30 | stride_h, stride_w, 31 | pad_h, pad_w, 32 | dilation_h, dilation_w, 33 | deformable_group); 34 | #else 35 | AT_ERROR("Not compiled with GPU support"); 36 | #endif 37 | } 38 | AT_ERROR("Not implemented on the CPU"); 39 | } 40 | 41 | std::vector 42 | dcn_v2_backward(const at::Tensor &input, 43 | const at::Tensor &weight, 44 | const at::Tensor &bias, 45 | const at::Tensor &offset, 46 | const at::Tensor &mask, 47 | const at::Tensor &grad_output, 48 | int kernel_h, int kernel_w, 49 | int stride_h, int stride_w, 50 | int pad_h, int pad_w, 51 | int dilation_h, int dilation_w, 52 | int deformable_group) 53 | { 54 | if (input.type().is_cuda()) 55 | { 56 | #ifdef WITH_CUDA 57 | return dcn_v2_cuda_backward(input, 58 | weight, 59 | bias, 60 | offset, 61 | mask, 62 | grad_output, 63 | kernel_h, kernel_w, 64 | stride_h, stride_w, 65 | pad_h, pad_w, 66 | dilation_h, dilation_w, 67 | deformable_group); 68 | #else 69 | AT_ERROR("Not compiled with GPU support"); 70 | #endif 71 | } 72 | AT_ERROR("Not implemented on the CPU"); 73 | } 74 | 75 | std::tuple 76 | dcn_v2_psroi_pooling_forward(const at::Tensor &input, 77 | const at::Tensor &bbox, 78 | const at::Tensor &trans, 79 | const int no_trans, 80 | const float spatial_scale, 81 | const int output_dim, 82 | const int group_size, 83 | const int pooled_size, 84 | const int part_size, 85 | const int sample_per_part, 86 | const float trans_std) 87 | { 88 | if (input.type().is_cuda()) 89 | { 90 | #ifdef WITH_CUDA 91 | return dcn_v2_psroi_pooling_cuda_forward(input, 92 | bbox, 93 | trans, 94 | no_trans, 95 | spatial_scale, 96 | output_dim, 97 | group_size, 98 | pooled_size, 99 | part_size, 100 | sample_per_part, 101 | trans_std); 102 | #else 103 | AT_ERROR("Not compiled with GPU support"); 104 | #endif 105 | } 106 | AT_ERROR("Not implemented on the CPU"); 107 | } 108 | 109 | std::tuple 110 | dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad, 111 | const at::Tensor &input, 112 | const at::Tensor &bbox, 113 | const at::Tensor &trans, 114 | const at::Tensor &top_count, 115 | const int no_trans, 116 | const float spatial_scale, 117 | const int output_dim, 118 | const int group_size, 119 | const int pooled_size, 120 | const int part_size, 121 | const int sample_per_part, 122 | const float trans_std) 123 | { 124 | if (input.type().is_cuda()) 125 | { 126 | #ifdef WITH_CUDA 127 | return dcn_v2_psroi_pooling_cuda_backward(out_grad, 128 | input, 129 | bbox, 130 | trans, 131 | top_count, 132 | no_trans, 133 | spatial_scale, 134 | output_dim, 135 | group_size, 136 | pooled_size, 137 | part_size, 138 | sample_per_part, 139 | trans_std); 140 | #else 141 | AT_ERROR("Not compiled with GPU support"); 142 | #endif 143 | } 144 | AT_ERROR("Not implemented on the CPU"); 145 | } 146 | 147 | at::Tensor 148 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 149 | const at::Tensor &weight, 150 | const at::Tensor &bias, 151 | const at::Tensor &offset, 152 | const at::Tensor &mask, 153 | const int64_t kernel_h, 154 | const int64_t kernel_w, 155 | const int64_t stride_h, 156 | const int64_t stride_w, 157 | const int64_t pad_h, 158 | const int64_t 
pad_w, 159 | const int64_t dilation_h, 160 | const int64_t dilation_w, 161 | const int64_t deformable_group); -------------------------------------------------------------------------------- /DCNv2/src/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dcn_v2.h" 3 | #include 4 | 5 | // static auto registry = 6 | // torch::jit::RegisterOperators("my_ops::dcn_v2_forward", &dcn_v2_forward) 7 | // .op("my_ops::dcn_v2_backward", &dcn_v2_backward) 8 | // .op("my_ops::dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward) 9 | // .op("my_ops::dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward); 10 | 11 | static auto registry = 12 | torch::jit::RegisterOperators("my_ops::dcn_v2_cuda_forward_v2", &dcn_v2_cuda_forward_v2); 13 | 14 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) 15 | { 16 | m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward"); 17 | m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward"); 18 | m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward"); 19 | m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward"); 20 | } 21 | -------------------------------------------------------------------------------- /DCNv2/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import print_function 4 | from __future__ import division 5 | 6 | import time 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import gradcheck 10 | 11 | from dcn_v2 import dcn_v2_conv, DCNv2, DCN 12 | from dcn_v2 import dcn_v2_pooling, DCNv2Pooling, DCNPooling 13 | 14 | deformable_groups = 1 15 | N, inC, inH, inW = 2, 2, 4, 4 16 | outC = 2 17 | kH, kW = 3, 3 18 | 19 | 20 | def conv_identify(weight, bias): 21 | weight.data.zero_() 22 | bias.data.zero_() 23 | o, i, h, w = weight.shape 24 | y = h//2 25 | x = w//2 26 | for p in range(i): 27 | for q in range(o): 28 | if p == q: 29 | weight.data[q, p, y, x] = 1.0 30 | 31 | 32 | def check_zero_offset(): 33 | conv_offset = nn.Conv2d(inC, deformable_groups * 2 * kH * kW, 34 | kernel_size=(kH, kW), 35 | stride=(1, 1), 36 | padding=(1, 1), 37 | bias=True).cuda() 38 | 39 | conv_mask = nn.Conv2d(inC, deformable_groups * 1 * kH * kW, 40 | kernel_size=(kH, kW), 41 | stride=(1, 1), 42 | padding=(1, 1), 43 | bias=True).cuda() 44 | 45 | dcn_v2 = DCNv2(inC, outC, (kH, kW), 46 | stride=1, padding=1, dilation=1, 47 | deformable_groups=deformable_groups).cuda() 48 | 49 | conv_offset.weight.data.zero_() 50 | conv_offset.bias.data.zero_() 51 | conv_mask.weight.data.zero_() 52 | conv_mask.bias.data.zero_() 53 | conv_identify(dcn_v2.weight, dcn_v2.bias) 54 | 55 | input = torch.randn(N, inC, inH, inW).cuda() 56 | offset = conv_offset(input) 57 | mask = conv_mask(input) 58 | mask = torch.sigmoid(mask) 59 | output = dcn_v2(input, offset, mask) 60 | output *= 2 61 | d = (input - output).abs().max() 62 | if d < 1e-10: 63 | print('Zero offset passed') 64 | else: 65 | print('Zero offset failed') 66 | print(input) 67 | print(output) 68 | 69 | def check_gradient_dconv(): 70 | 71 | input = torch.rand(N, inC, inH, inW).cuda() * 0.01 72 | input.requires_grad = True 73 | 74 | offset = torch.randn(N, deformable_groups * 2 * kW * kH, inH, inW).cuda() * 2 75 | # offset.data.zero_() 76 | # offset.data -= 0.5 77 | offset.requires_grad = True 78 | 79 | mask = torch.rand(N, deformable_groups * 1 * kW * kH, 
inH, inW).cuda() 80 | # mask.data.zero_() 81 | mask.requires_grad = True 82 | mask = torch.sigmoid(mask) 83 | 84 | weight = torch.randn(outC, inC, kH, kW).cuda() 85 | weight.requires_grad = True 86 | 87 | bias = torch.rand(outC).cuda() 88 | bias.requires_grad = True 89 | 90 | stride = 1 91 | padding = 1 92 | dilation = 1 93 | 94 | print('check_gradient_dconv: ', 95 | gradcheck(dcn_v2_conv, (input, offset, mask, weight, bias, 96 | stride, padding, dilation, deformable_groups), 97 | eps=1e-3, atol=1e-4, rtol=1e-2)) 98 | 99 | 100 | def check_pooling_zero_offset(): 101 | 102 | input = torch.randn(2, 16, 64, 64).cuda().zero_() 103 | input[0, :, 16:26, 16:26] = 1. 104 | input[1, :, 10:20, 20:30] = 2. 105 | rois = torch.tensor([ 106 | [0, 65, 65, 103, 103], 107 | [1, 81, 41, 119, 79], 108 | ]).cuda().float() 109 | pooling = DCNv2Pooling(spatial_scale=1.0 / 4, 110 | pooled_size=7, 111 | output_dim=16, 112 | no_trans=True, 113 | group_size=1, 114 | trans_std=0.0).cuda() 115 | 116 | out = pooling(input, rois, input.new()) 117 | s = ', '.join(['%f' % out[i, :, :, :].mean().item() 118 | for i in range(rois.shape[0])]) 119 | print(s) 120 | 121 | dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, 122 | pooled_size=7, 123 | output_dim=16, 124 | no_trans=False, 125 | group_size=1, 126 | trans_std=0.0).cuda() 127 | offset = torch.randn(20, 2, 7, 7).cuda().zero_() 128 | dout = dpooling(input, rois, offset) 129 | s = ', '.join(['%f' % dout[i, :, :, :].mean().item() 130 | for i in range(rois.shape[0])]) 131 | print(s) 132 | 133 | 134 | def check_gradient_dpooling(): 135 | input = torch.randn(2, 3, 5, 5).cuda() * 0.01 136 | N = 4 137 | batch_inds = torch.randint(2, (N, 1)).cuda().float() 138 | x = torch.rand((N, 1)).cuda().float() * 15 139 | y = torch.rand((N, 1)).cuda().float() * 15 140 | w = torch.rand((N, 1)).cuda().float() * 10 141 | h = torch.rand((N, 1)).cuda().float() * 10 142 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 143 | offset = torch.randn(N, 2, 3, 3).cuda() 144 | input.requires_grad = True 145 | offset.requires_grad = True 146 | 147 | spatial_scale = 1.0 / 4 148 | pooled_size = 3 149 | output_dim = 3 150 | no_trans = 0 151 | group_size = 1 152 | trans_std = 0.0 153 | sample_per_part = 4 154 | part_size = pooled_size 155 | 156 | print('check_gradient_dpooling:', 157 | gradcheck(dcn_v2_pooling, (input, rois, offset, 158 | spatial_scale, 159 | pooled_size, 160 | output_dim, 161 | no_trans, 162 | group_size, 163 | part_size, 164 | sample_per_part, 165 | trans_std), 166 | eps=1e-4)) 167 | 168 | 169 | def example_dconv(): 170 | input = torch.randn(2, 64, 128, 128).cuda() 171 | # wrap all things (offset and mask) in DCN 172 | dcn = DCN(64, 64, kernel_size=(3, 3), stride=1, 173 | padding=1, deformable_groups=2).cuda() 174 | # print(dcn.weight.shape, input.shape) 175 | output = dcn(input) 176 | targert = output.new(*output.size()) 177 | targert.data.uniform_(-0.01, 0.01) 178 | error = (targert - output).mean() 179 | error.backward() 180 | print(output.shape) 181 | 182 | 183 | def example_dpooling(): 184 | input = torch.randn(2, 32, 64, 64).cuda() 185 | batch_inds = torch.randint(2, (20, 1)).cuda().float() 186 | x = torch.randint(256, (20, 1)).cuda().float() 187 | y = torch.randint(256, (20, 1)).cuda().float() 188 | w = torch.randint(64, (20, 1)).cuda().float() 189 | h = torch.randint(64, (20, 1)).cuda().float() 190 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 191 | offset = torch.randn(20, 2, 7, 7).cuda() 192 | input.requires_grad = True 193 | offset.requires_grad = True 194 | 
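    # Each row of `rois` built above is [batch_index, x1, y1, x2, y2] in input-image
    # coordinates; the CUDA kernel scales the box by spatial_scale (1/4 here) onto the
    # 64x64 feature map. With no_trans=True the offset tensor is ignored (trans_x and
    # trans_y are forced to 0 in the kernel), so `pooling` behaves like plain
    # position-sensitive ROI pooling, while `dpooling` additionally shifts each bin by
    # the trans_std-scaled offsets. Both calls return tensors of shape
    # (num_rois, output_dim, pooled_size, pooled_size), i.e. (20, 32, 7, 7) here.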
195 | # normal roi_align 196 | pooling = DCNv2Pooling(spatial_scale=1.0 / 4, 197 | pooled_size=7, 198 | output_dim=32, 199 | no_trans=True, 200 | group_size=1, 201 | trans_std=0.1).cuda() 202 | 203 | # deformable pooling 204 | dpooling = DCNv2Pooling(spatial_scale=1.0 / 4, 205 | pooled_size=7, 206 | output_dim=32, 207 | no_trans=False, 208 | group_size=1, 209 | trans_std=0.1).cuda() 210 | 211 | out = pooling(input, rois, offset) 212 | dout = dpooling(input, rois, offset) 213 | print(out.shape) 214 | print(dout.shape) 215 | 216 | target_out = out.new(*out.size()) 217 | target_out.data.uniform_(-0.01, 0.01) 218 | target_dout = dout.new(*dout.size()) 219 | target_dout.data.uniform_(-0.01, 0.01) 220 | e = (target_out - out).mean() 221 | e.backward() 222 | e = (target_dout - dout).mean() 223 | e.backward() 224 | 225 | 226 | def example_mdpooling(): 227 | input = torch.randn(2, 32, 64, 64).cuda() 228 | input.requires_grad = True 229 | batch_inds = torch.randint(2, (20, 1)).cuda().float() 230 | x = torch.randint(256, (20, 1)).cuda().float() 231 | y = torch.randint(256, (20, 1)).cuda().float() 232 | w = torch.randint(64, (20, 1)).cuda().float() 233 | h = torch.randint(64, (20, 1)).cuda().float() 234 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1) 235 | 236 | # mdformable pooling (V2) 237 | dpooling = DCNPooling(spatial_scale=1.0 / 4, 238 | pooled_size=7, 239 | output_dim=32, 240 | no_trans=False, 241 | group_size=1, 242 | trans_std=0.1, 243 | deform_fc_dim=1024).cuda() 244 | 245 | dout = dpooling(input, rois) 246 | target = dout.new(*dout.size()) 247 | target.data.uniform_(-0.1, 0.1) 248 | error = (target - dout).mean() 249 | error.backward() 250 | print(dout.shape) 251 | 252 | 253 | if __name__ == '__main__': 254 | 255 | example_dconv() 256 | example_dpooling() 257 | example_mdpooling() 258 | 259 | check_pooling_zero_offset() 260 | # zero offset check 261 | if inC == outC: 262 | check_zero_offset() 263 | 264 | check_gradient_dpooling() 265 | check_gradient_dconv() 266 | # """ 267 | # ****** Note: backward is not reentrant error may not be a serious problem, 268 | # ****** since the max error is less than 1e-7, 269 | # ****** Still looking for what trigger this problem 270 | # """ 271 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, Charles Shang 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Convert CenterNet pytorch model to Torch Script for LibTorch 2 | can convert dla34 official model 3 | 4 | ## C PLUS PLUS CALL 5 | refer to the dcn_cpp_plugin -------------------------------------------------------------------------------- /dcn_cpp_plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1 FATAL_ERROR) 2 | project(dcn_v2_cuda_forward_v2) 3 | 4 | #add_compile_options(-std=c++11) 5 | 6 | #add_definitions(-D WITH_CUDA) 7 | 8 | set(Torch_DIR /usr/local/libtorch/share/cmake/Torch) 9 | find_package(Torch REQUIRED) 10 | 11 | include_directories(/usr/include/python3.5m) 12 | include_directories(/usr/include/python2.7/) 13 | 14 | 15 | #include_directories(/usr/local/cuda/include) 16 | #link_directories(/usr/local/cuda/lib64) 17 | 18 | set(CUDA_HOST_COMPILATION_CPP ON) 19 | #set(TORCH_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__") 20 | 21 | #set(CUDA_NVCC_FLAGS -std=c++11 22 | #-DCUDA_HAS_FP16=1 23 | #-D__CUDA_NO_HALF_OPERATORS__ 24 | #-D__CUDA_NO_HALF_CONVERSIONS__ 25 | #-D__CUDA_NO_HALF2_OPERATORS__) 26 | 27 | set(CUDA_NVCC_FLAGS -std=c++11 28 | -D__CUDA_NO_HALF_OPERATORS__ ) 29 | 30 | #set(CUDA_NVCC_FLAGS -std=c++11 -Xcompiler -fexceptions -Xcompiler -fPIC 31 | #-gencode arch=compute_30,code=sm_30 32 | #-gencode arch=compute_35,code=sm_35 33 | #-gencode arch=compute_50,code=sm_50 34 | #-gencode arch=compute_60,code=sm_60 35 | #-gencode arch=compute_60,code=compute_60) 36 | 37 | 38 | cuda_add_library(${PROJECT_NAME} SHARED 39 | vision.cpp 40 | dcn_v2_cuda.cu 41 | dcn_v2_im2col_cuda.cu 42 | ) 43 | 44 | # Enable C++11 45 | target_compile_features(${PROJECT_NAME} PRIVATE cxx_range_for) 46 | # Link against LibTorch 47 | target_link_libraries(${PROJECT_NAME} "${TORCH_LIBRARIES}") 48 | 49 | install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION lib) 50 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/README.md: -------------------------------------------------------------------------------- 1 | # DCN C PLUS CPLUS PLUGIN 2 | 3 | ## usage 4 | void handle = dlopen("libdcn_v2_cuda_forward_v2.so", RTLD_LAZY); 5 | 6 | int gpu_id = 0; 7 | torch::jit::script::Module module = 8 | torch::jit::load("centernet.pt", torch::Device(torch::DeviceType::CUDA, gpu_id)); 9 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef DCN_V2_H 3 | #define DCN_V2_H 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" 8 | { 9 | #endif 10 | 11 | 12 | at::Tensor 13 | dcn_v2_cuda_forward(const at::Tensor &input, 14 | const at::Tensor &weight, 15 | const at::Tensor &bias, 16 | const at::Tensor &offset, 17 | const at::Tensor &mask, 18 | 
const int64_t kernel_h, 19 | const int64_t kernel_w, 20 | const int64_t stride_h, 21 | const int64_t stride_w, 22 | const int64_t pad_h, 23 | const int64_t pad_w, 24 | const int64_t dilation_h, 25 | const int64_t dilation_w, 26 | const int64_t deformable_group); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include "dcn_v2.h" 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | //extern THCState *state; 12 | 13 | //THCState *state; 14 | 15 | // THCState *state = at::globalContext().thc_state; 16 | 17 | THCState *state = at::globalContext().lazyInitCUDA(); 18 | //THCState *state = at::globalContext().getTHCState(); 19 | 20 | __global__ void createBatchGemmBuffer(const float **input_b, float **output_b, 21 | float **columns_b, const float **ones_b, 22 | const float **weight_b, const float **bias_b, 23 | float *input, float *output, 24 | float *columns, float *ones, 25 | float *weight, float *bias, 26 | const int input_stride, const int output_stride, 27 | const int columns_stride, const int ones_stride, 28 | const int num_batches) 29 | { 30 | const int idx = blockIdx.x * blockDim.x + threadIdx.x; 31 | if (idx < num_batches) 32 | { 33 | input_b[idx] = input + idx * input_stride; 34 | output_b[idx] = output + idx * output_stride; 35 | columns_b[idx] = columns + idx * columns_stride; 36 | ones_b[idx] = ones + idx * ones_stride; 37 | // share weights and bias within a Mini-Batch 38 | weight_b[idx] = weight; 39 | bias_b[idx] = bias; 40 | } 41 | } 42 | 43 | at::Tensor 44 | dcn_v2_cuda_forward(const at::Tensor &input, 45 | const at::Tensor &weight, 46 | const at::Tensor &bias, 47 | const at::Tensor &offset, 48 | const at::Tensor &mask, 49 | const int64_t kernel_h, 50 | const int64_t kernel_w, 51 | const int64_t stride_h, 52 | const int64_t stride_w, 53 | const int64_t pad_h, 54 | const int64_t pad_w, 55 | const int64_t dilation_h, 56 | const int64_t dilation_w, 57 | const int64_t deformable_group) 58 | { 59 | using scalar_t = float; 60 | //THCAssertSameGPU(THCudaTensor_checkGPU(state, 5, input, weight, bias, offset, mask)); 61 | AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); 62 | AT_ASSERTM(weight.type().is_cuda(), "weight must be a CUDA tensor"); 63 | AT_ASSERTM(bias.type().is_cuda(), "bias must be a CUDA tensor"); 64 | AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); 65 | AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); 66 | const int batch = input.size(0); 67 | const int channels = input.size(1); 68 | const int height = input.size(2); 69 | const int width = input.size(3); 70 | 71 | const int channels_out = weight.size(0); 72 | const int channels_kernel = weight.size(1); 73 | const int kernel_h_ = weight.size(2); 74 | const int kernel_w_ = weight.size(3); 75 | 76 | AT_ASSERTM(kernel_h_ == kernel_h && kernel_w_ == kernel_w, 77 | "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", kernel_h_, kernel_w, kernel_h_, kernel_w_); 78 | 79 | AT_ASSERTM(channels == channels_kernel, 80 | "Input shape and kernel channels wont match: (%d vs %d).", channels, channels_kernel); 81 | 82 | const int height_out = (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; 83 | const int width_out = (width + 2 * pad_w - (dilation_w * (kernel_w - 1) 
+ 1)) / stride_w + 1; 84 | 85 | auto ones = at::ones({batch, height_out, width_out}, input.options()); 86 | auto columns = at::empty({batch, channels * kernel_h * kernel_w, 1 * height_out * width_out}, input.options()); 87 | auto output = at::empty({batch, channels_out, height_out, width_out}, input.options()); 88 | 89 | int matrices_size = batch * sizeof(float *); 90 | 91 | auto input_b = static_cast(THCudaMalloc(state, matrices_size)); 92 | auto output_b = static_cast(THCudaMalloc(state, matrices_size)); 93 | auto columns_b = static_cast(THCudaMalloc(state, matrices_size)); 94 | auto ones_b = static_cast(THCudaMalloc(state, matrices_size)); 95 | auto weight_b = static_cast(THCudaMalloc(state, matrices_size)); 96 | auto bias_b = static_cast(THCudaMalloc(state, matrices_size)); 97 | 98 | const int block = 128; 99 | const int grid = (batch + block - 1) / block; 100 | 101 | createBatchGemmBuffer<<>>( 102 | (const float**)input_b, output_b, 103 | columns_b, ones_b, 104 | weight_b, bias_b, 105 | input.data(), 106 | output.data(), 107 | columns.data(), 108 | ones.data(), 109 | weight.data(), 110 | bias.data(), 111 | channels * width * height, 112 | channels_out * width_out * height_out, 113 | channels * kernel_h * kernel_w * height_out * width_out, 114 | height_out * width_out, 115 | batch); 116 | 117 | long m_ = channels_out; 118 | long n_ = height_out * width_out; 119 | long k_ = 1; 120 | THCudaBlas_SgemmBatched(state, 121 | 't', 122 | 'n', 123 | n_, 124 | m_, 125 | k_, 126 | 1.0f, 127 | ones_b, k_, 128 | bias_b, k_, 129 | 0.0f, 130 | output_b, n_, 131 | batch); 132 | 133 | modulated_deformable_im2col_cuda(THCState_getCurrentStream(state), 134 | input.data(), 135 | offset.data(), 136 | mask.data(), 137 | batch, channels, height, width, 138 | height_out, width_out, kernel_h, kernel_w, 139 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, 140 | deformable_group, 141 | columns.data()); 142 | 143 | long m = channels_out; 144 | long n = height_out * width_out; 145 | long k = channels * kernel_h * kernel_w; 146 | THCudaBlas_SgemmBatched(state, 147 | 'n', 148 | 'n', 149 | n, 150 | m, 151 | k, 152 | 1.0f, 153 | (const float **)columns_b, n, 154 | weight_b, k, 155 | 1.0f, 156 | output_b, n, 157 | batch); 158 | 159 | THCudaFree(state, input_b); 160 | THCudaFree(state, output_b); 161 | THCudaFree(state, columns_b); 162 | THCudaFree(state, ones_b); 163 | THCudaFree(state, weight_b); 164 | THCudaFree(state, bias_b); 165 | return output; 166 | } 167 | 168 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2_im2col_cuda.cu: -------------------------------------------------------------------------------- 1 | #include "dcn_v2_im2col_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | #define CUDA_KERNEL_LOOP(i, n) \ 14 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 15 | i < (n); \ 16 | i += blockDim.x * gridDim.x) 17 | 18 | const int CUDA_NUM_THREADS = 1024; 19 | inline int GET_BLOCKS(const int N) 20 | { 21 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 22 | } 23 | 24 | 25 | __device__ float dmcn_im2col_bilinear(const float *bottom_data, const int data_width, 26 | const int height, const int width, float h, float w) 27 | { 28 | int h_low = floor(h); 29 | int w_low = floor(w); 30 | int h_high = h_low + 1; 31 | int w_high = w_low + 1; 32 | 33 | float lh = h - h_low; 34 | float lw = w - w_low; 35 | float hh = 1 - lh, hw = 1 - lw; 36 | 37 | float v1 = 0; 
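// v1..v4 are the four neighbouring pixels (top-left, top-right, bottom-left, bottom-right);
// samples that fall outside the feature map keep their initial value of 0.
// The returned value is the bilinear blend w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 with
// weights hh*hw, hh*lw, lh*hw, lh*lw. For example, h = 2.3, w = 4.6 gives lh = 0.3,
// lw = 0.6, so the top-left neighbour (h_low = 2, w_low = 4) is weighted by 0.7 * 0.4 = 0.28.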
38 | if (h_low >= 0 && w_low >= 0) 39 | v1 = bottom_data[h_low * data_width + w_low]; 40 | float v2 = 0; 41 | if (h_low >= 0 && w_high <= width - 1) 42 | v2 = bottom_data[h_low * data_width + w_high]; 43 | float v3 = 0; 44 | if (h_high <= height - 1 && w_low >= 0) 45 | v3 = bottom_data[h_high * data_width + w_low]; 46 | float v4 = 0; 47 | if (h_high <= height - 1 && w_high <= width - 1) 48 | v4 = bottom_data[h_high * data_width + w_high]; 49 | 50 | float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 51 | 52 | float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 53 | return val; 54 | } 55 | __device__ float dmcn_get_coordinate_weight(float argmax_h, float argmax_w, 56 | const int height, const int width, const float *im_data, 57 | const int data_width, const int bp_dir) 58 | { 59 | if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) 60 | { 61 | //empty 62 | return 0; 63 | } 64 | 65 | int argmax_h_low = floor(argmax_h); 66 | int argmax_w_low = floor(argmax_w); 67 | int argmax_h_high = argmax_h_low + 1; 68 | int argmax_w_high = argmax_w_low + 1; 69 | 70 | float weight = 0; 71 | 72 | if (bp_dir == 0) 73 | { 74 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 75 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 76 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 77 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 78 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 79 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 80 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 81 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 82 | } 83 | else if (bp_dir == 1) 84 | { 85 | if (argmax_h_low >= 0 && argmax_w_low >= 0) 86 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 87 | if (argmax_h_low >= 0 && argmax_w_high <= width - 1) 88 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 89 | if (argmax_h_high <= height - 1 && argmax_w_low >= 0) 90 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 91 | if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) 92 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 93 | } 94 | 95 | return weight; 96 | } 97 | 98 | __global__ void modulated_deformable_im2col_gpu_kernel(const int n, 99 | const float *data_im, const float *data_offset, const float *data_mask, 100 | const int height, const int width, const int kernel_h, const int kernel_w, 101 | const int pad_h, const int pad_w, 102 | const int stride_h, const int stride_w, 103 | const int dilation_h, const int dilation_w, 104 | const int channel_per_deformable_group, 105 | const int batch_size, const int num_channels, const int deformable_group, 106 | const int height_col, const int width_col, 107 | float *data_col) 108 | { 109 | // launch channels * batch_size * height_col * width_col cores 110 | CUDA_KERNEL_LOOP(index, n) 111 | { 112 | // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow) 113 | // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis 114 | 115 | // index index of output matrix 116 | const int w_col = index % width_col; 117 | const int h_col = (index / width_col) % height_col; 118 | // const int 
b_col = (index / width_col / height_col) % batch_size; 119 | const int b_col = (index / width_col / height_col / num_channels) % batch_size; 120 | // const int c_im = (index / width_col / height_col) / batch_size; 121 | const int c_im = (index / width_col / height_col) % num_channels; 122 | // const int c_col = c_im * kernel_h * kernel_w; 123 | const int c_col = c_im * kernel_h * kernel_w; 124 | 125 | // compute deformable group index 126 | const int deformable_group_index = c_im / channel_per_deformable_group; 127 | 128 | const int h_in = h_col * stride_h - pad_h; 129 | const int w_in = w_col * stride_w - pad_w; 130 | 131 | // float *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; 132 | float *data_col_ptr = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col; 133 | //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; 134 | const float *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; 135 | const float *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 136 | 137 | const float *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; 138 | 139 | for (int i = 0; i < kernel_h; ++i) 140 | { 141 | for (int j = 0; j < kernel_w; ++j) 142 | { 143 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 144 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 145 | const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; 146 | const float offset_h = data_offset_ptr[data_offset_h_ptr]; 147 | const float offset_w = data_offset_ptr[data_offset_w_ptr]; 148 | const float mask = data_mask_ptr[data_mask_hw_ptr]; 149 | float val = static_cast(0); 150 | const float h_im = h_in + i * dilation_h + offset_h; 151 | const float w_im = w_in + j * dilation_w + offset_w; 152 | //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 153 | if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) 154 | { 155 | //const float map_h = i * dilation_h + offset_h; 156 | //const float map_w = j * dilation_w + offset_w; 157 | //const int cur_height = height - h_in; 158 | //const int cur_width = width - w_in; 159 | //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 160 | val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); 161 | } 162 | *data_col_ptr = val * mask; 163 | // data_col_ptr += batch_size * height_col * width_col; 164 | data_col_ptr += height_col * width_col; 165 | } 166 | } 167 | } 168 | } 169 | 170 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 171 | const float* data_im, const float* data_offset, const float* data_mask, 172 | const int batch_size, const int channels, const int height_im, const int width_im, 173 | const int height_col, const int width_col, const int kernel_h, const int kernel_w, 174 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 175 | const int dilation_h, const int dilation_w, 176 | const int deformable_group, float* data_col) { 177 | // num_axes should be smaller than block size 178 | const int channel_per_deformable_group = channels / deformable_group; 179 | const int num_kernels = channels * batch_size * height_col * 
width_col; 180 | modulated_deformable_im2col_gpu_kernel 181 | <<>>( 183 | num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, 184 | pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, 185 | batch_size, channels, deformable_group, height_col, width_col, data_col); 186 | 187 | cudaError_t err = cudaGetLastError(); 188 | if (err != cudaSuccess) 189 | { 190 | printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); 191 | } 192 | 193 | } -------------------------------------------------------------------------------- /dcn_cpp_plugin/dcn_v2_im2col_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef DCN_V2_IM2COL_CUDA 2 | #define DCN_V2_IM2COL_CUDA 3 | 4 | #ifdef __cplusplus 5 | extern "C" 6 | { 7 | #endif 8 | 9 | void modulated_deformable_im2col_cuda(cudaStream_t stream, 10 | const float *data_im, const float *data_offset, const float *data_mask, 11 | const int batch_size, const int channels, const int height_im, const int width_im, 12 | const int height_col, const int width_col, const int kernel_h, const int kenerl_w, 13 | const int pad_h, const int pad_w, const int stride_h, const int stride_w, 14 | const int dilation_h, const int dilation_w, 15 | const int deformable_group, float *data_col); 16 | 17 | #ifdef __cplusplus 18 | } 19 | #endif 20 | 21 | #endif -------------------------------------------------------------------------------- /dcn_cpp_plugin/vision.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dcn_v2.h" 3 | #include 4 | #include "dcn_v2.h" 5 | 6 | at::Tensor 7 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 8 | const at::Tensor &weight, 9 | const at::Tensor &bias, 10 | const at::Tensor &offset, 11 | const at::Tensor &mask, 12 | const int64_t kernel_h, 13 | const int64_t kernel_w, 14 | const int64_t stride_h, 15 | const int64_t stride_w, 16 | const int64_t pad_h, 17 | const int64_t pad_w, 18 | const int64_t dilation_h, 19 | const int64_t dilation_w, 20 | const int64_t deformable_group) 21 | { 22 | return dcn_v2_cuda_forward(input, 23 | weight, 24 | bias, 25 | offset, 26 | mask, 27 | kernel_h, 28 | kernel_w, 29 | stride_h, 30 | stride_w, 31 | pad_h, 32 | pad_w, 33 | dilation_h, 34 | dilation_w, 35 | deformable_group); 36 | } 37 | 38 | static auto registry = 39 | torch::jit::RegisterOperators("my_ops::dcn_v2_cuda_forward_v2", &dcn_v2_cuda_forward_v2); 40 | -------------------------------------------------------------------------------- /dcn_cpp_plugin/vision.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include 4 | 5 | at::Tensor 6 | dcn_v2_cuda_forward_v2(const at::Tensor &input, 7 | const at::Tensor &weight, 8 | const at::Tensor &bias, 9 | const at::Tensor &offset, 10 | const at::Tensor &mask, 11 | const int64_t kernel_h, 12 | const int64_t kernel_w, 13 | const int64_t stride_h, 14 | const int64_t stride_w, 15 | const int64_t pad_h, 16 | const int64_t pad_w, 17 | const int64_t dilation_h, 18 | const int64_t dilation_w, 19 | const int64_t deformable_group); 20 | 21 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from model import create_model, load_model 4 | import torch 5 | 6 | if __name__ == '__main__': 7 | num_classes = 80 8 | head_conv = 256 9 | heads = {'hm': num_classes, 10 | 
'wh': 2 , 11 | 'reg': 2} 12 | 13 | load_model_path = 'ctdet_coco_dla_2x.pth' 14 | save_script_pt = 'centernet.pt' 15 | device = 0 16 | 17 | model = create_model('dla_34', heads, head_conv) 18 | model = load_model(model, load_model_path) 19 | model = model.to(device) 20 | model.eval() 21 | 22 | input_var = torch.zeros([1, 3, 512, 512], dtype=torch.float32).cuda() 23 | 24 | traced_script_module = torch.jit.trace(model, input_var) 25 | traced_script_module.save(save_script_pt) 26 | traced_script_module = torch.jit.load(save_script_pt) 27 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from pose_dla_dcn import get_pose_net as get_model 4 | 5 | 6 | def create_model(arch, heads, head_conv): 7 | num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0 8 | arch = arch[:arch.find('_')] if '_' in arch else arch 9 | model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv) 10 | return model 11 | 12 | 13 | def load_model(model, model_path, optimizer=None, resume=False, 14 | lr=None, lr_step=None): 15 | start_epoch = 0 16 | checkpoint = torch.load( 17 | model_path, map_location=lambda storage, loc: storage) 18 | print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch'])) 19 | state_dict_ = checkpoint['state_dict'] 20 | state_dict = {} 21 | 22 | # convert data_parallal to model 23 | for k in state_dict_: 24 | if k.startswith('module') and not k.startswith('module_list'): 25 | state_dict[k[7:]] = state_dict_[k] 26 | else: 27 | state_dict[k] = state_dict_[k] 28 | model_state_dict = model.state_dict() 29 | 30 | # check loaded parameters and created model parameters 31 | for k in state_dict: 32 | if k in model_state_dict: 33 | if state_dict[k].shape != model_state_dict[k].shape: 34 | print('Skip loading parameter {}, required shape{}, ' 35 | 'loaded shape{}.'.format( 36 | k, model_state_dict[k].shape, state_dict[k].shape)) 37 | state_dict[k] = model_state_dict[k] 38 | else: 39 | print('Drop parameter {}.'.format(k)) 40 | for k in model_state_dict: 41 | if not (k in state_dict): 42 | print('No param {}.'.format(k)) 43 | state_dict[k] = model_state_dict[k] 44 | model.load_state_dict(state_dict, strict=False) 45 | 46 | # resume optimizer parameters 47 | if optimizer is not None and resume: 48 | if 'optimizer' in checkpoint: 49 | optimizer.load_state_dict(checkpoint['optimizer']) 50 | start_epoch = checkpoint['epoch'] 51 | start_lr = lr 52 | for step in lr_step: 53 | if start_epoch >= step: 54 | start_lr *= 0.1 55 | for param_group in optimizer.param_groups: 56 | param_group['lr'] = start_lr 57 | print('Resumed optimizer with start lr', start_lr) 58 | else: 59 | print('No optimizer parameters in checkpoint.') 60 | if optimizer is not None: 61 | return model, optimizer, start_epoch 62 | else: 63 | return model 64 | -------------------------------------------------------------------------------- /pose_dla_dcn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import math 7 | import logging 8 | import numpy as np 9 | from os.path import join 10 | 11 | import torch 12 | from torch import nn 13 | import torch.nn.functional as F 14 | import torch.utils.model_zoo as model_zoo 15 | 16 | from DCNv2.dcn_v2 import DCN 17 | 18 | BN_MOMENTUM = 0.1 19 | logger = 
logging.getLogger(__name__) 20 | 21 | def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): 22 | return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) 23 | 24 | 25 | def conv3x3(in_planes, out_planes, stride=1): 26 | "3x3 convolution with padding" 27 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 28 | padding=1, bias=False) 29 | 30 | 31 | class BasicBlock(nn.Module): 32 | def __init__(self, inplanes, planes, stride=1, dilation=1): 33 | super(BasicBlock, self).__init__() 34 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, 35 | stride=stride, padding=dilation, 36 | bias=False, dilation=dilation) 37 | self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 38 | self.relu = nn.ReLU(inplace=True) 39 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, 40 | stride=1, padding=dilation, 41 | bias=False, dilation=dilation) 42 | self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 43 | self.stride = stride 44 | 45 | def forward(self, x, residual=None): 46 | if residual is None: 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 2 64 | 65 | def __init__(self, inplanes, planes, stride=1, dilation=1): 66 | super(Bottleneck, self).__init__() 67 | expansion = Bottleneck.expansion 68 | bottle_planes = planes // expansion 69 | self.conv1 = nn.Conv2d(inplanes, bottle_planes, 70 | kernel_size=1, bias=False) 71 | self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 72 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, 73 | stride=stride, padding=dilation, 74 | bias=False, dilation=dilation) 75 | self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 76 | self.conv3 = nn.Conv2d(bottle_planes, planes, 77 | kernel_size=1, bias=False) 78 | self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.stride = stride 81 | 82 | def forward(self, x, residual=None): 83 | if residual is None: 84 | residual = x 85 | 86 | out = self.conv1(x) 87 | out = self.bn1(out) 88 | out = self.relu(out) 89 | 90 | out = self.conv2(out) 91 | out = self.bn2(out) 92 | out = self.relu(out) 93 | 94 | out = self.conv3(out) 95 | out = self.bn3(out) 96 | 97 | out += residual 98 | out = self.relu(out) 99 | 100 | return out 101 | 102 | 103 | class BottleneckX(nn.Module): 104 | expansion = 2 105 | cardinality = 32 106 | 107 | def __init__(self, inplanes, planes, stride=1, dilation=1): 108 | super(BottleneckX, self).__init__() 109 | cardinality = BottleneckX.cardinality 110 | # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) 111 | # bottle_planes = dim * cardinality 112 | bottle_planes = planes * cardinality // 32 113 | self.conv1 = nn.Conv2d(inplanes, bottle_planes, 114 | kernel_size=1, bias=False) 115 | self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 116 | self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, 117 | stride=stride, padding=dilation, bias=False, 118 | dilation=dilation, groups=cardinality) 119 | self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM) 120 | self.conv3 = nn.Conv2d(bottle_planes, planes, 121 | kernel_size=1, bias=False) 122 | self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM) 123 | self.relu = nn.ReLU(inplace=True) 124 | self.stride = stride 125 | 126 | def 
forward(self, x, residual=None): 127 | if residual is None: 128 | residual = x 129 | 130 | out = self.conv1(x) 131 | out = self.bn1(out) 132 | out = self.relu(out) 133 | 134 | out = self.conv2(out) 135 | out = self.bn2(out) 136 | out = self.relu(out) 137 | 138 | out = self.conv3(out) 139 | out = self.bn3(out) 140 | 141 | out += residual 142 | out = self.relu(out) 143 | 144 | return out 145 | 146 | 147 | class Root(nn.Module): 148 | def __init__(self, in_channels, out_channels, kernel_size, residual): 149 | super(Root, self).__init__() 150 | self.conv = nn.Conv2d( 151 | in_channels, out_channels, 1, 152 | stride=1, bias=False, padding=(kernel_size - 1) // 2) 153 | self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) 154 | self.relu = nn.ReLU(inplace=True) 155 | self.residual = residual 156 | 157 | def forward(self, *x): 158 | children = x 159 | x = self.conv(torch.cat(x, 1)) 160 | x = self.bn(x) 161 | if self.residual: 162 | x += children[0] 163 | x = self.relu(x) 164 | 165 | return x 166 | 167 | 168 | class Tree(nn.Module): 169 | def __init__(self, levels, block, in_channels, out_channels, stride=1, 170 | level_root=False, root_dim=0, root_kernel_size=1, 171 | dilation=1, root_residual=False): 172 | super(Tree, self).__init__() 173 | if root_dim == 0: 174 | root_dim = 2 * out_channels 175 | if level_root: 176 | root_dim += in_channels 177 | if levels == 1: 178 | self.tree1 = block(in_channels, out_channels, stride, 179 | dilation=dilation) 180 | self.tree2 = block(out_channels, out_channels, 1, 181 | dilation=dilation) 182 | else: 183 | self.tree1 = Tree(levels - 1, block, in_channels, out_channels, 184 | stride, root_dim=0, 185 | root_kernel_size=root_kernel_size, 186 | dilation=dilation, root_residual=root_residual) 187 | self.tree2 = Tree(levels - 1, block, out_channels, out_channels, 188 | root_dim=root_dim + out_channels, 189 | root_kernel_size=root_kernel_size, 190 | dilation=dilation, root_residual=root_residual) 191 | if levels == 1: 192 | self.root = Root(root_dim, out_channels, root_kernel_size, 193 | root_residual) 194 | self.level_root = level_root 195 | self.root_dim = root_dim 196 | self.downsample = None 197 | self.project = None 198 | self.levels = levels 199 | if stride > 1: 200 | self.downsample = nn.MaxPool2d(stride, stride=stride) 201 | if in_channels != out_channels: 202 | self.project = nn.Sequential( 203 | nn.Conv2d(in_channels, out_channels, 204 | kernel_size=1, stride=1, bias=False), 205 | nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM) 206 | ) 207 | 208 | def forward(self, x, residual=None, children=None): 209 | children = [] if children is None else children 210 | bottom = self.downsample(x) if self.downsample else x 211 | residual = self.project(bottom) if self.project else bottom 212 | if self.level_root: 213 | children.append(bottom) 214 | x1 = self.tree1(x, residual) 215 | if self.levels == 1: 216 | x2 = self.tree2(x1) 217 | x = self.root(x2, x1, *children) 218 | else: 219 | children.append(x1) 220 | x = self.tree2(x1, children=children) 221 | return x 222 | 223 | 224 | class DLA(nn.Module): 225 | def __init__(self, levels, channels, num_classes=1000, 226 | block=BasicBlock, residual_root=False, linear_root=False): 227 | super(DLA, self).__init__() 228 | self.channels = channels 229 | self.num_classes = num_classes 230 | self.base_layer = nn.Sequential( 231 | nn.Conv2d(3, channels[0], kernel_size=7, stride=1, 232 | padding=3, bias=False), 233 | nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM), 234 | nn.ReLU(inplace=True)) 235 | self.level0 = 
self._make_conv_level( 236 | channels[0], channels[0], levels[0]) 237 | self.level1 = self._make_conv_level( 238 | channels[0], channels[1], levels[1], stride=2) 239 | self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, 240 | level_root=False, 241 | root_residual=residual_root) 242 | self.level3 = Tree(levels[3], block, channels[2], channels[3], 2, 243 | level_root=True, root_residual=residual_root) 244 | self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, 245 | level_root=True, root_residual=residual_root) 246 | self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, 247 | level_root=True, root_residual=residual_root) 248 | 249 | # for m in self.modules(): 250 | # if isinstance(m, nn.Conv2d): 251 | # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 252 | # m.weight.data.normal_(0, math.sqrt(2. / n)) 253 | # elif isinstance(m, nn.BatchNorm2d): 254 | # m.weight.data.fill_(1) 255 | # m.bias.data.zero_() 256 | 257 | def _make_level(self, block, inplanes, planes, blocks, stride=1): 258 | downsample = None 259 | if stride != 1 or inplanes != planes: 260 | downsample = nn.Sequential( 261 | nn.MaxPool2d(stride, stride=stride), 262 | nn.Conv2d(inplanes, planes, 263 | kernel_size=1, stride=1, bias=False), 264 | nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), 265 | ) 266 | 267 | layers = [] 268 | layers.append(block(inplanes, planes, stride, downsample=downsample)) 269 | for i in range(1, blocks): 270 | layers.append(block(inplanes, planes)) 271 | 272 | return nn.Sequential(*layers) 273 | 274 | def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): 275 | modules = [] 276 | for i in range(convs): 277 | modules.extend([ 278 | nn.Conv2d(inplanes, planes, kernel_size=3, 279 | stride=stride if i == 0 else 1, 280 | padding=dilation, bias=False, dilation=dilation), 281 | nn.BatchNorm2d(planes, momentum=BN_MOMENTUM), 282 | nn.ReLU(inplace=True)]) 283 | inplanes = planes 284 | return nn.Sequential(*modules) 285 | 286 | def forward(self, x): 287 | y = [] 288 | x = self.base_layer(x) 289 | for i in range(6): 290 | x = getattr(self, 'level{}'.format(i))(x) 291 | y.append(x) 292 | return (y[0],y[1],y[2],y[3],y[4],y[5]) 293 | # return y 294 | 295 | def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): 296 | # fc = self.fc 297 | if name.endswith('.pth'): 298 | model_weights = torch.load(data + name) 299 | else: 300 | model_url = get_model_url(data, name, hash) 301 | model_weights = model_zoo.load_url(model_url) 302 | num_classes = len(model_weights[list(model_weights.keys())[-1]]) 303 | self.fc = nn.Conv2d( 304 | self.channels[-1], num_classes, 305 | kernel_size=1, stride=1, padding=0, bias=True) 306 | self.load_state_dict(model_weights) 307 | # self.fc = fc 308 | 309 | 310 | def dla34(pretrained=True, **kwargs): # DLA-34 311 | model = DLA([1, 1, 1, 2, 2, 1], 312 | [16, 32, 64, 128, 256, 512], 313 | block=BasicBlock, **kwargs) 314 | if pretrained: 315 | model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') 316 | return model 317 | 318 | class Identity(nn.Module): 319 | 320 | def __init__(self): 321 | super(Identity, self).__init__() 322 | 323 | def forward(self, x): 324 | return x 325 | 326 | 327 | def fill_fc_weights(layers): 328 | for m in layers.modules(): 329 | if isinstance(m, nn.Conv2d): 330 | if m.bias is not None: 331 | nn.init.constant_(m.bias, 0) 332 | 333 | 334 | def fill_up_weights(up): 335 | w = up.weight.data 336 | f = math.ceil(w.size(2) / 2) 337 | c = (2 * f - 1 - f % 2) / (2. 
* f) 338 | for i in range(w.size(2)): 339 | for j in range(w.size(3)): 340 | w[0, 0, i, j] = \ 341 | (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) 342 | for c in range(1, w.size(0)): 343 | w[c, 0, :, :] = w[0, 0, :, :] 344 | 345 | 346 | class DeformConv(nn.Module): 347 | def __init__(self, chi, cho): 348 | super(DeformConv, self).__init__() 349 | self.actf = nn.Sequential( 350 | nn.BatchNorm2d(cho, momentum=BN_MOMENTUM), 351 | nn.ReLU(inplace=True) 352 | ) 353 | self.conv = DCN(chi, cho, kernel_size=(3,3), stride=1, padding=1, dilation=1, deformable_groups=1) 354 | 355 | def forward(self, x): 356 | x = self.conv(x) 357 | x = self.actf(x) 358 | return x 359 | 360 | 361 | class IDAUp(nn.Module): 362 | 363 | def __init__(self, o, channels, up_f): 364 | super(IDAUp, self).__init__() 365 | for i in range(1, len(channels)): 366 | c = channels[i] 367 | f = int(up_f[i]) 368 | proj = DeformConv(c, o) 369 | node = DeformConv(o, o) 370 | 371 | up = nn.ConvTranspose2d(o, o, f * 2, stride=f, 372 | padding=f // 2, output_padding=0, 373 | groups=o, bias=False) 374 | fill_up_weights(up) 375 | 376 | setattr(self, 'proj_' + str(i), proj) 377 | setattr(self, 'up_' + str(i), up) 378 | setattr(self, 'node_' + str(i), node) 379 | 380 | 381 | def forward(self, layers, startp, endp): 382 | for i in range(startp + 1, endp): 383 | upsample = getattr(self, 'up_' + str(i - startp)) 384 | project = getattr(self, 'proj_' + str(i - startp)) 385 | # layers[i] = upsample(project(layers[i])) 386 | upsample_layers = upsample(project(layers[i])) 387 | node = getattr(self, 'node_' + str(i - startp)) 388 | # layers[i] = node(layers[i] + layers[i - 1]) 389 | layers[i] = node(upsample_layers + layers[i - 1]) 390 | # node_tmp = node(upsample_layers + layers[i - 1]) 391 | return layers[-1] 392 | 393 | 394 | 395 | class DLAUp(nn.Module): 396 | def __init__(self, startp, channels, scales, in_channels=None): 397 | super(DLAUp, self).__init__() 398 | self.startp = startp 399 | if in_channels is None: 400 | in_channels = channels 401 | self.channels = channels 402 | channels = list(channels) 403 | scales = np.array(scales, dtype=int) 404 | for i in range(len(channels) - 1): 405 | j = -i - 2 406 | setattr(self, 'ida_{}'.format(i), 407 | IDAUp(channels[j], in_channels[j:], 408 | scales[j:] // scales[j])) 409 | scales[j + 1:] = scales[j] 410 | in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] 411 | 412 | def forward(self, layers): 413 | out = [layers[-1]] # start with 32 414 | for i in range(len(layers) - self.startp - 1): 415 | ida = getattr(self, 'ida_{}'.format(i)) 416 | ida_out = ida(layers, len(layers) -i - 2, len(layers)) 417 | # out.insert(0, layers[-1]) 418 | out.insert(0, ida_out) 419 | return out 420 | 421 | 422 | class Interpolate(nn.Module): 423 | def __init__(self, scale, mode): 424 | super(Interpolate, self).__init__() 425 | self.scale = scale 426 | self.mode = mode 427 | 428 | def forward(self, x): 429 | x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False) 430 | return x 431 | 432 | 433 | class DLASeg(nn.Module): 434 | def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel, 435 | last_level, head_conv, out_channel=0): 436 | super(DLASeg, self).__init__() 437 | assert down_ratio in [2, 4, 8, 16] 438 | self.first_level = int(np.log2(down_ratio)) 439 | self.last_level = last_level 440 | self.base = globals()[base_name](pretrained=pretrained) 441 | 442 | channels = self.base.channels 443 | scales = [2 ** i for i in range(len(channels[self.first_level:]))] 
444 | self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales) 445 | if out_channel == 0: 446 | out_channel = channels[self.first_level] 447 | 448 | self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level], 449 | [2 ** i for i in range(self.last_level - self.first_level)]) 450 | self.heads = heads 451 | for head in self.heads: 452 | classes = self.heads[head] 453 | if head_conv > 0: 454 | fc = nn.Sequential( 455 | nn.Conv2d(channels[self.first_level], head_conv, 456 | kernel_size=3, padding=1, bias=True), 457 | nn.ReLU(inplace=True), 458 | nn.Conv2d(head_conv, classes, 459 | kernel_size=final_kernel, stride=1, 460 | padding=final_kernel // 2, bias=True)) 461 | if 'hm' in head: 462 | fc[-1].bias.data.fill_(-2.19) 463 | else: 464 | fill_fc_weights(fc) 465 | else: 466 | fc = nn.Conv2d(channels[self.first_level], classes, 467 | kernel_size=final_kernel, stride=1, 468 | padding=final_kernel // 2, bias=True) 469 | if 'hm' in head: 470 | fc.bias.data.fill_(-2.19) 471 | else: 472 | fill_fc_weights(fc) 473 | self.__setattr__(head, fc) 474 | 475 | def forward(self, x): 476 | x = self.base(x) 477 | 478 | x = list(x) 479 | x = self.dla_up(x) 480 | 481 | y = [] 482 | for i in range(self.last_level - self.first_level): 483 | y.append(x[i].clone()) 484 | ida_out =self.ida_up(y, 0, len(y)) 485 | 486 | # z = {} 487 | # for head in self.heads: 488 | # z[head] = self.__getattr__(head)(y[-1]) 489 | # for head in self.heads: 490 | # z[head] = self.__getattr__(head)(ida_out) 491 | z = list() 492 | z.append(self.__getattr__('hm')(y[-1])) 493 | z.append(self.__getattr__('wh')(y[-1])) 494 | z.append(self.__getattr__('reg')(y[-1])) 495 | ret = (z[0],z[1],z[2]) 496 | return ret 497 | # return [z] 498 | 499 | 500 | def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4): 501 | model = DLASeg('dla{}'.format(num_layers), heads, 502 | pretrained=True, 503 | down_ratio=down_ratio, 504 | final_kernel=1, 505 | last_level=5, 506 | head_conv=head_conv) 507 | return model 508 | 509 | --------------------------------------------------------------------------------
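A minimal usage sketch (editorial addition, not a file from the repository): once `demo.py` has written `centernet.pt`, the traced model can be reloaded and run in a fresh process. The plugin library path in the comment is an assumption; substitute whatever shared object the `dcn_cpp_plugin` CMake build actually produces, and skip that call if the traced graph only relies on the Python `DCNv2` extension.

```python
import torch

# If the traced graph calls the registered custom op
# (my_ops::dcn_v2_cuda_forward_v2), load the plugin library first.
# The path below is hypothetical -- use the .so built from dcn_cpp_plugin.
# torch.ops.load_library('dcn_cpp_plugin/build/libdcn_v2_plugin.so')

model = torch.jit.load('centernet.pt').cuda().eval()

with torch.no_grad():
    img = torch.zeros(1, 3, 512, 512, device='cuda')  # same shape as the trace input in demo.py
    hm, wh, reg = model(img)                           # DLASeg.forward returns (hm, wh, reg)
    print(hm.shape, wh.shape, reg.shape)
```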