├── lib
│   ├── deform_conv.cu.o
│   ├── __init__.py
│   ├── nvcc_compile.sh
│   ├── g++_complie.sh
│   ├── BUILD
│   ├── deform_pooling.h
│   ├── deform_conv_grad.py
│   ├── deform_conv_op.py
│   ├── deform_conv_util.h
│   ├── deform_conv.h
│   ├── deformable_pooling.cu.cc
│   ├── deform_conv.cu.cc
│   ├── deformable_pooling.cc
│   └── deform_conv.cc
├── test_deform_conv.py
├── .gitignore
├── README.md
├── benchmark.py
├── demo.py
└── LICENSE

/lib/deform_conv.cu.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bwuzhang/TF_Deformable_Ops/HEAD/lib/deform_conv.cu.o
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | # from deform_conv_op import deform_conv_op
2 | # from deform_conv_op import deform_conv_grad_op
3 | 
4 | # deform_conv = deform_conv_op
--------------------------------------------------------------------------------
/lib/nvcc_compile.sh:
--------------------------------------------------------------------------------
1 | nvcc -std=c++11 -c -o deform_conv.cu.o deform_conv.cu.cc -I $TF_INC -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -L /usr/local/cuda-8.0/lib64/ --expt-relaxed-constexpr
--------------------------------------------------------------------------------
/lib/g++_complie.sh:
--------------------------------------------------------------------------------
1 | g++ -std=c++11 -shared -o deform_conv.so deform_conv.cc deform_conv.cu.o -I $TF_INC -fPIC -lcudart -L $CUDA_HOME/lib64 -D GOOGLE_CUDA=1 -Wfatal-errors -I $CUDA_HOME/include -D_GLIBCXX_USE_CXX11_ABI=0
2 | 
--------------------------------------------------------------------------------
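The two build scripts above expect $TF_INC (and, for g++_complie.sh, $CUDA_HOME) to be set in the environment. The snippet below is an illustration added here, not a file in the repository; it assumes a TensorFlow 1.x install and simply prints the paths that the scripts need.

# locate_tf_paths.py (hypothetical helper, not part of the repo)
import tensorflow as tf

# Header directory to pass to nvcc and g++ as $TF_INC / -I.
print("TF_INC=" + tf.sysconfig.get_include())
# Library directory; newer TF 1.x builds also need -L on this path
# plus -ltensorflow_framework when linking the shared object.
print("TF_LIB=" + tf.sysconfig.get_lib())

Export the printed values into the shell, then run nvcc_compile.sh followed by g++_complie.sh to produce deform_conv.so.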
/lib/BUILD:
--------------------------------------------------------------------------------
1 | load("tf_custom_op_library")
2 | 
3 | 
4 | tf_custom_op_library(
5 |     name = "deform_conv.so",
6 |     srcs = ["deform_conv.cc", "deform_conv_util.h"],
7 |     gpu_srcs = ["deform_conv.cu.cc", "deform_conv.h"],
8 | 
9 | )
--------------------------------------------------------------------------------
/lib/deform_pooling.h:
--------------------------------------------------------------------------------
1 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
2 | #include "tensorflow/core/framework/register_types.h"
3 | #include "tensorflow/core/framework/tensor_types.h"
4 | #include "tensorflow/core/framework/tensor_shape.h"
5 | #include 
6 | #include 
--------------------------------------------------------------------------------
/test_deform_conv.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import os
3 | # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
4 | # os.environ["CUDA_VISIBLE_DEVICES"] = "0"
5 | import numpy as np
6 | import tensorflow as tf
7 | import lib.deform_conv_op as deform_conv_op
8 | 
9 | # load test data, generated by np.random.random((8, 6, 4, 5))
10 | with open("test.npz", 'rb') as f:
11 |     arr = np.load(f)
12 | 
13 | # arr = np.zeros((8, 6, 4, 5))
14 | with tf.Session() as sess:
15 |     with tf.device('/gpu:0'):
16 |         a = tf.constant(arr, dtype=tf.float32)
17 |         b = tf.constant(np.ones((21, 2, 2, 2), dtype=np.float32))
18 |         c = tf.constant(np.ones((8, 8, 2, 2), dtype=np.float32))
19 |         result = deform_conv_op.deform_conv_op(a, b, c, strides=[1, 1, 2, 2], rates=[1, 1, 1, 1], padding="VALID", num_groups=3)
20 |         sm = sess.run(result)
21 |         d = tf.constant(np.ones((8, 21, 2, 2), dtype=np.float32))
22 |         grad = tf.gradients(result, [a, b, c])
23 |         res = [sess.run(g) for g in grad]
24 | 
25 | print(res[0])
26 | # print(sm)
27 | 
28 | 
--------------------------------------------------------------------------------
/lib/deform_conv_grad.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.framework import ops
3 | import deform_conv_op
4 | 
5 | @ops.RegisterGradient("DeformConvOp")
6 | def _deform_conv_grad(op, grad):
7 |     """The gradients for `deform_conv`.
8 |     Args:
9 |         op: The `deform_conv` `Operation` that we are differentiating, which we can use
10 |             to find the inputs and outputs of the original op.
11 |         grad: Gradient with respect to the output of the `deform_conv` op.
12 |     Returns:
13 |         Gradients with respect to the inputs of `deform_conv`.
14 |     """
15 |     data = op.inputs[0]
16 |     filter = op.inputs[1]
17 |     offset = op.inputs[2]
18 | 
19 |     strides = op.get_attr('strides')
20 |     rates = op.get_attr('rates')
21 |     num_groups = op.get_attr('num_groups')
22 |     padding = op.get_attr('padding')
23 |     data_format = op.get_attr('data_format')
24 | 
25 |     # compute gradient
26 |     data_grad = deform_conv_op.deform_conv_grad_op(data, filter, offset, grad, strides, rates, num_groups, padding, data_format)
27 | 
28 |     return data_grad  # list of gradients w.r.t. data, filter and offset
--------------------------------------------------------------------------------
/lib/deform_conv_op.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import os.path as osp
3 | from tensorflow.python.framework import ops
4 | 
5 | 
6 | filename = osp.join(osp.dirname(__file__), 'deform_conv.so')
7 | _deform_conv_module = tf.load_op_library(filename)
8 | deform_conv_op = _deform_conv_module.deform_conv_op
9 | deform_conv_grad_op = _deform_conv_module.deform_conv_backprop_op
10 | 
11 | 
12 | @ops.RegisterGradient("DeformConvOp")
13 | def _deform_conv_grad(op, grad):
14 |     """The gradients for `deform_conv`.
15 |     Args:
16 |         op: The `deform_conv` `Operation` that we are differentiating, which we can use
17 |             to find the inputs and outputs of the original op.
18 |         grad: Gradient with respect to the output of the `deform_conv` op.
19 |     Returns:
20 |         Gradients with respect to the inputs of `deform_conv`.
21 | """ 22 | data = op.inputs[0] 23 | filter = op.inputs[1] 24 | offset = op.inputs[2] 25 | 26 | strides = op.get_attr('strides') 27 | rates = op.get_attr('rates') 28 | num_groups = op.get_attr('num_groups') 29 | padding = op.get_attr('padding') 30 | data_format = op.get_attr('data_format') 31 | 32 | # compute gradient 33 | data_grad = deform_conv_grad_op(data, filter, offset, grad, strides, rates, num_groups, padding, data_format) 34 | 35 | return data_grad # List of one Tensor, since we have one input -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | .spyproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # mkdocs documentation 97 | /site 98 | 99 | # mypy 100 | .mypy_cache/ 101 | 102 | # apple file 103 | .DS_Store 104 | 105 | # vscode 106 | .vscode -------------------------------------------------------------------------------- /lib/deform_conv_util.h: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/util/tensor_format.h" 2 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 3 | #include "tensorflow/core/framework/tensor.h" 4 | 5 | namespace tensorflow { 6 | typedef std::vector TShape; 7 | 8 | inline int ProdShape(const TensorShape &shape, int start) { 9 | int64 res = 1; 10 | for(int i=start; i ToVector(const TensorShape &shape) { 17 | // int64 res = 1; 18 | std::vector res; 19 | for(int i=0; i num_steps_burn_in: 44 | if not i % 10: 45 | print ('%s: step %d, duration = %.3f' % 46 | (datetime.now(), i - num_steps_burn_in, duration)) 47 | total_duration += duration 48 | total_duration_squared += duration * duration 49 | mn = total_duration / FLAGS.num_batches 50 | vr = total_duration_squared / FLAGS.num_batches - mn * mn 51 | sd = math.sqrt(vr) 52 | print ('%s: %s across %d steps, %.3f +/- %.3f sec / batch' % 53 | (datetime.now(), info_string, FLAGS.num_batches, mn, sd)) 54 | 55 | 56 | 57 | 58 | def run_benchmark(): 59 | global 
parameters 60 | timing_entries = [] 61 | with tf.Graph().as_default(): 62 | # Generate some dummy images. 63 | image_size = 224 64 | kernel_col = 3 65 | kernel_rol = 3 66 | kernel_size = kernel_col*kernel_rol 67 | channel = 64 68 | # Note that our padding definition is slightly different the cuda-convnet. 69 | # In order to force the model to start with the same activations sizes, 70 | # we add 3 to the image_size and employ VALID padding above. 71 | if FLAGS.data_format == 'NCHW': 72 | image_shape = [FLAGS.batch_size, 3, image_size + 3, image_size + 3] 73 | else: 74 | image_shape = [FLAGS.batch_size, image_size + 3, image_size + 3, 3] 75 | offset_shape = [FLAGS.batch_size, 2*kernel_size, image_size + 3, image_size + 3] 76 | kernel_shape = [channel, 3, kernel_col, kernel_rol] 77 | images = tf.Variable(tf.random_normal(image_shape, 78 | dtype=tf.float32, 79 | stddev=1e-1)) 80 | offset = tf.Variable(tf.random_normal(offset_shape, 81 | dtype=tf.float32, 82 | stddev=1e-1)) 83 | kernel = tf.Variable(tf.random_normal(kernel_shape, 84 | dtype=tf.float32, 85 | stddev=1e-1)) 86 | parameters = [kernel] 87 | 88 | last_layer = deform_conv_op.deform_conv_op(images, kernel, offset, strides=[1, 1, 1, 1], rates=[1,1,1,1], padding="SAME", num_groups=1) 89 | 90 | # Build an initialization operation. 91 | init = tf.global_variables_initializer() 92 | 93 | # Start running operations on the Graph. 94 | sess = tf.Session() 95 | sess.run(init) 96 | 97 | run_forward = True 98 | run_forward_backward = True 99 | if FLAGS.forward_only and FLAGS.forward_backward_only: 100 | raise ValueError("Cannot specify --forward_only and " 101 | "--forward_backward_only at the same time.") 102 | if FLAGS.forward_only: 103 | run_forward_backward = False 104 | elif FLAGS.forward_backward_only: 105 | run_forward = False 106 | 107 | if run_forward: 108 | # Run the forward benchmark. 109 | timing_entries.append(time_tensorflow_run(sess, last_layer, "Forward")) 110 | 111 | if run_forward_backward: 112 | # Add a simple objective so we can calculate the backward pass. 113 | # objective = loss(last_layer, labels) 114 | loss = lambda x:tf.reduce_sum(x) 115 | objective = loss(last_layer) 116 | # Compute the gradient with respect to all the parameters. 117 | grad = tf.gradients(objective, parameters) 118 | # Run the backward benchmark. 119 | timing_entries.append(time_tensorflow_run(sess, grad, "Forward-backward")) 120 | 121 | # if FLAGS.csv_file: 122 | # store_data_in_csv(timing_entries) 123 | 124 | 125 | def main(_): 126 | run_benchmark() 127 | 128 | 129 | if __name__ == '__main__': 130 | tf.app.run() -------------------------------------------------------------------------------- /lib/deform_conv.h: -------------------------------------------------------------------------------- 1 | /*! 2 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 3 | * 4 | * COPYRIGHT 5 | * 6 | * All contributions by the University of California: 7 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 8 | * All rights reserved. 9 | * 10 | * All other contributions: 11 | * Copyright (c) 2014-2017, the respective contributors 12 | * All rights reserved. 13 | * 14 | * Caffe uses a shared copyright model: each contributor holds copyright over 15 | * their contributions to Caffe. The project versioning records all such 16 | * contribution and copyright details. 
If a contributor wants to further mark 17 | * their specific copyright on a particular contribution, they should indicate 18 | * their copyright solely in the commit message of the change when it is 19 | * committed. 20 | * 21 | * LICENSE 22 | * 23 | * Redistribution and use in source and binary forms, with or without 24 | * modification, are permitted provided that the following conditions are met: 25 | * 26 | * 1. Redistributions of source code must retain the above copyright notice, this 27 | * list of conditions and the following disclaimer. 28 | * 2. Redistributions in binary form must reproduce the above copyright notice, 29 | * this list of conditions and the following disclaimer in the documentation 30 | * and/or other materials provided with the distribution. 31 | * 32 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 33 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 34 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 35 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 36 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 37 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 39 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 40 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 41 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 42 | * 43 | * CONTRIBUTION AGREEMENT 44 | * 45 | * By contributing to the BVLC/caffe repository through pull-request, comment, 46 | * or otherwise, the contributor releases their content to the 47 | * license and copyright terms herein. 48 | * 49 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 50 | * 51 | * Copyright (c) 2017 by Contributors 52 | * \file deformable_im2col.h 53 | * \brief Function definitions of converting an image to 54 | * column matrix based on kernel, padding, and dilation. 55 | * These functions are mainly used in convolution operators. 56 | * The implementation of the im2col and col2im algorithms 57 | * are copied from Caffe with minor interface modifications 58 | * adapting to MXNet data structures. 59 | */ 60 | 61 | #ifndef TENSORFLOW_KERNELS_CONV_OPS_im2col_H_ 62 | #define TENSORFLOW_KERNELS_CONV_OPS_im2col_H_ 63 | 64 | // #define EIGEN_USE_GPU 65 | 66 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 67 | #include "tensorflow/core/framework/register_types.h" 68 | #include "tensorflow/core/framework/tensor_types.h" 69 | #include "tensorflow/core/framework/tensor_shape.h" 70 | #include 71 | #include 72 | 73 | namespace tensorflow { 74 | // typedef Eigen::ThreadPoolDevice CPUDevice; 75 | typedef std::vector TShape; 76 | // typedef Eigen::GpuDevice GPUDevice; 77 | 78 | namespace functor { 79 | 80 | /*! 81 | * \brief cpu function of im2col algorithm 82 | * \param data_im pointer of a image (C, H, W,...) 
in the image batch 83 | * \param im_shape input image shape in dimensions (N, C, H, W,) 84 | * \param col_shape column buffer shape 85 | * \param kernel_shape kernel filter shape 86 | * \param pad pad shape 87 | * \param stride stride shape 88 | * \param dilation dilation shape 89 | * \param data_col start pointer of the column buffer to be filled 90 | */ 91 | template 92 | struct deformable_im2col { 93 | void operator()(const Device& d, 94 | const DType* data_im, const DType* data_offset, 95 | const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, 96 | const TShape& pad, const TShape& stride, const TShape& dilation, 97 | const int deformable_group, DType* data_col); 98 | }; 99 | 100 | /*!\brief 101 | * cpu function of col2im algorithm 102 | * \param s device stream 103 | * \param data_col start pointer of the column buffer to be filled 104 | * \param data_im start pointer of the image data 105 | * \param data_offset start pointer of the offset data 106 | * \param im_shape input image shape in dimensions (N, C, H, W,) 107 | * \param col_shape column buffer shape 108 | * \param kernel_shape kernel filter shape 109 | * \param pad pad shape 110 | * \param stride stride shape 111 | * \param dilation dilation shape 112 | * \param grad_im pointer of a image (C, H, W,...) in the image batch 113 | */ 114 | template 115 | struct deformable_col2im { 116 | void operator()(const Device& d, 117 | const DType* data_col, const DType* data_offset, 118 | const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, 119 | const TShape& pad, const TShape& stride, 120 | const TShape& dilation, const int deformable_group, 121 | DType* grad_im); 122 | }; 123 | 124 | 125 | 126 | template 127 | struct deformable_col2im_coord { 128 | void operator()(const Device& d, 129 | const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, 130 | const TShape& col_shape, const TShape& kernel_shape, 131 | const TShape& pad, const TShape& stride, 132 | const TShape& dilation, const int deformable_group, DType* grad_offset); 133 | }; 134 | 135 | template 136 | struct im2col { 137 | void operator() (const Device& d, 138 | const DType* data_im, const TShape& im_shape, 139 | const TShape& col_shape, const TShape& kernel_shape, 140 | const TShape& pad, const TShape& stride, 141 | const TShape& dilation, DType* data_col); 142 | }; 143 | 144 | template 145 | struct pureAddTo { 146 | void operator() (const Device& d, const int n, DType* result_data, const DType* right_data); 147 | }; 148 | 149 | template 150 | struct pureSubTo { 151 | void operator() (const Device& d, const int n, DType* result_data, const DType* right_data); 152 | }; 153 | 154 | template 155 | struct setZero { 156 | void operator() (const Device& d, const int n, DType* result_data); 157 | }; 158 | 159 | 160 | } // namespace functor 161 | } // namespace tensorflow 162 | #endif // TENSORFLOW_KERNELS_CONV_OPS_im2col_H_ 163 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | 5 | from __future__ import print_function 6 | import os 7 | # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" # see issue #152 8 | # os.environ["CUDA_VISIBLE_DEVICES"]="1" 9 | from six.moves import xrange 10 | import os 11 | import tensorflow as tf 12 | import numpy as np 13 | import tensorflow.contrib.layers as ly 14 | from tensorflow.examples.tutorials.mnist import input_data 15 
| from functools import partial 16 | from lib.deform_conv_op import deform_conv_op 17 | 18 | 19 | def lrelu(x, leak=0.3, name="lrelu"): 20 | with tf.variable_scope(name): 21 | f1 = 0.5 * (1 + leak) 22 | f2 = 0.5 * (1 - leak) 23 | return f1 * x + f2 * abs(x) 24 | 25 | 26 | def deform_conv_2d(img, num_outputs, kernel_size=3, stride=2, 27 | normalizer_fn=ly.batch_norm, activation_fn=lrelu, name=''): 28 | img_shape = img.shape.as_list() 29 | assert(len(img_shape) == 4) 30 | N, C, H, W = img_shape 31 | with tf.variable_scope('deform_conv' + '_' + name): 32 | offset = ly.conv2d(img, num_outputs=2 * kernel_size**2, kernel_size=3, 33 | stride=2, activation_fn=None, data_format='NCHW') 34 | 35 | kernel = tf.get_variable(name='d_kernel', shape=(num_outputs, C, kernel_size, kernel_size), 36 | initializer=tf.random_normal_initializer(0, 0.02)) 37 | res = deform_conv_op(img, filter=kernel, offset=offset, rates=[1, 1, 1, 1], padding='SAME', 38 | strides=[1, 1, stride, stride], num_groups=1) 39 | if normalizer_fn is not None: 40 | res = normalizer_fn(res) 41 | if activation_fn is not None: 42 | res = activation_fn(res) 43 | 44 | return res 45 | 46 | 47 | batch_size = 64 48 | z_dim = 128 49 | learning_rate_ger = 5e-5 50 | learning_rate_dis = 5e-5 51 | device = '/gpu:0' 52 | 53 | # update Citers times of critic in one iter(unless i < 25 or i % 500 == 0, 54 | # i is iterstep) 55 | Citers = 5 56 | # the upper bound and lower bound of parameters in critic 57 | clamp_lower = -0.01 58 | clamp_upper = 0.01 59 | # whether to use adam for parameter update, if the flag is set False, use tf.train.RMSPropOptimizer 60 | # as recommended in paper 61 | is_adam = False 62 | # whether to use SVHN or MNIST, set false and MNIST is used 63 | dataset_type = "mnist" 64 | # img size 65 | s = 32 66 | channel = 1 67 | # 'gp' for gp WGAN and 'regular' for vanilla 68 | mode = 'regular' 69 | # if 'gp' is chosen the corresponding lambda must be filled 70 | lam = 10. 
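# Note added for clarity (not in the original source): `mode` selects how the
# critic is regularized in build_graph() below. 'regular' clips every critic
# variable into [clamp_lower, clamp_upper] after each critic step (the
# weight-clipping WGAN), while 'gp' instead adds
# lam * E[(||grad f(x_interp)|| - 1)^2] to the critic loss (WGAN-GP);
# `lam` above is the weight of that gradient penalty.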
71 | s2, s4, s8, s16 = int(s / 2), int(s / 4), int(s / 8), int(s / 16) 72 | # hidden layer size if mlp is chosen, ignore if otherwise 73 | ngf = 64 74 | ndf = 64 75 | # directory to store log, including loss and grad_norm of generator and critic 76 | log_dir = './log_wgan' 77 | ckpt_dir = './ckpt_wgan' 78 | if not os.path.exists(ckpt_dir): 79 | os.makedirs(ckpt_dir) 80 | # max iter step, note the one step indicates that a Citers updates of 81 | # critic and one update of generator 82 | max_iter_step = 20000 83 | 84 | # In[5]: 85 | 86 | 87 | def generator_conv(z): 88 | train = ly.fully_connected( 89 | z, 4 * 4 * 512, activation_fn=lrelu, normalizer_fn=ly.batch_norm) 90 | train = tf.reshape(train, (-1, 4, 4, 512)) 91 | train = ly.conv2d_transpose(train, 256, 3, stride=2, 92 | activation_fn=tf.nn.relu, normalizer_fn=ly.batch_norm, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.02)) 93 | train = ly.conv2d_transpose(train, 128, 3, stride=2, 94 | activation_fn=tf.nn.relu, normalizer_fn=ly.batch_norm, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.02)) 95 | train = ly.conv2d_transpose(train, 64, 3, stride=2, 96 | activation_fn=tf.nn.relu, normalizer_fn=ly.batch_norm, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.02)) 97 | train = ly.conv2d_transpose(train, 1, 3, stride=1, 98 | activation_fn=tf.nn.tanh, padding='SAME', weights_initializer=tf.random_normal_initializer(0, 0.02)) 99 | print(train.name) 100 | return train 101 | 102 | 103 | # In[7]: 104 | 105 | def critic_conv(img, reuse=False): 106 | with tf.variable_scope('critic') as scope: 107 | if reuse: 108 | scope.reuse_variables() 109 | size = 64 110 | img = tf.transpose(img, [0, 3, 1, 2]) 111 | img = deform_conv_2d(img, num_outputs=size, kernel_size=3, 112 | stride=2, activation_fn=lrelu, name='conv3') 113 | img = deform_conv_2d(img, num_outputs=size * 2, kernel_size=3, 114 | stride=2, activation_fn=lrelu, normalizer_fn=ly.batch_norm, name='conv4') 115 | img = ly.conv2d(img, num_outputs=size * 4, kernel_size=3, 116 | stride=2, activation_fn=lrelu, normalizer_fn=ly.batch_norm, data_format="NCHW") 117 | 118 | img = ly.conv2d(img, num_outputs=size * 8, kernel_size=3, 119 | stride=2, activation_fn=lrelu, normalizer_fn=ly.batch_norm, data_format="NCHW") 120 | logit = ly.fully_connected(tf.reshape( 121 | img, [batch_size, -1]), 1, activation_fn=None) 122 | return logit 123 | 124 | 125 | # In[9]: 126 | 127 | def build_graph(): 128 | # z = tf.placeholder(tf.float32, shape=(batch_size, z_dim)) 129 | noise_dist = tf.contrib.distributions.Normal(0., 1.) 130 | z = noise_dist.sample((batch_size, z_dim)) 131 | generator = generator_mlp if is_mlp else generator_conv 132 | critic = critic_mlp if is_mlp else critic_conv 133 | with tf.variable_scope('generator'): 134 | train = generator(z) 135 | real_data = tf.placeholder( 136 | dtype=tf.float32, shape=(batch_size, s, s, channel)) 137 | print(real_data.shape) 138 | print(train.shape) 139 | true_logit = critic(real_data) 140 | fake_logit = critic(train, reuse=True) 141 | c_loss = tf.reduce_mean(fake_logit - true_logit) 142 | if mode is 'gp': 143 | alpha_dist = tf.contrib.distributions.Uniform(low=0., high=1.) 
144 | alpha = alpha_dist.sample((batch_size, 1, 1, 1)) 145 | interpolated = real_data + alpha * (train - real_data) 146 | inte_logit = critic(interpolated, reuse=True) 147 | gradients = tf.gradients(inte_logit, [interpolated, ])[0] 148 | grad_l2 = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=[1, 2, 3])) 149 | gradient_penalty = tf.reduce_mean((grad_l2 - 1)**2) 150 | gp_loss_sum = tf.summary.scalar("gp_loss", gradient_penalty) 151 | grad = tf.summary.scalar("grad_norm", tf.nn.l2_loss(gradients)) 152 | c_loss += lam * gradient_penalty 153 | g_loss = tf.reduce_mean(-fake_logit) 154 | g_loss_sum = tf.summary.scalar("g_loss", g_loss) 155 | c_loss_sum = tf.summary.scalar("c_loss", c_loss) 156 | img_sum = tf.summary.image("img", train, max_outputs=10) 157 | theta_g = tf.get_collection( 158 | tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator') 159 | theta_c = tf.get_collection( 160 | tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic') 161 | counter_g = tf.Variable(trainable=False, initial_value=0, dtype=tf.int32) 162 | opt_g = ly.optimize_loss(loss=g_loss, learning_rate=learning_rate_ger, 163 | optimizer=partial( 164 | tf.train.AdamOptimizer, beta1=0.5, beta2=0.9) if is_adam is True else tf.train.RMSPropOptimizer, 165 | variables=theta_g, global_step=counter_g, 166 | summaries=['gradient_norm'], clip_gradients=100.) 167 | counter_c = tf.Variable(trainable=False, initial_value=0, dtype=tf.int32) 168 | opt_c = ly.optimize_loss(loss=c_loss, learning_rate=learning_rate_dis, 169 | optimizer=partial( 170 | tf.train.AdamOptimizer, beta1=0.5, beta2=0.9) if is_adam is True else tf.train.RMSPropOptimizer, 171 | variables=theta_c, global_step=counter_c, 172 | summaries=['gradient_norm'], clip_gradients=100.) 173 | if mode is 'regular': 174 | clipped_var_c = [tf.assign(var, tf.clip_by_value( 175 | var, clamp_lower, clamp_upper)) for var in theta_c] 176 | # merge the clip operations on critic variables 177 | with tf.control_dependencies([opt_c]): 178 | opt_c = tf.tuple(clipped_var_c) 179 | if not mode in ['gp', 'regular']: 180 | raise(NotImplementedError('Only two modes')) 181 | return opt_g, opt_c, real_data 182 | 183 | 184 | # In[ ]: 185 | 186 | def main(): 187 | dataset = input_data.read_data_sets('MNIST_data', one_hot=True) 188 | with tf.device(device): 189 | opt_g, opt_c, real_data = build_graph() 190 | merged_all = tf.summary.merge_all() 191 | saver = tf.train.Saver() 192 | config = tf.ConfigProto(allow_soft_placement=True, 193 | log_device_placement=True) 194 | config.gpu_options.allow_growth = True 195 | config.gpu_options.per_process_gpu_memory_fraction = 0.8 196 | 197 | def next_feed_dict(): 198 | train_img = dataset.train.next_batch(batch_size)[0] 199 | train_img = 2 * train_img - 1 200 | train_img = np.reshape(train_img, (-1, 28, 28)) 201 | npad = ((0, 0), (2, 2), (2, 2)) 202 | train_img = np.pad(train_img, pad_width=npad, 203 | mode='constant', constant_values=-1) 204 | train_img = np.expand_dims(train_img, -1) 205 | feed_dict = {real_data: train_img} 206 | return feed_dict 207 | with tf.Session(config=config) as sess: 208 | sess.run(tf.global_variables_initializer()) 209 | summary_writer = tf.summary.FileWriter(log_dir, sess.graph) 210 | for i in range(max_iter_step): 211 | if i < 25 or i % 500 == 0: 212 | citers = 100 213 | else: 214 | citers = Citers 215 | for j in range(citers): 216 | feed_dict = next_feed_dict() 217 | if i % 100 == 99 and j == 0: 218 | run_options = tf.RunOptions( 219 | trace_level=tf.RunOptions.FULL_TRACE) 220 | run_metadata = tf.RunMetadata() 221 | _, merged = 
sess.run([opt_c, merged_all], feed_dict=feed_dict, 222 | options=run_options, run_metadata=run_metadata) 223 | summary_writer.add_summary(merged, i) 224 | summary_writer.add_run_metadata( 225 | run_metadata, 'critic_metadata {}'.format(i), i) 226 | else: 227 | sess.run(opt_c, feed_dict=feed_dict) 228 | feed_dict = next_feed_dict() 229 | if i % 100 == 99: 230 | _, merged = sess.run([opt_g, merged_all], feed_dict=feed_dict, 231 | options=run_options, run_metadata=run_metadata) 232 | summary_writer.add_summary(merged, i) 233 | summary_writer.add_run_metadata( 234 | run_metadata, 'generator_metadata {}'.format(i), i) 235 | else: 236 | sess.run(opt_g, feed_dict=feed_dict) 237 | if i % 1000 == 999: 238 | saver.save(sess, os.path.join( 239 | ckpt_dir, "model.ckpt"), global_step=i) 240 | 241 | 242 | main() 243 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /lib/deformable_pooling.cu.cc: -------------------------------------------------------------------------------- 1 | 2 | #define EIGEN_USE_GPU 3 | 4 | #include "deform_conv.h" 5 | #include "cuda.h" 6 | #include "tensorflow/core/util/cuda_kernel_helper.h" 7 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 8 | #include "tensorflow/core/framework/register_types.h" 9 | #include "tensorflow/core/framework/tensor_types.h" 10 | #include "tensorflow/core/framework/tensor.h" 11 | #include "tensorflow/core/platform/logging.h" 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | 18 | namespace tensorflow { 19 | 20 | typedef Eigen::GpuDevice GPUDevice; 21 | typedef std::vector TShape; 22 | 23 | template 24 | __device__ DType bilinear_interp( 25 | const DType* data, 26 | const DType x, 27 | const DType y, 28 | const int width, 29 | const int height) { 30 | int x1 = floor(x); 31 | int x2 = ceil(x); 32 | int y1 = floor(y); 33 | int y2 = ceil(y); 34 | DType dist_x = static_cast(x - x1); 35 | DType dist_y = static_cast(y - y1); 36 | DType value11 = data[y1*width + x1]; 37 | DType value12 = data[y2*width + x1]; 38 | DType value21 = data[y1*width + x2]; 39 | DType value22 = data[y2*width + x2]; 40 | DType value = (1 - dist_x)*(1 - dist_y)*value11 + (1 - dist_x)*dist_y*value12 41 | + dist_x*(1 - dist_y)*value21 + dist_x*dist_y*value22; 42 | return value; 43 | } 44 | 45 | template 46 | __global__ void DeformablePSROIPoolForwardKernel( 47 | const int count, 48 | const DType* bottom_data, 49 | const DType spatial_scale, 50 | const int channels, 51 | const int height, const int width, 52 | const int pooled_height, const int pooled_width, 53 | const DType* bottom_rois, const DType* bottom_trans, 54 | const bool no_trans, 55 | const DType trans_std, 56 | const int sample_per_part, 57 | const int output_dim, 58 | const int group_size, 59 | const int part_size, 60 | const int num_classes, 61 | const int channels_each_class, 62 | DType* top_data, 63 | DType* top_count) { 64 | CUDA_KERNEL_LOOP(index, count) { 65 | // The output is in order (n, ctop, ph, pw) 66 | int pw = index % pooled_width; 67 | int ph = (index / pooled_width) % pooled_height; 68 | int ctop = (index / pooled_width / pooled_height) % output_dim; 69 | int n = index / pooled_width / pooled_height / output_dim; 70 | 71 | // [start, end) interval for spatial sampling 72 | const DType* offset_bottom_rois = bottom_rois + n * 5; 73 | int roi_batch_ind = offset_bottom_rois[0]; 74 | DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 75 | DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 76 | DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 77 | DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; 78 | 79 | // Force too small ROIs to be 1x1 80 | DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 81 | DType roi_height = max(roi_end_h - roi_start_h, 0.1); 82 | 83 | // Compute w and h at bottom 84 | DType bin_size_h = roi_height / static_cast(pooled_height); 85 | DType bin_size_w = roi_width / static_cast(pooled_width); 86 | 87 | DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 88 | DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 89 | 90 | int part_h = floor(static_cast(ph) / pooled_height*part_size); 91 | int part_w = floor(static_cast(pw) / pooled_width*part_size); 92 | int class_id = ctop / channels_each_class; 93 | DType trans_x = no_trans ? static_cast(0) : 94 | bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; 95 | DType trans_y = no_trans ? static_cast(0) : 96 | bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; 97 | 98 | DType wstart = static_cast(pw)* bin_size_w 99 | + roi_start_w; 100 | wstart += trans_x * roi_width; 101 | DType hstart = static_cast(ph) * bin_size_h 102 | + roi_start_h; 103 | hstart += trans_y * roi_height; 104 | 105 | DType sum = 0; 106 | int count = 0; 107 | int gw = floor(static_cast(pw) * group_size / pooled_width); 108 | int gh = floor(static_cast(ph)* group_size / pooled_height); 109 | gw = min(max(gw, 0), group_size - 1); 110 | gh = min(max(gh, 0), group_size - 1); 111 | 112 | const DType* offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; 113 | for (int ih = 0; ih < sample_per_part; ih++) { 114 | for (int iw = 0; iw < sample_per_part; iw++) { 115 | DType w = wstart + iw*sub_bin_size_w; 116 | DType h = hstart + ih*sub_bin_size_h; 117 | // bilinear interpolation 118 | if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { 119 | continue; 120 | } 121 | w = min(max(w, 0.), width - 1.); 122 | h = min(max(h, 0.), height - 1.); 123 | int c = (ctop*group_size + gh)*group_size + gw; 124 | DType val = bilinear_interp(offset_bottom_data + c*height*width, w, h, width, height); 125 | sum += val; 126 | count++; 127 | } 128 | } 129 | top_data[index] = count == 0 ? static_cast(0) : sum / count; 130 | top_count[index] = count; 131 | } 132 | } 133 | 134 | template 135 | inline void DeformablePSROIPoolForward(const Tensor &out, 136 | const Tensor &data, 137 | const Tensor &bbox, 138 | const Tensor &trans, 139 | const Tensor &top_count, 140 | const bool no_trans, 141 | const float spatial_scale, 142 | const int output_dim, 143 | const int group_size, 144 | const int pooled_size, 145 | const int part_size, 146 | const int sample_per_part, 147 | const float trans_std) { 148 | // LOG(INFO) << "DeformablePSROIPoolForward"; 149 | const DType *bottom_data = data.dptr_; 150 | const DType *bottom_rois = bbox.dptr_; 151 | const DType *bottom_trans = no_trans ? NULL : trans.dptr_; 152 | DType *top_data = out.dptr_; 153 | DType *top_count_data = top_count.dptr_; 154 | const int count = out.shape_.Size(); 155 | const int channels = data.size(1); 156 | const int height = data.size(2); 157 | const int width = data.size(3); 158 | const int pooled_height = pooled_size; 159 | const int pooled_width = pooled_size; 160 | const int num_classes = no_trans ? 1 : trans.size(1) / 2; 161 | const int channels_each_class = no_trans ? 
output_dim : output_dim / num_classes; 162 | 163 | cudaStream_t stream = Stream::GetStream(out.stream_); 164 | DeformablePSROIPoolForwardKernel << > >( 166 | count, bottom_data, spatial_scale, channels, height, width, pooled_height, pooled_width, 167 | bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, output_dim, 168 | group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); 169 | DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); 170 | } 171 | 172 | template 173 | __global__ void DeformablePSROIPoolBackwardAccKernel( 174 | const int count, 175 | const DType* top_diff, 176 | const DType* top_count, 177 | const int num_rois, 178 | const DType spatial_scale, 179 | const int channels, 180 | const int height, const int width, 181 | const int pooled_height, const int pooled_width, 182 | const int output_dim, 183 | DType* bottom_data_diff, DType* bottom_trans_diff, 184 | const DType* bottom_data, 185 | const DType* bottom_rois, 186 | const DType* bottom_trans, 187 | const bool no_trans, 188 | const DType trans_std, 189 | const int sample_per_part, 190 | const int group_size, 191 | const int part_size, 192 | const int num_classes, 193 | const int channels_each_class) { 194 | CUDA_KERNEL_LOOP(index, count) { 195 | // The output is in order (n, ctop, ph, pw) 196 | int pw = index % pooled_width; 197 | int ph = (index / pooled_width) % pooled_height; 198 | int ctop = (index / pooled_width / pooled_height) % output_dim; 199 | int n = index / pooled_width / pooled_height / output_dim; 200 | 201 | // [start, end) interval for spatial sampling 202 | const DType* offset_bottom_rois = bottom_rois + n * 5; 203 | int roi_batch_ind = offset_bottom_rois[0]; 204 | DType roi_start_w = static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; 205 | DType roi_start_h = static_cast(round(offset_bottom_rois[2])) * spatial_scale - 0.5; 206 | DType roi_end_w = static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; 207 | DType roi_end_h = static_cast(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; 208 | 209 | // Force too small ROIs to be 1x1 210 | DType roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 211 | DType roi_height = max(roi_end_h - roi_start_h, 0.1); 212 | 213 | // Compute w and h at bottom 214 | DType bin_size_h = roi_height / static_cast(pooled_height); 215 | DType bin_size_w = roi_width / static_cast(pooled_width); 216 | 217 | DType sub_bin_size_h = bin_size_h / static_cast(sample_per_part); 218 | DType sub_bin_size_w = bin_size_w / static_cast(sample_per_part); 219 | 220 | int part_h = floor(static_cast(ph) / pooled_height*part_size); 221 | int part_w = floor(static_cast(pw) / pooled_width*part_size); 222 | int class_id = ctop / channels_each_class; 223 | DType trans_x = no_trans ? static_cast(0) : 224 | bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w] * trans_std; 225 | DType trans_y = no_trans ? 
static_cast(0) : 226 | bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h)*part_size + part_w] * trans_std; 227 | 228 | DType wstart = static_cast(pw)* bin_size_w 229 | + roi_start_w; 230 | wstart += trans_x * roi_width; 231 | DType hstart = static_cast(ph) * bin_size_h 232 | + roi_start_h; 233 | hstart += trans_y * roi_height; 234 | 235 | if (top_count[index] <= 0) { 236 | continue; 237 | } 238 | DType diff_val = top_diff[index] / top_count[index]; 239 | const DType* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; 240 | DType* offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; 241 | int gw = floor(static_cast(pw)* group_size / pooled_width); 242 | int gh = floor(static_cast(ph)* group_size / pooled_height); 243 | gw = min(max(gw, 0), group_size - 1); 244 | gh = min(max(gh, 0), group_size - 1); 245 | 246 | for (int ih = 0; ih < sample_per_part; ih++) { 247 | for (int iw = 0; iw < sample_per_part; iw++) { 248 | DType w = wstart + iw*sub_bin_size_w; 249 | DType h = hstart + ih*sub_bin_size_h; 250 | // bilinear interpolation 251 | if (w<-0.5 || w>width - 0.5 || h<-0.5 || h>height - 0.5) { 252 | continue; 253 | } 254 | w = min(max(w, 0.), width - 1.); 255 | h = min(max(h, 0.), height - 1.); 256 | int c = (ctop*group_size + gh)*group_size + gw; 257 | // backward on feature 258 | int x0 = floor(w); 259 | int x1 = ceil(w); 260 | int y0 = floor(h); 261 | int y1 = ceil(h); 262 | DType dist_x = w - x0, dist_y = h - y0; 263 | DType q00 = (1 - dist_x)*(1 - dist_y); 264 | DType q01 = (1 - dist_x)*dist_y; 265 | DType q10 = dist_x*(1 - dist_y); 266 | DType q11 = dist_x*dist_y; 267 | int bottom_index_base = c * height *width; 268 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x0, q00*diff_val); 269 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x0, q01*diff_val); 270 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y0*width + x1, q10*diff_val); 271 | atomicAdd(offset_bottom_data_diff + bottom_index_base + y1*width + x1, q11*diff_val); 272 | 273 | if (no_trans) { 274 | continue; 275 | } 276 | DType U00 = offset_bottom_data[bottom_index_base + y0*width + x0]; 277 | DType U01 = offset_bottom_data[bottom_index_base + y1*width + x0]; 278 | DType U10 = offset_bottom_data[bottom_index_base + y0*width + x1]; 279 | DType U11 = offset_bottom_data[bottom_index_base + y1*width + x1]; 280 | DType diff_x = (U11*dist_y + U10*(1 - dist_y) - U01*dist_y - U00*(1 - dist_y)) 281 | *trans_std*diff_val; 282 | diff_x *= roi_width; 283 | DType diff_y = (U11*dist_x + U01*(1 - dist_x) - U10*dist_x - U00*(1 - dist_x)) 284 | *trans_std*diff_val; 285 | diff_y *= roi_height; 286 | 287 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h)*part_size + part_w, diff_x); 288 | atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1)*part_size + part_h)*part_size + part_w, diff_y); 289 | } 290 | } 291 | } 292 | } 293 | 294 | template 295 | inline void DeformablePSROIPoolBackwardAcc(const Tensor &in_grad, 296 | const Tensor &trans_grad, 297 | const Tensor &out_grad, 298 | const Tensor &data, 299 | const Tensor &bbox, 300 | const Tensor &trans, 301 | const Tensor &top_count, 302 | const bool no_trans, 303 | const float spatial_scale, 304 | const int output_dim, 305 | const int group_size, 306 | const int pooled_size, 307 | const int part_size, 308 | const int sample_per_part, 309 | const float trans_std) { 310 | // LOG(INFO) << 
"DeformablePSROIPoolBackward"; 311 | const DType *top_diff = out_grad.dptr_; 312 | const DType *bottom_data = data.dptr_; 313 | const DType *bottom_rois = bbox.dptr_; 314 | const DType *bottom_trans = no_trans ? NULL : trans.dptr_; 315 | DType *bottom_data_diff = in_grad.dptr_; 316 | DType *bottom_trans_diff = no_trans ? NULL : trans_grad.dptr_; 317 | const DType *top_count_data = top_count.dptr_; 318 | const int count = out_grad.shape_.Size(); 319 | const int num_rois = bbox.size(0); 320 | const int channels = in_grad.size(1); 321 | const int height = in_grad.size(2); 322 | const int width = in_grad.size(3); 323 | const int pooled_height = pooled_size; 324 | const int pooled_width = pooled_size; 325 | const int num_classes = no_trans ? 1 : trans_grad.size(1) / 2; 326 | const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; 327 | 328 | cudaStream_t stream = Stream::GetStream(in_grad.stream_); 329 | DeformablePSROIPoolBackwardAccKernel << > >( 331 | count, top_diff, top_count_data, num_rois, spatial_scale, channels, height, width, 332 | pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, 333 | bottom_data, bottom_rois, bottom_trans, no_trans, trans_std, sample_per_part, 334 | group_size, part_size, num_classes, channels_each_class); 335 | DeformablePSROIPOOLING_CUDA_CHECK(cudaPeekAtLastError()); 336 | } 337 | 338 | } 339 | -------------------------------------------------------------------------------- /lib/deform_conv.cu.cc: -------------------------------------------------------------------------------- 1 | /*! 2 | ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** 3 | * 4 | * COPYRIGHT 5 | * 6 | * All contributions by the University of California: 7 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents) 8 | * All rights reserved. 9 | * 10 | * All other contributions: 11 | * Copyright (c) 2014-2017, the respective contributors 12 | * All rights reserved. 13 | * 14 | * Caffe uses a shared copyright model: each contributor holds copyright over 15 | * their contributions to Caffe. The project versioning records all such 16 | * contribution and copyright details. If a contributor wants to further mark 17 | * their specific copyright on a particular contribution, they should indicate 18 | * their copyright solely in the commit message of the change when it is 19 | * committed. 20 | * 21 | * LICENSE 22 | * 23 | * Redistribution and use in source and binary forms, with or without 24 | * modification, are permitted provided that the following conditions are met: 25 | * 26 | * 1. Redistributions of source code must retain the above copyright notice, this 27 | * list of conditions and the following disclaimer. 28 | * 2. Redistributions in binary form must reproduce the above copyright notice, 29 | * this list of conditions and the following disclaimer in the documentation 30 | * and/or other materials provided with the distribution. 31 | * 32 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 33 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 34 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 35 | * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 36 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 37 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 39 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 40 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 41 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 42 | * 43 | * CONTRIBUTION AGREEMENT 44 | * 45 | * By contributing to the BVLC/caffe repository through pull-request, comment, 46 | * or otherwise, the contributor releases their content to the 47 | * license and copyright terms herein. 48 | * 49 | ***************** END Caffe Copyright Notice and Disclaimer ******************** 50 | * 51 | * Copyright (c) 2017 by Contributors 52 | * \file deformable_im2col.cuh 53 | * \brief Function definitions of converting an image to 54 | * column matrix based on kernel, padding, and dilation. 55 | * These functions are mainly used in convolution operators. 56 | * The implementation of the im2col and col2im algorithms 57 | * are copied from Caffe with minor interface modifications 58 | * adapting to MXNet data structures. 59 | */ 60 | 61 | #ifndef TENSORFLOW_KERNELS_CONV_OPS_im2col_gpu_H_ 62 | #define TENSORFLOW_KERNELS_CONV_OPS_im2col_gpu_H_ 63 | 64 | // #if GOOGLE_CUDA 65 | 66 | #define EIGEN_USE_GPU 67 | 68 | #include "deform_conv.h" 69 | #include "cuda.h" 70 | #include "tensorflow/core/util/cuda_kernel_helper.h" 71 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 72 | #include "tensorflow/core/framework/register_types.h" 73 | #include "tensorflow/core/framework/tensor_types.h" 74 | #include "tensorflow/core/framework/tensor.h" 75 | #include "tensorflow/core/platform/logging.h" 76 | 77 | #include 78 | #include 79 | #include 80 | 81 | 82 | namespace tensorflow { 83 | 84 | typedef Eigen::GpuDevice GPUDevice; 85 | typedef std::vector TShape; 86 | 87 | 88 | 89 | // fetch value from bottom_data(1D array), using subscript (h, w) 90 | template 91 | __device__ DType deformable_im2col_bilinear(const DType* bottom_data, const int data_width, 92 | const int height, const int width, DType h, DType w) { 93 | 94 | int h_low = floor(h); 95 | int w_low = floor(w); 96 | int h_high; 97 | int w_high; 98 | if (h_low >= height - 1) { 99 | h_high = h_low = height - 1; 100 | h = (DType)h_low; 101 | } 102 | else { 103 | h_high = h_low + 1; 104 | } 105 | 106 | if (w_low >= width - 1) { 107 | w_high = w_low = width - 1; 108 | w = (DType)w_low; 109 | } 110 | else { 111 | w_high = w_low + 1; 112 | } 113 | 114 | DType lh = h - h_low; 115 | DType lw = w - w_low; 116 | DType hh = 1 - lh, hw = 1 - lw; 117 | 118 | DType v1 = bottom_data[h_low * data_width + w_low]; 119 | DType v2 = bottom_data[h_low * data_width + w_high]; 120 | DType v3 = bottom_data[h_high * data_width + w_low]; 121 | DType v4 = bottom_data[h_high * data_width + w_high]; 122 | DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; 123 | 124 | DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 125 | return val; 126 | } 127 | 128 | 129 | template 130 | __device__ DType get_gradient_weight(DType argmax_h, DType argmax_w, 131 | const int h, const int w, const int height, const int width) { 132 | 133 | if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) { 134 | //empty 135 | return 0; 136 | } 137 | 138 | argmax_h = max(argmax_h, 
(DType)0.0f); 139 | argmax_w = max(argmax_w, (DType)0.0f); 140 | 141 | int argmax_h_low = (int)argmax_h; 142 | int argmax_w_low = (int)argmax_w; 143 | int argmax_h_high; 144 | int argmax_w_high; 145 | if (argmax_h_low >= height - 1) { 146 | argmax_h_high = argmax_h_low = height - 1; 147 | argmax_h = (DType)argmax_h_low; 148 | } else { 149 | argmax_h_high = argmax_h_low + 1; 150 | } 151 | if (argmax_w_low >= width - 1) 152 | { 153 | argmax_w_high = argmax_w_low = width - 1; 154 | argmax_w = (DType)argmax_w_low; 155 | } else { 156 | argmax_w_high = argmax_w_low + 1; 157 | } 158 | DType weight = 0; 159 | if (h == argmax_h_low) { 160 | if (w == argmax_w_low) { 161 | weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); 162 | } else if (w == argmax_w_high) { 163 | weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); 164 | } 165 | } else if (h == argmax_h_high) { 166 | if (w == argmax_w_low) { 167 | weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); 168 | } else if (w == argmax_w_high) { 169 | weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); 170 | } 171 | } 172 | return weight; 173 | } 174 | 175 | 176 | template 177 | __device__ DType get_coordinate_weight(DType argmax_h, DType argmax_w, 178 | const int height, const int width, const DType* im_data, 179 | const int data_width, const int bp_dir) { 180 | 181 | if (argmax_h < 0 || argmax_h > height || argmax_w < 0 || argmax_w > width) 182 | { 183 | //empty 184 | return 0; 185 | } 186 | 187 | if (argmax_h < 0) argmax_h = 0; 188 | if (argmax_w < 0) argmax_w = 0; 189 | 190 | int argmax_h_low = (int)argmax_h; 191 | int argmax_w_low = (int)argmax_w; 192 | int argmax_h_high; 193 | int argmax_w_high; 194 | if (argmax_h_low >= height - 1) { 195 | argmax_h_high = argmax_h_low = height - 1; 196 | argmax_h = (DType)argmax_h_low; 197 | } else { 198 | argmax_h_high = argmax_h_low + 1; 199 | } 200 | if (argmax_w_low >= width - 1) { 201 | argmax_w_high = argmax_w_low = width - 1; 202 | argmax_w = (DType)argmax_w_low; 203 | } else { 204 | argmax_w_high = argmax_w_low + 1; 205 | } 206 | DType weight = 0; 207 | 208 | if (bp_dir == 0) { 209 | weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; 210 | weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; 211 | weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; 212 | weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 213 | } else if (bp_dir == 1) { 214 | weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; 215 | weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; 216 | weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; 217 | weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; 218 | } 219 | 220 | return weight; 221 | } 222 | 223 | 224 | /*! 225 | * \brief im2col gpu kernel. 226 | * DO NOT call this directly. 
Use wrapper function im2col() instead; 227 | */ 228 | template 229 | __global__ void deformable_im2col_gpu_kernel(const int n, const DType* data_im, const DType* data_offset, 230 | const int height, const int width, const int kernel_h, const int kernel_w, 231 | const int pad_h, const int pad_w, 232 | const int stride_h, const int stride_w, 233 | const int dilation_h, const int dilation_w, 234 | const int channel_per_deformable_group, 235 | const int height_col, const int width_col, 236 | DType* data_col) { 237 | CUDA_1D_KERNEL_LOOP(index, n) { 238 | // index index of output matrix 239 | const int w_col = index % width_col; 240 | const int h_col = (index / width_col) % height_col; 241 | const int c_im = (index / width_col) / height_col; 242 | const int c_col = c_im * kernel_h * kernel_w; 243 | 244 | // compute deformable group index 245 | const int deformable_group_index = c_im / channel_per_deformable_group; 246 | 247 | const int h_in = h_col * stride_h - pad_h; 248 | const int w_in = w_col * stride_w - pad_w; 249 | DType* data_col_ptr = data_col + (c_col * height_col + h_col) * width_col + w_col; 250 | const DType* data_im_ptr = data_im + (c_im * height + h_in) * width + w_in; 251 | const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; 252 | 253 | 254 | for (int i = 0; i < kernel_h; ++i) { 255 | for (int j = 0; j < kernel_w; ++j) { 256 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; 257 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; 258 | const DType offset_h = data_offset_ptr[data_offset_h_ptr]; 259 | const DType offset_w = data_offset_ptr[data_offset_w_ptr]; 260 | DType val = static_cast(0); 261 | const DType h_im = h_in + i * dilation_h + offset_h; 262 | const DType w_im = w_in + j * dilation_w + offset_w; 263 | if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { 264 | const DType map_h = i * dilation_h + offset_h; 265 | const DType map_w = j * dilation_w + offset_w; 266 | const int cur_height = height - h_in; 267 | const int cur_width = width - w_in; 268 | val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); 269 | } 270 | *data_col_ptr = val; 271 | data_col_ptr += height_col * width_col; 272 | } 273 | } 274 | } 275 | } 276 | 277 | /*! 278 | * \brief DO NOT call this directly. 
Use wrapper function deformable_col2im() instead; 279 | */ 280 | template 281 | __global__ void deformable_col2im_gpu_kernel(const int n, const DType* data_col, const DType* data_offset, 282 | const int channels, const int height, const int width, 283 | const int kernel_h, const int kernel_w, 284 | const int pad_h, const int pad_w, 285 | const int stride_h, const int stride_w, 286 | const int dilation_h, const int dilation_w, 287 | const int channel_per_deformable_group, 288 | const int height_col, const int width_col, 289 | DType* grad_im) { 290 | CUDA_1D_KERNEL_LOOP(index, n) { 291 | const int j = (index / width_col / height_col) % kernel_w; 292 | const int i = (index / width_col / height_col / kernel_w) % kernel_h; 293 | const int c = index / width_col / height_col / kernel_w / kernel_h; 294 | // compute the start and end of the output 295 | 296 | const int deformable_group_index = c / channel_per_deformable_group; 297 | 298 | int w_out = index % width_col; 299 | int h_out = (index / width_col) % height_col; 300 | int w_in = w_out * stride_w - pad_w; 301 | int h_in = h_out * stride_h - pad_h; 302 | 303 | const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; 304 | const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; 305 | const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; 306 | const DType offset_h = data_offset_ptr[data_offset_h_ptr]; 307 | const DType offset_w = data_offset_ptr[data_offset_w_ptr]; 308 | const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; 309 | const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; 310 | 311 | const DType cur_top_grad = data_col[index]; 312 | const int cur_h = (int)cur_inv_h_data; 313 | const int cur_w = (int)cur_inv_w_data; 314 | for (int dy = -2; dy <= 2; dy++) { 315 | for (int dx = -2; dx <= 2; dx++) { 316 | if (cur_h + dy >= 0 && cur_h + dy < height && 317 | cur_w + dx >= 0 && cur_w + dx < width && 318 | abs(cur_inv_h_data - (cur_h + dy)) < 1 && 319 | abs(cur_inv_w_data - (cur_w + dx)) < 1 320 | ) { 321 | int cur_bottom_grad_pos = (c * height + cur_h + dy) * width + cur_w + dx; 322 | DType weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); 323 | CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); 324 | } 325 | } 326 | } 327 | } 328 | } 329 | 330 | 331 | /*! 332 | * \brief DO NOT call this directly. 
Use wrapper function deformable_col2im_coord() instead; 333 | */ 334 | template 335 | __global__ void deformable_col2im_coord_gpu_kernel(const int n, const DType* data_col, 336 | const DType* data_im, const DType* data_offset, 337 | const int channels, const int height, const int width, 338 | const int kernel_h, const int kernel_w, 339 | const int pad_h, const int pad_w, 340 | const int stride_h, const int stride_w, 341 | const int dilation_h, const int dilation_w, 342 | const int channel_per_deformable_group, 343 | const int height_col, const int width_col, 344 | DType* grad_offset) { 345 | CUDA_1D_KERNEL_LOOP(index, n) { 346 | DType val = 0; 347 | int w = index % width_col; 348 | int h = (index / width_col) % height_col; 349 | int c = index / width_col / height_col; 350 | // compute the start and end of the output 351 | 352 | const int deformable_group_index = c / (2 * kernel_h * kernel_w); 353 | const int col_step = kernel_h * kernel_w; 354 | int cnt = 0; 355 | const DType* data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * width_col * height_col; 356 | const DType* data_im_ptr = data_im + deformable_group_index * channel_per_deformable_group / kernel_h / kernel_w * height * width; 357 | const DType* data_offset_ptr = data_offset + deformable_group_index * 2 * kernel_h * kernel_w * height_col * width_col; 358 | 359 | const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; 360 | 361 | for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { 362 | const int col_pos = ((col_c * height_col) + h) * width_col + w; 363 | const int bp_dir = offset_c % 2; 364 | 365 | int j = (col_pos / width_col / height_col) % kernel_w; 366 | int i = (col_pos / width_col / height_col / kernel_w) % kernel_h; 367 | int w_out = col_pos % width_col; 368 | int h_out = (col_pos / width_col) % height_col; 369 | int w_in = w_out * stride_w - pad_w; 370 | int h_in = h_out * stride_h - pad_h; 371 | const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); 372 | const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); 373 | const DType offset_h = data_offset_ptr[data_offset_h_ptr]; 374 | const DType offset_w = data_offset_ptr[data_offset_w_ptr]; 375 | DType inv_h = h_in + i * dilation_h + offset_h; 376 | DType inv_w = w_in + j * dilation_w + offset_w; 377 | if (inv_h < 0 || inv_w < 0 || inv_h >= height || inv_w >= width) { 378 | inv_h = inv_w = -1; 379 | } 380 | const DType weight = get_coordinate_weight( 381 | inv_h, inv_w, 382 | height, width, data_im_ptr + cnt * height * width, width, bp_dir); 383 | val += weight * data_col_ptr[col_pos]; 384 | cnt += 1; 385 | } 386 | 387 | grad_offset[index] = val; 388 | } 389 | } 390 | 391 | 392 | /*! 393 | * \brief im2col gpu kernel. 394 | * DO NOT call this directly. 
Use wrapper function im2col() instead; 395 | */ 396 | template 397 | __global__ void im2col_gpu_kernel(const int n, const DType* data_im, 398 | const int height, const int width, const int kernel_h, const int kernel_w, 399 | const int pad_h, const int pad_w, 400 | const int stride_h, const int stride_w, 401 | const int dilation_h, const int dilation_w, 402 | const int height_col, const int width_col, 403 | DType* data_col) { 404 | CUDA_1D_KERNEL_LOOP(index, n) { 405 | const int h_index = index / width_col; 406 | const int h_col = h_index % height_col; 407 | const int w_col = index % width_col; 408 | const int c_im = h_index / height_col; 409 | const int c_col = c_im * kernel_h * kernel_w; 410 | const int h_offset = h_col * stride_h - pad_h; 411 | const int w_offset = w_col * stride_w - pad_w; 412 | DType* data_col_ptr = data_col; 413 | data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; 414 | const DType* data_im_ptr = data_im; 415 | data_im_ptr += (c_im * height + h_offset) * width + w_offset; 416 | for (int i = 0; i < kernel_h; ++i) { 417 | for (int j = 0; j < kernel_w; ++j) { 418 | int h_im = h_offset + i * dilation_h; 419 | int w_im = w_offset + j * dilation_w; 420 | *data_col_ptr = 421 | (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ? 422 | data_im_ptr[i * dilation_h * width + j * dilation_w] : static_cast(0); 423 | data_col_ptr += height_col * width_col; 424 | } 425 | } 426 | } 427 | } 428 | 429 | 430 | template 431 | __global__ void pureAddToKernel(const int n, DType* result_data, const DType* right_data) 432 | { 433 | 434 | CUDA_1D_KERNEL_LOOP(index, n) { 435 | CudaAtomicAdd(result_data+index, right_data[index]); 436 | } 437 | 438 | } 439 | template 440 | __global__ void pureSubToKernel(const int n, DType* result_data, const DType* right_data) 441 | { 442 | 443 | CUDA_1D_KERNEL_LOOP(index, n) { 444 | CudaAtomicAdd(result_data+index, -right_data[index]); 445 | } 446 | 447 | } 448 | 449 | template 450 | __global__ void setZeroKernel(const int n, DType* result_data) 451 | { 452 | 453 | CUDA_1D_KERNEL_LOOP(index, n) { 454 | *(result_data+index)=DType(0); 455 | } 456 | 457 | } 458 | 459 | 460 | namespace functor { 461 | 462 | inline int ProdShape(const TShape &shape, int start); 463 | 464 | 465 | /*!\brief im2col gpu version 466 | * \param s device stream 467 | * \param data_im pointer of an image (C, H, W, ...) in the image batch 468 | * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) 
469 | * \param kernel_shape kernel filter shape 470 | * \param pad pad shape 471 | * \param stride stride shape 472 | * \param dilation dilation shape 473 | * \param data_col column buffer pointer 474 | */ 475 | template 476 | struct deformable_im2col{ 477 | void operator()(const GPUDevice& d, 478 | const DType* data_im, const DType* data_offset, 479 | const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, 480 | const TShape& pad, const TShape& stride, const TShape& dilation, 481 | const int deformable_group, DType* data_col) { 482 | // num_axes should be smaller than block size 483 | int num_spatial_axes = kernel_shape.size(); 484 | int channel_per_deformable_group = im_shape[1] / deformable_group; 485 | int num_kernels = im_shape[1] * ProdShape(col_shape, 1); 486 | CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); 487 | CHECK_LT(num_spatial_axes, config.thread_per_block); 488 | // num_spatial_axes 2 channel_per_deformable_group 6 num_kernels 72 col_shape 24 3 4 0 im_shape8 6 im_shape4 5 kernel_shape 2 2 pad 0 0 489 | // num_spatial_axes 2 channel_per_deformable_group 6 num_kernels 72 col_shape 24 3 4 0 im_shape8 6 config 1 256 490 | // LOG(INFO) << "num_spatial_axes " << num_spatial_axes << " channel_per_deformable_group " << channel_per_deformable_group 491 | // << " num_kernels " << num_kernels << " col_shape " << col_shape[0]<<" " << col_shape[1] <<" " << col_shape[2]<<" " << col_shape[3]<<" " 492 | // << "im_shape" < // NOLINT_NEXT_LINE(whitespace/operators) 498 | <<>>( 499 | num_kernels, data_im, data_offset, im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], 500 | pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], channel_per_deformable_group, 501 | col_shape[1], col_shape[2], data_col); 502 | // MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_im2col_gpu_kernel); 503 | break; 504 | default: 505 | LOG(FATAL) << "im2col_nd_gpu does not support computation with " 506 | << num_spatial_axes << " spatial axes"; 507 | } 508 | } 509 | 510 | }; 511 | 512 | 513 | inline int ProdShape(const TShape &shape, int start) { 514 | int64 res = 1; 515 | for(int i=start; i 534 | struct deformable_col2im{ 535 | void operator()(const GPUDevice& d, 536 | const DType* data_col, const DType* data_offset, 537 | const TShape& im_shape, const TShape& col_shape, const TShape& kernel_shape, 538 | const TShape& pad, const TShape& stride, 539 | const TShape& dilation, const int deformable_group, 540 | DType* grad_im) { 541 | int num_spatial_axes = kernel_shape.size(); 542 | int im_size = ProdShape(im_shape, 1); 543 | int channel_per_deformable_group = im_shape[1] / deformable_group; 544 | int num_kernels = ProdShape(col_shape, 0); 545 | // num_axes should be smaller than block size 546 | CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); 547 | CHECK_LT(num_spatial_axes, config.thread_per_block); 548 | // 6 4 52 2 0 0 2 21 1 6 2 2 549 | // 6 4 52 2 0 0 2 21 1 6 2 2 550 | // LOG(INFO) << im_shape[1]<<' '<<<>>( 560 | num_kernels, data_col, data_offset, im_shape[1], im_shape[2], im_shape[3], 561 | kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], 562 | dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_im); 563 | // MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); 564 | break; 565 | default: 566 | LOG(FATAL) << "col2im_nd_gpu does not support computation with " 567 | << num_spatial_axes << " spatial axes"; 568 | } 569 | } 570 | }; 571 | 572 | 573 | template 574 | struct 
deformable_col2im_coord{ 575 | void operator() (const GPUDevice& d, 576 | const DType* data_col, const DType* data_im, const DType* data_offset, const TShape& im_shape, 577 | const TShape& col_shape, const TShape& kernel_shape, 578 | const TShape& pad, const TShape& stride, 579 | const TShape& dilation, const int deformable_group, DType* grad_offset) { 580 | size_t num_spatial_axes = kernel_shape.size(); 581 | size_t num_kernels = col_shape[1] * col_shape[2] * 2 * kernel_shape[0] * kernel_shape[1] * deformable_group; 582 | size_t channel_per_deformable_group = col_shape[0] / deformable_group; 583 | // num_axes should be smaller than block size 584 | CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); 585 | CHECK_LT(num_spatial_axes, config.thread_per_block); 586 | // using namespace mxnet_op; 587 | switch (num_spatial_axes) { 588 | case 2: 589 | // To avoid involving atomic operations, we will launch one kernel per 590 | // bottom dimension, and then in the kernel add up the top dimensions. 591 | // NOLINT_NEXT_LINE(whitespace/operators) 592 | 593 | deformable_col2im_coord_gpu_kernel << > >( 594 | num_kernels, data_col, data_im, data_offset, im_shape[1], im_shape[2], im_shape[3], 595 | kernel_shape[0], kernel_shape[1], pad[0], pad[1], stride[0], stride[1], 596 | dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], col_shape[2], grad_offset); 597 | // MSHADOW_CUDA_POST_KERNEL_CHECK(deformable_col2im_gpu_kernel); 598 | break; 599 | default: 600 | LOG(FATAL) << "col2im_nd_gpu does not support computation with " 601 | << num_spatial_axes << " spatial axes"; 602 | } 603 | } 604 | }; 605 | 606 | 607 | /*!\brief im2col gpu version 608 | * \param s device stream 609 | * \param data_im pointer of an image (C, H, W, ...) in the image batch 610 | * \param col_shape column buffer shape (#channels, output_im_height, output_im_width, ...) 
611 | * \param kernel_shape kernel filter shape 612 | * \param pad pad shape 613 | * \param stride stride shape 614 | * \param dilation dilation shape 615 | * \param data_col column buffer pointer 616 | */ 617 | template 618 | struct im2col{ 619 | void operator() (const GPUDevice& d, 620 | const DType* data_im, const TShape& im_shape, 621 | const TShape& col_shape, const TShape& kernel_shape, 622 | const TShape& pad, const TShape& stride, 623 | const TShape& dilation, DType* data_col) { 624 | // num_axes should be smaller than block size 625 | int num_spatial_axes = kernel_shape.size(); 626 | int num_kernels = im_shape[1] * ProdShape(col_shape, 1); 627 | CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); 628 | CHECK_LT(num_spatial_axes, config.thread_per_block); 629 | 630 | switch (num_spatial_axes) { 631 | case 2: 632 | im2col_gpu_kernel // NOLINT_NEXT_LINE(whitespace/operators) 633 | <<>>( 634 | num_kernels, data_im, im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], 635 | pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], 636 | col_shape[1], col_shape[2], data_col); 637 | break; 638 | 639 | default: 640 | LOG(FATAL) << "im2col_nd_gpu does not support computation with " 641 | << num_spatial_axes << " spatial axes"; 642 | } 643 | } 644 | }; 645 | 646 | 647 | template 648 | struct pureAddTo{ 649 | void operator() (const GPUDevice& d, const int n, DType* result_data, const DType* right_data){ 650 | CudaLaunchConfig config = GetCudaLaunchConfig(n, d); 651 | pureAddToKernel <<< config.block_count, config.thread_per_block, 0, d.stream() >>>(n, result_data, right_data); 652 | } 653 | 654 | }; 655 | 656 | template 657 | struct pureSubTo{ 658 | void operator() (const GPUDevice& d, const int n, DType* result_data, const DType* right_data){ 659 | CudaLaunchConfig config = GetCudaLaunchConfig(n, d); 660 | pureSubToKernel <<< config.block_count, config.thread_per_block, 0, d.stream() >>>(n, result_data, right_data); 661 | } 662 | 663 | }; 664 | template 665 | struct setZero{ 666 | void operator() (const GPUDevice& d, const int n, DType* result_data){ 667 | CudaLaunchConfig config = GetCudaLaunchConfig(n, d); 668 | setZeroKernel <<< config.block_count, config.thread_per_block, 0, d.stream() >>>(n, result_data); 669 | } 670 | 671 | }; 672 | 673 | 674 | } // namespace functor 675 | 676 | #define DECLARE_GPU_SPEC(DType) \ 677 | template struct functor::deformable_im2col; \ 678 | template struct functor::deformable_col2im; \ 679 | template struct functor::deformable_col2im_coord; \ 680 | template struct functor::pureAddTo; \ 681 | template struct functor::pureSubTo; \ 682 | template struct functor::setZero; \ 683 | template struct functor::im2col; 684 | 685 | // extern template struct Copy; 686 | TF_CALL_float(DECLARE_GPU_SPEC); 687 | TF_CALL_double(DECLARE_GPU_SPEC); 688 | 689 | // TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); 690 | #undef DECLARE_GPU_SPEC 691 | 692 | } // namespace tensorflow 693 | 694 | // #endif // GOOGLE_CUDA 695 | 696 | #endif // TENSORFLOW_KERNELS_CONV_OPS_im2col_gpu_H_ 697 | -------------------------------------------------------------------------------- /lib/deformable_pooling.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | // See docs in ../ops/nn_ops.cc. 17 | 18 | #define EIGEN_USE_THREADS 19 | 20 | #include 21 | #include 22 | 23 | 24 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 25 | // #include "tensorflow/core/common_runtime/device.h" 26 | #include "tensorflow/core/framework/op_kernel.h" 27 | #include "tensorflow/core/framework/op.h" 28 | #include "tensorflow/core/framework/numeric_op.h" 29 | #include "tensorflow/core/framework/register_types.h" 30 | #include "tensorflow/core/framework/tensor.h" 31 | #include "tensorflow/core/framework/tensor_shape.h" 32 | #include "tensorflow/core/framework/tensor_slice.h" 33 | #include "tensorflow/core/framework/common_shape_fns.h" 34 | #include "tensorflow/core/lib/core/errors.h" 35 | #include "tensorflow/core/lib/gtl/array_slice.h" 36 | #include "tensorflow/core/util/padding.h" 37 | #include "tensorflow/core/framework/shape_inference.h" 38 | #include "tensorflow/core/util/tensor_format.h" 39 | #include "tensorflow/core/kernels/bounds_check.h" 40 | 41 | #include "tensorflow/core/platform/stream_executor.h" 42 | #include "deform_conv.h" 43 | #include "deform_conv_util.h" 44 | 45 | 46 | namespace tensorflow { 47 | using shape_inference::DimensionHandle; 48 | using shape_inference::InferenceContext; 49 | using shape_inference::ShapeHandle; 50 | 51 | REGISTER_OP("DeformPoolOp").Input("x: T") 52 | .Input("offset: T") 53 | .Output("output: T") 54 | .Attr("T: {half, float, double}") 55 | .Attr("strides: list(int)") 56 | .Attr("pool_size: int") 57 | .Attr("rates: list(int)") 58 | .Attr("num_groups: int") 59 | .Attr(GetPaddingAttrString()) 60 | .Attr("data_format: { 'NHWC', 'NCHW' } = 'NCHW' ") 61 | .SetShapeFn([](InferenceContext* c) { 62 | ShapeHandle input_shape; 63 | TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); 64 | ShapeHandle filter_shape; 65 | TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape)); 66 | 67 | std::vector strides; 68 | TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); 69 | if (strides.size() != 4) { 70 | return errors::InvalidArgument( 71 | "Deformconv requires the stride attribute to contain 4 values, but " 72 | "got: ", 73 | strides.size()); 74 | } 75 | 76 | std::vector rates; 77 | TF_RETURN_IF_ERROR(c->GetAttr("rates", &rates)); 78 | if (rates.size() != 4) { 79 | return errors::InvalidArgument( 80 | "Deformconv requires the rates attribute to contain 4 values, but " 81 | "got: ", 82 | rates.size()); 83 | } 84 | string data_format; 85 | TensorFormat data_format_; 86 | TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format)); 87 | FormatFromString(data_format, &data_format_); 88 | const int32 stride_rows = GetTensorDim(strides, data_format_, 'H'); 89 | const int32 stride_cols = GetTensorDim(strides, data_format_, 'W'); 90 | 91 | const int32 rate_rows = GetTensorDim(rates, data_format_, 'H'); 92 | const int32 rate_cols = GetTensorDim(rates, data_format_, 'W'); 93 | 94 | 95 | int groups; 96 | TF_RETURN_IF_ERROR(c->GetAttr("num_groups", &groups)); 97 | 98 | int pool_size; 99 | 
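// Illustrative note (hypothetical numbers, not taken from the op itself): the
// code below reads the pool_size attribute and then applies the standard
// windowed output-size arithmetic. For example, with VALID padding, a 4 x 5
// input, a 2 x 2 window, stride 2 and rate 1, GetWindowedOutputSizeVerbose
// yields output_rows = 2 and output_cols = 2, so the inferred output shape is
// [batch, output_depth, 2, 2] in NCHW order.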
TF_RETURN_IF_ERROR(c->GetAttr("pool_size", &pool_size)) 100 | 101 | DimensionHandle batch_size_dim = c->Dim(input_shape, 0); 102 | DimensionHandle in_rows_dim = c->Dim(input_shape, 2); 103 | DimensionHandle in_cols_dim = c->Dim(input_shape, 3); 104 | DimensionHandle filter_rows_dim = c->Dim(filter_shape, 2); 105 | DimensionHandle filter_cols_dim = c->Dim(filter_shape, 3); 106 | DimensionHandle filter_depth_dim = c->Dim(filter_shape, 1); 107 | DimensionHandle output_depth_dim = c->Dim(filter_shape, 0); 108 | DimensionHandle multiplied_depth; 109 | TF_RETURN_IF_ERROR( 110 | c->Multiply(filter_depth_dim, groups, &multiplied_depth)); 111 | 112 | 113 | 114 | if (!c->ValueKnown(in_rows_dim) || !c->ValueKnown(in_cols_dim) || 115 | !c->ValueKnown(filter_rows_dim) || !c->ValueKnown(filter_cols_dim)) { 116 | ShapeHandle output_shape = 117 | c->MakeShape({batch_size_dim, output_depth_dim, InferenceContext::kUnknownDim, 118 | InferenceContext::kUnknownDim 119 | }); 120 | c->set_output(0, output_shape); 121 | return Status::OK(); 122 | } 123 | DimensionHandle unused; 124 | TF_RETURN_IF_ERROR( 125 | c->Merge(c->Dim(input_shape, 1), multiplied_depth, &unused)); 126 | 127 | auto in_rows = c->Value(in_rows_dim); 128 | auto in_cols = c->Value(in_cols_dim); 129 | auto filter_rows = c->Value(filter_rows_dim); 130 | auto filter_cols = c->Value(filter_cols_dim); 131 | auto filter_rows_eff = filter_rows + (filter_rows - 1) * (rate_rows - 1); 132 | auto filter_cols_eff = filter_cols + (filter_cols - 1) * (rate_cols - 1); 133 | 134 | Padding padding; 135 | TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); 136 | 137 | int64 output_rows, output_cols; 138 | int64 padding_before, padding_after; 139 | TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( 140 | in_rows, filter_rows_eff, stride_rows, padding, &output_rows, 141 | &padding_before, &padding_after)); 142 | TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( 143 | in_cols, filter_cols_eff, stride_cols, padding, &output_cols, 144 | &padding_before, &padding_after)); 145 | 146 | ShapeHandle output_shape = c->MakeShape( 147 | {batch_size_dim, output_depth_dim, output_rows, output_cols}); 148 | c->set_output(0, output_shape); 149 | return Status::OK(); 150 | }) 151 | .Doc(R"doc( 152 | only support NCHW now 153 | )doc"); 154 | 155 | 156 | REGISTER_OP("DeformPoolBackpropOp").Input("x: T") 157 | .Input("offset: T") 158 | .Input("out_grad: T") 159 | .Output("x_grad: T") 160 | .Output("filter_grad: T") 161 | .Output("offset_grad: T") 162 | .Attr("T: {half, float, double}") 163 | .Attr("strides: list(int)") 164 | .Attr("rates: list(int)") 165 | .Attr("num_groups: int") 166 | .Attr("pool_size: int") 167 | .Attr(GetPaddingAttrString()) 168 | .Attr("data_format: { 'NHWC', 'NCHW' } = 'NCHW' ") 169 | .SetShapeFn([](InferenceContext* c) { 170 | c->set_output(0, c->input(0)); 171 | c->set_output(1, c->input(1)); 172 | c->set_output(2, c->input(2)); 173 | return Status::OK(); 174 | }) 175 | .Doc(R"doc( 176 | only support NCHW now 177 | )doc"); 178 | 179 | typedef std::vector TShape; 180 | typedef Eigen::ThreadPoolDevice CPUDevice; 181 | typedef Eigen::GpuDevice GPUDevice; 182 | 183 | namespace { 184 | template 185 | perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { 186 | perftools::gputools::DeviceMemoryBase wrapped(const_cast(cuda_memory)); 187 | perftools::gputools::DeviceMemory typed(wrapped); 188 | return typed; 189 | } 190 | 191 | class CublasScratchAllocator : public perftools::gputools::ScratchAllocator { 192 | public: 193 | using Stream = 
::perftools::gputools::Stream; 194 | using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory; 195 | 196 | CublasScratchAllocator(OpKernelContext* context) : context_(context) {} 197 | 198 | int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; } 199 | 200 | perftools::gputools::port::StatusOr AllocateBytes( 201 | Stream* stream, int64 byte_size) override { 202 | Tensor temporary_memory; 203 | 204 | Status allocation_status(context_->allocate_temp( 205 | DT_UINT8, TensorShape({byte_size}), &temporary_memory)); 206 | if (!allocation_status.ok()) { 207 | return perftools::gputools::port::StatusOr( 208 | DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); 209 | } 210 | // Hold the reference of the allocated tensors until the end of the 211 | // allocator. 212 | allocated_tensors_.push_back(temporary_memory); 213 | return perftools::gputools::port::StatusOr( 214 | DeviceMemoryBytes::MakeFromByteSize( 215 | temporary_memory.flat().data(), 216 | temporary_memory.flat().size())); 217 | } 218 | 219 | private: 220 | OpKernelContext* context_; 221 | std::vector allocated_tensors_; 222 | }; 223 | } // namespace 224 | 225 | 226 | namespace functor{ 227 | // LaunchBatchMatMul from batch_matmul_impl.h, modifies so now only support 2d case 228 | template 229 | struct LaunchBatchMatMul { 230 | // static void Launch(OpKernelContext* context, const Tensor& in_x, 231 | // const Tensor& in_y, bool adj_x, bool adj_y, Scalar* out) { 232 | static void Launch(OpKernelContext* context, const TensorShape& in_x_shape, const TensorShape& in_y_shape, const Scalar* in_x_ptr, 233 | const Scalar* in_y_ptr, bool adj_x, bool adj_y, Scalar* out) { 234 | constexpr perftools::gputools::blas::Transpose kTranspose = 235 | is_complex::value 236 | ? perftools::gputools::blas::Transpose::kConjugateTranspose 237 | : perftools::gputools::blas::Transpose::kTranspose; 238 | perftools::gputools::blas::Transpose trans[] = { 239 | perftools::gputools::blas::Transpose::kNoTranspose, kTranspose}; 240 | const uint64 m = in_x_shape.dim_size(adj_x ? 2 : 1); 241 | const uint64 k = in_x_shape.dim_size(adj_x ? 1 : 2); 242 | const uint64 n = in_y_shape.dim_size(adj_y ? 1 : 2); 243 | const uint64 batch_size = in_x_shape.dim_size(0); 244 | auto blas_transpose_a = trans[adj_x]; 245 | auto blas_transpose_b = trans[adj_y]; 246 | 247 | auto* stream = context->op_device_context()->stream(); 248 | OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); 249 | 250 | typedef perftools::gputools::DeviceMemory DeviceMemoryType; 251 | std::vector a_device_memory; 252 | std::vector b_device_memory; 253 | std::vector c_device_memory; 254 | std::vector a_ptrs; 255 | std::vector b_ptrs; 256 | std::vector c_ptrs; 257 | a_device_memory.reserve(batch_size); 258 | b_device_memory.reserve(batch_size); 259 | c_device_memory.reserve(batch_size); 260 | a_ptrs.reserve(batch_size); 261 | b_ptrs.reserve(batch_size); 262 | c_ptrs.reserve(batch_size); 263 | auto* a_base_ptr = in_x_ptr; 264 | auto* b_base_ptr = in_y_ptr; 265 | // auto* c_base_ptr = out->template flat().data(); 266 | auto* c_base_ptr = out; 267 | for (int64 i = 0; i ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m, 302 | static_cast(1.0), *(a_ptrs[0]), 303 | adj_x ? 
m : k, *(b_ptrs[0]), 1, 304 | static_cast(0.0), c_ptrs[0], 1) 305 | .ok(); 306 | if (!blas_launch_status) { 307 | context->SetStatus(errors::Internal( 308 | "Blas xGEMV launch failed : a.shape=", in_x_shape.DebugString(), 309 | ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, 310 | ", k=", k)); 311 | } 312 | } else { 313 | bool blas_launch_status = 314 | stream 315 | ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 316 | static_cast(1.0), *(b_ptrs[0]), 317 | adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k, 318 | static_cast(0.0), c_ptrs[0], n) 319 | .ok(); 320 | if (!blas_launch_status) { 321 | context->SetStatus(errors::Internal( 322 | "Blas xGEMM launch failed : a.shape=", in_x_shape.DebugString(), 323 | ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, 324 | ", k=", k)); 325 | } 326 | } 327 | } else { 328 | CublasScratchAllocator scratch_allocator(context); 329 | bool blas_launch_status = 330 | stream 331 | ->ThenBlasGemmBatchedWithScratch( 332 | blas_transpose_b, blas_transpose_a, n, m, k, 333 | static_cast(1.0), b_ptrs, adj_y ? k : n, a_ptrs, 334 | adj_x ? m : k, static_cast(0.0), c_ptrs, n, 335 | batch_size, &scratch_allocator) 336 | .ok(); 337 | if (!blas_launch_status) { 338 | context->SetStatus(errors::Internal( 339 | "Blas xGEMMBatched launch failed : a.shape=", 340 | in_x_shape.DebugString(), 341 | ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, 342 | ", k=", k, ", batch_size=", batch_size)); 343 | } 344 | } 345 | } 346 | }; 347 | } 348 | 349 | template 350 | class DeformPoolOp : public OpKernel { 351 | public: 352 | explicit DeformPoolOp(OpKernelConstruction* context) : OpKernel(context) { 353 | OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); 354 | OP_REQUIRES_OK(context, context->GetAttr("rates", &rates_)); 355 | string data_format; 356 | OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 357 | // TensorFormat data_format_; 358 | OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 359 | errors::InvalidArgument("Invalid data format")); 360 | OP_REQUIRES(context, strides_.size() == 4, 361 | errors::InvalidArgument("Sliding window strides field must " 362 | "specify 4 dimensions")); 363 | const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); 364 | const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); 365 | OP_REQUIRES( 366 | context, stride_n == 1 && stride_c == 1, 367 | errors::InvalidArgument("Current implementation does not yet support " 368 | "strides in the batch and depth dimensions.")); 369 | OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 370 | const int64 stride_H = GetTensorDim(strides_, data_format_, 'H'); 371 | const int64 stride_W = GetTensorDim(strides_, data_format_, 'W'); 372 | OP_REQUIRES_OK(context, context->GetAttr("num_groups", &num_groups)); 373 | OP_REQUIRES_OK(context, context->GetAttr("pool_size", &pool_size)) 374 | 375 | } 376 | 377 | void Compute(OpKernelContext* context) override { 378 | CHECK(data_format_ == FORMAT_NCHW) << "Generic conv implementation only " 379 | "supports NCHW tensor format for now."; 380 | // Input tensor is of the following dimensions: 381 | // [ batch, in_rows, in_cols, in_depth ] 382 | 383 | const Tensor& input = context->input(0); 384 | const TensorShape& ishape = input.shape(); 385 | 386 | const Tensor& offset = context->input(1); 387 | const TensorShape& offset_shape = offset.shape(); 388 | 389 | // For 2D convolution, there should be 4 dimensions. 
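// Note: the checks below only verify that both tensors are 4-D. The layout the
// rest of this library assumes (inferred from the indexing in
// deform_conv.cu.cc, not validated here) is NCHW for `input`, i.e.
// [batch, channels, height, width], while `offset` packs one (h, w)
// displacement per kernel tap and output location:
//   offset[b, 2*(i*kernel_w + j) + 0, y, x]  -> h-offset for tap (i, j)
//   offset[b, 2*(i*kernel_w + j) + 1, y, x]  -> w-offset for tap (i, j)
// with that channel block repeated once per deformable group.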
390 | OP_REQUIRES(context, input.dims() == 4, 391 | errors::InvalidArgument("input must be 4-dimensional", 392 | input.shape().DebugString())); 393 | OP_REQUIRES(context, offset.dims() == 4, 394 | errors::InvalidArgument("offset must be 4-dimensional: ", 395 | offset.shape().DebugString())); 396 | } 397 | 398 | // The last dimension for input is in_depth. It must be the same as the 399 | // filter's in_depth. 400 | const int64 in_depth = GetTensorDim(input, data_format_, 'C'); 401 | 402 | // The second dimension for input is rows/height. 403 | // The first dimension for filter is rows/height. 404 | const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); 405 | OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, 406 | std::numeric_limits::max()), 407 | errors::InvalidArgument("Input rows too large")); 408 | const int input_rows = static_cast(input_rows_raw); 409 | 410 | // The third dimension for input is columns/width. 411 | // The second dimension for filter is columns/width. 412 | const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); 413 | OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, 414 | std::numeric_limits::max()), 415 | errors::InvalidArgument("Input cols too large")); 416 | const int input_cols = static_cast(input_cols_raw); 417 | 418 | // The first dimension for input is batch. 419 | const int64 batch_raw = GetTensorDim(input, data_format_, 'N'); 420 | OP_REQUIRES(context, 421 | FastBoundsCheck(batch_raw, std::numeric_limits::max()), 422 | errors::InvalidArgument("batch is too large")); 423 | const int batch = static_cast(batch_raw); 424 | 425 | // For now we take the stride from the second and third dimensions only (we 426 | // do not support striding on the batch or depth dimension). 427 | const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); 428 | const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); 429 | const int rate_rows = GetTensorDim(rates_, data_format_, 'H'); 430 | const int rate_cols = GetTensorDim(rates_, data_format_, 'W'); 431 | 432 | int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; 433 | OP_REQUIRES_OK(context, 434 | GetWindowedOutputSize(input_rows, filter_rows, stride_rows, 435 | padding_, &out_rows, &pad_rows)); 436 | OP_REQUIRES_OK(context, 437 | GetWindowedOutputSize(input_cols, filter_cols, stride_cols, 438 | padding_, &out_cols, &pad_cols)); 439 | TShape pad({static_cast(pad_rows), static_cast(pad_cols)}); 440 | TShape stride({stride_rows, stride_cols}); 441 | TShape rates({rate_rows, rate_cols}); 442 | TensorShape out_shape = ShapeFromFormat(data_format_, batch, out_rows, out_cols); 443 | auto temp = DeformPoolParam(kernels, stride, pad, rates, num_groups, pool_size, true); 444 | this->param_ = &temp; 445 | // LOG(INFO)<<"rates "<<(this->param_->rates)[0]<<" "<<(this->param_->rates)[1]; 446 | LayerSetUp(ishape, offset_shape, out_shape); 447 | 448 | 449 | 450 | 451 | VLOG(2) << "Conv2D: in_depth = " << in_depth 452 | << ", input_cols = " << input_cols 453 | << ", filter_cols = " << filter_cols 454 | << ", input_rows = " << input_rows 455 | << ", filter_rows = " << filter_rows 456 | << ", stride_rows = " << stride_rows 457 | << ", stride_cols = " << stride_cols 458 | << ", out_depth = " << num_filter; 459 | 460 | // If there is nothing to compute, return. 
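// (The early return below fires when any output dimension is zero. Note that,
// as written, this forward Compute only validates shapes, sets up the layer
// bookkeeping via LayerSetUp and logs; it does not allocate or fill an output
// tensor.)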
461 | if (out_shape.num_elements() == 0) 462 | return; 463 | 464 | } 465 | 466 | private: 467 | void LayerSetUp(const TensorShape& ishape, const TensorShape& offset_shape, 468 | const TensorShape& oshape) { 469 | channel_axis_ = 1; // hard code channel axis 470 | const int first_spatial_axis = channel_axis_ + 1; 471 | const int num_axes = param_->kernel.size() + 2; 472 | num_spatial_axes_ = num_axes - first_spatial_axis; 473 | is_1x1_ = true; 474 | for (int i = 0; i < param_->kernel.size(); ++i) { 475 | is_1x1_ &= 476 | param_->kernel[i] == 1 && param_->stride[i] == 1 && param_->pad[i] == 0; 477 | if (!is_1x1_) break; 478 | } 479 | 480 | // batch size 481 | num_ = ishape.dim_size(0); 482 | // number of input channels 483 | channels_ = ishape.dim_size(1); 484 | group_ = param_->num_group; 485 | conv_out_channels_ = param_->num_filter; 486 | conv_in_channels_ = channels_; 487 | bias_term_ = !param_->no_bias; 488 | kernel_dim_ = conv_in_channels_ / group_ * param_->kernel[0]*param_->kernel[1]; 489 | weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; 490 | conv_out_spatial_dim_ = ProdShape(oshape, 2); 491 | col_offset_ = kernel_dim_ * conv_out_spatial_dim_; 492 | output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; 493 | // size of the column buffer used for storing im2col-ed pixels 494 | col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; 495 | // input/output image size (#channels * height * width) 496 | input_dim_ = ProdShape(ishape, 1); 497 | input_offset_dim_ = ProdShape(offset_shape, 1); 498 | output_dim_ = ProdShape(oshape, 1); 499 | num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; 500 | num_kernels_col2im_ = input_dim_; 501 | } 502 | 503 | // DeformableConvolutionParam param_; 504 | int channel_axis_; // channel axis of the input 505 | int channels_; // number of channels of input image 506 | int num_spatial_axes_; // number of spatial axes 507 | int num_; // batch size 508 | int group_; // number of groups 509 | int conv_out_channels_; // number of output channels (num_filter) 510 | int conv_out_spatial_dim_; // number of pixels of output images per channel 511 | int conv_in_channels_; // number of input channels 512 | int kernel_dim_; // number of input channels per group * kernel size 513 | int weight_offset_; // number of output channels per group * kernel_dim_ 514 | int col_offset_; 515 | int output_offset_; 516 | int col_buffer_size_; 517 | int input_dim_; 518 | int input_offset_dim_; 519 | int output_dim_; 520 | int num_kernels_im2col_; 521 | int num_kernels_col2im_; 522 | int num_groups; 523 | int pool_size; 524 | bool bias_term_; // has bias term? 
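// Illustrative example of the bookkeeping computed in LayerSetUp() above
// (hypothetical numbers): for an NCHW input of shape [2, 6, 4, 5], a 2 x 2
// kernel, group_ = 3 and a 2 x 2 output,
//   kernel_dim_           = 6 / 3 * 2 * 2            = 8
//   conv_out_spatial_dim_ = 2 * 2                    = 4
//   col_offset_           = kernel_dim_ * 4          = 32
//   col_buffer_size_      = kernel_dim_ * group_ * 4 = 96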
525 | bool is_1x1_; 526 | 527 | std::vector strides_; 528 | std::vector rates_; 529 | Padding padding_; 530 | TensorFormat data_format_; 531 | DeformConvParam* param_; 532 | }; 533 | 534 | 535 | 536 | template 537 | class DeformPoolBackpropOp : public OpKernel { 538 | public: 539 | explicit DeformPoolBackpropOp(OpKernelConstruction* context) 540 | : OpKernel(context) { 541 | OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); 542 | OP_REQUIRES_OK(context, context->GetAttr("rates", &rates_)); 543 | string data_format; 544 | OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 545 | // TensorFormat data_format_; 546 | OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 547 | errors::InvalidArgument("Invalid data format")); 548 | OP_REQUIRES(context, strides_.size() == 4, 549 | errors::InvalidArgument("Sliding window strides field must " 550 | "specify 4 dimensions")); 551 | const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); 552 | const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); 553 | OP_REQUIRES( 554 | context, stride_n == 1 && stride_c == 1, 555 | errors::InvalidArgument("Current implementation does not yet support " 556 | "strides in the batch and depth dimensions.")); 557 | OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 558 | const int64 stride_H = GetTensorDim(strides_, data_format_, 'H'); 559 | const int64 stride_W = GetTensorDim(strides_, data_format_, 'W'); 560 | OP_REQUIRES_OK(context, context->GetAttr("num_groups", &num_groups)); 561 | OP_REQUIRES_OK(context, context->GetAttr("pool_size", &pool_size)); 562 | } 563 | 564 | void Compute(OpKernelContext* context) override { 565 | const Tensor& input = context->input(0); 566 | const Tensor& filter = context->input(1); 567 | const Tensor& offset = context->input(2); 568 | const Tensor& out_backprop = context->input(3); 569 | const T* input_ptr = input.template flat().data(); 570 | const T* filter_ptr = filter.template flat().data(); 571 | const T* offset_ptr = offset.template flat().data(); 572 | const T* out_backprop_ptr = out_backprop.template flat().data(); 573 | const TensorShape& input_shape = input.shape(); 574 | const TensorShape& filter_shape = filter.shape(); 575 | const TensorShape& offset_shape = offset.shape(); 576 | const TensorShape& out_backprop_shape = out_backprop.shape(); 577 | 578 | // const Tensor& filter_backprop = context->input(4); 579 | // const Tensor& offset_backprop = context->input(5); 580 | int num_filter = filter.dim_size(0); 581 | const int64 in_depth = GetTensorDim(input, data_format_, 'C'); 582 | const int batch = input.dim_size(0); 583 | const int depth = input.dim_size(1); 584 | int64 out_rows = input.dim_size(2); 585 | int64 out_cols = input.dim_size(3); 586 | 587 | const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); 588 | const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); 589 | const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); 590 | const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); 591 | const int rate_rows = GetTensorDim(rates_, data_format_, 'H'); 592 | const int rate_cols = GetTensorDim(rates_, data_format_, 'W'); 593 | const int input_rows = static_cast(input_rows_raw); 594 | const int filter_rows = static_cast(filter.dim_size(2)); 595 | const int input_cols = static_cast(input_cols_raw); 596 | const int filter_cols = static_cast(filter.dim_size(3)); 597 | int64 pad_rows = 0, pad_cols = 0; 598 | OP_REQUIRES_OK(context, 599 | 
GetWindowedOutputSize(input_rows, filter_rows, stride_rows, 600 | padding_, &out_rows, &pad_rows)); 601 | OP_REQUIRES_OK(context, 602 | GetWindowedOutputSize(input_cols, filter_cols, stride_cols, 603 | padding_, &out_cols, &pad_cols)); 604 | TShape pad({static_cast(pad_rows), static_cast(pad_cols)}); 605 | TShape stride({stride_rows, stride_cols}); 606 | TShape kernels({filter_rows, filter_cols}); 607 | TShape rates({rate_rows, rate_cols}); 608 | auto temp = DeformPoolParam(kernels, stride, pad, rates, num_groups, num_filter, pool_size, true); 609 | param_ = &temp; 610 | LayerSetUp(input_shape, offset_shape, out_backprop_shape); 611 | int M = kernel_dim_; 612 | int N = conv_out_spatial_dim_; 613 | int K = conv_out_channels_ / group_; 614 | 615 | Tensor* in_backprop = nullptr; 616 | OP_REQUIRES_OK(context, 617 | context->allocate_output(0, input.shape(), &in_backprop)); 618 | auto in_backprop_ptr = in_backprop->template flat().data(); 619 | Tensor* filter_backprop = nullptr; 620 | OP_REQUIRES_OK(context, 621 | context->allocate_output(1, filter.shape(), &filter_backprop)); 622 | auto filter_backprop_ptr = filter_backprop->template flat().data(); 623 | Tensor temp_filter_backprop; 624 | Tensor* offset_backprop = nullptr; 625 | OP_REQUIRES_OK(context, 626 | context->allocate_output(2, offset.shape(), &offset_backprop)); 627 | auto offset_backprop_ptr = offset_backprop->template flat().data(); 628 | OP_REQUIRES_OK(context, 629 | context->allocate_temp(DataTypeToEnum::value, filter.shape(), &temp_filter_backprop)); 630 | auto temp_filter_backprop_ptr = temp_filter_backprop.template flat().data(); 631 | TensorShape col_buffer_shape({conv_in_channels_*filter_rows*filter_cols, out_backprop.dim_size(2), out_backprop.dim_size(3)}); 632 | TensorShape col_buffer_3d_shape({group_, M, N}); 633 | Tensor col_buffer_3d; 634 | OP_REQUIRES_OK(context, 635 | context->allocate_temp(DataTypeToEnum::value, col_buffer_3d_shape, &col_buffer_3d)); 636 | // auto col_temp = col_buffer_3d.template flat(); 637 | // col_temp.device(d) = col_temp.constant(T(0)); 638 | T* col_buffer_ptr = col_buffer_3d.template flat().data(); 639 | auto weight_3d_shape=TensorShape({group_, K, M}); 640 | const T* weight_3d_ptr = filter_ptr; 641 | Tensor out_grad_4d; 642 | TensorShape out_grad_4d_shape = TensorShape({num_, group_, K, N}); 643 | int out_grad_3d_dim = group_ * K * N; 644 | const T* out_grad_4d_ptr = out_backprop_ptr; 645 | // If there is nothing to compute, return. 646 | if (input.shape().num_elements() == 0) { 647 | return; 648 | } 649 | TensorShape out_grad_3d_shape = out_grad_4d_shape; 650 | out_grad_3d_shape.RemoveDim(0); 651 | const Device& d = context->eigen_device(); 652 | functor::setZero()(d, group_*M*N, col_buffer_ptr); 653 | functor::setZero()(d, ProdShape(filter.shape(), 0), temp_filter_backprop_ptr); 654 | functor::setZero()(d, ProdShape(input.shape(), 0), in_backprop_ptr); 655 | functor::setZero()(d, ProdShape(filter.shape(), 0), filter_backprop_ptr); 656 | 657 | // functor::setZero()(d, group_*M*N, col_buffer_ptr); 658 | // 32 120 8 3 7 4 659 | // 32 120 8 3 7 4 660 | // LOG(WARNING) << input_offset_dim_<<' ' << input_dim_<<' '<::Launch(context, weight_3d_shape, out_grad_3d_shape, weight_3d_ptr, 666 | out_grad_4d_ptr+n*out_grad_3d_dim, true, false, col_buffer_ptr); 667 | 668 | 669 | // gradient w.r.t. 
input coordinate data 670 | functor::deformable_col2im_coord()(d, col_buffer_ptr, 671 | input_ptr + n*input_dim_, offset_ptr + n*input_offset_dim_, 672 | ToVector(input_shape), ToVector(col_buffer_shape), 673 | param_->kernel, param_->pad, param_->stride, param_->rates, 1, 674 | offset_backprop_ptr + n*input_offset_dim_); 675 | 676 | // gradient w.r.t. input data 677 | functor::deformable_col2im()(d, col_buffer_ptr, 678 | offset_ptr + n*input_offset_dim_, ToVector(input_shape), ToVector(col_buffer_shape), 679 | param_->kernel, param_->pad, param_->stride, param_->rates, 1, 680 | in_backprop_ptr + n*input_dim_); 681 | 682 | // gradient w.r.t. weight, dWeight should accumulate across the batch and group 683 | functor::im2col()(d, input_ptr + n*input_dim_, ToVector(input_shape), 684 | ToVector(col_buffer_shape), param_->kernel, param_->pad, param_->stride, param_->rates, 685 | col_buffer_ptr); 686 | if (0 == n) { 687 | functor::LaunchBatchMatMul::Launch(context, out_grad_3d_shape, col_buffer_3d_shape, out_grad_4d_ptr+n*out_grad_3d_dim, 688 | col_buffer_ptr, false, true, filter_backprop_ptr); 689 | } 690 | else { 691 | functor::LaunchBatchMatMul::Launch(context, out_grad_3d_shape, col_buffer_3d_shape, out_grad_4d_ptr+n*out_grad_3d_dim, 692 | col_buffer_ptr, false, true, temp_filter_backprop_ptr); 693 | functor::pureAddTo()(d, ProdShape(filter.shape(), 0), filter_backprop_ptr, temp_filter_backprop_ptr); 694 | } 695 | } 696 | // functor::pureSubTo()(d, ProdShape(input_shape, 0), in_backprop_ptr, input_ptr); 697 | } 698 | 699 | 700 | private: 701 | void LayerSetUp(const TensorShape& ishape, const TensorShape& offset_shape, 702 | const TensorShape& oshape) { 703 | channel_axis_ = 1; // hard code channel axis 704 | const int first_spatial_axis = channel_axis_ + 1; 705 | const int num_axes = param_->kernel.size() + 2; 706 | num_spatial_axes_ = num_axes - first_spatial_axis; 707 | is_1x1_ = true; 708 | for (int i = 0; i < param_->kernel.size(); ++i) { 709 | is_1x1_ &= 710 | param_->kernel[i] == 1 && param_->stride[i] == 1 && param_->pad[i] == 0; 711 | if (!is_1x1_) break; 712 | } 713 | 714 | // batch size 715 | num_ = ishape.dim_size(0); 716 | // number of input channels 717 | channels_ = ishape.dim_size(1); 718 | group_ = param_->num_group; 719 | conv_out_channels_ = param_->num_filter; 720 | conv_in_channels_ = channels_; 721 | bias_term_ = !param_->no_bias; 722 | kernel_dim_ = conv_in_channels_ / group_ * param_->kernel[0]*param_->kernel[1]; 723 | weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; 724 | conv_out_spatial_dim_ = ProdShape(oshape, 2); 725 | col_offset_ = kernel_dim_ * conv_out_spatial_dim_; 726 | output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; 727 | // size of the column buffer used for storing im2col-ed pixels 728 | col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; 729 | // input/output image size (#channels * height * width) 730 | input_dim_ = ProdShape(ishape, 1); 731 | input_offset_dim_ = ProdShape(offset_shape, 1); 732 | output_dim_ = ProdShape(oshape, 1); 733 | num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; 734 | num_kernels_col2im_ = input_dim_; 735 | } 736 | 737 | // DeformableConvolutionParam param_; 738 | int channel_axis_; // channel axis of the input 739 | int channels_; // number of channels of input image 740 | int num_spatial_axes_; // number of spatial axes 741 | int num_; // batch size 742 | int group_; // number of groups 743 | int conv_out_channels_; // number of output channels (num_filter) 744 | int 
conv_out_spatial_dim_; // number of pixels of output images per channel 745 | int conv_in_channels_; // number of input channels 746 | int kernel_dim_; // number of input channels per group * kernel size 747 | int weight_offset_; // number of output channels per group * kernel_dim_ 748 | int col_offset_; 749 | int output_offset_; 750 | int col_buffer_size_; 751 | int input_dim_; 752 | int input_offset_dim_; 753 | int output_dim_; 754 | int num_kernels_im2col_; 755 | int num_kernels_col2im_; 756 | int num_groups; 757 | int pool_size; 758 | bool bias_term_; // has bias term? 759 | bool is_1x1_; 760 | 761 | std::vector strides_; 762 | std::vector rates_; 763 | Padding padding_; 764 | TensorFormat data_format_; 765 | DeformConvParam* param_; 766 | 767 | }; 768 | 769 | #if GOOGLE_CUDA 770 | 771 | #define REGISTER(T) \ 772 | REGISTER_KERNEL_BUILDER( \ 773 | Name("DeformConvOp").Device(DEVICE_GPU).TypeConstraint("T"), \ 774 | DeformConvOp); \ 775 | REGISTER_KERNEL_BUILDER( \ 776 | Name("DeformConvBackpropOp").Device(DEVICE_GPU).TypeConstraint("T"), \ 777 | DeformConvBackpropOp); 778 | 779 | // TF_CALL_GPU_NUMBER_TYPES(REGISTER); 780 | TF_CALL_float(REGISTER); 781 | TF_CALL_double(REGISTER); 782 | 783 | #undef REGISTER 784 | 785 | #endif // GOOGLE_CUDA 786 | 787 | } // namespace tensorflow 788 | -------------------------------------------------------------------------------- /lib/deform_conv.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | // See docs in ../ops/nn_ops.cc. 
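// Shape conventions used by DeformConvOp below (only NCHW is supported; the
// concrete numbers in the example are hypothetical):
//   x      : [batch, in_channels, in_h, in_w]
//   filter : [out_channels, in_channels / num_groups, kernel_h, kernel_w]
//   offset : [batch, 2 * kernel_h * kernel_w, out_h, out_w]  (per deformable group)
//   output : [batch, out_channels, out_h, out_w]
// e.g. x = [1, 4, 9, 9], num_groups = 2, filter = [8, 2, 3, 3], stride 1,
// rate 1 and VALID padding give out_h = out_w = 7, so offset would be
// [1, 18, 7, 7] with one deformable group and output [1, 8, 7, 7].
// The offset layout is inferred from the im2col kernels in deform_conv.cu.cc;
// the registration below checks the ranks of x and filter but not the offset.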
17 | 18 | #define EIGEN_USE_THREADS 19 | 20 | #include 21 | #include 22 | 23 | 24 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 25 | // #include "tensorflow/core/common_runtime/device.h" 26 | #include "tensorflow/core/framework/op_kernel.h" 27 | #include "tensorflow/core/framework/op.h" 28 | #include "tensorflow/core/framework/numeric_op.h" 29 | #include "tensorflow/core/framework/register_types.h" 30 | #include "tensorflow/core/framework/tensor.h" 31 | #include "tensorflow/core/framework/tensor_shape.h" 32 | #include "tensorflow/core/framework/tensor_slice.h" 33 | #include "tensorflow/core/framework/common_shape_fns.h" 34 | #include "tensorflow/core/lib/core/errors.h" 35 | #include "tensorflow/core/lib/gtl/array_slice.h" 36 | #include "tensorflow/core/util/padding.h" 37 | #include "tensorflow/core/framework/shape_inference.h" 38 | #include "tensorflow/core/util/tensor_format.h" 39 | #include "tensorflow/core/kernels/bounds_check.h" 40 | 41 | #include "tensorflow/core/platform/stream_executor.h" 42 | #include "deform_conv.h" 43 | #include "deform_conv_util.h" 44 | 45 | 46 | namespace tensorflow { 47 | using shape_inference::DimensionHandle; 48 | using shape_inference::InferenceContext; 49 | using shape_inference::ShapeHandle; 50 | 51 | REGISTER_OP("DeformConvOp").Input("x: T") 52 | .Input("filter: T") 53 | .Input("offset: T") 54 | .Output("output: T") 55 | .Attr("T: {half, float, double}") 56 | .Attr("strides: list(int)") 57 | .Attr("rates: list(int)") 58 | .Attr("num_groups: int") 59 | .Attr(GetPaddingAttrString()) 60 | .Attr("data_format: { 'NHWC', 'NCHW' } = 'NCHW' ") 61 | .SetShapeFn([](InferenceContext* c) { 62 | ShapeHandle input_shape; 63 | TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); 64 | ShapeHandle filter_shape; 65 | TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape)); 66 | 67 | std::vector strides; 68 | TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); 69 | if (strides.size() != 4) { 70 | return errors::InvalidArgument( 71 | "Deformconv requires the stride attribute to contain 4 values, but " 72 | "got: ", 73 | strides.size()); 74 | } 75 | 76 | std::vector rates; 77 | TF_RETURN_IF_ERROR(c->GetAttr("rates", &rates)); 78 | if (rates.size() != 4) { 79 | return errors::InvalidArgument( 80 | "Deformconv requires the rates attribute to contain 4 values, but " 81 | "got: ", 82 | rates.size()); 83 | } 84 | string data_format; 85 | TensorFormat data_format_; 86 | TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format)); 87 | FormatFromString(data_format, &data_format_); 88 | const int32 stride_rows = GetTensorDim(strides, data_format_, 'H'); 89 | const int32 stride_cols = GetTensorDim(strides, data_format_, 'W'); 90 | 91 | const int32 rate_rows = GetTensorDim(rates, data_format_, 'H'); 92 | const int32 rate_cols = GetTensorDim(rates, data_format_, 'W'); 93 | 94 | 95 | int groups; 96 | TF_RETURN_IF_ERROR(c->GetAttr("num_groups", &groups)); 97 | 98 | DimensionHandle batch_size_dim = c->Dim(input_shape, 0); 99 | DimensionHandle in_rows_dim = c->Dim(input_shape, 2); 100 | DimensionHandle in_cols_dim = c->Dim(input_shape, 3); 101 | DimensionHandle filter_rows_dim = c->Dim(filter_shape, 2); 102 | DimensionHandle filter_cols_dim = c->Dim(filter_shape, 3); 103 | DimensionHandle filter_depth_dim = c->Dim(filter_shape, 1); 104 | DimensionHandle output_depth_dim = c->Dim(filter_shape, 0); 105 | DimensionHandle multiplied_depth; 106 | TF_RETURN_IF_ERROR( 107 | c->Multiply(filter_depth_dim, groups, &multiplied_depth)); 108 | 109 | 110 | 111 | if 
(!c->ValueKnown(in_rows_dim) || !c->ValueKnown(in_cols_dim) || 112 | !c->ValueKnown(filter_rows_dim) || !c->ValueKnown(filter_cols_dim)) { 113 | ShapeHandle output_shape = 114 | c->MakeShape({batch_size_dim, output_depth_dim, InferenceContext::kUnknownDim, 115 | InferenceContext::kUnknownDim 116 | }); 117 | c->set_output(0, output_shape); 118 | return Status::OK(); 119 | } 120 | DimensionHandle unused; 121 | TF_RETURN_IF_ERROR( 122 | c->Merge(c->Dim(input_shape, 1), multiplied_depth, &unused)); 123 | 124 | auto in_rows = c->Value(in_rows_dim); 125 | auto in_cols = c->Value(in_cols_dim); 126 | auto filter_rows = c->Value(filter_rows_dim); 127 | auto filter_cols = c->Value(filter_cols_dim); 128 | auto filter_rows_eff = filter_rows + (filter_rows - 1) * (rate_rows - 1); 129 | auto filter_cols_eff = filter_cols + (filter_cols - 1) * (rate_cols - 1); 130 | 131 | Padding padding; 132 | TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); 133 | 134 | int64 output_rows, output_cols; 135 | int64 padding_before, padding_after; 136 | TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( 137 | in_rows, filter_rows_eff, stride_rows, padding, &output_rows, 138 | &padding_before, &padding_after)); 139 | TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( 140 | in_cols, filter_cols_eff, stride_cols, padding, &output_cols, 141 | &padding_before, &padding_after)); 142 | 143 | ShapeHandle output_shape = c->MakeShape( 144 | {batch_size_dim, output_depth_dim, output_rows, output_cols}); 145 | c->set_output(0, output_shape); 146 | return Status::OK(); 147 | }) 148 | .Doc(R"doc( 149 | only support NCHW now 150 | )doc"); 151 | 152 | 153 | REGISTER_OP("DeformConvBackpropOp").Input("x: T") 154 | .Input("filter: T") 155 | .Input("offset: T") 156 | .Input("out_grad: T") 157 | .Output("x_grad: T") 158 | .Output("filter_grad: T") 159 | .Output("offset_grad: T") 160 | .Attr("T: {half, float, double}") 161 | .Attr("strides: list(int)") 162 | .Attr("rates: list(int)") 163 | .Attr("num_groups: int") 164 | .Attr(GetPaddingAttrString()) 165 | .Attr("data_format: { 'NHWC', 'NCHW' } = 'NCHW' ") 166 | .SetShapeFn([](InferenceContext* c) { 167 | c->set_output(0, c->input(0)); 168 | c->set_output(1, c->input(1)); 169 | c->set_output(2, c->input(2)); 170 | return Status::OK(); 171 | }) 172 | .Doc(R"doc( 173 | only support NCHW now 174 | )doc"); 175 | 176 | typedef std::vector TShape; 177 | typedef Eigen::ThreadPoolDevice CPUDevice; 178 | typedef Eigen::GpuDevice GPUDevice; 179 | 180 | namespace { 181 | template 182 | perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) { 183 | perftools::gputools::DeviceMemoryBase wrapped(const_cast(cuda_memory)); 184 | perftools::gputools::DeviceMemory typed(wrapped); 185 | return typed; 186 | } 187 | 188 | class CublasScratchAllocator : public perftools::gputools::ScratchAllocator { 189 | public: 190 | using Stream = ::perftools::gputools::Stream; 191 | using DeviceMemoryBytes = ::perftools::gputools::DeviceMemory; 192 | 193 | CublasScratchAllocator(OpKernelContext* context) : context_(context) {} 194 | 195 | int64 GetMemoryLimitInBytes(Stream* stream) override { return -1; } 196 | 197 | perftools::gputools::port::StatusOr AllocateBytes( 198 | Stream* stream, int64 byte_size) override { 199 | Tensor temporary_memory; 200 | 201 | Status allocation_status(context_->allocate_temp( 202 | DT_UINT8, TensorShape({byte_size}), &temporary_memory)); 203 | if (!allocation_status.ok()) { 204 | return perftools::gputools::port::StatusOr( 205 | DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); 206 | 
} 207 | // Hold the reference of the allocated tensors until the end of the 208 | // allocator. 209 | allocated_tensors_.push_back(temporary_memory); 210 | return perftools::gputools::port::StatusOr( 211 | DeviceMemoryBytes::MakeFromByteSize( 212 | temporary_memory.flat().data(), 213 | temporary_memory.flat().size())); 214 | } 215 | 216 | private: 217 | OpKernelContext* context_; 218 | std::vector allocated_tensors_; 219 | }; 220 | } // namespace 221 | 222 | 223 | namespace functor{ 224 | // LaunchBatchMatMul from batch_matmul_impl.h, modifies so now only support 2d case 225 | template 226 | struct LaunchBatchMatMul { 227 | // static void Launch(OpKernelContext* context, const Tensor& in_x, 228 | // const Tensor& in_y, bool adj_x, bool adj_y, Scalar* out) { 229 | static void Launch(OpKernelContext* context, const TensorShape& in_x_shape, const TensorShape& in_y_shape, const Scalar* in_x_ptr, 230 | const Scalar* in_y_ptr, bool adj_x, bool adj_y, Scalar* out) { 231 | constexpr perftools::gputools::blas::Transpose kTranspose = 232 | is_complex::value 233 | ? perftools::gputools::blas::Transpose::kConjugateTranspose 234 | : perftools::gputools::blas::Transpose::kTranspose; 235 | perftools::gputools::blas::Transpose trans[] = { 236 | perftools::gputools::blas::Transpose::kNoTranspose, kTranspose}; 237 | const uint64 m = in_x_shape.dim_size(adj_x ? 2 : 1); 238 | const uint64 k = in_x_shape.dim_size(adj_x ? 1 : 2); 239 | const uint64 n = in_y_shape.dim_size(adj_y ? 1 : 2); 240 | const uint64 batch_size = in_x_shape.dim_size(0); 241 | auto blas_transpose_a = trans[adj_x]; 242 | auto blas_transpose_b = trans[adj_y]; 243 | 244 | auto* stream = context->op_device_context()->stream(); 245 | OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); 246 | 247 | typedef perftools::gputools::DeviceMemory DeviceMemoryType; 248 | std::vector a_device_memory; 249 | std::vector b_device_memory; 250 | std::vector c_device_memory; 251 | std::vector a_ptrs; 252 | std::vector b_ptrs; 253 | std::vector c_ptrs; 254 | a_device_memory.reserve(batch_size); 255 | b_device_memory.reserve(batch_size); 256 | c_device_memory.reserve(batch_size); 257 | a_ptrs.reserve(batch_size); 258 | b_ptrs.reserve(batch_size); 259 | c_ptrs.reserve(batch_size); 260 | auto* a_base_ptr = in_x_ptr; 261 | auto* b_base_ptr = in_y_ptr; 262 | // auto* c_base_ptr = out->template flat().data(); 263 | auto* c_base_ptr = out; 264 | for (int64 i = 0; i ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m, 299 | static_cast(1.0), *(a_ptrs[0]), 300 | adj_x ? m : k, *(b_ptrs[0]), 1, 301 | static_cast(0.0), c_ptrs[0], 1) 302 | .ok(); 303 | if (!blas_launch_status) { 304 | context->SetStatus(errors::Internal( 305 | "Blas xGEMV launch failed : a.shape=", in_x_shape.DebugString(), 306 | ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, 307 | ", k=", k)); 308 | } 309 | } else { 310 | bool blas_launch_status = 311 | stream 312 | ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 313 | static_cast(1.0), *(b_ptrs[0]), 314 | adj_y ? k : n, *(a_ptrs[0]), adj_x ? 
m : k, 315 | static_cast(0.0), c_ptrs[0], n) 316 | .ok(); 317 | if (!blas_launch_status) { 318 | context->SetStatus(errors::Internal( 319 | "Blas xGEMM launch failed : a.shape=", in_x_shape.DebugString(), 320 | ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, 321 | ", k=", k)); 322 | } 323 | } 324 | } else { 325 | CublasScratchAllocator scratch_allocator(context); 326 | bool blas_launch_status = 327 | stream 328 | ->ThenBlasGemmBatchedWithScratch( 329 | blas_transpose_b, blas_transpose_a, n, m, k, 330 | static_cast(1.0), b_ptrs, adj_y ? k : n, a_ptrs, 331 | adj_x ? m : k, static_cast(0.0), c_ptrs, n, 332 | batch_size, &scratch_allocator) 333 | .ok(); 334 | if (!blas_launch_status) { 335 | context->SetStatus(errors::Internal( 336 | "Blas xGEMMBatched launch failed : a.shape=", 337 | in_x_shape.DebugString(), 338 | ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, 339 | ", k=", k, ", batch_size=", batch_size)); 340 | } 341 | } 342 | } 343 | }; 344 | } 345 | 346 | template 347 | class DeformConvOp : public OpKernel { 348 | public: 349 | explicit DeformConvOp(OpKernelConstruction* context) : OpKernel(context) { 350 | OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); 351 | OP_REQUIRES_OK(context, context->GetAttr("rates", &rates_)); 352 | string data_format; 353 | OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 354 | // TensorFormat data_format_; 355 | OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 356 | errors::InvalidArgument("Invalid data format")); 357 | OP_REQUIRES(context, strides_.size() == 4, 358 | errors::InvalidArgument("Sliding window strides field must " 359 | "specify 4 dimensions")); 360 | const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); 361 | const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); 362 | OP_REQUIRES( 363 | context, stride_n == 1 && stride_c == 1, 364 | errors::InvalidArgument("Current implementation does not yet support " 365 | "strides in the batch and depth dimensions.")); 366 | OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 367 | const int64 stride_H = GetTensorDim(strides_, data_format_, 'H'); 368 | const int64 stride_W = GetTensorDim(strides_, data_format_, 'W'); 369 | OP_REQUIRES_OK(context, context->GetAttr("num_groups", &num_groups)); 370 | 371 | } 372 | 373 | void Compute(OpKernelContext* context) override { 374 | CHECK(data_format_ == FORMAT_NCHW) << "Generic conv implementation only " 375 | "supports NCHW tensor format for now."; 376 | // Input tensor is of the following dimensions: 377 | // [ batch, in_rows, in_cols, in_depth ] 378 | 379 | const Tensor& input = context->input(0); 380 | const TensorShape& ishape = input.shape(); 381 | // Input filter is of the following dimensions: 382 | // [ out_depth, in_depth, filter_rows, filter_cols] 383 | const Tensor& filter = context->input(1); 384 | 385 | const Tensor& offset = context->input(2); 386 | const TensorShape& offset_shape = offset.shape(); 387 | int num_filter = filter.dim_size(0); 388 | // param_->num_filter = depth_out; 389 | // For 2D convolution, there should be 4 dimensions. 
390 | OP_REQUIRES(context, input.dims() == 4, 391 | errors::InvalidArgument("input must be 4-dimensional", 392 | input.shape().DebugString())); 393 | OP_REQUIRES(context, filter.dims() == 4, 394 | errors::InvalidArgument("filter must be 4-dimensional: ", 395 | filter.shape().DebugString())); 396 | OP_REQUIRES(context, offset.dims() == 4, 397 | errors::InvalidArgument("offset must be 4-dimensional: ", 398 | offset.shape().DebugString())); 399 | for (int i = 0; i < 3; i++) { 400 | OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), 401 | std::numeric_limits<int>::max()), 402 | errors::InvalidArgument("filter too large")); 403 | } 404 | 405 | // The last dimension for input is in_depth. It must be the same as the 406 | // filter's in_depth. 407 | const int64 in_depth = GetTensorDim(input, data_format_, 'C'); 408 | OP_REQUIRES( 409 | context, in_depth == filter.dim_size(1)* num_groups, 410 | errors::InvalidArgument("input and filter must have the same depth: ", 411 | in_depth, " vs ", filter.dim_size(1))); 412 | OP_REQUIRES( 413 | context, offset_shape.dim_size(1) % (filter.dim_size(2) * filter.dim_size(3)) == 0, 414 | errors::InvalidArgument("offset channels must divide deformable group size: ", 415 | offset_shape.dim_size(1), " vs ", filter.dim_size(2) * filter.dim_size(3))); 416 | OP_REQUIRES( 417 | context, num_filter % num_groups == 0, 418 | errors::InvalidArgument("num_filter must divide deformable group size: ", 419 | filter.dim_size(0), " vs ", num_groups)); 420 | 421 | // The second dimension for input is rows/height. 422 | // The first dimension for filter is rows/height. 423 | const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); 424 | OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, 425 | std::numeric_limits<int>::max()), 426 | errors::InvalidArgument("Input rows too large")); 427 | const int input_rows = static_cast<int>(input_rows_raw); 428 | const int filter_rows = static_cast<int>(filter.dim_size(2)); 429 | 430 | // The third dimension for input is columns/width. 431 | // The second dimension for filter is columns/width. 432 | const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); 433 | OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, 434 | std::numeric_limits<int>::max()), 435 | errors::InvalidArgument("Input cols too large")); 436 | const int input_cols = static_cast<int>(input_cols_raw); 437 | const int filter_cols = static_cast<int>(filter.dim_size(3)); 438 | 439 | // The first dimension for input is batch. 440 | const int64 batch_raw = GetTensorDim(input, data_format_, 'N'); 441 | OP_REQUIRES(context, 442 | FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()), 443 | errors::InvalidArgument("batch is too large")); 444 | const int batch = static_cast<int>(batch_raw); 445 | 446 | // For now we take the stride from the second and third dimensions only (we 447 | // do not support striding on the batch or depth dimension).
448 | const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); 449 | const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); 450 | const int rate_rows = GetTensorDim(rates_, data_format_, 'H'); 451 | const int rate_cols = GetTensorDim(rates_, data_format_, 'W'); 452 | 453 | int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; 454 | OP_REQUIRES_OK(context, 455 | GetWindowedOutputSize(input_rows, filter_rows, stride_rows, 456 | padding_, &out_rows, &pad_rows)); 457 | OP_REQUIRES_OK(context, 458 | GetWindowedOutputSize(input_cols, filter_cols, stride_cols, 459 | padding_, &out_cols, &pad_cols)); 460 | TShape pad({static_cast(pad_rows), static_cast(pad_cols)}); 461 | TShape stride({stride_rows, stride_cols}); 462 | TShape kernels({filter_rows, filter_cols}); 463 | TShape rates({rate_rows, rate_cols}); 464 | TensorShape out_shape = ShapeFromFormat(data_format_, batch, out_rows, out_cols, num_filter); 465 | auto temp = DeformConvParam(kernels, stride, pad, rates, num_groups, num_filter, true); 466 | this->param_ = &temp; 467 | // LOG(INFO)<<"rates "<<(this->param_->rates)[0]<<" "<<(this->param_->rates)[1]; 468 | LayerSetUp(ishape, offset_shape, out_shape); 469 | 470 | int M = conv_out_channels_ / group_; 471 | int N = conv_out_spatial_dim_; 472 | int K = kernel_dim_; 473 | Tensor weight_3d; 474 | OP_REQUIRES(context, 475 | weight_3d.CopyFrom(filter, TensorShape({group_, M, K})), errors::InvalidArgument("shape doesn't match")); 476 | const T* weight_3d_ptr = weight_3d.template flat().data(); 477 | Tensor* output_4d = nullptr; 478 | OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output_4d)); 479 | T* output_4d_ptr = output_4d->template flat().data(); 480 | // this two shape size are equal 481 | auto col_buf_3d_shape = TensorShape({group_, K, N}); 482 | auto col_buf_shape = TensorShape({conv_in_channels_*param_->kernel[0]*param_->kernel[1], out_rows, out_cols}); 483 | Tensor col_buffer_3d; 484 | OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, col_buf_3d_shape, &col_buffer_3d)); 485 | auto in_data_ptr = input.template flat().data(); 486 | auto offset_ptr = offset.template flat().data(); 487 | auto col_buffer_3d_ptr = col_buffer_3d.template flat().data(); 488 | const Device& d = context->eigen_device(); 489 | 490 | for (int n = 0; n ()(d, in_data_ptr + n*input_dim_, 493 | offset_ptr + n*input_offset_dim_, ToVector(ishape), 494 | ToVector(col_buf_shape), (this->param_->kernel), (this->param_->pad), (this->param_->stride), (this->param_->rates), 1, 495 | col_buffer_3d_ptr); 496 | // Tensor output_3d = output_4d->Slice(n, n+1); 497 | T* output_3d_ptr = output_4d_ptr + n * output_dim_; 498 | functor::LaunchBatchMatMul::Launch(context, weight_3d.shape(), col_buffer_3d.shape(), weight_3d_ptr, col_buffer_3d_ptr, false, false, output_3d_ptr); 499 | } 500 | 501 | 502 | 503 | VLOG(2) << "Conv2D: in_depth = " << in_depth 504 | << ", input_cols = " << input_cols 505 | << ", filter_cols = " << filter_cols 506 | << ", input_rows = " << input_rows 507 | << ", filter_rows = " << filter_rows 508 | << ", stride_rows = " << stride_rows 509 | << ", stride_cols = " << stride_cols 510 | << ", out_depth = " << num_filter; 511 | 512 | // If there is nothing to compute, return. 
513 | if (out_shape.num_elements() == 0) 514 | return; 515 | 516 | } 517 | 518 | private: 519 | void LayerSetUp(const TensorShape& ishape, const TensorShape& offset_shape, 520 | const TensorShape& oshape) { 521 | channel_axis_ = 1; // hard code channel axis 522 | const int first_spatial_axis = channel_axis_ + 1; 523 | const int num_axes = param_->kernel.size() + 2; 524 | num_spatial_axes_ = num_axes - first_spatial_axis; 525 | is_1x1_ = true; 526 | for (int i = 0; i < param_->kernel.size(); ++i) { 527 | is_1x1_ &= 528 | param_->kernel[i] == 1 && param_->stride[i] == 1 && param_->pad[i] == 0; 529 | if (!is_1x1_) break; 530 | } 531 | 532 | // batch size 533 | num_ = ishape.dim_size(0); 534 | // number of input channels 535 | channels_ = ishape.dim_size(1); 536 | group_ = param_->num_group; 537 | conv_out_channels_ = param_->num_filter; 538 | conv_in_channels_ = channels_; 539 | bias_term_ = !param_->no_bias; 540 | kernel_dim_ = conv_in_channels_ / group_ * param_->kernel[0]*param_->kernel[1]; 541 | weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; 542 | conv_out_spatial_dim_ = ProdShape(oshape, 2); 543 | col_offset_ = kernel_dim_ * conv_out_spatial_dim_; 544 | output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; 545 | // size of the column buffer used for storing im2col-ed pixels 546 | col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; 547 | // input/output image size (#channels * height * width) 548 | input_dim_ = ProdShape(ishape, 1); 549 | input_offset_dim_ = ProdShape(offset_shape, 1); 550 | output_dim_ = ProdShape(oshape, 1); 551 | num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; 552 | num_kernels_col2im_ = input_dim_; 553 | } 554 | 555 | // DeformableConvolutionParam param_; 556 | int channel_axis_; // channel axis of the input 557 | int channels_; // number of channels of input image 558 | int num_spatial_axes_; // number of spatial axes 559 | int num_; // batch size 560 | int group_; // number of groups 561 | int conv_out_channels_; // number of output channels (num_filter) 562 | int conv_out_spatial_dim_; // number of pixels of output images per channel 563 | int conv_in_channels_; // number of input channels 564 | int kernel_dim_; // number of input channels per group * kernel size 565 | int weight_offset_; // number of output channels per group * kernel_dim_ 566 | int col_offset_; 567 | int output_offset_; 568 | int col_buffer_size_; 569 | int input_dim_; 570 | int input_offset_dim_; 571 | int output_dim_; 572 | int num_kernels_im2col_; 573 | int num_kernels_col2im_; 574 | int num_groups; 575 | bool bias_term_; // has bias term? 
576 | bool is_1x1_; 577 | 578 | std::vector<int32> strides_; 579 | std::vector<int32> rates_; 580 | Padding padding_; 581 | TensorFormat data_format_; 582 | DeformConvParam* param_; 583 | }; 584 | 585 | 586 | 587 | template <typename Device, typename T> 588 | class DeformConvBackpropOp : public OpKernel { 589 | public: 590 | explicit DeformConvBackpropOp(OpKernelConstruction* context) 591 | : OpKernel(context) { 592 | OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); 593 | OP_REQUIRES_OK(context, context->GetAttr("rates", &rates_)); 594 | string data_format; 595 | OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); 596 | // TensorFormat data_format_; 597 | OP_REQUIRES(context, FormatFromString(data_format, &data_format_), 598 | errors::InvalidArgument("Invalid data format")); 599 | OP_REQUIRES(context, strides_.size() == 4, 600 | errors::InvalidArgument("Sliding window strides field must " 601 | "specify 4 dimensions")); 602 | const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); 603 | const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); 604 | OP_REQUIRES( 605 | context, stride_n == 1 && stride_c == 1, 606 | errors::InvalidArgument("Current implementation does not yet support " 607 | "strides in the batch and depth dimensions.")); 608 | OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); 609 | const int64 stride_H = GetTensorDim(strides_, data_format_, 'H'); 610 | const int64 stride_W = GetTensorDim(strides_, data_format_, 'W'); 611 | OP_REQUIRES_OK(context, context->GetAttr("num_groups", &num_groups)); 612 | } 613 | 614 | void Compute(OpKernelContext* context) override { 615 | const Tensor& input = context->input(0); 616 | const Tensor& filter = context->input(1); 617 | const Tensor& offset = context->input(2); 618 | const Tensor& out_backprop = context->input(3); 619 | const T* input_ptr = input.template flat<T>().data(); 620 | const T* filter_ptr = filter.template flat<T>().data(); 621 | const T* offset_ptr = offset.template flat<T>().data(); 622 | const T* out_backprop_ptr = out_backprop.template flat<T>().data(); 623 | const TensorShape& input_shape = input.shape(); 624 | const TensorShape& filter_shape = filter.shape(); 625 | const TensorShape& offset_shape = offset.shape(); 626 | const TensorShape& out_backprop_shape = out_backprop.shape(); 627 | 628 | // const Tensor& filter_backprop = context->input(4); 629 | // const Tensor& offset_backprop = context->input(5); 630 | int num_filter = filter.dim_size(0); 631 | const int64 in_depth = GetTensorDim(input, data_format_, 'C'); 632 | const int batch = input.dim_size(0); 633 | const int depth = input.dim_size(1); 634 | int64 out_rows = input.dim_size(2); 635 | int64 out_cols = input.dim_size(3); 636 | 637 | const int64 input_rows_raw = GetTensorDim(input, data_format_, 'H'); 638 | const int64 input_cols_raw = GetTensorDim(input, data_format_, 'W'); 639 | const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); 640 | const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); 641 | const int rate_rows = GetTensorDim(rates_, data_format_, 'H'); 642 | const int rate_cols = GetTensorDim(rates_, data_format_, 'W'); 643 | const int input_rows = static_cast<int>(input_rows_raw); 644 | const int filter_rows = static_cast<int>(filter.dim_size(2)); 645 | const int input_cols = static_cast<int>(input_cols_raw); 646 | const int filter_cols = static_cast<int>(filter.dim_size(3)); 647 | int64 pad_rows = 0, pad_cols = 0; 648 | OP_REQUIRES_OK(context, 649 | GetWindowedOutputSize(input_rows, filter_rows, stride_rows, 650 | padding_,
&out_rows, &pad_rows)); 651 | OP_REQUIRES_OK(context, 652 | GetWindowedOutputSize(input_cols, filter_cols, stride_cols, 653 | padding_, &out_cols, &pad_cols)); 654 | TShape pad({static_cast(pad_rows), static_cast(pad_cols)}); 655 | TShape stride({stride_rows, stride_cols}); 656 | TShape kernels({filter_rows, filter_cols}); 657 | TShape rates({rate_rows, rate_cols}); 658 | auto temp = DeformConvParam(kernels, stride, pad, rates, num_groups, num_filter, true); 659 | param_ = &temp; 660 | LayerSetUp(input_shape, offset_shape, out_backprop_shape); 661 | int M = kernel_dim_; 662 | int N = conv_out_spatial_dim_; 663 | int K = conv_out_channels_ / group_; 664 | 665 | Tensor* in_backprop = nullptr; 666 | OP_REQUIRES_OK(context, 667 | context->allocate_output(0, input.shape(), &in_backprop)); 668 | auto in_backprop_ptr = in_backprop->template flat().data(); 669 | Tensor* filter_backprop = nullptr; 670 | OP_REQUIRES_OK(context, 671 | context->allocate_output(1, filter.shape(), &filter_backprop)); 672 | auto filter_backprop_ptr = filter_backprop->template flat().data(); 673 | Tensor temp_filter_backprop; 674 | Tensor* offset_backprop = nullptr; 675 | OP_REQUIRES_OK(context, 676 | context->allocate_output(2, offset.shape(), &offset_backprop)); 677 | auto offset_backprop_ptr = offset_backprop->template flat().data(); 678 | OP_REQUIRES_OK(context, 679 | context->allocate_temp(DataTypeToEnum::value, filter.shape(), &temp_filter_backprop)); 680 | auto temp_filter_backprop_ptr = temp_filter_backprop.template flat().data(); 681 | TensorShape col_buffer_shape({conv_in_channels_*filter_rows*filter_cols, out_backprop.dim_size(2), out_backprop.dim_size(3)}); 682 | TensorShape col_buffer_3d_shape({group_, M, N}); 683 | Tensor col_buffer_3d; 684 | OP_REQUIRES_OK(context, 685 | context->allocate_temp(DataTypeToEnum::value, col_buffer_3d_shape, &col_buffer_3d)); 686 | // auto col_temp = col_buffer_3d.template flat(); 687 | // col_temp.device(d) = col_temp.constant(T(0)); 688 | T* col_buffer_ptr = col_buffer_3d.template flat().data(); 689 | auto weight_3d_shape=TensorShape({group_, K, M}); 690 | const T* weight_3d_ptr = filter_ptr; 691 | Tensor out_grad_4d; 692 | TensorShape out_grad_4d_shape = TensorShape({num_, group_, K, N}); 693 | int out_grad_3d_dim = group_ * K * N; 694 | const T* out_grad_4d_ptr = out_backprop_ptr; 695 | // If there is nothing to compute, return. 696 | if (input.shape().num_elements() == 0) { 697 | return; 698 | } 699 | TensorShape out_grad_3d_shape = out_grad_4d_shape; 700 | out_grad_3d_shape.RemoveDim(0); 701 | const Device& d = context->eigen_device(); 702 | functor::setZero()(d, group_*M*N, col_buffer_ptr); 703 | functor::setZero()(d, ProdShape(filter.shape(), 0), temp_filter_backprop_ptr); 704 | functor::setZero()(d, ProdShape(input.shape(), 0), in_backprop_ptr); 705 | functor::setZero()(d, ProdShape(filter.shape(), 0), filter_backprop_ptr); 706 | 707 | // functor::setZero()(d, group_*M*N, col_buffer_ptr); 708 | // 32 120 8 3 7 4 709 | // 32 120 8 3 7 4 710 | // LOG(WARNING) << input_offset_dim_<<' ' << input_dim_<<' '<::Launch(context, weight_3d_shape, out_grad_3d_shape, weight_3d_ptr, 716 | out_grad_4d_ptr+n*out_grad_3d_dim, true, false, col_buffer_ptr); 717 | 718 | 719 | // gradient w.r.t. 
input coordinate data 720 | functor::deformable_col2im_coord()(d, col_buffer_ptr, 721 | input_ptr + n*input_dim_, offset_ptr + n*input_offset_dim_, 722 | ToVector(input_shape), ToVector(col_buffer_shape), 723 | param_->kernel, param_->pad, param_->stride, param_->rates, 1, 724 | offset_backprop_ptr + n*input_offset_dim_); 725 | 726 | // gradient w.r.t. input data 727 | functor::deformable_col2im()(d, col_buffer_ptr, 728 | offset_ptr + n*input_offset_dim_, ToVector(input_shape), ToVector(col_buffer_shape), 729 | param_->kernel, param_->pad, param_->stride, param_->rates, 1, 730 | in_backprop_ptr + n*input_dim_); 731 | 732 | // gradient w.r.t. weight, dWeight should accumulate across the batch and group 733 | functor::im2col()(d, input_ptr + n*input_dim_, ToVector(input_shape), 734 | ToVector(col_buffer_shape), param_->kernel, param_->pad, param_->stride, param_->rates, 735 | col_buffer_ptr); 736 | if (0 == n) { 737 | functor::LaunchBatchMatMul::Launch(context, out_grad_3d_shape, col_buffer_3d_shape, out_grad_4d_ptr+n*out_grad_3d_dim, 738 | col_buffer_ptr, false, true, filter_backprop_ptr); 739 | } 740 | else { 741 | functor::LaunchBatchMatMul::Launch(context, out_grad_3d_shape, col_buffer_3d_shape, out_grad_4d_ptr+n*out_grad_3d_dim, 742 | col_buffer_ptr, false, true, temp_filter_backprop_ptr); 743 | functor::pureAddTo()(d, ProdShape(filter.shape(), 0), filter_backprop_ptr, temp_filter_backprop_ptr); 744 | } 745 | } 746 | // functor::pureSubTo()(d, ProdShape(input_shape, 0), in_backprop_ptr, input_ptr); 747 | } 748 | 749 | 750 | private: 751 | void LayerSetUp(const TensorShape& ishape, const TensorShape& offset_shape, 752 | const TensorShape& oshape) { 753 | channel_axis_ = 1; // hard code channel axis 754 | const int first_spatial_axis = channel_axis_ + 1; 755 | const int num_axes = param_->kernel.size() + 2; 756 | num_spatial_axes_ = num_axes - first_spatial_axis; 757 | is_1x1_ = true; 758 | for (int i = 0; i < param_->kernel.size(); ++i) { 759 | is_1x1_ &= 760 | param_->kernel[i] == 1 && param_->stride[i] == 1 && param_->pad[i] == 0; 761 | if (!is_1x1_) break; 762 | } 763 | 764 | // batch size 765 | num_ = ishape.dim_size(0); 766 | // number of input channels 767 | channels_ = ishape.dim_size(1); 768 | group_ = param_->num_group; 769 | conv_out_channels_ = param_->num_filter; 770 | conv_in_channels_ = channels_; 771 | bias_term_ = !param_->no_bias; 772 | kernel_dim_ = conv_in_channels_ / group_ * param_->kernel[0]*param_->kernel[1]; 773 | weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; 774 | conv_out_spatial_dim_ = ProdShape(oshape, 2); 775 | col_offset_ = kernel_dim_ * conv_out_spatial_dim_; 776 | output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; 777 | // size of the column buffer used for storing im2col-ed pixels 778 | col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_; 779 | // input/output image size (#channels * height * width) 780 | input_dim_ = ProdShape(ishape, 1); 781 | input_offset_dim_ = ProdShape(offset_shape, 1); 782 | output_dim_ = ProdShape(oshape, 1); 783 | num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; 784 | num_kernels_col2im_ = input_dim_; 785 | } 786 | 787 | // DeformableConvolutionParam param_; 788 | int channel_axis_; // channel axis of the input 789 | int channels_; // number of channels of input image 790 | int num_spatial_axes_; // number of spatial axes 791 | int num_; // batch size 792 | int group_; // number of groups 793 | int conv_out_channels_; // number of output channels (num_filter) 794 | int 
conv_out_spatial_dim_; // number of pixels of output images per channel 795 | int conv_in_channels_; // number of input channels 796 | int kernel_dim_; // number of input channels per group * kernel size 797 | int weight_offset_; // number of output channels per group * kernel_dim_ 798 | int col_offset_; 799 | int output_offset_; 800 | int col_buffer_size_; 801 | int input_dim_; 802 | int input_offset_dim_; 803 | int output_dim_; 804 | int num_kernels_im2col_; 805 | int num_kernels_col2im_; 806 | int num_groups; 807 | bool bias_term_; // has bias term? 808 | bool is_1x1_; 809 | 810 | std::vector<int32> strides_; 811 | std::vector<int32> rates_; 812 | Padding padding_; 813 | TensorFormat data_format_; 814 | DeformConvParam* param_; 815 | 816 | }; 817 | 818 | #if GOOGLE_CUDA 819 | 820 | #define REGISTER(T) \ 821 | REGISTER_KERNEL_BUILDER( \ 822 | Name("DeformConvOp").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ 823 | DeformConvOp<GPUDevice, T>); \ 824 | REGISTER_KERNEL_BUILDER( \ 825 | Name("DeformConvBackpropOp").Device(DEVICE_GPU).TypeConstraint<T>("T"), \ 826 | DeformConvBackpropOp<GPUDevice, T>); 827 | 828 | // TF_CALL_GPU_NUMBER_TYPES(REGISTER); 829 | TF_CALL_float(REGISTER); 830 | TF_CALL_double(REGISTER); 831 | 832 | #undef REGISTER 833 | 834 | #endif // GOOGLE_CUDA 835 | 836 | } // namespace tensorflow 837 | --------------------------------------------------------------------------------
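Note on the output-shape arithmetic in deform_conv.cc: both the shape functions and the Compute() methods derive the output spatial size from the input size, an effective kernel extent dilated by the `rates` attribute, the stride, and the padding mode, via TensorFlow's GetWindowedOutputSize / GetWindowedOutputSizeVerbose helpers. The standalone Python sketch below reproduces that arithmetic for reference; the function name is illustrative only and is not part of this repository.

def windowed_output_size(in_size, filter_size, rate, stride, padding):
    # Effective kernel extent after dilation, as in the shape function:
    # filter_eff = filter + (filter - 1) * (rate - 1)
    filter_eff = filter_size + (filter_size - 1) * (rate - 1)
    if padding == "VALID":
        # ceil((in - filter_eff + 1) / stride)
        return (in_size - filter_eff + stride) // stride
    if padding == "SAME":
        # ceil(in / stride)
        return (in_size + stride - 1) // stride
    raise ValueError("padding must be 'VALID' or 'SAME'")

# Example: a 224x224 NCHW feature map with a 3x3 kernel, rate 2, stride 2
# has an effective kernel extent of 5, giving 110x110 (VALID) or 112x112 (SAME).
print(windowed_output_size(224, 3, 2, 2, "VALID"))  # 110
print(windowed_output_size(224, 3, 2, 2, "SAME"))   # 112
--------------------------------------------------------------------------------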