├── .gitignore
├── utils.h
├── utils.c
├── init.lua
├── init.cu
├── ptnbhwd-scm-1.rockspec
├── init.c
├── LICENSE
├── README.md
├── CMakeLists.txt
├── PerspectiveGridGenerator.lua
├── BilinearSamplerPerspective.lua
├── generic
    └── BilinearSamplerPerspective.c
└── BilinearSamplerPerspective.cu


/.gitignore:
--------------------------------------------------------------------------------
1 | # Folders
2 | build/
3 | 


--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUNN_UTILS_H
 2 | #define CUNN_UTILS_H
 3 | 
 4 | #include <lua.h>
 5 | #include "THCGeneral.h"
 6 | 
 7 | THCState* getCutorchState(lua_State* L);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/utils.c:
--------------------------------------------------------------------------------
 1 | #include "utils.h"
 2 | 
 3 | /* Returns the userdata yielded by calling the Lua function cutorch.getState(). */
 4 | THCState* getCutorchState(lua_State* L) {
 5 |     lua_getglobal(L, "cutorch");       /* stack: cutorch */
 6 |     lua_getfield(L, -1, "getState");   /* stack: cutorch, cutorch.getState */
 7 |     lua_call(L, 0, 1);                 /* stack: cutorch, call result */
 8 |     THCState *cutorchState = (THCState*) lua_touserdata(L, -1);
 9 |     lua_settop(L, -3);                 /* drops both entries, same as lua_pop(L, 2) */
10 |     return cutorchState;
11 | }
12 | 


--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
 1 | -- Package entry point: loads nn, the native ptn libraries, then the Lua modules.
 2 | require('nn')
 3 | -- cutorch is optional; pcall yields false instead of raising when it is missing
 4 | local hasCutorch = pcall(require, 'cutorch')
 5 | require('libptn')
 6 | if hasCutorch then require('libcuptn') end
 7 | 
 8 | -- modules that are currently not loaded:
 9 | --require('ptn.AffineTransformMatrixGenerator')
10 | --require('ptn.AffineGridGeneratorBHWD')
11 | --require('ptn.BilinearSamplerBHWD')
12 | --require('ptn.test')
13 | 
14 | for _, moduleName in ipairs({'ptn.PerspectiveGridGenerator',
15 |                              'ptn.BilinearSamplerPerspective'}) do
16 |    require(moduleName)
17 | end
18 | return nn
19 | 


--------------------------------------------------------------------------------
/init.cu:
--------------------------------------------------------------------------------
 1 | #include "luaT.h"
 2 | #include "THC.h"
 3 | 
 4 | #include "utils.c"
 5 | 
 6 | //#include "BilinearSamplerBHWD.cu"
 7 | #include "BilinearSamplerPerspective.cu"
 8 | 
 9 | LUA_EXTERNC DLL_EXPORT int luaopen_libcuptn(lua_State *L);
10 | 
11 | int luaopen_libcuptn(lua_State *L)  // Lua loader for 'libcuptn' (init.lua requires it when cutorch loads)
12 | {
13 |   lua_newtable(L);  // push a fresh table onto the Lua stack
14 |   //cunn_BilinearSamplerBHWD_init(L);
15 | 
16 |   //lua_newtable(L);
17 |   cunn_BilinearSamplerPerspective_init(L);  // presumably defined in the included BilinearSamplerPerspective.cu -- confirm
18 |   return 1;  // one Lua return value, taken from the top of the stack
19 | }
20 | 
21 | 


--------------------------------------------------------------------------------
/ptnbhwd-scm-1.rockspec:
--------------------------------------------------------------------------------
 1 | package = "ptnbhwd"
 2 | version = "scm-1"
 3 | 
 4 | 
 5 | source = {
 6 |   url = "https://github.com/xcyan/ptnbhwd.git",
 7 | }
 8 | 
 9 | 
10 | description = {
11 |   summary = "Perspective Transformer Layer for Torch",
12 |   detailed = [[
13 |   ]],
14 |   homepage = "https://github.com/xcyan/ptnbhwd",
15 |   license = "MIT"
16 | }
17 | 
18 | 
19 | dependencies = {
20 |    "torch >= 7.0",
21 |    "nn >= 1.0",
22 | }
23 | 
24 | build = {
25 |    type = "command",
26 |    build_command = [[
27 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE)
28 | ]],
29 |    install_command = "cd build && $(MAKE) install"
30 | }
31 | 


--------------------------------------------------------------------------------
/init.c:
--------------------------------------------------------------------------------
 1 | #include "TH.h"
 2 | #include "luaT.h"
 3 | 
 4 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
 5 | #define torch_Tensor TH_CONCAT_STRING_3(torch.,Real,Tensor)
 6 | #define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
 7 | 
 8 | #include "generic/BilinearSamplerPerspective.c"
 9 | //#include "generic/BilinearSamplerBHWD.c"
10 | #include "THGenerateFloatTypes.h"
11 | 
12 | LUA_EXTERNC DLL_EXPORT int luaopen_libptn(lua_State *L);
13 | //LUA_EXTERNC DLL_EXPORT int luaopen_libptn_pp(lua_State *L);
14 | 
15 | int luaopen_libptn(lua_State *L)  // Lua loader for 'libptn' (required from init.lua)
16 | {
17 |   lua_newtable(L);          // new table on the stack ...
18 |   lua_pushvalue(L, -1);     // ... duplicated, because lua_setglobal pops the value it stores
19 |   lua_setglobal(L, "ptn");  // global ptn = that table; the first copy stays on the stack
20 |   // Float/Double init functions presumably come from the generic file via THGenerateFloatTypes.h -- confirm
21 |   //nn_FloatBilinearSamplerBHWD_init(L);
22 | 
23 |   //nn_DoubleBilinearSamplerBHWD_init(L);
24 | 
25 |   nn_FloatBilinearSamplerPerspective_init(L);
26 | 
27 |   nn_DoubleBilinearSamplerPerspective_init(L);
28 | 
29 |   return 1;  // one Lua return value, taken from the top of the stack
30 | }
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Xinchen Yan, Jimei Yang, Ersin Yumer, Yijie Guo and Honglak Lee
 4 | Copyright (c) 2015 qassemoquab
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Perspective Transformer Layer
 2 | 
 3 | This is the torch implementation of the [Perspective Transformer Layer](https://papers.nips.cc/paper/6206-perspective-transformer-nets-learning-single-view-3d-object-reconstruction-without-3d-supervision.pdf), which is built on top of the [STN torch implementation](https://github.com/qassemoquab/stnbhwd).
 4 | 
 5 | ## Build
 6 | To build the ptn library, run the following command.
 7 | ```
 8 | luarocks make ptnbhwd-scm-1.rockspec
 9 | ```
10 | 
11 | ## Usage
12 | 
13 | ``` lua
14 | require 'ptn'
15 | 
16 | nn.PerspectiveGridGenerator(depth, height, width, focal_length)
17 | -- takes B x 4 x 4 affine transform matrices as input, 
18 | -- outputs a depth x height x width grid in normalized [dmin,dmax] x [-1,1] x [-1,1] coordinates, where dmin and dmax represent the minimal and maximal disparity.
19 | 
20 | nn.BilinearSamplerPerspective()
21 | -- takes a table {inputImages, grids} as inputs
22 | -- outputs the interpolated images according to the grids
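23 | -- inputImages is a batch of 5D samples in B x D x H x W x C layout (batch, depth, height, width, channels)
24 | -- grids is a batch of grids (output of PerspectiveGridGenerator)
25 | -- output uses the same layout: B x grid depth x grid height x grid width x C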
26 | ```
27 | 
28 | ## Citation
29 | If you find this useful, please cite our work as follows:
30 | ```
31 | @incollection{NIPS2016_6206,
32 | title = {Perspective Transformer Nets: Learning Single-View 3D Object Reconstruction without 3D Supervision},
33 | author = {Yan, Xinchen and Yang, Jimei and Yumer, Ersin and Guo, Yijie and Lee, Honglak},
34 | booktitle = {Advances in Neural Information Processing Systems 29},
35 | editor = {D. D. Lee and M. Sugiyama and U. V. Luxburg and I. Guyon and R. Garnett},
36 | pages = {1696--1704},
37 | year = {2016},
38 | publisher = {Curran Associates, Inc.},
39 | url = {http://papers.nips.cc/paper/6206-perspective-transformer-nets-learning-single-view-3d-object-reconstruction-without-3d-supervision.pdf}
40 | }
41 | ```
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
 2 | CMAKE_POLICY(VERSION 2.8)
 3 | 
 4 | SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
 5 | 
 6 | FIND_PACKAGE(Torch REQUIRED)
 7 | 
 8 | # Flags
 9 | # When using MSVC
10 | IF(MSVC)
11 |   # we want to respect the standard, and we are bored of those **** .
12 |   ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
13 | ENDIF(MSVC)
14 | 
15 | # OpenMP support?
16 | SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?")
17 | IF (APPLE AND CMAKE_COMPILER_IS_GNUCC)
18 |   EXEC_PROGRAM (uname ARGS -v  OUTPUT_VARIABLE DARWIN_VERSION)
19 |   STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION})
20 |   MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}")
21 |   IF (DARWIN_VERSION GREATER 9)
22 |     SET(APPLE_OPENMP_SUCKS 1)
23 |   ENDIF (DARWIN_VERSION GREATER 9)
24 |   EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion
25 |     OUTPUT_VARIABLE GCC_VERSION)
26 |   IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2)
27 |     MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)")
28 |     MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP")
29 |     SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas")
30 |     SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE)
31 |   ENDIF ()
32 | ENDIF ()
33 | 
34 | IF (WITH_OPENMP)
35 |   FIND_PACKAGE(OpenMP)
36 |   IF(OPENMP_FOUND)
37 |     MESSAGE(STATUS "Compiling with OpenMP support")
38 |     SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
39 |     SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
40 |     SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
41 |   ENDIF(OPENMP_FOUND)
42 | ENDIF (WITH_OPENMP)
43 | 
44 | LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
45 | 
46 | SET(src init.c)
47 | FILE(GLOB luasrc *.lua)
48 | ADD_TORCH_PACKAGE(ptn "${src}" "${luasrc}")
49 | TARGET_LINK_LIBRARIES(ptn luaT TH)
50 | 
51 | 
52 | FIND_PACKAGE(CUDA 5.5)
53 | 
54 | IF (CUDA_FOUND)
55 |   LIST(APPEND CUDA_NVCC_FLAGS "-arch=sm_20")
56 | 
57 |   INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC")
58 |   SET(src-cuda init.cu)
59 |   CUDA_ADD_LIBRARY(cuptn MODULE ${src-cuda})
60 |   TARGET_LINK_LIBRARIES(cuptn luaT THC TH)
61 |   IF(APPLE)
62 |     SET_TARGET_PROPERTIES(cuptn PROPERTIES
63 |       LINK_FLAGS "-undefined dynamic_lookup")
64 |   ENDIF()
65 |   ### Torch packages supposes libraries prefix is "lib"
66 |   SET_TARGET_PROPERTIES(cuptn PROPERTIES
67 |     PREFIX "lib"
68 |     IMPORT_PREFIX "lib")
69 | 
70 |   INSTALL(TARGETS cuptn
71 |     RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}"
72 |     LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}")
73 | ENDIF(CUDA_FOUND)
74 | 


--------------------------------------------------------------------------------
/PerspectiveGridGenerator.lua:
--------------------------------------------------------------------------------
  1 | -- code adapted from github repo
  2 | -- implemented by Yijie Guo (guoyijie@umich.edu) and Xinchen Yan (skywalkeryxc@gmail.com)
  3 | 
  4 | local PGG, parent = torch.class('nn.PerspectiveGridGenerator', 'nn.Module')
  5 | 
  6 | -- Precomputes the homogeneous base grid (depth x height x width x 4).
  7 | -- depth, height, width: grid resolution, each > 1; focal_length: positive number.
  8 | function PGG:__init(depth,height,width,focal_length)
  9 |    parent.__init(self)
 10 |    assert(depth > 1)
 11 |    assert(height > 1)
 12 |    assert(width > 1)
 13 |    assert(type(focal_length) == 'number' and focal_length > 0,
 14 |           'focal_length must be a positive number')
 15 |    self.depth, self.height, self.width = depth, height, width
 16 |    local dmin = 1/(focal_length + math.sqrt(3))
 17 |    local dmax = 1/(focal_length)
 18 | 
 19 |    -- every cell holds (1/d, y/d, x/d, 1) with y, x in [-1, 1] and disparity d
 20 |    self.baseGrid = torch.Tensor(depth, height, width, 4)
 21 |    for k=1,self.depth do
 22 |      local disf = dmin + (k-1)/(self.depth-1) * (dmax-dmin)  -- constant per depth slice
 23 |      for i=1,self.height do
 24 |        local row = self.baseGrid[k][i]  -- (width x 4) view into baseGrid
 25 |        local yt = (-1 + (i-1)/(self.height-1) * 2)/disf
 26 |        for j=1,self.width do
 27 |           row[j][1] = 1/disf
 28 |           row[j][2] = yt
 29 |           row[j][3] = (-1 + (j-1)/(self.width-1) * 2)/disf
 30 |           row[j][4] = 1
 31 |        end
 32 |      end
 33 |    end
 34 |    self.batchGrid = torch.Tensor(1, depth, height, width, 4):copy(self.baseGrid)
 35 | end
 36 | 
 37 | -- Returns a view of t with a new leading dimension of size 1.
 38 | -- Built with t:view, so the result refers to the same data as t.
 39 | local function addOuterDim(t)
 40 |    local dims = t:size()
 41 |    local expanded = torch.LongStorage(dims:size()+1)
 42 |    expanded[1] = 1
 43 |    for d=2,dims:size()+1 do expanded[d] = dims[d-1] end
 44 |    return t:view(expanded)
 45 | end
 46 | 
 47 | -- Forward pass: multiplies every base-grid point by the transposed 4x4 transform.
 48 | -- Accepts one 4x4 matrix or a batch (B x 4 x 4); a single matrix yields a single grid.
 49 | function PGG:updateOutput(_transformMatrix)
 50 |    local isSingle = _transformMatrix:nDimension()==2
 51 |    local transformMatrix = _transformMatrix
 52 |    if isSingle then transformMatrix = addOuterDim(_transformMatrix) end
 53 |    assert(transformMatrix:nDimension()==3
 54 |           and transformMatrix:size(2)==4
 55 |           and transformMatrix:size(3)==4
 56 |           , 'please input affine transform matrices (bx4x4)')
 57 |    local batchsize = transformMatrix:size(1)
 58 |    local npoints = self.depth*self.width*self.height
 59 | 
 60 |    -- refresh the cached per-sample copies of the base grid when the batch size changes
 61 |    if self.batchGrid:size(1) ~= batchsize then
 62 |       self.batchGrid:resize(batchsize, self.depth, self.height, self.width, 4)
 63 |       for b=1,batchsize do
 64 |          self.batchGrid[b]:copy(self.baseGrid)
 65 |       end
 66 |    end
 67 | 
 68 |    self.output:resize(batchsize, self.depth, self.height, self.width, 4)
 69 |    -- output[b] = grid[b] * transform[b]^T, computed on (B x npoints x 4) views
 70 |    torch.bmm(self.output:view(batchsize, npoints, 4),
 71 |              self.batchGrid:view(batchsize, npoints, 4), transformMatrix:transpose(2,3))
 72 | 
 73 |    if isSingle then self.output = self.output:select(1,1) end
 74 |    return self.output
 75 | end
 76 | 
 77 | -- Backward pass: gradInput[b] = gradGrid[b]^T * batchGrid[b], one 4x4 matrix per sample.
 78 | -- Reads self.batchGrid, which must already hold batchsize copies of the base grid.
 79 | function PGG:updateGradInput(_transformMatrix, _gradGrid)
 80 |    local isSingle = _transformMatrix:nDimension()==2
 81 |    local transformMatrix, gradGrid = _transformMatrix, _gradGrid
 82 |    if isSingle then
 83 |       transformMatrix = addOuterDim(_transformMatrix)
 84 |       gradGrid = addOuterDim(_gradGrid)
 85 |    end
 86 |    local batchsize = transformMatrix:size(1)
 87 |    local npoints = self.depth*self.width*self.height
 88 |    local flatGrad = gradGrid:view(batchsize, npoints, 4)
 89 |    local flatGrid = self.batchGrid:view(batchsize, npoints, 4)
 90 | 
 91 |    -- zero the buffer, then accumulate with the method form of baddbmm
 92 |    -- (original note: torch.baddbmm doesn't work on cudatensors for some reason)
 93 |    self.gradInput:resizeAs(transformMatrix):zero()
 94 |    self.gradInput:baddbmm(flatGrad:transpose(2,3), flatGrid)
 95 | 
 96 |    if isSingle then
 97 |       self.gradInput = self.gradInput:select(1,1)
 98 |    end
 99 |    return self.gradInput
100 | end
101 | 


--------------------------------------------------------------------------------
/BilinearSamplerPerspective.lua:
--------------------------------------------------------------------------------
  1 | -- code adapted from github
  2 | -- implemented by Yijie Guo (guoyijie@umich.edu) and Xinchen Yan (skywalkeryxc@gmail.com)
  3 | 
  4 | local BilinearSamplerPerspective, parent = torch.class('nn.BilinearSamplerPerspective', 'nn.Module')
  5 | 
  6 | --[[
  7 |    BilinearSamplerPerspective(focal_length) :
  8 |    BilinearSamplerPerspective:updateOutput({inputImages, grids})
  9 |    BilinearSamplerPerspective:updateGradInput({inputImages, grids}, gradOutput)
 10 | 
 11 |    BilinearSamplerPerspective will perform bilinear sampling of the input images according to the
 12 |    normalized coordinates provided in the grid. Output will be of same size as the grids,
 13 |    with as many features as the input images.
 14 | 
 15 |    - inputImages has to be 5D: batch x depth x height x width x features
 16 | 
 17 |    - grids have to be 5D: batch x depth x height x width x 4
 18 |    - grids contains, for each sample (first dim), the normalized coordinates of the output wrt the input sample
 19 |       - Z, Y, X coordinate
 20 |       - normalized coordinates : (-1,-1,-1) points to front top left, (-1,-1,1) points to front top right
 21 |       - if the normalized coordinates fall outside of the image, then output will be filled with zeros
 22 | ]]
 23 | 
 24 | function BilinearSamplerPerspective:__init(focal_length)  -- module constructor
 25 |   parent.__init(self)
 26 |   self.focal_length = focal_length  -- forwarded to the native forward/backward calls
 27 |   self.gradInput = {}               -- table: one gradient per entry of {inputImages, grids}
 28 | end
 29 | 
 30 | -- Validates a 5D {inputImages, grids} pair and, when gradOutput is given, that its
 31 | -- first four sizes match those of grids. Any violation raises through assert.
 32 | function BilinearSamplerPerspective:check(input, gradOutput)
 33 |   local inputImages, grids = input[1], input[2]
 34 | 
 35 |   assert(inputImages:isContiguous(), 'Input images have to be contiguous')
 36 |   assert(inputImages:nDimension()==5)
 37 |   assert(grids:nDimension()==5)
 38 |   assert(inputImages:size(1)==grids:size(1)) -- same batch size
 39 |   assert(grids:size(5)==4) -- 4 values per grid point
 40 | 
 41 |   if not gradOutput then return end
 42 |   -- dims 1..4 are batchsize, depth (dist), height, width
 43 |   for dim=1,4 do
 44 |     assert(grids:size(dim)==gradOutput:size(dim))
 45 |   end
 46 | end
 47 | 
 48 | -- Prepends a singleton dimension to t and returns the resulting view.
 49 | local function addOuterDim(t)
 50 |   local oldSizes = t:size()
 51 |   local rank = oldSizes:size()
 52 |   local viewSizes = torch.LongStorage(rank+1)
 53 |   viewSizes[1] = 1
 54 |   for d=1,rank do viewSizes[d+1] = oldSizes[d] end
 55 |   return t:view(viewSizes)
 56 | end
 57 | 
 58 | -- Forward pass: samples input[1] (images) at the positions given by input[2] (grids).
 59 | -- A 4D image tensor is treated as one sample; its result is returned without batch dim.
 60 | -- Output size: batch x grid depth x grid height x grid width x image channels.
 61 | function BilinearSamplerPerspective:updateOutput(input)
 62 |   local _inputImages, _grids = input[1], input[2]
 63 |   -- unbatched images are depth x height x width x channel
 64 |   local unbatched = _inputImages:nDimension()==4
 65 | 
 66 |   local inputImages, grids = _inputImages, _grids
 67 |   if unbatched then
 68 |     inputImages = addOuterDim(_inputImages)
 69 |     grids = addOuterDim(_grids)
 70 |   end
 71 | 
 72 |   self:check({inputImages, grids})
 73 | 
 74 |   self.output:resize(inputImages:size(1), grids:size(2), grids:size(3),
 75 |                      grids:size(4), inputImages:size(5))
 76 |   -- the native routine is looked up through the image tensor's nn table
 77 |   inputImages.nn.BilinearSamplerPerspective_updateOutput(
 78 |     self, inputImages, grids, self.output, self.focal_length)
 79 | 
 80 |   if unbatched then
 81 |     self.output = self.output:select(1,1)
 82 |   end
 83 | 
 84 |   return self.output
 85 | end
 86 | 
 87 | -- Backward pass: fills and returns self.gradInput = {gradInputImages, gradGrids}.
 88 | -- 4D images are handled as a batch of one; when _gradOutput is 4D the batch
 89 | -- dimension is removed from both gradients before returning.
 90 | function BilinearSamplerPerspective:updateGradInput(_input, _gradOutput)
 91 |   local _inputImages, _grids = _input[1], _input[2]
 92 | 
 93 |   local inputImages, grids, gradOutput = _inputImages, _grids, _gradOutput
 94 |   if _inputImages:nDimension()==4 then
 95 |     inputImages = addOuterDim(_inputImages)
 96 |     grids = addOuterDim(_grids)
 97 |     gradOutput = addOuterDim(_gradOutput)
 98 |   end
 99 | 
100 |   local input = {inputImages, grids}
101 |   self:check(input, gradOutput)
102 | 
103 |   -- both gradient buffers are created from the image tensor type and zero-filled
104 |   for i, t in ipairs(input) do
105 |     self.gradInput[i] = self.gradInput[i] or inputImages.new()
106 |     self.gradInput[i]:resizeAs(t):zero()
107 |   end
108 | 
109 |   -- the native routine is looked up through the image tensor's nn table
110 |   inputImages.nn.BilinearSamplerPerspective_updateGradInput(
111 |     self, inputImages, grids, self.gradInput[1], self.gradInput[2],
112 |     gradOutput, self.focal_length)
113 | 
114 |   -- strip the batch dimension again for unbatched gradients
115 |   if _gradOutput:nDimension()==4 then
116 |     for i=1,2 do
117 |       self.gradInput[i] = self.gradInput[i]:select(1,1)
118 |     end
119 |   end
120 |   return self.gradInput
121 | end
122 | 


--------------------------------------------------------------------------------
/generic/BilinearSamplerPerspective.c:
--------------------------------------------------------------------------------
  1 | #ifndef TH_GENERIC_FILE
  2 | #define TH_GENERIC_FILE "generic/BilinearSamplerPerspective.c"
  3 | #else
  4 | 
  5 | #include <stdbool.h>
  6 | #include <stdio.h>
  7 | // code adapted from the github repo
  8 | // implemented by Yijie Guo (guoyijie@umich.edu) and Xinchen Yan (skywalkeryxc@gmail.com)
  9 | 
 10 | // Forward pass: for every output cell, interpolates inputImages over the 8 corners around the (z, y, x) position stored in grids.
 11 | static int nn_(BilinearSamplerBHWD_updateOutput)(lua_State *L)
 12 | {
 13 |   THTensor *inputImages = luaT_checkudata(L, 2, torch_Tensor);  // Lua stack 2: volumes to sample from
 14 |   THTensor *grids = luaT_checkudata(L, 3, torch_Tensor);        // Lua stack 3: sampling positions
 15 |   THTensor *output = luaT_checkudata(L, 4, torch_Tensor);       // Lua stack 4: destination, written in place
 16 |   // dimension order used below: 0 = batch, 1 = depth, 2 = height, 3 = width, 4 = channels
 17 |   int batchsize = inputImages->size[0];
 18 |   int inputImages_depth = inputImages->size[1];
 19 |   int inputImages_height = inputImages->size[2];
 20 |   int inputImages_width = inputImages->size[3];
 21 |   int output_height = output->size[2];
 22 |   int output_width = output->size[3];
 23 |   int output_depth = output->size[1];
 24 |   int inputImages_channels = inputImages->size[4];
 25 |   // NOTE(review): the last dimension of all three tensors is indexed with stride 1 below -- confirm with callers
 26 |   int output_strideBatch = output->stride[0];
 27 |   int output_strideHeight = output->stride[2];
 28 |   int output_strideWidth = output->stride[3];
 29 |   int output_strideDepth = output->stride[1];
 30 | 
 31 |   int inputImages_strideBatch = inputImages->stride[0];
 32 |   int inputImages_strideHeight = inputImages->stride[2];
 33 |   int inputImages_strideWidth = inputImages->stride[3];
 34 |   int inputImages_strideDepth = inputImages->stride[1];
 35 | 
 36 |   int grids_strideBatch = grids->stride[0];
 37 |   int grids_strideHeight = grids->stride[2];
 38 |   int grids_strideWidth = grids->stride[3];
 39 |   int grids_strideDepth = grids->stride[1];
 40 | 
 41 |   real *inputImages_data, *output_data, *grids_data;
 42 |   inputImages_data = THTensor_(data)(inputImages);
 43 |   output_data = THTensor_(data)(output);
 44 |   grids_data = THTensor_(data)(grids);
 45 | 
 46 |   int b, yOut, xOut, zOut;
 47 |   // one iteration of the innermost loop per output cell (b, zOut, yOut, xOut)
 48 |   for(b=0; b < batchsize; b++)
 49 |   {
 50 |     for(zOut = 0; zOut < output_depth; zOut++)
 51 |     {
 52 |       for(yOut=0; yOut < output_height; yOut++)
 53 |       {
 54 |         for(xOut=0; xOut < output_width; xOut++)
 55 |         {
 56 |         // read the grid: component 0 is z, 1 is y, 2 is x (component 3 is not read)
 57 |         real yf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth + 1];
 58 |         real xf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth + 2];
 59 |         real zf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth];
 60 | 
 61 |         // get the weights for interpolation
 62 |         int yInTopLeftFront, xInTopLeftFront, zInTopLeftFront;
 63 |         real yWeightTopLeftFront, xWeightTopLeftFront, zWeightTopLeftFront;
 64 |         // each axis: map [-1, 1] onto [0, size-1]; floor gives the lower corner, whose weight is 1 - fractional part
 65 |         real xcoord = (xf + 1) * (inputImages_width - 1) / 2;
 66 |         xInTopLeftFront = floor(xcoord);
 67 |         xWeightTopLeftFront = 1 - (xcoord - xInTopLeftFront);
 68 | 
 69 |         real ycoord = (yf + 1) * (inputImages_height - 1) / 2;
 70 |         yInTopLeftFront = floor(ycoord);
 71 |         yWeightTopLeftFront = 1 - (ycoord - yInTopLeftFront);
 72 | 
 73 |         real zcoord = (zf + 1) * (inputImages_depth - 1) / 2;
 74 |         zInTopLeftFront = floor(zcoord);
 75 |         zWeightTopLeftFront = 1 - (zcoord - zInTopLeftFront);
 76 |         // element offsets of the output cell and of the 8 corners: Right = +1 in x, Bottom = +1 in y, Back = +1 in z
 77 |         const int outAddress = output_strideBatch * b + output_strideHeight * yOut + output_strideWidth * xOut + output_strideDepth * zOut;
 78 |         const int inTopLeftFrontAddress = inputImages_strideBatch * b + inputImages_strideHeight * yInTopLeftFront 
 79 |           + inputImages_strideWidth * xInTopLeftFront + inputImages_strideDepth * zInTopLeftFront;
 80 | 
 81 |         const int inTopLeftBackAddress = inTopLeftFrontAddress + inputImages_strideDepth;
 82 | 
 83 |         const int inTopRightFrontAddress = inTopLeftFrontAddress + inputImages_strideWidth;
 84 |         const int inTopRightBackAddress = inTopRightFrontAddress + inputImages_strideDepth;
 85 | 
 86 |         const int inBottomLeftFrontAddress = inTopLeftFrontAddress + inputImages_strideHeight;
 87 |         const int inBottomLeftBackAddress = inBottomLeftFrontAddress + inputImages_strideDepth;
 88 | 
 89 |         const int inBottomRightFrontAddress = inBottomLeftFrontAddress + inputImages_strideWidth;
 90 |         const int inBottomRightBackAddress = inBottomRightFrontAddress + inputImages_strideDepth;
 91 |         // corner values start at 0 and are only overwritten for in-range corners, so out-of-range corners contribute 0
 92 |         real v=0;
 93 |         real inTopLeftFront=0;
 94 |         real inTopLeftBack=0;
 95 |         real inTopRightFront=0;
 96 |         real inTopRightBack=0;
 97 |         real inBottomLeftFront=0;
 98 |         real inBottomLeftBack=0;
 99 |         real inBottomRightFront=0;
100 |         real inBottomRightBack=0;
101 | 
102 |         // we are careful with the boundaries: a corner is "in" when all three of its indices lie in [0, size-1]
103 |         bool topLeftFrontIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
104 |           && yInTopLeftFront >= 0 && yInTopLeftFront<= inputImages_height-1 
105 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
106 | 
107 |         bool topLeftBackIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
108 |           && yInTopLeftFront >= 0 && yInTopLeftFront<= inputImages_height-1 
109 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1<= inputImages_depth-1);
110 | 
111 |         bool topRightFrontIsIn = xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
112 |           && yInTopLeftFront >= 0 && yInTopLeftFront <= inputImages_height-1 
113 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth - 1;
114 | 
115 |         bool topRightBackIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
116 |           && yInTopLeftFront >= 0 && yInTopLeftFront <= inputImages_height-1 
117 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1 <= inputImages_depth-1);
118 | 
119 |         bool bottomLeftFrontIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
120 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
121 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
122 | 
123 |         bool bottomLeftBackIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
124 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
125 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1 <= inputImages_depth-1);
126 | 
127 |         bool bottomRightFrontIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
128 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
129 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
130 | 
131 |         bool bottomRightBackIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
132 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
133 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1 <= inputImages_depth-1);
134 | 
135 |         int t;
136 |         // interpolation happens here, once per channel t (same corners and weights for every channel)
137 |         for(t=0; t<inputImages_channels; t++)
138 |         {
139 |            if(topLeftFrontIsIn) inTopLeftFront = inputImages_data[inTopLeftFrontAddress + t];
140 |            if(topLeftBackIsIn) inTopLeftBack = inputImages_data[inTopLeftBackAddress + t];
141 | 
142 |            if(topRightFrontIsIn) inTopRightFront = inputImages_data[inTopRightFrontAddress + t];
143 |            if(topRightBackIsIn) inTopRightBack = inputImages_data[inTopRightBackAddress + t];
144 | 
145 |            if(bottomLeftFrontIsIn) inBottomLeftFront = inputImages_data[inBottomLeftFrontAddress + t];
146 |            if(bottomLeftBackIsIn) inBottomLeftBack = inputImages_data[inBottomLeftBackAddress + t];
147 | 
148 |            if(bottomRightFrontIsIn) inBottomRightFront = inputImages_data[inBottomRightFrontAddress + t];
149 |            if(bottomRightBackIsIn) inBottomRightBack = inputImages_data[inBottomRightBackAddress + t];
150 |            // weighted sum of the 8 corners; each corner's weight is the product of its x, y and z factors
151 |            v = xWeightTopLeftFront * yWeightTopLeftFront * zWeightTopLeftFront * inTopLeftFront
152 |              + xWeightTopLeftFront * yWeightTopLeftFront * (1-zWeightTopLeftFront) * inTopLeftBack
153 |              + (1 - xWeightTopLeftFront) * yWeightTopLeftFront * zWeightTopLeftFront * inTopRightFront
154 |              + (1 - xWeightTopLeftFront) * yWeightTopLeftFront * (1-zWeightTopLeftFront) * inTopRightBack
155 |              + xWeightTopLeftFront * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * inBottomLeftFront
156 |              + xWeightTopLeftFront * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * inBottomLeftBack
157 |              + (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * inBottomRightFront
158 |              + (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * inBottomRightBack;
159 | 
160 |            output_data[outAddress + t] = v;
161 |         }
162 | 
163 |         }
164 |       }
165 |     }
166 |   }
167 |   // NOTE(review): nothing is pushed above, so Lua receives the current stack top as the result -- confirm intent
168 |   return 1;
169 | }
170 | 
171 | static int nn_(BilinearSamplerBHWD_updateGradInput)(lua_State *L)
172 | {
173 |   THTensor *inputImages = luaT_checkudata(L, 2, torch_Tensor);
174 |   THTensor *grids = luaT_checkudata(L, 3, torch_Tensor);
175 |   THTensor *gradInputImages = luaT_checkudata(L, 4, torch_Tensor);
176 |   THTensor *gradGrids = luaT_checkudata(L, 5, torch_Tensor);
177 |   THTensor *gradOutput = luaT_checkudata(L, 6, torch_Tensor);
178 | 
179 |   bool onlyGrid=false;
180 | 
181 |   int batchsize = inputImages->size[0];
182 |   int inputImages_height = inputImages->size[2];
183 |   int inputImages_width = inputImages->size[3];
184 |   int inputImages_depth = inputImages->size[1];
185 |     
186 |   int gradOutput_height = gradOutput->size[2];
187 |   int gradOutput_width = gradOutput->size[3];
188 |   int gradOutput_depth = gradOutput->size[1];
189 |   int inputImages_channels = inputImages->size[4];
190 | 
191 |   int gradOutput_strideBatch = gradOutput->stride[0];
192 |   int gradOutput_strideHeight = gradOutput->stride[2];
193 |   int gradOutput_strideWidth = gradOutput->stride[3];
194 |   int gradOutput_strideDepth = gradOutput->stride[1];
195 | 
196 |   int inputImages_strideBatch = inputImages->stride[0];
197 |   int inputImages_strideHeight = inputImages->stride[2];
198 |   int inputImages_strideWidth = inputImages->stride[3];
199 |   int inputImages_strideDepth = inputImages->stride[1];
200 | 
201 |   int gradInputImages_strideBatch = gradInputImages->stride[0];
202 |   int gradInputImages_strideHeight = gradInputImages->stride[2];
203 |   int gradInputImages_strideWidth = gradInputImages->stride[3];
204 |   int gradInputImages_strideDepth = gradInputImages->stride[1];
205 | 
206 |   int grids_strideBatch = grids->stride[0];
207 |   int grids_strideHeight = grids->stride[2];
208 |   int grids_strideWidth = grids->stride[3];
209 |   int grids_strideDepth = grids->stride[1];
210 | 
211 |   int gradGrids_strideBatch = gradGrids->stride[0];
212 |   int gradGrids_strideHeight = gradGrids->stride[2];
213 |   int gradGrids_strideWidth = gradGrids->stride[3];
214 |   int gradGrids_strideDepth = gradGrids->stride[1];
215 | 
216 |   real *inputImages_data, *gradOutput_data, *grids_data, *gradGrids_data, *gradInputImages_data;
217 |   inputImages_data = THTensor_(data)(inputImages);
218 |   gradOutput_data = THTensor_(data)(gradOutput);
219 |   grids_data = THTensor_(data)(grids);
220 |   gradGrids_data = THTensor_(data)(gradGrids);
221 |   gradInputImages_data = THTensor_(data)(gradInputImages);
222 | 
223 |     int b, yOut, xOut, zOut;
224 | 
225 |   for(b=0; b < batchsize; b++)
226 |   {
227 |     for(zOut=0; zOut < gradOutput_depth; zOut++)
228 |     {
229 |       for(yOut=0; yOut < gradOutput_height; yOut++)
230 |       {
231 |         for(xOut=0; xOut < gradOutput_width; xOut++)
232 |         {
233 |           //read the grid
234 |         real yf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth + 1];
235 |         real xf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth + 2];
236 |         real zf = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth];
237 |         
238 |         // get the weights for interpolation
239 |         int yInTopLeftFront, xInTopLeftFront, zInTopLeftFront;
240 |         real yWeightTopLeftFront, xWeightTopLeftFront, zWeightTopLeftFront;
241 |  
242 |         real xcoord = (xf + 1) * (inputImages_width - 1) / 2;
243 |         xInTopLeftFront = floor(xcoord);
244 |         xWeightTopLeftFront = 1 - (xcoord - xInTopLeftFront);
245 | 
246 |         real ycoord = (yf + 1) * (inputImages_height - 1) / 2;
247 |         yInTopLeftFront = floor(ycoord);
248 |         yWeightTopLeftFront = 1 - (ycoord - yInTopLeftFront);
249 |             
250 |         real zcoord = (zf + 1) * (inputImages_depth - 1) / 2;
251 |         zInTopLeftFront = floor(zcoord);
252 |         zWeightTopLeftFront = 1 - (zcoord - zInTopLeftFront);
253 |             
254 |         const int inTopLeftFrontAddress = inputImages_strideBatch * b + inputImages_strideHeight * yInTopLeftFront 
255 |           + inputImages_strideWidth * xInTopLeftFront + inputImages_strideDepth * zInTopLeftFront;
256 |   
257 |         const int inTopLeftBackAddress = inTopLeftFrontAddress + inputImages_strideDepth;
258 |             
259 |         const int inTopRightFrontAddress = inTopLeftFrontAddress + inputImages_strideWidth;
260 |         const int inTopRightBackAddress = inTopRightFrontAddress + inputImages_strideDepth;
261 |             
262 |         const int inBottomLeftFrontAddress = inTopLeftFrontAddress + inputImages_strideHeight;
263 |         const int inBottomLeftBackAddress = inBottomLeftFrontAddress + inputImages_strideDepth;
264 |             
265 |         const int inBottomRightFrontAddress = inBottomLeftFrontAddress + inputImages_strideWidth;
266 |         const int inBottomRightBackAddress = inBottomRightFrontAddress + inputImages_strideDepth;
267 | 
268 |         const int gradInputImagesTopLeftFrontAddress = gradInputImages_strideBatch * b + gradInputImages_strideHeight * yInTopLeftFront 
269 |           + gradInputImages_strideWidth * xInTopLeftFront + gradInputImages_strideDepth * zInTopLeftFront;
270 |         const int gradInputImagesTopLeftBackAddress = gradInputImagesTopLeftFrontAddress + gradInputImages_strideDepth;
271 |             
272 |         const int gradInputImagesTopRightFrontAddress = gradInputImagesTopLeftFrontAddress + gradInputImages_strideWidth;
273 |         const int gradInputImagesTopRightBackAddress = gradInputImagesTopRightFrontAddress + gradInputImages_strideDepth;
274 |         
275 |         const int gradInputImagesBottomLeftFrontAddress = gradInputImagesTopLeftFrontAddress + gradInputImages_strideHeight;
276 |         const int gradInputImagesBottomLeftBackAddress = gradInputImagesBottomLeftFrontAddress +gradInputImages_strideDepth;
277 |             
278 |         const int gradInputImagesBottomRightFrontAddress = gradInputImagesBottomLeftFrontAddress + gradInputImages_strideWidth;
279 |         const int gradInputImagesBottomRightBackAddress = gradInputImagesBottomRightFrontAddress + gradInputImages_strideDepth;
280 | 
281 |         const int gradOutputAddress = gradOutput_strideBatch * b + gradOutput_strideHeight * yOut 
282 |           + gradOutput_strideWidth * xOut + gradOutput_strideDepth * zOut;
283 | 
284 |         real topLeftFrontDotProduct = 0;
285 |         real topLeftBackDotProduct = 0;
286 |         real topRightFrontDotProduct = 0;
287 |         real topRightBackDotProduct = 0;
288 |             
289 |         real bottomLeftFrontDotProduct = 0;
290 |         real bottomLeftBackDotProduct = 0;
291 |         real bottomRightFrontDotProduct = 0;
292 |         real bottomRightBackDotProduct = 0;
293 | 
294 |         real v=0;
295 |         real inTopLeftFront=0;
296 |         real inTopLeftBack=0;
297 |         real inTopRightFront=0;
298 |         real inTopRightBack=0;
299 | 
300 |         real inBottomLeftFront=0;
301 |         real inBottomLeftBack=0;
302 |         real inBottomRightFront=0;
303 |         real inBottomRightBack=0;
304 |       
305 |         // we are careful with the boundaries
306 |         bool topLeftFrontIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
307 |           && yInTopLeftFront >= 0 && yInTopLeftFront<= inputImages_height-1 
308 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
309 |             
310 |         bool topLeftBackIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
311 |           && yInTopLeftFront >= 0 && yInTopLeftFront<= inputImages_height-1 
312 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1<= inputImages_depth-1);
313 |             
314 |         bool topRightFrontIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
315 |           && yInTopLeftFront >= 0 && yInTopLeftFront <= inputImages_height-1 
316 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
317 |             
318 |         bool topRightBackIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
319 |           && yInTopLeftFront >= 0 && yInTopLeftFront <= inputImages_height-1 
320 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1 <= inputImages_depth-1);
321 |             
322 |         bool bottomLeftFrontIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
323 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
324 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
325 |             
326 |         bool bottomLeftBackIsIn = (xInTopLeftFront >= 0 && xInTopLeftFront <= inputImages_width-1 
327 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
328 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1 <= inputImages_depth-1);
329 |             
330 |         bool bottomRightFrontIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
331 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
332 |           && zInTopLeftFront >=0 && zInTopLeftFront <= inputImages_depth-1);
333 |  
334 |         bool bottomRightBackIsIn = (xInTopLeftFront+1 >= 0 && xInTopLeftFront+1 <= inputImages_width-1 
335 |           && yInTopLeftFront+1 >= 0 && yInTopLeftFront+1 <= inputImages_height-1 
336 |           && zInTopLeftFront+1 >=0 && zInTopLeftFront+1 <= inputImages_depth-1);
337 |         int t;
338 |         
339 |         for(t=0; t<inputImages_channels; t++)
340 |         {
341 |            real gradOutValue = gradOutput_data[gradOutputAddress + t];
342 |            if(topLeftFrontIsIn)
343 |            {
344 |               real inTopLeftFront = inputImages_data[inTopLeftFrontAddress + t];
345 |               topLeftFrontDotProduct += inTopLeftFront * gradOutValue;
346 |               if(!onlyGrid) gradInputImages_data[gradInputImagesTopLeftFrontAddress + t] += 
347 |                 xWeightTopLeftFront * yWeightTopLeftFront * zWeightTopLeftFront * gradOutValue;
348 |            }
349 |            if(topLeftBackIsIn)
350 |            {
351 |               real inTopLeftBack = inputImages_data[inTopLeftBackAddress + t];
352 |               topLeftBackDotProduct += inTopLeftBack * gradOutValue;
353 |               if(!onlyGrid) gradInputImages_data[gradInputImagesTopLeftBackAddress + t] += 
354 |                 xWeightTopLeftFront * yWeightTopLeftFront * (1-zWeightTopLeftFront) * gradOutValue;
355 |            }
356 | 
357 |            if(topRightFrontIsIn)
358 |            {
359 |               real inTopRightFront = inputImages_data[inTopRightFrontAddress + t];
360 |               topRightFrontDotProduct += inTopRightFront * gradOutValue;
361 |               if(!onlyGrid) gradInputImages_data[gradInputImagesTopRightFrontAddress + t] += 
362 |                 (1 - xWeightTopLeftFront) * yWeightTopLeftFront * zWeightTopLeftFront * gradOutValue;
363 |            }
364 |            if(topRightBackIsIn)
365 |            {
366 |               real inTopRightBack = inputImages_data[inTopRightBackAddress + t];
367 |               topRightBackDotProduct += inTopRightBack * gradOutValue;
368 |               if(!onlyGrid) gradInputImages_data[gradInputImagesTopRightBackAddress + t] += 
369 |                 (1 - xWeightTopLeftFront) * yWeightTopLeftFront * (1-zWeightTopLeftFront) * gradOutValue;
370 |            }
371 |            
372 |            if(bottomLeftFrontIsIn)
373 |            {
374 |               real inBottomLeftFront = inputImages_data[inBottomLeftFrontAddress + t];
375 |               bottomLeftFrontDotProduct += inBottomLeftFront * gradOutValue;
376 |               if(!onlyGrid) gradInputImages_data[gradInputImagesBottomLeftFrontAddress + t] += 
377 |                 xWeightTopLeftFront * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * gradOutValue;
378 |            }
379 |            if(bottomLeftBackIsIn)
380 |            {
381 |               real inBottomLeftBack = inputImages_data[inBottomLeftBackAddress + t];
382 |               bottomLeftBackDotProduct += inBottomLeftBack * gradOutValue;
383 |               if(!onlyGrid) gradInputImages_data[gradInputImagesBottomLeftBackAddress + t] += 
384 |                 xWeightTopLeftFront * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * gradOutValue;
385 |            }
386 |       
387 |             if(bottomRightFrontIsIn)
388 |            {
389 |               real inBottomRightFront = inputImages_data[inBottomRightFrontAddress + t];
390 |               bottomRightFrontDotProduct += inBottomRightFront * gradOutValue;
391 |               if(!onlyGrid) gradInputImages_data[gradInputImagesBottomRightFrontAddress + t] += 
392 |                 (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * gradOutValue;
393 |            }
394 | 
395 |            if(bottomRightBackIsIn)
396 |            {
397 |               real inBottomRightBack = inputImages_data[inBottomRightBackAddress + t];
398 |               bottomRightBackDotProduct += inBottomRightBack * gradOutValue;
399 |               if(!onlyGrid) gradInputImages_data[gradInputImagesBottomRightBackAddress + t] += 
400 |                 (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * gradOutValue;
401 |            }
402 |         }
403 | 
404 |         yf = topLeftFrontDotProduct * xWeightTopLeftFront * zWeightTopLeftFront * (-1)
405 |            + topLeftBackDotProduct * xWeightTopLeftFront * (1-zWeightTopLeftFront) * (-1)
406 |            + topRightFrontDotProduct * (1-xWeightTopLeftFront) * zWeightTopLeftFront * (-1)
407 |            + topRightBackDotProduct * (1-xWeightTopLeftFront) * (1-zWeightTopLeftFront) *(-1)
408 |            + bottomLeftFrontDotProduct * xWeightTopLeftFront * zWeightTopLeftFront * (1)
409 |            + bottomLeftBackDotProduct * xWeightTopLeftFront * (1-zWeightTopLeftFront) * (1)
410 |            + bottomRightFrontDotProduct * (1-xWeightTopLeftFront) * zWeightTopLeftFront * (1)
411 |             + bottomRightBackDotProduct * (1-xWeightTopLeftFront) * (1-zWeightTopLeftFront) *(1);
412 |             
413 |         xf = topLeftFrontDotProduct * yWeightTopLeftFront * zWeightTopLeftFront *(-1)
414 |            + topLeftBackDotProduct * yWeightTopLeftFront * (1-zWeightTopLeftFront) *(-1)
415 |            + topRightFrontDotProduct * yWeightTopLeftFront * zWeightTopLeftFront * 1
416 |            + topRightBackDotProduct * yWeightTopLeftFront * (1-zWeightTopLeftFront) * 1
417 |            + bottomLeftFrontDotProduct * (1-yWeightTopLeftFront) * zWeightTopLeftFront * (-1)
418 |            + bottomLeftBackDotProduct * (1-yWeightTopLeftFront) * (1-zWeightTopLeftFront) * (-1)
419 |            + bottomRightFrontDotProduct * (1-yWeightTopLeftFront) * zWeightTopLeftFront * (1)
420 |             + bottomRightBackDotProduct * (1-yWeightTopLeftFront) *(1-zWeightTopLeftFront) * (1);
421 |             
422 |         zf = topLeftFrontDotProduct * yWeightTopLeftFront * xWeightTopLeftFront * (-1)
423 |            + topLeftBackDotProduct * yWeightTopLeftFront * xWeightTopLeftFront *(1)
424 |            + topRightFrontDotProduct * yWeightTopLeftFront * (1-xWeightTopLeftFront) *(-1)
425 |            + topRightBackDotProduct * yWeightTopLeftFront * (1-xWeightTopLeftFront) *(1)
426 |            + bottomLeftFrontDotProduct * (1-yWeightTopLeftFront) * xWeightTopLeftFront * (-1)
427 |            + bottomLeftBackDotProduct * (1-yWeightTopLeftFront) * xWeightTopLeftFront * (1)
428 |            + bottomRightFrontDotProduct * (1-yWeightTopLeftFront) * (1-xWeightTopLeftFront) *(-1)
429 |             + bottomRightBackDotProduct * (1-yWeightTopLeftFront) * (1-xWeightTopLeftFront) * 1;
430 | 
431 |         gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + zOut*gradGrids_strideDepth + 1] = yf * (inputImages_height-1) / 2;
432 |         
433 |         gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + zOut*gradGrids_strideDepth + 2] = xf * (inputImages_width-1) / 2;
434 |             
435 |         gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + zOut*gradGrids_strideDepth] = zf * (inputImages_depth-1) / 2;
436 |       }
437 |       }
438 |     }
439 |   }
440 |   return 1;
441 | }
442 | 
443 | // Perspective Transformation
444 | /*
445 |  * Forward pass of the perspective sampler: for every output cell, reads a
446 |  * sampling location from grids and writes the trilinear interpolation of
447 |  * inputImages at that location into output, channel by channel.
448 |  *
449 |  * Lua stack: 2 = inputImages, 3 = grids, 4 = output. A fifth argument (focal
450 |  * length) may be passed but is not read: the normalisation using it is disabled.
451 |  *
452 |  * inputImages dims 0..4 are indexed as batch, depth, height, width, channels;
453 |  * output dims 1..3 as dist, height, width. A grid cell holds z, y, x at offsets
454 |  * 0, 1, 2, where -1 maps to index 0 and +1 to index size-1. The innermost
455 |  * dimension of each tensor is addressed with unit stride. Corners outside the
456 |  * input volume contribute zero. Returns 1 without pushing anything on the stack.
457 |  */
458 | static int nn_(BilinearSamplerPerspective_updateOutput)(lua_State *L)
459 | {
460 |   THTensor *inputImages = luaT_checkudata(L, 2, torch_Tensor);
461 |   THTensor *grids = luaT_checkudata(L, 3, torch_Tensor);
462 |   THTensor *output = luaT_checkudata(L, 4, torch_Tensor);
463 | 
464 |   int batchsize = inputImages->size[0];
465 |   int inputImages_depth = inputImages->size[1];
466 |   int inputImages_height = inputImages->size[2];
467 |   int inputImages_width = inputImages->size[3];
468 |   int inputImages_channels = inputImages->size[4];
469 | 
470 |   int output_dist = output->size[1];
471 |   int output_height = output->size[2];
472 |   int output_width = output->size[3];
473 | 
474 |   // Strides and element offsets are kept in long rather than int so that
475 |   // offsets into large tensors are not truncated.
476 |   long output_strideBatch = output->stride[0];
477 |   long output_strideDist = output->stride[1];
478 |   long output_strideHeight = output->stride[2];
479 |   long output_strideWidth = output->stride[3];
480 | 
481 |   long inputImages_strideBatch = inputImages->stride[0];
482 |   long inputImages_strideDepth = inputImages->stride[1];
483 |   long inputImages_strideHeight = inputImages->stride[2];
484 |   long inputImages_strideWidth = inputImages->stride[3];
485 | 
486 |   long grids_strideBatch = grids->stride[0];
487 |   long grids_strideDepth = grids->stride[1];
488 |   long grids_strideHeight = grids->stride[2];
489 |   long grids_strideWidth = grids->stride[3];
490 | 
491 |   real *inputImages_data = THTensor_(data)(inputImages);
492 |   real *output_data = THTensor_(data)(output);
493 |   real *grids_data = THTensor_(data)(grids);
494 | 
495 |   int b, yOut, xOut, disOut, t;
496 | 
497 |   for(b=0; b < batchsize; b++)
498 |   {
499 |     for(disOut=0; disOut < output_dist; disOut++)
500 |     {
501 |       for(yOut=0; yOut < output_height; yOut++)
502 |       {
503 |         for(xOut=0; xOut < output_width; xOut++)
504 |         {
505 |           // read the grid; offset 3 of the cell is not needed here
506 |           const long gridAddress = grids_strideBatch * b + grids_strideDepth * disOut
507 |             + grids_strideHeight * yOut + grids_strideWidth * xOut;
508 |           const real zf = grids_data[gridAddress];
509 |           const real yf = grids_data[gridAddress + 1];
510 |           const real xf = grids_data[gridAddress + 2];
511 | 
512 |           // continuous source coordinate per axis; its floor is the
513 |           // front/top/left corner and the weight is that corner's share
514 |           const real xcoord = (xf + 1) * (inputImages_width - 1) / 2;
515 |           const int xInFrontTopLeft = floor(xcoord);
516 |           const real xWeightFrontTopLeft = 1 - (xcoord - xInFrontTopLeft);
517 | 
518 |           const real ycoord = (yf + 1) * (inputImages_height - 1) / 2;
519 |           const int yInFrontTopLeft = floor(ycoord);
520 |           const real yWeightFrontTopLeft = 1 - (ycoord - yInFrontTopLeft);
521 | 
522 |           const real zcoord = (zf + 1) * (inputImages_depth - 1) / 2;
523 |           const int zInFrontTopLeft = floor(zcoord);
524 |           const real zWeightFrontTopLeft = 1 - (zcoord - zInFrontTopLeft);
525 | 
526 |           const long outAddress = output_strideBatch * b + output_strideDist * disOut
527 |             + output_strideHeight * yOut + output_strideWidth * xOut;
528 | 
529 |           const long inFrontTopLeftAddress = inputImages_strideBatch * b
530 |             + inputImages_strideDepth * zInFrontTopLeft + inputImages_strideHeight * yInFrontTopLeft
531 |             + inputImages_strideWidth * xInFrontTopLeft;
532 |           const long inFrontTopRightAddress = inFrontTopLeftAddress + inputImages_strideWidth;
533 |           const long inFrontBottomLeftAddress = inFrontTopLeftAddress + inputImages_strideHeight;
534 |           const long inFrontBottomRightAddress = inFrontBottomLeftAddress + inputImages_strideWidth;
535 | 
536 |           const long inBackTopLeftAddress = inFrontTopLeftAddress + inputImages_strideDepth;
537 |           const long inBackTopRightAddress = inBackTopLeftAddress + inputImages_strideWidth;
538 |           const long inBackBottomLeftAddress = inBackTopLeftAddress + inputImages_strideHeight;
539 |           const long inBackBottomRightAddress = inBackBottomLeftAddress + inputImages_strideWidth;
540 | 
541 |           // we are careful with the boundaries: one test per axis and side, then per corner
542 |           const bool xLoIn = xInFrontTopLeft >= 0 && xInFrontTopLeft <= inputImages_width-1;
543 |           const bool xHiIn = xInFrontTopLeft+1 >= 0 && xInFrontTopLeft+1 <= inputImages_width-1;
544 |           const bool yLoIn = yInFrontTopLeft >= 0 && yInFrontTopLeft <= inputImages_height-1;
545 |           const bool yHiIn = yInFrontTopLeft+1 >= 0 && yInFrontTopLeft+1 <= inputImages_height-1;
546 |           const bool zLoIn = zInFrontTopLeft >= 0 && zInFrontTopLeft <= inputImages_depth-1;
547 |           const bool zHiIn = zInFrontTopLeft+1 >= 0 && zInFrontTopLeft+1 <= inputImages_depth-1;
548 |           const bool frontTopLeftIsIn = xLoIn && yLoIn && zLoIn;
549 |           const bool frontTopRightIsIn = xHiIn && yLoIn && zLoIn;
550 |           const bool frontBottomLeftIsIn = xLoIn && yHiIn && zLoIn;
551 |           const bool frontBottomRightIsIn = xHiIn && yHiIn && zLoIn;
552 |           const bool backTopLeftIsIn = xLoIn && yLoIn && zHiIn;
553 |           const bool backTopRightIsIn = xHiIn && yLoIn && zHiIn;
554 |           const bool backBottomLeftIsIn = xLoIn && yHiIn && zHiIn;
555 |           const bool backBottomRightIsIn = xHiIn && yHiIn && zHiIn;
556 | 
557 |           // interpolation weights of the eight corners; they do not depend on the channel
558 |           const real wFrontTopLeft = xWeightFrontTopLeft * yWeightFrontTopLeft * zWeightFrontTopLeft;
559 |           const real wFrontTopRight = (1 - xWeightFrontTopLeft) * yWeightFrontTopLeft * zWeightFrontTopLeft;
560 |           const real wFrontBottomLeft = xWeightFrontTopLeft * (1 - yWeightFrontTopLeft) * zWeightFrontTopLeft;
561 |           const real wFrontBottomRight = (1 - xWeightFrontTopLeft) * (1 - yWeightFrontTopLeft) * zWeightFrontTopLeft;
562 |           const real wBackTopLeft = xWeightFrontTopLeft * yWeightFrontTopLeft * (1 - zWeightFrontTopLeft);
563 |           const real wBackTopRight = (1 - xWeightFrontTopLeft) * yWeightFrontTopLeft * (1-zWeightFrontTopLeft);
564 |           const real wBackBottomLeft = xWeightFrontTopLeft * (1 - yWeightFrontTopLeft) * (1-zWeightFrontTopLeft);
565 |           const real wBackBottomRight = (1 - xWeightFrontTopLeft) * (1 - yWeightFrontTopLeft) * (1-zWeightFrontTopLeft);
566 | 
567 |           // interpolation happens here; a corner outside the volume reads as zero
568 |           for(t=0; t<inputImages_channels; t++)
569 |           {
570 |             output_data[outAddress + t] =
571 |                 wFrontTopLeft * (frontTopLeftIsIn ? inputImages_data[inFrontTopLeftAddress + t] : 0)
572 |               + wFrontTopRight * (frontTopRightIsIn ? inputImages_data[inFrontTopRightAddress + t] : 0)
573 |               + wFrontBottomLeft * (frontBottomLeftIsIn ? inputImages_data[inFrontBottomLeftAddress + t] : 0)
574 |               + wFrontBottomRight * (frontBottomRightIsIn ? inputImages_data[inFrontBottomRightAddress + t] : 0)
575 |               + wBackTopLeft * (backTopLeftIsIn ? inputImages_data[inBackTopLeftAddress + t] : 0)
576 |               + wBackTopRight * (backTopRightIsIn ? inputImages_data[inBackTopRightAddress + t] : 0)
577 |               + wBackBottomLeft * (backBottomLeftIsIn ? inputImages_data[inBackBottomLeftAddress + t] : 0)
578 |               + wBackBottomRight * (backBottomRightIsIn ? inputImages_data[inBackBottomRightAddress + t] : 0);
579 |           }
580 |         }
581 |       }
582 |     }
583 |   }
584 |   return 1;
585 | }
586 | 
587 | static int nn_(BilinearSamplerPerspective_updateGradInput)(lua_State *L)
588 | {
589 |   THTensor *inputImages = luaT_checkudata(L, 2, torch_Tensor);
590 |   THTensor *grids = luaT_checkudata(L, 3, torch_Tensor);
591 |   THTensor *gradInputImages = luaT_checkudata(L, 4, torch_Tensor);
592 |   THTensor *gradGrids = luaT_checkudata(L, 5, torch_Tensor);
593 |   THTensor *gradOutput = luaT_checkudata(L, 6, torch_Tensor);
594 |   float focal_length = lua_tonumber(L, 7);
595 | 
596 |   bool onlyGrid=false;
597 | 
598 |   int batchsize = inputImages->size[0];
599 |   int inputImages_depth = inputImages->size[1];
600 |   int inputImages_height = inputImages->size[2];
601 |   int inputImages_width = inputImages->size[3];
602 |     
603 |   int gradOutput_dist = gradOutput->size[1];
604 |   int gradOutput_height = gradOutput->size[2];
605 |   int gradOutput_width = gradOutput->size[3];
606 |     
607 |   int inputImages_channels = inputImages->size[4];
608 | 
609 |   int gradOutput_strideBatch = gradOutput->stride[0];
610 |   int gradOutput_strideDist = gradOutput->stride[1];
611 |   int gradOutput_strideHeight = gradOutput->stride[2];
612 |   int gradOutput_strideWidth = gradOutput->stride[3];
613 | 
614 |   int inputImages_strideBatch = inputImages->stride[0];
615 |   int inputImages_strideDepth = inputImages->stride[1];
616 |   int inputImages_strideHeight = inputImages->stride[2];
617 |   int inputImages_strideWidth = inputImages->stride[3];
618 | 
619 |   int gradInputImages_strideBatch = gradInputImages->stride[0];
620 |   int gradInputImages_strideDepth = gradInputImages->stride[1];
621 |   int gradInputImages_strideHeight = gradInputImages->stride[2];
622 |   int gradInputImages_strideWidth = gradInputImages->stride[3];
623 | 
624 |   int grids_strideBatch = grids->stride[0];
625 |   int grids_strideDepth = grids->stride[1];
626 |   int grids_strideHeight = grids->stride[2];
627 |   int grids_strideWidth = grids->stride[3];
628 | 
629 |   int gradGrids_strideBatch = gradGrids->stride[0];
630 |   int gradGrids_strideDepth = gradGrids->stride[1];
631 |   int gradGrids_strideHeight = gradGrids->stride[2];
632 |   int gradGrids_strideWidth = gradGrids->stride[3];
633 | 
634 |   real *inputImages_data, *gradOutput_data, *grids_data, *gradGrids_data, *gradInputImages_data;
635 |   inputImages_data = THTensor_(data)(inputImages);
636 |   gradOutput_data = THTensor_(data)(gradOutput);
637 |   grids_data = THTensor_(data)(grids);
638 |   gradGrids_data = THTensor_(data)(gradGrids);
639 |   gradInputImages_data = THTensor_(data)(gradInputImages);
640 | 
641 |   int b, yOut, xOut, disOut;
642 | 
643 |   for(b=0; b < batchsize; b++)
644 |   {
645 |     for(disOut = 0; disOut < gradOutput_dist; disOut++)
646 |     {
647 |       for(yOut=0; yOut < gradOutput_height; yOut++)
648 |       {
649 |         for(xOut=0; xOut < gradOutput_width; xOut++)
650 |         {
651 |         
652 |        //read the grid
653 |         real zf = grids_data[b*grids_strideBatch + disOut*grids_strideDepth + yOut*grids_strideHeight + xOut*grids_strideWidth];
654 |         real yf = grids_data[b*grids_strideBatch + disOut*grids_strideDepth + yOut*grids_strideHeight + xOut*grids_strideWidth + 1];
655 |         real xf = grids_data[b*grids_strideBatch + disOut*grids_strideDepth + yOut*grids_strideHeight + xOut*grids_strideWidth+ 2];
656 |         real disf = grids_data[b*grids_strideBatch + disOut*grids_strideDepth + yOut*grids_strideHeight + xOut*grids_strideWidth+ 3];
657 |        
658 |         // yf = yf / disf;
659 |         //xf = xf / disf;
660 |         //zf = zf / disf;
661 | 
662 |         // get the weights for interpolation
663 |         int zInFrontTopLeft, yInFrontTopLeft, xInFrontTopLeft;
664 |         real zWeightFrontTopLeft, yWeightFrontTopLeft, xWeightFrontTopLeft;
665 |             
666 |         real xcoord = (xf + 1) * (inputImages_width - 1) / 2;
667 |         xInFrontTopLeft = floor(xcoord);
668 |         xWeightFrontTopLeft = 1 - (xcoord - xInFrontTopLeft);
669 |             
670 |         real ycoord = (yf + 1) * (inputImages_height - 1) / 2;
671 |         yInFrontTopLeft = floor(ycoord);
672 |         yWeightFrontTopLeft = 1 - (ycoord - yInFrontTopLeft);
673 |             
674 |         real zcoord = (zf + 1) * (inputImages_depth - 1) / 2;
675 |         zInFrontTopLeft = floor(zcoord);
676 |         zWeightFrontTopLeft = 1 - (zcoord - zInFrontTopLeft);
677 |            
678 |         const int inFrontTopLeftAddress = inputImages_strideBatch * b + inputImages_strideDepth * zInFrontTopLeft + inputImages_strideHeight * yInFrontTopLeft + inputImages_strideWidth * xInFrontTopLeft;
679 |         const int inFrontTopRightAddress = inFrontTopLeftAddress + inputImages_strideWidth;
680 |         const int inFrontBottomLeftAddress = inFrontTopLeftAddress + inputImages_strideHeight;
681 |         const int inFrontBottomRightAddress = inFrontBottomLeftAddress + inputImages_strideWidth;
682 |             
683 |         const int inBackTopLeftAddress = inFrontTopLeftAddress + inputImages_strideDepth;
684 |         const int inBackTopRightAddress = inBackTopLeftAddress + inputImages_strideWidth;
685 |         const int inBackBottomLeftAddress = inBackTopLeftAddress + inputImages_strideHeight;
686 |         const int inBackBottomRightAddress = inBackBottomLeftAddress + inputImages_strideWidth;
687 | 
688 |         const int gradInputImagesFrontTopLeftAddress = gradInputImages_strideBatch * b + gradInputImages_strideDepth * zInFrontTopLeft + gradInputImages_strideHeight * yInFrontTopLeft + gradInputImages_strideWidth * xInFrontTopLeft;
689 |         const int gradInputImagesFrontTopRightAddress = gradInputImagesFrontTopLeftAddress + gradInputImages_strideWidth;
690 |         const int gradInputImagesFrontBottomLeftAddress = gradInputImagesFrontTopLeftAddress + gradInputImages_strideHeight;
691 |         const int gradInputImagesFrontBottomRightAddress = gradInputImagesFrontBottomLeftAddress + gradInputImages_strideWidth;
692 |             
693 |         const int gradInputImagesBackTopLeftAddress = gradInputImagesFrontTopLeftAddress + gradInputImages_strideDepth;
694 |         const int gradInputImagesBackTopRightAddress = gradInputImagesBackTopLeftAddress + gradInputImages_strideWidth;
695 |         const int gradInputImagesBackBottomLeftAddress = gradInputImagesBackTopLeftAddress + gradInputImages_strideHeight;
696 |         const int gradInputImagesBackBottomRightAddress = gradInputImagesBackBottomLeftAddress + gradInputImages_strideWidth;
697 | 
698 |         const int gradOutputAddress = gradOutput_strideBatch * b + gradOutput_strideDist * disOut + gradOutput_strideHeight * yOut + gradOutput_strideWidth * xOut;
699 | 
700 |         real frontTopLeftDotProduct = 0;
701 |         real frontTopRightDotProduct = 0;
702 |         real frontBottomLeftDotProduct = 0;
703 |         real frontBottomRightDotProduct = 0;
704 |         real backTopLeftDotProduct = 0;
705 |         real backTopRightDotProduct = 0;
706 |         real backBottomLeftDotProduct = 0;
707 |         real backBottomRightDotProduct = 0;
708 | 
709 |         real v=0;
710 |         real inFrontTopLeft=0;
711 |         real inFrontTopRight=0;
712 |         real inFrontBottomLeft=0;
713 |         real inFrontBottomRight=0;
714 |         real inBackTopLeft=0;
715 |         real inBackTopRight=0;
716 |         real inBackBottomLeft=0;
717 |         real inBackBottomRight=0;
718 | 
719 |         // we are careful with the boundaries
720 |         bool frontTopLeftIsIn = xInFrontTopLeft >= 0 && xInFrontTopLeft <= inputImages_width-1 && yInFrontTopLeft >= 0 && yInFrontTopLeft <= inputImages_height-1 && zInFrontTopLeft >= 0 && zInFrontTopLeft <= inputImages_depth-1;
721 |         bool frontTopRightIsIn = xInFrontTopLeft+1 >= 0 && xInFrontTopLeft+1 <= inputImages_width-1 && yInFrontTopLeft >= 0 && yInFrontTopLeft <= inputImages_height-1 && zInFrontTopLeft >= 0 && zInFrontTopLeft <= inputImages_depth-1;
722 |         bool frontBottomLeftIsIn = xInFrontTopLeft >= 0 && xInFrontTopLeft <= inputImages_width-1 && yInFrontTopLeft+1 >= 0 && yInFrontTopLeft+1 <= inputImages_height-1 && zInFrontTopLeft >= 0 && zInFrontTopLeft <= inputImages_depth-1;
723 |         bool frontBottomRightIsIn = xInFrontTopLeft+1 >= 0 && xInFrontTopLeft+1 <= inputImages_width-1 && yInFrontTopLeft+1 >= 0 && yInFrontTopLeft+1 <= inputImages_height-1 && zInFrontTopLeft >= 0 && zInFrontTopLeft <= inputImages_depth-1;
724 |             
725 |         bool backTopLeftIsIn = xInFrontTopLeft >= 0 && xInFrontTopLeft <= inputImages_width-1 && yInFrontTopLeft >= 0 && yInFrontTopLeft <= inputImages_height-1 && zInFrontTopLeft+1>= 0 && zInFrontTopLeft+1<= inputImages_depth-1;
726 |         bool backTopRightIsIn = xInFrontTopLeft+1 >= 0 && xInFrontTopLeft+1 <= inputImages_width-1 && yInFrontTopLeft >= 0 && yInFrontTopLeft <= inputImages_height-1 && zInFrontTopLeft+1 >= 0 && zInFrontTopLeft+1 <= inputImages_depth-1;
727 |         bool backBottomLeftIsIn = xInFrontTopLeft >= 0 && xInFrontTopLeft <= inputImages_width-1 && yInFrontTopLeft+1 >= 0 && yInFrontTopLeft+1 <= inputImages_height-1 && zInFrontTopLeft+1 >= 0 && zInFrontTopLeft+1 <= inputImages_depth-1;
728 |         bool backBottomRightIsIn = xInFrontTopLeft+1 >= 0 && xInFrontTopLeft+1 <= inputImages_width-1 && yInFrontTopLeft+1 >= 0 && yInFrontTopLeft+1 <= inputImages_height-1 && zInFrontTopLeft+1 >= 0 && zInFrontTopLeft+1 <= inputImages_depth-1;
729 | 
730 |         int t;
731 | 
732 |         for(t=0; t<inputImages_channels; t++)
733 |         {
734 |            real gradOutValue = gradOutput_data[gradOutputAddress + t];
735 |            if(frontTopLeftIsIn)
736 |            {
737 |               real inFrontTopLeft = inputImages_data[inFrontTopLeftAddress + t];
738 |               frontTopLeftDotProduct += inFrontTopLeft * gradOutValue;
739 |               if(!onlyGrid) gradInputImages_data[gradInputImagesFrontTopLeftAddress + t] += xWeightFrontTopLeft * yWeightFrontTopLeft * zWeightFrontTopLeft * gradOutValue;
740 |            }
741 | 
742 |            if(frontTopRightIsIn)
743 |            {
744 |               real inFrontTopRight = inputImages_data[inFrontTopRightAddress + t];
745 |               frontTopRightDotProduct += inFrontTopRight * gradOutValue;
746 |               if(!onlyGrid) gradInputImages_data[gradInputImagesFrontTopRightAddress + t] += (1 - xWeightFrontTopLeft) * yWeightFrontTopLeft * zWeightFrontTopLeft * gradOutValue;
747 |            }
748 | 
749 |            if(frontBottomLeftIsIn)
750 |            {
751 |               real inFrontBottomLeft = inputImages_data[inFrontBottomLeftAddress + t];
752 |               frontBottomLeftDotProduct += inFrontBottomLeft * gradOutValue;
753 |               if(!onlyGrid) gradInputImages_data[gradInputImagesFrontBottomLeftAddress + t] += xWeightFrontTopLeft * (1 - yWeightFrontTopLeft) * zWeightFrontTopLeft * gradOutValue;
754 |            }
755 | 
756 |            if(frontBottomRightIsIn)
757 |            {
758 |               real inFrontBottomRight = inputImages_data[inFrontBottomRightAddress + t];
759 |               frontBottomRightDotProduct += inFrontBottomRight * gradOutValue;
760 |               if(!onlyGrid) gradInputImages_data[gradInputImagesFrontBottomRightAddress + t] += (1 - xWeightFrontTopLeft) * (1 - yWeightFrontTopLeft) * zWeightFrontTopLeft * gradOutValue;
761 |            }
762 |             
763 |            if(backTopLeftIsIn)
764 |             {
765 |                 real inBackTopLeft = inputImages_data[inBackTopLeftAddress + t];
766 |                 backTopLeftDotProduct += inBackTopLeft * gradOutValue;
767 |                 if(!onlyGrid) gradInputImages_data[gradInputImagesBackTopLeftAddress + t] += xWeightFrontTopLeft * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) * gradOutValue;
768 |             }
769 |             
770 |            if(backTopRightIsIn)
771 |             {
772 |                 real inBackTopRight = inputImages_data[inBackTopRightAddress + t];
773 |                 backTopRightDotProduct += inBackTopRight * gradOutValue;
774 |                 if(!onlyGrid) gradInputImages_data[gradInputImagesBackTopRightAddress + t] += (1 - xWeightFrontTopLeft) * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) * gradOutValue;
775 |             }
776 |             
777 |            if(backBottomLeftIsIn)
778 |             {
779 |                 real inBackBottomLeft = inputImages_data[inBackBottomLeftAddress + t];
780 |                 backBottomLeftDotProduct += inBackBottomLeft * gradOutValue;
781 |                 if(!onlyGrid) gradInputImages_data[gradInputImagesBackBottomLeftAddress + t] += xWeightFrontTopLeft * (1 - yWeightFrontTopLeft) * (1-zWeightFrontTopLeft) * gradOutValue;
782 |             }
783 |             
784 |            if(backBottomRightIsIn)
785 |             {
786 |                 real inBackBottomRight = inputImages_data[inBackBottomRightAddress + t];
787 |                 backBottomRightDotProduct += inBackBottomRight * gradOutValue;
788 |                 if(!onlyGrid) gradInputImages_data[gradInputImagesBackBottomRightAddress + t] += (1 - xWeightFrontTopLeft) * (1 - yWeightFrontTopLeft) * (1-zWeightFrontTopLeft) * gradOutValue;
789 |             }
790 | 
791 |         }
792 | 
793 |             
794 |         real dyf = frontTopLeftDotProduct * xWeightFrontTopLeft * zWeightFrontTopLeft * (-1)
795 |             + backTopLeftDotProduct * xWeightFrontTopLeft * (1-zWeightFrontTopLeft) * (-1)
796 |             + frontTopRightDotProduct * (1-xWeightFrontTopLeft) * zWeightFrontTopLeft * (-1)
797 |             + backTopRightDotProduct * (1-xWeightFrontTopLeft) * (1-zWeightFrontTopLeft) *(-1)
798 |             + frontBottomLeftDotProduct * xWeightFrontTopLeft * zWeightFrontTopLeft * (1)
799 |             + backBottomLeftDotProduct * xWeightFrontTopLeft * (1-zWeightFrontTopLeft) * (1)
800 |             + frontBottomRightDotProduct * (1-xWeightFrontTopLeft) * zWeightFrontTopLeft * (1)
801 |             + backBottomRightDotProduct * (1-xWeightFrontTopLeft) * (1-zWeightFrontTopLeft) *(1);
802 |             
803 |         real dxf = frontTopLeftDotProduct * yWeightFrontTopLeft * zWeightFrontTopLeft *(-1)
804 |             + backTopLeftDotProduct * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) *(-1)
805 |             + frontTopRightDotProduct * yWeightFrontTopLeft * zWeightFrontTopLeft * 1
806 |             + backTopRightDotProduct * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) * 1
807 |             + frontBottomLeftDotProduct * (1-yWeightFrontTopLeft) * zWeightFrontTopLeft * (-1)
808 |             + backBottomLeftDotProduct * (1-yWeightFrontTopLeft) * (1-zWeightFrontTopLeft) * (-1)
809 |             + frontBottomRightDotProduct * (1-yWeightFrontTopLeft) * zWeightFrontTopLeft * 1
810 |             + backBottomRightDotProduct * (1-yWeightFrontTopLeft) *(1-zWeightFrontTopLeft) * 1;
811 |             
812 |         real dzf = frontTopLeftDotProduct * yWeightFrontTopLeft * xWeightFrontTopLeft * (-1)
813 |             + backTopLeftDotProduct * yWeightFrontTopLeft * xWeightFrontTopLeft *1
814 |             + frontTopRightDotProduct * yWeightFrontTopLeft * (1-xWeightFrontTopLeft) *(-1)
815 |             + backTopRightDotProduct * yWeightFrontTopLeft * (1-xWeightFrontTopLeft) *1
816 |             + frontBottomLeftDotProduct * (1-yWeightFrontTopLeft) * xWeightFrontTopLeft * (-1)
817 |             + backBottomLeftDotProduct * (1-yWeightFrontTopLeft) * xWeightFrontTopLeft * 1
818 |             + frontBottomRightDotProduct * (1-yWeightFrontTopLeft) * (1-xWeightFrontTopLeft) *(-1)
819 |             + backBottomRightDotProduct * (1-yWeightFrontTopLeft) * (1-xWeightFrontTopLeft) * 1;
820 |     
821 |         gradGrids_data[b*gradGrids_strideBatch + disOut*gradGrids_strideDepth + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth] = dzf * (inputImages_depth-1) / 2;
822 |         gradGrids_data[b*gradGrids_strideBatch + disOut*gradGrids_strideDepth + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + 1] = dyf * (inputImages_height-1) / 2;
823 |         gradGrids_data[b*gradGrids_strideBatch + disOut*gradGrids_strideDepth + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + 2] = dxf * (inputImages_width-1) / 2;
824 |         gradGrids_data[b*gradGrids_strideBatch + disOut*gradGrids_strideDepth + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + 3] = 0;
825 |         //  -(dyf* (inputImages_height-1) / 2*yf + dxf* (inputImages_width-1) / 2*xf + dzf* (inputImages_depth-1) / 2*(zf+focal_length+0.5))/disf;
826 |       }
827 |     }
828 |   }
829 |   }
830 |   return 1;
831 | }
832 | 
833 | static const struct luaL_Reg nn_(BilinearSamplerPerspective__) [] = {
834 |   {"BilinearSamplerPerspective_updateOutput", nn_(BilinearSamplerPerspective_updateOutput)},
835 |   {"BilinearSamplerPerspective_updateGradInput", nn_(BilinearSamplerPerspective_updateGradInput)},
836 |   {"BilinearSamplerBHWD_updateOutput", nn_(BilinearSamplerBHWD_updateOutput)},
837 |   {"BilinearSamplerBHWD_updateGradInput", nn_(BilinearSamplerBHWD_updateGradInput)},
838 |   {NULL, NULL}
839 | };
840 | 
841 | static void nn_(BilinearSamplerPerspective_init)(lua_State *L)
842 | {
843 |   luaT_pushmetatable(L, torch_Tensor);
844 |   luaT_registeratname(L, nn_(BilinearSamplerPerspective__), "nn");
845 |   lua_pop(L,1);
846 | }
847 | 
848 | #endif
849 | 


--------------------------------------------------------------------------------
/BilinearSamplerPerspective.cu:
--------------------------------------------------------------------------------
   1 | #include "utils.h"
   2 | 
   3 | // code adapted from github repo
   4 | // implemented by Yijie Guo (guoyijie@umich.edu) and Xinchen Yan (skywalkeryxc@gmail.com)
   5 | 
   6 | // Bilinear sampling is done in BHWD (coalescing is not obvious in BDHW)
   7 | // we assume BHWD format in inputImages
   8 | // we assume BHW(YX) format on grids
   9 | 
  10 | __device__ void getTopLeftFront(float x, int width, int& point, float& weight)
  11 | {
  12 |    /* for interpolation :
  13 |       stores in point and weight :
  14 |       - the x-coordinate of the pixel on the left (or y-coordinate of the upper pixel)
  15 |       - the weight for interpolating
  16 |    */
  17 | 
  18 |    float xcoord = (x + 1) * (width - 1) / 2;
  19 |    point = floor(xcoord);
  20 |    weight = 1 - (xcoord - point);
  21 | }
  22 | 
  23 | __device__ bool between(int value, int lowerBound, int upperBound)
  24 | {
  25 |    return (value >= lowerBound && value <= upperBound);
  26 | }
  27 | 
  28 | __device__ void sumReduceShMem(volatile float s[])
  29 | {
  30 |    /* obviously only works for 32 elements */
  31 |    /* sums up a shared memory array of 32 elements, stores it in s[0] */
  32 |    /* whole warp can then read first element (broadcasting) */
  33 |    //if(threadIdx.x<32) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+32]; }
  34 |    if(threadIdx.x<16) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+16] + s[threadIdx.x+32]; }
  35 |    if(threadIdx.x<8) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+8]; }
  36 |    if(threadIdx.x<4) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+4]; }
  37 |    if(threadIdx.x<2) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+2]; }
  38 |    if(threadIdx.x<1) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+1]; }
  39 | }
  40 | 
  41 | __device__ void sumReduceShMemPerspective(volatile float s[])
  42 | {
  43 |    /* obviously only works for 32 elements */
  44 |    /* sums up a shared memory array of 32 elements, stores it in s[0] */
  45 |    /* whole warp can then read first element (broadcasting) */
  46 |    if(threadIdx.x<16) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+16]; }
  47 |    if(threadIdx.x<8) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+8]; }
  48 |    if(threadIdx.x<4) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+4]; }
  49 |    if(threadIdx.x<2) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+2]; }
  50 |    if(threadIdx.x<1) { s[threadIdx.x] = s[threadIdx.x] + s[threadIdx.x+1]; }
  51 | }
  52 | 
  53 | 
  54 | // Affine Transformation
  55 | 
  56 | __global__ void bilinearSamplingFromGrid(float* inputImages_data, int inputImages_strideBatch, int inputImages_strideChannels, int inputImages_strideDepth, int inputImages_strideHeight, int inputImages_strideWidth,
  57 |                                          float* grids_data, int grids_strideBatch, int grids_strideYX, int grids_strideDepth, int grids_strideHeight, int grids_strideWidth,
  58 |                                          float* output_data, int output_strideBatch, int output_strideChannels, int output_strideDepth, int output_strideHeight, int output_strideWidth,
  59 |                                          int inputImages_channels, int inputImages_depth, int inputImages_height, int inputImages_width, int output_depth, int output_width)
  60 | {
  61 |    // each (32,16) block 16 output pixels (for coalescing the grid read)
  62 |    // x,y = coordinates (xOut = blockIdx.x*16+blockDim.y+threadIdx.y)
  63 |    // z = batch index
  64 |    // threadIdx.x : used for features (coalescing is trivial)
  65 |       
  66 |    const int xOut = blockIdx.x*blockDim.y+threadIdx.y;
  67 |    const bool withinImageBounds = xOut < output_width;
  68 |    const bool withinGridBounds = blockIdx.x*blockDim.y + threadIdx.x / 3 < output_width;
  69 |    const int yOut = blockIdx.y;
  70 |    const int width = inputImages_width;
  71 |    const int height = inputImages_height;
  72 |    const int depth = output_depth;
  73 | 
  74 |    const int b = blockIdx.z/depth;
  75 |    const int zOut = blockIdx.z%depth;
  76 | 
  77 |    float zf, yf,xf;
  78 | 
  79 |    
  80 |    __shared__ float gridData[48];
  81 |    if (threadIdx.y==0 && withinGridBounds)
  82 |    {
  83 |       gridData[threadIdx.x] = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth + threadIdx.x];
  84 |    }
  85 |    __syncthreads();
  86 |    if(!withinImageBounds) return;
  87 |    zf = gridData[threadIdx.y*3];
  88 |    yf = gridData[threadIdx.y*3+1];
  89 |    xf = gridData[threadIdx.y*3+2];
  90 |    //printf("%.3f %.3f %.3f\n", zf, yf, xf);
  91 |   
  92 |    int yInTopLeftFront, xInTopLeftFront, zInTopLeftFront;
  93 |    float yWeightTopLeftFront, xWeightTopLeftFront, zWeightTopLeftFront;
  94 |    getTopLeftFront(zf, inputImages_depth, zInTopLeftFront, zWeightTopLeftFront);
  95 |    getTopLeftFront(yf, inputImages_height, yInTopLeftFront, yWeightTopLeftFront);
  96 |    getTopLeftFront(xf, inputImages_width, xInTopLeftFront, xWeightTopLeftFront);
  97 |   
  98 |    //printf("GPU y[%.3f] x[%.3f] z[%.3f] WeightTopLeftFront\n",yWeightTopLeftFront, xWeightTopLeftFront, zWeightTopLeftFront);
  99 |   // printf("GPU y[%d] x[%d] z[%d] InTopLeftFront\n",yInTopLeftFront, xInTopLeftFront, zInTopLeftFront);
 100 |    
 101 |    const int outAddress = output_strideBatch * b + output_strideHeight * yOut + output_strideWidth * xOut + output_strideDepth * zOut;
 102 |    
 103 |    const int inTopLeftFrontAddress = inputImages_strideBatch * b + inputImages_strideHeight * yInTopLeftFront
 104 |   + inputImages_strideWidth * xInTopLeftFront + inputImages_strideDepth * zInTopLeftFront;
 105 | 
 106 |    const int inTopLeftBackAddress = inTopLeftFrontAddress + inputImages_strideDepth;
 107 | 
 108 |    const int inTopRightFrontAddress = inTopLeftFrontAddress + inputImages_strideWidth;
 109 |    const int inTopRightBackAddress = inTopRightFrontAddress + inputImages_strideDepth;
 110 | 
 111 |    const int inBottomLeftFrontAddress = inTopLeftFrontAddress + inputImages_strideHeight;
 112 |    const int inBottomLeftBackAddress = inBottomLeftFrontAddress + inputImages_strideDepth;
 113 | 
 114 |    const int inBottomRightFrontAddress = inBottomLeftFrontAddress + inputImages_strideWidth;
 115 |    const int inBottomRightBackAddress = inBottomRightFrontAddress + inputImages_strideDepth;
 116 | 
 117 |    float v=0;
 118 |    float inTopLeftFront=0;
 119 |    float inTopLeftBack=0;
 120 |    float inTopRightFront=0;
 121 |    float inTopRightBack=0;
 122 |    float inBottomLeftFront=0;
 123 |    float inBottomLeftBack=0;
 124 |    float inBottomRightFront=0;
 125 |    float inBottomRightBack=0;
 126 | 
 127 |    bool topLeftFrontIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 128 |       && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 129 |       
 130 |    bool topLeftBackIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 131 |       && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 132 |       
 133 |    bool topRightFrontIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 134 |       && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 135 |       
 136 |    bool topRightBackIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 137 |       && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 138 |       
 139 |    bool bottomLeftFrontIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 140 |       && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 141 |       
 142 |    bool bottomLeftBackIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 143 |       && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 144 |       
 145 |    bool bottomRightFrontIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 146 |       && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 147 |       
 148 |    bool bottomRightBackIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 149 |       && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 150 | 
 151 | 
 152 | // interpolation happens here
 153 |    for(int t=threadIdx.x; t<inputImages_channels; t+= blockDim.x)
 154 |    {
 155 |       if(topLeftFrontIsIn) inTopLeftFront = inputImages_data[inTopLeftFrontAddress + t];
 156 |       if(topLeftBackIsIn) inTopLeftBack = inputImages_data[inTopLeftBackAddress + t];
 157 | 
 158 |       if(topRightFrontIsIn) inTopRightFront = inputImages_data[inTopRightFrontAddress + t];
 159 |       if(topRightBackIsIn) inTopRightBack = inputImages_data[inTopRightBackAddress + t];
 160 | 
 161 |       if(bottomLeftFrontIsIn) inBottomLeftFront = inputImages_data[inBottomLeftFrontAddress + t];
 162 |       if(bottomLeftBackIsIn) inBottomLeftBack = inputImages_data[inBottomLeftBackAddress + t];
 163 | 
 164 |       if(bottomRightFrontIsIn) inBottomRightFront = inputImages_data[inBottomRightFrontAddress + t];
 165 |       if(bottomRightBackIsIn) inBottomRightBack = inputImages_data[inBottomRightBackAddress + t];
 166 | 
 167 | 
 168 |       v = xWeightTopLeftFront * yWeightTopLeftFront * zWeightTopLeftFront * inTopLeftFront
 169 |         + xWeightTopLeftFront * yWeightTopLeftFront * (1-zWeightTopLeftFront) * inTopLeftBack
 170 |         + (1 - xWeightTopLeftFront) * yWeightTopLeftFront * zWeightTopLeftFront * inTopRightFront
 171 |         + (1 - xWeightTopLeftFront) * yWeightTopLeftFront * (1-zWeightTopLeftFront) * inTopRightBack
 172 |         + xWeightTopLeftFront * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * inBottomLeftFront
 173 |         + xWeightTopLeftFront * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * inBottomLeftBack
 174 |         + (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * inBottomRightFront
 175 |         + (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * inBottomRightBack;
 176 | 
 177 |       output_data[outAddress + t] = v;
 178 |    }
 179 | }
 180 | 
 181 | 
 182 | static int cunn_BilinearSamplerBHWD_updateOutput(lua_State *L)
 183 | {
 184 |   THCState *state = getCutorchState(L);
 185 |   THCudaTensor *inputImages = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
 186 |   THCudaTensor *grids = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
 187 |   THCudaTensor *output = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");
 188 |   //printf("GPU hello\n");
 189 | 
 190 |    dim3 blocks((output->size[3]+15)/16, output->size[2], output->size[0]*output->size[1]);
 191 |    dim3 threads(48,16);
 192 | 
 193 |    /* assume BHWD */
 194 |    bilinearSamplingFromGrid <<< blocks, threads, 0, THCState_getCurrentStream(state) >>> (THCudaTensor_data(state, inputImages), 
 195 |                                                       THCudaTensor_stride(state, inputImages, 0), 
 196 |                                                       THCudaTensor_stride(state, inputImages, 4),
 197 |                                                       THCudaTensor_stride(state, inputImages, 1), 
 198 |                                                       THCudaTensor_stride(state, inputImages, 2),
 199 |                                                       THCudaTensor_stride(state, inputImages, 3),
 200 | 
 201 |                                                       THCudaTensor_data(state, grids),
 202 |                                                       THCudaTensor_stride(state, grids, 0), 
 203 |                                                       THCudaTensor_stride(state, grids, 4),
 204 |                                                       THCudaTensor_stride(state, grids, 1), 
 205 |                                                       THCudaTensor_stride(state, grids, 2),
 206 |                                                       THCudaTensor_stride(state, grids, 3),
 207 | 
 208 |                                                       THCudaTensor_data(state, output),  
 209 |                                                       THCudaTensor_stride(state, output, 0), 
 210 |                                                       THCudaTensor_stride(state, output, 4),
 211 |                                                       THCudaTensor_stride(state, output, 1), 
 212 |                                                       THCudaTensor_stride(state, output, 2),
 213 |                                                       THCudaTensor_stride(state, output, 3),
 214 | 
 215 |                                                       THCudaTensor_size(state, inputImages, 4),
 216 |                                                       THCudaTensor_size(state, inputImages, 1), 
 217 |                                                       THCudaTensor_size(state, inputImages, 2),
 218 |                                                       THCudaTensor_size(state, inputImages, 3),
 219 |                                                       THCudaTensor_size(state, output, 1),
 220 |                                                       THCudaTensor_size(state, output, 3));
 221 | 
 222 | 
 223 |   // check for errors
 224 |   cudaError_t err = cudaGetLastError();
 225 |   if (err != cudaSuccess) {
 226 |     printf("error in BilinearSampler.updateOutput: %s\n", cudaGetErrorString(err));
 227 |     THError("aborting");
 228 |   }
 229 |   //printf("GPU forward end!\n");
 230 |   return 1;
 231 | }
 232 | 
 233 | template<bool onlyGrid> __global__ void backwardBilinearSampling(float* inputImages_data, int inputImages_strideBatch, int inputImages_strideChannels, int inputImages_strideDepth, int inputImages_strideHeight, int inputImages_strideWidth,
 234 |                                          float* gradInputImages_data, int gradInputImages_strideBatch, int gradInputImages_strideChannels, int gradInputImages_strideDepth, int gradInputImages_strideHeight, int gradInputImages_strideWidth,
 235 |                                          float* grids_data, int grids_strideBatch, int grids_strideYX, int grids_strideDepth, int grids_strideHeight, int grids_strideWidth,
 236 |                                          float* gradGrids_data, int gradGrids_strideBatch, int gradGrids_strideYX, int gradGrids_strideDepth, int gradGrids_strideHeight, int gradGrids_strideWidth,
 237 |                                          float* gradOutput_data, int gradOutput_strideBatch, int gradOutput_strideChannels, int gradOutput_strideDepth, int gradOutput_strideHeight, int gradOutput_strideWidth,
 238 |                                          int inputImages_channels, int inputImages_depth, int inputImages_height, int inputImages_width, int gradOutput_depth, int gradOutput_width)
 239 | {
 240 |    // each (32,16) block 16 output pixels (for coalescing the grid read)
 241 |    // x,y = coordinates
 242 |    // z = batch index
 243 |    // threads : used for features
 244 |       
 245 |    const int xOut = blockIdx.x*blockDim.y+threadIdx.y;
 246 |    const bool withinImageBounds = xOut < gradOutput_width;
 247 |    const bool withinGridBounds = blockIdx.x*blockDim.y + threadIdx.x / 3 < gradOutput_width;
 248 | 
 249 |    const int yOut = blockIdx.y;
 250 |    const int width = inputImages_width;
 251 |    const int height = inputImages_height;
 252 | 
 253 |    const int depth = gradOutput_depth;
 254 | 
 255 |    const int b = blockIdx.z/depth;
 256 |    const int zOut = blockIdx.z%depth;
 257 |    
 258 |    float yf,xf, zf;
 259 | 
 260 |    __shared__ float gridData[48];
 261 |    if (threadIdx.y==0 && withinGridBounds)
 262 |    {
 263 |       gridData[threadIdx.x] = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + zOut*grids_strideDepth + threadIdx.x];
 264 |    }
 265 |    __syncthreads();
 266 | 
 267 |    if(withinImageBounds)
 268 |    {
 269 |       zf = gridData[threadIdx.y*3];
 270 |       yf = gridData[threadIdx.y*3+1];
 271 |       xf = gridData[threadIdx.y*3+2];
 272 | 
 273 |       int yInTopLeftFront, xInTopLeftFront, zInTopLeftFront;
 274 |       float yWeightTopLeftFront, xWeightTopLeftFront, zWeightTopLeftFront;
 275 |       getTopLeftFront(zf, inputImages_depth, zInTopLeftFront, zWeightTopLeftFront);
 276 |       getTopLeftFront(yf, inputImages_height, yInTopLeftFront, yWeightTopLeftFront);
 277 |       getTopLeftFront(xf, inputImages_width, xInTopLeftFront, xWeightTopLeftFront);
 278 | 
 279 |       const int inTopLeftFrontAddress = inputImages_strideBatch * b + inputImages_strideHeight * yInTopLeftFront
 280 |   + inputImages_strideWidth * xInTopLeftFront + inputImages_strideDepth * zInTopLeftFront;
 281 | 
 282 |       const int inTopLeftBackAddress = inTopLeftFrontAddress + inputImages_strideDepth;
 283 | 
 284 |       const int inTopRightFrontAddress = inTopLeftFrontAddress + inputImages_strideWidth;
 285 |       const int inTopRightBackAddress = inTopRightFrontAddress + inputImages_strideDepth;
 286 | 
 287 |       const int inBottomLeftFrontAddress = inTopLeftFrontAddress + inputImages_strideHeight;
 288 |       const int inBottomLeftBackAddress = inBottomLeftFrontAddress + inputImages_strideDepth;
 289 | 
 290 |       const int inBottomRightFrontAddress = inBottomLeftFrontAddress + inputImages_strideWidth;
 291 |       const int inBottomRightBackAddress = inBottomRightFrontAddress + inputImages_strideDepth;
 292 | 
 293 |       const int gradInputImagesTopLeftFrontAddress = gradInputImages_strideBatch * b + gradInputImages_strideHeight * yInTopLeftFront + gradInputImages_strideWidth * xInTopLeftFront + gradInputImages_strideDepth * zInTopLeftFront;
 294 |       const int gradInputImagesTopLeftBackAddress = gradInputImagesTopLeftFrontAddress + gradInputImages_strideDepth;
 295 |       const int gradInputImagesTopRightFrontAddress = gradInputImagesTopLeftFrontAddress + gradInputImages_strideWidth;
 296 |       const int gradInputImagesTopRightBackAddress = gradInputImagesTopRightFrontAddress + gradInputImages_strideDepth;
 297 | 
 298 |       const int gradInputImagesBottomLeftFrontAddress = gradInputImagesTopLeftFrontAddress + gradInputImages_strideHeight;
 299 |       const int gradInputImagesBottomLeftBackAddress = gradInputImagesBottomLeftFrontAddress + gradInputImages_strideDepth;
 300 |       const int gradInputImagesBottomRightFrontAddress = gradInputImagesBottomLeftFrontAddress + gradInputImages_strideWidth;
 301 |       const int gradInputImagesBottomRightBackAddress = gradInputImagesBottomRightFrontAddress + gradInputImages_strideDepth;
 302 | 
 303 | 
 304 |       const int gradOutputAddress = gradOutput_strideBatch * b + gradOutput_strideHeight * yOut + gradOutput_strideWidth * xOut + gradOutput_strideDepth * zOut;
 305 | 
 306 |       float topLeftFrontDotProduct = 0;
 307 |       float topLeftBackDotProduct = 0;
 308 |       float topRightFrontDotProduct = 0;
 309 |       float topRightBackDotProduct = 0;
 310 | 
 311 |       float bottomLeftFrontDotProduct = 0;
 312 |       float bottomLeftBackDotProduct = 0;
 313 |       float bottomRightFrontDotProduct = 0;
 314 |       float bottomRightBackDotProduct = 0;
 315 | 
 316 |       bool topLeftFrontIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 317 |          && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 318 |       bool topLeftBackIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 319 |          && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 320 | 
 321 |       bool topRightFrontIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 322 |          && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 323 |       bool topRightBackIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 324 |          && between(yInTopLeftFront, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 325 | 
 326 |       bool bottomLeftFrontIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 327 |          && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 328 |       bool bottomLeftBackIsIn = between(xInTopLeftFront, 0, inputImages_width-1) 
 329 |          && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 330 | 
 331 |       bool bottomRightFrontIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 332 |          && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront, 0, inputImages_depth-1);
 333 |       bool bottomRightBackIsIn = between(xInTopLeftFront+1, 0, inputImages_width-1) 
 334 |          && between(yInTopLeftFront+1, 0, inputImages_height-1) && between(zInTopLeftFront+1, 0, inputImages_depth-1);
 335 | 
 336 |       /*
 337 |          In that loop we accumulate
 338 |          - gradients into the gradInputImages array with atomic adds
 339 |          - we compute the dot product that we need for the grid gradient
 340 |       */
 341 | 
 342 |       for(int t=threadIdx.x; t<inputImages_channels; t+= blockDim.x)
 343 |       {
 344 |          float gradOutValue = gradOutput_data[gradOutputAddress + t];
 345 |          // bool between(int value, int lowerBound, int upperBound)
 346 |          if(topLeftFrontIsIn)
 347 |          {
 348 |             float inTopLeftFront = inputImages_data[inTopLeftFrontAddress + t];
 349 |             topLeftFrontDotProduct += inTopLeftFront * gradOutValue;
 350 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesTopLeftFrontAddress + t], xWeightTopLeftFront * yWeightTopLeftFront * zWeightTopLeftFront * gradOutValue);
 351 |          }
 352 | 
 353 |         if(topLeftBackIsIn)
 354 |          {
 355 |             float inTopLeftBack = inputImages_data[inTopLeftBackAddress + t];
 356 |             topLeftBackDotProduct += inTopLeftBack * gradOutValue;
 357 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesTopLeftBackAddress + t], xWeightTopLeftFront * yWeightTopLeftFront * (1-zWeightTopLeftFront) * gradOutValue);
 358 |          }
 359 | 
 360 |          if(topRightFrontIsIn)
 361 |          {
 362 |             float inTopRightFront = inputImages_data[inTopRightFrontAddress + t];
 363 |             topRightFrontDotProduct += inTopRightFront * gradOutValue;
 364 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesTopRightFrontAddress + t], (1 - xWeightTopLeftFront) * yWeightTopLeftFront * zWeightTopLeftFront * gradOutValue);
 365 |          }
 366 | 
 367 |          if(topRightBackIsIn)
 368 |          {
 369 |             float inTopRightBack = inputImages_data[inTopRightBackAddress + t];
 370 |             topRightBackDotProduct += inTopRightBack * gradOutValue;
 371 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesTopRightBackAddress + t], (1 - xWeightTopLeftFront) * yWeightTopLeftFront * (1-zWeightTopLeftFront) * gradOutValue);
 372 |           }
 373 | 
 374 |          if(bottomLeftFrontIsIn)
 375 |          {
 376 |             float inBottomLeftFront = inputImages_data[inBottomLeftFrontAddress + t];
 377 |             bottomLeftFrontDotProduct += inBottomLeftFront * gradOutValue;
 378 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBottomLeftFrontAddress + t], xWeightTopLeftFront * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * gradOutValue);
 379 |          }
 380 | 
 381 |          if(bottomLeftBackIsIn)
 382 |          {
 383 |             float inBottomLeftBack = inputImages_data[inBottomLeftBackAddress + t];
 384 |             bottomLeftBackDotProduct += inBottomLeftBack * gradOutValue;
 385 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBottomLeftBackAddress + t], xWeightTopLeftFront * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * gradOutValue);
 386 |           }
 387 | 
 388 |          if(bottomRightFrontIsIn)
 389 |          {
 390 |             float inBottomRightFront = inputImages_data[inBottomRightFrontAddress + t];
 391 |             bottomRightFrontDotProduct += inBottomRightFront * gradOutValue;
 392 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBottomRightFrontAddress + t], (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * zWeightTopLeftFront * gradOutValue);
 393 |          }
 394 | 
 395 |          if(bottomRightBackIsIn)
 396 |          {
 397 |             float inBottomRightBack = inputImages_data[inBottomRightBackAddress + t];
 398 |             bottomRightBackDotProduct += inBottomRightBack * gradOutValue;
 399 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBottomRightBackAddress + t], (1 - xWeightTopLeftFront) * (1 - yWeightTopLeftFront) * (1-zWeightTopLeftFront) * gradOutValue);
 400 |           }
 401 | 
 402 | 
 403 |       }
 404 | 
 405 |       /*
 406 |          Here we reduce the dot product and compute the grid gradient before writing it.
 407 |       */
 408 | 
 409 |       /* could do shuffles and use no shmem at all but cuda arch is 2.0 */
 410 | /*
 411 |       __shared__ volatile float __shmem[16][32];
 412 |       __shmem[threadIdx.y][threadIdx.x] = topLeftDotProduct;
 413 |       sumReduceShMem(__shmem[threadIdx.y]);
 414 |       topLeftDotProduct = __shmem[threadIdx.y][0];
 415 | 
 416 |       __shmem[threadIdx.y][threadIdx.x] = topRightDotProduct;
 417 |       sumReduceShMem(__shmem[threadIdx.y]);
 418 |       topRightDotProduct = __shmem[threadIdx.y][0];
 419 | 
 420 |       __shmem[threadIdx.y][threadIdx.x] = bottomLeftDotProduct;
 421 |       sumReduceShMem(__shmem[threadIdx.y]);
 422 |       bottomLeftDotProduct = __shmem[threadIdx.y][0];
 423 | 
 424 |       __shmem[threadIdx.y][threadIdx.x] = bottomRightDotProduct;
 425 |       sumReduceShMem(__shmem[threadIdx.y]);
 426 |       bottomRightDotProduct = __shmem[threadIdx.y][0];
 427 | */
 428 | 
 429 | __shared__ volatile float __shmem[16][48];
 430 | __shmem[threadIdx.y][threadIdx.x] = topLeftFrontDotProduct;
 431 | sumReduceShMem(__shmem[threadIdx.y]);
 432 | topLeftFrontDotProduct = __shmem[threadIdx.y][0];
 433 | //__syncthreads();
 434 | 
 435 | __shmem[threadIdx.y][threadIdx.x] = topLeftBackDotProduct;
 436 | sumReduceShMem(__shmem[threadIdx.y]);
 437 | topLeftBackDotProduct = __shmem[threadIdx.y][0];
 438 | //__syncthreads();
 439 | 
 440 | 
 441 | __shmem[threadIdx.y][threadIdx.x] = topRightFrontDotProduct;
 442 | sumReduceShMem(__shmem[threadIdx.y]);
 443 | topRightFrontDotProduct = __shmem[threadIdx.y][0];
 444 | //__syncthreads();
 445 | 
 446 | __shmem[threadIdx.y][threadIdx.x] = topRightBackDotProduct;
 447 | sumReduceShMem(__shmem[threadIdx.y]);
 448 | topRightBackDotProduct = __shmem[threadIdx.y][0];
 449 | //__syncthreads();
 450 | 
 451 | __shmem[threadIdx.y][threadIdx.x] = bottomLeftFrontDotProduct;
 452 | sumReduceShMem(__shmem[threadIdx.y]);
 453 | bottomLeftFrontDotProduct = __shmem[threadIdx.y][0];
 454 | //__syncthreads();
 455 | 
 456 | __shmem[threadIdx.y][threadIdx.x] = bottomLeftBackDotProduct;
 457 | sumReduceShMem(__shmem[threadIdx.y]);
 458 | bottomLeftBackDotProduct = __shmem[threadIdx.y][0];
 459 | //__syncthreads();
 460 | 
 461 | __shmem[threadIdx.y][threadIdx.x] = bottomRightFrontDotProduct;
 462 | sumReduceShMem(__shmem[threadIdx.y]);
 463 | bottomRightFrontDotProduct = __shmem[threadIdx.y][0];
 464 | //__syncthreads();
 465 | 
 466 | __shmem[threadIdx.y][threadIdx.x] = bottomRightBackDotProduct;
 467 | sumReduceShMem(__shmem[threadIdx.y]);
 468 | bottomRightBackDotProduct = __shmem[threadIdx.y][0];
 469 | //__syncthreads();
 470 | 
 471 | 
 472 |       yf = topLeftFrontDotProduct * xWeightTopLeftFront * zWeightTopLeftFront * (-1)
 473 |          + topLeftBackDotProduct * xWeightTopLeftFront * (1-zWeightTopLeftFront) * (-1)
 474 |          + topRightFrontDotProduct * (1-xWeightTopLeftFront) * zWeightTopLeftFront * (-1)
 475 |          + topRightBackDotProduct * (1-xWeightTopLeftFront) * (1-zWeightTopLeftFront) *(-1)
 476 |          + bottomLeftFrontDotProduct * xWeightTopLeftFront * zWeightTopLeftFront * (1)
 477 |          + bottomLeftBackDotProduct * xWeightTopLeftFront * (1-zWeightTopLeftFront) * (1)
 478 |          + bottomRightFrontDotProduct * (1-xWeightTopLeftFront) * zWeightTopLeftFront * (1)
 479 |          + bottomRightBackDotProduct * (1-xWeightTopLeftFront) * (1-zWeightTopLeftFront) *(1);
 480 | 
 481 |       xf = topLeftFrontDotProduct * yWeightTopLeftFront * zWeightTopLeftFront *(-1)
 482 |          + topLeftBackDotProduct * yWeightTopLeftFront * (1-zWeightTopLeftFront) *(-1)
 483 |          + topRightFrontDotProduct * yWeightTopLeftFront * zWeightTopLeftFront * 1
 484 |          + topRightBackDotProduct * yWeightTopLeftFront * (1-zWeightTopLeftFront) * 1
 485 |          + bottomLeftFrontDotProduct * (1-yWeightTopLeftFront) * zWeightTopLeftFront * (-1)
 486 |          + bottomLeftBackDotProduct * (1-yWeightTopLeftFront) * (1-zWeightTopLeftFront) * (-1)
 487 |          + bottomRightFrontDotProduct * (1-yWeightTopLeftFront) * zWeightTopLeftFront * (1)
 488 |          + bottomRightBackDotProduct * (1-yWeightTopLeftFront) *(1-zWeightTopLeftFront) * (1);
 489 | 
 490 |       zf = topLeftFrontDotProduct * yWeightTopLeftFront * xWeightTopLeftFront * (-1)
 491 |          + topLeftBackDotProduct * yWeightTopLeftFront * xWeightTopLeftFront *(1)
 492 |          + topRightFrontDotProduct * yWeightTopLeftFront * (1-xWeightTopLeftFront) *(-1)
 493 |          + topRightBackDotProduct * yWeightTopLeftFront * (1-xWeightTopLeftFront) *(1)
 494 |          + bottomLeftFrontDotProduct * (1-yWeightTopLeftFront) * xWeightTopLeftFront * (-1)
 495 |          + bottomLeftBackDotProduct * (1-yWeightTopLeftFront) * xWeightTopLeftFront * (1)
 496 |          + bottomRightFrontDotProduct * (1-yWeightTopLeftFront) * (1-xWeightTopLeftFront) *(-1)
 497 |          + bottomRightBackDotProduct * (1-yWeightTopLeftFront) * (1-xWeightTopLeftFront) * 1;
 498 | 
 499 | 
 500 | 
 501 |       if(threadIdx.x==0)
 502 |       {
 503 |          gridData[threadIdx.y*3] = zf * (inputImages_depth-1) / 2;
 504 |          gridData[threadIdx.y*3+1] = yf * (inputImages_height-1) / 2;
 505 |          gridData[threadIdx.y*3+2] = xf * (inputImages_width-1) / 2;
 506 |       }
 507 |    }// must put a big if condition in order not to hang at __syncthreads()...
 508 |    __syncthreads();
 509 | 
 510 |    if(threadIdx.y==0 && withinGridBounds)      
 511 |        gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + zOut*gradGrids_strideDepth + threadIdx.x] = gridData[threadIdx.x];
 512 | }
 513 | 
 514 | static int cunn_BilinearSamplerBHWD_updateGradInput(lua_State *L)
 515 | {
 516 |   /*
 517 |      Lua entry point that launches backwardBilinearSampling<false> on the current
 518 |      cutorch stream. <false> is presumably the kernel's onlyGrid flag, meaning
 519 |      the gradient w.r.t. the images is accumulated as well (TODO confirm against
 520 |      the kernel's template header).
 521 | 
 522 |      Lua stack, every slot a torch.CudaTensor checked by luaT_checkudata:
 523 |        2 = inputImages, 3 = grids, 4 = gradInputImages, 5 = gradGrids, 6 = gradOutput
 524 | 
 525 |      Each tensor reaches the kernel as its data pointer followed by its strides
 526 |      for dims 0, 4, 1, 2, 3, in that order. Dim 4 of inputImages is passed first
 527 |      among the sizes, so 5-D tensors are assumed.
 528 | 
 529 |      Returns 1 on success; a launch error is printed and raised via THError.
 530 |   */
 531 |   THCState *state = getCutorchState(L);
 532 |   THCudaTensor *images     = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
 533 |   THCudaTensor *grids      = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
 534 |   THCudaTensor *gradImages = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");
 535 |   THCudaTensor *gradGrids  = (THCudaTensor *)luaT_checkudata(L, 5, "torch.CudaTensor");
 536 |   THCudaTensor *gradOut    = (THCudaTensor *)luaT_checkudata(L, 6, "torch.CudaTensor");
 537 | 
 538 |   // Strides of tensor t in the dimension order expected by the kernel.
 539 |   #define PTN_STRIDES_04123(t) \
 540 |     THCudaTensor_stride(state, t, 0), THCudaTensor_stride(state, t, 4), \
 541 |     THCudaTensor_stride(state, t, 1), THCudaTensor_stride(state, t, 2), \
 542 |     THCudaTensor_stride(state, t, 3)
 543 | 
 544 |   // One block per 16 positions of gradOutput dim 3, one block row per index
 545 |   // of dim 2, one block layer per (dim 0, dim 1) pair.
 546 |   const dim3 launchGrid((gradOut->size[3] + 15) / 16, gradOut->size[2], gradOut->size[0] * gradOut->size[1]);
 547 |   // 48 x 16 threads per block.
 548 |   const dim3 launchBlock(48, 16);
 549 | 
 550 |   backwardBilinearSampling <false> <<< launchGrid, launchBlock, 0, THCState_getCurrentStream(state) >>> (
 551 |       THCudaTensor_data(state, images),     PTN_STRIDES_04123(images),
 552 |       THCudaTensor_data(state, gradImages), PTN_STRIDES_04123(gradImages),
 553 |       THCudaTensor_data(state, grids),      PTN_STRIDES_04123(grids),
 554 |       THCudaTensor_data(state, gradGrids),  PTN_STRIDES_04123(gradGrids),
 555 |       THCudaTensor_data(state, gradOut),    PTN_STRIDES_04123(gradOut),
 556 |       // extents of inputImages dims 4, 1, 2, 3
 557 |       THCudaTensor_size(state, images, 4),
 558 |       THCudaTensor_size(state, images, 1),
 559 |       THCudaTensor_size(state, images, 2),
 560 |       THCudaTensor_size(state, images, 3),
 561 |       // extents of gradOutput dims 1 and 3
 562 |       THCudaTensor_size(state, gradOut, 1),
 563 |       THCudaTensor_size(state, gradOut, 3));
 564 | 
 565 |   #undef PTN_STRIDES_04123
 566 | 
 567 |   // Only launch errors are visible at this point; faults raised while the
 568 |   // kernel runs are reported by a later synchronizing call.
 569 |   const cudaError_t status = cudaGetLastError();
 570 |   if (status == cudaSuccess)
 571 |   {
 572 |     return 1;
 573 |   }
 574 | 
 575 |   printf("error in BilinearSampler.updateGradInput: %s\n", cudaGetErrorString(status));
 576 |   THError("aborting");
 577 |   return 1;
 578 | }
 579 | 
 580 | 
 581 | // Perspective Transformation
 582 | 
 583 | __global__ void bilinearSamplingFromGridPerspective(float* inputImages_data, int inputImages_strideBatch, int inputImages_strideChannels, int inputImages_strideDepth, int inputImages_strideHeight, int inputImages_strideWidth,
 584 | float* grids_data, int grids_strideBatch, int grids_strideYX, int grids_strideDepth, int grids_strideHeight, int grids_strideWidth,
 585 | float* output_data, int output_strideBatch, int output_strideChannels, int output_strideDist, int output_strideHeight, int output_strideWidth,
 586 | int inputImages_channels, int inputImages_depth, int inputImages_height, int inputImages_width, int output_dist, int output_width, float focal_length)
 587 | {
 588 |    /*
 589 |       Forward pass of the perspective sampler. For every output location the
 590 |       (z, y, x, dis) sample point is read from `grids` and each channel of the
 591 |       output receives the trilinear blend of the 8 surrounding input voxels;
 592 |       corners that fall outside the input volume contribute 0.
 593 | 
 594 |       Launch layout used by cunn_BilinearSamplerPerspective_updateOutput:
 595 |         blockDim = (32, 8); gridData needs blockDim.x <= 32, blockDim.y <= 8
 596 |         threadIdx.y : pixel within the block, xOut = blockIdx.x*blockDim.y + threadIdx.y
 597 |         threadIdx.x : channel lane (channels are walked in steps of blockDim.x)
 598 |         blockIdx.y  : output row (yOut);  blockIdx.z = b*output_dist + disOut
 599 | 
 600 |       The grid load adds threadIdx.x to the address of the block's first pixel,
 601 |       so it relies on 4 contiguous floats per grid entry (grids_strideWidth == 4
 602 |       assumed, TODO confirm). Channels are addressed with stride 1; grids_strideYX,
 603 |       the *_strideChannels arguments and focal_length are currently unused.
 604 |    */
 605 |    const int xOut = blockIdx.x*blockDim.y + threadIdx.y;
 606 |    const int yOut = blockIdx.y;
 607 |    const int b = blockIdx.z / output_dist;
 608 |    const int disOut = blockIdx.z % output_dist;
 609 |    const bool withinImageBounds = xOut < output_width;
 610 |    const bool withinGridBounds = blockIdx.x*blockDim.y + threadIdx.x / 4 < output_width;
 611 | 
 612 |    // Row 0 of the block stages the grid entries of all its pixels in shared
 613 |    // memory, one float per thread (xOut is the block's first pixel for row 0).
 614 |    __shared__ float gridData[32];
 615 |    if (threadIdx.y==0 && withinGridBounds)
 616 |    {
 617 |       gridData[threadIdx.x] = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + disOut*grids_strideDepth + threadIdx.x];
 618 |    }
 619 |    __syncthreads();
 620 | 
 621 |    // Only exit after the barrier so that every thread of the block reaches it.
 622 |    if(!withinImageBounds) return;
 623 | 
 624 |    const float zf = gridData[threadIdx.y*4];
 625 |    const float yf = gridData[threadIdx.y*4+1];
 626 |    const float xf = gridData[threadIdx.y*4+2];
 627 |    // Component 3 ("dis") is left unread while the perspective division stays disabled:
 628 |    //yf = yf / disf;
 629 |    //xf = xf / disf;
 630 |    //zf = zf / disf - (focal_length + 0.5);
 631 | 
 632 |    // Lower (front/top/left) corner of the interpolation cell and its weight on
 633 |    // each axis; the upper corner of an axis is weighted with (1 - w).
 634 |    int zIn, yIn, xIn;
 635 |    float zW, yW, xW;
 636 |    getTopLeftFront(zf, inputImages_depth, zIn, zW);
 637 |    getTopLeftFront(yf, inputImages_height, yIn, yW);
 638 |    getTopLeftFront(xf, inputImages_width, xIn, xW);
 639 | 
 640 |    const int outAddress = output_strideBatch * b + output_strideDist * disOut + output_strideHeight * yOut + output_strideWidth * xOut;
 641 | 
 642 |    // Offsets of the 8 cell corners; the channel index t is added in the loop.
 643 |    const int inFrontTopLeftAddress = inputImages_strideBatch * b + inputImages_strideDepth * zIn + inputImages_strideHeight * yIn + inputImages_strideWidth * xIn;
 644 |    const int inFrontTopRightAddress = inFrontTopLeftAddress + inputImages_strideWidth;
 645 |    const int inFrontBottomLeftAddress = inFrontTopLeftAddress + inputImages_strideHeight;
 646 |    const int inFrontBottomRightAddress = inFrontBottomLeftAddress + inputImages_strideWidth;
 647 |    const int inBackTopLeftAddress = inFrontTopLeftAddress + inputImages_strideDepth;
 648 |    const int inBackTopRightAddress = inBackTopLeftAddress + inputImages_strideWidth;
 649 |    const int inBackBottomLeftAddress = inBackTopLeftAddress + inputImages_strideHeight;
 650 |    const int inBackBottomRightAddress = inBackBottomLeftAddress + inputImages_strideWidth;
 651 | 
 652 |    // Per-axis validity of the lower (0) and upper (1) neighbour index.
 653 |    const bool x0In = between(xIn, 0, inputImages_width-1);
 654 |    const bool x1In = between(xIn+1, 0, inputImages_width-1);
 655 |    const bool y0In = between(yIn, 0, inputImages_height-1);
 656 |    const bool y1In = between(yIn+1, 0, inputImages_height-1);
 657 |    const bool z0In = between(zIn, 0, inputImages_depth-1);
 658 |    const bool z1In = between(zIn+1, 0, inputImages_depth-1);
 659 |    const bool frontTopLeftIsIn = x0In && y0In && z0In;
 660 |    const bool frontTopRightIsIn = x1In && y0In && z0In;
 661 |    const bool frontBottomLeftIsIn = x0In && y1In && z0In;
 662 |    const bool frontBottomRightIsIn = x1In && y1In && z0In;
 663 |    const bool backTopLeftIsIn = x0In && y0In && z1In;
 664 |    const bool backTopRightIsIn = x1In && y0In && z1In;
 665 |    const bool backBottomLeftIsIn = x0In && y1In && z1In;
 666 |    const bool backBottomRightIsIn = x1In && y1In && z1In;
 667 | 
 668 |    // The trilinear weights are channel independent: compute them once.
 669 |    const float wFrontTopLeft = xW * yW * zW;
 670 |    const float wFrontTopRight = (1 - xW) * yW * zW;
 671 |    const float wFrontBottomLeft = xW * (1 - yW) * zW;
 672 |    const float wFrontBottomRight = (1 - xW) * (1 - yW) * zW;
 673 |    const float wBackTopLeft = xW * yW * (1 - zW);
 674 |    const float wBackTopRight = (1 - xW) * yW * (1 - zW);
 675 |    const float wBackBottomLeft = xW * (1 - yW) * (1 - zW);
 676 |    const float wBackBottomRight = (1 - xW) * (1 - yW) * (1 - zW);
 677 | 
 678 |    // interpolation happens here
 679 |    for(int t=threadIdx.x; t<inputImages_channels; t+= blockDim.x)
 680 |    {
 681 |       const float inFrontTopLeft = frontTopLeftIsIn ? inputImages_data[inFrontTopLeftAddress + t] : 0.0f;
 682 |       const float inFrontTopRight = frontTopRightIsIn ? inputImages_data[inFrontTopRightAddress + t] : 0.0f;
 683 |       const float inFrontBottomLeft = frontBottomLeftIsIn ? inputImages_data[inFrontBottomLeftAddress + t] : 0.0f;
 684 |       const float inFrontBottomRight = frontBottomRightIsIn ? inputImages_data[inFrontBottomRightAddress + t] : 0.0f;
 685 |       const float inBackTopLeft = backTopLeftIsIn ? inputImages_data[inBackTopLeftAddress + t] : 0.0f;
 686 |       const float inBackTopRight = backTopRightIsIn ? inputImages_data[inBackTopRightAddress + t] : 0.0f;
 687 |       const float inBackBottomLeft = backBottomLeftIsIn ? inputImages_data[inBackBottomLeftAddress + t] : 0.0f;
 688 |       const float inBackBottomRight = backBottomRightIsIn ? inputImages_data[inBackBottomRightAddress + t] : 0.0f;
 689 |       output_data[outAddress + t] = wFrontTopLeft * inFrontTopLeft
 690 |         + wFrontTopRight * inFrontTopRight
 691 |         + wFrontBottomLeft * inFrontBottomLeft
 692 |         + wFrontBottomRight * inFrontBottomRight
 693 |         + wBackTopLeft * inBackTopLeft
 694 |         + wBackTopRight * inBackTopRight
 695 |         + wBackBottomLeft * inBackBottomLeft
 696 |         + wBackBottomRight * inBackBottomRight;
 697 |    }
 698 | }
 699 | 
 700 | 
 701 | static int cunn_BilinearSamplerPerspective_updateOutput(lua_State *L)
 702 | {
 703 |   /*
 704 |      Lua entry point for the forward pass of the perspective sampler: launches
 705 |      bilinearSamplingFromGridPerspective on the current cutorch stream.
 706 | 
 707 |      Lua stack: 2 = inputImages, 3 = grids, 4 = output (each a torch.CudaTensor),
 708 |      5 = focal_length (lua_tonumber gives 0 when it is not a number).
 709 | 
 710 |      Tensor dims are read as (batch, depth, height, width, channels); the kernel
 711 |      takes strides in the order batch, channels, depth, height, width.
 712 |   */
 713 |   THCState *state = getCutorchState(L);
 714 |   THCudaTensor *inputImages = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
 715 |   THCudaTensor *grids = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
 716 |   THCudaTensor *output = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");
 717 |   float focal_length = lua_tonumber(L, 5);
 718 | 
 719 |   // Strides of tensor t in kernel order, i.e. dims 0, 4, 1, 2, 3.
 720 |   #define PTN_STRIDES_04123(t) \
 721 |     THCudaTensor_stride(state, t, 0), THCudaTensor_stride(state, t, 4), \
 722 |     THCudaTensor_stride(state, t, 1), THCudaTensor_stride(state, t, 2), \
 723 |     THCudaTensor_stride(state, t, 3)
 724 | 
 725 |   // 8 positions of output dim 3 per block, one block row per index of dim 2,
 726 |   // and one block layer per (dim 0, dim 1) pair; 32 channel lanes per position.
 727 |   dim3 blocks((output->size[3]+7)/8, output->size[2], output->size[0]*output->size[1]);
 728 |   dim3 threads(32,8);
 729 | 
 730 |   bilinearSamplingFromGridPerspective <<< blocks, threads, 0, THCState_getCurrentStream(state) >>> (
 731 |       THCudaTensor_data(state, inputImages), PTN_STRIDES_04123(inputImages),
 732 |       THCudaTensor_data(state, grids),       PTN_STRIDES_04123(grids),
 733 |       THCudaTensor_data(state, output),      PTN_STRIDES_04123(output),
 734 |       THCudaTensor_size(state, inputImages, 4),   // inputImages_channels
 735 |       THCudaTensor_size(state, inputImages, 1),   // inputImages_depth
 736 |       THCudaTensor_size(state, inputImages, 2),   // inputImages_height
 737 |       THCudaTensor_size(state, inputImages, 3),   // inputImages_width
 738 |       THCudaTensor_size(state, output, 1),        // output_dist
 739 |       THCudaTensor_size(state, output, 3),        // output_width
 740 |       focal_length);
 741 |   #undef PTN_STRIDES_04123
 742 | 
 743 |   // Report launch errors under this binding's own name.
 744 |   cudaError_t err = cudaGetLastError();
 745 |   if (err != cudaSuccess) {
 746 |     printf("error in BilinearSamplerPerspective.updateOutput: %s\n", cudaGetErrorString(err));
 747 |     THError("aborting");
 748 |   }
 749 |   return 1;
 750 | }
 751 | 
 752 | 
 753 | template<bool onlyGrid> __global__ void backwardBilinearSamplingPerspective(float* inputImages_data, int inputImages_strideBatch, int inputImages_strideChannels, int inputImages_strideDepth, int inputImages_strideHeight, int inputImages_strideWidth,
 754 | float* gradInputImages_data, int gradInputImages_strideBatch, int gradInputImages_strideChannels, int gradInputImages_strideDepth, int gradInputImages_strideHeight, int gradInputImages_strideWidth,
 755 | float* grids_data, int grids_strideBatch, int grids_strideYX, int grids_strideDepth, int grids_strideHeight, int grids_strideWidth,
 756 | float* gradGrids_data, int gradGrids_strideBatch, int gradGrids_strideYX, int gradGrids_strideDepth, int gradGrids_strideHeight, int gradGrids_strideWidth,
 757 | float* gradOutput_data, int gradOutput_strideBatch, int gradOutput_strideChannels, int gradOutput_strideDist, int gradOutput_strideHeight, int gradOutput_strideWidth,
 758 | int inputImages_channels, int inputImages_depth, int inputImages_height, int inputImages_width, int gradOutput_dist, int gradOutput_width, float focal_length)
 759 | {
 760 |    // each (32,16) block 16 output pixels (for coalescing the grid read)
 761 |    // x,y = coordinates
 762 |    // z = batch index
 763 |    // threads : used for features
 764 | 
 765 |    const int xOut = blockIdx.x*blockDim.y+threadIdx.y;
 766 |    const bool withinImageBounds = xOut < gradOutput_width;
 767 |    const bool withinGridBounds = blockIdx.x*blockDim.y + threadIdx.x / 4 < gradOutput_width;
 768 | 
 769 |    const int yOut = blockIdx.y;
 770 |    const int width = inputImages_width;
 771 |    const int height = inputImages_height;
 772 |    
 773 |    const int dist = gradOutput_dist;
 774 | 
 775 |    const int b = blockIdx.z/dist;
 776 |    const int disOut = blockIdx.z%dist;
 777 | 
 778 |    float zf,yf,xf, disf;
 779 | 
 780 |    __shared__ float gridData[32];
 781 |    if (threadIdx.y==0 && withinGridBounds)
 782 |    {
 783 |       gridData[threadIdx.x] = grids_data[b*grids_strideBatch + yOut*grids_strideHeight + xOut*grids_strideWidth + disOut*grids_strideDepth + threadIdx.x];
 784 |    }
 785 |    __syncthreads();
 786 | 
 787 |    if(withinImageBounds)
 788 |    {
 789 | 
 790 |       zf = gridData[threadIdx.y*4];
 791 |       yf = gridData[threadIdx.y*4+1];
 792 |       xf = gridData[threadIdx.y*4+2];
 793 |       disf = gridData[threadIdx.y*4+3];
 794 | 
 795 |       //yf = yf / disf;
 796 |       //xf = xf / disf;
 797 |       //zf = zf / disf - (focal_length + 0.5);
 798 | 
 799 | 
 800 |       int zInFrontTopLeft, yInFrontTopLeft, xInFrontTopLeft;
 801 |       float yWeightFrontTopLeft, xWeightFrontTopLeft, zWeightFrontTopLeft;
 802 |       getTopLeftFront(zf, inputImages_depth, zInFrontTopLeft, zWeightFrontTopLeft);
 803 |       getTopLeftFront(yf, inputImages_height, yInFrontTopLeft, yWeightFrontTopLeft);
 804 |       getTopLeftFront(xf, inputImages_width, xInFrontTopLeft, xWeightFrontTopLeft);
 805 | 
 806 |       const int inFrontTopLeftAddress = inputImages_strideBatch * b + inputImages_strideDepth * zInFrontTopLeft + inputImages_strideHeight * yInFrontTopLeft + inputImages_strideWidth * xInFrontTopLeft;
 807 |       const int inFrontTopRightAddress = inFrontTopLeftAddress + inputImages_strideWidth;
 808 |       const int inFrontBottomLeftAddress = inFrontTopLeftAddress + inputImages_strideHeight;
 809 |       const int inFrontBottomRightAddress = inFrontBottomLeftAddress + inputImages_strideWidth;
 810 | 
 811 |       const int inBackTopLeftAddress = inFrontTopLeftAddress + inputImages_strideDepth;
 812 |       const int inBackTopRightAddress = inBackTopLeftAddress + inputImages_strideWidth;
 813 |       const int inBackBottomLeftAddress = inBackTopLeftAddress + inputImages_strideHeight;
 814 |       const int inBackBottomRightAddress = inBackBottomLeftAddress + inputImages_strideWidth;
 815 | 
 816 | 
 817 |       const int gradInputImagesFrontTopLeftAddress = gradInputImages_strideBatch * b + gradInputImages_strideDepth * zInFrontTopLeft + gradInputImages_strideHeight * yInFrontTopLeft + gradInputImages_strideWidth *xInFrontTopLeft;
 818 |       const int gradInputImagesFrontTopRightAddress = gradInputImagesFrontTopLeftAddress + gradInputImages_strideWidth;
 819 |       const int gradInputImagesFrontBottomLeftAddress = gradInputImagesFrontTopLeftAddress + gradInputImages_strideHeight;
 820 |       const int gradInputImagesFrontBottomRightAddress = gradInputImagesFrontBottomLeftAddress + gradInputImages_strideWidth;
 821 | 
 822 |       const int gradInputImagesBackTopLeftAddress = gradInputImagesFrontTopLeftAddress + gradInputImages_strideDepth;
 823 |       const int gradInputImagesBackTopRightAddress = gradInputImagesBackTopLeftAddress +  gradInputImages_strideWidth;
 824 |       const int gradInputImagesBackBottomLeftAddress = gradInputImagesBackTopLeftAddress + gradInputImages_strideHeight;
 825 |       const int gradInputImagesBackBottomRightAddress = gradInputImagesBackBottomLeftAddress + gradInputImages_strideWidth;
 826 | 
 827 |       const int gradOutputAddress = gradOutput_strideBatch * b + gradOutput_strideDist * disOut + gradOutput_strideHeight * yOut + gradOutput_strideWidth * xOut;
 828 | 
 829 | 
 830 |       float frontTopLeftDotProduct = 0;
 831 |       float frontTopRightDotProduct = 0;
 832 |       float frontBottomLeftDotProduct = 0;
 833 |       float frontBottomRightDotProduct = 0;
 834 |       float backTopLeftDotProduct = 0;
 835 |       float backTopRightDotProduct = 0;
 836 |       float backBottomLeftDotProduct = 0;
 837 |       float backBottomRightDotProduct = 0;
 838 | 
 839 |       bool frontTopLeftIsIn = between(xInFrontTopLeft, 0, inputImages_width-1)
 840 |         && between(yInFrontTopLeft, 0, inputImages_height-1) && between(zInFrontTopLeft, 0, inputImages_depth-1);
 841 | 
 842 |       bool backTopLeftIsIn = between(xInFrontTopLeft, 0, inputImages_width-1)
 843 |         && between(yInFrontTopLeft, 0, inputImages_height-1) && between(zInFrontTopLeft+1, 0, inputImages_depth-1);
 844 | 
 845 |       bool frontTopRightIsIn = between(xInFrontTopLeft+1, 0, inputImages_width-1)
 846 |         && between(yInFrontTopLeft, 0, inputImages_height-1) && between(zInFrontTopLeft, 0, inputImages_depth-1);
 847 | 
 848 |       bool backTopRightIsIn = between(xInFrontTopLeft+1, 0, inputImages_width-1)
 849 |         && between(yInFrontTopLeft, 0, inputImages_height-1) && between(zInFrontTopLeft+1, 0, inputImages_depth-1);
 850 | 
 851 |       bool frontBottomLeftIsIn = between(xInFrontTopLeft, 0, inputImages_width-1)
 852 |         && between(yInFrontTopLeft+1, 0, inputImages_height-1) && between(zInFrontTopLeft, 0, inputImages_depth-1);
 853 | 
 854 |       bool backBottomLeftIsIn = between(xInFrontTopLeft, 0, inputImages_width-1)
 855 |         && between(yInFrontTopLeft+1, 0, inputImages_height-1) && between(zInFrontTopLeft+1, 0, inputImages_depth-1);
 856 | 
 857 |       bool frontBottomRightIsIn = between(xInFrontTopLeft+1, 0, inputImages_width-1)
 858 |         && between(yInFrontTopLeft+1, 0, inputImages_height-1) && between(zInFrontTopLeft, 0, inputImages_depth-1);
 859 | 
 860 |       bool backBottomRightIsIn = between(xInFrontTopLeft+1, 0, inputImages_width-1)
 861 |         && between(yInFrontTopLeft+1, 0, inputImages_height-1) && between(zInFrontTopLeft+1, 0, inputImages_depth-1);
 862 | 
 863 |       /*
 864 |          In that loop we accumulate
 865 |          - gradients into the gradInputImages array with atomic adds
 866 |          - we compute the dot product that we need for the grid gradient
 867 |       */
 868 | 
 869 |       for(int t=threadIdx.x; t<inputImages_channels; t+= blockDim.x)
 870 |       {
 871 |         float gradOutValue = gradOutput_data[gradOutputAddress + t];
 872 |         if(frontTopLeftIsIn)
 873 |         {
 874 |             float inFrontTopLeft = inputImages_data[inFrontTopLeftAddress + t];
 875 |             frontTopLeftDotProduct += inFrontTopLeft * gradOutValue;
 876 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesFrontTopLeftAddress + t], xWeightFrontTopLeft * yWeightFrontTopLeft * zWeightFrontTopLeft * gradOutValue);
 877 |         }
 878 | 
 879 |         if(frontTopRightIsIn)
 880 |         {
 881 |             float inFrontTopRight = inputImages_data[inFrontTopRightAddress + t];
 882 |             frontTopRightDotProduct += inFrontTopRight * gradOutValue;
 883 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesFrontTopRightAddress + t], (1 - xWeightFrontTopLeft) * yWeightFrontTopLeft * zWeightFrontTopLeft * gradOutValue);
 884 |         }
 885 | 
 886 |         if(frontBottomLeftIsIn)
 887 |         {
 888 |             float inFrontBottomLeft = inputImages_data[inFrontBottomLeftAddress + t];
 889 |             frontBottomLeftDotProduct += inFrontBottomLeft * gradOutValue;
 890 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesFrontBottomLeftAddress + t], xWeightFrontTopLeft * (1 - yWeightFrontTopLeft) * zWeightFrontTopLeft * gradOutValue);
 891 |         }
 892 | 
 893 |         if(frontBottomRightIsIn)
 894 |         {
 895 |             float inFrontBottomRight = inputImages_data[inFrontBottomRightAddress + t];
 896 |             frontBottomRightDotProduct += inFrontBottomRight * gradOutValue;
 897 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesFrontBottomRightAddress + t], (1 - xWeightFrontTopLeft) * (1 - yWeightFrontTopLeft) * zWeightFrontTopLeft * gradOutValue);
 898 |         }
 899 | 
 900 |         if(backTopLeftIsIn)
 901 |         {
 902 |             float inBackTopLeft = inputImages_data[inBackTopLeftAddress + t];
 903 |             backTopLeftDotProduct += inBackTopLeft * gradOutValue;
 904 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBackTopLeftAddress + t], xWeightFrontTopLeft * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) * gradOutValue);
 905 |         }
 906 | 
 907 |         if(backTopRightIsIn)
 908 |         {
 909 |             float inBackTopRight = inputImages_data[inBackTopRightAddress + t];
 910 |             backTopRightDotProduct += inBackTopRight * gradOutValue;
 911 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBackTopRightAddress + t], (1 - xWeightFrontTopLeft) * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) * gradOutValue);
 912 |         }
 913 | 
 914 |         if(backBottomLeftIsIn)
 915 |         {
 916 |             float inBackBottomLeft = inputImages_data[inBackBottomLeftAddress + t];
 917 |             backBottomLeftDotProduct += inBackBottomLeft * gradOutValue;
 918 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBackBottomLeftAddress + t], xWeightFrontTopLeft * (1 - yWeightFrontTopLeft) * (1-zWeightFrontTopLeft) * gradOutValue);
 919 |         }
 920 | 
 921 |         if(backBottomRightIsIn)
 922 |         {
 923 |             float inBackBottomRight = inputImages_data[inBackBottomRightAddress + t];
 924 |             backBottomRightDotProduct += inBackBottomRight * gradOutValue;
 925 |             if(!onlyGrid) atomicAdd(&gradInputImages_data[gradInputImagesBackBottomRightAddress + t], (1 - xWeightFrontTopLeft) * (1 - yWeightFrontTopLeft) * (1-zWeightFrontTopLeft) * gradOutValue);
 926 |         }
 927 | 
 928 |       }
 929 | 
 930 |       /*
 931 |          Here we reduce the dot product and compute the grid gradient before writing it.
 932 |       */
 933 | 
 934 |       /* could do shuffles and use no shmem at all but cuda arch is 2.0 */
 935 |       __shared__ volatile float __shmem[8][32];
 936 |       __shmem[threadIdx.y][threadIdx.x] = frontTopLeftDotProduct;
 937 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 938 |       frontTopLeftDotProduct = __shmem[threadIdx.y][0];
 939 |       //__syncthreads();
 940 | 
 941 |       __shmem[threadIdx.y][threadIdx.x] = backTopLeftDotProduct;
 942 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 943 |       backTopLeftDotProduct = __shmem[threadIdx.y][0];
 944 |       //__syncthreads();
 945 | 
 946 |       __shmem[threadIdx.y][threadIdx.x] = frontTopRightDotProduct;
 947 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 948 |       frontTopRightDotProduct = __shmem[threadIdx.y][0];
 949 |       //__syncthreads();
 950 | 
 951 |       __shmem[threadIdx.y][threadIdx.x] = backTopRightDotProduct;
 952 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 953 |       backTopRightDotProduct = __shmem[threadIdx.y][0];
 954 |       //__syncthreads();
 955 | 
 956 |       __shmem[threadIdx.y][threadIdx.x] = frontBottomLeftDotProduct;
 957 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 958 |       frontBottomLeftDotProduct = __shmem[threadIdx.y][0];
 959 |       //__syncthreads();
 960 | 
 961 |       __shmem[threadIdx.y][threadIdx.x] = backBottomLeftDotProduct;
 962 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 963 |       backBottomLeftDotProduct = __shmem[threadIdx.y][0];
 964 |       //__syncthreads();
 965 | 
 966 |       __shmem[threadIdx.y][threadIdx.x] = frontBottomRightDotProduct;
 967 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 968 |       frontBottomRightDotProduct = __shmem[threadIdx.y][0];
 969 |       //__syncthreads();
 970 | 
 971 |       __shmem[threadIdx.y][threadIdx.x] = backBottomRightDotProduct;
 972 |       sumReduceShMemPerspective(__shmem[threadIdx.y]);
 973 |       backBottomRightDotProduct = __shmem[threadIdx.y][0];
 974 |       //__syncthreads();
 975 | 
 976 |       float dyf = frontTopLeftDotProduct * xWeightFrontTopLeft * zWeightFrontTopLeft * (-1)
 977 |               + backTopLeftDotProduct * xWeightFrontTopLeft * (1-zWeightFrontTopLeft) * (-1)
 978 |               + frontTopRightDotProduct * (1-xWeightFrontTopLeft) * zWeightFrontTopLeft * (-1)
 979 |               + backTopRightDotProduct * (1-xWeightFrontTopLeft) * (1-zWeightFrontTopLeft) *(-1)
 980 |               + frontBottomLeftDotProduct * xWeightFrontTopLeft * zWeightFrontTopLeft * (1)
 981 |               + backBottomLeftDotProduct * xWeightFrontTopLeft * (1-zWeightFrontTopLeft) * (1)
 982 |               + frontBottomRightDotProduct * (1-xWeightFrontTopLeft) * zWeightFrontTopLeft * (1)
 983 |               + backBottomRightDotProduct * (1-xWeightFrontTopLeft) * (1-zWeightFrontTopLeft) *(1);
 984 | 
 985 |       float dxf = frontTopLeftDotProduct * yWeightFrontTopLeft * zWeightFrontTopLeft *(-1)
 986 |               + backTopLeftDotProduct * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) *(-1)
 987 |               + frontTopRightDotProduct * yWeightFrontTopLeft * zWeightFrontTopLeft * 1
 988 |               + backTopRightDotProduct * yWeightFrontTopLeft * (1-zWeightFrontTopLeft) * 1
 989 |               + frontBottomLeftDotProduct * (1-yWeightFrontTopLeft) * zWeightFrontTopLeft * (-1)
 990 |               + backBottomLeftDotProduct * (1-yWeightFrontTopLeft) * (1-zWeightFrontTopLeft) * (-1)
 991 |               + frontBottomRightDotProduct * (1-yWeightFrontTopLeft) * zWeightFrontTopLeft * 1
 992 |               + backBottomRightDotProduct * (1-yWeightFrontTopLeft) *(1-zWeightFrontTopLeft) * 1;
 993 | 
 994 |       float dzf = frontTopLeftDotProduct * yWeightFrontTopLeft * xWeightFrontTopLeft * (-1)
 995 |               + backTopLeftDotProduct * yWeightFrontTopLeft * xWeightFrontTopLeft *1
 996 |               + frontTopRightDotProduct * yWeightFrontTopLeft * (1-xWeightFrontTopLeft) *(-1)
 997 |               + backTopRightDotProduct * yWeightFrontTopLeft * (1-xWeightFrontTopLeft) *1
 998 |               + frontBottomLeftDotProduct * (1-yWeightFrontTopLeft) * xWeightFrontTopLeft * (-1)
 999 |               + backBottomLeftDotProduct * (1-yWeightFrontTopLeft) * xWeightFrontTopLeft * 1
1000 |               + frontBottomRightDotProduct * (1-yWeightFrontTopLeft) * (1-xWeightFrontTopLeft) *(-1)
1001 |               + backBottomRightDotProduct * (1-yWeightFrontTopLeft) * (1-xWeightFrontTopLeft) * 1;
1002 | 
1003 |       if(threadIdx.x==0)
1004 |       {
1005 |          gridData[threadIdx.y*4] = dzf * (inputImages_depth-1) / 2;
1006 |          gridData[threadIdx.y*4+1] = dyf * (inputImages_height-1) / 2;
1007 |          gridData[threadIdx.y*4+2] = dxf * (inputImages_width-1) / 2;
1008 |          gridData[threadIdx.y*4+3] = 0;
1009 |            //-(dyf* (inputImages_height-1) / 2*yf + dxf* (inputImages_width-1) / 2*xf + dzf* (inputImages_depth-1) / 2*(zf+focal_length+0.5))/disf;
1010 |       }
1011 |    }// must put a big if condition in order not to hang at __syncthreads()...
1012 |    __syncthreads();
1013 | 
1014 |    if(threadIdx.y==0 && withinGridBounds)
1015 |          gradGrids_data[b*gradGrids_strideBatch + yOut*gradGrids_strideHeight + xOut*gradGrids_strideWidth + disOut*gradGrids_strideDepth + threadIdx.x] = gridData[threadIdx.x];
1016 | 
1017 | }
1018 | 
/*
 * Lua binding for the backward pass of the perspective bilinear sampler.
 *
 * Lua stack arguments read here (index 1 is not read by this function):
 *   2: inputImages      torch.CudaTensor
 *   3: grids            torch.CudaTensor
 *   4: gradInputImages  torch.CudaTensor
 *   5: gradGrids        torch.CudaTensor
 *   6: gradOutput       torch.CudaTensor
 *   7: focal_length     number (converted to float)
 *
 * All five tensors are indexed up to dimension 4, i.e. they are expected to
 * be 5-dimensional. For every tensor the strides are handed to the kernel in
 * the order (0, 4, 1, 2, 3).
 * NOTE(review): dimension 4 is presumably the channel dimension; confirm
 * against the kernel's parameter list.
 *
 * The kernel is launched with its template argument set to false on the
 * current cutorch stream; the launch is asynchronous and only launch errors
 * are checked here.
 *
 * Raises a Torch error (THError) if the launch fails.
 */
static int cunn_BilinearSamplerPerspective_updateGradInput(lua_State *L)
{
  THCState *state = getCutorchState(L);
  THCudaTensor *inputImages = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
  THCudaTensor *grids = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
  THCudaTensor *gradInputImages = (THCudaTensor *)luaT_checkudata(L, 4, "torch.CudaTensor");
  THCudaTensor *gradGrids = (THCudaTensor *)luaT_checkudata(L, 5, "torch.CudaTensor");
  THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 6, "torch.CudaTensor");
  /* lua_Number is wider than float; make the narrowing explicit. */
  float focal_length = (float)lua_tonumber(L, 7);

  /* Launch geometry, derived from gradOutput:
       grid.x  = ceil(size[3] / 8)  -- each block handles 8 positions along dim 3
                                       (one per threadIdx.y)
       grid.y  = size[2]
       grid.z  = size[0] * size[1]
     A block is 32 x 8 threads, matching the [8][32] shared-memory scratch
     buffer the kernel reduces over.
     Sizes are read through THCudaTensor_size, like everywhere else in this
     function, instead of poking at the size array directly. */
  dim3 blocks((THCudaTensor_size(state, gradOutput, 3) + 7) / 8,
              THCudaTensor_size(state, gradOutput, 2),
              THCudaTensor_size(state, gradOutput, 0) * THCudaTensor_size(state, gradOutput, 1));
  dim3 threads(32, 8);

  backwardBilinearSamplingPerspective <false> <<< blocks, threads, 0, THCState_getCurrentStream(state) >>> (

  THCudaTensor_data(state, inputImages),
  THCudaTensor_stride(state, inputImages, 0),
  THCudaTensor_stride(state, inputImages, 4),
  THCudaTensor_stride(state, inputImages, 1),
  THCudaTensor_stride(state, inputImages, 2),
  THCudaTensor_stride(state, inputImages, 3),

  THCudaTensor_data(state, gradInputImages),
  THCudaTensor_stride(state, gradInputImages, 0),
  THCudaTensor_stride(state, gradInputImages, 4),
  THCudaTensor_stride(state, gradInputImages, 1),
  THCudaTensor_stride(state, gradInputImages, 2),
  THCudaTensor_stride(state, gradInputImages, 3),

  THCudaTensor_data(state, grids),
  THCudaTensor_stride(state, grids, 0),
  THCudaTensor_stride(state, grids, 4),
  THCudaTensor_stride(state, grids, 1),
  THCudaTensor_stride(state, grids, 2),
  THCudaTensor_stride(state, grids, 3),

  THCudaTensor_data(state, gradGrids),
  THCudaTensor_stride(state, gradGrids, 0),
  THCudaTensor_stride(state, gradGrids, 4),
  THCudaTensor_stride(state, gradGrids, 1),
  THCudaTensor_stride(state, gradGrids, 2),
  THCudaTensor_stride(state, gradGrids, 3),

  THCudaTensor_data(state, gradOutput),
  THCudaTensor_stride(state, gradOutput, 0),
  THCudaTensor_stride(state, gradOutput, 4),
  THCudaTensor_stride(state, gradOutput, 1),
  THCudaTensor_stride(state, gradOutput, 2),
  THCudaTensor_stride(state, gradOutput, 3),

  THCudaTensor_size(state, inputImages, 4),
  THCudaTensor_size(state, inputImages, 1),
  THCudaTensor_size(state, inputImages, 2),
  THCudaTensor_size(state, inputImages, 3),
  THCudaTensor_size(state, gradOutput, 1),
  THCudaTensor_size(state, gradOutput, 3),
  focal_length);

  // check for launch errors; name this module (not the plain BilinearSampler)
  // so a failure can be told apart from the BHWD sampler's.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("error in BilinearSamplerPerspective.updateGradInput: %s\n", cudaGetErrorString(err));
    THError("aborting");
  }
  /* Reports one result to Lua although nothing is pushed in this function;
     kept unchanged because the Lua-side callers are outside this view. */
  return 1;
}
1085 | 
/*
 * Name -> C function table for the Lua-callable entry points of this file.
 * It is consumed by cunn_BilinearSamplerPerspective_init; the {NULL, NULL}
 * pair is the end-of-table sentinel.
 */
static const struct luaL_Reg cunn_BilinearSamplerPerspective__ [] = {
  /* BHWD sampler entry points */
  { "BilinearSamplerBHWD_updateOutput",           cunn_BilinearSamplerBHWD_updateOutput },
  { "BilinearSamplerBHWD_updateGradInput",        cunn_BilinearSamplerBHWD_updateGradInput },

  /* perspective sampler entry points */
  { "BilinearSamplerPerspective_updateOutput",    cunn_BilinearSamplerPerspective_updateOutput },
  { "BilinearSamplerPerspective_updateGradInput", cunn_BilinearSamplerPerspective_updateGradInput },

  /* grid-only backward pass: currently not exported */
  //{"BilinearSamplerPerspective_updateGradInputOnlyGrid", cunn_BilinearSamplerBHWD_updateGradInputOnlyGrid},

  { NULL, NULL }
};
1094 | 
/*
 * Installs this file's Lua-callable functions.
 *
 * Pushes the "torch.CudaTensor" metatable, registers every entry of
 * cunn_BilinearSamplerPerspective__ on it under the name "nn", and then
 * pops one value off the Lua stack again.
 */
static void cunn_BilinearSamplerPerspective_init(lua_State *L)
{
  /* target of the registration: the CudaTensor metatable */
  luaT_pushmetatable(L, "torch.CudaTensor");

  /* attach the function table under the "nn" name */
  luaT_registeratname(L, cunn_BilinearSamplerPerspective__, "nn");

  /* drop the value pushed above */
  lua_pop(L, 1);
}
1101 | 


--------------------------------------------------------------------------------