├── cunn-conv1d
│   ├── init.lua
│   ├── utils.h
│   ├── utils.c
│   ├── .gitignore
│   ├── init.cu
│   ├── common.h
│   ├── rocks
│   │   └── cunnconv1d-scm-1.rockspec
│   ├── CMakeLists.txt
│   ├── LateralConvolution.cu
│   ├── VerticalConvolution.cu
│   └── HorizontalConvolution.cu
├── nn-conv1d
│   ├── init.lua
│   ├── .gitignore
│   ├── rocks
│   │   └── nnconv1d-scm-1.rockspec
│   ├── init.c
│   ├── CMakeLists.txt
│   ├── LateralConvolution.lua
│   ├── VerticalConvolution.lua
│   ├── HorizontalConvolution.lua
│   └── generic
│       ├── LateralConvolution.c
│       ├── VerticalConvolution.c
│       └── HorizontalConvolution.c
├── nnconv1d-scm-1.rockspec
├── cunnconv1d-scm-1.rockspec
├── LICENSE
├── README.md
└── example.lua
/cunn-conv1d/init.lua: -------------------------------------------------------------------------------- 1 | require('cutorch') 2 | require('nn') 3 | require('nnconv1d') 4 | require('cunn') 5 | require('libcunnconv1d') 6 | 7 | return cunn 8 | -------------------------------------------------------------------------------- /cunn-conv1d/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef CUNN_UTILS_H 2 | #define CUNN_UTILS_H 3 | 4 | #include <lua.h> 5 | #include "THCGeneral.h" 6 | 7 | THCState* getCutorchState(lua_State* L); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /nn-conv1d/init.lua: -------------------------------------------------------------------------------- 1 | require('torch') 2 | require('nn') 3 | require('libnnconv1d') 4 | 5 | include('LateralConvolution.lua') 6 | include('HorizontalConvolution.lua') 7 | include('VerticalConvolution.lua') 8 | 9 | return nn 10 | -------------------------------------------------------------------------------- /cunn-conv1d/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | THCState* getCutorchState(lua_State* L) 4 | { 5 | lua_getglobal(L, "cutorch"); 6 | lua_getfield(L, -1, "getState"); 7 | lua_call(L, 0, 1); 8 | THCState *state = (THCState*) lua_touserdata(L, -1); 9 | lua_pop(L, 2); 10 | return state; 11 | } 12 | -------------------------------------------------------------------------------- /cunn-conv1d/.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | 34 | # Mac attribute 35 | .DS_Store* 36 | -------------------------------------------------------------------------------- /nn-conv1d/.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. 
Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | 34 | # Mac attribute 35 | .DS_Store* 36 | -------------------------------------------------------------------------------- /cunn-conv1d/init.cu: -------------------------------------------------------------------------------- 1 | #include "luaT.h" 2 | #include "THC.h" 3 | 4 | #include "utils.c" 5 | 6 | #include "LateralConvolution.cu" 7 | #include "VerticalConvolution.cu" 8 | #include "HorizontalConvolution.cu" 9 | 10 | LUA_EXTERNC DLL_EXPORT int luaopen_libcunnconv1d(lua_State *L); 11 | 12 | int luaopen_libcunnconv1d(lua_State *L) 13 | { 14 | lua_newtable(L); 15 | 16 | cunnconv1d_LateralConvolution_init(L); 17 | cunnconv1d_VerticalConvolution_init(L); 18 | cunnconv1d_HorizontalConvolution_init(L); 19 | 20 | return 1; 21 | } 22 | -------------------------------------------------------------------------------- /cunn-conv1d/common.h: -------------------------------------------------------------------------------- 1 | #ifndef CUNN_COMMON_H 2 | #define CUNN_COMMON_H 3 | 4 | // CUDA: grid stride looping 5 | #define CUDA_KERNEL_LOOP(i, n) \ 6 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 7 | i < (n); \ 8 | i += blockDim.x * gridDim.x) 9 | 10 | // Use 1024 threads per block, which requires cuda sm_2x or above 11 | const int CUDA_NUM_THREADS = 1024; 12 | 13 | // CUDA: number of blocks for threads. 14 | inline int GET_BLOCKS(const int N) { 15 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 16 | } 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /nn-conv1d/rocks/nnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "nnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch nn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | } 19 | 20 | build = { 21 | type = "command", 22 | build_command = [[ 23 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 24 | ]], 25 | install_command = "cd build && $(MAKE) install" 26 | } 27 | -------------------------------------------------------------------------------- /nnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "nnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch nn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | } 19 | 20 | build = { 21 | type = "command", 22 | build_command = [[ 23 | cd nn-conv1d && cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 24 | ]], 25 | install_command = "cd nn-conv1d && cd build && $(MAKE) install" 26 | } 27 | -------------------------------------------------------------------------------- /cunn-conv1d/rocks/cunnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "cunnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch cunn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | "nn >= 1.0", 19 | "cutorch >= 1.0" 20 | } 21 | 22 | build = { 23 | type = "command", 24 | build_command = [[ 25 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install 26 | ]], 27 | install_command = "cd build" 28 | } 29 | -------------------------------------------------------------------------------- /cunnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "cunnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch cunn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | "nn >= 1.0", 19 | "cutorch >= 1.0" 20 | } 21 | 22 | build = { 23 | type = "command", 24 | build_command = [[ 25 | cd cunn-conv1d && cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install 26 | ]], 27 | install_command = "cd cunn-conv1d/build" 28 | } 29 | -------------------------------------------------------------------------------- /nn-conv1d/init.c: -------------------------------------------------------------------------------- 1 | #include "TH.h" 2 | #include "luaT.h" 3 | 4 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 5 | #define torch_Tensor TH_CONCAT_STRING_3(torch.,Real,Tensor) 6 | #define nnconv1d_(NAME) TH_CONCAT_3(nnconv1d_, Real, NAME) 7 | 8 | #include "generic/LateralConvolution.c" 9 | #include "THGenerateFloatTypes.h" 10 | 11 | #include "generic/VerticalConvolution.c" 12 | #include "THGenerateFloatTypes.h" 13 | 14 | #include "generic/HorizontalConvolution.c" 15 | #include "THGenerateFloatTypes.h" 16 | 17 | LUA_EXTERNC DLL_EXPORT int luaopen_libnnconv1d(lua_State *L); 18 | 19 | int luaopen_libnnconv1d(lua_State *L) 20 | { 21 | lua_newtable(L); 22 | 23 | nnconv1d_FloatLateralConvolution_init(L); 24 | nnconv1d_FloatVerticalConvolution_init(L); 25 | nnconv1d_FloatHorizontalConvolution_init(L); 26 | 27 | nnconv1d_DoubleLateralConvolution_init(L); 28 | nnconv1d_DoubleVerticalConvolution_init(L); 29 | nnconv1d_DoubleHorizontalConvolution_init(L); 30 | 31 | return 1; 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Jonghoon Jin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Flattened convolutional neural networks 2 | 3 | This package provides the 1D convolution modules (over channels, in the vertical direction, and in the horizontal direction) used in 4 | [Flattened Convolutional Neural Networks for Feedforward Acceleration](http://arxiv.org/abs/1412.5474), 5 | where the flattened convolution layer is expressed as a sequence of one-dimensional filters across all three directions of a 3D filter. 6 | 7 | 8 | ### Install 9 | 10 | Install either or both of the `nn` and `cunn` backend packages, depending on your computing environment. 11 | 12 | ```bash 13 | luarocks install https://raw.githubusercontent.com/jhjin/flattened-cnn/master/nnconv1d-scm-1.rockspec # cpu 14 | luarocks install https://raw.githubusercontent.com/jhjin/flattened-cnn/master/cunnconv1d-scm-1.rockspec # cuda 15 | ``` 16 | 17 | or use these commands if you have already cloned this repo. 18 | 19 | ```bash 20 | cd nn-conv1d 21 | luarocks make rocks/nnconv1d-scm-1.rockspec 22 | cd ../cunn-conv1d 23 | luarocks make rocks/cunnconv1d-scm-1.rockspec 24 | ``` 25 | 26 | 27 | ### Available modules 28 | 29 | The following modules are available. 30 | 31 | ```lua 32 | nn.LateralConvolution(nInputPlane, nOutputPlane) -- 1d conv over feature planes 33 | nn.HorizontalConvolution(nInputPlane, nOutputPlane, kL) -- 1d conv in horizontal 34 | nn.VerticalConvolution(nInputPlane, nOutputPlane, kL) -- 1d conv in vertical 35 | ``` 
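Chained together, the three modules play the role of a single 2D convolution whose filters are rank-1, which is what `example.lua` verifies. Below is a minimal sketch of that composition; the plane counts and kernel sizes are arbitrary, chosen only for illustration:

```lua
require('nnconv1d')

-- flattened stand-in for a full nn.SpatialConvolutionMM(4, 5, 3, 3)
local net = nn.Sequential()
net:add(nn.LateralConvolution(4, 5))         -- mix 4 input planes into 5
net:add(nn.VerticalConvolution(5, 5, 3))     -- 3-tap filter down each column
net:add(nn.HorizontalConvolution(5, 5, 3))   -- 3-tap filter along each row
print(net:forward(torch.randn(4, 8, 8)):size())  -- 5x6x6, as with a full 3x3 conv
```

Note that `VerticalConvolution` and `HorizontalConvolution` require `nInputPlane == nOutputPlane`, so the number of planes can only be changed by `LateralConvolution`.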
36 | 37 | 38 | ### Example 39 | 40 | Run the command below. 41 | 42 | ```bash 43 | th example.lua 44 | ``` 45 | -------------------------------------------------------------------------------- /cunn-conv1d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR) 2 | CMAKE_POLICY(VERSION 2.8) 3 | 4 | SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}") 5 | 6 | FIND_PACKAGE(Torch REQUIRED) 7 | FIND_PACKAGE(JPEG REQUIRED) 8 | 9 | if (JPEG_FOUND) 10 | include_directories (${JPEG_INCLUDE_DIR}) 11 | endif (JPEG_FOUND) 12 | 13 | IF(APPLE) 14 | # work around for mac os x bug: 15 | # http://stackoverflow.com/questions/16286588/cuda-5-0-cmake-and-make-failing-on-osx-10-8-3 16 | if (NOT DEFINED CUDA_HOST_COMPILER AND CMAKE_C_COMPILER_ID STREQUAL "Clang" AND EXISTS /usr/bin/gcc) 17 | set(CUDA_HOST_COMPILER /usr/bin/gcc CACHE FILEPATH "Host side compiler used by NVCC") 18 | message(STATUS "Setting CUDA_HOST_COMPILER to /usr/bin/gcc instead of ${CMAKE_C_COMPILER}.") 19 | endif() 20 | ENDIF() 21 | 22 | FIND_PACKAGE(CUDA 4.0 REQUIRED) 23 | 24 | LIST(APPEND CUDA_NVCC_FLAGS "-arch=sm_20") 25 | 26 | INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC") 27 | LINK_DIRECTORIES("${Torch_INSTALL_LIB}") 28 | 29 | SET(src-cuda init.cu) 30 | 31 | FILE(GLOB luasrc *.lua) 32 | 33 | CUDA_ADD_LIBRARY(cunnconv1d MODULE ${src-cuda}) 34 | TARGET_LINK_LIBRARIES(cunnconv1d luaT THC TH ${JPEG_LIBRARIES}) 35 | IF(APPLE) 36 | SET_TARGET_PROPERTIES(cunnconv1d PROPERTIES 37 | LINK_FLAGS "-undefined dynamic_lookup") 38 | ENDIF() 39 | 40 | ### Torch packages assume the library prefix is "lib" 41 | SET_TARGET_PROPERTIES(cunnconv1d PROPERTIES 42 | PREFIX "lib" 43 | IMPORT_PREFIX "lib") 44 | 45 | INSTALL(TARGETS cunnconv1d 46 | RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" 47 | LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") 48 | 49 | INSTALL( 50 | FILES 51 | ${luasrc} 52 | DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/cunnconv1d") 53 | -------------------------------------------------------------------------------- /nn-conv1d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) 2 | CMAKE_POLICY(VERSION 2.6) 3 | 4 | FIND_PACKAGE(Torch REQUIRED) 5 | FIND_PACKAGE(JPEG REQUIRED) 6 | 7 | 8 | # Flags 9 | # When using MSVC 10 | IF(MSVC) 11 | # we want to respect the standard, and we are bored of those **** . 12 | ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) 13 | ENDIF(MSVC) 14 | 15 | # OpenMP support? 
16 | SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") 17 | IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) 18 | EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 19 | STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 20 | MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") 21 | IF (DARWIN_VERSION GREATER 9) 22 | SET(APPLE_OPENMP_SUCKS 1) 23 | ENDIF (DARWIN_VERSION GREATER 9) 24 | EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion 25 | OUTPUT_VARIABLE GCC_VERSION) 26 | IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) 27 | MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") 28 | MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") 29 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas") 30 | SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) 31 | ENDIF () 32 | ENDIF () 33 | 34 | IF (WITH_OPENMP) 35 | FIND_PACKAGE(OpenMP) 36 | IF(OPENMP_FOUND) 37 | MESSAGE(STATUS "Compiling with OpenMP support") 38 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 39 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 40 | SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") 41 | ENDIF(OPENMP_FOUND) 42 | ENDIF (WITH_OPENMP) 43 | 44 | LINK_DIRECTORIES("${Torch_INSTALL_LIB}") 45 | 46 | SET(src init.c) 47 | 48 | FILE(GLOB luasrc *.lua) 49 | 50 | ADD_TORCH_PACKAGE(nnconv1d "${src}" "${luasrc}") 51 | 52 | TARGET_LINK_LIBRARIES(nnconv1d luaT TH) 53 | -------------------------------------------------------------------------------- /nn-conv1d/LateralConvolution.lua: -------------------------------------------------------------------------------- 1 | local LateralConvolution, parent = torch.class('nn.LateralConvolution', 'nn.Module') 2 | 3 | function LateralConvolution:__init(nInputPlane, nOutputPlane) 4 | parent.__init(self) 5 | 6 | self.nInputPlane = nInputPlane 7 | self.nOutputPlane = nOutputPlane 8 | 9 | self.weight = torch.Tensor(nOutputPlane, nInputPlane) 10 | self.bias = torch.Tensor(nOutputPlane) 11 | self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane) 12 | self.gradBias = torch.Tensor(nOutputPlane) 13 | 14 | self.ones = torch.Tensor() 15 | 16 | self:reset() 17 | end 18 | 19 | function LateralConvolution:reset(stdv) 20 | if stdv then 21 | stdv = stdv * math.sqrt(3) 22 | else 23 | stdv = 1/math.sqrt(self.nInputPlane) 24 | end 25 | if nn.oldSeed then 26 | self.weight:apply(function() 27 | return torch.uniform(-stdv, stdv) 28 | end) 29 | self.bias:apply(function() 30 | return torch.uniform(-stdv, stdv) 31 | end) 32 | else 33 | self.weight:uniform(-stdv, stdv) 34 | self.bias:uniform(-stdv, stdv) 35 | end 36 | end 37 | 38 | local function makeContiguous(self, input, gradOutput) 39 | if not input:isContiguous() then 40 | self._input = self._input or input.new() 41 | self._input:resizeAs(input):copy(input) 42 | input = self._input 43 | end 44 | if gradOutput then 45 | if not gradOutput:isContiguous() then 46 | self._gradOutput = self._gradOutput or gradOutput.new() 47 | self._gradOutput:resizeAs(gradOutput):copy(gradOutput) 48 | gradOutput = self._gradOutput 49 | end 50 | end 51 | return input, gradOutput 52 | end 53 | 54 | function LateralConvolution:updateOutput(input) 55 | input = makeContiguous(self, input) 56 | return input.nn.LateralConvolution_updateOutput(self, input) 57 | end 58 | 59 | function LateralConvolution:updateGradInput(input, gradOutput) 60 | if self.gradInput then 61 | input, gradOutput = makeContiguous(self, input, 
gradOutput) 62 | return input.nn.LateralConvolution_updateGradInput(self, input, gradOutput) 63 | end 64 | end 65 | 66 | function LateralConvolution:accGradParameters(input, gradOutput, scale) 67 | input, gradOutput = makeContiguous(self, input, gradOutput) 68 | return input.nn.LateralConvolution_accGradParameters(self, input, gradOutput, scale) 69 | end 70 | -------------------------------------------------------------------------------- /nn-conv1d/VerticalConvolution.lua: -------------------------------------------------------------------------------- 1 | local VerticalConvolution, parent = torch.class('nn.VerticalConvolution', 'nn.Module') 2 | 3 | function VerticalConvolution:__init(nInputPlane, nOutputPlane, kL) 4 | parent.__init(self) 5 | 6 | assert(nInputPlane == nOutputPlane) 7 | self.nInputPlane = nInputPlane 8 | self.nOutputPlane = nOutputPlane 9 | 10 | self.kL = kL 11 | 12 | self.weight = torch.Tensor(nInputPlane, kL) 13 | self.bias = torch.Tensor(nOutputPlane) 14 | self.gradWeight = torch.Tensor(nInputPlane, kL) 15 | self.gradBias = torch.Tensor(nOutputPlane) 16 | 17 | self.ones = torch.Tensor() 18 | self.finput = torch.Tensor() 19 | self.fgradWeight = torch.Tensor() 20 | 21 | self:reset() 22 | end 23 | 24 | function VerticalConvolution:reset(stdv) 25 | if stdv then 26 | stdv = stdv * math.sqrt(3) 27 | else 28 | stdv = 1/math.sqrt(self.nInputPlane) 29 | end 30 | if nn.oldSeed then 31 | self.weight:apply(function() 32 | return torch.uniform(-stdv, stdv) 33 | end) 34 | self.bias:apply(function() 35 | return torch.uniform(-stdv, stdv) 36 | end) 37 | else 38 | self.weight:uniform(-stdv, stdv) 39 | self.bias:uniform(-stdv, stdv) 40 | end 41 | end 42 | 43 | local function makeContiguous(self, input, gradOutput) 44 | if not input:isContiguous() then 45 | self._input = self._input or input.new() 46 | self._input:resizeAs(input):copy(input) 47 | input = self._input 48 | end 49 | if gradOutput then 50 | if not gradOutput:isContiguous() then 51 | self._gradOutput = self._gradOutput or gradOutput.new() 52 | self._gradOutput:resizeAs(gradOutput):copy(gradOutput) 53 | gradOutput = self._gradOutput 54 | end 55 | end 56 | return input, gradOutput 57 | end 58 | 59 | function VerticalConvolution:updateOutput(input) 60 | input = makeContiguous(self, input) 61 | return input.nn.VerticalConvolution_updateOutput(self, input) 62 | end 63 | 64 | function VerticalConvolution:updateGradInput(input, gradOutput) 65 | if self.gradInput then 66 | input, gradOutput = makeContiguous(self, input, gradOutput) 67 | return input.nn.VerticalConvolution_updateGradInput(self, input, gradOutput) 68 | end 69 | end 70 | 71 | function VerticalConvolution:accGradParameters(input, gradOutput, scale) 72 | input, gradOutput = makeContiguous(self, input, gradOutput) 73 | return input.nn.VerticalConvolution_accGradParameters(self, input, gradOutput, scale) 74 | end 75 | -------------------------------------------------------------------------------- /nn-conv1d/HorizontalConvolution.lua: -------------------------------------------------------------------------------- 1 | local HorizontalConvolution, parent = torch.class('nn.HorizontalConvolution', 'nn.Module') 2 | 3 | function HorizontalConvolution:__init(nInputPlane, nOutputPlane, kL) 4 | parent.__init(self) 5 | 6 | assert(nInputPlane == nOutputPlane) 7 | self.nInputPlane = nInputPlane 8 | self.nOutputPlane = nOutputPlane 9 | 10 | self.kL = kL 11 | 12 | self.weight = torch.Tensor(nInputPlane, kL) 13 | self.bias = torch.Tensor(nOutputPlane) 14 | self.gradWeight = 
torch.Tensor(nInputPlane, kL) 15 | self.gradBias = torch.Tensor(nOutputPlane) 16 | 17 | self.ones = torch.Tensor() 18 | self.finput = torch.Tensor() 19 | self.fgradWeight = torch.Tensor() 20 | 21 | self:reset() 22 | end 23 | 24 | function HorizontalConvolution:reset(stdv) 25 | if stdv then 26 | stdv = stdv * math.sqrt(3) 27 | else 28 | stdv = 1/math.sqrt(self.nInputPlane) 29 | end 30 | if nn.oldSeed then 31 | self.weight:apply(function() 32 | return torch.uniform(-stdv, stdv) 33 | end) 34 | self.bias:apply(function() 35 | return torch.uniform(-stdv, stdv) 36 | end) 37 | else 38 | self.weight:uniform(-stdv, stdv) 39 | self.bias:uniform(-stdv, stdv) 40 | end 41 | end 42 | 43 | local function makeContiguous(self, input, gradOutput) 44 | if not input:isContiguous() then 45 | self._input = self._input or input.new() 46 | self._input:resizeAs(input):copy(input) 47 | input = self._input 48 | end 49 | if gradOutput then 50 | if not gradOutput:isContiguous() then 51 | self._gradOutput = self._gradOutput or gradOutput.new() 52 | self._gradOutput:resizeAs(gradOutput):copy(gradOutput) 53 | gradOutput = self._gradOutput 54 | end 55 | end 56 | return input, gradOutput 57 | end 58 | 59 | function HorizontalConvolution:updateOutput(input) 60 | input = makeContiguous(self, input) 61 | return input.nn.HorizontalConvolution_updateOutput(self, input) 62 | end 63 | 64 | function HorizontalConvolution:updateGradInput(input, gradOutput) 65 | if self.gradInput then 66 | input, gradOutput = makeContiguous(self, input, gradOutput) 67 | return input.nn.HorizontalConvolution_updateGradInput(self, input, gradOutput) 68 | end 69 | end 70 | 71 | function HorizontalConvolution:accGradParameters(input, gradOutput, scale) 72 | input, gradOutput = makeContiguous(self, input, gradOutput) 73 | return input.nn.HorizontalConvolution_accGradParameters(self, input, gradOutput, scale) 74 | end 75 | -------------------------------------------------------------------------------- /example.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env th 2 | -- 3 | -- Rank-1 3D filter decomposition test 4 | -- 5 | require('torch') 6 | require('nnconv1d') 7 | torch.setdefaulttensortype('torch.FloatTensor') 8 | 9 | 10 | local function check_error(msg, a, b) 11 | local diff = torch.add(a, -b):abs() 12 | print('==> '..msg..' 
error (max/mean): ', diff:max(), diff:mean()) 13 | end 14 | 15 | local function compose_filter(z, y, x) 16 | local zyx = torch.Tensor(z:size(1), z:size(2), y:size(2)*x:size(2)) 17 | for i = 1, z:size(1) do 18 | local yx = torch.ger(y[i], x[i]) 19 | for j = 1, z:size(2) do 20 | zyx[i][j]:copy(yx):mul(z[i][j]) 21 | end 22 | end 23 | return zyx 24 | end 25 | 26 | 27 | -- set parameters 28 | local batch = 3 29 | local nInputPlanes = 4 30 | local nOutputPlanes = 5 31 | local iH = 5 32 | local iW = 5 33 | local kW = 3 34 | local kH = 3 35 | local use_cuda = false 36 | 37 | 38 | -- pick an input 39 | local input = torch.randn(batch, nInputPlanes, iH, iW) 40 | 41 | -- get rank-1 filters 42 | local z = torch.randn(nOutputPlanes, nInputPlanes) -- over feature 43 | local y = torch.randn(nOutputPlanes, kH) -- in vertical 44 | local x = torch.randn(nOutputPlanes, kW) -- in horizontal 45 | local b = torch.randn(nOutputPlanes) -- bias 46 | 47 | -- reconstruct 3d filter 48 | local zyx = compose_filter(z, y, x) 49 | 50 | 51 | -- define models 52 | local model_full = nn.Sequential() 53 | model_full:add(nn.SpatialConvolutionMM(nInputPlanes, nOutputPlanes, kW, kH)) 54 | 55 | local model_low = nn.Sequential() 56 | model_low:add(nn.LateralConvolution(nInputPlanes, nOutputPlanes)) 57 | model_low:add(nn.VerticalConvolution(nOutputPlanes, nOutputPlanes, kH)) 58 | model_low:add(nn.HorizontalConvolution(nOutputPlanes, nOutputPlanes, kW)) 59 | 60 | 61 | -- overwrite parameters 62 | model_full.modules[1].weight:copy(zyx) 63 | model_full.modules[1].bias:copy(b) 64 | 65 | model_low.modules[1].weight:copy(z) 66 | model_low.modules[2].weight:copy(y) 67 | model_low.modules[3].weight:copy(x) 68 | model_low.modules[1].bias:zero() 69 | model_low.modules[2].bias:zero() 70 | model_low.modules[3].bias:copy(b) 71 | 72 | 73 | -- enable GPU 74 | if use_cuda then 75 | require('cunnconv1d') 76 | model_full = model_full:cuda() 77 | model_low = model_low:cuda() 78 | input = input:cuda() 79 | end 80 | 81 | 82 | -- test 83 | local output_full = model_full:updateOutput(input) 84 | local output_low = model_low:updateOutput(input) 85 | check_error('output ', output_full, output_low) 86 | 87 | local gradOutput_full = output_full:clone():add(0.1) 88 | local gradOutput_low = output_low:clone():add(0.1) 89 | local gradInput_full = model_full:updateGradInput(input, gradOutput_full) 90 | local gradInput_low = model_low:updateGradInput(input, gradOutput_low) 91 | check_error('gradInput', gradInput_full, gradInput_low) 92 | 93 | model_full:zeroGradParameters() 94 | model_low:zeroGradParameters() 95 | model_full:accGradParameters(input, gradOutput_full, 1) 96 | model_low:accGradParameters(input, gradOutput_low, 1) 97 | local w_full, dw_full = model_full:getParameters() 98 | local w_low, dw_low = model_low:getParameters() 99 | -------------------------------------------------------------------------------- /nn-conv1d/generic/LateralConvolution.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/LateralConvolution.c" 3 | #else 4 | 5 | 6 | static int nnconv1d_(LateralConvolution_updateOutput)(lua_State *L) 7 | { 8 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 9 | 10 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 11 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 12 | 13 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 14 | THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", 
torch_Tensor); 15 | THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); 16 | 17 | luaL_argcheck(L, input->nDimension == 3 || 18 | input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); 19 | 20 | // change to batch mode 21 | int batch = 1; 22 | if (input->nDimension == 3) { 23 | batch = 0; 24 | THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); 25 | } 26 | 27 | long batchSize = input->size[0]; 28 | long inputHeight = input->size[2]; 29 | long inputWidth = input->size[3]; 30 | long outputHeight = inputHeight; 31 | long outputWidth = inputWidth; 32 | 33 | THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); 34 | 35 | int elt; 36 | #pragma omp parallel for private(elt) 37 | for (elt = 0; elt < batchSize; elt++) { 38 | 39 | // select each batch in 2D 40 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 41 | THTensor *output_t = THTensor_(newSelect)(output, 0, elt); 42 | THTensor *input2d = THTensor_(newWithStorage2d) 43 | (input_t->storage, input_t->storageOffset, 44 | nInputPlane, -1, inputHeight*inputWidth, -1); 45 | THTensor *output2d = THTensor_(newWithStorage2d) 46 | (output_t->storage, output_t->storageOffset, 47 | nOutputPlane, -1, outputHeight*outputWidth, -1); 48 | 49 | // fill biases 50 | int i; 51 | for (i = 0; i < nOutputPlane; i++) 52 | THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, 53 | THTensor_(get1d)(bias, i), outputHeight*outputWidth); 54 | 55 | // convolve 56 | THTensor_(addmm)(output2d, 1, output2d, 1, weight, input2d); 57 | 58 | // release temp tensors 59 | THTensor_(free)(input2d); 60 | THTensor_(free)(output2d); 61 | THTensor_(free)(input_t); 62 | THTensor_(free)(output_t); 63 | } 64 | 65 | // revert to single batch 66 | if (batch == 0) { 67 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 68 | THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); 69 | } 70 | 71 | return 1; 72 | } 73 | 74 | 75 | static int nnconv1d_(LateralConvolution_updateGradInput)(lua_State *L) 76 | { 77 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 78 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 79 | 80 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 81 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 82 | 83 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 84 | THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); 85 | 86 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 87 | "Number of output features is not equal to nOutputPlane" ); 88 | 89 | // change to batch mode 90 | int batch = 1; 91 | if (input->nDimension == 3) { 92 | batch = 0; 93 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 94 | THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); 95 | } 96 | 97 | long batchSize = input->size[0]; 98 | long inputWidth = input->size[3]; 99 | long inputHeight = input->size[2]; 100 | long outputWidth = inputWidth; 101 | long outputHeight = inputHeight; 102 | 103 | THTensor_(resizeAs)(gradInput, input); 104 | THTensor_(transpose)(weight, weight, 0, 1); 105 | 106 | int elt; 107 | #pragma omp parallel for private(elt) 108 | for (elt = 0; elt < batchSize; elt++) { 109 | 110 | // select each batch in 2D 111 | THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); 112 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 113 | THTensor *gradInput2d = THTensor_(newWithStorage2d) 114 | (gradInput_t->storage, gradInput_t->storageOffset, 115 | nInputPlane, -1, inputWidth*inputHeight, -1); 116 | THTensor *gradOutput2d = THTensor_(newWithStorage2d) 117 | (gradOutput_t->storage, gradOutput_t->storageOffset, 118 | nOutputPlane, -1, outputWidth*outputHeight, -1); 119 | 120 | // convolve 121 | THTensor_(addmm)(gradInput2d, 0, gradInput2d, 1, weight, gradOutput2d); 122 | 123 | // release temp tensors 124 | THTensor_(free)(gradInput2d); 125 | THTensor_(free)(gradOutput2d); 126 | THTensor_(free)(gradInput_t); 127 | THTensor_(free)(gradOutput_t); 128 | } 129 | 130 | THTensor_(transpose)(weight, weight, 0, 1); 131 | 132 | // revert to single batch 133 | if (batch == 0) { 134 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 135 | THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); 136 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 137 | } 138 | 139 | return 1; 140 | } 141 | 142 | 143 | static int nnconv1d_(LateralConvolution_accGradParameters)(lua_State *L) 144 | { 145 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 146 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 147 | real scale = luaL_optnumber(L, 4, 1); 148 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 149 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 150 | 151 | THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); 152 | THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); 153 | THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); 154 | 155 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 156 | "Number of output features is not equal to nOutputPlane" ); 157 | 158 | // change to batch mode 159 | int batch = 1; 160 | if (input->nDimension == 3) { 161 | batch = 0; 162 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 163 | THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 164 | } 165 | 166 | long batchSize = input->size[0]; 167 | long inputWidth = input->size[3]; 168 | long inputHeight = input->size[2]; 169 | long outputWidth = inputWidth; 170 | long outputHeight = inputHeight; 171 | 172 | if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { 173 | THTensor_(resize1d)(ones, outputHeight*outputWidth); 174 | THTensor_(fill)(ones, 1); 175 | } 176 | 177 | int elt; 178 | for (elt = 0; elt < batchSize; elt++) { 179 | 180 | // select each batch in 2D (the 2D views must use the selected element's storage offset) 181 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 182 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 183 | THTensor *input2d = THTensor_(newWithStorage2d) 184 | (input_t->storage, input_t->storageOffset, 185 | nInputPlane, -1, inputWidth*inputHeight, -1); 186 | THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput_t->storage, gradOutput_t->storageOffset, 187 | nOutputPlane, -1, outputWidth*outputHeight, -1); 188 | 189 | // convolve 190 | THTensor_(transpose)(input2d, input2d, 0, 1); 191 | THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, input2d); 192 | THTensor_(transpose)(input2d, input2d, 0, 1); 193 | 194 | // fill biases 195 | THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); 196 | 197 | THTensor_(free)(input2d); 198 | THTensor_(free)(gradOutput2d); 199 | THTensor_(free)(input_t); 200 | THTensor_(free)(gradOutput_t); 201 | } 202 | 203 | // revert to single batch 204 | if (batch == 0) { 205 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 206 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 207 | } 208 | 209 | return 0; 210 | } 211 | 212 | static const struct luaL_Reg nnconv1d_(LateralConvolution__) [] = { 213 | {"LateralConvolution_updateOutput", nnconv1d_(LateralConvolution_updateOutput)}, 214 | {"LateralConvolution_updateGradInput", nnconv1d_(LateralConvolution_updateGradInput)}, 215 | {"LateralConvolution_accGradParameters", nnconv1d_(LateralConvolution_accGradParameters)}, 216 | {NULL, NULL} 217 | }; 218 | 219 | static void nnconv1d_(LateralConvolution_init)(lua_State *L) 220 | { 221 | luaT_pushmetatable(L, torch_Tensor); 222 | luaT_registeratname(L, nnconv1d_(LateralConvolution__), "nn"); 223 | lua_pop(L,1); 224 | } 225 | 226 | #endif 227 | -------------------------------------------------------------------------------- /nn-conv1d/generic/VerticalConvolution.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/VerticalConvolution.c" 3 | #else 4 | 5 | 6 | static int nnconv1d_(VerticalConvolution_updateOutput)(lua_State *L) 7 | { 8 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 9 | 10 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 11 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 12 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 13 | 14 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 15 | THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); 16 | THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); 17 | 
18 | luaL_argcheck(L, input->nDimension == 3 || 19 | input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); 20 | 21 | // change to batch mode 22 | int batch = 1; 23 | if (input->nDimension == 3) { 24 | batch = 0; 25 | THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); 26 | } 27 | 28 | long batchSize = input->size[0]; 29 | long inputHeight = input->size[2]; 30 | long inputWidth = input->size[3]; 31 | long outputHeight = inputHeight - kL + 1; 32 | long outputWidth = inputWidth; 33 | 34 | THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); 35 | 36 | int elt; 37 | #pragma omp parallel for private(elt) 38 | for (elt = 0; elt < batchSize; elt++) { 39 | 40 | // select each batch 41 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 42 | THTensor *output_t = THTensor_(newSelect)(output, 0, elt); 43 | 44 | // fill biases 45 | int i, j, k; 46 | for (i = 0; i < nOutputPlane; i++) { 47 | THVector_(fill)(output_t->storage->data + output_t->storageOffset + output_t->stride[0]*i, 48 | THTensor_(get1d)(bias, i), outputHeight*outputWidth); 49 | } 50 | 51 | // convolve vertically 52 | for (i = 0; i < nInputPlane; i++) { 53 | for (k = 0; k < kL; k++) { 54 | THVector_(add)(output_t->storage->data + output_t->storageOffset + output_t->stride[0]*i, 55 | input_t->storage->data + input_t->storageOffset + 56 | input_t->stride[0]*i + input_t->stride[1]*k, 57 | *(THTensor_(data)(weight)+i*kL+k), outputHeight*outputWidth); 58 | } 59 | } 60 | 61 | // release temp tensors 62 | THTensor_(free)(input_t); 63 | THTensor_(free)(output_t); 64 | } 65 | 66 | // revert to single batch 67 | if (batch == 0) { 68 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 69 | THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); 70 | } 71 | 72 | return 1; 73 | } 74 | 75 | 76 | static int nnconv1d_(VerticalConvolution_updateGradInput)(lua_State *L) 77 | { 78 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 79 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 80 | 81 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 82 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 83 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 84 | 85 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 86 | THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); 87 | 88 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 89 | "Number of output features is not equal to nOutputPlane" ); 90 | 91 | // change to batch mode 92 | int batch = 1; 93 | if (input->nDimension == 3) { 94 | batch = 0; 95 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 96 | THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); 97 | } 98 | 99 | long batchSize = input->size[0]; 100 | long inputHeight = input->size[2]; 101 | long inputWidth = input->size[3]; 102 | long outputHeight = inputHeight - kL + 1; 103 | long outputWidth = inputWidth; 104 | 105 | THTensor_(resizeAs)(gradInput, input); 106 | THTensor_(zero)(gradInput); 107 | 108 | int elt; 109 | #pragma omp parallel for private(elt) 110 | for (elt = 0; elt < batchSize; elt++) { 111 | 112 | // select each batch 113 | THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); 114 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 115 | 116 | // convolve vertically 117 | int i, k; 118 | for (i = 0; i < nOutputPlane; i++) { 119 | for (k = 0; k < kL; k++) { 120 | THVector_(add)(gradInput_t->storage->data + gradInput_t->storageOffset + 121 | gradInput_t->stride[0]*i + gradInput_t->stride[1]*k, 122 | gradOutput_t->storage->data + gradOutput_t->storageOffset + 123 | gradOutput_t->stride[0]*i, 124 | *(THTensor_(data)(weight)+i*kL+k), outputHeight*outputWidth); 125 | } 126 | } 127 | 128 | // release temp tensors 129 | THTensor_(free)(gradInput_t); 130 | THTensor_(free)(gradOutput_t); 131 | } 132 | 133 | // revert to single batch 134 | if (batch == 0) { 135 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 136 | THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); 137 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 138 | } 139 | 140 | return 1; 141 | } 142 | 143 | 144 | static int nnconv1d_(VerticalConvolution_accGradParameters)(lua_State *L) 145 | { 146 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 147 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 148 | real scale = luaL_optnumber(L, 4, 1); 149 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 150 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 151 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 152 | 153 | THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); 154 | THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); 155 | THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); 156 | 157 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 158 | "Number of output features is not equal to nOutputPlane" ); 159 | 160 | // change to batch mode 161 | int batch = 1; 162 | if (input->nDimension == 3) { 163 | batch = 0; 164 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 165 | THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 166 | } 167 | 168 | long batchSize = input->size[0]; 169 | long inputHeight = input->size[2]; 170 | long inputWidth = input->size[3]; 171 | long outputHeight = inputHeight - kL + 1; 172 | long outputWidth = inputWidth; 173 | 174 | if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { 175 | THTensor_(resize1d)(ones, outputHeight*outputWidth); 176 | THTensor_(fill)(ones, 1); 177 | } 178 | 179 | int elt; 180 | for (elt = 0; elt < batchSize; elt++) { 181 | 182 | // select each batch in 2D (the 2D view must use the selected element's storage offset) 183 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 184 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 185 | THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput_t->storage, gradOutput_t->storageOffset, 186 | nOutputPlane, -1, outputWidth*outputHeight, -1); 187 | 188 | // dot products 189 | int i, k; 190 | for (i = 0; i < nInputPlane; i++) { 191 | for (k = 0; k < kL; k++) { 192 | *(gradWeight->storage->data + gradWeight->storageOffset + i*gradWeight->stride[0] + k) += 193 | scale*THBlas_(dot) 194 | (outputHeight*outputWidth, 195 | gradOutput_t->storage->data + gradOutput_t->storageOffset + i*gradOutput_t->stride[0], 196 | gradOutput_t->stride[2], 197 | input_t->storage->data + input_t->storageOffset + i*input_t->stride[0] + k*outputWidth, 198 | input_t->stride[2]); 199 | } 200 | } 201 | 202 | // fill biases 203 | THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); 204 | 205 | THTensor_(free)(gradOutput2d); 206 | THTensor_(free)(input_t); 207 | THTensor_(free)(gradOutput_t); 208 | } 209 | 210 | // revert to single batch 211 | if (batch == 0) { 212 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 213 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 214 | } 215 | 216 | return 0; 217 | } 218 | 219 | 220 | static const struct luaL_Reg nnconv1d_(VerticalConvolution__) [] = { 221 | {"VerticalConvolution_updateOutput", nnconv1d_(VerticalConvolution_updateOutput)}, 222 | {"VerticalConvolution_updateGradInput", nnconv1d_(VerticalConvolution_updateGradInput)}, 223 | {"VerticalConvolution_accGradParameters", nnconv1d_(VerticalConvolution_accGradParameters)}, 224 | {NULL, NULL} 225 | }; 226 | 227 | 228 | static void nnconv1d_(VerticalConvolution_init)(lua_State *L) 229 | { 230 | luaT_pushmetatable(L, torch_Tensor); 231 | luaT_registeratname(L, nnconv1d_(VerticalConvolution__), "nn"); 232 | lua_pop(L,1); 233 | } 234 | 235 | #endif 236 | -------------------------------------------------------------------------------- /nn-conv1d/generic/HorizontalConvolution.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/HorizontalConvolution.c" 3 | #else 4 | 5 | 6 | static int nnconv1d_(HorizontalConvolution_updateOutput)(lua_State *L) 7 | { 8 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 9 | 10 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 11 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 12 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 13 | 14 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", 
torch_Tensor); 15 | THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); 16 | THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); 17 | 18 | luaL_argcheck(L, input->nDimension == 3 || 19 | input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); 20 | 21 | // change to batch mode 22 | int batch = 1; 23 | if (input->nDimension == 3) { 24 | batch = 0; 25 | THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); 26 | } 27 | 28 | long batchSize = input->size[0]; 29 | long inputHeight = input->size[2]; 30 | long inputWidth = input->size[3]; 31 | long outputHeight = inputHeight; 32 | long outputWidth = inputWidth - kL + 1; 33 | 34 | THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); 35 | 36 | int elt; 37 | #pragma omp parallel for private(elt) 38 | for (elt = 0; elt < batchSize; elt++) { 39 | 40 | // select each batch 41 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 42 | THTensor *output_t = THTensor_(newSelect)(output, 0, elt); 43 | 44 | // fill biases 45 | int i, j, k; 46 | for (i = 0; i < nOutputPlane; i++) { 47 | THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, 48 | THTensor_(get1d)(bias, i), outputHeight*outputWidth); 49 | } 50 | 51 | // convolve horizontally 52 | for (i = 0; i < nInputPlane; i++) { 53 | for (j = 0; j < inputHeight; j++) { 54 | for (k = 0; k < kL; k++) { 55 | THVector_(add)(output_t->storage->data + output_t->storageOffset + 56 | output_t->stride[0]*i + output_t->stride[1]*j, 57 | input_t->storage->data + input_t->storageOffset + 58 | input_t->stride[0]*i + input_t->stride[1]*j + k, 59 | *(THTensor_(data)(weight)+i*kL+k), outputWidth); 60 | } 61 | } 62 | } 63 | 64 | // release temp tensors 65 | THTensor_(free)(input_t); 66 | THTensor_(free)(output_t); 67 | } 68 | 69 | // revert to single batch 70 | if (batch == 0) { 71 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 72 | THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); 73 | } 74 | 75 | return 1; 76 | } 77 | 78 | 79 | static int nnconv1d_(HorizontalConvolution_updateGradInput)(lua_State *L) 80 | { 81 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 82 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 83 | 84 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 85 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 86 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 87 | 88 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 89 | THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); 90 | 91 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 92 | "Number of output features is not equal to nOutputPlane" ); 93 | 94 | // change to batch mode 95 | int batch = 1; 96 | if (input->nDimension == 3) { 97 | batch = 0; 98 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 99 | THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); 100 | } 101 | 102 | long batchSize = input->size[0]; 103 | long inputHeight = input->size[2]; 104 | long inputWidth = input->size[3]; 105 | long outputHeight = inputHeight; 106 | long outputWidth = inputWidth - kL + 1; 107 | 108 | THTensor_(resizeAs)(gradInput, input); 109 | THTensor_(zero)(gradInput); 110 | 111 | int elt; 112 | #pragma omp parallel for private(elt) 113 | for (elt = 0; elt < batchSize; elt++) { 114 | 115 | // select each batch 116 | THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); 117 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 118 | 119 | // convolve horizontally 120 | int i, j, k; 121 | for (i = 0; i < nOutputPlane; i++) { 122 | for (j = 0; j < outputHeight; j++) { 123 | for (k = 0; k < kL; k++) { 124 | THVector_(add)(gradInput_t->storage->data + gradInput_t->storageOffset + 125 | gradInput_t->stride[0]*i + gradInput_t->stride[1]*j + k, 126 | gradOutput_t->storage->data + gradOutput_t->storageOffset + 127 | gradOutput_t->stride[0]*i + gradOutput_t->stride[1]*j, 128 | *(THTensor_(data)(weight)+i*kL+k), outputWidth); // needs to change 129 | } 130 | } 131 | } 132 | 133 | // release temp tensors 134 | THTensor_(free)(gradInput_t); 135 | THTensor_(free)(gradOutput_t); 136 | } 137 | 138 | // revert to single batch 139 | if (batch == 0) { 140 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 141 | THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); 142 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 143 | } 144 | 145 | return 1; 146 | } 147 | 148 | 149 | static int nnconv1d_(HorizontalConvolution_accGradParameters)(lua_State *L) 150 | { 151 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 152 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 153 | real scale = luaL_optnumber(L, 4, 1); 154 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 155 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 156 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 157 | 158 | THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); 159 | THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); 160 | THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); 161 | 162 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 163 | "Number of output features is not equal to nOutputPlane" ); 164 | 165 | // change to batch mode 166 | int batch = 1; 167 | if (input->nDimension == 3) { 168 | batch = 0; 169 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 170 | THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 171 | } 172 | 173 | long batchSize = input->size[0]; 174 | long inputHeight = input->size[2]; 175 | long inputWidth = input->size[3]; 176 | long outputHeight = inputHeight; 177 | long outputWidth = inputWidth - kL + 1; 178 | 179 | if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { 180 | THTensor_(resize1d)(ones, outputHeight*outputWidth); 181 | THTensor_(fill)(ones, 1); 182 | } 183 | 184 | int elt; 185 | for (elt = 0; elt < batchSize; elt++) { 186 | 187 | // select each batch in 2D (the 2D view must use the selected element's storage offset) 188 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 189 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 190 | THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput_t->storage, gradOutput_t->storageOffset, 191 | nOutputPlane, -1, outputWidth*outputHeight, -1); 192 | 193 | // dot products 194 | int i, j, k; 195 | for (i = 0; i < nInputPlane; i++) { 196 | for (k = 0; k < kL; k++) { 197 | for (j = 0; j < outputHeight; j++) { 198 | *(gradWeight->storage->data + gradWeight->storageOffset + i*gradWeight->stride[0] + k) += 199 | scale*THBlas_(dot) 200 | (outputWidth, 201 | gradOutput_t->storage->data + gradOutput_t->storageOffset + 202 | i*gradOutput_t->stride[0] + j*gradOutput_t->stride[1], 203 | gradOutput_t->stride[2], 204 | input_t->storage->data + input_t->storageOffset + 205 | i*input_t->stride[0] + j*input_t->stride[1] + k, 206 | input_t->stride[2]); 207 | } 208 | } 209 | } 210 | 211 | // fill biases 212 | THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); 213 | 214 | THTensor_(free)(gradOutput2d); 215 | THTensor_(free)(input_t); 216 | THTensor_(free)(gradOutput_t); 217 | } 218 | 219 | // revert to single batch 220 | if (batch == 0) { 221 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 222 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 223 | } 224 | 225 | return 0; 226 | } 227 | 228 | 229 | static const struct luaL_Reg nnconv1d_(HorizontalConvolution__) [] = { 230 | {"HorizontalConvolution_updateOutput", nnconv1d_(HorizontalConvolution_updateOutput)}, 231 | {"HorizontalConvolution_updateGradInput", nnconv1d_(HorizontalConvolution_updateGradInput)}, 232 | {"HorizontalConvolution_accGradParameters", nnconv1d_(HorizontalConvolution_accGradParameters)}, 233 | {NULL, NULL} 234 | }; 235 | 236 | 237 | static void nnconv1d_(HorizontalConvolution_init)(lua_State *L) 238 | { 239 | luaT_pushmetatable(L, torch_Tensor); 240 | luaT_registeratname(L, nnconv1d_(HorizontalConvolution__), "nn"); 241 | lua_pop(L,1); 242 | } 243 | 244 | #endif 245 | -------------------------------------------------------------------------------- /cunn-conv1d/LateralConvolution.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | 4 | static int cunnconv1d_LateralConvolution_updateOutput(lua_State *L) { 5 | THCState *state = getCutorchState(L); 6 | THCudaTensor *input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 7 | 8 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 9 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 10 | 11 | THCudaTensor *weight = 
(THCudaTensor*)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 12 | THCudaTensor *bias = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 13 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 14 | THCudaTensor *output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"); 15 | 16 | const int device = THCudaTensor_getDevice(state, weight); 17 | luaL_argcheck(L, THCudaTensor_getDevice(state, bias) == device, 1, 18 | "weight and bias need to be on the same device"); 19 | luaL_argcheck(L, THCudaTensor_getDevice(state, output) == device || 20 | THCudaTensor_getDevice(state, output) == -1, 1, 21 | "weight and output need to be on the same device"); 22 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 23 | "weight and input need to be on the same device"); 24 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 25 | "3D or 4D (batch mode) tensor is expected"); 26 | 27 | // change to batch mode 28 | int batch = 1; 29 | if (input->nDimension == 3) { 30 | luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); 31 | batch = 0; 32 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 33 | } else { 34 | luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); 35 | } 36 | 37 | long batchSize = input->size[0]; 38 | long inputHeight = input->size[2]; 39 | long inputWidth = input->size[3]; 40 | long outputHeight = inputHeight; 41 | long outputWidth = inputWidth; 42 | 43 | THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); 44 | 45 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 46 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 47 | THCudaTensor_fill(state, ones, 1); 48 | } 49 | 50 | THCudaTensor *input_n = THCudaTensor_new(state); 51 | THCudaTensor *output_n = THCudaTensor_new(state); 52 | 53 | for (int elt = 0; elt < batchSize; elt ++) { 54 | 55 | // select each batch 56 | THCudaTensor_select(state, input_n, input, 0, elt); 57 | THCudaTensor_select(state, output_n, output, 0, elt); 58 | 59 | // fill biases 60 | THCudaBlas_gemm( 61 | state, 't', 'n', 62 | outputHeight*outputWidth, nOutputPlane, 1, 63 | 1, 64 | THCudaTensor_data(state, ones), 1, 65 | THCudaTensor_data(state, bias), 1, 66 | 0, 67 | THCudaTensor_data(state, output_n), outputHeight*outputWidth 68 | ); 69 | 70 | // convolve 71 | THCudaBlas_gemm( 72 | state, 73 | 'n', 'n', 74 | outputHeight*outputWidth, nOutputPlane, nInputPlane, 75 | 1, 76 | THCudaTensor_data(state, input_n), outputHeight*outputWidth, 77 | THCudaTensor_data(state, weight), nInputPlane, 78 | 1, 79 | THCudaTensor_data(state, output_n), outputHeight*outputWidth 80 | ); 81 | } 82 | 83 | THCudaTensor_free(state, input_n); 84 | THCudaTensor_free(state, output_n); 85 | 86 | // revert to single batch 87 | if (batch == 0) { 88 | THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); 89 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 90 | } 91 | 92 | return 1; 93 | } 94 | 95 | 96 | static int cunnconv1d_LateralConvolution_updateGradInput(lua_State *L) { 97 | THCState *state = getCutorchState(L); 98 | THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor"); 99 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 100 | 101 | 
int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 102 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 103 | 104 | THCudaTensor *weight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 105 | THCudaTensor *gradInput = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor"); 106 | 107 | const int device = THCudaTensor_getDevice(state, weight); 108 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 109 | "weight and input need to be on the same device"); 110 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradInput) == device 111 | || THCudaTensor_getDevice(state, gradInput) == -1, 2, 112 | "weight and gradInput need to be on the same device"); 113 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device 114 | || THCudaTensor_getDevice(state, gradOutput) == -1, 2, 115 | "weight and gradOutput need to be on the same device"); 116 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 117 | "3D or 4D (batch mode) tensor is expected"); 118 | 119 | int batch = 1; 120 | if (input->nDimension == 3) { 121 | batch = 0; 122 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 123 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 124 | } 125 | 126 | long batchSize = input->size[0]; 127 | long inputHeight = input->size[2]; 128 | long inputWidth = input->size[3]; 129 | long outputHeight = inputHeight; 130 | long outputWidth = inputWidth; 131 | 132 | THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); 133 | 134 | THCudaTensor *gradInput_n = THCudaTensor_new(state); 135 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 136 | 137 | for (int elt = 0; elt < batchSize; elt ++) { 138 | 139 | // select each batch in 2D 140 | THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); 141 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 142 | 143 | // convolve 144 | THCudaBlas_gemm( 145 | state, 146 | 'n', 't', 147 | outputHeight*outputWidth, nInputPlane, nOutputPlane, 148 | 1, 149 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 150 | THCudaTensor_data(state, weight), nInputPlane, 151 | 0, 152 | THCudaTensor_data(state, gradInput_n), outputHeight*outputWidth 153 | ); 154 | } 155 | 156 | THCudaTensor_free(state, gradInput_n); 157 | THCudaTensor_free(state, gradOutput_n); 158 | 159 | // revert to single batch 160 | if (batch == 0) { 161 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 162 | THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); 163 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 164 | } 165 | 166 | return 1; 167 | } 168 | 169 | 170 | static int cunnconv1d_LateralConvolution_accGradParameters(lua_State *L) { 171 | THCState *state = getCutorchState(L); 172 | THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor"); 173 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 174 | 175 | float scale = luaL_optnumber(L, 4, 1); 176 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 177 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 178 | 179 | THCudaTensor *gradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", "torch.CudaTensor"); 180 | THCudaTensor *gradBias = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, 
"gradBias", "torch.CudaTensor"); 181 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 182 | 183 | const int device = THCudaTensor_getDevice(state, gradWeight); 184 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradBias) == device, 1, 185 | "gradWeight and gradBias need to be on the same device"); 186 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 1, 187 | "gradWeight and input need to be on the same device"); 188 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device, 1, 189 | "gradWeight and gradOutput need to be on the same device"); 190 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 191 | "3D or 4D (batch mode) tensor is expected"); 192 | 193 | // change to batch mode 194 | int batch = 1; 195 | if (input->nDimension == 3) { 196 | batch = 0; 197 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 198 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 199 | } 200 | 201 | long batchSize = input->size[0]; 202 | long inputHeight = input->size[2]; 203 | long inputWidth = input->size[3]; 204 | long outputHeight = inputHeight; 205 | long outputWidth = inputWidth; 206 | 207 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 208 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 209 | THCudaTensor_fill(state, ones, 1); 210 | } 211 | 212 | THCudaTensor *input_n = THCudaTensor_new(state); 213 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 214 | 215 | for (int elt = 0; elt < batchSize; elt ++) { 216 | 217 | // select each batch 218 | THCudaTensor_select(state, input_n, input, 0, elt); 219 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 220 | 221 | // convolve 222 | THCudaBlas_gemm( 223 | state, 224 | 't', 'n', 225 | nInputPlane, nOutputPlane, outputHeight*outputWidth, 226 | scale, 227 | THCudaTensor_data(state, input_n), inputHeight*inputWidth, 228 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 229 | 1, 230 | THCudaTensor_data(state, gradWeight), nInputPlane 231 | ); 232 | 233 | // fill biases 234 | THCudaBlas_gemv( 235 | state, 236 | 't', 237 | outputHeight*outputWidth, nOutputPlane, 238 | scale, 239 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 240 | THCudaTensor_data(state, ones), 1, 241 | 1, 242 | THCudaTensor_data(state, gradBias), 1 243 | ); 244 | } 245 | 246 | THCudaTensor_free(state, input_n); 247 | THCudaTensor_free(state, gradOutput_n); 248 | 249 | if (batch == 0) { 250 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 251 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 252 | } 253 | 254 | return 0; 255 | } 256 | 257 | 258 | static const struct luaL_Reg cunnconv1d_LateralConvolution__ [] = { 259 | {"LateralConvolution_updateOutput", cunnconv1d_LateralConvolution_updateOutput}, 260 | {"LateralConvolution_updateGradInput", cunnconv1d_LateralConvolution_updateGradInput}, 261 | {"LateralConvolution_accGradParameters", cunnconv1d_LateralConvolution_accGradParameters}, 262 | {NULL, NULL} 263 | }; 264 | 265 | 266 | void cunnconv1d_LateralConvolution_init(lua_State *L) 267 | { 268 | luaT_pushmetatable(L, "torch.CudaTensor"); 269 | luaT_registeratname(L, cunnconv1d_LateralConvolution__, "nn"); 270 | lua_pop(L,1); 271 | } 272 | -------------------------------------------------------------------------------- 
/cunn-conv1d/VerticalConvolution.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "common.h" 3 | 4 | // Kernel for fast unfold+copy 5 | // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) 6 | __global__ void im2col_kernel_v(const int n, const float* data_im, 7 | const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, 8 | const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, 9 | float* data_col) { 10 | CUDA_KERNEL_LOOP(index, n) { 11 | int w_out = index % width_col; 12 | index /= width_col; 13 | int h_out = index % height_col; 14 | int channel_in = index / height_col; 15 | int channel_out = channel_in * ksize_h * ksize_w; 16 | int h_in = h_out * stride_h - pad_h; 17 | int w_in = w_out * stride_w - pad_w; 18 | data_col += (channel_out * height_col + h_out) * width_col + w_out; 19 | data_im += (channel_in * height + h_in) * width + w_in; 20 | for (int i = 0; i < ksize_h; ++i) { 21 | for (int j = 0; j < ksize_w; ++j) { 22 | int h = h_in + i; 23 | int w = w_in + j; 24 | *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 25 | data_im[i * width + j] : 0; 26 | data_col += height_col * width_col; 27 | } 28 | } 29 | } 30 | } 31 | 32 | 33 | __global__ void conv_vertical_naive_output(const int n, float *y, 34 | const float *x, const float *w, 35 | const int iH, const int iW, const int kL) 36 | { 37 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 38 | int oH = iH - kL + 1; 39 | int x_offset = (i/(oH*iW))*iH*iW + i%(oH*iW); 40 | int w_offset = (i/(oH*iW))*kL; 41 | 42 | for (int k = 0; k < kL; k++) { 43 | y[i] += w[w_offset + k]*x[x_offset + k*iW]; 44 | } 45 | } 46 | } 47 | 48 | 49 | __global__ void conv_vertical_naive_gradInput(const int n, float *dx, 50 | const float *dy, const float *w, 51 | const int oH, const int oW, const int kL) 52 | { 53 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 54 | int iH = oH + kL - 1; 55 | int iC = i/(iH*oW); 56 | int row = (i%(iH*oW))/oW; 57 | int dy_offset = iC*oH*oW + i%(iH*oW); 58 | int w_offset = iC*kL; 59 | 60 | int k_begin = max(0, row-oH+1); 61 | int k_end = min(kL, row+1); 62 | 63 | dx[i] = 0.0f; 64 | for (int k = k_begin; k < k_end; k++) { 65 | dx[i] += w[w_offset + k]*dy[dy_offset - k*oW]; 66 | } 67 | } 68 | } 69 | 70 | 71 | __global__ void conv_vertical_naive_gradParam(const int n, float *dw, 72 | const float *x, const float *dy, 73 | const int kL, const int oH, const int oW) 74 | { 75 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 76 | int dy_offset = (i/kL)*oH*oW; 77 | int x_offset = (i/kL)*oH*oW + (i%kL)*oW; 78 | 79 | for (int k = 0; k < oH*oW; k++) { 80 | dw[i] += dy[dy_offset + k]*x[x_offset + k]; 81 | } 82 | } 83 | } 84 | 85 | 86 | __global__ void conv_vertical_naive_gradWeight(const int n, float *y, 87 | const float *x, const int kL, 88 | const int iC) 89 | { 90 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 91 | y[i] = x[(i/kL)*kL*iC + i]; 92 | } 93 | } 94 | 95 | 96 | static int cunnconv1d_VerticalConvolution_updateOutput(lua_State *L) { 97 | THCState *state = getCutorchState(L); 98 | THCudaTensor *input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 99 | 100 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 101 | int nOutputPlane = luaT_getfieldcheckint(L, 1, 
"nOutputPlane"); 102 | 103 | THCudaTensor *weight = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 104 | THCudaTensor *bias = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 105 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 106 | THCudaTensor *output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"); 107 | 108 | const int device = THCudaTensor_getDevice(state, weight); 109 | luaL_argcheck(L, THCudaTensor_getDevice(state, bias) == device, 1, 110 | "weight and bias need to be on the same device"); 111 | luaL_argcheck(L, THCudaTensor_getDevice(state, output) == device || 112 | THCudaTensor_getDevice(state, output) == -1, 1, 113 | "weight and output need to be on the same device"); 114 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 115 | "weight and input need to be on the same device"); 116 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 117 | "3D or 4D (batch mode) tensor is expected"); 118 | 119 | // change to batch mode 120 | int batch = 1; 121 | if (input->nDimension == 3) { 122 | luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); 123 | batch = 0; 124 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 125 | } else { 126 | luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); 127 | } 128 | 129 | long batchSize = input->size[0]; 130 | long inputHeight = input->size[2]; 131 | long inputWidth = input->size[3]; 132 | long outputHeight = inputHeight - weight->size[1] + 1; 133 | long outputWidth = inputWidth; 134 | 135 | THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); 136 | 137 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 138 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 139 | THCudaTensor_fill(state, ones, 1); 140 | } 141 | 142 | THCudaTensor *input_n = THCudaTensor_new(state); 143 | THCudaTensor *output_n = THCudaTensor_new(state); 144 | 145 | for (int elt = 0; elt < batchSize; elt ++) { 146 | 147 | // select each batch 148 | THCudaTensor_select(state, input_n, input, 0, elt); 149 | THCudaTensor_select(state, output_n, output, 0, elt); 150 | 151 | // fill biases 152 | THCudaBlas_gemm( 153 | state, 't', 'n', 154 | outputHeight*outputWidth, nOutputPlane, 1, 155 | 1, 156 | THCudaTensor_data(state, ones), 1, 157 | THCudaTensor_data(state, bias), 1, 158 | 0, 159 | THCudaTensor_data(state, output_n), outputHeight*outputWidth 160 | ); 161 | 162 | // convolve 163 | long num_threads = nOutputPlane*outputHeight*outputWidth; 164 | conv_vertical_naive_output <<>> 165 | (num_threads, 166 | THCudaTensor_data(state, output_n), 167 | THCudaTensor_data(state, input_n), 168 | THCudaTensor_data(state, weight), 169 | inputHeight, inputWidth, weight->size[1]); 170 | } 171 | 172 | THCudaTensor_free(state, input_n); 173 | THCudaTensor_free(state, output_n); 174 | 175 | // revert to single batch 176 | if (batch == 0) { 177 | THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); 178 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 179 | } 180 | 181 | return 1; 182 | } 183 | 184 | 185 | static int cunnconv1d_VerticalConvolution_updateGradInput(lua_State *L) { 186 | THCState *state = getCutorchState(L); 187 | THCudaTensor *input = (THCudaTensor 
*)luaT_checkudata(L, 2, "torch.CudaTensor"); 188 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 189 | 190 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 191 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 192 | 193 | THCudaTensor *weight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 194 | THCudaTensor *gradInput = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor"); 195 | 196 | const int device = THCudaTensor_getDevice(state, weight); 197 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 198 | "weight and input need to be on the same device"); 199 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradInput) == device 200 | || THCudaTensor_getDevice(state, gradInput) == -1, 2, 201 | "weight and gradInput need to be on the same device"); 202 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device 203 | || THCudaTensor_getDevice(state, gradOutput) == -1, 2, 204 | "weight and gradOutput need to be on the same device"); 205 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 206 | "3D or 4D (batch mode) tensor is expected"); 207 | 208 | int batch = 1; 209 | if (input->nDimension == 3) { 210 | batch = 0; 211 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 212 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 213 | } 214 | 215 | long batchSize = input->size[0]; 216 | long inputHeight = input->size[2]; 217 | long inputWidth = input->size[3]; 218 | long outputHeight = inputHeight - weight->size[1] + 1; 219 | long outputWidth = inputWidth; 220 | 221 | THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); 222 | 223 | THCudaTensor *gradInput_n = THCudaTensor_new(state); 224 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 225 | 226 | for (int elt = 0; elt < batchSize; elt ++) { 227 | 228 | // select each batch in 2D 229 | THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); 230 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 231 | 232 | // convolve 233 | long num_threads = nInputPlane*inputHeight*inputWidth; 234 | conv_vertical_naive_gradInput <<>> 235 | (num_threads, 236 | THCudaTensor_data(state, gradInput_n), 237 | THCudaTensor_data(state, gradOutput_n), 238 | THCudaTensor_data(state, weight), 239 | outputHeight, outputWidth, weight->size[1]); 240 | } 241 | 242 | THCudaTensor_free(state, gradInput_n); 243 | THCudaTensor_free(state, gradOutput_n); 244 | 245 | // revert to single batch 246 | if (batch == 0) { 247 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 248 | THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); 249 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 250 | } 251 | 252 | return 1; 253 | } 254 | 255 | 256 | static int cunnconv1d_VerticalConvolution_accGradParameters(lua_State *L) { 257 | THCState *state = getCutorchState(L); 258 | THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor"); 259 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 260 | 261 | float scale = luaL_optnumber(L, 4, 1); 262 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 263 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 264 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 265 | 
266 | THCudaTensor *gradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", "torch.CudaTensor"); 267 | THCudaTensor *gradBias = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", "torch.CudaTensor"); 268 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 269 | THCudaTensor *finput = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "finput", "torch.CudaTensor"); 270 | THCudaTensor *fgradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "fgradWeight", "torch.CudaTensor"); 271 | 272 | const int device = THCudaTensor_getDevice(state, gradWeight); 273 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradBias) == device, 1, 274 | "gradWeight and gradBias need to be on the same device"); 275 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 1, 276 | "gradWeight and input need to be on the same device"); 277 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device, 1, 278 | "gradWeight and gradOutput need to be on the same device"); 279 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 280 | "3D or 4D (batch mode) tensor is expected"); 281 | 282 | // change to batch mode 283 | int batch = 1; 284 | if (input->nDimension == 3) { 285 | batch = 0; 286 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 287 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 288 | } 289 | 290 | long batchSize = input->size[0]; 291 | long inputHeight = input->size[2]; 292 | long inputWidth = input->size[3]; 293 | long outputHeight = inputHeight - kL + 1; 294 | long outputWidth = inputWidth; 295 | 296 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 297 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 298 | THCudaTensor_fill(state, ones, 1); 299 | } 300 | 301 | THCudaTensor_resize2d(state, finput, kL*nInputPlane, outputHeight*outputWidth); 302 | THCudaTensor_resize2d(state, fgradWeight, nOutputPlane, kL*nInputPlane); 303 | 304 | THCudaTensor *input_n = THCudaTensor_new(state); 305 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 306 | 307 | for (int elt = 0; elt < batchSize; elt ++) { 308 | 309 | // select each batch 310 | THCudaTensor_select(state, input_n, input, 0, elt); 311 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 312 | 313 | // unroll 314 | long num_threads = nInputPlane*outputHeight*outputWidth; 315 | im2col_kernel_v <<>> ( 316 | num_threads, 317 | THCudaTensor_data(state, input_n), 318 | inputHeight, inputWidth, kL, 1, 0, 0, 1, 1, 319 | outputHeight, outputWidth, 320 | THCudaTensor_data(state, finput) 321 | ); 322 | 323 | // convolve 324 | THCudaBlas_gemm( 325 | state, 't', 'n', 326 | kL*nInputPlane, nOutputPlane, outputHeight*outputWidth, 327 | scale, 328 | THCudaTensor_data(state, finput), outputHeight*outputWidth, 329 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 330 | (elt > 0), 331 | THCudaTensor_data(state, fgradWeight), kL*nInputPlane 332 | ); 333 | 334 | // fill biases 335 | THCudaBlas_gemv( 336 | state, 337 | 't', 338 | outputHeight*outputWidth, nOutputPlane, 339 | scale, 340 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 341 | THCudaTensor_data(state, ones), 1, 342 | 1, 343 | THCudaTensor_data(state, gradBias), 1 344 | ); 345 | } 346 | 347 | // extract gradWeight 348 | long num_threads_ = kL*nInputPlane; 349 | conv_vertical_naive_gradWeight <<>> ( 350 | 
num_threads_, 351 | THCudaTensor_data(state, gradWeight), 352 | THCudaTensor_data(state, fgradWeight), 353 | kL, nInputPlane 354 | ); 355 | 356 | THCudaTensor_free(state, input_n); 357 | THCudaTensor_free(state, gradOutput_n); 358 | 359 | if (batch == 0) { 360 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 361 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 362 | } 363 | 364 | return 0; 365 | } 366 | 367 | 368 | static const struct luaL_Reg cunnconv1d_VerticalConvolution__ [] = { 369 | {"VerticalConvolution_updateOutput", cunnconv1d_VerticalConvolution_updateOutput}, 370 | {"VerticalConvolution_updateGradInput", cunnconv1d_VerticalConvolution_updateGradInput}, 371 | {"VerticalConvolution_accGradParameters", cunnconv1d_VerticalConvolution_accGradParameters}, 372 | {NULL, NULL} 373 | }; 374 | 375 | 376 | void cunnconv1d_VerticalConvolution_init(lua_State *L) 377 | { 378 | luaT_pushmetatable(L, "torch.CudaTensor"); 379 | luaT_registeratname(L, cunnconv1d_VerticalConvolution__, "nn"); 380 | lua_pop(L,1); 381 | } 382 | -------------------------------------------------------------------------------- /cunn-conv1d/HorizontalConvolution.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "common.h" 3 | 4 | // Kernel for fast unfold+copy 5 | // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) 6 | __global__ void im2col_kernel_h(const int n, const float* data_im, 7 | const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, 8 | const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, 9 | float* data_col) { 10 | CUDA_KERNEL_LOOP(index, n) { 11 | int w_out = index % width_col; 12 | index /= width_col; 13 | int h_out = index % height_col; 14 | int channel_in = index / height_col; 15 | int channel_out = channel_in * ksize_h * ksize_w; 16 | int h_in = h_out * stride_h - pad_h; 17 | int w_in = w_out * stride_w - pad_w; 18 | data_col += (channel_out * height_col + h_out) * width_col + w_out; 19 | data_im += (channel_in * height + h_in) * width + w_in; 20 | for (int i = 0; i < ksize_h; ++i) { 21 | for (int j = 0; j < ksize_w; ++j) { 22 | int h = h_in + i; 23 | int w = w_in + j; 24 | *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
25 | data_im[i * width + j] : 0; 26 | data_col += height_col * width_col; 27 | } 28 | } 29 | } 30 | } 31 | 32 | 33 | __global__ void conv_horizontal_naive_output(const int n, float *y, 34 | const float *x, const float *w, 35 | const int iH, const int iW, const int kL) 36 | { 37 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 38 | int oW = iW - kL + 1; 39 | int x_offset = (i/oW)*iW + i%oW; 40 | int w_offset = (i/(oW*iH))*kL; 41 | 42 | for (int k = 0; k < kL; k++) { 43 | y[i] += w[w_offset + k]*x[x_offset + k]; 44 | } 45 | } 46 | } 47 | 48 | 49 | __global__ void conv_horizontal_naive_gradInput(const int n, float *dx, 50 | const float *dy, const float *w, 51 | const int oH, const int oW, const int kL) 52 | { 53 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 54 | int iW = oW + kL - 1; 55 | int col = i%iW; 56 | int dy_offset = (i/iW)*oW + i%iW; 57 | int w_offset = (i/(iW*oH))*kL; 58 | 59 | int k_begin = max(0, col-oW+1); 60 | int k_end = min(kL, col+1); 61 | 62 | dx[i] = 0.0f; 63 | for (int k = k_begin; k < k_end; k++) { 64 | dx[i] += w[w_offset + k]*dy[dy_offset - k]; 65 | } 66 | } 67 | } 68 | 69 | 70 | __global__ void conv_horizontal_naive_gradParam(const int n, float *dw, 71 | const float *x, const float *dy, 72 | const int kL, const int oH, const int oW) 73 | { 74 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 75 | int iW = oW + kL - 1; 76 | int dy_offset = (i/kL)*oH*oW; 77 | int x_offset = (i/kL)*oH*oW + i%kL; 78 | 79 | for (int j = 0; j < oH; j++) { 80 | for (int k = 0; k < oW; k++) { 81 | dw[i] += dy[dy_offset + j*oW + k]*x[x_offset + j*iW + k]; 82 | } 83 | } 84 | } 85 | } 86 | 87 | 88 | __global__ void conv_horizontal_naive_gradWeight(const int n, float *y, 89 | const float *x, const int kL, 90 | const int iC) 91 | { 92 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 93 | y[i] = x[(i/kL)*kL*iC + i]; 94 | } 95 | } 96 | 97 | 98 | static int cunnconv1d_HorizontalConvolution_updateOutput(lua_State *L) { 99 | THCState *state = getCutorchState(L); 100 | THCudaTensor *input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 101 | 102 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 103 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 104 | 105 | THCudaTensor *weight = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 106 | THCudaTensor *bias = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 107 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 108 | THCudaTensor *output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"); 109 | 110 | const int device = THCudaTensor_getDevice(state, weight); 111 | luaL_argcheck(L, THCudaTensor_getDevice(state, bias) == device, 1, 112 | "weight and bias need to be on the same device"); 113 | luaL_argcheck(L, THCudaTensor_getDevice(state, output) == device || 114 | THCudaTensor_getDevice(state, output) == -1, 1, 115 | "weight and output need to be on the same device"); 116 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 117 | "weight and input need to be on the same device"); 118 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 119 | "3D or 4D (batch mode) tensor is expected"); 120 | 121 | // change to batch mode 122 | int batch = 1; 123 | if (input->nDimension == 3) { 124 | luaL_argcheck(L, input->size[0] 
== nInputPlane, 2, "input channels and nInputPlane dont match");
125 |     batch = 0;
126 |     THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
127 |   } else {
128 |     luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
129 |   }
130 |
131 |   long batchSize = input->size[0];
132 |   long inputHeight = input->size[2];
133 |   long inputWidth = input->size[3];
134 |   long outputHeight = inputHeight;
135 |   long outputWidth = inputWidth - weight->size[1] + 1;
136 |
137 |   THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
138 |
139 |   if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
140 |     THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
141 |     THCudaTensor_fill(state, ones, 1);
142 |   }
143 |
144 |   THCudaTensor *input_n = THCudaTensor_new(state);
145 |   THCudaTensor *output_n = THCudaTensor_new(state);
146 |
147 |   for (int elt = 0; elt < batchSize; elt ++) {
148 |
149 |     // select each batch
150 |     THCudaTensor_select(state, input_n, input, 0, elt);
151 |     THCudaTensor_select(state, output_n, output, 0, elt);
152 |
153 |     // fill biases
154 |     THCudaBlas_gemm(
155 |       state, 't', 'n',
156 |       outputHeight*outputWidth, nOutputPlane, 1,
157 |       1,
158 |       THCudaTensor_data(state, ones), 1,
159 |       THCudaTensor_data(state, bias), 1,
160 |       0,
161 |       THCudaTensor_data(state, output_n), outputHeight*outputWidth
162 |     );
163 |
164 |     // convolve
165 |     long num_threads = nOutputPlane*outputHeight*outputWidth;
166 |     conv_horizontal_naive_output <<<GET_BLOCKS(num_threads), CUDA_NUM_THREADS>>>
167 |       (num_threads,
168 |        THCudaTensor_data(state, output_n),
169 |        THCudaTensor_data(state, input_n),
170 |        THCudaTensor_data(state, weight),
171 |        inputHeight, inputWidth, weight->size[1]);
172 |   }
173 |
174 |   THCudaTensor_free(state, input_n);
175 |   THCudaTensor_free(state, output_n);
176 |
177 |   // revert to single batch
178 |   if (batch == 0) {
179 |     THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
180 |     THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
181 |   }
182 |
183 |   return 1;
184 | }
185 |
186 |
187 | static int cunnconv1d_HorizontalConvolution_updateGradInput(lua_State *L) {
188 |   THCState *state = getCutorchState(L);
189 |   THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
190 |   THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
191 |
192 |   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
193 |   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
194 |
195 |   THCudaTensor *weight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor");
196 |   THCudaTensor *gradInput = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor");
197 |
198 |   const int device = THCudaTensor_getDevice(state, weight);
199 |   luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2,
200 |                 "weight and input need to be on the same device");
201 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradInput) == device
202 |                 || THCudaTensor_getDevice(state, gradInput) == -1, 2,
203 |                 "weight and gradInput need to be on the same device");
204 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device
205 |                 || THCudaTensor_getDevice(state, gradOutput) == -1, 2,
206 |                 "weight and gradOutput need to be on the same device");
207 |   luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2,
208 |                 "3D or 4D (batch mode) tensor is expected");
209 |
210 |   int batch = 1;
211 |   if (input->nDimension == 3) {
212 |     batch = 0;
213 |     THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
214 |     THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
215 |   }
216 |
217 |   long batchSize = input->size[0];
218 |   long inputHeight = input->size[2];
219 |   long inputWidth = input->size[3];
220 |   long outputHeight = inputHeight;
221 |   long outputWidth = inputWidth - weight->size[1] + 1;
222 |
223 |   THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
224 |
225 |   THCudaTensor *gradInput_n = THCudaTensor_new(state);
226 |   THCudaTensor *gradOutput_n = THCudaTensor_new(state);
227 |
228 |   for (int elt = 0; elt < batchSize; elt ++) {
229 |
230 |     // select each batch in 2D
231 |     THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
232 |     THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
233 |
234 |     // convolve
235 |     long num_threads = nInputPlane*inputHeight*inputWidth;
236 |     conv_horizontal_naive_gradInput <<<GET_BLOCKS(num_threads), CUDA_NUM_THREADS>>>
237 |       (num_threads,
238 |        THCudaTensor_data(state, gradInput_n),
239 |        THCudaTensor_data(state, gradOutput_n),
240 |        THCudaTensor_data(state, weight),
241 |        outputHeight, outputWidth, weight->size[1]);
242 |   }
243 |
244 |   THCudaTensor_free(state, gradInput_n);
245 |   THCudaTensor_free(state, gradOutput_n);
246 |
247 |   // revert to single batch
248 |   if (batch == 0) {
249 |     THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
250 |     THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
251 |     THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
252 |   }
253 |
254 |   return 1;
255 | }
256 |
257 |
258 | static int cunnconv1d_HorizontalConvolution_accGradParameters(lua_State *L) {
259 |   THCState *state = getCutorchState(L);
260 |   THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
261 |   THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
262 |
263 |   float scale = luaL_optnumber(L, 4, 1);
264 |   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
265 |   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
266 |   int kL = luaT_getfieldcheckint(L, 1, "kL");
267 |
268 |   THCudaTensor *gradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", "torch.CudaTensor");
269 |   THCudaTensor *gradBias = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", "torch.CudaTensor");
270 |   THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor");
271 |   THCudaTensor *finput = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "finput", "torch.CudaTensor");
272 |   THCudaTensor *fgradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "fgradWeight", "torch.CudaTensor");
273 |
274 |   const int device = THCudaTensor_getDevice(state, gradWeight);
275 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradBias) == device, 1,
276 |                 "gradWeight and gradBias need to be on the same device");
277 |   luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 1,
278 |                 "gradWeight and input need to be on the same device");
279 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device, 1,
280 |                 "gradWeight and gradOutput need to be on the same device");
281 |   luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2,
282 |                 "3D or 4D (batch mode) tensor is expected");
283 |
284 |   // change to batch mode
285 |   int batch = 1;
286 |   if (input->nDimension == 3) {
287 |     batch = 0;
288 |     THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
289 |     THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
290 |   }
291 |
292 |   long batchSize = input->size[0];
293 |   long inputHeight = input->size[2];
294 |   long inputWidth = input->size[3];
295 |   long outputHeight = inputHeight;
296 |   long outputWidth = inputWidth - kL + 1;
297 |
298 |   if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
299 |     THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
300 |     THCudaTensor_fill(state, ones, 1);
301 |   }
302 |
303 |   THCudaTensor_resize2d(state, finput, kL*nInputPlane, outputHeight*outputWidth);
304 |   THCudaTensor_resize2d(state, fgradWeight, nOutputPlane, kL*nInputPlane);
305 |
306 |   THCudaTensor *input_n = THCudaTensor_new(state);
307 |   THCudaTensor *gradOutput_n = THCudaTensor_new(state);
308 |
309 |   for (int elt = 0; elt < batchSize; elt ++) {
310 |
311 |     // select each batch
312 |     THCudaTensor_select(state, input_n, input, 0, elt);
313 |     THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
314 |
315 |     // unroll
316 |     long num_threads = nInputPlane*outputHeight*outputWidth;
317 |     im2col_kernel_h <<<GET_BLOCKS(num_threads), CUDA_NUM_THREADS>>> (
318 |       num_threads,
319 |       THCudaTensor_data(state, input_n),
320 |       inputHeight, inputWidth, 1, kL, 0, 0, 1, 1,
321 |       outputHeight, outputWidth,
322 |       THCudaTensor_data(state, finput)
323 |     );
324 |
325 |     // convolve
326 |     THCudaBlas_gemm(
327 |       state, 't', 'n',
328 |       kL*nInputPlane, nOutputPlane, outputHeight*outputWidth,
329 |       scale,
330 |       THCudaTensor_data(state, finput), outputHeight*outputWidth,
331 |       THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth,
332 |       (elt > 0),  // beta: overwrite on the first batch element, accumulate afterwards
333 |       THCudaTensor_data(state, fgradWeight), kL*nInputPlane
334 |     );
335 |
336 |     // fill biases
337 |     THCudaBlas_gemv(
338 |       state,
339 |       't',
340 |       outputHeight*outputWidth, nOutputPlane,
341 |       scale,
342 |       THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth,
343 |       THCudaTensor_data(state, ones), 1,
344 |       1,
345 |       THCudaTensor_data(state, gradBias), 1
346 |     );
347 |   }
348 |
349 |   // extract gradWeight
350 |   long num_threads_ = kL*nInputPlane;
351 |   conv_horizontal_naive_gradWeight <<<GET_BLOCKS(num_threads_), CUDA_NUM_THREADS>>> (
352 |     num_threads_,
353 |     THCudaTensor_data(state, gradWeight),
354 |     THCudaTensor_data(state, fgradWeight),
355 |     kL, nInputPlane
356 |   );
357 |
358 |   THCudaTensor_free(state, input_n);
359 |   THCudaTensor_free(state, gradOutput_n);
360 |
361 |   if (batch == 0) {
362 |     THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
363 |     THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
364 |   }
365 |
366 |   return 0;
367 | }
368 |
369 |
370 | static const struct luaL_Reg cunnconv1d_HorizontalConvolution__ [] = {
371 |   {"HorizontalConvolution_updateOutput", cunnconv1d_HorizontalConvolution_updateOutput},
372 |   {"HorizontalConvolution_updateGradInput", cunnconv1d_HorizontalConvolution_updateGradInput},
373 |   {"HorizontalConvolution_accGradParameters", cunnconv1d_HorizontalConvolution_accGradParameters},
374 |   {NULL, NULL}
375 | };
376 |
377 |
378 | void cunnconv1d_HorizontalConvolution_init(lua_State *L)
379 | {
380 |   luaT_pushmetatable(L, "torch.CudaTensor");
381 |   luaT_registeratname(L, cunnconv1d_HorizontalConvolution__, "nn");
382 |   lua_pop(L,1);
383 | }
384 |
--------------------------------------------------------------------------------
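All three CUDA bindings register their functions on the torch.CudaTensor metatable under the "nn" namespace, so the same Lua modules dispatch to these kernels once the network and its inputs live on the GPU. A hedged end-to-end sketch follows; the package name comes from the rockspec, and the module signatures are the same assumptions as in the CPU sketch earlier:

    require('cunnconv1d')  -- loads the libcunnconv1d bindings registered above

    local net = nn.Sequential()
    net:add(nn.LateralConvolution(8, 8))
    net:add(nn.VerticalConvolution(8, 8, 3))
    net:add(nn.HorizontalConvolution(8, 8, 3))
    net:cuda()  -- move weights, biases, and work buffers to the GPU

    local x = torch.CudaTensor(16, 8, 24, 24):normal()  -- batch mode: BxCxHxW
    local y = net:forward(x)                            -- 16 x 8 x 22 x 22
    net:zeroGradParameters()
    local dx = net:backward(x, y:clone():fill(1))       -- exercises updateGradInput and accGradParameters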