├── cunn-conv1d
│   ├── init.lua
│   ├── utils.h
│   ├── utils.c
│   ├── .gitignore
│   ├── init.cu
│   ├── common.h
│   ├── rocks
│   │   └── cunnconv1d-scm-1.rockspec
│   ├── CMakeLists.txt
│   ├── LateralConvolution.cu
│   ├── VerticalConvolution.cu
│   └── HorizontalConvolution.cu
├── nn-conv1d
│   ├── init.lua
│   ├── .gitignore
│   ├── rocks
│   │   └── nnconv1d-scm-1.rockspec
│   ├── init.c
│   ├── CMakeLists.txt
│   ├── LateralConvolution.lua
│   ├── VerticalConvolution.lua
│   ├── HorizontalConvolution.lua
│   └── generic
│       ├── LateralConvolution.c
│       ├── VerticalConvolution.c
│       └── HorizontalConvolution.c
├── nnconv1d-scm-1.rockspec
├── cunnconv1d-scm-1.rockspec
├── LICENSE
├── README.md
└── example.lua
/cunn-conv1d/init.lua: -------------------------------------------------------------------------------- 1 | require('cutorch') 2 | require('nn') 3 | require('nnconv1d') 4 | require('cunn') 5 | require('libcunnconv1d') 6 | 7 | return cunn 8 | -------------------------------------------------------------------------------- /cunn-conv1d/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef CUNN_UTILS_H 2 | #define CUNN_UTILS_H 3 | 4 | #include <lua.h> 5 | #include "THCGeneral.h" 6 | 7 | THCState* getCutorchState(lua_State* L); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /nn-conv1d/init.lua: -------------------------------------------------------------------------------- 1 | require('torch') 2 | require('nn') 3 | require('libnnconv1d') 4 | 5 | include('LateralConvolution.lua') 6 | include('HorizontalConvolution.lua') 7 | include('VerticalConvolution.lua') 8 | 9 | return nn 10 | -------------------------------------------------------------------------------- /cunn-conv1d/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | THCState* getCutorchState(lua_State* L) 4 | { 5 | lua_getglobal(L, "cutorch"); 6 | lua_getfield(L, -1, "getState"); 7 | lua_call(L, 0, 1); 8 | THCState *state = (THCState*) lua_touserdata(L, -1); 9 | lua_pop(L, 2); 10 | return state; 11 | } 12 | -------------------------------------------------------------------------------- /cunn-conv1d/.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | 34 | # Mac attribute 35 | .DS_Store* 36 | -------------------------------------------------------------------------------- /nn-conv1d/.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. 
Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Debug files 32 | *.dSYM/ 33 | 34 | # Mac attribute 35 | .DS_Store* 36 | -------------------------------------------------------------------------------- /cunn-conv1d/init.cu: -------------------------------------------------------------------------------- 1 | #include "luaT.h" 2 | #include "THC.h" 3 | 4 | #include "utils.c" 5 | 6 | #include "LateralConvolution.cu" 7 | #include "VerticalConvolution.cu" 8 | #include "HorizontalConvolution.cu" 9 | 10 | LUA_EXTERNC DLL_EXPORT int luaopen_libcunnconv1d(lua_State *L); 11 | 12 | int luaopen_libcunnconv1d(lua_State *L) 13 | { 14 | lua_newtable(L); 15 | 16 | cunnconv1d_LateralConvolution_init(L); 17 | cunnconv1d_VerticalConvolution_init(L); 18 | cunnconv1d_HorizontalConvolution_init(L); 19 | 20 | return 1; 21 | } 22 | -------------------------------------------------------------------------------- /cunn-conv1d/common.h: -------------------------------------------------------------------------------- 1 | #ifndef CUNN_COMMON_H 2 | #define CUNN_COMMON_H 3 | 4 | // CUDA: grid stride looping 5 | #define CUDA_KERNEL_LOOP(i, n) \ 6 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 7 | i < (n); \ 8 | i += blockDim.x * gridDim.x) 9 | 10 | // Use 1024 threads per block, which requires cuda sm_2x or above 11 | const int CUDA_NUM_THREADS = 1024; 12 | 13 | // CUDA: number of blocks for threads. 14 | inline int GET_BLOCKS(const int N) { 15 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 16 | } 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /nn-conv1d/rocks/nnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "nnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch nn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | } 19 | 20 | build = { 21 | type = "command", 22 | build_command = [[ 23 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 24 | ]], 25 | install_command = "cd build && $(MAKE) install" 26 | } 27 | -------------------------------------------------------------------------------- /nnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "nnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch nn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | } 19 | 20 | build = { 21 | type = "command", 22 | build_command = [[ 23 | cd nn-conv1d && cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) 24 | ]], 25 | install_command = "cd nn-conv1d && cd build && $(MAKE) install" 26 | } 27 | -------------------------------------------------------------------------------- /cunn-conv1d/rocks/cunnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "cunnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch cunn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | "nn >= 1.0", 19 | "cutorch >= 1.0" 20 | } 21 | 22 | build = { 23 | type = "command", 24 | build_command = [[ 25 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install 26 | ]], 27 | install_command = "cd build" 28 | } 29 | -------------------------------------------------------------------------------- /cunnconv1d-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "cunnconv1d" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/jhjin/flattened-cnn.git", 6 | } 7 | 8 | description = { 9 | summary = "1D Convolutions for Torch cunn", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/jhjin/flattened-cnn", 13 | license = "MIT" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | "nn >= 1.0", 19 | "cutorch >= 1.0" 20 | } 21 | 22 | build = { 23 | type = "command", 24 | build_command = [[ 25 | cd cunn-conv1d && cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." 
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install 26 | ]], 27 | install_command = "cd cunn-conv1d/build" 28 | } 29 | -------------------------------------------------------------------------------- /nn-conv1d/init.c: -------------------------------------------------------------------------------- 1 | #include "TH.h" 2 | #include "luaT.h" 3 | 4 | #define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME) 5 | #define torch_Tensor TH_CONCAT_STRING_3(torch.,Real,Tensor) 6 | #define nnconv1d_(NAME) TH_CONCAT_3(nnconv1d_, Real, NAME) 7 | 8 | #include "generic/LateralConvolution.c" 9 | #include "THGenerateFloatTypes.h" 10 | 11 | #include "generic/VerticalConvolution.c" 12 | #include "THGenerateFloatTypes.h" 13 | 14 | #include "generic/HorizontalConvolution.c" 15 | #include "THGenerateFloatTypes.h" 16 | 17 | LUA_EXTERNC DLL_EXPORT int luaopen_libnnconv1d(lua_State *L); 18 | 19 | int luaopen_libnnconv1d(lua_State *L) 20 | { 21 | lua_newtable(L); 22 | 23 | nnconv1d_FloatLateralConvolution_init(L); 24 | nnconv1d_FloatVerticalConvolution_init(L); 25 | nnconv1d_FloatHorizontalConvolution_init(L); 26 | 27 | nnconv1d_DoubleLateralConvolution_init(L); 28 | nnconv1d_DoubleVerticalConvolution_init(L); 29 | nnconv1d_DoubleHorizontalConvolution_init(L); 30 | 31 | return 1; 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Jonghoon Jin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Flattened convolutional neural networks 2 | 3 | This package provides the 1D convolution modules (over channels, in the vertical direction, and in the horizontal direction) used in 4 | [Flattened Convolutional Neural Networks for Feedforward Acceleration](http://arxiv.org/abs/1412.5474), 5 | where the flattened convolution layer is expressed as a sequence of one-dimensional filters across all three directions of a 3D filter. 6 | 7 | 8 | ### Install 9 | 10 | Install either or both of the `nn` and `cunn` backend packages, depending on your computing environment. 11 | 12 | ```bash 13 | luarocks install https://raw.githubusercontent.com/jhjin/flattened-cnn/master/nnconv1d-scm-1.rockspec # cpu 14 | luarocks install https://raw.githubusercontent.com/jhjin/flattened-cnn/master/cunnconv1d-scm-1.rockspec # cuda 15 | ``` 16 | 17 | or use these commands if you have already cloned this repo. 18 | 19 | ```bash 20 | cd nn-conv1d 21 | luarocks make rocks/nnconv1d-scm-1.rockspec 22 | cd ../cunn-conv1d 23 | luarocks make rocks/cunnconv1d-scm-1.rockspec 24 | ``` 25 | 26 | 27 | ### Available modules 28 | 29 | The following modules are available. 30 | 31 | ```lua 32 | nn.LateralConvolution(nInputPlane, nOutputPlane) -- 1d conv over feature planes 33 | nn.HorizontalConvolution(nInputPlane, nOutputPlane, kL) -- 1d conv in horizontal 34 | nn.VerticalConvolution(nInputPlane, nOutputPlane, kL) -- 1d conv in vertical 35 | ``` 
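Chained together, the three modules play the role of a single 2D convolution whose filters are rank-1, which is what `example.lua` verifies. Below is a minimal sketch of that composition; the plane counts and kernel sizes are arbitrary, chosen only for illustration:

```lua
require('nnconv1d')

-- flattened stand-in for a full nn.SpatialConvolutionMM(4, 5, 3, 3)
local net = nn.Sequential()
net:add(nn.LateralConvolution(4, 5))         -- mix 4 input planes into 5
net:add(nn.VerticalConvolution(5, 5, 3))     -- 3-tap filter down each column
net:add(nn.HorizontalConvolution(5, 5, 3))   -- 3-tap filter along each row
print(net:forward(torch.randn(4, 8, 8)):size())  -- 5x6x6, as with a full 3x3 conv
```

Note that `VerticalConvolution` and `HorizontalConvolution` require `nInputPlane == nOutputPlane`, so the number of planes can only be changed by `LateralConvolution`.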
36 | 37 | 38 | ### Example 39 | 40 | Run the command below. 41 | 42 | ```bash 43 | th example.lua 44 | ``` 45 | -------------------------------------------------------------------------------- /cunn-conv1d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR) 2 | CMAKE_POLICY(VERSION 2.8) 3 | 4 | SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}") 5 | 6 | FIND_PACKAGE(Torch REQUIRED) 7 | FIND_PACKAGE(JPEG REQUIRED) 8 | 9 | if (JPEG_FOUND) 10 | include_directories (${JPEG_INCLUDE_DIR}) 11 | endif (JPEG_FOUND) 12 | 13 | IF(APPLE) 14 | # work around for mac os x bug: 15 | # http://stackoverflow.com/questions/16286588/cuda-5-0-cmake-and-make-failing-on-osx-10-8-3 16 | if (NOT DEFINED CUDA_HOST_COMPILER AND CMAKE_C_COMPILER_ID STREQUAL "Clang" AND EXISTS /usr/bin/gcc) 17 | set(CUDA_HOST_COMPILER /usr/bin/gcc CACHE FILEPATH "Host side compiler used by NVCC") 18 | message(STATUS "Setting CUDA_HOST_COMPILER to /usr/bin/gcc instead of ${CMAKE_C_COMPILER}.") 19 | endif() 20 | ENDIF() 21 | 22 | FIND_PACKAGE(CUDA 4.0 REQUIRED) 23 | 24 | LIST(APPEND CUDA_NVCC_FLAGS "-arch=sm_20") 25 | 26 | INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC") 27 | LINK_DIRECTORIES("${Torch_INSTALL_LIB}") 28 | 29 | SET(src-cuda init.cu) 30 | 31 | FILE(GLOB luasrc *.lua) 32 | 33 | CUDA_ADD_LIBRARY(cunnconv1d MODULE ${src-cuda}) 34 | TARGET_LINK_LIBRARIES(cunnconv1d luaT THC TH ${JPEG_LIBRARIES}) 35 | IF(APPLE) 36 | SET_TARGET_PROPERTIES(cunnconv1d PROPERTIES 37 | LINK_FLAGS "-undefined dynamic_lookup") 38 | ENDIF() 39 | 40 | ### Torch packages assume the library prefix is "lib" 41 | SET_TARGET_PROPERTIES(cunnconv1d PROPERTIES 42 | PREFIX "lib" 43 | IMPORT_PREFIX "lib") 44 | 45 | INSTALL(TARGETS cunnconv1d 46 | RUNTIME DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}" 47 | LIBRARY DESTINATION "${Torch_INSTALL_LUA_CPATH_SUBDIR}") 48 | 49 | INSTALL( 50 | FILES 51 | ${luasrc} 52 | DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/cunnconv1d") 53 | -------------------------------------------------------------------------------- /nn-conv1d/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) 2 | CMAKE_POLICY(VERSION 2.6) 3 | 4 | FIND_PACKAGE(Torch REQUIRED) 5 | FIND_PACKAGE(JPEG REQUIRED) 6 | 7 | 8 | # Flags 9 | # When using MSVC 10 | IF(MSVC) 11 | # we want to respect the standard, and we are bored of those **** . 12 | ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1) 13 | ENDIF(MSVC) 14 | 15 | # OpenMP support? 
16 | SET(WITH_OPENMP ON CACHE BOOL "OpenMP support if available?") 17 | IF (APPLE AND CMAKE_COMPILER_IS_GNUCC) 18 | EXEC_PROGRAM (uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 19 | STRING (REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 20 | MESSAGE (STATUS "MAC OS Darwin Version: ${DARWIN_VERSION}") 21 | IF (DARWIN_VERSION GREATER 9) 22 | SET(APPLE_OPENMP_SUCKS 1) 23 | ENDIF (DARWIN_VERSION GREATER 9) 24 | EXECUTE_PROCESS (COMMAND ${CMAKE_C_COMPILER} -dumpversion 25 | OUTPUT_VARIABLE GCC_VERSION) 26 | IF (APPLE_OPENMP_SUCKS AND GCC_VERSION VERSION_LESS 4.6.2) 27 | MESSAGE(STATUS "Warning: Disabling OpenMP (unstable with this version of GCC)") 28 | MESSAGE(STATUS " Install GCC >= 4.6.2 or change your OS to enable OpenMP") 29 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-unknown-pragmas") 30 | SET(WITH_OPENMP OFF CACHE BOOL "OpenMP support if available?" FORCE) 31 | ENDIF () 32 | ENDIF () 33 | 34 | IF (WITH_OPENMP) 35 | FIND_PACKAGE(OpenMP) 36 | IF(OPENMP_FOUND) 37 | MESSAGE(STATUS "Compiling with OpenMP support") 38 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 39 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 40 | SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") 41 | ENDIF(OPENMP_FOUND) 42 | ENDIF (WITH_OPENMP) 43 | 44 | LINK_DIRECTORIES("${Torch_INSTALL_LIB}") 45 | 46 | SET(src init.c) 47 | 48 | FILE(GLOB luasrc *.lua) 49 | 50 | ADD_TORCH_PACKAGE(nnconv1d "${src}" "${luasrc}") 51 | 52 | TARGET_LINK_LIBRARIES(nnconv1d luaT TH) 53 | -------------------------------------------------------------------------------- /nn-conv1d/LateralConvolution.lua: -------------------------------------------------------------------------------- 1 | local LateralConvolution, parent = torch.class('nn.LateralConvolution', 'nn.Module') 2 | 3 | function LateralConvolution:__init(nInputPlane, nOutputPlane) 4 | parent.__init(self) 5 | 6 | self.nInputPlane = nInputPlane 7 | self.nOutputPlane = nOutputPlane 8 | 9 | self.weight = torch.Tensor(nOutputPlane, nInputPlane) 10 | self.bias = torch.Tensor(nOutputPlane) 11 | self.gradWeight = torch.Tensor(nOutputPlane, nInputPlane) 12 | self.gradBias = torch.Tensor(nOutputPlane) 13 | 14 | self.ones = torch.Tensor() 15 | 16 | self:reset() 17 | end 18 | 19 | function LateralConvolution:reset(stdv) 20 | if stdv then 21 | stdv = stdv * math.sqrt(3) 22 | else 23 | stdv = 1/math.sqrt(self.nInputPlane) 24 | end 25 | if nn.oldSeed then 26 | self.weight:apply(function() 27 | return torch.uniform(-stdv, stdv) 28 | end) 29 | self.bias:apply(function() 30 | return torch.uniform(-stdv, stdv) 31 | end) 32 | else 33 | self.weight:uniform(-stdv, stdv) 34 | self.bias:uniform(-stdv, stdv) 35 | end 36 | end 37 | 38 | local function makeContiguous(self, input, gradOutput) 39 | if not input:isContiguous() then 40 | self._input = self._input or input.new() 41 | self._input:resizeAs(input):copy(input) 42 | input = self._input 43 | end 44 | if gradOutput then 45 | if not gradOutput:isContiguous() then 46 | self._gradOutput = self._gradOutput or gradOutput.new() 47 | self._gradOutput:resizeAs(gradOutput):copy(gradOutput) 48 | gradOutput = self._gradOutput 49 | end 50 | end 51 | return input, gradOutput 52 | end 53 | 54 | function LateralConvolution:updateOutput(input) 55 | input = makeContiguous(self, input) 56 | return input.nn.LateralConvolution_updateOutput(self, input) 57 | end 58 | 59 | function LateralConvolution:updateGradInput(input, gradOutput) 60 | if self.gradInput then 61 | input, gradOutput = makeContiguous(self, input, 
gradOutput) 62 | return input.nn.LateralConvolution_updateGradInput(self, input, gradOutput) 63 | end 64 | end 65 | 66 | function LateralConvolution:accGradParameters(input, gradOutput, scale) 67 | input, gradOutput = makeContiguous(self, input, gradOutput) 68 | return input.nn.LateralConvolution_accGradParameters(self, input, gradOutput, scale) 69 | end 70 | -------------------------------------------------------------------------------- /nn-conv1d/VerticalConvolution.lua: -------------------------------------------------------------------------------- 1 | local VerticalConvolution, parent = torch.class('nn.VerticalConvolution', 'nn.Module') 2 | 3 | function VerticalConvolution:__init(nInputPlane, nOutputPlane, kL) 4 | parent.__init(self) 5 | 6 | assert(nInputPlane == nOutputPlane) 7 | self.nInputPlane = nInputPlane 8 | self.nOutputPlane = nOutputPlane 9 | 10 | self.kL = kL 11 | 12 | self.weight = torch.Tensor(nInputPlane, kL) 13 | self.bias = torch.Tensor(nOutputPlane) 14 | self.gradWeight = torch.Tensor(nInputPlane, kL) 15 | self.gradBias = torch.Tensor(nOutputPlane) 16 | 17 | self.ones = torch.Tensor() 18 | self.finput = torch.Tensor() 19 | self.fgradWeight = torch.Tensor() 20 | 21 | self:reset() 22 | end 23 | 24 | function VerticalConvolution:reset(stdv) 25 | if stdv then 26 | stdv = stdv * math.sqrt(3) 27 | else 28 | stdv = 1/math.sqrt(self.nInputPlane) 29 | end 30 | if nn.oldSeed then 31 | self.weight:apply(function() 32 | return torch.uniform(-stdv, stdv) 33 | end) 34 | self.bias:apply(function() 35 | return torch.uniform(-stdv, stdv) 36 | end) 37 | else 38 | self.weight:uniform(-stdv, stdv) 39 | self.bias:uniform(-stdv, stdv) 40 | end 41 | end 42 | 43 | local function makeContiguous(self, input, gradOutput) 44 | if not input:isContiguous() then 45 | self._input = self._input or input.new() 46 | self._input:resizeAs(input):copy(input) 47 | input = self._input 48 | end 49 | if gradOutput then 50 | if not gradOutput:isContiguous() then 51 | self._gradOutput = self._gradOutput or gradOutput.new() 52 | self._gradOutput:resizeAs(gradOutput):copy(gradOutput) 53 | gradOutput = self._gradOutput 54 | end 55 | end 56 | return input, gradOutput 57 | end 58 | 59 | function VerticalConvolution:updateOutput(input) 60 | input = makeContiguous(self, input) 61 | return input.nn.VerticalConvolution_updateOutput(self, input) 62 | end 63 | 64 | function VerticalConvolution:updateGradInput(input, gradOutput) 65 | if self.gradInput then 66 | input, gradOutput = makeContiguous(self, input, gradOutput) 67 | return input.nn.VerticalConvolution_updateGradInput(self, input, gradOutput) 68 | end 69 | end 70 | 71 | function VerticalConvolution:accGradParameters(input, gradOutput, scale) 72 | input, gradOutput = makeContiguous(self, input, gradOutput) 73 | return input.nn.VerticalConvolution_accGradParameters(self, input, gradOutput, scale) 74 | end 75 | -------------------------------------------------------------------------------- /nn-conv1d/HorizontalConvolution.lua: -------------------------------------------------------------------------------- 1 | local HorizontalConvolution, parent = torch.class('nn.HorizontalConvolution', 'nn.Module') 2 | 3 | function HorizontalConvolution:__init(nInputPlane, nOutputPlane, kL) 4 | parent.__init(self) 5 | 6 | assert(nInputPlane == nOutputPlane) 7 | self.nInputPlane = nInputPlane 8 | self.nOutputPlane = nOutputPlane 9 | 10 | self.kL = kL 11 | 12 | self.weight = torch.Tensor(nInputPlane, kL) 13 | self.bias = torch.Tensor(nOutputPlane) 14 | self.gradWeight = 
torch.Tensor(nInputPlane, kL) 15 | self.gradBias = torch.Tensor(nOutputPlane) 16 | 17 | self.ones = torch.Tensor() 18 | self.finput = torch.Tensor() 19 | self.fgradWeight = torch.Tensor() 20 | 21 | self:reset() 22 | end 23 | 24 | function HorizontalConvolution:reset(stdv) 25 | if stdv then 26 | stdv = stdv * math.sqrt(3) 27 | else 28 | stdv = 1/math.sqrt(self.nInputPlane) 29 | end 30 | if nn.oldSeed then 31 | self.weight:apply(function() 32 | return torch.uniform(-stdv, stdv) 33 | end) 34 | self.bias:apply(function() 35 | return torch.uniform(-stdv, stdv) 36 | end) 37 | else 38 | self.weight:uniform(-stdv, stdv) 39 | self.bias:uniform(-stdv, stdv) 40 | end 41 | end 42 | 43 | local function makeContiguous(self, input, gradOutput) 44 | if not input:isContiguous() then 45 | self._input = self._input or input.new() 46 | self._input:resizeAs(input):copy(input) 47 | input = self._input 48 | end 49 | if gradOutput then 50 | if not gradOutput:isContiguous() then 51 | self._gradOutput = self._gradOutput or gradOutput.new() 52 | self._gradOutput:resizeAs(gradOutput):copy(gradOutput) 53 | gradOutput = self._gradOutput 54 | end 55 | end 56 | return input, gradOutput 57 | end 58 | 59 | function HorizontalConvolution:updateOutput(input) 60 | input = makeContiguous(self, input) 61 | return input.nn.HorizontalConvolution_updateOutput(self, input) 62 | end 63 | 64 | function HorizontalConvolution:updateGradInput(input, gradOutput) 65 | if self.gradInput then 66 | input, gradOutput = makeContiguous(self, input, gradOutput) 67 | return input.nn.HorizontalConvolution_updateGradInput(self, input, gradOutput) 68 | end 69 | end 70 | 71 | function HorizontalConvolution:accGradParameters(input, gradOutput, scale) 72 | input, gradOutput = makeContiguous(self, input, gradOutput) 73 | return input.nn.HorizontalConvolution_accGradParameters(self, input, gradOutput, scale) 74 | end 75 | -------------------------------------------------------------------------------- /example.lua: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env th 2 | -- 3 | -- Rank-1 3D filter decomposition test 4 | -- 5 | require('torch') 6 | require('nnconv1d') 7 | torch.setdefaulttensortype('torch.FloatTensor') 8 | 9 | 10 | local function check_error(msg, a, b) 11 | local diff = torch.add(a, -b):abs() 12 | print('==> '..msg..' 
error (max/mean): ', diff:max(), diff:mean()) 13 | end 14 | 15 | local function compose_filter(z, y, x) 16 | local zyx = torch.Tensor(z:size(1), z:size(2), y:size(2)*x:size(2)) 17 | for i = 1, z:size(1) do 18 | local yx = torch.ger(y[i], x[i]) 19 | for j = 1, z:size(2) do 20 | zyx[i][j]:copy(yx):mul(z[i][j]) 21 | end 22 | end 23 | return zyx 24 | end 25 | 26 | 27 | -- set parameters 28 | local batch = 3 29 | local nInputPlanes = 4 30 | local nOutputPlanes = 5 31 | local iH = 5 32 | local iW = 5 33 | local kW = 3 34 | local kH = 3 35 | local use_cuda = false 36 | 37 | 38 | -- pick an input 39 | local input = torch.randn(batch, nInputPlanes, iH, iW) 40 | 41 | -- get rank-1 filters 42 | local z = torch.randn(nOutputPlanes, nInputPlanes) -- over feature 43 | local y = torch.randn(nOutputPlanes, kH) -- in vertical 44 | local x = torch.randn(nOutputPlanes, kW) -- in horizontal 45 | local b = torch.randn(nOutputPlanes) -- bias 46 | 47 | -- reconstruct 3d filter 48 | local zyx = compose_filter(z, y, x) 49 | 50 | 51 | -- define models 52 | local model_full = nn.Sequential() 53 | model_full:add(nn.SpatialConvolutionMM(nInputPlanes, nOutputPlanes, kW, kH)) 54 | 55 | local model_low = nn.Sequential() 56 | model_low:add(nn.LateralConvolution(nInputPlanes, nOutputPlanes)) 57 | model_low:add(nn.VerticalConvolution(nOutputPlanes, nOutputPlanes, kH)) 58 | model_low:add(nn.HorizontalConvolution(nOutputPlanes, nOutputPlanes, kW)) 59 | 60 | 61 | -- overwrite parameters 62 | model_full.modules[1].weight:copy(zyx) 63 | model_full.modules[1].bias:copy(b) 64 | 65 | model_low.modules[1].weight:copy(z) 66 | model_low.modules[2].weight:copy(y) 67 | model_low.modules[3].weight:copy(x) 68 | model_low.modules[1].bias:zero() 69 | model_low.modules[2].bias:zero() 70 | model_low.modules[3].bias:copy(b) 71 | 72 | 73 | -- enable GPU 74 | if use_cuda then 75 | require('cunnconv1d') 76 | model_full = model_full:cuda() 77 | model_low = model_low:cuda() 78 | input = input:cuda() 79 | end 80 | 81 | 82 | -- test 83 | local output_full = model_full:updateOutput(input) 84 | local output_low = model_low:updateOutput(input) 85 | check_error('output ', output_full, output_low) 86 | 87 | local gradOutput_full = output_full:clone():add(0.1) 88 | local gradOutput_low = output_low:clone():add(0.1) 89 | local gradInput_full = model_full:updateGradInput(input, gradOutput_full) 90 | local gradInput_low = model_low:updateGradInput(input, gradOutput_low) 91 | check_error('gradInput', gradInput_full, gradInput_low) 92 | 93 | model_full:zeroGradParameters() 94 | model_low:zeroGradParameters() 95 | model_full:accGradParameters(input, gradOutput_full, 1) 96 | model_low:accGradParameters(input, gradOutput_low, 1) 97 | local w_full, dw_full = model_full:getParameters() 98 | local w_low, dw_low = model_low:getParameters() 99 | -------------------------------------------------------------------------------- /nn-conv1d/generic/LateralConvolution.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/LateralConvolution.c" 3 | #else 4 | 5 | 6 | static int nnconv1d_(LateralConvolution_updateOutput)(lua_State *L) 7 | { 8 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 9 | 10 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 11 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 12 | 13 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 14 | THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", 
torch_Tensor); 15 | THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); 16 | 17 | luaL_argcheck(L, input->nDimension == 3 || 18 | input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); 19 | 20 | // change to batch mode 21 | int batch = 1; 22 | if (input->nDimension == 3) { 23 | batch = 0; 24 | THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); 25 | } 26 | 27 | long batchSize = input->size[0]; 28 | long inputHeight = input->size[2]; 29 | long inputWidth = input->size[3]; 30 | long outputHeight = inputHeight; 31 | long outputWidth = inputWidth; 32 | 33 | THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); 34 | 35 | int elt; 36 | #pragma omp parallel for private(elt) 37 | for (elt = 0; elt < batchSize; elt++) { 38 | 39 | // select each batch in 2D 40 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 41 | THTensor *output_t = THTensor_(newSelect)(output, 0, elt); 42 | THTensor *input2d = THTensor_(newWithStorage2d) 43 | (input_t->storage, input_t->storageOffset, 44 | nInputPlane, -1, inputHeight*inputWidth, -1); 45 | THTensor *output2d = THTensor_(newWithStorage2d) 46 | (output_t->storage, output_t->storageOffset, 47 | nOutputPlane, -1, outputHeight*outputWidth, -1); 48 | 49 | // fill biases 50 | int i; 51 | for (i = 0; i < nOutputPlane; i++) 52 | THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, 53 | THTensor_(get1d)(bias, i), outputHeight*outputWidth); 54 | 55 | // convolve 56 | THTensor_(addmm)(output2d, 1, output2d, 1, weight, input2d); 57 | 58 | // release temp tensors 59 | THTensor_(free)(input2d); 60 | THTensor_(free)(output2d); 61 | THTensor_(free)(input_t); 62 | THTensor_(free)(output_t); 63 | } 64 | 65 | // revert to single batch 66 | if (batch == 0) { 67 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 68 | THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); 69 | } 70 | 71 | return 1; 72 | } 73 | 74 | 75 | static int nnconv1d_(LateralConvolution_updateGradInput)(lua_State *L) 76 | { 77 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 78 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 79 | 80 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 81 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 82 | 83 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 84 | THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); 85 | 86 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 87 | "Number of output features is not equal to nOutputPlane" ); 88 | 89 | // change to batch mode 90 | int batch = 1; 91 | if (input->nDimension == 3) { 92 | batch = 0; 93 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 94 | THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); 95 | } 96 | 97 | long batchSize = input->size[0]; 98 | long inputWidth = input->size[3]; 99 | long inputHeight = input->size[2]; 100 | long outputWidth = inputWidth; 101 | long outputHeight = inputHeight; 102 | 103 | THTensor_(resizeAs)(gradInput, input); 104 | THTensor_(transpose)(weight, weight, 0, 1); 105 | 106 | int elt; 107 | #pragma omp parallel for private(elt) 108 | for (elt = 0; elt < batchSize; elt++) { 109 | 110 | // select each batch in 2D 111 | THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); 112 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 113 | THTensor *gradInput2d = THTensor_(newWithStorage2d) 114 | (gradInput_t->storage, gradInput_t->storageOffset, 115 | nInputPlane, -1, inputWidth*inputHeight, -1); 116 | THTensor *gradOutput2d = THTensor_(newWithStorage2d) 117 | (gradOutput_t->storage, gradOutput_t->storageOffset, 118 | nOutputPlane, -1, outputWidth*outputHeight, -1); 119 | 120 | // convolve 121 | THTensor_(addmm)(gradInput2d, 0, gradInput2d, 1, weight, gradOutput2d); 122 | 123 | // release temp tensors 124 | THTensor_(free)(gradInput2d); 125 | THTensor_(free)(gradOutput2d); 126 | THTensor_(free)(gradInput_t); 127 | THTensor_(free)(gradOutput_t); 128 | } 129 | 130 | THTensor_(transpose)(weight, weight, 0, 1); 131 | 132 | // revert to single batch 133 | if (batch == 0) { 134 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 135 | THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); 136 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 137 | } 138 | 139 | return 1; 140 | } 141 | 142 | 143 | static int nnconv1d_(LateralConvolution_accGradParameters)(lua_State *L) 144 | { 145 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 146 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 147 | real scale = luaL_optnumber(L, 4, 1); 148 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 149 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 150 | 151 | THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); 152 | THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); 153 | THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); 154 | 155 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 156 | "Number of output features is not equal to nOutputPlane" ); 157 | 158 | // change to batch mode 159 | int batch = 1; 160 | if (input->nDimension == 3) { 161 | batch = 0; 162 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 163 | THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 164 | } 165 | 166 | long batchSize = input->size[0]; 167 | long inputWidth = input->size[3]; 168 | long inputHeight = input->size[2]; 169 | long outputWidth = inputWidth; 170 | long outputHeight = inputHeight; 171 | 172 | if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { 173 | THTensor_(resize1d)(ones, outputHeight*outputWidth); 174 | THTensor_(fill)(ones, 1); 175 | } 176 | 177 | int elt; 178 | for (elt = 0; elt < batchSize; elt++) { 179 | 180 | // select each batch in 2D (the 2D views must use the selected element's storage offset) 181 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 182 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 183 | THTensor *input2d = THTensor_(newWithStorage2d) 184 | (input_t->storage, input_t->storageOffset, 185 | nInputPlane, -1, inputWidth*inputHeight, -1); 186 | THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput_t->storage, gradOutput_t->storageOffset, 187 | nOutputPlane, -1, outputWidth*outputHeight, -1); 188 | 189 | // convolve 190 | THTensor_(transpose)(input2d, input2d, 0, 1); 191 | THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, input2d); 192 | THTensor_(transpose)(input2d, input2d, 0, 1); 193 | 194 | // fill biases 195 | THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); 196 | 197 | THTensor_(free)(input2d); 198 | THTensor_(free)(gradOutput2d); 199 | THTensor_(free)(input_t); 200 | THTensor_(free)(gradOutput_t); 201 | } 202 | 203 | // revert to single batch 204 | if (batch == 0) { 205 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 206 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 207 | } 208 | 209 | return 0; 210 | } 211 | 212 | static const struct luaL_Reg nnconv1d_(LateralConvolution__) [] = { 213 | {"LateralConvolution_updateOutput", nnconv1d_(LateralConvolution_updateOutput)}, 214 | {"LateralConvolution_updateGradInput", nnconv1d_(LateralConvolution_updateGradInput)}, 215 | {"LateralConvolution_accGradParameters", nnconv1d_(LateralConvolution_accGradParameters)}, 216 | {NULL, NULL} 217 | }; 218 | 219 | static void nnconv1d_(LateralConvolution_init)(lua_State *L) 220 | { 221 | luaT_pushmetatable(L, torch_Tensor); 222 | luaT_registeratname(L, nnconv1d_(LateralConvolution__), "nn"); 223 | lua_pop(L,1); 224 | } 225 | 226 | #endif 227 | -------------------------------------------------------------------------------- /nn-conv1d/generic/VerticalConvolution.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/VerticalConvolution.c" 3 | #else 4 | 5 | 6 | static int nnconv1d_(VerticalConvolution_updateOutput)(lua_State *L) 7 | { 8 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 9 | 10 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 11 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 12 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 13 | 14 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 15 | THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); 16 | THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); 17 | 
18 | luaL_argcheck(L, input->nDimension == 3 || 19 | input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); 20 | 21 | // change to batch mode 22 | int batch = 1; 23 | if (input->nDimension == 3) { 24 | batch = 0; 25 | THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); 26 | } 27 | 28 | long batchSize = input->size[0]; 29 | long inputHeight = input->size[2]; 30 | long inputWidth = input->size[3]; 31 | long outputHeight = inputHeight - kL + 1; 32 | long outputWidth = inputWidth; 33 | 34 | THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); 35 | 36 | int elt; 37 | #pragma omp parallel for private(elt) 38 | for (elt = 0; elt < batchSize; elt++) { 39 | 40 | // select each batch 41 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 42 | THTensor *output_t = THTensor_(newSelect)(output, 0, elt); 43 | 44 | // fill biases 45 | int i, j, k; 46 | for (i = 0; i < nOutputPlane; i++) { 47 | THVector_(fill)(output_t->storage->data + output_t->storageOffset + output_t->stride[0]*i, 48 | THTensor_(get1d)(bias, i), outputHeight*outputWidth); 49 | } 50 | 51 | // convolve vertically 52 | for (i = 0; i < nInputPlane; i++) { 53 | for (k = 0; k < kL; k++) { 54 | THVector_(add)(output_t->storage->data + output_t->storageOffset + output_t->stride[0]*i, 55 | input_t->storage->data + input_t->storageOffset + 56 | input_t->stride[0]*i + input_t->stride[1]*k, 57 | *(THTensor_(data)(weight)+i*kL+k), outputHeight*outputWidth); 58 | } 59 | } 60 | 61 | // release temp tensors 62 | THTensor_(free)(input_t); 63 | THTensor_(free)(output_t); 64 | } 65 | 66 | // revert to single batch 67 | if (batch == 0) { 68 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 69 | THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); 70 | } 71 | 72 | return 1; 73 | } 74 | 75 | 76 | static int nnconv1d_(VerticalConvolution_updateGradInput)(lua_State *L) 77 | { 78 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 79 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 80 | 81 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 82 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 83 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 84 | 85 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 86 | THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); 87 | 88 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 89 | "Number of output features is not equal to nOutputPlane" ); 90 | 91 | // change to batch mode 92 | int batch = 1; 93 | if (input->nDimension == 3) { 94 | batch = 0; 95 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 96 | THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); 97 | } 98 | 99 | long batchSize = input->size[0]; 100 | long inputHeight = input->size[2]; 101 | long inputWidth = input->size[3]; 102 | long outputHeight = inputHeight - kL + 1; 103 | long outputWidth = inputWidth; 104 | 105 | THTensor_(resizeAs)(gradInput, input); 106 | THTensor_(zero)(gradInput); 107 | 108 | int elt; 109 | #pragma omp parallel for private(elt) 110 | for (elt = 0; elt < batchSize; elt++) { 111 | 112 | // select each batch 113 | THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); 114 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 115 | 116 | // convolve vertically 117 | int i, k; 118 | for (i = 0; i < nOutputPlane; i++) { 119 | for (k = 0; k < kL; k++) { 120 | THVector_(add)(gradInput_t->storage->data + gradInput_t->storageOffset + 121 | gradInput_t->stride[0]*i + gradInput_t->stride[1]*k, 122 | gradOutput_t->storage->data + gradOutput_t->storageOffset + 123 | gradOutput_t->stride[0]*i, 124 | *(THTensor_(data)(weight)+i*kL+k), outputHeight*outputWidth); 125 | } 126 | } 127 | 128 | // release temp tensors 129 | THTensor_(free)(gradInput_t); 130 | THTensor_(free)(gradOutput_t); 131 | } 132 | 133 | // revert to single batch 134 | if (batch == 0) { 135 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 136 | THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); 137 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 138 | } 139 | 140 | return 1; 141 | } 142 | 143 | 144 | static int nnconv1d_(VerticalConvolution_accGradParameters)(lua_State *L) 145 | { 146 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 147 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 148 | real scale = luaL_optnumber(L, 4, 1); 149 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 150 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 151 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 152 | 153 | THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); 154 | THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); 155 | THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); 156 | 157 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 158 | "Number of output features is not equal to nOutputPlane" ); 159 | 160 | // change to batch mode 161 | int batch = 1; 162 | if (input->nDimension == 3) { 163 | batch = 0; 164 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 165 | THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 166 | } 167 | 168 | long batchSize = input->size[0]; 169 | long inputHeight = input->size[2]; 170 | long inputWidth = input->size[3]; 171 | long outputHeight = inputHeight - kL + 1; 172 | long outputWidth = inputWidth; 173 | 174 | if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { 175 | THTensor_(resize1d)(ones, outputHeight*outputWidth); 176 | THTensor_(fill)(ones, 1); 177 | } 178 | 179 | int elt; 180 | for (elt = 0; elt < batchSize; elt++) { 181 | 182 | // select each batch in 2D (the 2D view must use the selected element's storage offset) 183 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 184 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 185 | THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput_t->storage, gradOutput_t->storageOffset, 186 | nOutputPlane, -1, outputWidth*outputHeight, -1); 187 | 188 | // dot products 189 | int i, k; 190 | for (i = 0; i < nInputPlane; i++) { 191 | for (k = 0; k < kL; k++) { 192 | *(gradWeight->storage->data + gradWeight->storageOffset + i*gradWeight->stride[0] + k) += 193 | scale*THBlas_(dot) 194 | (outputHeight*outputWidth, 195 | gradOutput_t->storage->data + gradOutput_t->storageOffset + i*gradOutput_t->stride[0], 196 | gradOutput_t->stride[2], 197 | input_t->storage->data + input_t->storageOffset + i*input_t->stride[0] + k*outputWidth, 198 | input_t->stride[2]); 199 | } 200 | } 201 | 202 | // fill biases 203 | THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); 204 | 205 | THTensor_(free)(gradOutput2d); 206 | THTensor_(free)(input_t); 207 | THTensor_(free)(gradOutput_t); 208 | } 209 | 210 | // revert to single batch 211 | if (batch == 0) { 212 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 213 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 214 | } 215 | 216 | return 0; 217 | } 218 | 219 | 220 | static const struct luaL_Reg nnconv1d_(VerticalConvolution__) [] = { 221 | {"VerticalConvolution_updateOutput", nnconv1d_(VerticalConvolution_updateOutput)}, 222 | {"VerticalConvolution_updateGradInput", nnconv1d_(VerticalConvolution_updateGradInput)}, 223 | {"VerticalConvolution_accGradParameters", nnconv1d_(VerticalConvolution_accGradParameters)}, 224 | {NULL, NULL} 225 | }; 226 | 227 | 228 | static void nnconv1d_(VerticalConvolution_init)(lua_State *L) 229 | { 230 | luaT_pushmetatable(L, torch_Tensor); 231 | luaT_registeratname(L, nnconv1d_(VerticalConvolution__), "nn"); 232 | lua_pop(L,1); 233 | } 234 | 235 | #endif 236 | -------------------------------------------------------------------------------- /nn-conv1d/generic/HorizontalConvolution.c: -------------------------------------------------------------------------------- 1 | #ifndef TH_GENERIC_FILE 2 | #define TH_GENERIC_FILE "generic/HorizontalConvolution.c" 3 | #else 4 | 5 | 6 | static int nnconv1d_(HorizontalConvolution_updateOutput)(lua_State *L) 7 | { 8 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 9 | 10 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 11 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 12 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 13 | 14 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", 
torch_Tensor); 15 | THTensor *bias = luaT_getfieldcheckudata(L, 1, "bias", torch_Tensor); 16 | THTensor *output = luaT_getfieldcheckudata(L, 1, "output", torch_Tensor); 17 | 18 | luaL_argcheck(L, input->nDimension == 3 || 19 | input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected"); 20 | 21 | // change to batch mode 22 | int batch = 1; 23 | if (input->nDimension == 3) { 24 | batch = 0; 25 | THTensor_(resize4d)(input, 1, nInputPlane, input->size[1], input->size[2]); 26 | } 27 | 28 | long batchSize = input->size[0]; 29 | long inputHeight = input->size[2]; 30 | long inputWidth = input->size[3]; 31 | long outputHeight = inputHeight; 32 | long outputWidth = inputWidth - kL + 1; 33 | 34 | THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); 35 | 36 | int elt; 37 | #pragma omp parallel for private(elt) 38 | for (elt = 0; elt < batchSize; elt++) { 39 | 40 | // select each batch 41 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 42 | THTensor *output_t = THTensor_(newSelect)(output, 0, elt); 43 | 44 | // fill biases 45 | int i, j, k; 46 | for (i = 0; i < nOutputPlane; i++) { 47 | THVector_(fill)(output_t->storage->data+output_t->storageOffset+output_t->stride[0]*i, 48 | THTensor_(get1d)(bias, i), outputHeight*outputWidth); 49 | } 50 | 51 | // convolve horizontally 52 | for (i = 0; i < nInputPlane; i++) { 53 | for (j = 0; j < inputHeight; j++) { 54 | for (k = 0; k < kL; k++) { 55 | THVector_(add)(output_t->storage->data + output_t->storageOffset + 56 | output_t->stride[0]*i + output_t->stride[1]*j, 57 | input_t->storage->data + input_t->storageOffset + 58 | input_t->stride[0]*i + input_t->stride[1]*j + k, 59 | *(THTensor_(data)(weight)+i*kL+k), outputWidth); 60 | } 61 | } 62 | } 63 | 64 | // release temp tensors 65 | THTensor_(free)(input_t); 66 | THTensor_(free)(output_t); 67 | } 68 | 69 | // revert to single batch 70 | if (batch == 0) { 71 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 72 | THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); 73 | } 74 | 75 | return 1; 76 | } 77 | 78 | 79 | static int nnconv1d_(HorizontalConvolution_updateGradInput)(lua_State *L) 80 | { 81 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 82 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 83 | 84 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 85 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 86 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 87 | 88 | THTensor *weight = luaT_getfieldcheckudata(L, 1, "weight", torch_Tensor); 89 | THTensor *gradInput = luaT_getfieldcheckudata(L, 1, "gradInput", torch_Tensor); 90 | 91 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 92 | "Number of output features is not equal to nOutputPlane" ); 93 | 94 | // change to batch mode 95 | int batch = 1; 96 | if (input->nDimension == 3) { 97 | batch = 0; 98 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 99 | THTensor_(resize4d)(gradOutput, 1, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); 100 | } 101 | 102 | long batchSize = input->size[0]; 103 | long inputHeight = input->size[2]; 104 | long inputWidth = input->size[3]; 105 | long outputHeight = inputHeight; 106 | long outputWidth = inputWidth - kL + 1; 107 | 108 | THTensor_(resizeAs)(gradInput, input); 109 | THTensor_(zero)(gradInput); 110 | 111 | int elt; 112 | #pragma omp parallel for private(elt) 113 | for (elt = 0; elt < batchSize; elt++) { 114 | 115 | // select each batch 116 | THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, elt); 117 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 118 | 119 | // convolve horizontally 120 | int i, j, k; 121 | for (i = 0; i < nOutputPlane; i++) { 122 | for (j = 0; j < outputHeight; j++) { 123 | for (k = 0; k < kL; k++) { 124 | THVector_(add)(gradInput_t->storage->data + gradInput_t->storageOffset + 125 | gradInput_t->stride[0]*i + gradInput_t->stride[1]*j + k, 126 | gradOutput_t->storage->data + gradOutput_t->storageOffset + 127 | gradOutput_t->stride[0]*i + gradOutput_t->stride[1]*j, 128 | *(THTensor_(data)(weight)+i*kL+k), outputWidth); // needs to change 129 | } 130 | } 131 | } 132 | 133 | // release temp tensors 134 | THTensor_(free)(gradInput_t); 135 | THTensor_(free)(gradOutput_t); 136 | } 137 | 138 | // revert to single batch 139 | if (batch == 0) { 140 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 141 | THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); 142 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 143 | } 144 | 145 | return 1; 146 | } 147 | 148 | 149 | static int nnconv1d_(HorizontalConvolution_accGradParameters)(lua_State *L) 150 | { 151 | THTensor *input = luaT_checkudata(L, 2, torch_Tensor); 152 | THTensor *gradOutput = luaT_checkudata(L, 3, torch_Tensor); 153 | real scale = luaL_optnumber(L, 4, 1); 154 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 155 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 156 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 157 | 158 | THTensor *ones = luaT_getfieldcheckudata(L, 1, "ones", torch_Tensor); 159 | THTensor *gradWeight = luaT_getfieldcheckudata(L, 1, "gradWeight", torch_Tensor); 160 | THTensor *gradBias = luaT_getfieldcheckudata(L, 1, "gradBias", torch_Tensor); 161 | 162 | THArgCheck(nOutputPlane == gradOutput->size[input->nDimension == 4 ? 
1 : 0], 1, 163 | "Number of output features is not equal to nOutputPlane" ); 164 | 165 | // change to batch mode 166 | int batch = 1; 167 | if (input->nDimension == 3) { 168 | batch = 0; 169 | THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); 170 | THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 171 | } 172 | 173 | long batchSize = input->size[0]; 174 | long inputHeight = input->size[2]; 175 | long inputWidth = input->size[3]; 176 | long outputHeight = inputHeight; 177 | long outputWidth = inputWidth - kL + 1; 178 | 179 | if (ones->nDimension != 1 || ones->size[0] < outputHeight*outputWidth) { 180 | THTensor_(resize1d)(ones, outputHeight*outputWidth); 181 | THTensor_(fill)(ones, 1); 182 | } 183 | 184 | int elt; 185 | for (elt = 0; elt < batchSize; elt++) { 186 | 187 | // select each batch in 2D (the 2D view must use the selected element's storage offset) 188 | THTensor *input_t = THTensor_(newSelect)(input, 0, elt); 189 | THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, elt); 190 | THTensor *gradOutput2d = THTensor_(newWithStorage2d)(gradOutput_t->storage, gradOutput_t->storageOffset, 191 | nOutputPlane, -1, outputWidth*outputHeight, -1); 192 | 193 | // dot products 194 | int i, j, k; 195 | for (i = 0; i < nInputPlane; i++) { 196 | for (k = 0; k < kL; k++) { 197 | for (j = 0; j < outputHeight; j++) { 198 | *(gradWeight->storage->data + gradWeight->storageOffset + i*gradWeight->stride[0] + k) += 199 | scale*THBlas_(dot) 200 | (outputWidth, 201 | gradOutput_t->storage->data + gradOutput_t->storageOffset + 202 | i*gradOutput_t->stride[0] + j*gradOutput_t->stride[1], 203 | gradOutput_t->stride[2], 204 | input_t->storage->data + input_t->storageOffset + 205 | i*input_t->stride[0] + j*input_t->stride[1] + k, 206 | input_t->stride[2]); 207 | } 208 | } 209 | } 210 | 211 | // fill biases 212 | THTensor_(addmv)(gradBias, 1, gradBias, scale, gradOutput2d, ones); 213 | 214 | THTensor_(free)(gradOutput2d); 215 | THTensor_(free)(input_t); 216 | THTensor_(free)(gradOutput_t); 217 | } 218 | 219 | // revert to single batch 220 | if (batch == 0) { 221 | THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); 222 | THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); 223 | } 224 | 225 | return 0; 226 | } 227 | 228 | 229 | static const struct luaL_Reg nnconv1d_(HorizontalConvolution__) [] = { 230 | {"HorizontalConvolution_updateOutput", nnconv1d_(HorizontalConvolution_updateOutput)}, 231 | {"HorizontalConvolution_updateGradInput", nnconv1d_(HorizontalConvolution_updateGradInput)}, 232 | {"HorizontalConvolution_accGradParameters", nnconv1d_(HorizontalConvolution_accGradParameters)}, 233 | {NULL, NULL} 234 | }; 235 | 236 | 237 | static void nnconv1d_(HorizontalConvolution_init)(lua_State *L) 238 | { 239 | luaT_pushmetatable(L, torch_Tensor); 240 | luaT_registeratname(L, nnconv1d_(HorizontalConvolution__), "nn"); 241 | lua_pop(L,1); 242 | } 243 | 244 | #endif 245 | -------------------------------------------------------------------------------- /cunn-conv1d/LateralConvolution.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | 4 | static int cunnconv1d_LateralConvolution_updateOutput(lua_State *L) { 5 | THCState *state = getCutorchState(L); 6 | THCudaTensor *input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 7 | 8 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 9 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 10 | 11 | THCudaTensor *weight = 
(THCudaTensor*)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 12 | THCudaTensor *bias = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 13 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 14 | THCudaTensor *output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"); 15 | 16 | const int device = THCudaTensor_getDevice(state, weight); 17 | luaL_argcheck(L, THCudaTensor_getDevice(state, bias) == device, 1, 18 | "weight and bias need to be on the same device"); 19 | luaL_argcheck(L, THCudaTensor_getDevice(state, output) == device || 20 | THCudaTensor_getDevice(state, output) == -1, 1, 21 | "weight and output need to be on the same device"); 22 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 23 | "weight and input need to be on the same device"); 24 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 25 | "3D or 4D (batch mode) tensor is expected"); 26 | 27 | // change to batch mode 28 | int batch = 1; 29 | if (input->nDimension == 3) { 30 | luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); 31 | batch = 0; 32 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 33 | } else { 34 | luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); 35 | } 36 | 37 | long batchSize = input->size[0]; 38 | long inputHeight = input->size[2]; 39 | long inputWidth = input->size[3]; 40 | long outputHeight = inputHeight; 41 | long outputWidth = inputWidth; 42 | 43 | THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); 44 | 45 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 46 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 47 | THCudaTensor_fill(state, ones, 1); 48 | } 49 | 50 | THCudaTensor *input_n = THCudaTensor_new(state); 51 | THCudaTensor *output_n = THCudaTensor_new(state); 52 | 53 | for (int elt = 0; elt < batchSize; elt ++) { 54 | 55 | // select each batch 56 | THCudaTensor_select(state, input_n, input, 0, elt); 57 | THCudaTensor_select(state, output_n, output, 0, elt); 58 | 59 | // fill biases 60 | THCudaBlas_gemm( 61 | state, 't', 'n', 62 | outputHeight*outputWidth, nOutputPlane, 1, 63 | 1, 64 | THCudaTensor_data(state, ones), 1, 65 | THCudaTensor_data(state, bias), 1, 66 | 0, 67 | THCudaTensor_data(state, output_n), outputHeight*outputWidth 68 | ); 69 | 70 | // convolve 71 | THCudaBlas_gemm( 72 | state, 73 | 'n', 'n', 74 | outputHeight*outputWidth, nOutputPlane, nInputPlane, 75 | 1, 76 | THCudaTensor_data(state, input_n), outputHeight*outputWidth, 77 | THCudaTensor_data(state, weight), nInputPlane, 78 | 1, 79 | THCudaTensor_data(state, output_n), outputHeight*outputWidth 80 | ); 81 | } 82 | 83 | THCudaTensor_free(state, input_n); 84 | THCudaTensor_free(state, output_n); 85 | 86 | // revert to single batch 87 | if (batch == 0) { 88 | THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); 89 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 90 | } 91 | 92 | return 1; 93 | } 94 | 95 | 96 | static int cunnconv1d_LateralConvolution_updateGradInput(lua_State *L) { 97 | THCState *state = getCutorchState(L); 98 | THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor"); 99 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 100 | 101 | 
int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 102 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 103 | 104 | THCudaTensor *weight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 105 | THCudaTensor *gradInput = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor"); 106 | 107 | const int device = THCudaTensor_getDevice(state, weight); 108 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 109 | "weight and input need to be on the same device"); 110 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradInput) == device 111 | || THCudaTensor_getDevice(state, gradInput) == -1, 2, 112 | "weight and gradInput need to be on the same device"); 113 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device 114 | || THCudaTensor_getDevice(state, gradOutput) == -1, 2, 115 | "weight and gradOutput need to be on the same device"); 116 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 117 | "3D or 4D (batch mode) tensor is expected"); 118 | 119 | int batch = 1; 120 | if (input->nDimension == 3) { 121 | batch = 0; 122 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 123 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 124 | } 125 | 126 | long batchSize = input->size[0]; 127 | long inputHeight = input->size[2]; 128 | long inputWidth = input->size[3]; 129 | long outputHeight = inputHeight; 130 | long outputWidth = inputWidth; 131 | 132 | THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); 133 | 134 | THCudaTensor *gradInput_n = THCudaTensor_new(state); 135 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 136 | 137 | for (int elt = 0; elt < batchSize; elt ++) { 138 | 139 | // select each batch in 2D 140 | THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); 141 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 142 | 143 | // convolve 144 | THCudaBlas_gemm( 145 | state, 146 | 'n', 't', 147 | outputHeight*outputWidth, nInputPlane, nOutputPlane, 148 | 1, 149 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 150 | THCudaTensor_data(state, weight), nInputPlane, 151 | 0, 152 | THCudaTensor_data(state, gradInput_n), outputHeight*outputWidth 153 | ); 154 | } 155 | 156 | THCudaTensor_free(state, gradInput_n); 157 | THCudaTensor_free(state, gradOutput_n); 158 | 159 | // revert to single batch 160 | if (batch == 0) { 161 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 162 | THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); 163 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 164 | } 165 | 166 | return 1; 167 | } 168 | 169 | 170 | static int cunnconv1d_LateralConvolution_accGradParameters(lua_State *L) { 171 | THCState *state = getCutorchState(L); 172 | THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor"); 173 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 174 | 175 | float scale = luaL_optnumber(L, 4, 1); 176 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 177 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 178 | 179 | THCudaTensor *gradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", "torch.CudaTensor"); 180 | THCudaTensor *gradBias = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, 
"gradBias", "torch.CudaTensor"); 181 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 182 | 183 | const int device = THCudaTensor_getDevice(state, gradWeight); 184 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradBias) == device, 1, 185 | "gradWeight and gradBias need to be on the same device"); 186 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 1, 187 | "gradWeight and input need to be on the same device"); 188 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device, 1, 189 | "gradWeight and gradOutput need to be on the same device"); 190 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 191 | "3D or 4D (batch mode) tensor is expected"); 192 | 193 | // change to batch mode 194 | int batch = 1; 195 | if (input->nDimension == 3) { 196 | batch = 0; 197 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 198 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 199 | } 200 | 201 | long batchSize = input->size[0]; 202 | long inputHeight = input->size[2]; 203 | long inputWidth = input->size[3]; 204 | long outputHeight = inputHeight; 205 | long outputWidth = inputWidth; 206 | 207 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 208 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 209 | THCudaTensor_fill(state, ones, 1); 210 | } 211 | 212 | THCudaTensor *input_n = THCudaTensor_new(state); 213 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 214 | 215 | for (int elt = 0; elt < batchSize; elt ++) { 216 | 217 | // select each batch 218 | THCudaTensor_select(state, input_n, input, 0, elt); 219 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 220 | 221 | // convolve 222 | THCudaBlas_gemm( 223 | state, 224 | 't', 'n', 225 | nInputPlane, nOutputPlane, outputHeight*outputWidth, 226 | scale, 227 | THCudaTensor_data(state, input_n), inputHeight*inputWidth, 228 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 229 | 1, 230 | THCudaTensor_data(state, gradWeight), nInputPlane 231 | ); 232 | 233 | // fill biases 234 | THCudaBlas_gemv( 235 | state, 236 | 't', 237 | outputHeight*outputWidth, nOutputPlane, 238 | scale, 239 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 240 | THCudaTensor_data(state, ones), 1, 241 | 1, 242 | THCudaTensor_data(state, gradBias), 1 243 | ); 244 | } 245 | 246 | THCudaTensor_free(state, input_n); 247 | THCudaTensor_free(state, gradOutput_n); 248 | 249 | if (batch == 0) { 250 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 251 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 252 | } 253 | 254 | return 0; 255 | } 256 | 257 | 258 | static const struct luaL_Reg cunnconv1d_LateralConvolution__ [] = { 259 | {"LateralConvolution_updateOutput", cunnconv1d_LateralConvolution_updateOutput}, 260 | {"LateralConvolution_updateGradInput", cunnconv1d_LateralConvolution_updateGradInput}, 261 | {"LateralConvolution_accGradParameters", cunnconv1d_LateralConvolution_accGradParameters}, 262 | {NULL, NULL} 263 | }; 264 | 265 | 266 | void cunnconv1d_LateralConvolution_init(lua_State *L) 267 | { 268 | luaT_pushmetatable(L, "torch.CudaTensor"); 269 | luaT_registeratname(L, cunnconv1d_LateralConvolution__, "nn"); 270 | lua_pop(L,1); 271 | } 272 | -------------------------------------------------------------------------------- 
/cunn-conv1d/VerticalConvolution.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "common.h" 3 | 4 | // Kernel for fast unfold+copy 5 | // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) 6 | __global__ void im2col_kernel_v(const int n, const float* data_im, 7 | const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, 8 | const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, 9 | float* data_col) { 10 | CUDA_KERNEL_LOOP(index, n) { 11 | int w_out = index % width_col; 12 | index /= width_col; 13 | int h_out = index % height_col; 14 | int channel_in = index / height_col; 15 | int channel_out = channel_in * ksize_h * ksize_w; 16 | int h_in = h_out * stride_h - pad_h; 17 | int w_in = w_out * stride_w - pad_w; 18 | data_col += (channel_out * height_col + h_out) * width_col + w_out; 19 | data_im += (channel_in * height + h_in) * width + w_in; 20 | for (int i = 0; i < ksize_h; ++i) { 21 | for (int j = 0; j < ksize_w; ++j) { 22 | int h = h_in + i; 23 | int w = w_in + j; 24 | *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 25 | data_im[i * width + j] : 0; 26 | data_col += height_col * width_col; 27 | } 28 | } 29 | } 30 | } 31 | 32 | 33 | __global__ void conv_vertical_naive_output(const int n, float *y, 34 | const float *x, const float *w, 35 | const int iH, const int iW, const int kL) 36 | { 37 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 38 | int oH = iH - kL + 1; 39 | int x_offset = (i/(oH*iW))*iH*iW + i%(oH*iW); 40 | int w_offset = (i/(oH*iW))*kL; 41 | 42 | for (int k = 0; k < kL; k++) { 43 | y[i] += w[w_offset + k]*x[x_offset + k*iW]; 44 | } 45 | } 46 | } 47 | 48 | 49 | __global__ void conv_vertical_naive_gradInput(const int n, float *dx, 50 | const float *dy, const float *w, 51 | const int oH, const int oW, const int kL) 52 | { 53 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 54 | int iH = oH + kL - 1; 55 | int iC = i/(iH*oW); 56 | int row = (i%(iH*oW))/oW; 57 | int dy_offset = iC*oH*oW + i%(iH*oW); 58 | int w_offset = iC*kL; 59 | 60 | int k_begin = max(0, row-oH+1); 61 | int k_end = min(kL, row+1); 62 | 63 | dx[i] = 0.0f; 64 | for (int k = k_begin; k < k_end; k++) { 65 | dx[i] += w[w_offset + k]*dy[dy_offset - k*oW]; 66 | } 67 | } 68 | } 69 | 70 | 71 | __global__ void conv_vertical_naive_gradParam(const int n, float *dw, 72 | const float *x, const float *dy, 73 | const int kL, const int oH, const int oW) 74 | { 75 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 76 | int dy_offset = (i/kL)*oH*oW; 77 | int x_offset = (i/kL)*oH*oW + (i%kL)*oW; 78 | 79 | for (int k = 0; k < oH*oW; k++) { 80 | dw[i] += dy[dy_offset + k]*x[x_offset + k]; 81 | } 82 | } 83 | } 84 | 85 | 86 | __global__ void conv_vertical_naive_gradWeight(const int n, float *y, 87 | const float *x, const int kL, 88 | const int iC) 89 | { 90 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 91 | y[i] = x[(i/kL)*kL*iC + i]; 92 | } 93 | } 94 | 95 | 96 | static int cunnconv1d_VerticalConvolution_updateOutput(lua_State *L) { 97 | THCState *state = getCutorchState(L); 98 | THCudaTensor *input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 99 | 100 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 101 | int nOutputPlane = luaT_getfieldcheckint(L, 1, 
"nOutputPlane"); 102 | 103 | THCudaTensor *weight = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 104 | THCudaTensor *bias = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 105 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 106 | THCudaTensor *output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"); 107 | 108 | const int device = THCudaTensor_getDevice(state, weight); 109 | luaL_argcheck(L, THCudaTensor_getDevice(state, bias) == device, 1, 110 | "weight and bias need to be on the same device"); 111 | luaL_argcheck(L, THCudaTensor_getDevice(state, output) == device || 112 | THCudaTensor_getDevice(state, output) == -1, 1, 113 | "weight and output need to be on the same device"); 114 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 115 | "weight and input need to be on the same device"); 116 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 117 | "3D or 4D (batch mode) tensor is expected"); 118 | 119 | // change to batch mode 120 | int batch = 1; 121 | if (input->nDimension == 3) { 122 | luaL_argcheck(L, input->size[0] == nInputPlane, 2, "input channels and nInputPlane dont match"); 123 | batch = 0; 124 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 125 | } else { 126 | luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match"); 127 | } 128 | 129 | long batchSize = input->size[0]; 130 | long inputHeight = input->size[2]; 131 | long inputWidth = input->size[3]; 132 | long outputHeight = inputHeight - weight->size[1] + 1; 133 | long outputWidth = inputWidth; 134 | 135 | THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth); 136 | 137 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 138 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 139 | THCudaTensor_fill(state, ones, 1); 140 | } 141 | 142 | THCudaTensor *input_n = THCudaTensor_new(state); 143 | THCudaTensor *output_n = THCudaTensor_new(state); 144 | 145 | for (int elt = 0; elt < batchSize; elt ++) { 146 | 147 | // select each batch 148 | THCudaTensor_select(state, input_n, input, 0, elt); 149 | THCudaTensor_select(state, output_n, output, 0, elt); 150 | 151 | // fill biases 152 | THCudaBlas_gemm( 153 | state, 't', 'n', 154 | outputHeight*outputWidth, nOutputPlane, 1, 155 | 1, 156 | THCudaTensor_data(state, ones), 1, 157 | THCudaTensor_data(state, bias), 1, 158 | 0, 159 | THCudaTensor_data(state, output_n), outputHeight*outputWidth 160 | ); 161 | 162 | // convolve 163 | long num_threads = nOutputPlane*outputHeight*outputWidth; 164 | conv_vertical_naive_output <<>> 165 | (num_threads, 166 | THCudaTensor_data(state, output_n), 167 | THCudaTensor_data(state, input_n), 168 | THCudaTensor_data(state, weight), 169 | inputHeight, inputWidth, weight->size[1]); 170 | } 171 | 172 | THCudaTensor_free(state, input_n); 173 | THCudaTensor_free(state, output_n); 174 | 175 | // revert to single batch 176 | if (batch == 0) { 177 | THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth); 178 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 179 | } 180 | 181 | return 1; 182 | } 183 | 184 | 185 | static int cunnconv1d_VerticalConvolution_updateGradInput(lua_State *L) { 186 | THCState *state = getCutorchState(L); 187 | THCudaTensor *input = (THCudaTensor 
*)luaT_checkudata(L, 2, "torch.CudaTensor"); 188 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 189 | 190 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 191 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 192 | 193 | THCudaTensor *weight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 194 | THCudaTensor *gradInput = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor"); 195 | 196 | const int device = THCudaTensor_getDevice(state, weight); 197 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 198 | "weight and input need to be on the same device"); 199 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradInput) == device 200 | || THCudaTensor_getDevice(state, gradInput) == -1, 2, 201 | "weight and gradInput need to be on the same device"); 202 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device 203 | || THCudaTensor_getDevice(state, gradOutput) == -1, 2, 204 | "weight and gradOutput need to be on the same device"); 205 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 206 | "3D or 4D (batch mode) tensor is expected"); 207 | 208 | int batch = 1; 209 | if (input->nDimension == 3) { 210 | batch = 0; 211 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 212 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 213 | } 214 | 215 | long batchSize = input->size[0]; 216 | long inputHeight = input->size[2]; 217 | long inputWidth = input->size[3]; 218 | long outputHeight = inputHeight - weight->size[1] + 1; 219 | long outputWidth = inputWidth; 220 | 221 | THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth); 222 | 223 | THCudaTensor *gradInput_n = THCudaTensor_new(state); 224 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 225 | 226 | for (int elt = 0; elt < batchSize; elt ++) { 227 | 228 | // select each batch in 2D 229 | THCudaTensor_select(state, gradInput_n, gradInput, 0, elt); 230 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 231 | 232 | // convolve 233 | long num_threads = nInputPlane*inputHeight*inputWidth; 234 | conv_vertical_naive_gradInput <<>> 235 | (num_threads, 236 | THCudaTensor_data(state, gradInput_n), 237 | THCudaTensor_data(state, gradOutput_n), 238 | THCudaTensor_data(state, weight), 239 | outputHeight, outputWidth, weight->size[1]); 240 | } 241 | 242 | THCudaTensor_free(state, gradInput_n); 243 | THCudaTensor_free(state, gradOutput_n); 244 | 245 | // revert to single batch 246 | if (batch == 0) { 247 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 248 | THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth); 249 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 250 | } 251 | 252 | return 1; 253 | } 254 | 255 | 256 | static int cunnconv1d_VerticalConvolution_accGradParameters(lua_State *L) { 257 | THCState *state = getCutorchState(L); 258 | THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor"); 259 | THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor"); 260 | 261 | float scale = luaL_optnumber(L, 4, 1); 262 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 263 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 264 | int kL = luaT_getfieldcheckint(L, 1, "kL"); 265 | 
266 | THCudaTensor *gradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", "torch.CudaTensor"); 267 | THCudaTensor *gradBias = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", "torch.CudaTensor"); 268 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 269 | THCudaTensor *finput = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "finput", "torch.CudaTensor"); 270 | THCudaTensor *fgradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "fgradWeight", "torch.CudaTensor"); 271 | 272 | const int device = THCudaTensor_getDevice(state, gradWeight); 273 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradBias) == device, 1, 274 | "gradWeight and gradBias need to be on the same device"); 275 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 1, 276 | "gradWeight and input need to be on the same device"); 277 | luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device, 1, 278 | "gradWeight and gradOutput need to be on the same device"); 279 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 280 | "3D or 4D (batch mode) tensor is expected"); 281 | 282 | // change to batch mode 283 | int batch = 1; 284 | if (input->nDimension == 3) { 285 | batch = 0; 286 | THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]); 287 | THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); 288 | } 289 | 290 | long batchSize = input->size[0]; 291 | long inputHeight = input->size[2]; 292 | long inputWidth = input->size[3]; 293 | long outputHeight = inputHeight - kL + 1; 294 | long outputWidth = inputWidth; 295 | 296 | if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { 297 | THCudaTensor_resize2d(state, ones, outputHeight, outputWidth); 298 | THCudaTensor_fill(state, ones, 1); 299 | } 300 | 301 | THCudaTensor_resize2d(state, finput, kL*nInputPlane, outputHeight*outputWidth); 302 | THCudaTensor_resize2d(state, fgradWeight, nOutputPlane, kL*nInputPlane); 303 | 304 | THCudaTensor *input_n = THCudaTensor_new(state); 305 | THCudaTensor *gradOutput_n = THCudaTensor_new(state); 306 | 307 | for (int elt = 0; elt < batchSize; elt ++) { 308 | 309 | // select each batch 310 | THCudaTensor_select(state, input_n, input, 0, elt); 311 | THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt); 312 | 313 | // unroll 314 | long num_threads = nInputPlane*outputHeight*outputWidth; 315 | im2col_kernel_v <<>> ( 316 | num_threads, 317 | THCudaTensor_data(state, input_n), 318 | inputHeight, inputWidth, kL, 1, 0, 0, 1, 1, 319 | outputHeight, outputWidth, 320 | THCudaTensor_data(state, finput) 321 | ); 322 | 323 | // convolve 324 | THCudaBlas_gemm( 325 | state, 't', 'n', 326 | kL*nInputPlane, nOutputPlane, outputHeight*outputWidth, 327 | scale, 328 | THCudaTensor_data(state, finput), outputHeight*outputWidth, 329 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 330 | (elt > 0), 331 | THCudaTensor_data(state, fgradWeight), kL*nInputPlane 332 | ); 333 | 334 | // fill biases 335 | THCudaBlas_gemv( 336 | state, 337 | 't', 338 | outputHeight*outputWidth, nOutputPlane, 339 | scale, 340 | THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth, 341 | THCudaTensor_data(state, ones), 1, 342 | 1, 343 | THCudaTensor_data(state, gradBias), 1 344 | ); 345 | } 346 | 347 | // extract gradWeight 348 | long num_threads_ = kL*nInputPlane; 349 | conv_vertical_naive_gradWeight <<>> ( 350 | 
num_threads_, 351 | THCudaTensor_data(state, gradWeight), 352 | THCudaTensor_data(state, fgradWeight), 353 | kL, nInputPlane 354 | ); 355 | 356 | THCudaTensor_free(state, input_n); 357 | THCudaTensor_free(state, gradOutput_n); 358 | 359 | if (batch == 0) { 360 | THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth); 361 | THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth); 362 | } 363 | 364 | return 0; 365 | } 366 | 367 | 368 | static const struct luaL_Reg cunnconv1d_VerticalConvolution__ [] = { 369 | {"VerticalConvolution_updateOutput", cunnconv1d_VerticalConvolution_updateOutput}, 370 | {"VerticalConvolution_updateGradInput", cunnconv1d_VerticalConvolution_updateGradInput}, 371 | {"VerticalConvolution_accGradParameters", cunnconv1d_VerticalConvolution_accGradParameters}, 372 | {NULL, NULL} 373 | }; 374 | 375 | 376 | void cunnconv1d_VerticalConvolution_init(lua_State *L) 377 | { 378 | luaT_pushmetatable(L, "torch.CudaTensor"); 379 | luaT_registeratname(L, cunnconv1d_VerticalConvolution__, "nn"); 380 | lua_pop(L,1); 381 | } 382 | -------------------------------------------------------------------------------- /cunn-conv1d/HorizontalConvolution.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | #include "common.h" 3 | 4 | // Kernel for fast unfold+copy 5 | // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu) 6 | __global__ void im2col_kernel_h(const int n, const float* data_im, 7 | const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h, 8 | const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col, 9 | float* data_col) { 10 | CUDA_KERNEL_LOOP(index, n) { 11 | int w_out = index % width_col; 12 | index /= width_col; 13 | int h_out = index % height_col; 14 | int channel_in = index / height_col; 15 | int channel_out = channel_in * ksize_h * ksize_w; 16 | int h_in = h_out * stride_h - pad_h; 17 | int w_in = w_out * stride_w - pad_w; 18 | data_col += (channel_out * height_col + h_out) * width_col + w_out; 19 | data_im += (channel_in * height + h_in) * width + w_in; 20 | for (int i = 0; i < ksize_h; ++i) { 21 | for (int j = 0; j < ksize_w; ++j) { 22 | int h = h_in + i; 23 | int w = w_in + j; 24 | *data_col = (h >= 0 && w >= 0 && h < height && w < width) ? 
25 | data_im[i * width + j] : 0; 26 | data_col += height_col * width_col; 27 | } 28 | } 29 | } 30 | } 31 | 32 | 33 | __global__ void conv_horizontal_naive_output(const int n, float *y, 34 | const float *x, const float *w, 35 | const int iH, const int iW, const int kL) 36 | { 37 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 38 | int oW = iW - kL + 1; 39 | int x_offset = (i/oW)*iW + i%oW; 40 | int w_offset = (i/(oW*iH))*kL; 41 | 42 | for (int k = 0; k < kL; k++) { 43 | y[i] += w[w_offset + k]*x[x_offset + k]; 44 | } 45 | } 46 | } 47 | 48 | 49 | __global__ void conv_horizontal_naive_gradInput(const int n, float *dx, 50 | const float *dy, const float *w, 51 | const int oH, const int oW, const int kL) 52 | { 53 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 54 | int iW = oW + kL - 1; 55 | int col = i%iW; 56 | int dy_offset = (i/iW)*oW + i%iW; 57 | int w_offset = (i/(iW*oH))*kL; 58 | 59 | int k_begin = max(0, col-oW+1); 60 | int k_end = min(kL, col+1); 61 | 62 | dx[i] = 0.0f; 63 | for (int k = k_begin; k < k_end; k++) { 64 | dx[i] += w[w_offset + k]*dy[dy_offset - k]; 65 | } 66 | } 67 | } 68 | 69 | 70 | __global__ void conv_horizontal_naive_gradParam(const int n, float *dw, 71 | const float *x, const float *dy, 72 | const int kL, const int oH, const int oW) 73 | { 74 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 75 | int iW = oW + kL - 1; 76 | int dy_offset = (i/kL)*oH*oW; 77 | int x_offset = (i/kL)*oH*oW + i%kL; 78 | 79 | for (int j = 0; j < oH; j++) { 80 | for (int k = 0; k < oW; k++) { 81 | dw[i] += dy[dy_offset + j*oW + k]*x[x_offset + j*iW + k]; 82 | } 83 | } 84 | } 85 | } 86 | 87 | 88 | __global__ void conv_horizontal_naive_gradWeight(const int n, float *y, 89 | const float *x, const int kL, 90 | const int iC) 91 | { 92 | for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < n; i += blockDim.x*gridDim.x) { 93 | y[i] = x[(i/kL)*kL*iC + i]; 94 | } 95 | } 96 | 97 | 98 | static int cunnconv1d_HorizontalConvolution_updateOutput(lua_State *L) { 99 | THCState *state = getCutorchState(L); 100 | THCudaTensor *input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 101 | 102 | int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane"); 103 | int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane"); 104 | 105 | THCudaTensor *weight = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor"); 106 | THCudaTensor *bias = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 107 | THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor"); 108 | THCudaTensor *output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"); 109 | 110 | const int device = THCudaTensor_getDevice(state, weight); 111 | luaL_argcheck(L, THCudaTensor_getDevice(state, bias) == device, 1, 112 | "weight and bias need to be on the same device"); 113 | luaL_argcheck(L, THCudaTensor_getDevice(state, output) == device || 114 | THCudaTensor_getDevice(state, output) == -1, 1, 115 | "weight and output need to be on the same device"); 116 | luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2, 117 | "weight and input need to be on the same device"); 118 | luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, 119 | "3D or 4D (batch mode) tensor is expected"); 120 | 121 | // change to batch mode 122 | int batch = 1; 123 | if (input->nDimension == 3) { 124 | luaL_argcheck(L, input->size[0] 
== nInputPlane, 2, "input channels and nInputPlane dont match");
125 |     batch = 0;
126 |     THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
127 |   } else {
128 |     luaL_argcheck(L, input->size[1] == nInputPlane, 2, "input channels and nInputPlane dont match");
129 |   }
130 |
131 |   long batchSize = input->size[0];
132 |   long inputHeight = input->size[2];
133 |   long inputWidth = input->size[3];
134 |   long outputHeight = inputHeight;
135 |   long outputWidth = inputWidth - weight->size[1] + 1;
136 |
137 |   THCudaTensor_resize4d(state, output, batchSize, nOutputPlane, outputHeight, outputWidth);
138 |
139 |   if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
140 |     THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
141 |     THCudaTensor_fill(state, ones, 1);
142 |   }
143 |
144 |   THCudaTensor *input_n = THCudaTensor_new(state);
145 |   THCudaTensor *output_n = THCudaTensor_new(state);
146 |
147 |   for (int elt = 0; elt < batchSize; elt ++) {
148 |
149 |     // select each batch
150 |     THCudaTensor_select(state, input_n, input, 0, elt);
151 |     THCudaTensor_select(state, output_n, output, 0, elt);
152 |
153 |     // fill biases
154 |     THCudaBlas_gemm(
155 |       state, 't', 'n',
156 |       outputHeight*outputWidth, nOutputPlane, 1,
157 |       1,
158 |       THCudaTensor_data(state, ones), 1,
159 |       THCudaTensor_data(state, bias), 1,
160 |       0,
161 |       THCudaTensor_data(state, output_n), outputHeight*outputWidth
162 |     );
163 |
164 |     // convolve
165 |     long num_threads = nOutputPlane*outputHeight*outputWidth;
166 |     conv_horizontal_naive_output <<<GET_BLOCKS(num_threads), CUDA_NUM_THREADS>>>
167 |       (num_threads,
168 |        THCudaTensor_data(state, output_n),
169 |        THCudaTensor_data(state, input_n),
170 |        THCudaTensor_data(state, weight),
171 |        inputHeight, inputWidth, weight->size[1]);
172 |   }
173 |
174 |   THCudaTensor_free(state, input_n);
175 |   THCudaTensor_free(state, output_n);
176 |
177 |   // revert to single batch
178 |   if (batch == 0) {
179 |     THCudaTensor_resize3d(state, output, nOutputPlane, outputHeight, outputWidth);
180 |     THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
181 |   }
182 |
183 |   return 1;
184 | }
185 |
186 |
187 | static int cunnconv1d_HorizontalConvolution_updateGradInput(lua_State *L) {
188 |   THCState *state = getCutorchState(L);
189 |   THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
190 |   THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
191 |
192 |   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
193 |   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
194 |
195 |   THCudaTensor *weight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "weight", "torch.CudaTensor");
196 |   THCudaTensor *gradInput = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor");
197 |
198 |   const int device = THCudaTensor_getDevice(state, weight);
199 |   luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 2,
200 |                 "weight and input need to be on the same device");
201 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradInput) == device
202 |                 || THCudaTensor_getDevice(state, gradInput) == -1, 2,
203 |                 "weight and gradInput need to be on the same device");
204 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device
205 |                 || THCudaTensor_getDevice(state, gradOutput) == -1, 2,
206 |                 "weight and gradOutput need to be on the same device");
207 |   luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2,
208 |                 "3D or 4D (batch mode) tensor is expected");
209 |
210 |   int batch = 1;
211 |   if (input->nDimension == 3) {
212 |     batch = 0;
213 |     THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
214 |     THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
215 |   }
216 |
217 |   long batchSize = input->size[0];
218 |   long inputHeight = input->size[2];
219 |   long inputWidth = input->size[3];
220 |   long outputHeight = inputHeight;
221 |   long outputWidth = inputWidth - weight->size[1] + 1;
222 |
223 |   THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
224 |
225 |   THCudaTensor *gradInput_n = THCudaTensor_new(state);
226 |   THCudaTensor *gradOutput_n = THCudaTensor_new(state);
227 |
228 |   for (int elt = 0; elt < batchSize; elt ++) {
229 |
230 |     // select each batch in 2D
231 |     THCudaTensor_select(state, gradInput_n, gradInput, 0, elt);
232 |     THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
233 |
234 |     // convolve
235 |     long num_threads = nInputPlane*inputHeight*inputWidth;
236 |     conv_horizontal_naive_gradInput <<<GET_BLOCKS(num_threads), CUDA_NUM_THREADS>>>
237 |       (num_threads,
238 |        THCudaTensor_data(state, gradInput_n),
239 |        THCudaTensor_data(state, gradOutput_n),
240 |        THCudaTensor_data(state, weight),
241 |        outputHeight, outputWidth, weight->size[1]);
242 |   }
243 |
244 |   THCudaTensor_free(state, gradInput_n);
245 |   THCudaTensor_free(state, gradOutput_n);
246 |
247 |   // revert to single batch
248 |   if (batch == 0) {
249 |     THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
250 |     THCudaTensor_resize3d(state, gradInput, nInputPlane, inputHeight, inputWidth);
251 |     THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
252 |   }
253 |
254 |   return 1;
255 | }
256 |
257 |
258 | static int cunnconv1d_HorizontalConvolution_accGradParameters(lua_State *L) {
259 |   THCState *state = getCutorchState(L);
260 |   THCudaTensor *input = (THCudaTensor *)luaT_checkudata(L, 2, "torch.CudaTensor");
261 |   THCudaTensor *gradOutput = (THCudaTensor *)luaT_checkudata(L, 3, "torch.CudaTensor");
262 |
263 |   float scale = luaL_optnumber(L, 4, 1);
264 |   int nInputPlane = luaT_getfieldcheckint(L, 1, "nInputPlane");
265 |   int nOutputPlane = luaT_getfieldcheckint(L, 1, "nOutputPlane");
266 |   int kL = luaT_getfieldcheckint(L, 1, "kL");
267 |
268 |   THCudaTensor *gradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradWeight", "torch.CudaTensor");
269 |   THCudaTensor *gradBias = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "gradBias", "torch.CudaTensor");
270 |   THCudaTensor *ones = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "ones", "torch.CudaTensor");
271 |   THCudaTensor *finput = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "finput", "torch.CudaTensor");
272 |   THCudaTensor *fgradWeight = (THCudaTensor *)luaT_getfieldcheckudata(L, 1, "fgradWeight", "torch.CudaTensor");
273 |
274 |   const int device = THCudaTensor_getDevice(state, gradWeight);
275 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradBias) == device, 1,
276 |                 "gradWeight and gradBias need to be on the same device");
277 |   luaL_argcheck(L, THCudaTensor_getDevice(state, input) == device, 1,
278 |                 "gradWeight and input need to be on the same device");
279 |   luaL_argcheck(L, THCudaTensor_getDevice(state, gradOutput) == device, 1,
280 |                 "gradWeight and gradOutput need to be on the same device");
281 |   luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2,
282 |                 "3D or 4D (batch mode) tensor is expected");
283 |
284 |   // change to batch mode
285 |   int batch = 1;
286 |   if (input->nDimension == 3) {
287 |     batch = 0;
288 |     THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
289 |     THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
290 |   }
291 |
292 |   long batchSize = input->size[0];
293 |   long inputHeight = input->size[2];
294 |   long inputWidth = input->size[3];
295 |   long outputHeight = inputHeight;
296 |   long outputWidth = inputWidth - kL + 1;
297 |
298 |   if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
299 |     THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
300 |     THCudaTensor_fill(state, ones, 1);
301 |   }
302 |
303 |   THCudaTensor_resize2d(state, finput, kL*nInputPlane, outputHeight*outputWidth);
304 |   THCudaTensor_resize2d(state, fgradWeight, nOutputPlane, kL*nInputPlane);
305 |
306 |   THCudaTensor *input_n = THCudaTensor_new(state);
307 |   THCudaTensor *gradOutput_n = THCudaTensor_new(state);
308 |
309 |   for (int elt = 0; elt < batchSize; elt ++) {
310 |
311 |     // select each batch
312 |     THCudaTensor_select(state, input_n, input, 0, elt);
313 |     THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
314 |
315 |     // unroll
316 |     long num_threads = nInputPlane*outputHeight*outputWidth;
317 |     im2col_kernel_h <<<GET_BLOCKS(num_threads), CUDA_NUM_THREADS>>> (
318 |       num_threads,
319 |       THCudaTensor_data(state, input_n),
320 |       inputHeight, inputWidth, 1, kL, 0, 0, 1, 1,
321 |       outputHeight, outputWidth,
322 |       THCudaTensor_data(state, finput)
323 |     );
324 |
325 |     // convolve
326 |     THCudaBlas_gemm(
327 |       state, 't', 'n',
328 |       kL*nInputPlane, nOutputPlane, outputHeight*outputWidth,
329 |       scale,
330 |       THCudaTensor_data(state, finput), outputHeight*outputWidth,
331 |       THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth,
332 |       (elt > 0),  // beta: overwrite on the first batch element, accumulate afterwards
333 |       THCudaTensor_data(state, fgradWeight), kL*nInputPlane
334 |     );
335 |
336 |     // fill biases
337 |     THCudaBlas_gemv(
338 |       state,
339 |       't',
340 |       outputHeight*outputWidth, nOutputPlane,
341 |       scale,
342 |       THCudaTensor_data(state, gradOutput_n), outputHeight*outputWidth,
343 |       THCudaTensor_data(state, ones), 1,
344 |       1,
345 |       THCudaTensor_data(state, gradBias), 1
346 |     );
347 |   }
348 |
349 |   // extract gradWeight
350 |   long num_threads_ = kL*nInputPlane;
351 |   conv_horizontal_naive_gradWeight <<<GET_BLOCKS(num_threads_), CUDA_NUM_THREADS>>> (
352 |     num_threads_,
353 |     THCudaTensor_data(state, gradWeight),
354 |     THCudaTensor_data(state, fgradWeight),
355 |     kL, nInputPlane
356 |   );
357 |
358 |   THCudaTensor_free(state, input_n);
359 |   THCudaTensor_free(state, gradOutput_n);
360 |
361 |   if (batch == 0) {
362 |     THCudaTensor_resize3d(state, gradOutput, nOutputPlane, outputHeight, outputWidth);
363 |     THCudaTensor_resize3d(state, input, nInputPlane, inputHeight, inputWidth);
364 |   }
365 |
366 |   return 0;
367 | }
368 |
369 |
370 | static const struct luaL_Reg cunnconv1d_HorizontalConvolution__ [] = {
371 |   {"HorizontalConvolution_updateOutput", cunnconv1d_HorizontalConvolution_updateOutput},
372 |   {"HorizontalConvolution_updateGradInput", cunnconv1d_HorizontalConvolution_updateGradInput},
373 |   {"HorizontalConvolution_accGradParameters", cunnconv1d_HorizontalConvolution_accGradParameters},
374 |   {NULL, NULL}
375 | };
376 |
377 |
378 | void cunnconv1d_HorizontalConvolution_init(lua_State *L)
379 | {
380 |   luaT_pushmetatable(L, "torch.CudaTensor");
381 |   luaT_registeratname(L, cunnconv1d_HorizontalConvolution__, "nn");
382 |   lua_pop(L,1);
383 | }
384 |
--------------------------------------------------------------------------------
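All three CUDA bindings register their functions on the torch.CudaTensor metatable under the "nn" namespace, so the same Lua modules dispatch to these kernels once the network and its inputs live on the GPU. A hedged end-to-end sketch follows; the package name comes from the rockspec, and the module signatures are the same assumptions as in the CPU sketch earlier:

    require('cunnconv1d')  -- loads the libcunnconv1d bindings registered above

    local net = nn.Sequential()
    net:add(nn.LateralConvolution(8, 8))
    net:add(nn.VerticalConvolution(8, 8, 3))
    net:add(nn.HorizontalConvolution(8, 8, 3))
    net:cuda()  -- move weights, biases, and work buffers to the GPU

    local x = torch.CudaTensor(16, 8, 24, 24):normal()  -- batch mode: BxCxHxW
    local y = net:forward(x)                            -- 16 x 8 x 22 x 22
    net:zeroGradParameters()
    local dx = net:backward(x, y:clone():fill(1))       -- exercises updateGradInput and accGradParameters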