├── .gitignore
├── lib
│   ├── THCUNN
│   │   ├── SpatialMaxPooling.cu
│   │   ├── TemporalConvolution.cu
│   │   ├── SpatialFullConvolution.cu
│   │   ├── VolumetricFullConvolution.cu
│   │   ├── SpatialFullDilatedConvolution.cu
│   │   ├── SpatialConvolutionMM.cu
│   │   ├── VolumetricDilatedConvolution.cu
│   │   ├── SpatialConvolutionLocal.cu
│   │   ├── SpatialDilatedConvolution.cu
│   │   ├── TemporalRowConvolution.cu
│   │   ├── VolumetricFullDilatedConvolution.cu
│   │   ├── SpatialDepthWiseConvolution.cu
│   │   ├── VolumetricMaxPooling.cu
│   │   ├── THCUNN.h
│   │   ├── Abs.cu
│   │   ├── Square.cu
│   │   ├── SharedMem.cuh
│   │   ├── Sqrt.cu
│   │   ├── generic
│   │   │   ├── Sigmoid.cu
│   │   │   ├── Abs.cu
│   │   │   ├── Tanh.cu
│   │   │   ├── Square.cu
│   │   │   ├── LogSigmoid.cu
│   │   │   ├── Sqrt.cu
│   │   │   ├── SpatialMaxPooling.cu
│   │   │   ├── SoftShrink.cu
│   │   │   ├── VolumetricMaxPooling.cu
│   │   │   ├── SoftPlus.cu
│   │   │   ├── L1Cost.cu
│   │   │   ├── ELU.cu
│   │   │   ├── LeakyReLU.cu
│   │   │   ├── HardTanh.cu
│   │   │   ├── Threshold.cu
│   │   │   ├── SpatialFullConvolution.cu
│   │   │   ├── VolumetricFullConvolution.cu
│   │   │   ├── AbsCriterion.cu
│   │   │   ├── SoftMarginCriterion.cu
│   │   │   ├── MarginCriterion.cu
│   │   │   ├── MSECriterion.cu
│   │   │   ├── DistKLDivCriterion.cu
│   │   │   ├── GatedLinearUnit.cu
│   │   │   ├── SmoothL1Criterion.cu
│   │   │   ├── RReLU.cu
│   │   │   ├── SpatialMaxUnpooling.cu
│   │   │   ├── BCECriterion.cu
│   │   │   ├── SpatialCrossMapLRN.cu
│   │   │   ├── BatchNormalization.cu
│   │   │   ├── SoftMax.cu
│   │   │   ├── SpatialReplicationPadding.cu
│   │   │   ├── SpatialReflectionPadding.cu
│   │   │   ├── MultiLabelMarginCriterion.cu
│   │   │   └── SpatialUpSamplingBilinear.cu
│   │   ├── L1Cost.cu
│   │   ├── AbsCriterion.cu
│   │   ├── Sigmoid.cu
│   │   ├── DistKLDivCriterion.cu
│   │   ├── SoftShrink.cu
│   │   ├── Tanh.cu
│   │   ├── GatedLinearUnit.cu
│   │   ├── SoftMarginCriterion.cu
│   │   ├── SoftPlus.cu
│   │   ├── MSECriterion.cu
│   │   ├── MarginCriterion.cu
│   │   ├── SpatialMaxUnpooling.cu
│   │   ├── FusedRNNKernel.cu
│   │   ├── SmoothL1Criterion.cu
│   │   ├── ELU.cu
│   │   ├── HardTanh.cu
│   │   ├── LeakyReLU.cu
│   │   ├── Threshold.cu
│   │   ├── SpatialUpSamplingNearest.cu
│   │   ├── SpatialReplicationPadding.cu
│   │   ├── BCECriterion.cu
│   │   ├── VolumetricMaxUnpooling.cu
│   │   ├── PReLU.cu
│   │   ├── VolumetricUpSamplingNearest.cu
│   │   ├── SparseLinear.cu
│   │   ├── LogSigmoid.cu
│   │   ├── MultiMarginCriterion.cu
│   │   ├── SpatialReflectionPadding.cu
│   │   ├── SpatialClassNLLCriterion.cu
│   │   ├── RReLU.cu
│   │   ├── VolumetricReplicationPadding.cu
│   │   ├── SoftMax.cu
│   │   ├── SpatialDilatedMaxPooling.cu
│   │   ├── common.h
│   │   ├── TemporalMaxPooling.cu
│   │   ├── CMakeLists.txt
│   │   ├── row2col.h
│   │   ├── SpatialAveragePooling.cu
│   │   ├── SpatialFractionalMaxPooling.cu
│   │   ├── SpatialUpSamplingBilinear.cu
│   │   └── SpatialCrossMapLRN.cu
│   └── CMakeLists.txt
├── init.lua
├── rocks
│   ├── version.sh
│   ├── cunn-scm-1.rockspec
│   └── cunn-1.0-0.rockspec
├── CMakeLists.txt
├── LICENSE
├── doc
│   └── cunnmodules.md
├── README.md
└── THCUNN.lua

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
build
THCUNN_h.lua
THCUNN_generic_h.lua

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialMaxPooling.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"

#include "generic/SpatialMaxPooling.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
SET(THCUNN_INSTALL_LIB_SUBDIR "${Torch_INSTALL_LUA_CPATH_SUBDIR}")
SET(THCUNN_INSTALL_INCLUDE_SUBDIR "${Torch_INSTALL_INCLUDE_SUBDIR}")
ADD_SUBDIRECTORY(THCUNN)
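Note on the pattern used by every .cu translation unit below: a file defines its functors, then includes its generic/ twin followed by THCGenerateFloatTypes.h, which re-includes the generic body once per floating type (float, double, and half when CUDA_HALF_TENSOR is set), redefining real/accreal/CReal before each pass so that THNN_(NAME) expands to a distinct per-type symbol. A minimal self-contained sketch of that token-pasting mechanism (illustration only; the macro names mirror TH's conventions, this is not the actual header):

#include <cstdio>

#define TH_CONCAT_3_EXPAND(x, y, z) x ## y ## z
#define TH_CONCAT_3(x, y, z) TH_CONCAT_3_EXPAND(x, y, z)
#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME)

// First "pass": float types
#define real float
#define CReal Cuda
void THNN_(Example)(void) { std::printf("float pass, sizeof(real)=%zu\n", sizeof(real)); }
#undef real
#undef CReal

// Second "pass": double types
#define real double
#define CReal CudaDouble
void THNN_(Example)(void) { std::printf("double pass, sizeof(real)=%zu\n", sizeof(real)); }
#undef real
#undef CReal

int main() {
  THNN_CudaExample();        // symbol generated by the float pass
  THNN_CudaDoubleExample();  // symbol generated by the double pass
}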
--------------------------------------------------------------------------------
/lib/THCUNN/TemporalConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/TemporalConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialFullConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "im2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/SpatialFullConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricFullConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/VolumetricFullConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialFullDilatedConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "im2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/SpatialFullDilatedConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialConvolutionMM.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "im2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/SpatialConvolutionMM.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricDilatedConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "vol2col.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/VolumetricDilatedConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialConvolutionLocal.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "im2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/SpatialConvolutionLocal.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialDilatedConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "im2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/SpatialDilatedConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/TemporalRowConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "row2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/TemporalRowConvolution.cu"

#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricFullDilatedConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "vol2col.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/VolumetricFullDilatedConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialDepthWiseConvolution.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "im2col.h"

#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include "generic/SpatialDepthWiseConvolution.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricMaxPooling.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCDeviceTensor.cuh"
#include "THCDeviceTensorUtils.cuh"
#include "THCDeviceUtils.cuh"

#include <cfloat>

#include "generic/VolumetricMaxPooling.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/THCUNN.h:
--------------------------------------------------------------------------------
#include <THC/THC.h>

#define THCIndexTensor THCudaLongTensor
#define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME
typedef long THCIndex_t;

#define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME)

#include "generic/THCUNN.h"
#include <THCGenerateFloatTypes.h>

--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
cunn = nil

require "cutorch"
require "nn"
require "cunn.THCUNN"

require('cunn.test')
require('cunn.DataParallelTable')

nn.Module._flattenTensorBuffer['torch.CudaTensor'] = torch.FloatTensor.new
nn.Module._flattenTensorBuffer['torch.CudaDoubleTensor'] = torch.DoubleTensor.new
-- FIXME: change this to torch.HalfTensor when available
nn.Module._flattenTensorBuffer['torch.CudaHalfTensor'] = torch.FloatTensor.new

--------------------------------------------------------------------------------
/lib/THCUNN/Abs.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct absupdateOutput_functor
{
  __device__ void operator()(T* output, const T* input) const
  {
    *output = abs(*input);
  }
};

template <typename T>
struct absupdateGradInput_functor
{
  __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
  {
    *gradInput = *input < 0 ? - *gradOutput : *gradOutput;
  }
};

#include "generic/Abs.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/Square.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct squareupdateOutput_functor
{
  __device__ void operator()(T* output, const T* input) const
  {
    *output = (*input) * (*input);
  }
};

template <typename T>
struct squareupdateGradInput_functor
{
  __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
  {
    *gradInput = ScalarConvert<double, T>::to(2.0) * (*gradOutput) * (*input);
  }
};

#include "generic/Square.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/rocks/version.sh:
--------------------------------------------------------------------------------
#!/bin/bash
cd "$(dirname "$0")"
fname=$(ls|grep rockspec|grep -v scm | sort -r -V|head -n1)
echo "Last known version: $fname"
luarocks new_version $fname

new_fname=$(ls|grep rockspec|grep -v scm | sort -r -V|head -n1)
new_version=$(echo $new_fname | cut -f2,3,4,5 -d'-'|sed -e 's/.rockspec//g')
echo "new rockspec: $new_fname"
echo "new version: $new_version"
git add $new_fname
git commit -m "Cutting version $new_version"
git branch $new_version

git push origin master:master
git push origin $new_version:$new_version

git clone https://github.com/torch/rocks
cp $new_fname rocks/
cd rocks
th make-manifest.lua
git add $new_fname
git commit -am "adding rockspec $new_fname"
git push
cd ..
rm -rf rocks
cd ..
--------------------------------------------------------------------------------
/lib/THCUNN/SharedMem.cuh:
--------------------------------------------------------------------------------
// Based on the simpleTemplates CUDA example

#ifndef THCUNN_SHAREDMEM_H
#define THCUNN_SHAREDMEM_H

template <typename T>
struct SharedMem {
  __device__ T *getPointer()
  {
    extern __device__ void error(void);
    error();
    return NULL;
  }
};

#ifdef CUDA_HALF_TENSOR
template <>
struct SharedMem<half>
{
  __device__ half *getPointer() {
    extern __shared__ half s_half[];
    return s_half;
  }
};
#endif

template <>
struct SharedMem<float>
{
  __device__ float *getPointer() {
    extern __shared__ float s_float[];
    return s_float;
  }
};

template <>
struct SharedMem<double>
{
  __device__ double *getPointer() {
    extern __shared__ double s_double[];
    return s_double;
  }
};

#endif
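A hypothetical kernel (not part of this repo) showing what SharedMem buys: a templated kernel cannot simply declare extern __shared__ T buf[], because two instantiations would redeclare the same symbol with different types, so getPointer() routes each T to a uniquely named extern array. Sketch for float/double, assuming a power-of-two block size:

template <typename T>
__global__ void blockSum(const T *in, T *out, int n) {
  SharedMem<T> smem;
  T *buf = smem.getPointer();                     // dynamic shared memory
  int tid = threadIdx.x;
  buf[tid] = (tid < n) ? in[tid] : T(0);
  __syncthreads();
  for (int s = blockDim.x / 2; s > 0; s >>= 1) {  // tree reduction in shared memory
    if (tid < s) buf[tid] += buf[tid + s];
    __syncthreads();
  }
  if (tid == 0) out[blockIdx.x] = buf[0];
}
// Launched as e.g. blockSum<float><<<1, 256, 256 * sizeof(float)>>>(d_in, d_out, n);
// the third launch parameter supplies the buffer that getPointer() returns.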
--------------------------------------------------------------------------------
/lib/THCUNN/Sqrt.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct sqrtupdateOutput_functor
{
  const T bias;

  sqrtupdateOutput_functor(T bias_)
    : bias(bias_)
  {}

  __device__ void operator()(T *output, const T *input) const
  {
    *output = sqrt(*input + bias);
  }
};

template <typename T>
struct sqrtupdateGradInput_functor
{
  sqrtupdateGradInput_functor() {}

  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
  {
    *gradInput = (THCNumerics<T>::eq(*output, ScalarConvert<float, T>::to(0.0f))) ? ScalarConvert<float, T>::to(0.0f) : ((ScalarConvert<float, T>::to(0.5f) * *gradOutput) / *output);
  }
};

#include "generic/Sqrt.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/generic/Sigmoid.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/Sigmoid.cu"
#else

#include "../common.h"

void THNN_(Sigmoid_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output)
{
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(sigmoid)(state, output, input);
}

void THNN_(Sigmoid_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCTensor *output)
{
  THCUNN_check_nElement(state, output, gradOutput);
  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, output);
  THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoid_updateGradInput_functor<real>());
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/L1Cost.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>

template <typename Dtype, typename Acctype>
struct l1cost_functor
{
  __host__ __device__ Acctype operator()(Dtype x) const
  {
    return THCNumerics<Acctype>::abs(ScalarConvert<Dtype, Acctype>::to(x));
  }
};

template <typename Dtype>
struct l1cost_updateGradInput_functor
{
  __host__ __device__ Dtype operator()(Dtype x) const
  {
    if (x > 0)
      return ScalarConvert<int, Dtype>::to(1);
    else if (x < 0)
      return ScalarConvert<int, Dtype>::to(-1);
    else
      return ScalarConvert<int, Dtype>::to(0);
  }
};

#include "generic/L1Cost.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/generic/Abs.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/Abs.cu"
#else

#include "../common.h"

void THNN_(Abs_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output)
{
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THC_pointwiseApply2(state, output, input, absupdateOutput_functor<real>());
}

void THNN_(Abs_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput)
{
  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, input);
  THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor<real>());
}

#endif
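The generic files lean entirely on the THC_pointwiseApply{1,2,3} helpers. As a rough mental model (illustration only, assuming contiguous tensors; the real THCApply.cuh also handles arbitrary strides), THC_pointwiseApply2 amounts to a grid-stride map of the functor over element pairs:

template <typename T, typename Op>
__global__ void pointwiseApply2Sketch(T *a, const T *b, ptrdiff_t n, Op op) {
  for (ptrdiff_t i = blockIdx.x * (ptrdiff_t)blockDim.x + threadIdx.x;
       i < n;
       i += (ptrdiff_t)blockDim.x * gridDim.x) {
    op(a + i, b + i);  // the functor writes through the first pointer
  }
}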
--------------------------------------------------------------------------------
/lib/THCUNN/generic/Tanh.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/Tanh.cu"
#else

#include "../common.h"

void THNN_(Tanh_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output)
{
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THCTensor_(tanh)(state, output, input);
}

void THNN_(Tanh_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCTensor *output)
{
  THCUNN_check_shape(state, output, gradOutput);
  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, output);
  THC_pointwiseApply3(state, gradInput, output, gradOutput, tanh_updateGradInput_functor<real>());
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/generic/Square.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/Square.cu"
#else

#include "../common.h"

void THNN_(Square_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output)
{
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THC_pointwiseApply2(state, output, input, squareupdateOutput_functor<real>());
}

void THNN_(Square_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput)
{
  THCUNN_check_shape(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, input);
  THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor<real>());
}

#endif

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.8)

SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")

FIND_PACKAGE(Torch REQUIRED)
FIND_PACKAGE(CUDA 6.5 REQUIRED)

INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC")
LINK_DIRECTORIES("${Torch_INSTALL_LIB}")

FILE(STRINGS lib/THCUNN/THCUNN.h THCUNN_headers NEWLINE_CONSUME)
FILE(WRITE THCUNN_h.lua "return [[")
FILE(APPEND THCUNN_h.lua ${THCUNN_headers})
FILE(APPEND THCUNN_h.lua "]]")

FILE(STRINGS lib/THCUNN/generic/THCUNN.h THCUNN_generic_headers NEWLINE_CONSUME)
FILE(WRITE THCUNN_generic_h.lua "return [[")
FILE(APPEND THCUNN_generic_h.lua ${THCUNN_generic_headers})
FILE(APPEND THCUNN_generic_h.lua "]]")

FILE(GLOB luasrc *.lua)

ADD_SUBDIRECTORY(lib)

INSTALL(
  FILES
  ${luasrc}
  DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/cunn")

--------------------------------------------------------------------------------
/lib/THCUNN/AbsCriterion.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>

template <typename Dtype, typename Acctype>
struct abs_functor
{
  __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
  {
    Dtype z = x-y;
    return ScalarConvert<Dtype, Acctype>::to(z >= 0 ? z : -z);
  }
};

template <typename Dtype>
struct abs_updateGradInput_functor
{
  const Dtype norm;

  abs_updateGradInput_functor(Dtype norm_)
    : norm(norm_)
  {}

  __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
  {
    return (x - y) >= 0 ? norm : -norm;
  }
};

#include "generic/AbsCriterion.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/Sigmoid.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct sigmoid_updateGradInput_functor {
  __device__ __forceinline__ void operator()(T* gradInput, const T *output, const T *gradOutput) const {
    *gradInput = *gradOutput * (1.f - *output) * (*output);
  }
};

#ifdef CUDA_HALF_TENSOR
template <>
struct sigmoid_updateGradInput_functor<half> {
  __device__ __forceinline__ void operator()(half* gradInput, const half *output, const half *gradOutput) const {
#ifdef CUDA_HALF_INSTRUCTIONS
    const half one = __float2half(1.f);
    *gradInput = __hmul(*gradOutput, __hmul(__hadd(one, __hneg(*output)), *output));
#else
    const float out = __half2float(*output);
    const float go = __half2float(*gradOutput);
    *gradInput = __float2half(go * (1.f - out) * out);
#endif
  }
};
#endif

#include "generic/Sigmoid.cu"
#include "THCGenerateFloatTypes.h"
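The backward functor above relies on the identity d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), which is why Sigmoid_updateGradInput needs only the saved output and never re-reads the original input. A quick host-side check of the identity (illustrative):

#include <cmath>
#include <cstdio>
int main() {
  double x = 0.3, eps = 1e-6;
  double s = 1.0 / (1.0 + std::exp(-x));
  // finite-difference estimate of the derivative at x
  double numeric = (1.0 / (1.0 + std::exp(-(x + eps))) - s) / eps;
  std::printf("analytic %.6f vs numeric %.6f\n", s * (1.0 - s), numeric);
}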
--------------------------------------------------------------------------------
/lib/THCUNN/generic/LogSigmoid.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/LogSigmoid.cu"
#else

#include "../common.h"

void THNN_(LogSigmoid_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           THCTensor *buffer)
{
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor<real>());
}

void THNN_(LogSigmoid_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCTensor *buffer)
{
  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, input);
  THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>());
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/generic/Sqrt.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/Sqrt.cu"
#else

#include "../common.h"

void THNN_(Sqrt_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           accreal eps_)
{
  real eps = ScalarConvert<accreal, real>::to(eps_);
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor<real>(eps));
}

void THNN_(Sqrt_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCTensor *output)
{
  THCUNN_check_shape(state, output, gradOutput);
  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, output);
  THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor<real>());
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/DistKLDivCriterion.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>

template <typename Dtype, typename Acctype>
struct kl_functor
{
  __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
  {
    Acctype yAcc = ScalarConvert<Dtype, Acctype>::to(y);
    return y > 0 ? yAcc * (THCNumerics<Acctype>::log(yAcc) - x) : Acctype(0);
  }
};

template <typename Dtype>
struct kl_updateGradInput_functor
{
  const Dtype norm;

  kl_updateGradInput_functor(Dtype norm_)
    : norm(norm_)
  {}

  __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
  {
    return y > 0 ? norm * (-y) : ScalarConvert<int, Dtype>::to(0);
  }
};

#include "generic/DistKLDivCriterion.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/rocks/cunn-scm-1.rockspec:
--------------------------------------------------------------------------------
package = "cunn"
version = "scm-1"

source = {
   url = "git://github.com/torch/cunn.git",
}

description = {
   summary = "Torch CUDA Neural Network Implementation",
   detailed = [[
   ]],
   homepage = "https://github.com/torch/cunn",
   license = "BSD"
}

dependencies = {
   "torch >= 7.0",
   "nn >= 1.0",
   "cutorch >= 1.0"
}

build = {
   type = "command",
   build_command = [[
cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install
]],
   platforms = {
      windows = {
         build_command = [[
cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
]]
      }
   },
   install_command = "cd build"
}
--------------------------------------------------------------------------------
/lib/THCUNN/SoftShrink.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct SoftShrinkUpdateOutput
{
  const T lambda_;

  SoftShrinkUpdateOutput(T lambda)
    : lambda_(lambda)
  {}

  __device__ __forceinline__ void operator()(T *out, T *in)
  {
    T x = *in;
    if (x > lambda_) *out = x - lambda_;
    else if (x < -lambda_) *out = x + lambda_;
    else *out = ScalarConvert<int, T>::to(0);
  }
};

template <typename T>
struct SoftShrinkUpdateGradInput
{
  const T lambda_;

  SoftShrinkUpdateGradInput(T lambda)
    : lambda_(lambda)
  {}

  __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) const
  {
    T x = *input;
    if (x > lambda_ || x < -lambda_)
      *gradInput = *gradOutput;
    else
      *gradInput = ScalarConvert<int, T>::to(0);
  }
};

#include "generic/SoftShrink.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/Tanh.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct tanh_updateGradInput_functor
{
  __device__ __forceinline__ void operator()(T *gradInput,
      const T *output, const T *gradOutput) const {
    *gradInput = *gradOutput * (1.f - *output * *output);
  }
};

#ifdef CUDA_HALF_TENSOR
template <>
struct tanh_updateGradInput_functor<half>
{
  __device__ __forceinline__ void operator()(half *gradInput,
      const half *output, const half *gradOutput) const {
#ifdef CUDA_HALF_INSTRUCTIONS
    const half one = __float2half(1.f);
    const half out_square = __hmul(*output, *output);
    *gradInput = __hmul(*gradOutput, __hadd(one, __hneg(out_square)));
#else
    const float out = __half2float(*output);
    const float go = __half2float(*gradOutput);
    *gradInput = __float2half(go * (1.f - out * out));
#endif
  }
};
#endif

#include "generic/Tanh.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/rocks/cunn-1.0-0.rockspec:
--------------------------------------------------------------------------------
package = "cunn"
version = "1.0-0"

source = {
   url = "git://github.com/torch/cunn.git",
   tag = "1.0-0"
}

description = {
   summary = "Torch CUDA Neural Network Implementation",
   detailed = [[
   ]],
   homepage = "https://github.com/torch/cunn",
   license = "BSD"
}

dependencies = {
   "torch >= 7.0",
   "nn >= 1.0",
   "cutorch == 1.0-0"
}

build = {
   type = "command",
   build_command = [[
cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install
]],
   platforms = {
      windows = {
         build_command = [[
cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
]]
      }
   },
   install_command = "cd build"
}

--------------------------------------------------------------------------------
/lib/THCUNN/GatedLinearUnit.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>
#include "common.h"

template <typename Dtype, typename Acctype>
struct gatedLinearCSigMul_functor
{
  __device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const
  {
    const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert<Dtype, Acctype>::to(-*sigTensor)));
    const Dtype mulNum = *mulTensor;
    *target = ScalarConvert<Acctype, Dtype>::to(sigNum * mulNum);
  }
};

template <typename Dtype, typename Acctype>
struct gatedLinearDerivativeSecondHalf_functor
{
  __device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const
  {
    const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert<Dtype, Acctype>::to(-*sigTensor)));
    const Dtype mulNum = *mulTensor;
    *target *= ScalarConvert<Acctype, Dtype>::to((Acctype(1) - sigNum) * sigNum * mulNum);
  }
};

#include "generic/GatedLinearUnit.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SoftMarginCriterion.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>

template <typename Dtype, typename Acctype>
struct softmargin_functor
{
  __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
  {
    return log(1 + exp(ScalarConvert<Dtype, Acctype>::to(-x)*y));
  }
};

template <typename Dtype, typename Acctype>
struct softmargin_updateGradInput_functor
{
  const Acctype norm;

  softmargin_updateGradInput_functor(Acctype norm_) :
    norm(norm_) {}

  __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
  {
    Acctype temp = exp(ScalarConvert<Dtype, Acctype>::to(-x)*y);
    return ScalarConvert<Acctype, Dtype>::to(-y*temp*norm/(ScalarConvert<int, Acctype>::to(1) + temp));
  }
};

#include "generic/SoftMarginCriterion.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialMaxPooling.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu"
#else

#include "../common.h"

void THNN_(SpatialMaxPooling_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           THCIndexTensor *indices,
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
           bool ceil_mode)
{
  THNN_(SpatialDilatedMaxPooling_updateOutput)(
    state, input, output, indices,
    kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
}

void THNN_(SpatialMaxPooling_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCIndexTensor *indices,
           int kW, int kH,
           int dW, int dH,
           int padW, int padH,
           bool ceil_mode)
{
  THNN_(SpatialDilatedMaxPooling_updateGradInput)(
    state, input, gradOutput, gradInput, indices,
    kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/SoftPlus.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct softPlusupdateOutput_functor
{
  const T threshold;
  const T beta;

  softPlusupdateOutput_functor(T threshold_, T beta_)
    : threshold(threshold_)
    , beta(beta_)
  {}

  __device__ void operator()(T *output, const T *input) const {
    T betain = beta * (*input);
    *output = ((betain) > threshold) ? *input : (1/beta) * log1p(exp(betain));
  }
};

template <typename T>
struct softPlusupdateGradInput_functor
{
  const T threshold;
  const T beta;

  softPlusupdateGradInput_functor(T threshold_, T beta_)
    : threshold(threshold_)
    , beta(beta_)
  {}

  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
  {
    T betaout = beta * (*output);
    T exp_bo = exp(betaout);
    *gradInput = ((betaout) > threshold) ? *gradOutput : *gradOutput * (exp_bo - 1) / exp_bo;
  }
};

#include "generic/SoftPlus.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/MSECriterion.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCThrustAllocator.cuh"

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif

template <typename Dtype, typename Acctype>
struct mse_functor
{
  mse_functor() {}

  __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
  {
    Acctype z = ScalarConvert<Dtype, Acctype>::to(x)-y;
    return z*z;
  }
};

template <typename Dtype, typename Acctype>
struct mse_updateGradInput_functor
{
  const Acctype norm;

  mse_updateGradInput_functor(Acctype norm_)
    : norm(norm_)
  {}

  __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
  {
    return ScalarConvert<Acctype, Dtype>::to(norm * (ScalarConvert<Dtype, Acctype>::to(x) - y));
  }
};

#include "generic/MSECriterion.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftShrink.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SoftShrink.cu"
#else

#include "../common.h"

void THNN_(SoftShrink_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           accreal lambda_)
{
  real lambda = ScalarConvert<accreal, real>::to(lambda_);
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput<real>(lambda));
  THCudaCheck(cudaGetLastError());
}

void
THNN_(SoftShrink_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           accreal lambda_)
{
  real lambda = ScalarConvert<accreal, real>::to(lambda_);
  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, input);
  THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput<real>(lambda));
  THCudaCheck(cudaGetLastError());
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/MarginCriterion.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>

template <typename Dtype, typename Acctype>
struct margin_functor
{
  margin_functor(Acctype margin)
    : margin(margin)
  {}

  __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
  {
    Acctype z = margin - ScalarConvert<Dtype, Acctype>::to(x) * y;
    return z >= 0 ? z : 0;
  }

  const Acctype margin;
};

template <typename Dtype, typename Acctype>
struct margin_updateGradInput_functor
{
  const Acctype margin, norm;

  margin_updateGradInput_functor(Acctype margin_, Acctype norm_)
    : margin(margin_)
    , norm(norm_)
  {}

  __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
  {
    return ScalarConvert<Acctype, Dtype>::to((ScalarConvert<Dtype, Acctype>::to(x) * y) < margin ? -norm * y : 0);
  }
};

#include "generic/MarginCriterion.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/generic/VolumetricMaxPooling.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu"
#else

void THNN_(VolumetricMaxPooling_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           THCIndexTensor *indices,
           int kT, int kW, int kH,
           int dT, int dW, int dH,
           int padT, int padW, int padH,
           bool ceilMode)
{
  THNN_(VolumetricDilatedMaxPooling_updateOutput)(
    state, input, output, indices,
    kT, kW, kH, dT, dW, dH, padT, padW, padH,
    1, 1, 1, ceilMode);
}

void THNN_(VolumetricMaxPooling_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCIndexTensor *indices,
           int kT, int kW, int kH,
           int dT, int dW, int dH,
           int padT, int padW, int padH,
           bool ceilMode)
{
  THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
    state, input, gradOutput, gradInput, indices,
    kT, kW, kH, dT, dW, dH, padT, padW, padH,
    1, 1, 1, ceilMode);
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/SpatialMaxUnpooling.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"

template <typename Dtype>
__global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const long* bottom_mask,
    const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) {
  CUDA_KERNEL_LOOP(index, nthreads) { // index here indexes the input pixels
    int c = (index / iwidth / iheight) % channels;
    int n = index / iwidth / iheight / channels;
    top_data += (n*channels + c)*oheight*owidth;
    int maxind = bottom_mask[index] - TH_INDEX_BASE;

    top_data[maxind] = bottom_data[index];
  }
}

template <typename Dtype>
__global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const long* bottom_mask,
    const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    int c = (index / iwidth / iheight) % channels;
    int n = index / iwidth / iheight / channels;
    top_diff += (n*channels + c)*oheight*owidth;
    int maxind = bottom_mask[index] - TH_INDEX_BASE;

    bottom_diff[index] = top_diff[maxind];
  }
}

#include "generic/SpatialMaxUnpooling.cu"
#include "THCGenerateFloatTypes.h"
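CUDA_KERNEL_LOOP in the two kernels above comes from common.h, which is not expanded in this dump; it is the usual Caffe-style grid-stride loop, equivalent to:

#define CUDA_KERNEL_LOOP(i, n) \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)

Because the loop strides by blockDim.x * gridDim.x, the launch grid can be capped at a fixed size regardless of nthreads and each thread simply processes several elements.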
--------------------------------------------------------------------------------
/lib/THCUNN/FusedRNNKernel.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCNumerics.cuh"
#include <cstdio>

template <typename T>
struct TensorSigmoidOp {
  __device__ __forceinline__ void operator()(T* out, T* in) const {
    T one = (T) 1.0;
    *out = one / (one + THCNumerics<T>::exp(- *in));
  }

  __device__ __forceinline__ void operator()(T* v) const {
    T one = (T) 1.0;
    *v = one / (one + THCNumerics<T>::exp(- *v));
  }
};

#ifdef CUDA_HALF_TENSOR
template <>
struct TensorSigmoidOp<half> {
  __device__ __forceinline__ void operator()(half* out, half* in) const {
#ifdef CUDA_HALF_INSTRUCTIONS
    half one = ScalarConvert<int, half>::to(1);
    *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
#else
    float fin = __half2float(*in);
    *out = __float2half(1.0f / (1.0f + expf(- fin)));
#endif
  }

  __device__ __forceinline__ void operator()(half* v) const {
#ifdef CUDA_HALF_INSTRUCTIONS
    half one = ScalarConvert<int, half>::to(1);
    *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
#else
    float fv = __half2float(*v);
    *v = __float2half(1.0f / (1.0f + expf(- fv)));
#endif
  }
};
#endif

#include "generic/FusedRNNKernel.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/SmoothL1Criterion.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "common.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include "THCThrustAllocator.cuh"

#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/inner_product.h>
#if CUDA_VERSION >= 7000
#include <thrust/system/cuda/execution_policy.h>
#endif

template <typename Dtype, typename Acctype>
struct smoothl1_functor
{
  smoothl1_functor() {}

  __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
  {
    Acctype z = ScalarConvert<Dtype, Acctype>::to(THCNumerics<Dtype>::abs(x-y));
    return z < Acctype(1) ? 0.5f*z*z : z - 0.5f;
  }
};

template <typename Dtype>
struct smoothl1_updateGradInput_functor
{
  const Dtype norm;

  smoothl1_updateGradInput_functor(Dtype norm_)
    : norm(norm_)
  {}

  __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
  {
    Dtype z = x - y;
    if (z < ScalarConvert<int, Dtype>::to(-1))
      return -norm;
    else if (z > ScalarConvert<int, Dtype>::to(1))
      return norm;
    else
      return norm * z;
  }
};

#include "generic/SmoothL1Criterion.cu"
#include "THCGenerateFloatTypes.h"
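The two branches of smoothl1_functor meet smoothly at the switch point: at z = 1, 0.5*z*z and z - 0.5 both equal 0.5, and their derivatives (z and 1) also agree, so this Huber-style loss is continuously differentiable. Trivial spot check (illustrative):

#include <cstdio>
int main() {
  double z = 1.0;  // switch point between the quadratic and linear branches
  std::printf("%f %f\n", 0.5 * z * z, z - 0.5);  // both print 0.500000
}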
--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftPlus.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/SoftPlus.cu"
#else

#include "../common.h"

void THNN_(SoftPlus_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           accreal beta_,
           accreal threshold_)
{
  real beta = ScalarConvert<accreal, real>::to(beta_);
  real threshold = ScalarConvert<accreal, real>::to(threshold_);
  THCUNN_assertSameGPU(state, 2, input, output);
  THCTensor_(resizeAs)(state, output, input);
  THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor<real>(threshold, beta));
}

void THNN_(SoftPlus_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCTensor *output,
           accreal beta_,
           accreal threshold_)
{
  real beta = ScalarConvert<accreal, real>::to(beta_);
  real threshold = ScalarConvert<accreal, real>::to(threshold_);
  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput);
  THCTensor_(resizeAs)(state, gradInput, output);
  THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor<real>(threshold, beta));
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/ELU.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct ELUupdateOutput_functor
{
  const T alpha_;

  ELUupdateOutput_functor(T alpha)
    : alpha_(alpha)
  {}

  __device__ void operator()(T *output, const T *input) const
  {
    *output = *input <= 0 ? (exp(*input) - 1) * alpha_ : *input;
  }
};

// in-place variant
template <typename T>
struct ELUupdateOutputIP_functor
{
  const T alpha_;

  ELUupdateOutputIP_functor(T alpha)
    : alpha_(alpha)
  {}

  __device__ void operator()(T *x) const
  {
    *x = *x <= 0 ? (exp(*x) - 1) * alpha_ : *x;
  }
};

template <typename T>
struct ELUupdateGradInput_functor
{
  const T alpha_;

  ELUupdateGradInput_functor(T alpha)
    : alpha_(alpha)
  {}

  __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
  {
    *gradInput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
  }
};

template <typename T>
struct ELUupdateGradInputIP_functor
{
  const T alpha_;

  ELUupdateGradInputIP_functor(T alpha)
    : alpha_(alpha)
  {}

  __device__ void operator()(T *gradOutput, const T *output) const
  {
    *gradOutput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
  }
};

#include "generic/ELU.cu"
#include "THCGenerateFloatTypes.h"
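The ELU backward functors avoid recomputing exp: for x <= 0 the forward pass stored y = alpha*(exp(x) - 1), so dy/dx = alpha*exp(x) = y + alpha, which is exactly the (*output + alpha_) factor above. Illustrative check of the identity:

#include <cmath>
#include <cstdio>
int main() {
  double alpha = 1.0, x = -0.7;
  double y = alpha * (std::exp(x) - 1.0);       // saved forward output
  std::printf("alpha*exp(x) = %.6f, y + alpha = %.6f\n",
              alpha * std::exp(x), y + alpha);  // the two values agree
}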
--------------------------------------------------------------------------------
/lib/THCUNN/generic/L1Cost.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/L1Cost.cu"
#else

void THNN_(L1Cost_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output)
{
  THCUNN_check_dim_size(state, output, 1, 0, 1);
  THCUNN_assertSameGPU(state, 1, input);
  accreal sum;
  ptrdiff_t size = THCTensor_(nElement)(state, input);
  input = THCTensor_(newContiguous)(state, input);
  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
  sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>());

  THCTensor_(free)(state, input);

  THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
}

void THNN_(L1Cost_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput)
{
  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 2, input, gradInput);
  ptrdiff_t size = THCTensor_(nElement)(state, input);

  input = THCTensor_(newContiguous)(state, input);
  THCTensor_(resizeAs)(state, gradInput, input);

  thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
  thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));

  thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>());

  THCTensor_(free)(state, input);
}

#endif
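A standalone analogue (illustration only, with made-up data) of the reduction in L1Cost_updateOutput above: thrust::transform_reduce folds |x_i| into a single scalar on the GPU without ever materializing the transformed sequence.

#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <cstdio>

struct abs_op {
  __host__ __device__ double operator()(double x) const { return x < 0 ? -x : x; }
};

int main() {
  double h[3] = {-1.0, 2.0, -3.0};
  thrust::device_vector<double> v(h, h + 3);
  // unary op |x| applied on the fly, results summed with plus<double>
  double sum = thrust::transform_reduce(v.begin(), v.end(), abs_op(),
                                        0.0, thrust::plus<double>());
  std::printf("L1 cost: %f\n", sum);  // prints 6.000000
}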
--------------------------------------------------------------------------------
/lib/THCUNN/HardTanh.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct hardtanhupdateOutput_functor
{
  const T max_val_;
  const T min_val_;

  hardtanhupdateOutput_functor(T min_val, T max_val)
    : min_val_(min_val)
    , max_val_(max_val)
  {}

  __device__ void operator()(T *output, const T *input) const
  {
    if (*input < min_val_)
      *output = min_val_;
    else if (*input <= max_val_)
      *output = *input;
    else
      *output = max_val_;
  }

  __device__ void operator()(T *input) const
  {
    if (*input < min_val_)
      *input = min_val_;
    else if (*input > max_val_)
      *input = max_val_;
  }
};

template <typename T>
struct hardtanhupdateGradInput_functor
{
  const T max_val_;
  const T min_val_;

  hardtanhupdateGradInput_functor(T min_val, T max_val)
    : min_val_(min_val)
    , max_val_(max_val)
  {}

  __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const
  {
    if (*input <= min_val_ || *input >= max_val_)
      *gradInput = ScalarConvert<int, T>::to(0);
    else
      *gradInput = *gradOutput;
  }

  __device__ void operator()(T *gradInput, const T *input) const
  {
    if (*input <= min_val_ || *input >= max_val_)
      *gradInput = ScalarConvert<int, T>::to(0);
  }
};

#include "generic/HardTanh.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/LeakyReLU.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct LeakyReLUUpdateOutput
{
  const T negval_;

  LeakyReLUUpdateOutput(T negval)
    : negval_(negval)
  {}

  __device__ __forceinline__ void operator()(T *out, T *in)
  {
    T x = *in;
    *out = (x > 0) ? x : x * negval_;
  }
};

// in-place variant
template <typename T>
struct LeakyReLUUpdateOutputIP
{
  const T negval_;

  LeakyReLUUpdateOutputIP(T negval)
    : negval_(negval)
  {}

  __device__ __forceinline__ void operator()(T *x)
  {
    *x = (*x > 0) ? *x : negval_ * (*x);
  }
};

template <typename T>
struct LeakyReLUUpdateGradInput
{
  const T negval_;

  LeakyReLUUpdateGradInput(T negval)
    : negval_(negval)
  {}

  __device__ __forceinline__ void operator()(
      T* gradInput,
      T* input,
      T* gradOutput) const
  {
    *gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
  }
};

template <typename T>
struct LeakyReLUUpdateGradInputIP
{
  const T negval_;

  LeakyReLUUpdateGradInputIP(T negval)
    : negval_(negval)
  {}

  __device__ __forceinline__ void operator()(
      T* gradOutput,
      T* input) const
  {
    *gradOutput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
  }
};

#include "generic/LeakyReLU.cu"
#include "THCGenerateFloatTypes.h"

--------------------------------------------------------------------------------
/lib/THCUNN/generic/ELU.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/ELU.cu"
#else

#include "../common.h"


void THNN_(ELU_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           accreal alpha_,
           bool inplace)
{
  real alpha = ScalarConvert<accreal, real>::to(alpha_);
  THCUNN_assertSameGPU(state, 2, input, output);

  if (inplace)
  {
    THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor<real>(alpha));
    THCTensor_(set)(state, output, input);
  }
  else
  {
    THCTensor_(resizeAs)(state, output, input);
    THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor<real>(alpha));
  }
}


void THNN_(ELU_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           THCTensor *output,
           accreal alpha_,
           bool inplace)
{
  real alpha = ScalarConvert<accreal, real>::to(alpha_);
  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);

  if (inplace)
  {
    THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor<real>(alpha));
    THCTensor_(set)(state, gradInput, gradOutput);
  }
  else
  {
    THCTensor_(resizeAs)(state, gradInput, output);
    THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor<real>(alpha));
  }
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/generic/LeakyReLU.cu:
--------------------------------------------------------------------------------
#ifndef THC_GENERIC_FILE
#define THC_GENERIC_FILE "generic/LeakyReLU.cu"
#else

#include "../common.h"

void THNN_(LeakyReLU_updateOutput)(
           THCState *state,
           THCTensor *input,
           THCTensor *output,
           accreal negval_,
           bool inplace)
{
  real negval = ScalarConvert<accreal, real>::to(negval_);

  THCUNN_assertSameGPU(state, 2, input, output);

  if (inplace)
  {
    THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP<real>(negval));
    THCTensor_(set)(state, output, input);
  }
  else
  {
    THCTensor_(resizeAs)(state, output, input);
    THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput<real>(negval));
  }

  THCudaCheck(cudaGetLastError());
}

void THNN_(LeakyReLU_updateGradInput)(
           THCState *state,
           THCTensor *input,
           THCTensor *gradOutput,
           THCTensor *gradInput,
           accreal negval_,
           bool inplace)
{
  real negval = ScalarConvert<accreal, real>::to(negval_);

  THCUNN_check_nElement(state, input, gradOutput);
  THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);

  if (inplace)
  {
    THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval));
    THCTensor_(set)(state, gradInput, gradOutput);
  }
  else
  {
    THCTensor_(resizeAs)(state, gradInput, input);
    THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval));
  }

  THCudaCheck(cudaGetLastError());
}

#endif

--------------------------------------------------------------------------------
/lib/THCUNN/Threshold.cu:
--------------------------------------------------------------------------------
#include "THCUNN.h"
#include "THCHalf.h"
#include "THCHalfAutoNumerics.cuh"
#include <THC/THCApply.cuh>

template <typename T>
struct ThresholdUpdateOutput
{
  const T threshold_;
  const T val_;

  ThresholdUpdateOutput(T threshold, T val)
    : threshold_(threshold)
    , val_(val)
  {}

  __device__ __forceinline__ void operator()(T *out, T *in)
  {
    T x = *in;
    *out = (x > threshold_) ? x : val_;
  }
};

// in-place variant
template <typename T>
struct ThresholdUpdateOutputIP
{
  const T threshold_;
  const T val_;

  ThresholdUpdateOutputIP(T threshold, T val)
    : threshold_(threshold)
    , val_(val)
  {}

  __device__ __forceinline__ void operator()(T *x)
  {
    *x = (*x > threshold_) ? *x : val_;
  }
};

template <typename T>
struct ThresholdUpdateGradInput
{
  const T threshold_;

  ThresholdUpdateGradInput(T threshold)
    : threshold_(threshold)
  {}

  __device__ __forceinline__ void operator()(
      T *gradInput, T *input, T *gradOutput) const
  {
    *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
  }
};

template <typename T>
struct ThresholdUpdateGradInputIP
{
  const T threshold_;

  ThresholdUpdateGradInputIP(T threshold)
    : threshold_(threshold)
  {}

  __device__ __forceinline__ void operator()(
      T *gradOutput, T *input) const
  {
    *gradOutput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
  }
};

#include "generic/Threshold.cu"
#include "THCGenerateFloatTypes.h"
*gradOutput : ScalarConvert::to(0); 71 | } 72 | }; 73 | 74 | #include "generic/Threshold.cu" 75 | #include "THCGenerateFloatTypes.h" 76 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/HardTanh.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/HardTanh.cu" 3 | #else 4 | 5 | #include "../common.h" 6 | 7 | void THNN_(HardTanh_updateOutput)( 8 | THCState *state, 9 | THCTensor *input, 10 | THCTensor *output, 11 | accreal min_val_, 12 | accreal max_val_, 13 | bool inplace) 14 | { 15 | real min_val = ScalarConvert::to(min_val_); 16 | real max_val = ScalarConvert::to(max_val_); 17 | 18 | THCUNN_assertSameGPU(state, 2, input, output); 19 | if(inplace) 20 | { 21 | THCTensor_(set)(state, output, input); 22 | THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor(min_val, max_val)); 23 | } 24 | else 25 | { 26 | THCTensor_(resizeAs)(state, output, input); 27 | THC_pointwiseApply2(state, output, input, 28 | hardtanhupdateOutput_functor(min_val, max_val)); 29 | } 30 | } 31 | 32 | void THNN_(HardTanh_updateGradInput)( 33 | THCState *state, 34 | THCTensor *input, 35 | THCTensor *gradOutput, 36 | THCTensor *gradInput, 37 | accreal min_val_, 38 | accreal max_val_, 39 | bool inplace) 40 | { 41 | real min_val = ScalarConvert::to(min_val_); 42 | real max_val = ScalarConvert::to(max_val_); 43 | 44 | THCUNN_check_nElement(state, input, gradOutput); 45 | THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput); 46 | 47 | if (inplace) 48 | { 49 | THCTensor_(set)(state, gradInput, gradOutput); 50 | THC_pointwiseApply2(state, gradInput, input, 51 | hardtanhupdateGradInput_functor(min_val, max_val)); 52 | } 53 | else 54 | { 55 | THCTensor_(resizeAs)(state, gradInput, input); 56 | THC_pointwiseApply3(state, gradInput, input, gradOutput, 57 | hardtanhupdateGradInput_functor(min_val, max_val)); 58 | } 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/Threshold.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/Threshold.cu" 3 | #else 4 | 5 | #include "../common.h" 6 | 7 | void THNN_(Threshold_updateOutput)( 8 | THCState *state, 9 | THCTensor *input, 10 | THCTensor *output, 11 | accreal threshold_, 12 | accreal val_, 13 | bool inplace) 14 | { 15 | real threshold = ScalarConvert::to(threshold_); 16 | real val = ScalarConvert::to(val_); 17 | THCUNN_assertSameGPU(state, 2, input, output); 18 | 19 | if (inplace) 20 | { 21 | THC_pointwiseApply1(state, input, 22 | ThresholdUpdateOutputIP(threshold, val) 23 | ); 24 | THCTensor_(set)(state, output, input); 25 | } 26 | else 27 | { 28 | THCTensor_(resizeAs)(state, output, input); 29 | THC_pointwiseApply2(state, output, input, 30 | ThresholdUpdateOutput(threshold, val) 31 | ); 32 | } 33 | 34 | THCudaCheck(cudaGetLastError()); 35 | } 36 | 37 | void THNN_(Threshold_updateGradInput)( 38 | THCState *state, 39 | THCTensor *input, 40 | THCTensor *gradOutput, 41 | THCTensor *gradInput, 42 | accreal threshold_, 43 | accreal val_, 44 | bool inplace) 45 | { 46 | real threshold = ScalarConvert::to(threshold_); 47 | real val = ScalarConvert::to(val_); 48 | THCUNN_check_nElement(state, input, gradOutput); 49 | THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput); 50 | 51 | if (inplace) 52 | { 53 | THC_pointwiseApply2(state, gradOutput, input, 
54 | ThresholdUpdateGradInputIP(threshold) 55 | ); 56 | THCTensor_(set)(state, gradInput, gradOutput); 57 | } 58 | else 59 | { 60 | THCTensor_(resizeAs)(state, gradInput, input); 61 | THC_pointwiseApply3(state, gradInput, input, gradOutput, 62 | ThresholdUpdateGradInput(threshold) 63 | ); 64 | } 65 | 66 | THCudaCheck(cudaGetLastError()); 67 | } 68 | 69 | #endif 70 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 2 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 3 | Copyright (c) 2011-2013 NYU (Clement Farabet) 4 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 5 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 6 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 7 | 8 | All rights reserved. 9 | 10 | Redistribution and use in source and binary forms, with or without 11 | modification, are permitted provided that the following conditions are met: 12 | 13 | 1. Redistributions of source code must retain the above copyright 14 | notice, this list of conditions and the following disclaimer. 15 | 16 | 2. Redistributions in binary form must reproduce the above copyright 17 | notice, this list of conditions and the following disclaimer in the 18 | documentation and/or other materials provided with the distribution. 19 | 20 | 3. Neither the names of NEC Laboratories American and IDIAP Research 21 | Institute nor the names of its contributors may be used to endorse or 22 | promote products derived from this software without specific prior 23 | written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 26 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 29 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 30 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 31 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 32 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 33 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 35 | POSSIBILITY OF SUCH DAMAGE. 
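
All four activation files above (LeakyReLU, ELU, HardTanh, Threshold) follow the same pattern: a tiny device functor per direction (forward/backward, out-of-place/in-place) handed to `THC_pointwiseApply1/2/3`, which walks tensors of arbitrary stride. A minimal standalone sketch of what that amounts to for a contiguous buffer; `pointwiseApply1` and `LeakyReLUOp` are illustrative names, not the THCUNN API:

```cpp
// Sketch of the functor-plus-pointwise-apply pattern, assuming a contiguous
// buffer; THC_pointwiseApply* additionally handles arbitrary strides, type
// dispatch, and launch configuration.
#include <cstdio>
#include <cuda_runtime.h>

template <typename T>
struct LeakyReLUOp {
  const T negval_;
  explicit LeakyReLUOp(T negval) : negval_(negval) {}
  __device__ __forceinline__ void operator()(T *x) const {
    *x = (*x > T(0)) ? *x : *x * negval_;  // same rule as LeakyReLUUpdateOutputIP
  }
};

template <typename T, typename Op>
__global__ void pointwiseApply1(T *data, long n, Op op) {
  long i = blockIdx.x * (long)blockDim.x + threadIdx.x;
  if (i < n) op(&data[i]);
}

int main() {
  const long n = 6;
  float h[n] = {-2.f, -1.f, -0.5f, 0.f, 1.f, 2.f};
  float *d;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemcpy(d, h, n * sizeof(float), cudaMemcpyHostToDevice);
  pointwiseApply1<<<(n + 255) / 256, 256>>>(d, n, LeakyReLUOp<float>(0.01f));
  cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (long i = 0; i < n; ++i) printf("%g ", h[i]);  // negatives scaled by 0.01
  printf("\n");
  cudaFree(d);
  return 0;
}
```

The functor is passed to the kernel by value, so each thread gets its own copy of the scalar parameters; that is why the real functors keep `negval_`, `threshold_`, etc. as plain `const` members.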
36 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/SpatialFullConvolution.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu" 3 | #else 4 | 5 | void THNN_(SpatialFullConvolution_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *output, 9 | THCTensor *weight, 10 | THCTensor *bias, 11 | THCTensor *columns, 12 | THCTensor *ones, 13 | int kW, int kH, 14 | int dW, int dH, 15 | int padW, int padH, 16 | int adjW, int adjH) 17 | { 18 | THNN_(SpatialFullDilatedConvolution_updateOutput)( 19 | state, input, output, weight, bias, columns, ones, 20 | kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); 21 | } 22 | 23 | void THNN_(SpatialFullConvolution_updateGradInput)( 24 | THCState *state, 25 | THCTensor *input, 26 | THCTensor *gradOutput, 27 | THCTensor *gradInput, 28 | THCTensor *weight, 29 | THCTensor *gradColumns, 30 | int kW, int kH, 31 | int dW, int dH, 32 | int padW, int padH, 33 | int adjW, int adjH) 34 | { 35 | THNN_(SpatialFullDilatedConvolution_updateGradInput)( 36 | state, input, gradOutput, gradInput, weight, gradColumns, 37 | kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH); 38 | } 39 | 40 | 41 | void THNN_(SpatialFullConvolution_accGradParameters)( 42 | THCState *state, 43 | THCTensor *input, 44 | THCTensor *gradOutput, 45 | THCTensor *gradWeight, 46 | THCTensor *gradBias, 47 | THCTensor *columns, 48 | THCTensor *ones, 49 | int kW, int kH, 50 | int dW, int dH, 51 | int padW, int padH, 52 | int adjW, int adjH, 53 | accreal scale_) 54 | { 55 | THNN_(SpatialFullDilatedConvolution_accGradParameters)( 56 | state, input, gradOutput, gradWeight, gradBias, 57 | columns, ones, 58 | kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH, scale_); 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/VolumetricFullConvolution.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu" 3 | #else 4 | 5 | void THNN_(VolumetricFullConvolution_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *output, 9 | THCTensor *weight, 10 | THCTensor *bias, 11 | THCTensor *finput, 12 | THCTensor *fgradInput, 13 | int dT, int dW, int dH, 14 | int padT, int padW, int padH, 15 | int adjT, int adjW, int adjH) 16 | { 17 | THNN_(VolumetricFullDilatedConvolution_updateOutput)( 18 | state, input, output, weight, bias, finput, fgradInput, 19 | dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH); 20 | } 21 | 22 | void THNN_(VolumetricFullConvolution_updateGradInput)( 23 | THCState *state, 24 | THCTensor *input, 25 | THCTensor *gradOutput, 26 | THCTensor *gradInput, 27 | THCTensor *weight, 28 | THCTensor *finput, 29 | THCTensor *fgradInput, 30 | int dT, int dW, int dH, 31 | int padT, int padW, int padH, 32 | int adjT, int adjW, int adjH) 33 | { 34 | THNN_(VolumetricFullDilatedConvolution_updateGradInput)( 35 | state, input, gradOutput, gradInput, weight, finput, fgradInput, 36 | dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH); 37 | } 38 | 39 | 40 | void THNN_(VolumetricFullConvolution_accGradParameters)( 41 | THCState *state, 42 | THCTensor *input, 43 | THCTensor *gradOutput, 44 | THCTensor *gradWeight, 45 | THCTensor *gradBias, 46 | THCTensor *finput, 47 | THCTensor *fgradInput, 48 | int dT, int dW, int dH, 49 
| int padT, int padW, int padH, 50 | int adjT, int adjW, int adjH, 51 | accreal scale_) 52 | { 53 | THNN_(VolumetricFullDilatedConvolution_accGradParameters)( 54 | state, input, gradOutput, gradWeight, gradBias, finput, fgradInput, 55 | dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH, scale_); 56 | } 57 | 58 | #endif 59 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/AbsCriterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/AbsCriterion.cu" 3 | #else 4 | 5 | void THNN_(AbsCriterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage) 11 | { 12 | THCUNN_check_nElement(state, input, target); 13 | THCUNN_assertSameGPU(state, 2, input, target); 14 | 15 | ptrdiff_t size = THCTensor_(nElement)(state, input); 16 | 17 | input = THCTensor_(newContiguous)(state, input); 18 | target = THCTensor_(newContiguous)(state, target); 19 | 20 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 21 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 22 | accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal)0, thrust::plus(), abs_functor()); 23 | 24 | if (sizeAverage) 25 | sum /= size; 26 | 27 | THCTensor_(free)(state, input); 28 | THCTensor_(free)(state, target); 29 | 30 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 31 | } 32 | 33 | void THNN_(AbsCriterion_updateGradInput)( 34 | THCState *state, 35 | THCTensor *input, 36 | THCTensor *target, 37 | THCTensor *gradInput, 38 | bool sizeAverage) 39 | { 40 | THCUNN_check_nElement(state, input, target); 41 | THCUNN_assertSameGPU(state, 3, input, target, gradInput); 42 | 43 | ptrdiff_t size = THCTensor_(nElement)(state, input); 44 | real norm = ScalarConvert::to(sizeAverage ? 
1./size : 1.); 45 | 46 | input = THCTensor_(newContiguous)(state, input); 47 | target = THCTensor_(newContiguous)(state, target); 48 | 49 | THCTensor_(resizeAs)(state, gradInput, input); 50 | 51 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 52 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 53 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 54 | 55 | thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor(norm)); 56 | 57 | THCTensor_(free)(state, input); 58 | THCTensor_(free)(state, target); 59 | } 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/SoftMarginCriterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu" 3 | #else 4 | 5 | void THNN_(SoftMarginCriterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage) 11 | { 12 | THCUNN_check_nElement(state, input, target); 13 | THCUNN_check_dim_size(state, output, 1, 0, 1); 14 | THCUNN_assertSameGPU(state, 2, input, target); 15 | accreal sum; 16 | 17 | ptrdiff_t size = THCTensor_(nElement)(state, input); 18 | 19 | input = THCTensor_(newContiguous)(state, input); 20 | target = THCTensor_(newContiguous)(state, target); 21 | 22 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 23 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 24 | sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), softmargin_functor()); 25 | 26 | if(sizeAverage) 27 | sum /= size; 28 | 29 | THCTensor_(free)(state, input); 30 | THCTensor_(free)(state, target); 31 | 32 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 33 | } 34 | 35 | void THNN_(SoftMarginCriterion_updateGradInput)( 36 | THCState *state, 37 | THCTensor *input, 38 | THCTensor *target, 39 | THCTensor *gradInput, 40 | bool sizeAverage) 41 | { 42 | THCUNN_check_nElement(state, input, target); 43 | THCUNN_assertSameGPU(state, 3, input, target, gradInput); 44 | 45 | ptrdiff_t size = THCTensor_(nElement)(state, input); 46 | accreal norm = (sizeAverage ? 
1./size : 1.); 47 | 48 | input = THCTensor_(newContiguous)(state, input); 49 | target = THCTensor_(newContiguous)(state, target); 50 | 51 | THCTensor_(resizeAs)(state, gradInput, input); 52 | 53 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 54 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 55 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 56 | 57 | thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor(norm)); 58 | 59 | THCTensor_(free)(state, input); 60 | THCTensor_(free)(state, target); 61 | } 62 | 63 | #endif 64 | -------------------------------------------------------------------------------- /lib/THCUNN/SpatialUpSamplingNearest.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "THCHalf.h" 10 | #include "THCHalfAutoNumerics.cuh" 11 | 12 | /* 13 | * Description: 14 | */ 15 | 16 | __device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) 17 | { 18 | int x, y, z, w; 19 | w = ii % d3; 20 | ii = ii/d3; 21 | z = ii % d2; 22 | ii = ii/d2; 23 | y = ii % d1; 24 | ii = ii/d1; 25 | x = ii; 26 | w = w/scale_factor; 27 | z = z/scale_factor; 28 | d2 /= scale_factor; 29 | d3 /= scale_factor; 30 | return (((x*d1+y)*d2)+z)*d3+w; 31 | 32 | } 33 | __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) 34 | { 35 | int x, y, z, w; 36 | w = ii % d3; 37 | ii = ii/d3; 38 | z = ii % d2; 39 | ii = ii/d2; 40 | y = ii % d1; 41 | ii = ii/d1; 42 | x = ii; 43 | w = w*scale_factor+off_x; 44 | z = z*scale_factor+off_y; 45 | d2 *= scale_factor; 46 | d3 *= scale_factor; 47 | return (((x*d1+y)*d2)+z)*d3+w; 48 | 49 | } 50 | 51 | template 52 | __global__ void upscale(Dtype *input, Dtype *output, long no_elements, 53 | int scale_factor, int d1, int d2, int d3) 54 | { 55 | // output offset: 56 | long ii = threadIdx.x + blockDim.x * blockIdx.x; 57 | ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; 58 | if (ii >= no_elements) return; 59 | int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); 60 | output[ii]=input[ipidx]; 61 | } 62 | 63 | /* 64 | * Description: 65 | */ 66 | template 67 | __global__ void downscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements, 68 | int scale_factor, int d1, int d2, int d3) 69 | { 70 | // output offset: 71 | long ii = threadIdx.x + blockDim.x * blockIdx.x; 72 | ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; 73 | if (ii >= no_elements) return; 74 | Acctype sum = Acctype(0); 75 | for (int i=0; i < scale_factor; i++){ 76 | for(int j=0; j < scale_factor; j++){ 77 | int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j); 78 | sum += gradOutput_data[ipidx]; 79 | } 80 | } 81 | gradInput_data[ii] += ScalarConvert::to(sum); 82 | } 83 | 84 | #include "generic/SpatialUpSamplingNearest.cu" 85 | #include "THCGenerateFloatTypes.h" 86 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/MarginCriterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/MarginCriterion.cu" 3 | #else 4 | 5 | void THNN_(MarginCriterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage, 11 
| accreal margin_) 12 | { 13 | real margin = ScalarConvert::to(margin_); 14 | THCUNN_check_nElement(state, input, target); 15 | THCUNN_check_dim_size(state, output, 1, 0, 1); 16 | THCUNN_assertSameGPU(state, 2, input, target); 17 | 18 | ptrdiff_t size = THCTensor_(nElement)(state, input); 19 | 20 | input = THCTensor_(newContiguous)(state, input); 21 | target = THCTensor_(newContiguous)(state, target); 22 | 23 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 24 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 25 | accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), 26 | margin_functor(ScalarConvert::to(margin))); 27 | 28 | if (sizeAverage) 29 | sum /= size; 30 | 31 | THCTensor_(free)(state, input); 32 | THCTensor_(free)(state, target); 33 | 34 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 35 | } 36 | 37 | 38 | void THNN_(MarginCriterion_updateGradInput)( 39 | THCState *state, 40 | THCTensor *input, 41 | THCTensor *target, 42 | THCTensor *gradInput, 43 | bool sizeAverage, 44 | accreal margin_) 45 | { 46 | real margin = ScalarConvert::to(margin_); 47 | 48 | THCUNN_check_nElement(state, input, target); 49 | THCUNN_assertSameGPU(state, 3, input, target, gradInput); 50 | 51 | ptrdiff_t size = THCTensor_(nElement)(state, input); 52 | accreal norm = sizeAverage ? 1.f/size : 1; 53 | 54 | input = THCTensor_(newContiguous)(state, input); 55 | target = THCTensor_(newContiguous)(state, target); 56 | 57 | THCTensor_(resizeAs)(state, gradInput, input); 58 | 59 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 60 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 61 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 62 | 63 | thrust::transform(input_data, input_data+size, target_data, gradInput_data, 64 | margin_updateGradInput_functor(ScalarConvert::to(margin), norm)); 65 | 66 | THCTensor_(free)(state, input); 67 | THCTensor_(free)(state, target); 68 | } 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /lib/THCUNN/SpatialReplicationPadding.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCDeviceTensor.cuh" 4 | #include "THCDeviceTensorUtils.cuh" 5 | #include "THCDeviceUtils.cuh" 6 | #include "THCReduceApplyUtils.cuh" 7 | #include 8 | 9 | #include "THCHalf.h" 10 | #include "THCHalfAutoNumerics.cuh" 11 | #include "THCAtomics.cuh" 12 | 13 | template 14 | __global__ void SpatialReplicationPadding_updateOutput( 15 | THCDeviceTensor input, 16 | THCDeviceTensor output, 17 | int padT, int padB, int padL, int padR) { 18 | 19 | int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; 20 | int plane = blockIdx.y; 21 | int batch = blockIdx.z; 22 | if (outputPointId >= output.getSize(2) * output.getSize(3)) { 23 | return; 24 | } 25 | int outputPointX = outputPointId % output.getSize(3); 26 | int outputPointY = outputPointId / output.getSize(3); 27 | 28 | int iStartX = max(0, -padL); 29 | int iStartY = max(0, -padT); 30 | int oStartX = max(0, padL); 31 | int oStartY = max(0, padT); 32 | 33 | int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX; 34 | int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY; 35 | 36 | Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; 37 | 
output[batch][plane][outputPointY][outputPointX] = valueToCopy; 38 | } 39 | 40 | template 41 | __global__ void SpatialReplicationPadding_updateGradInput( 42 | THCDeviceTensor gradInput, 43 | THCDeviceTensor gradOutput, 44 | int padT, int padB, int padL, int padR) { 45 | 46 | int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; 47 | int plane = blockIdx.y; 48 | int batch = blockIdx.z; 49 | if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) { 50 | return; 51 | } 52 | int outputPointX = outputPointId % gradOutput.getSize(3); 53 | int outputPointY = outputPointId / gradOutput.getSize(3); 54 | 55 | int iStartX = max(0, -padL); 56 | int iStartY = max(0, -padT); 57 | int oStartX = max(0, padL); 58 | int oStartY = max(0, padT); 59 | 60 | int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX; 61 | int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY; 62 | 63 | Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; 64 | atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); 65 | } 66 | 67 | 68 | #include "generic/SpatialReplicationPadding.cu" 69 | #include "THCGenerateFloatTypes.h" 70 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/MSECriterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/MSECriterion.cu" 3 | #else 4 | 5 | void THNN_(MSECriterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage) 11 | { 12 | THCUNN_check_nElement(state, input, target); 13 | THCUNN_check_dim_size(state, output, 1, 0, 1); 14 | THCUNN_assertSameGPU(state, 2, input, target); 15 | 16 | ptrdiff_t size = THCTensor_(nElement)(state, input); 17 | 18 | input = THCTensor_(newContiguous)(state, input); 19 | target = THCTensor_(newContiguous)(state, target); 20 | 21 | THCThrustAllocator thrustAlloc(state); 22 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 23 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 24 | accreal sum = thrust::inner_product( 25 | #if CUDA_VERSION >= 7000 26 | thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), 27 | #endif 28 | input_data, input_data+size, target_data, (accreal) 0, 29 | thrust::plus(), mse_functor()); 30 | 31 | if (sizeAverage) 32 | sum /= size; 33 | 34 | THCTensor_(free)(state, input); 35 | THCTensor_(free)(state, target); 36 | 37 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 38 | } 39 | 40 | void THNN_(MSECriterion_updateGradInput)( 41 | THCState *state, 42 | THCTensor *input, 43 | THCTensor *target, 44 | THCTensor *gradInput, 45 | bool sizeAverage) 46 | { 47 | THCUNN_check_nElement(state, input, target); 48 | THCUNN_assertSameGPU(state, 3, input, target, gradInput); 49 | 50 | ptrdiff_t size = THCTensor_(nElement)(state, input); 51 | accreal norm = sizeAverage ? 
(accreal)(2)/size : (accreal)(2); 52 | 53 | input = THCTensor_(newContiguous)(state, input); 54 | target = THCTensor_(newContiguous)(state, target); 55 | 56 | THCTensor_(resizeAs)(state, gradInput, input); 57 | 58 | THCThrustAllocator thrustAlloc(state); 59 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 60 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 61 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 62 | 63 | thrust::transform( 64 | #if CUDA_VERSION >= 7000 65 | thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), 66 | #endif 67 | input_data, input_data+size, target_data, gradInput_data, 68 | mse_updateGradInput_functor(norm)); 69 | 70 | THCTensor_(free)(state, input); 71 | THCTensor_(free)(state, target); 72 | } 73 | 74 | #endif 75 | -------------------------------------------------------------------------------- /lib/THCUNN/BCECriterion.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCHalf.h" 4 | #include "THCHalfAutoNumerics.cuh" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | template 13 | inline __device__ T eps(); 14 | 15 | template <> 16 | inline __device__ float eps() { return 1e-12f; } 17 | 18 | template <> 19 | inline __device__ double eps() { return 1e-12; } 20 | 21 | template 22 | struct bce_functor 23 | { 24 | template 25 | __host__ __device__ 26 | Acctype operator()(Tuple x) 27 | { 28 | Dtype input = thrust::get<0>(x); 29 | Dtype t = thrust::get<1>(x); 30 | assert(input >= 0. && input <= 1.); 31 | return - (t * THCNumerics::log(input + eps()) + (Acctype(1)- t) * THCNumerics::log(Acctype(1) - input + eps())); 32 | } 33 | }; 34 | 35 | template 36 | struct bce_functor_weights 37 | { 38 | template 39 | __host__ __device__ 40 | Acctype operator()(Tuple x) 41 | { 42 | Dtype input = thrust::get<0>(x); 43 | Dtype t = thrust::get<1>(x); 44 | Dtype w = thrust::get<2>(x); 45 | assert(input >= 0. 
&& input <= 1.); 46 | return - w * (t * THCNumerics::log(input + eps()) + (Acctype(1) - t) * THCNumerics::log(Acctype(1) - input + eps())); 47 | } 48 | }; 49 | 50 | template 51 | struct bce_updateGradInput_functor 52 | { 53 | const Dtype norm; 54 | 55 | bce_updateGradInput_functor(Dtype norm_) 56 | : norm(norm_) 57 | {} 58 | 59 | template 60 | __host__ __device__ 61 | Dtype operator()(Tuple x) 62 | { 63 | Dtype o = thrust::get<0>(x); 64 | Dtype t = thrust::get<1>(x); 65 | return ScalarConvert::to(- (t - o) / ((Acctype(1) - o + eps()) * (o + eps())) * norm); 66 | } 67 | }; 68 | 69 | template 70 | struct bce_updateGradInput_functor_weights 71 | { 72 | const Dtype norm; 73 | 74 | bce_updateGradInput_functor_weights(Dtype norm_) 75 | : norm(norm_) 76 | {} 77 | 78 | template 79 | __host__ __device__ 80 | Dtype operator()(Tuple x) 81 | { 82 | Dtype o = thrust::get<0>(x); 83 | Dtype t = thrust::get<1>(x); 84 | Dtype w = thrust::get<2>(x); 85 | return ScalarConvert::to(- (t - o) / ((Acctype(1) - o + eps()) * (o + eps())) * norm * w); 86 | } 87 | }; 88 | 89 | #include "generic/BCECriterion.cu" 90 | #include "THCGenerateFloatTypes.h" 91 | -------------------------------------------------------------------------------- /lib/THCUNN/VolumetricMaxUnpooling.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCDeviceTensor.cuh" 4 | #include "THCDeviceTensorUtils.cuh" 5 | #include "THCDeviceUtils.cuh" 6 | #include "THCHalf.h" 7 | #include "THCHalfAutoNumerics.cuh" 8 | 9 | #include 10 | 11 | template 12 | __global__ void cuda_VolumetricMaxUnpooling_updateOutput( 13 | THCDeviceTensor input, 14 | THCDeviceTensor indices, 15 | THCDeviceTensor output, 16 | int dT, int dH, int dW, 17 | int padT, int padH, int padW, int offsetZ) 18 | { 19 | long iColumn = blockIdx.x * blockDim.x + threadIdx.x; 20 | long iRow = blockIdx.y * blockDim.y + threadIdx.y; 21 | long iFrame = (blockIdx.z + offsetZ) % input.getSize(1); // intput frame/time 22 | long slice = (blockIdx.z + offsetZ) / input.getSize(1); // intput slice/feature 23 | 24 | if (iRow < input.getSize(2) && iColumn < input.getSize(3)) 25 | { 26 | long start_t = iFrame * dT - padT; 27 | long start_h = iRow * dH - padH; 28 | long start_w = iColumn * dW - padW; 29 | 30 | Dtype val = input[slice][iFrame][iRow][iColumn]; 31 | 32 | THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn]; 33 | long maxz = ((unsigned char*)(idx))[0]; 34 | long maxy = ((unsigned char*)(idx))[1]; 35 | long maxx = ((unsigned char*)(idx))[2]; 36 | output[slice][start_t + maxz][start_h + maxy][start_w + maxx] = val; 37 | } 38 | } 39 | 40 | template 41 | __global__ void cuda_VolumetricMaxUnpooling_updateGradInput( 42 | THCDeviceTensor gradOutput, 43 | THCDeviceTensor indices, 44 | THCDeviceTensor gradInput, 45 | int dT, int dH, int dW, 46 | int padT, int padH, int padW, int offsetZ) 47 | { 48 | int iColumn = blockIdx.x * blockDim.x + threadIdx.x; 49 | int iRow = blockIdx.y * blockDim.y + threadIdx.y; 50 | int iFrame = (blockIdx.z + offsetZ) % gradInput.getSize(1); // output frame/time 51 | int slice = (blockIdx.z + offsetZ) / gradInput.getSize(1); // output slice/feature 52 | 53 | if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3)) 54 | { 55 | 56 | long start_t = iFrame * dT - padT; 57 | long start_h = iRow * dH - padH; 58 | long start_w = iColumn * dW - padW; 59 | 60 | THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn]; 61 | long maxz = ((unsigned char*)(idx))[0]; 62 | long maxy 
= ((unsigned char*)(idx))[1]; 63 | long maxx = ((unsigned char*)(idx))[2]; 64 | 65 | Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx]; 66 | 67 | gradInput[slice][iFrame][iRow][iColumn] = grad_val; 68 | } 69 | } 70 | 71 | #include "generic/VolumetricMaxUnpooling.cu" 72 | #include "THCGenerateFloatTypes.h" 73 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/DistKLDivCriterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu" 3 | #else 4 | 5 | void THNN_(DistKLDivCriterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage) 11 | { 12 | THCUNN_check_nElement(state, input, target); 13 | THCUNN_check_dim_size(state, output, 1, 0, 1); 14 | THCUNN_assertSameGPU(state, 2, input, target); 15 | 16 | THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, 17 | "input and target need to have the same number of elements"); 18 | 19 | accreal sum; 20 | 21 | ptrdiff_t size = THCTensor_(nElement)(state, input); 22 | 23 | input = THCTensor_(newContiguous)(state, input); 24 | target = THCTensor_(newContiguous)(state, target); 25 | 26 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 27 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 28 | sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus(), kl_functor()); 29 | 30 | if (sizeAverage) 31 | sum /= size; 32 | 33 | THCTensor_(free)(state, input); 34 | THCTensor_(free)(state, target); 35 | 36 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 37 | } 38 | 39 | void THNN_(DistKLDivCriterion_updateGradInput)( 40 | THCState *state, 41 | THCTensor *input, 42 | THCTensor *target, 43 | THCTensor *gradInput, 44 | bool sizeAverage) 45 | { 46 | THCUNN_check_nElement(state, input, target); 47 | THCUNN_assertSameGPU(state, 3, input, target, gradInput); 48 | 49 | THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, 50 | "input and target need to have the same number of elements"); 51 | 52 | ptrdiff_t size = THCTensor_(nElement)(state, input); 53 | real norm = (sizeAverage ? 
ScalarConvert::to(accreal(1)/size) : ScalarConvert::to(1)); 54 | 55 | input = THCTensor_(newContiguous)(state, input); 56 | target = THCTensor_(newContiguous)(state, target); 57 | 58 | THCTensor_(resizeAs)(state, gradInput, input); 59 | 60 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 61 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 62 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 63 | 64 | thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor(norm)); 65 | 66 | THCTensor_(free)(state, input); 67 | THCTensor_(free)(state, target); 68 | } 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /lib/THCUNN/PReLU.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "THCHalf.h" 3 | #include "THCHalfAutoNumerics.cuh" 4 | #include 5 | 6 | #include "common.h" 7 | 8 | template 9 | struct PReLUUpdateOutput 10 | { 11 | T* weight_; 12 | 13 | PReLUUpdateOutput(T* weight) 14 | : weight_(weight) 15 | {} 16 | 17 | __device__ __forceinline__ void operator()(T *out, T *in) 18 | { 19 | T x = *in; 20 | *out = (x > 0) ? x : weight_[0] * x; 21 | } 22 | }; 23 | 24 | template 25 | __global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize) 26 | { 27 | CUDA_KERNEL_LOOP(i, n) 28 | { 29 | int positionInSample = i % nElemsPerSample; 30 | int mapNumber = positionInSample / mapSize; 31 | output[i] = input[i] > 0 ? input[i] : input[i] * weight[mapNumber]; 32 | } 33 | } 34 | 35 | template 36 | struct PReLUUpdateGradInput 37 | { 38 | T *weight_; 39 | 40 | PReLUUpdateGradInput(T *weight) 41 | : weight_(weight) 42 | {} 43 | 44 | __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input) 45 | { 46 | *gradInput = *input > 0 ? *gradOutput : *gradOutput * *weight_; 47 | } 48 | }; 49 | 50 | template 51 | __global__ void preluBackward( 52 | T *gradInput, 53 | const T *input, 54 | const T *weight, 55 | const T *gradOutput, 56 | int n, int nElemsPerSample, int mapSize) 57 | { 58 | CUDA_KERNEL_LOOP(i, n) 59 | { 60 | int positionInSample = i % nElemsPerSample; 61 | int mapNumber = positionInSample / mapSize; 62 | gradInput[i] = input[i] > 0 ? 
gradOutput[i] : gradOutput[i] * weight[mapNumber]; 63 | } 64 | } 65 | 66 | template 67 | struct PReLUAccGradParametersShared 68 | { 69 | __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) 70 | { 71 | *gradInput = (*input) * (*gradOutput) * (*input <= 0); 72 | } 73 | }; 74 | 75 | template 76 | struct PReLUAccGradParameters 77 | { 78 | T scale; 79 | 80 | PReLUAccGradParameters(T scale) 81 | : scale(scale) 82 | {} 83 | 84 | __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) 85 | { 86 | *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0); 87 | } 88 | }; 89 | 90 | template 91 | struct PReLUAccGradParameters1to1 92 | { 93 | T scale; 94 | 95 | PReLUAccGradParameters1to1(T scale) 96 | : scale(scale) 97 | {} 98 | 99 | __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput) 100 | { 101 | *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0); 102 | } 103 | }; 104 | 105 | #include "generic/PReLU.cu" 106 | #include "THCGenerateFloatTypes.h" 107 | -------------------------------------------------------------------------------- /lib/THCUNN/VolumetricUpSamplingNearest.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "THCHalf.h" 10 | #include "THCHalfAutoNumerics.cuh" 11 | 12 | /* 13 | * Description: 14 | */ 15 | 16 | __device__ int translate_idx(int ii, int d1, int d2, int d3, int d4, int scale_factor) 17 | { 18 | int x, y, z, w, v; 19 | v = ii % d4; 20 | ii = ii/d4; 21 | w = ii % d3; 22 | ii = ii/d3; 23 | z = ii % d2; 24 | ii = ii/d2; 25 | y = ii % d1; 26 | ii = ii/d1; 27 | x = ii; 28 | v = v/scale_factor; 29 | w = w/scale_factor; 30 | z = z/scale_factor; 31 | d2 /= scale_factor; 32 | d3 /= scale_factor; 33 | d4 /= scale_factor; 34 | return ((((x*d1+y)*d2)+z)*d3+w)*d4+v; 35 | 36 | } 37 | __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int d4, int scale_factor, int off_x, int off_y, int off_z) 38 | { 39 | int x, y, z, w, v; 40 | v = ii % d4; 41 | ii = ii/d4; 42 | w = ii % d3; 43 | ii = ii/d3; 44 | z = ii % d2; 45 | ii = ii/d2; 46 | y = ii % d1; 47 | ii = ii/d1; 48 | x = ii; 49 | v = v*scale_factor+off_x; 50 | w = w*scale_factor+off_y; 51 | z = z*scale_factor+off_z; 52 | d2 *= scale_factor; 53 | d3 *= scale_factor; 54 | d4 *= scale_factor; 55 | return ((((x*d1+y)*d2)+z)*d3+w)*d4+v; 56 | 57 | } 58 | 59 | template 60 | __global__ void vupscale(Dtype *input, Dtype *output, long no_elements, 61 | int scale_factor, int d1, int d2, int d3, int d4) 62 | { 63 | // output offset: 64 | long ii = threadIdx.x + blockDim.x * blockIdx.x; 65 | ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; 66 | if (ii >= no_elements) return; 67 | int ipidx = translate_idx(ii, d1, d2, d3, d4, scale_factor); 68 | output[ii]=input[ipidx]; 69 | } 70 | 71 | /* 72 | * Description: 73 | */ 74 | template 75 | __global__ void vdownscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements, 76 | int scale_factor, int d1, int d2, int d3, int d4) 77 | { 78 | // output offset: 79 | long ii = threadIdx.x + blockDim.x * blockIdx.x; 80 | ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; 81 | if (ii >= no_elements) return; 82 | Acctype sum = Acctype(0); 83 | for (int i=0; i < scale_factor; i++){ 84 | for(int j=0; j < scale_factor; j++){ 85 | for(int k=0; k < scale_factor; k++){ 86 | int ipidx = translate_idx_inv(ii, d1, 
d2, d3, d4, scale_factor, i, j, k); 87 | sum += gradOutput_data[ipidx]; 88 | } 89 | } 90 | } 91 | gradInput_data[ii] += ScalarConvert::to(sum); 92 | } 93 | 94 | #include "generic/VolumetricUpSamplingNearest.cu" 95 | #include "THCGenerateFloatTypes.h" 96 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/GatedLinearUnit.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/GatedLinearUnit.cu" 3 | #else 4 | 5 | void THNN_(GatedLinear_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *output, 9 | int dim) 10 | { 11 | THCUNN_assertSameGPU(state, 2, input, output); 12 | 13 | // size output to half of input 14 | dim = dim - TH_INDEX_BASE; 15 | const long nIn = THCTensor_(size)(state, input, dim); 16 | THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", 17 | dim + TH_INDEX_BASE, nIn); 18 | const long inputSize = THCTensor_(size)(state, input, dim) / 2; 19 | THLongStorage *newSizes = THCTensor_(newSizeOf)(state, input); 20 | THLongStorage_set(newSizes, dim, inputSize); 21 | THCTensor_(resize)(state, output, newSizes, NULL); 22 | 23 | // halve tensor 24 | THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize); 25 | THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize); 26 | 27 | // x = x1:cmul( sigmoid(x2) ) 28 | THC_pointwiseApply3(state, output, secondHalf, firstHalf, gatedLinearCSigMul_functor()); 29 | 30 | THLongStorage_free(newSizes); 31 | THCTensor_(free)(state, firstHalf); 32 | THCTensor_(free)(state, secondHalf); 33 | } 34 | 35 | void THNN_(GatedLinear_updateGradInput)( 36 | THCState *state, 37 | THCTensor *input, 38 | THCTensor *gradOutput, 39 | THCTensor *gradInput, 40 | int dim) 41 | { 42 | THCUNN_assertSameGPU(state, 2, gradOutput, gradInput); 43 | dim = dim - TH_INDEX_BASE; 44 | const long nIn = THCTensor_(size)(state, input, dim); 45 | THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", 46 | dim + TH_INDEX_BASE, nIn); 47 | 48 | THCTensor_(resizeAs)(state, gradInput, input); 49 | const long inputSize = THCTensor_(size)(state, input, dim) / 2; 50 | THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize); 51 | THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize); 52 | THCTensor *gradInputfirstHalf = THCTensor_(newNarrow)(state, gradInput, dim, 0, inputSize); 53 | THCTensor *gradInputsecondHalf = THCTensor_(newNarrow)(state, gradInput, dim, inputSize, inputSize); 54 | // first half of derivative 55 | THC_pointwiseApply3(state, gradInputfirstHalf, secondHalf, gradOutput, gatedLinearCSigMul_functor()); 56 | // second half of derivative 57 | THCTensor_(copy)(state, gradInputsecondHalf, firstHalf); 58 | THC_pointwiseApply3(state, gradInputsecondHalf, secondHalf, gradOutput, gatedLinearDerivativeSecondHalf_functor()); 59 | 60 | THCTensor_(free)(state, firstHalf); 61 | THCTensor_(free)(state, secondHalf); 62 | THCTensor_(free)(state, gradInputfirstHalf); 63 | THCTensor_(free)(state, gradInputsecondHalf); 64 | } 65 | 66 | #endif 67 | -------------------------------------------------------------------------------- /lib/THCUNN/SparseLinear.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "THCHalf.h" 3 | #include "THCHalfAutoNumerics.cuh" 4 | 5 | #include 6 | 7 | static cusparseHandle_t cusparse_handle = 0; 8 | 9 | static void init_cusparse() { 10 | if (cusparse_handle == 0) { 11 | cusparseStatus_t status = cusparseCreate(&cusparse_handle); 12 | if (status != CUSPARSE_STATUS_SUCCESS) { 13 | THError("CUSPARSE Library initialization failed"); 14 | } 15 | } 16 | } 17 | 18 | #ifdef CUDA_HALF_TENSOR 19 | void THNN_CudaHalfSparseLinear_updateOutput( 20 | THCState *state, 21 | THCudaHalfTensor *input, 22 | THCudaHalfTensor *output, 23 | THCudaHalfTensor *weight, 24 | THCudaHalfTensor *bias) { 25 | THError("THCudaHalfTensor not supported with SparseLinear"); 26 | } 27 | 28 | void THNN_CudaHalfSparseLinear_accGradParameters( 29 | THCState *state, 30 | THCudaHalfTensor *input, 31 | THCudaHalfTensor *gradOutput, 32 | THCudaHalfTensor *gradWeight, 33 | THCudaHalfTensor *gradBias, 34 | THCudaHalfTensor *weight, 35 | THCudaHalfTensor *bias, 36 | float weightDecay, 37 | float scale) { 38 | THError("THCudaHalfTensor not supported with SparseLinear"); 39 | } 40 | 41 | void THNN_CudaHalfSparseLinear_legacyUpdateOutput( 42 | THCState *state, 43 | THCudaHalfTensor *input, 44 | THCudaHalfTensor *output, 45 | THCudaHalfTensor *weight, 46 | THCudaHalfTensor *bias) { 47 | THError("THCudaHalfTensor not supported with SparseLinear"); 48 | } 49 | 50 | void THNN_CudaHalfSparseLinear_legacyAccGradParameters( 51 | THCState *state, 52 | THCudaHalfTensor *input, 53 | THCudaHalfTensor *gradOutput, 54 | THCudaHalfTensor *gradWeight, 55 | THCudaHalfTensor *gradBias, 56 | THCudaHalfTensor *weight, 57 | THCudaHalfTensor *bias, 58 | float weightDecay, 59 | float scale) { 60 | THError("THCudaHalfTensor not supported with SparseLinear"); 61 | } 62 | 63 | void THNN_CudaHalfSparseLinear_zeroGradParameters( 64 | THCState *state, 65 | THCudaHalfTensor *gradWeight, 66 | THCudaHalfTensor *gradBias, 67 | THCudaHalfTensor *lastInput) { 68 | THError("THCudaHalfTensor not supported with SparseLinear"); 69 | } 70 | 71 | void THNN_CudaHalfSparseLinear_updateParameters( 72 | THCState *state, 73 | THCudaHalfTensor *weight, 74 | THCudaHalfTensor *bias, 75 | THCudaHalfTensor *gradWeight, 76 | 
THCudaHalfTensor *gradBias,
77 |   THCudaHalfTensor *lastInput,
78 |   float learningRate) {
79 |   THError("THCudaHalfTensor not supported with SparseLinear");
80 | }
81 | #endif
82 |
83 | #include "generic/SparseLinear.cu"
84 | #include "THCGenerateFloatType.h"
85 | #include "generic/SparseLinear.cu"
86 | #include "THCGenerateDoubleType.h"
87 |
--------------------------------------------------------------------------------
/doc/cunnmodules.md:
--------------------------------------------------------------------------------
1 |
2 | # Additional Modules #
3 |
4 | The following nn modules are also made available by the cunn package:
5 | * [DataParallelTable](#nn.DataParallelTable) : parallelize calls to `forward` and `backward` across multiple GPUs.
6 | * [GPU](https://github.com/torch/nn/blob/master/doc/simple.md#nn.GPU) : decorates a module so that it can be executed on a specific GPU device.
7 |
8 | <a name="nn.DataParallelTable"/>
9 | ## DataParallelTable ##
10 |
11 | ```lua
12 | module = nn.DataParallelTable(dim, [flattenParams], [useNCCL])
13 | module:add(net, {gpu1, [gpu2, ...]})
14 | ```
15 |
16 | DataParallelTable implements data parallelism for Torch modules. The same model
17 | is replicated on multiple GPUs. The input is split, typically into smaller mini-batches.
18 | Each replicated model handles only its portion of the input. The weight updates for
19 | each replica are summed together on the first replica in accGradParameters.
20 |
21 | ### DataParallelTable(dim, [flattenParams], [useNCCL]) ###
22 |
23 | Creates a `DataParallelTable` that splits the input on the dimension `dim`. If `flattenParams` is `true`, [`getParameters()`](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.getParameters) will be called on the replicated module. If `useNCCL` is `true` and both [NCCL](https://github.com/NVIDIA/nccl) and the [NCCL torch bindings](https://github.com/ngimel/nccl.torch) are installed, NCCL will be used for inter-GPU communication.
24 |
25 | For best performance, use `flattenParams` and `NCCL`.
26 |
27 | ### DataParallelTable:add(module, gpus) ###
28 |
29 | Replicates `module` on the table of `gpus`. For example:
30 |
31 | ```lua
32 | nn.DataParallelTable(1):add(module, {1, 2, 3, 4})
33 | ```
34 |
35 | ### DataParallelTable:threads(initFunc) ###
36 |
37 | Switches the internal implementation to use a separate thread for each replica. This may hide the cost of kernel launches by dispatching them in parallel. The `initFunc` is executed in each thread.
38 |
39 | ```lua
40 | nn.DataParallelTable(1):threads(function()
41 |    require 'cudnn'
42 | end)
43 | ```
44 |
45 | ### DataParallelTable:syncParameters() ###
46 |
47 | Copies the model parameters from the first replica to all other replicas. This is automatically called from `updateOutput`, if it has not been called since the last `accGradParameters`.
48 |
49 | ### Example of training using DataParallelTable ###
50 |
51 | ```lua
52 | -- CONSTRUCT MODEL:
53 | conv_net = makeConvNet() -- i.e. create nn.Sequential() and fill it
54 | net = nn.DataParallelTable(1) -- Split along first (batch) dimension
55 | net:add(conv_net, {1, 2}) -- Use GPUs 1 and 2
56 | -- TRAINING:
57 | for i = 1, num_epochs do
58 |    local output = net:forward(input)
59 |    local err = criterion:forward(output, target)
60 |    net:zeroGradParameters()
61 |    local gradOutput = criterion:backward(output, target)
62 |    local gradInput = net:backward(input, gradOutput)
63 |    net:updateParameters(lr)
64 | end
65 | ```
66 |
67 |
--------------------------------------------------------------------------------
/lib/THCUNN/LogSigmoid.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "THCHalf.h"
3 | #include "THCHalfAutoNumerics.cuh"
4 | #include <THC/THCApply.cuh>
5 |
6 | template <typename T>
7 | struct logSigmoid_updateOutput_functor
8 | {
9 |   __device__ void operator()(T *output, const T *input) const {
10 |     const T max = fmaxType(0.f, - *input);
11 |     const T z = THCNumerics<T>::exp(-max) + THCNumerics<T>::exp(-*input - max);
12 |     *output = -(max + THCNumerics<T>::log(z));
13 |   }
14 | };
15 |
16 | template <typename T>
17 | struct logSigmoid_updateGradInput_functor
18 | {
19 |   __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const {
20 |     const T max = fmaxType(0.f, -*input);
21 |     const T z = THCNumerics<T>::exp(-max) + THCNumerics<T>::exp(-*input - max);
22 |     T max_deriv = 0.f;
23 |     T sign = -1.f;
24 |     if (*input < 0.f){
25 |       max_deriv = -1.f;
26 |       sign = 1.f;
27 |     }
28 |     *gradInput = *gradOutput * (-max_deriv - sign*((z - 1.f)/z));
29 |   }
30 | };
31 |
32 | #ifdef CUDA_HALF_TENSOR
33 | template <>
34 | struct logSigmoid_updateOutput_functor<half> {
35 |   __device__ __forceinline__ void operator()(half* output, const half *input) const {
36 | #ifdef CUDA_HALF_INSTRUCTIONS
37 |     const half max = fmaxType(__float2half(0.f), __hneg(*input));
38 |     const half z = THCNumerics<half>::exp(__hneg(max)) + THCNumerics<half>::exp(__hneg(*input) - max);
39 |     *output = __hneg(max + THCNumerics<half>::log(z));
40 | #else
41 |     float in = __half2float(*input);
42 |     float max = fmaxType(0.f, -in);
43 |     float z = THCNumerics<float>::exp(-max) + THCNumerics<float>::exp(-in - max);
44 |     *output = __float2half(-(max + THCNumerics<float>::log(z)));
45 | #endif
46 |   }
47 | };
48 |
49 | template <>
50 | struct logSigmoid_updateGradInput_functor<half> {
51 |   __device__ __forceinline__ void operator()(half* gradInput, const half *input, const half *gradOutput) const {
52 | #ifdef CUDA_HALF_INSTRUCTIONS
53 |     const half one = __float2half(1.f);
54 |     const half zero = __float2half(0.f);
55 |     const half max = fmaxType(zero, __hneg(*input));
56 |     const half z = THCNumerics<half>::exp(__hneg(max)) + THCNumerics<half>::exp(__hneg(*input) - max);
57 |     half max_deriv = zero;
58 |     half sign = __hneg(one);
59 |     if(*input < zero){
60 |       max_deriv = __hneg(one);
61 |       sign = one;
62 |     }
63 |     *gradInput = __hmul(*gradOutput, (__hneg(max_deriv) - __hmul(sign, __hdiv(z - one, z))));
64 | #else
65 |     const float in = __half2float(*input);
66 |     const float max = fmaxType(0.f, -in);
67 |     const float z = THCNumerics<float>::exp(-max) + THCNumerics<float>::exp(-in - max);
68 |     const float go = __half2float(*gradOutput);
69 |     float max_deriv = 0.f;
70 |     float sign = -1.f;
71 |     if(in < 0.f){
72 |       max_deriv = -1.f;
73 |       sign = 1.f;
74 |     }
75 |     *gradInput = __float2half(go * (-max_deriv - sign*((z - 1.f)/z)));
76 | #endif
77 |   }
78 | };
79 | #endif
80 |
81 | #include "generic/LogSigmoid.cu"
82 | #include "THCGenerateFloatTypes.h"
83 |
--------------------------------------------------------------------------------
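
The criterion files in this tree (AbsCriterion, SoftMarginCriterion, MarginCriterion, MSECriterion, and DistKLDivCriterion above, SmoothL1Criterion below) share one reduction idiom: `thrust::inner_product` fuses the per-element loss functor with a plus-reduction in a single pass over `(input, target)`, and `sizeAverage` just divides the resulting sum by the element count. A self-contained sketch of that idiom with an MSE-style functor; this is a standalone example, not the THCUNN code path:

```cpp
#include <cstdio>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>

// Per-element loss, mirroring mse_functor: (x - y)^2.
struct mse_op {
  __host__ __device__ float operator()(float x, float y) const {
    float z = x - y;
    return z * z;
  }
};

int main() {
  thrust::device_vector<float> input(4, 0.5f);
  thrust::device_vector<float> target(4, 0.0f);
  // Fused transform + reduce: sum_i mse_op(input[i], target[i]).
  float sum = thrust::inner_product(input.begin(), input.end(),
                                    target.begin(), 0.0f,
                                    thrust::plus<float>(), mse_op());
  printf("sum = %f, sizeAverage = %f\n", sum, sum / input.size());
  return 0;
}
```

The matching backward passes use `thrust::transform` with an `updateGradInput` functor that bakes the `1/size` (or `2/size` for MSE) normalization into a single scalar, exactly as the `norm` variables above do.
--------------------------------------------------------------------------------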
/lib/THCUNN/generic/SmoothL1Criterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu" 3 | #else 4 | 5 | void THNN_(SmoothL1Criterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage) 11 | { 12 | THCUNN_check_nElement(state, input, target); 13 | THCUNN_check_dim_size(state, output, 1, 0, 1); 14 | THCUNN_assertSameGPU(state, 2, input, target); 15 | THArgCheck( 16 | THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, 17 | "input and target need to have the same number of elements" 18 | ); 19 | 20 | ptrdiff_t size = THCTensor_(nElement)(state, input); 21 | 22 | input = THCTensor_(newContiguous)(state, input); 23 | target = THCTensor_(newContiguous)(state, target); 24 | 25 | THCThrustAllocator thrustAlloc(state); 26 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 27 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 28 | accreal sum = thrust::inner_product( 29 | #if CUDA_VERSION >= 7000 30 | thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), 31 | #endif 32 | input_data, input_data+size, target_data, (accreal) 0, 33 | thrust::plus(), smoothl1_functor() 34 | ); 35 | 36 | if (sizeAverage) 37 | sum /= size; 38 | 39 | THCTensor_(free)(state, input); 40 | THCTensor_(free)(state, target); 41 | 42 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 43 | } 44 | 45 | void THNN_(SmoothL1Criterion_updateGradInput)( 46 | THCState *state, 47 | THCTensor *input, 48 | THCTensor *target, 49 | THCTensor *gradInput, 50 | bool sizeAverage) 51 | { 52 | THCUNN_check_nElement(state, input, target); 53 | THCUNN_assertSameGPU(state, 3, input, target, gradInput); 54 | THArgCheck( 55 | THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2, 56 | "input and target need to have the same number of elements" 57 | ); 58 | 59 | ptrdiff_t size = THCTensor_(nElement)(state, input); 60 | real norm = ScalarConvert::to(sizeAverage ? 
accreal(1)/size : accreal(1)); 61 | 62 | input = THCTensor_(newContiguous)(state, input); 63 | target = THCTensor_(newContiguous)(state, target); 64 | 65 | THCTensor_(resizeAs)(state, gradInput, input); 66 | 67 | THCThrustAllocator thrustAlloc(state); 68 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 69 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 70 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 71 | 72 | thrust::transform( 73 | #if CUDA_VERSION >= 7000 74 | thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)), 75 | #endif 76 | input_data, input_data+size, target_data, gradInput_data, 77 | smoothl1_updateGradInput_functor(norm) 78 | ); 79 | 80 | THCTensor_(free)(state, input); 81 | THCTensor_(free)(state, target); 82 | } 83 | 84 | #endif 85 | -------------------------------------------------------------------------------- /lib/THCUNN/MultiMarginCriterion.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCHalf.h" 4 | #include "THCHalfAutoNumerics.cuh" 5 | 6 | #define MULTIMARGIN_THREADS 128 7 | 8 | template 9 | __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin) 10 | { 11 | __shared__ Acctype buffer[MULTIMARGIN_THREADS]; 12 | int k = blockIdx.x; 13 | Dtype *input_k = input + k*dim; 14 | Dtype *output_k = output + k; 15 | int target_k = ((int)target[k]) - TH_INDEX_BASE; 16 | Dtype input_target_k = input_k[target_k]; 17 | 18 | int i_start = threadIdx.x; 19 | int i_end = dim; 20 | int i_step = blockDim.x; 21 | 22 | buffer[threadIdx.x] = 0; 23 | for (int i = i_start; i < i_end; i += i_step) 24 | { 25 | Dtype z = margin - input_target_k + input_k[i]; 26 | if (i == target_k) 27 | continue; 28 | 29 | if (z > 0) { 30 | Dtype h = (P==1) ? z : z*z; 31 | if(weights) 32 | h *= weights[target_k]; 33 | buffer[threadIdx.x] += h; 34 | } 35 | } 36 | __syncthreads(); 37 | 38 | // reduce 39 | if (threadIdx.x == 0) 40 | { 41 | Acctype sum = 0; 42 | for (int i=0; i < blockDim.x; i++) 43 | sum += buffer[i]; 44 | 45 | *output_k = ScalarConvert::to(sum/dim); 46 | if(sizeAverage) 47 | *output_k /= nframe; 48 | } 49 | } 50 | 51 | template 52 | __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin) 53 | { 54 | __shared__ Acctype buffer[MULTIMARGIN_THREADS]; 55 | int k = blockIdx.x; 56 | Dtype *input_k = input + k*dim; 57 | Dtype *gradInput_k = gradInput + k*dim; 58 | int target_k = ((int)target[k]) - TH_INDEX_BASE; 59 | Dtype input_target_k = input_k[target_k]; 60 | Acctype g = (sizeAverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim)); 61 | 62 | int i_start = threadIdx.x; 63 | int i_end = dim; 64 | int i_step = blockDim.x; 65 | 66 | buffer[threadIdx.x] = 0; 67 | for (int i=i_start; i 0) 74 | { 75 | Dtype h = ScalarConvert::to((P == 1) ? 
g : 2*g*z); 76 | if(weights) 77 | h *= weights[target_k]; 78 | buffer[threadIdx.x] -= h; 79 | gradInput_k[i] = h; 80 | } 81 | else 82 | gradInput_k[i] = ScalarConvert::to(0); 83 | } 84 | 85 | __syncthreads(); 86 | 87 | // reduce 88 | if (threadIdx.x == 0) 89 | { 90 | Acctype gradInput_target_k = 0; 91 | for (int i=0; i::to(gradInput_target_k); 94 | } 95 | } 96 | 97 | #include "generic/MultiMarginCriterion.cu" 98 | #include "THCGenerateFloatTypes.h" 99 | 100 | #undef MULTIMARGIN_THREADS 101 | -------------------------------------------------------------------------------- /lib/THCUNN/SpatialReflectionPadding.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCDeviceTensor.cuh" 4 | #include "THCDeviceTensorUtils.cuh" 5 | #include "THCDeviceUtils.cuh" 6 | #include "THCReduceApplyUtils.cuh" 7 | #include 8 | 9 | #include "THCHalf.h" 10 | #include "THCHalfAutoNumerics.cuh" 11 | #include "THCAtomics.cuh" 12 | 13 | template 14 | __global__ void SpatialReflectionPadding_updateOutput( 15 | THCDeviceTensor input, 16 | THCDeviceTensor output, 17 | int padT, int padB, int padL, int padR) { 18 | 19 | int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; 20 | int plane = blockIdx.y; 21 | int batch = blockIdx.z; 22 | if (outputPointId >= output.getSize(2) * output.getSize(3)) { 23 | return; 24 | } 25 | int outputPointX = outputPointId % output.getSize(3); 26 | int outputPointY = outputPointId / output.getSize(3); 27 | 28 | int iStartX = max(0, -padL); 29 | int iStartY = max(0, -padT); 30 | int oStartX = max(0, padL); 31 | int oStartY = max(0, padT); 32 | 33 | int inputPointX = abs(outputPointX - padL) 34 | - abs(outputPointX - (input.getSize(3) + padL - 1)) 35 | - outputPointX 36 | + 2 * padL + input.getSize(3) - 1 37 | - oStartX + iStartX; 38 | 39 | int inputPointY = abs(outputPointY - padT) 40 | - abs(outputPointY - (input.getSize(2) + padT - 1)) 41 | - outputPointY 42 | + 2 * padT + input.getSize(2) - 1 43 | - oStartY + iStartY; 44 | 45 | Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX]; 46 | output[batch][plane][outputPointY][outputPointX] = valueToCopy; 47 | } 48 | 49 | template 50 | __global__ void SpatialReflectionPadding_updateGradInput( 51 | THCDeviceTensor gradInput, 52 | THCDeviceTensor gradOutput, 53 | int padT, int padB, int padL, int padR) { 54 | 55 | int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; 56 | int plane = blockIdx.y; 57 | int batch = blockIdx.z; 58 | if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) { 59 | return; 60 | } 61 | int outputPointX = outputPointId % gradOutput.getSize(3); 62 | int outputPointY = outputPointId / gradOutput.getSize(3); 63 | 64 | int iStartX = max(0, -padL); 65 | int iStartY = max(0, -padT); 66 | int oStartX = max(0, padL); 67 | int oStartY = max(0, padT); 68 | 69 | int inputPointX = abs(outputPointX - padL) 70 | - abs(outputPointX - (gradInput.getSize(3) + padL - 1)) 71 | - outputPointX 72 | + 2 * padL + gradInput.getSize(3) - 1 73 | - oStartX + iStartX; 74 | 75 | int inputPointY = abs(outputPointY - padT) 76 | - abs(outputPointY - (gradInput.getSize(2) + padT - 1)) 77 | - outputPointY 78 | + 2 * padT + gradInput.getSize(2) - 1 79 | - oStartY + iStartY; 80 | 81 | Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX]; 82 | atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy); 83 | } 84 | 85 | #include "generic/SpatialReflectionPadding.cu" 86 | #include 
"THCGenerateFloatTypes.h" 87 | -------------------------------------------------------------------------------- /lib/THCUNN/SpatialClassNLLCriterion.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "THCHalf.h" 3 | #include "THCHalfAutoNumerics.cuh" 4 | #include "THCAtomics.cuh" 5 | #include "common.h" 6 | #include 7 | 8 | #include 9 | 10 | template 11 | __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel( 12 | T *output, 13 | T *total_weight, 14 | T *input, 15 | THCIndex_t *target, 16 | T *weights, 17 | int size_average, 18 | int batch_size, 19 | int n_classes, 20 | int map_nelem, 21 | int blocks_per_sample) 22 | { 23 | __shared__ AccumT partial_sums[CUDA_NUM_THREADS]; 24 | 25 | int i, t; 26 | T cur_weight; 27 | AccumT input_sum = 0; 28 | AccumT acc_weight = 0; 29 | 30 | int sample = blockIdx.x / blocks_per_sample; 31 | int toffset = sample * map_nelem; 32 | int ioffset = sample * map_nelem * n_classes; 33 | int step = blockDim.x * blocks_per_sample; 34 | for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; 35 | i < map_nelem; 36 | i += step) { 37 | t = target[toffset + i] - TH_INDEX_BASE; 38 | assert(t >= 0 && t < n_classes); 39 | cur_weight = weights ? weights[t] : ScalarConvert::to(1); 40 | input_sum -= input[ioffset + i + map_nelem * t] * cur_weight; 41 | acc_weight += cur_weight; 42 | } 43 | 44 | __syncthreads(); 45 | 46 | input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus(), AccumT(0)); 47 | acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus(), AccumT(0)); 48 | 49 | if (threadIdx.x == 0) { 50 | atomicAdd(total_weight, ScalarConvert::to(acc_weight)); 51 | atomicAdd(output, ScalarConvert::to(input_sum)); 52 | } 53 | } 54 | 55 | template 56 | __global__ void cunn_SpatialClassNLLCriterion_sizeAverage_kernel( 57 | T *output, 58 | T *total_weight) 59 | { 60 | if (*total_weight > 0) 61 | *output = THCNumerics::div(*output, *total_weight); 62 | } 63 | 64 | template 65 | __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel( 66 | T *gradInput, 67 | THCIndex_t *target, 68 | T *weights, 69 | T *total_weight, 70 | int size_average, 71 | int batch_size, 72 | int n_classes, 73 | int map_nelem, 74 | int blocks_per_sample) 75 | { 76 | if (*total_weight <= 0) 77 | return; 78 | 79 | int i, t; 80 | T norm = size_average ? (ScalarConvert::to(1) / *total_weight) : ScalarConvert::to(1); 81 | 82 | int sample = blockIdx.x / blocks_per_sample; 83 | int step = blockDim.x * blocks_per_sample; 84 | int toffset = sample * map_nelem; 85 | int ioffset = sample * map_nelem * n_classes; 86 | for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x; 87 | i < map_nelem; 88 | i += step) { 89 | t = (int)target[toffset + i] - TH_INDEX_BASE; 90 | assert(t >= 0 && t < n_classes); 91 | gradInput[ioffset + i + map_nelem * t] = -(weights ? 
weights[t] : ScalarConvert::to(1)) * norm; 92 | } 93 | } 94 | 95 | #include "generic/SpatialClassNLLCriterion.cu" 96 | #include "THCGenerateFloatTypes.h" 97 | -------------------------------------------------------------------------------- /lib/THCUNN/RReLU.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "THCHalf.h" 3 | #include "THCHalfAutoNumerics.cuh" 4 | #include 5 | #include "common.h" 6 | #include 7 | #include 8 | 9 | // copied from cutorch/lib/THC/THCTensorRandom.cu 10 | #define MAX_NUM_BLOCKS 64 11 | #define BLOCK_SIZE 256 12 | #define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS) 13 | 14 | template 15 | inline T __device__ curand_uniform_type(curandStateMtgp32 *state); 16 | 17 | #ifdef CUDA_HALF_TENSOR 18 | template <> 19 | inline half __device__ curand_uniform_type(curandStateMtgp32 *state) { 20 | return ScalarConvert::to(curand_uniform(state)); 21 | } 22 | #endif 23 | 24 | template <> 25 | inline float __device__ curand_uniform_type(curandStateMtgp32 *state) { 26 | return curand_uniform(state); 27 | } 28 | 29 | template <> 30 | inline double __device__ curand_uniform_type(curandStateMtgp32 *state) { 31 | return curand_uniform_double(state); 32 | } 33 | 34 | template 35 | __global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state, 36 | T *input, T* noise, T *output, double a, double b) 37 | { 38 | CUDA_KERNEL_LOOP(i, n) 39 | { 40 | if (input[i] <= 0) 41 | { 42 | T r = curand_uniform_type(&state[blockIdx.x]); 43 | r = ScalarConvert::to(r * (b-a) + a); 44 | output[i] = input[i] * r; 45 | noise[i] = r; 46 | } 47 | else 48 | { 49 | output[i] = input[i]; 50 | noise[i] = ScalarConvert::to(1); 51 | } 52 | } 53 | } 54 | 55 | template 56 | struct RReLUUpdateOutputEval_functor 57 | { 58 | const T negSlope_; 59 | 60 | RReLUUpdateOutputEval_functor(T negSlope) 61 | : negSlope_(negSlope) 62 | {} 63 | 64 | __device__ __forceinline__ void operator()(T *out, T *in) 65 | { 66 | const T x = *in; 67 | const T r = x <= 0 ? negSlope_ : ScalarConvert::to(1); 68 | *out = x * r; 69 | } 70 | }; 71 | 72 | template 73 | struct RReLUUpdateOutputEvalIP_functor 74 | { 75 | const T negSlope_; 76 | 77 | RReLUUpdateOutputEvalIP_functor(T negSlope) 78 | : negSlope_(negSlope) 79 | {} 80 | 81 | __device__ __forceinline__ void operator()(T *x) 82 | { 83 | if (*x <= 0) 84 | { 85 | *x = *x * negSlope_; 86 | } 87 | } 88 | }; 89 | 90 | template 91 | struct RReLUupdateGradInputEval_functor 92 | { 93 | const T negSlope_; 94 | 95 | RReLUupdateGradInputEval_functor(T negSlope) 96 | : negSlope_(negSlope) 97 | {} 98 | 99 | __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in) 100 | { 101 | *gradIn = (*in) <= 0 ? 
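/* eval-mode RReLU is deterministic: non-positive inputs are scaled by
   negSlope_ = (lower + upper) / 2, the expected value of the random
   training-time slope */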
(*gradOut) * negSlope_ : (*gradOut); 102 | } 103 | }; 104 | 105 | template 106 | struct RReLUupdateGradInputEvalIP_functor 107 | { 108 | const T negSlope_; 109 | 110 | RReLUupdateGradInputEvalIP_functor(T negSlope) 111 | : negSlope_(negSlope) 112 | {} 113 | 114 | __device__ __forceinline__ void operator()(T *gradOut, T *in) 115 | { 116 | if (*in <= 0) 117 | { 118 | *gradOut = (*gradOut) * negSlope_; 119 | } 120 | } 121 | }; 122 | 123 | #include "generic/RReLU.cu" 124 | #include "THCGenerateFloatTypes.h" 125 | -------------------------------------------------------------------------------- /lib/THCUNN/VolumetricReplicationPadding.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCDeviceTensor.cuh" 4 | #include "THCDeviceTensorUtils.cuh" 5 | #include "THCDeviceUtils.cuh" 6 | #include "THCReduceApplyUtils.cuh" 7 | #include "THCHalf.h" 8 | #include "THCHalfAutoNumerics.cuh" 9 | #include "THCAtomics.cuh" 10 | #include 11 | 12 | template 13 | __global__ void VolumetricReplicationPadding_updateOutput( 14 | THCDeviceTensor input, 15 | THCDeviceTensor output, 16 | int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { 17 | 18 | int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; 19 | int plane = blockIdx.y; 20 | int batch = blockIdx.z; 21 | if (outputPointId >= (output.getSize(2) * output.getSize(3) * 22 | output.getSize(4))) { 23 | return; 24 | } 25 | int outputPointX = outputPointId % output.getSize(4); 26 | int outputPointY = (outputPointId / output.getSize(4)) % output.getSize(3); 27 | int outputPointZ = outputPointId / (output.getSize(3) * output.getSize(4)); 28 | 29 | int iStartX = max(0, -pleft); 30 | int iStartY = max(0, -ptop); 31 | int iStartZ = max(0, -pfront); 32 | int oStartX = max(0, pleft); 33 | int oStartY = max(0, ptop); 34 | int oStartZ = max(0, pfront); 35 | 36 | int inputPointX = min(max(pleft, outputPointX), 37 | input.getSize(4) + pleft - 1) - oStartX + iStartX; 38 | int inputPointY = min(max(ptop, outputPointY), 39 | input.getSize(3) + ptop - 1) - oStartY + iStartY; 40 | int inputPointZ = min(max(pfront, outputPointZ), 41 | input.getSize(2) + pfront - 1) - oStartZ + iStartZ; 42 | 43 | Dtype valueToCopy = 44 | input[batch][plane][inputPointZ][inputPointY][inputPointX]; 45 | output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy; 46 | } 47 | 48 | template 49 | __global__ void VolumetricReplicationPadding_updateGradInput( 50 | THCDeviceTensor gradInput, 51 | THCDeviceTensor gradOutput, 52 | int pfront, int pback, int ptop, int pbottom, int pleft, int pright) { 53 | int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; 54 | int plane = blockIdx.y; 55 | int batch = blockIdx.z; 56 | 57 | if (outputPointId >= (gradOutput.getSize(2) * gradOutput.getSize(3) * 58 | gradOutput.getSize(4))) { 59 | return; 60 | } 61 | int outputPointX = outputPointId % gradOutput.getSize(4); 62 | int outputPointY = (outputPointId / gradOutput.getSize(4)) % 63 | gradOutput.getSize(3); 64 | int outputPointZ = outputPointId / (gradOutput.getSize(3) * 65 | gradOutput.getSize(4)); 66 | 67 | int iStartX = max(0, -pleft); 68 | int iStartY = max(0, -ptop); 69 | int iStartZ = max(0, -pfront); 70 | int oStartX = max(0, pleft); 71 | int oStartY = max(0, ptop); 72 | int oStartZ = max(0, pfront); 73 | 74 | int inputPointX = min(max(pleft, outputPointX), 75 | gradInput.getSize(4) + pleft - 1) - oStartX + iStartX; 76 | int inputPointY = min(max(ptop, outputPointY), 77 | 
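/* clamp each output coordinate into the valid input range: every output point
   in the replicated border maps back to the nearest edge input element, so
   several outputs can hit the same input and the write below uses atomicAdd */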
gradInput.getSize(3) + ptop - 1) - oStartY + iStartY;
78 |   int inputPointZ = min(max(pfront, outputPointZ),
79 |                         gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ;
80 | 
81 |   Dtype valueToCopy =
82 |       gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX];
83 |   atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX],
84 |             valueToCopy);
85 | }
86 | 
87 | 
88 | #include "generic/VolumetricReplicationPadding.cu"
89 | #include "THCGenerateFloatTypes.h"
90 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | # CUDA backend for the Neural Network Package #
3 | 
4 | This package provides CUDA implementations for many of the modules in the base neural-network package, [nn](https://github.com/torch/nn/blob/master/README.md).
5 | * [Modules](doc/cunnmodules.md#nn.cunnmodules.dok): additional GPU-specific modules that have no counterpart in the base nn package.
6 | 
7 | ## Installing from source
8 | ```bash
9 | git clone https://github.com/torch/cunn
10 | cd cunn
11 | luarocks make rocks/cunn-scm-1.rockspec
12 | ```
13 | 
14 | ## To use
15 | 
16 | Simply convert your network model to CUDA by calling `:cuda()`:
17 | ```lua
18 | local model = nn.Sequential()
19 | model:add(nn.Linear(2,2))
20 | model:add(nn.LogSoftMax())
21 | 
22 | model:cuda() -- convert model to CUDA
23 | ```
24 | 
25 | ... and similarly for your tensors:
26 | ```lua
27 | local input = torch.Tensor(32,2):uniform()
28 | input = input:cuda()
29 | local output = model:forward(input)
30 | ```
31 | ... or create them directly as `CudaTensor`s:
32 | ```lua
33 | local input = torch.CudaTensor(32,2):uniform()
34 | local output = model:forward(input)
35 | ```
36 | 
37 | ## To run unit-tests
38 | 
39 | ```bash
40 | luajit -l cunn -e 'cunn.test()'
41 | ```
42 | 
43 | ## GPU Training Concepts
44 | 
45 | __Performance__
46 | 
47 | * data should be transferred between main memory and the GPU in batches; otherwise the transfer time is dominated
48 |   by per-transfer latency and launch overhead, rather than by bandwidth
49 | * therefore, train and predict using mini-batches
50 | * allocating GPU memory forces a synchronization point, which noticeably affects performance
51 | * therefore, try to allocate any `CudaTensor`s once, at the start of the program,
52 |   and then simply copy data back and forth
53 |   between main memory and the existing `CudaTensor`s
54 | * similarly, try to avoid any operations that implicitly allocate new tensors. For example, if you write:
55 | ```lua
56 | require 'cutorch'
57 | 
58 | local a = torch.CudaTensor(1000):uniform()
59 | for it=1,1000 do
60 |   local b = torch.add(a, 1)
61 | end
62 | ```
63 | ... this will allocate one thousand new `CudaTensor`s, one for each call to `torch.add(a, 1)`.
64 | 
65 | Instead, use this form:
66 | ```lua
67 | require 'cutorch'
68 | 
69 | local a = torch.CudaTensor(1000):uniform()
70 | local b = torch.CudaTensor(1000):uniform()
71 | for it=1,1000 do
72 |   b:add(a, 1)
73 | end
74 | ```
75 | In this form, `b` is allocated only once, before the loop. Each `b:add(a, 1)` then performs
76 | the add inside a GPU kernel and stores the result into the existing `b` `CudaTensor`. This
77 | generally runs noticeably faster, is far less likely to eat up arbitrary amounts of memory,
78 | and is less likely to need frequent calls to `collectgarbage(); collectgarbage()`.
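The same allocate-once pattern applies to feeding mini-batches. Below is a minimal sketch, not part of the library itself: the model is the two-layer example from above, and the shapes, batch size, and random `trainData` stand-in are purely illustrative:
```lua
require 'cunn'

local model = nn.Sequential()
model:add(nn.Linear(2,2))
model:add(nn.LogSoftMax())
model:cuda()

local batchSize = 32
local trainData = torch.FloatTensor(1024, 2):uniform() -- stand-in for a real dataset
local gpuBatch  = torch.CudaTensor(batchSize, 2)       -- allocated once, reused below

for i = 1, trainData:size(1) / batchSize do
  -- copy into the existing GPU buffer instead of allocating a new tensor
  gpuBatch:copy(trainData:narrow(1, (i - 1) * batchSize + 1, batchSize))
  local output = model:forward(gpuBatch)
  -- the backward pass and parameter update would go here
end
```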
79 | 
80 | __Benchmarking__
81 | 
82 | * GPU operations are asynchronous: host execution typically continues as soon as an instruction has been issued
83 | * e.g., if you do:
84 | ```lua
85 | require 'cutorch'
86 | local a = torch.CudaTensor(1000,1000):uniform()
87 | a:add(1)
88 | ```
89 | ... then `a:add(1)` only schedules the GPU kernel that adds 1 for launch. The kernel might not have completed,
90 | or even reached the GPU, by the time `a:add(1)` returns
91 | * therefore, for wall-clock timings, call `cutorch.synchronize()` before each timing
92 |   checkpoint:
93 | ```lua
94 | require 'cutorch'
95 | require 'sys'
96 | 
97 | local a = torch.CudaTensor(1000,1000):uniform()
98 | cutorch.synchronize()
99 | local start = sys.tic()
100 | a:add(1)
101 | cutorch.synchronize()
102 | print(sys.toc())
103 | ```
104 | 
105 | 
-------------------------------------------------------------------------------- /lib/THCUNN/SoftMax.cu: --------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "THCHalf.h"
3 | #include "THCHalfAutoNumerics.cuh"
4 | 
5 | #define SOFTMAX_THREADS 128
6 | 
7 | template <typename T, typename AccumT>
8 | __global__ void cunn_SoftMax_updateOutput_kernel(
9 |   T *output, T *input, int nframe, int dim, int stride0, int stride1)
10 | {
11 |   __shared__ AccumT buffer[SOFTMAX_THREADS+1];
12 |   T *input_k = input + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
13 |   T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
14 | 
15 |   int i_start = threadIdx.x;
16 |   int i_end = dim;
17 |   int i_step = blockDim.x;
18 | 
19 |   // max?
20 |   buffer[threadIdx.x] = -THCNumerics<AccumT>::max();
21 |   for (int i=i_start; i<i_end; i+=i_step)
22 |   {
23 |     T z = input_k[i*stride0];
24 |     AccumT zAcc = ScalarConvert<T, AccumT>::to(z);
25 |     if (buffer[threadIdx.x] < zAcc)
26 |       buffer[threadIdx.x] = zAcc;
27 |   }
28 | 
29 | 
30 |   __syncthreads();
31 | 
32 |   // reduce
33 |   if (threadIdx.x == 0)
34 |   {
35 |     AccumT max_k = -THCNumerics<AccumT>::max();
36 |     for (int i=0; i<blockDim.x; i++)
37 |     {
38 |       if (max_k < buffer[i])
39 |         max_k = buffer[i];
40 |     }
41 |     buffer[SOFTMAX_THREADS] = max_k;
42 |   }
43 | 
44 |   __syncthreads();
45 | 
46 |   // sum?
47 |   T max_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
48 |   buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
49 |   for (int i=i_start; i<i_end; i+=i_step) {
50 |     T z = THCNumerics<T>::exp(input_k[i*stride0]-max_k);
51 |     buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(z);
52 |     output_k[i*stride0] = z;
53 |   }
54 | 
55 |   __syncthreads();
56 | 
57 |   // reduce
58 |   if (threadIdx.x == 0)
59 |   {
60 |     AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
61 |     for (int i=0; i<blockDim.x; i++)
62 |       sum_k += buffer[i];
63 |     buffer[SOFTMAX_THREADS] = sum_k;
64 |   }
65 | 
66 |   __syncthreads();
67 | 
68 |   // softmax
69 |   T sum_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
70 |   for (int i=i_start; i<i_end; i+=i_step)
71 |     output_k[i*stride0] = output_k[i*stride0] / sum_k;
72 | }
73 | 
74 | template <typename T, typename AccumT>
75 | __global__ void cunn_SoftMax_updateGradInput_kernel(
76 |   T *gradInput, T *output, T *gradOutput, int nframe, int dim, int stride0, int stride1)
77 | {
78 |   __shared__ AccumT buffer[SOFTMAX_THREADS];
79 |   T *gradInput_k = gradInput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
80 |   T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
81 |   T *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
82 | 
83 |   int i_start = threadIdx.x;
84 |   int i_end = dim;
85 |   int i_step = blockDim.x;
86 | 
87 |   // sum?
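/* softmax backward: gradInput_i = y_i * (gradOutput_i - sum_j gradOutput_j * y_j).
   Each thread first accumulates its share of the dot product gradOutput . output
   into buffer[threadIdx.x]; thread 0 then reduces the per-thread partials below. */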
88 | buffer[threadIdx.x] = ScalarConvert::to(0); 89 | for (int i=i_start; i::to(gradOutput_k[i*stride0] * output_k[i*stride0]); 91 | 92 | __syncthreads(); 93 | 94 | // reduce 95 | if (threadIdx.x == 0) 96 | { 97 | AccumT sum_k = ScalarConvert::to(0); 98 | for (int i=0; i::to(buffer[0]); 106 | for (int i=i_start; i 8 | __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data, 9 | const int num, const int channels, const int height, 10 | const int width, const int pooled_height, const int pooled_width, 11 | const int kernel_h, const int kernel_w, const int stride_h, 12 | const int stride_w, const int pad_h, const int pad_w, 13 | const int dilation_h, const int dilation_w, Dtype* top_data, 14 | long* top_mask) { 15 | CUDA_KERNEL_LOOP(index, nthreads) { 16 | int pw = index % pooled_width; 17 | int ph = (index / pooled_width) % pooled_height; 18 | int c = (index / pooled_width / pooled_height) % channels; 19 | int n = index / pooled_width / pooled_height / channels; 20 | int hstart = ph * stride_h - pad_h; 21 | int wstart = pw * stride_w - pad_w; 22 | int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height); 23 | int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width); 24 | while(hstart < 0) 25 | hstart += dilation_h; 26 | while(wstart < 0) 27 | wstart += dilation_w; 28 | AccType maxval = THCNumerics::min(); 29 | int maxidx = -1; 30 | bottom_data += (n * channels + c) * height * width; 31 | for (int h = hstart; h < hend; h += dilation_h) { 32 | for (int w = wstart; w < wend; w += dilation_w) { 33 | if (ScalarConvert::to(bottom_data[h * width + w]) > maxval) { 34 | maxidx = h * width + w; 35 | maxval = ScalarConvert::to(bottom_data[maxidx]); 36 | } 37 | } 38 | } 39 | top_data[index] = ScalarConvert::to(maxval); 40 | top_mask[index] = maxidx + TH_INDEX_BASE; 41 | } 42 | } 43 | 44 | 45 | template 46 | __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff, 47 | const long* top_mask, const int num, const int channels, 48 | const int height, const int width, const int pooled_height, 49 | const int pooled_width, const int kernel_h, const int kernel_w, 50 | const int stride_h, const int stride_w, const int pad_h, const int pad_w, 51 | const int dilation_h, const int dilation_w, 52 | Dtype* bottom_diff) { 53 | CUDA_KERNEL_LOOP(index, nthreads) { 54 | // find out the local index 55 | // find out the local offset 56 | int w = index % width; 57 | int h = (index / width) % height; 58 | int c = (index / width / height) % channels; 59 | int n = index / width / height / channels; 60 | int phstart = 61 | (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1; 62 | int phend = min((h + pad_h) / stride_h + 1, pooled_height); 63 | int pwstart = 64 | (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 
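/* pwstart: index of the first pooled window whose dilated extent still covers
   padded input column w + pad_w; 0 if it lies inside the very first window,
   otherwise the inverse of the stride mapping, rounded to the next window */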
0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1; 65 | int pwend = min((w + pad_w) / stride_w + 1, pooled_width); 66 | 67 | AccType gradient = AccType(0); 68 | int offset = (n * channels + c) * pooled_height * pooled_width; 69 | top_diff += offset; 70 | top_mask += offset; 71 | for (int ph = phstart; ph < phend; ++ph) { 72 | for (int pw = pwstart; pw < pwend; ++pw) { 73 | if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) { 74 | gradient += ScalarConvert::to(top_diff[ph * pooled_width + pw]); 75 | } 76 | } 77 | } 78 | bottom_diff[index] = ScalarConvert::to(gradient); 79 | } 80 | } 81 | 82 | #include "generic/SpatialDilatedMaxPooling.cu" 83 | #include "THCGenerateFloatTypes.h" 84 | -------------------------------------------------------------------------------- /lib/THCUNN/common.h: -------------------------------------------------------------------------------- 1 | #ifndef THCUNN_COMMON_H 2 | #define THCUNN_COMMON_H 3 | 4 | // CUDA: grid stride looping 5 | #define CUDA_KERNEL_LOOP(i, n) \ 6 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) 7 | 8 | #define THCUNN_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \ 9 | "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.") 10 | 11 | // Use 1024 threads per block, which requires cuda sm_2x or above 12 | const int CUDA_NUM_THREADS = 1024; 13 | 14 | // CUDA: number of blocks for threads. 15 | inline int GET_BLOCKS(const int N) 16 | { 17 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 18 | } 19 | 20 | #define THCUNN_resizeAs_indices(STATE, I1, I2) \ 21 | THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ 22 | if (!THCIndexTensor_(isSize)(STATE, I1, size2)) \ 23 | { \ 24 | THCudaLongTensor_resize(STATE, I1, size2, NULL); \ 25 | } \ 26 | THLongStorage_free(size2); 27 | 28 | #define THCUNN_check_shape(STATE, I1, I2) \ 29 | if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \ 30 | { \ 31 | THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \ 32 | THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ 33 | THError(#I1 " and " #I2 " shapes do not match: " \ 34 | #I1 " %s, " #I2 " %s", s1.str, s2.str); \ 35 | } 36 | 37 | 38 | #define THCUNN_check_shape_indices(STATE, I1, I2) \ 39 | THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2); \ 40 | if (!THCIndexTensor_(isSize)(STATE, I1, size2)) \ 41 | { \ 42 | THCDescBuff s1 = THCIndexTensor_(sizeDesc)(STATE, I1); \ 43 | THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \ 44 | THError(#I1 " and " #I2 " shapes do not match: " \ 45 | #I1 " %s, " #I2 " %s", s1.str, s2.str); \ 46 | } \ 47 | THLongStorage_free(size2); 48 | 49 | #define THCUNN_check_nElement(STATE, I1, I2) \ 50 | if (I1 != NULL && I2 != NULL ) { \ 51 | ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \ 52 | ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \ 53 | if (n1 != n2) \ 54 | { \ 55 | THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \ 56 | THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \ 57 | THError(#I1 " and " #I2 " have different number of elements: " \ 58 | #I1 "%s has %ld elements, while " \ 59 | #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \ 60 | } \ 61 | } 62 | 63 | #define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \ 64 | if (THCTensor_(nDimension)(STATE, T) != DIM || \ 65 | THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ 66 | THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ 67 | THError("Need " #T " of dimension %d and " #T 
".size[%d] == %d" \ 68 | " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ 69 | } 70 | 71 | #define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE) \ 72 | if (THCIndexTensor_(nDimension)(STATE, T) != DIM || \ 73 | THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) { \ 74 | THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T); \ 75 | THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \ 76 | " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \ 77 | } 78 | 79 | #define THCUNN_argCheck(STATE, COND, ARG, T, FORMAT) \ 80 | if (!(COND)) { \ 81 | THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \ 82 | THArgCheck(COND, ARG, FORMAT, s1.str); \ 83 | } 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/RReLU.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/RReLU.cu" 3 | #else 4 | 5 | #include "../common.h" 6 | 7 | void THNN_(RReLU_updateOutput)( 8 | THCState *state, 9 | THCTensor *input, 10 | THCTensor *output, 11 | THCTensor *noise, 12 | double lower, 13 | double upper, 14 | bool train, 15 | bool inplace, 16 | void *generator) 17 | { 18 | THCUNN_assertSameGPU(state, 3, input, output, noise); 19 | struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state); 20 | 21 | if (train) 22 | { 23 | input = THCTensor_(newContiguous)(state, input); 24 | THCTensor_(resizeAs)(state, noise, input); 25 | real *input_data = THCTensor_(data)(state, input); 26 | real *noise_data = THCTensor_(data)(state, noise); 27 | ptrdiff_t n = THCTensor_(nElement)(state, input); 28 | if (inplace) 29 | { 30 | rreluUpdateOutputTrain<<>>( 31 | n, gen_states, input_data, noise_data, input_data, lower, upper); 32 | THCTensor_(set)(state, output, input); 33 | } 34 | else 35 | { 36 | THCTensor_(resizeAs)(state, output, input); 37 | real *output_data = THCTensor_(data)(state, output); 38 | rreluUpdateOutputTrain<<>>( 39 | n, gen_states, input_data, noise_data, output_data, lower, upper); 40 | } 41 | THCudaCheck(cudaGetLastError()); 42 | THCTensor_(free)(state, input); 43 | } 44 | else 45 | { 46 | const real negSlope = ScalarConvert::to((lower + upper) / 2); 47 | if (inplace) 48 | { 49 | THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor(negSlope)); 50 | THCTensor_(set)(state, output, input); 51 | } 52 | else 53 | { 54 | THCTensor_(resizeAs)(state, output, input); 55 | THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor(negSlope)); 56 | } 57 | } 58 | } 59 | 60 | void THNN_(RReLU_updateGradInput)( 61 | THCState *state, 62 | THCTensor *input, 63 | THCTensor *gradOutput, 64 | THCTensor *gradInput, 65 | THCTensor *noise, 66 | double lower, 67 | double upper, 68 | bool train, 69 | bool inplace) 70 | { 71 | THCUNN_check_nElement(state, input, gradOutput); 72 | THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise); 73 | 74 | gradOutput = THCTensor_(newContiguous)(state, gradOutput); 75 | 76 | if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU 77 | { 78 | // multiply the gradient by the noise tensor 79 | if (inplace) 80 | { 81 | THCTensor_(cmul)(state, gradOutput, gradOutput, noise); 82 | THCTensor_(set)(state, gradInput, gradOutput); 83 | } 84 | else 85 | { 86 | THCTensor_(resizeAs)(state, gradInput, input); 87 | THCTensor_(cmul)(state, gradInput, gradOutput, noise); 88 | } 89 | } 90 | else 91 | { 92 | // use constant factor for negative input values 93 | const real negSlope = ScalarConvert::to((lower + upper) / 2); 94 | if (inplace) 95 | { 96 | THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor(negSlope)); 97 | THCTensor_(set)(state, gradInput, gradOutput); 98 | } 99 | else 100 | { 101 | THCTensor_(resizeAs)(state, gradInput, input); 102 | THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor(negSlope)); 103 | } 104 | } 105 | 106 | THCTensor_(free)(state, gradOutput); 107 | } 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /lib/THCUNN/TemporalMaxPooling.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCHalf.h" 4 | #include "THCHalfAutoNumerics.cuh" 5 | #include "THCAtomics.cuh" 6 | 7 | #define TEMPORAL_MAX_POOLING_THREADS 1024 8 | 9 | template 10 | __global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { 11 | // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index 12 | Dtype *input_data = input + blockIdx.x * input_w * input_n + ( 13 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; 14 | Dtype *output_data = output + blockIdx.x * output_w * input_n + ( 15 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; 16 | THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( 17 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; 18 | 19 | int feat = 0; 20 | int time = 0; 21 | int max_time = input_n * kW; 22 | 23 | Dtype max_value; 24 | THCIndex_t max_index = 0; 25 | 26 | if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { 27 | // For all features 28 | for (feat = 0; feat < input_n; ++feat) { 29 | max_value = THCNumerics::min(); 30 | // For all values in the kernel space 31 | for (time = 0; time < max_time; time += input_n) { 32 | if (max_value < input_data[time + feat]) { 33 | max_value = input_data[time + feat]; 34 | max_index = time / input_n; 35 | } 36 | } 37 | output_data[feat] = max_value; 38 | indices_data[feat] = max_index; 39 | } 40 | } 41 | } 42 | 43 | template 44 | __global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { 45 | // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index 46 | Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( 47 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; 48 | Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( 49 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; 50 | THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( 51 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; 52 | 53 | int feat = 0; 54 | 55 | if 
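/* non-atomic variant: intended for the case where pooling windows do not
   overlap (kW <= dW), so each input element receives at most one update;
   the Atomic kernel below handles overlapping windows */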
(threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { 56 | // For all features 57 | for (feat = 0; feat < input_n; ++feat) { 58 | gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat]; 59 | } 60 | } 61 | } 62 | 63 | template 64 | __global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) { 65 | // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index 66 | Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + ( 67 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW; 68 | Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + ( 69 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; 70 | THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + ( 71 | threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n; 72 | 73 | int feat = 0; 74 | 75 | if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) { 76 | // For all features 77 | for (feat = 0; feat < input_n; ++feat) { 78 | atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]); 79 | } 80 | } 81 | } 82 | 83 | #include "generic/TemporalMaxPooling.cu" 84 | #include "THCGenerateFloatTypes.h" 85 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/SpatialMaxUnpooling.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu" 3 | #else 4 | 5 | void THNN_(SpatialMaxUnpooling_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *output, 9 | THCIndexTensor *indices, 10 | int owidth, int oheight) 11 | { 12 | THCUNN_assertSameGPU(state, 3, input, output, indices); 13 | THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input, 14 | "3D or 4D (batch mode) tensor expected for input, but got: %s"); 15 | THCUNN_check_shape_indices(state, indices, input); 16 | 17 | long nInputCols, nInputRows, nInputPlane, batchSize; 18 | 19 | if (input->nDimension == 3) { 20 | nInputCols = input->size[2]; 21 | nInputRows = input->size[1]; 22 | nInputPlane = input->size[0]; 23 | batchSize = 1; 24 | } 25 | else 26 | { 27 | nInputCols = input->size[3]; 28 | nInputRows = input->size[2]; 29 | nInputPlane = input->size[1]; 30 | batchSize = input->size[0]; 31 | } 32 | 33 | input = THCTensor_(newContiguous)(state, input); 34 | indices = THCIndexTensor_(newContiguous)(state, indices); 35 | THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth); 36 | THCTensor_(zero)(state, output); 37 | 38 | int count = THCTensor_(nElement)(state, input); 39 | 40 | MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> 41 | (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices), 42 | batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output)); 43 | THCudaCheck(cudaGetLastError()); 44 | 45 | if(input->nDimension == 3) 46 | THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth); 47 | 48 | THCTensor_(free)(state, input); 49 | THCIndexTensor_(free)(state, indices); 50 | } 51 | 52 | void THNN_(SpatialMaxUnpooling_updateGradInput)( 53 | THCState *state, 54 | THCTensor *input, 55 | THCTensor *gradOutput, 56 | THCTensor *gradInput, 57 
| THCIndexTensor *indices, 58 | int owidth, int oheight) 59 | { 60 | THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput); 61 | THCUNN_check_shape_indices(state, indices, input); 62 | 63 | long nInputCols, nInputRows, nInputPlane, batchSize; 64 | int dimw = 2; 65 | int dimh = 1; 66 | 67 | if (input->nDimension == 3) { 68 | nInputPlane = input->size[0]; 69 | batchSize = 1; 70 | } 71 | else 72 | { 73 | ++dimw; 74 | ++dimh; 75 | nInputPlane = input->size[1]; 76 | batchSize = input->size[0]; 77 | } 78 | nInputCols = input->size[dimw]; 79 | nInputRows = input->size[dimh]; 80 | 81 | if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ 82 | THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", 83 | oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]); 84 | } 85 | 86 | input = THCTensor_(newContiguous)(state, input); 87 | indices = THCIndexTensor_(newContiguous)(state, indices); 88 | gradOutput = THCTensor_(newContiguous)(state, gradOutput); 89 | THCTensor_(resizeAs)(state, gradInput, input); 90 | 91 | int count = THCTensor_(nElement)(state, input); 92 | 93 | MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>> 94 | (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices), 95 | batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput)); 96 | THCudaCheck(cudaGetLastError()); 97 | 98 | // clean 99 | THCTensor_(free)(state, input); 100 | THCIndexTensor_(free)(state, indices); 101 | THCTensor_(free)(state, gradOutput); 102 | } 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /lib/THCUNN/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR) 2 | CMAKE_POLICY(VERSION 2.8) 3 | 4 | OPTION(NDEBUG "disable asserts (WARNING: this may result in silent UB e.g. with out-of-bound indices)") 5 | IF(NOT NDEBUG) 6 | MESSAGE(STATUS "Removing -DNDEBUG from compile flags") 7 | STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS}) 8 | STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG}) 9 | STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE}) 10 | STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS}) 11 | STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG}) 12 | STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE}) 13 | ENDIF() 14 | 15 | IF(NOT Torch_FOUND) 16 | FIND_PACKAGE(Torch REQUIRED) 17 | ENDIF() 18 | 19 | IF(NOT TH_LIBRARIES) 20 | SET(TH_LIBRARIES "TH") 21 | ENDIF(NOT TH_LIBRARIES) 22 | MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}") 23 | IF(NOT THC_LIBRARIES) 24 | SET(THC_LIBRARIES "THC") 25 | ENDIF(NOT THC_LIBRARIES) 26 | MESSAGE(STATUS "THC_LIBRARIES: ${THC_LIBRARIES}") 27 | 28 | IF(NOT CUDA_FOUND) 29 | FIND_PACKAGE(CUDA 6.5 REQUIRED) 30 | ENDIF() 31 | 32 | IF ($ENV{TH_BINARY_BUILD}) 33 | MESSAGE(STATUS "TH_BINARY_BUILD detected. 
Statically linking libstdc++") 34 | SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}") 35 | IF (UNIX AND NOT APPLE) 36 | # hiding statically linked library symbols, this flag is not available for the linker under MACOSX 37 | SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}") 38 | ENDIF(UNIX AND NOT APPLE) 39 | ENDIF() 40 | 41 | # Detect CUDA architecture and get best NVCC flags 42 | IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC) 43 | INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake) 44 | ENDIF() 45 | LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS}) 46 | CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST}) 47 | LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) 48 | 49 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 50 | if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3") 51 | if(CUDA_VERSION VERSION_LESS "8.0") 52 | MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags") 53 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__") 54 | endif(CUDA_VERSION VERSION_LESS "8.0") 55 | endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3") 56 | endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") 57 | 58 | if(CUDA_VERSION VERSION_GREATER "8.0") 59 | LIST(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__") 60 | endif(CUDA_VERSION VERSION_GREATER "8.0") 61 | 62 | IF(MSVC) 63 | LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819") 64 | ADD_DEFINITIONS(-DTH_EXPORTS) 65 | ENDIF() 66 | 67 | IF(NOT THCUNN_INSTALL_LIB_SUBDIR) 68 | SET(THCUNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THCUNN install library directory") 69 | SET(THCUNN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THCUNN install include subdirectory") 70 | ENDIF() 71 | 72 | FILE(GLOB src-cuda *.cu) 73 | 74 | CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) 75 | CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda}) 76 | 77 | IF(MSVC) 78 | SET_TARGET_PROPERTIES(THCUNN PROPERTIES PREFIX "lib" IMPORT_PREFIX "lib") 79 | ENDIF() 80 | 81 | INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) 82 | TARGET_LINK_LIBRARIES(THCUNN ${THC_LIBRARIES} ${TH_LIBRARIES} ${CUDA_cusparse_LIBRARY}) 83 | 84 | # Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch 85 | IF(THCUNN_SO_VERSION) 86 | MESSAGE(STATUS "THCUNN_SO_VERSION: ${THCUNN_SO_VERSION}") 87 | SET_TARGET_PROPERTIES(THCUNN PROPERTIES 88 | VERSION ${THCUNN_SO_VERSION} 89 | SOVERSION ${THCUNN_SO_VERSION}) 90 | ENDIF(THCUNN_SO_VERSION) 91 | 92 | INSTALL(TARGETS THCUNN LIBRARY DESTINATION ${THCUNN_INSTALL_LIB_SUBDIR}) 93 | INSTALL(FILES THCUNN.h DESTINATION "${THCUNN_INSTALL_INCLUDE_SUBDIR}/THCUNN") 94 | INSTALL(FILES generic/THCUNN.h DESTINATION "${THCUNN_INSTALL_INCLUDE_SUBDIR}/THCUNN/generic") 95 | -------------------------------------------------------------------------------- /lib/THCUNN/row2col.h: -------------------------------------------------------------------------------- 1 | #ifndef THCUNN_ROW2COL_H 2 | #define THCUNN_ROW2COL_H 3 | 4 | #include "THCNumerics.cuh" 5 | #include "common.h" 6 | 7 | // Kernel for fast unfold+copy on rows 8 | template 9 | __global__ void 10 | row2col_kernel(const int n, const Dtype *data_row, const int width, 11 | const int ksize_w, const int pad_w, const int stride_w, 12 | const int dilation_w, const int width_col, Dtype *data_col) { 13 | CUDA_KERNEL_LOOP(index, n) { 14 | int w_out = index % width_col; 15 | index /= width_col; 16 | int channel_in = index; 17 | int channel_out = channel_in * ksize_w; 18 | int w_in = w_out * stride_w - 
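/* w_in: left edge of this output column's patch in the (virtually zero-padded) input row */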
pad_w; 19 | data_col += (channel_out)*width_col + w_out; 20 | data_row += (channel_in)*width + w_in; 21 | for (int j = 0; j < ksize_w; ++j) { 22 | int w = w_in + j * dilation_w; 23 | *data_col = (w >= 0 && w < width) ? data_row[j * dilation_w] 24 | : ScalarConvert::to(0); 25 | data_col += width_col; 26 | } 27 | } 28 | } 29 | 30 | template 31 | void row2col(cudaStream_t stream, const Dtype *data_row, const int channels, 32 | const int width, const int ksize_w, const int pad_w, 33 | const int stride_w, const int dilation_w, Dtype *data_col) { 34 | // We are going to launch channels * width_col kernels, each 35 | // kernel responsible for copying a single-channel grid. 36 | int width_col = 37 | (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; 38 | int num_kernels = channels * width_col; 39 | // Launch 40 | row2col_kernel<<>>( 41 | num_kernels, data_row, width, ksize_w, pad_w, stride_w, 1, width_col, 42 | data_col); 43 | THCudaCheck(cudaGetLastError()); 44 | } 45 | 46 | template 47 | __global__ void col2row_kernel(const int n, const Dtype *data_col, 48 | const int width, const int channels, 49 | const int kernel_w, const int pad_w, 50 | const int stride_w, const int dilation_w, 51 | const int width_col, Dtype *data_row) { 52 | CUDA_KERNEL_LOOP(index, n) { 53 | Acctype val = Acctype(0); 54 | const int w_row = index % width + pad_w; 55 | const int c_row = index / width; 56 | int kernel_extent_w = (kernel_w - 1) * dilation_w + 1; 57 | // compute the start and end of the output 58 | const int w_col_start = (w_row < kernel_extent_w) 59 | ? 0 60 | : (w_row - kernel_extent_w) / stride_w + 1; 61 | const int w_col_end = min(w_row / stride_w + 1, width_col); 62 | for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) { 63 | int w_k = (w_row - w_col * stride_w); 64 | if (w_k % dilation_w == 0) { 65 | w_k /= dilation_w; 66 | int data_col_index = (c_row * kernel_w + w_k) * width_col + w_col; 67 | val += data_col[data_col_index]; 68 | } 69 | } 70 | data_row[index] = ScalarConvert::to(val); 71 | } 72 | } 73 | 74 | template 75 | void col2row(cudaStream_t stream, const Dtype *data_col, const int channels, 76 | const int width, const int patch_w, const int pad_w, 77 | const int stride_w, const int dilation_w, Dtype *data_row) { 78 | int width_col = 79 | (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1)) / stride_w + 1; 80 | int num_kernels = channels * width; 81 | // To avoid involving atomic operations, we will launch one kernel per 82 | // bottom dimension, and then in the kernel add up the top dimensions. 
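// One thread per bottom (row) element; each thread gathers every output
// position covering it (the inverse of row2col), so no atomicAdd is needed.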
83 | col2row_kernel< 84 | Dtype, Acctype><<>>( 85 | num_kernels, data_col, width, channels, patch_w, pad_w, stride_w, 86 | dilation_w, width_col, data_row); 87 | 88 | THCudaCheck(cudaGetLastError()); 89 | } 90 | #endif 91 | -------------------------------------------------------------------------------- /lib/THCUNN/SpatialAveragePooling.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "THCHalf.h" 3 | #include "THCHalfAutoNumerics.cuh" 4 | #include "common.h" 5 | 6 | template 7 | __global__ void AvePoolForward(const int nthreads, 8 | const Dtype* const bottom_data, const int num, const int channels, 9 | const int height, const int width, const int pooled_height, 10 | const int pooled_width, const int kernel_h, const int kernel_w, 11 | const int stride_h, const int stride_w, const int pad_h, const int pad_w, 12 | Dtype* const top_data) { 13 | CUDA_KERNEL_LOOP(index, nthreads) { 14 | const int pw = index % pooled_width; 15 | const int ph = (index / pooled_width) % pooled_height; 16 | const int c = (index / pooled_width / pooled_height) % channels; 17 | const int n = index / pooled_width / pooled_height / channels; 18 | int hstart = ph * stride_h - pad_h; 19 | int wstart = pw * stride_w - pad_w; 20 | int hend = min(hstart + kernel_h, height + pad_h); 21 | int wend = min(wstart + kernel_w, width + pad_w); 22 | const int pool_size = (hend - hstart) * (wend - wstart); 23 | hstart = max(hstart, 0); 24 | wstart = max(wstart, 0); 25 | hend = min(hend, height); 26 | wend = min(wend, width); 27 | Acctype aveval = Acctype(0); 28 | const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width; 29 | for (int h = hstart; h < hend; ++h) { 30 | for (int w = wstart; w < wend; ++w) { 31 | aveval += bottom_slice[h * width + w]; 32 | } 33 | } 34 | if(COUNT_INCLUDE_PAD) 35 | top_data[index] = ScalarConvert::to(aveval / pool_size); 36 | else 37 | top_data[index] = ScalarConvert::to(aveval / ((hend - hstart) * (wend - wstart))); 38 | } 39 | } 40 | 41 | template 42 | __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff, 43 | const int num, const int channels, const int height, 44 | const int width, const int pooled_height, const int pooled_width, 45 | const int kernel_h, const int kernel_w, const int stride_h, 46 | const int stride_w, const int pad_h, const int pad_w, 47 | Dtype* const bottom_diff) { 48 | CUDA_KERNEL_LOOP(index, nthreads) { 49 | // find out the local index 50 | // find out the local offset 51 | const int w = index % width + pad_w; 52 | const int h = (index / width) % height + pad_h; 53 | const int c = (index / width / height) % channels; 54 | const int n = index / width / height / channels; 55 | const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1; 56 | const int phend = min(h / stride_h + 1, pooled_height); 57 | const int pwstart = (w < kernel_w) ? 
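/* pwstart: first pooled window covering (padded) column w; 0 inside the first
   window, otherwise the inverse of the stride mapping */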
0 : (w - kernel_w) / stride_w + 1; 58 | const int pwend = min(w / stride_w + 1, pooled_width); 59 | Acctype gradient = Acctype(0); 60 | const Dtype* const top_diff_slice = 61 | top_diff + (n * channels + c) * pooled_height * pooled_width; 62 | for (int ph = phstart; ph < phend; ++ph) { 63 | for (int pw = pwstart; pw < pwend; ++pw) { 64 | // figure out the pooling size 65 | int hstart = ph * stride_h - pad_h; 66 | int wstart = pw * stride_w - pad_w; 67 | int hend = min(hstart + kernel_h, height + pad_h); 68 | int wend = min(wstart + kernel_w, width + pad_w); 69 | int pool_size = (hend - hstart) * (wend - wstart); 70 | hstart = max(hstart, 0); 71 | wstart = max(wstart, 0); 72 | hend = min(hend, height); 73 | wend = min(wend, width); 74 | if(COUNT_INCLUDE_PAD) 75 | gradient += top_diff_slice[ph * pooled_width + pw] / pool_size; 76 | else 77 | gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart)); 78 | } 79 | } 80 | bottom_diff[index] = ScalarConvert::to(gradient); 81 | } 82 | } 83 | 84 | #include "generic/SpatialAveragePooling.cu" 85 | #include "THCGenerateFloatTypes.h" 86 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/BCECriterion.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/BCECriterion.cu" 3 | #else 4 | 5 | void THNN_(BCECriterion_updateOutput)( 6 | THCState *state, 7 | THCTensor *input, 8 | THCTensor *target, 9 | THCTensor *output, 10 | bool sizeAverage, 11 | THCTensor *weights) 12 | { 13 | THCUNN_check_nElement(state, input, target); 14 | THCUNN_check_nElement(state, input, weights); 15 | THCUNN_check_dim_size(state, output, 1, 0, 1); 16 | THCUNN_assertSameGPU(state, 3, input, target, weights); 17 | 18 | ptrdiff_t size = THCTensor_(nElement)(state, input); 19 | 20 | input = THCTensor_(newContiguous)(state, input); 21 | target = THCTensor_(newContiguous)(state, target); 22 | 23 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 24 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 25 | 26 | accreal sum; 27 | if (weights) { 28 | weights = THCTensor_(newContiguous)(state, weights); 29 | thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); 30 | sum = thrust::transform_reduce( 31 | thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), 32 | thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), 33 | bce_functor_weights(), 34 | (accreal) 0, 35 | thrust::plus() 36 | ); 37 | THCTensor_(free)(state, weights); 38 | } else { 39 | sum = thrust::transform_reduce( 40 | thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), 41 | thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), 42 | bce_functor(), 43 | (accreal) 0, 44 | thrust::plus() 45 | ); 46 | } 47 | 48 | if (sizeAverage) 49 | sum /= size; 50 | 51 | THCTensor_(free)(state, input); 52 | THCTensor_(free)(state, target); 53 | 54 | THCTensor_(set1d)(state, output, 0, ScalarConvert::to(sum)); 55 | } 56 | 57 | void THNN_(BCECriterion_updateGradInput)( 58 | THCState *state, 59 | THCTensor *input, 60 | THCTensor *target, 61 | THCTensor *gradInput, 62 | bool sizeAverage, 63 | THCTensor *weights) 64 | { 65 | THCUNN_check_nElement(state, input, target); 66 | THCUNN_check_nElement(state, input, weights); 67 | THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights); 68 | 69 | ptrdiff_t size 
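/* BCE: L = -w * (t*log(x) + (1-t)*log(1-x)); the updateGradInput functors
   below compute dL/dx = w * (x - t) / (x * (1 - x)), scaled by norm
   (1/nElement when sizeAverage is set) */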
= THCTensor_(nElement)(state, input); 70 | real norm = ScalarConvert::to(sizeAverage ? accreal(1)/size : accreal(1)); 71 | 72 | input = THCTensor_(newContiguous)(state, input); 73 | target = THCTensor_(newContiguous)(state, target); 74 | 75 | THCTensor_(resizeAs)(state, gradInput, input); 76 | 77 | thrust::device_ptr input_data(THCTensor_(data)(state, input)); 78 | thrust::device_ptr target_data(THCTensor_(data)(state, target)); 79 | thrust::device_ptr gradInput_data(THCTensor_(data)(state, gradInput)); 80 | 81 | if (weights) { 82 | weights = THCTensor_(newContiguous)(state, weights); 83 | thrust::device_ptr weights_data(THCTensor_(data)(state, weights)); 84 | thrust::transform( 85 | thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)), 86 | thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)), 87 | gradInput_data, 88 | bce_updateGradInput_functor_weights(norm) 89 | ); 90 | THCTensor_(free)(state, weights); 91 | } else { 92 | thrust::transform( 93 | thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)), 94 | thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)), 95 | gradInput_data, 96 | bce_updateGradInput_functor(norm) 97 | ); 98 | } 99 | 100 | THCTensor_(free)(state, input); 101 | THCTensor_(free)(state, target); 102 | } 103 | 104 | #endif 105 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/SpatialCrossMapLRN.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu" 3 | #else 4 | 5 | void LRNforward(THCState* state, THCTensor* input, THCTensor* output, 6 | THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_) 7 | { 8 | real alpha = ScalarConvert::to(alpha_); 9 | real beta = ScalarConvert::to(beta_); 10 | real k = ScalarConvert::to(k_); 11 | 12 | THCTensor_(resizeAs)(state, output, input); 13 | THCTensor_(resizeAs)(state, scale, input); 14 | 15 | int batchSize; 16 | int nInputPlane; 17 | int imsize_h; 18 | int imsize_w; 19 | 20 | if (input->nDimension == 3) { 21 | batchSize = 1; 22 | nInputPlane = input->size[0]; 23 | imsize_h = input->size[1]; 24 | imsize_w = input->size[2]; 25 | } 26 | else 27 | { 28 | batchSize = input->size[0]; 29 | nInputPlane = input->size[1]; 30 | imsize_h = input->size[2]; 31 | imsize_w = input->size[3]; 32 | } 33 | 34 | input = THCTensor_(newContiguous)(state, input); 35 | 36 | int n_threads = batchSize * imsize_h * imsize_w; 37 | LRNFillScale <<>>( 38 | n_threads, THCTensor_(data)(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size, 39 | alpha / local_size, k, THCTensor_(data)(state, scale)); 40 | n_threads *= nInputPlane; 41 | THCudaCheck(cudaGetLastError()); 42 | LRNComputeOutput<<>>( 43 | n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output)); 44 | THCudaCheck(cudaGetLastError()); 45 | 46 | THCTensor_(free)(state, input); 47 | } 48 | 49 | 50 | void LRNbackward(THCState* state, THCTensor* input, THCTensor* output, 51 | THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale, 52 | int local_size, accreal alpha_, accreal beta_, accreal k_) 53 | { 54 | real alpha = ScalarConvert::to(alpha_); 55 | real beta = ScalarConvert::to(beta_); 56 | real k = ScalarConvert::to(k_); 57 | 58 | THCTensor_(resizeAs)(state, gradInput, input); 59 | 60 | int batchSize; 61 | int 
nInputPlane; 62 | int imsize_h; 63 | int imsize_w; 64 | 65 | if (input->nDimension == 3) { 66 | batchSize = 1; 67 | nInputPlane = input->size[0]; 68 | imsize_h = input->size[1]; 69 | imsize_w = input->size[2]; 70 | } 71 | else 72 | { 73 | batchSize = input->size[0]; 74 | nInputPlane = input->size[1]; 75 | imsize_h = input->size[2]; 76 | imsize_w = input->size[3]; 77 | } 78 | 79 | input = THCTensor_(newContiguous)(state, input); 80 | gradOutput = THCTensor_(newContiguous)(state, gradOutput); 81 | 82 | int n_threads = batchSize * imsize_h * imsize_w; 83 | LRNComputeDiff <<>>( 84 | n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, output), 85 | THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w, 86 | local_size, -beta, ScalarConvert::to(2) * alpha * beta / local_size, 87 | THCTensor_(data)(state, gradInput)); 88 | THCudaCheck(cudaGetLastError()); 89 | 90 | THCTensor_(free)(state, input); 91 | THCTensor_(free)(state, gradOutput); 92 | } 93 | 94 | void THNN_(SpatialCrossMapLRN_updateOutput)( 95 | THCState *state, 96 | THCTensor *input, 97 | THCTensor *output, 98 | THCTensor *scale, 99 | int size, 100 | accreal alpha, 101 | accreal beta, 102 | accreal k) 103 | { 104 | LRNforward(state, input, output, scale, size, alpha, beta, k); 105 | } 106 | 107 | void THNN_(SpatialCrossMapLRN_updateGradInput)( 108 | THCState *state, 109 | THCTensor *input, 110 | THCTensor *gradOutput, 111 | THCTensor *gradInput, 112 | THCTensor *scale, 113 | THCTensor *output, 114 | int size, 115 | accreal alpha, 116 | accreal beta, 117 | accreal k) 118 | { 119 | LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k); 120 | } 121 | 122 | #endif 123 | -------------------------------------------------------------------------------- /lib/THCUNN/generic/BatchNormalization.cu: -------------------------------------------------------------------------------- 1 | #ifndef THC_GENERIC_FILE 2 | #define THC_GENERIC_FILE "generic/BatchNormalization.cu" 3 | #else 4 | 5 | #define DeviceTensor3 THCDeviceTensor 6 | #define DeviceTensor1 THCDeviceTensor 7 | 8 | template 9 | static THCDeviceTensor devicetensor(THCState *state, THCTensor *t) { 10 | if (!t) { 11 | return THCDeviceTensor(); 12 | } 13 | 14 | int inDim = THCTensor_(nDimension)(state, t); 15 | if (inDim == Dim) { 16 | return toDeviceTensor(state, t); 17 | } 18 | 19 | // View in which the last dimensions are collapsed or expanded as needed 20 | THAssert(THCTensor_(isContiguous)(state, t)); 21 | int size[Dim]; 22 | for (int i = 0; i < Dim || i < inDim; ++i) { 23 | if (i < Dim && i < inDim) { 24 | size[i] = t->size[i]; 25 | } else if (i < Dim) { 26 | size[i] = 1; 27 | } else { 28 | size[Dim - 1] *= t->size[i]; 29 | } 30 | } 31 | return THCDeviceTensor(THCTensor_(data)(state, t), size); 32 | } 33 | 34 | void THNN_(BatchNormalization_updateOutput)( 35 | THCState *state, THCTensor *input_, THCTensor *output_, 36 | THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_, 37 | THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_, 38 | bool train, double momentum, double eps) { 39 | 40 | THCTensor_(resizeAs)(state, output_, input_); 41 | DeviceTensor3 input = devicetensor<3>(state, input_); 42 | DeviceTensor3 output = devicetensor<3>(state, output_); 43 | DeviceTensor1 weight = devicetensor<1>(state, weight_); 44 | DeviceTensor1 bias = devicetensor<1>(state, bias_); 45 | DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); 46 | DeviceTensor1 runningVar = 
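/* all statistics are 1-D per-channel views; devicetensor() above collapses
   input/output to a 3-D (N, C, spatial) view so one kernel handles any
   input dimensionality */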
devicetensor<1>(state, runningVar_); 47 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 48 | DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); 49 | 50 | cudaStream_t s = THCState_getCurrentStream(state); 51 | cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state); 52 | 53 | if (!train) { 54 | dim3 blocks(input.getSize(1)); 55 | dim3 threads(getNumThreads(input.getSize(2))); 56 | BatchNormalizationUpdateOutputInference_kernel <<>>( 57 | input, output, runningMean, runningVar, weight, bias, eps); 58 | } else { 59 | dim3 blocks(input.getSize(1)); 60 | dim3 threads(getNumThreads(input.getSize(2))); 61 | BatchNormalizationUpdateOutput_kernel <<>>( 62 | input, output, weight, bias, eps, momentum, runningMean, runningVar, 63 | saveMean, saveStd); 64 | } 65 | THCudaCheck(cudaGetLastError()); 66 | } 67 | 68 | void THNN_(BatchNormalization_backward)( 69 | THCState *state, THCTensor *input_, THCTensor *gradOutput_, 70 | THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_, 71 | THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_, 72 | THCTensor *saveMean_, THCTensor *saveStd_, bool train, double scale, double eps) { 73 | 74 | THCUNN_check_shape(state, input_, gradOutput_); 75 | DeviceTensor3 input = devicetensor<3>(state, input_); 76 | DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_); 77 | DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_); 78 | DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_); 79 | DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_); 80 | DeviceTensor1 weight = devicetensor<1>(state, weight_); 81 | DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_); 82 | DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_); 83 | DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_); 84 | DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_); 85 | 86 | cudaStream_t s = THCState_getCurrentStream(state); 87 | 88 | dim3 blocks(gradOutput.getSize(1)); 89 | dim3 threads(getNumThreads(gradOutput.getSize(2))); 90 | BatchNormalizationBackward_kernel <<>>( 91 | input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar, 92 | saveMean, saveStd, train, scale, eps); 93 | THCudaCheck(cudaGetLastError()); 94 | } 95 | 96 | #undef DeviceTensor3 97 | #undef DeviceTensor1 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /lib/THCUNN/SpatialFractionalMaxPooling.cu: -------------------------------------------------------------------------------- 1 | #include "THCUNN.h" 2 | #include "common.h" 3 | #include "THCDeviceTensor.cuh" 4 | #include "THCDeviceTensorUtils.cuh" 5 | #include "THCDeviceUtils.cuh" 6 | #include "THCHalf.h" 7 | #include "THCHalfAutoNumerics.cuh" 8 | #include "THCAtomics.cuh" 9 | 10 | #include 11 | 12 | template 13 | __device__ inline int getInterval(Acctype sample, 14 | int index, 15 | int inputSize, 16 | int outputSize, 17 | int poolSize) { 18 | Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1); 19 | if (index == outputSize - 1) { 20 | return inputSize - poolSize; 21 | } else { 22 | return (int) ((index + sample) * alpha) - (int) (sample * alpha); 23 | } 24 | } 25 | 26 | // We template on poolSizeW to allow the innermost loop to be unrolled 27 | template 28 | __global__ void SpatialFractionalMaxPooling_updateOutput( 29 | THCDeviceTensor input, 30 | THCDeviceTensor output, 31 | THCDeviceTensor indices, 32 | THCDeviceTensor samples, 33 | int poolSizeW, int 
--------------------------------------------------------------------------------
/lib/THCUNN/SpatialFractionalMaxPooling.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "common.h"
3 | #include "THCDeviceTensor.cuh"
4 | #include "THCDeviceTensorUtils.cuh"
5 | #include "THCDeviceUtils.cuh"
6 | #include "THCHalf.h"
7 | #include "THCHalfAutoNumerics.cuh"
8 | #include "THCAtomics.cuh"
9 | 
10 | #include <cfloat>
11 | 
12 | template <typename Dtype, typename Acctype>
13 | __device__ inline int getInterval(Acctype sample,
14 |                                   int index,
15 |                                   int inputSize,
16 |                                   int outputSize,
17 |                                   int poolSize) {
18 |   Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1);
19 |   if (index == outputSize - 1) {
20 |     return inputSize - poolSize;
21 |   } else {
22 |     return (int) ((index + sample) * alpha) - (int) (sample * alpha);
23 |   }
24 | }
25 | 
26 | // We template on poolSizeW to allow the innermost loop to be unrolled
27 | template <int PoolSizeWStatic, typename Dtype, typename Acctype>
28 | __global__ void SpatialFractionalMaxPooling_updateOutput(
29 |   THCDeviceTensor<Dtype, 4> input,
30 |   THCDeviceTensor<Dtype, 4> output,
31 |   THCDeviceTensor<THCIndex_t, 4> indices,
32 |   THCDeviceTensor<Dtype, 3> samples,
33 |   int poolSizeW, int poolSizeH) {
34 | 
35 |   // Output (h, w) point that this thread is responsible for
36 |   int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
37 |   int plane = blockIdx.y;
38 |   int batch = blockIdx.z;
39 | 
40 |   // Each thread generates a specific output point
41 |   if (ourOutputPoint < output.getSize(2) * output.getSize(3)) {
42 |     int outputW = ourOutputPoint % output.getSize(3);
43 |     int outputH = ourOutputPoint / output.getSize(3);
44 | 
45 |     int poolW = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][0]), outputW,
46 |                             input.getSize(3), output.getSize(3), poolSizeW);
47 |     int poolH = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][1]), outputH,
48 |                             input.getSize(2), output.getSize(2), poolSizeH);
49 | 
50 |     Dtype maxVal = THCNumerics<Dtype>::min();
51 |     int maxIndex = -1;
52 | 
53 |     for (int h = poolH; h < poolH + poolSizeH; ++h) {
54 |       if (PoolSizeWStatic == -1) {
55 |         for (int w = poolW; w < poolW + poolSizeW; ++w) {
56 |           Dtype val = input[batch][plane][h][w];
57 |           // for consistency with THNN, favor the first max
58 |           if (val > maxVal) {
59 |             maxIndex = h * input.getSize(3) + w;
60 |             maxVal = val;
61 |           }
62 |         }
63 |       } else {
64 | #pragma unroll
65 |         for (int i = 0; i < PoolSizeWStatic; ++i) {
66 |           int w = i + poolW;
67 |           Dtype val = input[batch][plane][h][w];
68 |           // for consistency with THNN, favor the first max
69 |           if (val > maxVal) {
70 |             maxIndex = h * input.getSize(3) + w;
71 |             maxVal = val;
72 |           }
73 |         }
74 |       }
75 |     }
76 | 
77 |     assert(THCNumerics<Dtype>::ne(maxVal, THCNumerics<Dtype>::min()));
78 |     assert(maxIndex != -1);
79 | 
80 |     // +1 for Lua index
81 |     indices[batch][plane][outputH][outputW] = maxIndex + TH_INDEX_BASE;
82 |     output[batch][plane][outputH][outputW] = maxVal;
83 |   }
84 | }
85 | 
86 | template <typename Dtype>
87 | __global__ void SpatialFractionalMaxPooling_updateGradInput(
88 |   THCDeviceTensor<Dtype, 4> gradInput,
89 |   THCDeviceTensor<Dtype, 4> gradOutput,
90 |   THCDeviceTensor<THCIndex_t, 4> indices) {
91 |   // Output (h, w) point that this thread is responsible for
92 |   int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
93 |   int plane = blockIdx.y;
94 |   int batch = blockIdx.z;
95 | 
96 |   // Each thread generates a specific output point
97 |   if (ourOutputPoint < gradOutput.getSize(2) * gradOutput.getSize(3)) {
98 |     int outputW = ourOutputPoint % gradOutput.getSize(3);
99 |     int outputH = ourOutputPoint / gradOutput.getSize(3);
100 | 
101 |     int index = indices[batch][plane][outputH][outputW] - TH_INDEX_BASE;
102 |     assert(index >= 0);
103 |     int inputW = index % gradInput.getSize(3);
104 |     int inputH = index / gradInput.getSize(3);
105 |     assert(inputH < gradInput.getSize(2));
106 | 
107 |     atomicAdd(gradInput[batch][plane][inputH][inputW].data(),
108 |               gradOutput[batch][plane][outputH][outputW]);
109 |   }
110 | }
111 | 
112 | #include "generic/SpatialFractionalMaxPooling.cu"
113 | #include "THCGenerateFloatTypes.h"
114 | 
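getInterval is the heart of fractional max pooling: one random sample per (batch, plane) is stretched by alpha = (inputSize - poolSize) / (outputSize - 1), so consecutive window starts advance by floor(alpha) or ceil(alpha), and the last window is pinned flush with the input edge. A standalone host sketch of the same arithmetic (the sizes and the sample value are illustrative, not library code):

#include <cstdio>

int interval_start(double u, int index, int inputSize, int outputSize, int poolSize) {
  double alpha = (double)(inputSize - poolSize) / (outputSize - 1);
  if (index == outputSize - 1) return inputSize - poolSize;  // last window is pinned
  return (int)((index + u) * alpha) - (int)(u * alpha);
}

int main() {
  // four pseudo-random 2-wide windows over 9 input columns;
  // the sample u shifts where the floor/ceil jumps fall.
  for (int i = 0; i < 4; ++i)
    printf("window %d starts at %d\n", i, interval_start(0.37, i, 9, 4, 2));
  return 0;
}

With u = 0.37 this prints starts 0, 3, 5, 7: gaps of 3, 2, 2, i.e. a shuffled mix of floor(7/3) and ceil(7/3).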
--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftMax.cu:
--------------------------------------------------------------------------------
1 | #ifndef THC_GENERIC_FILE
2 | #define THC_GENERIC_FILE "generic/SoftMax.cu"
3 | #else
4 | 
5 | #include "../common.h"
6 | 
7 | void THNN_(SoftMax_updateOutput)(
8 |   THCState *state,
9 |   THCTensor *input,
10 |   THCTensor *output)
11 | {
12 |   THCUNN_assertSameGPU(state, 2, input, output);
13 | 
14 |   input = THCTensor_(newContiguous)(state, input);
15 |   THCTensor_(resizeAs)(state, output, input);
16 |   long batchSize, dim, stride0, stride1 = 1;
17 |   long blocksY = 1, blocksZ = 1;
18 | 
19 |   if (input->nDimension == 1)
20 |   {
21 |     batchSize = 1;
22 |     dim = input->size[0];
23 |     stride0 = 1;
24 |   }
25 |   else if (input->nDimension == 2)
26 |   {
27 |     batchSize = input->size[0];
28 |     dim = input->size[1];
29 |     stride0 = 1;
30 |   }
31 |   else if (input->nDimension == 3)
32 |   {
33 |     batchSize = 1;
34 |     dim = input->size[0];
35 |     blocksY = input->size[1];
36 |     blocksZ = input->size[2];
37 |     stride0 = blocksY * blocksZ;
38 |     stride1 = blocksZ;
39 |   }
40 |   else if (input->nDimension == 4)
41 |   {
42 |     batchSize = input->size[0];
43 |     dim = input->size[1];
44 |     blocksY = input->size[2];
45 |     blocksZ = input->size[3];
46 |     stride0 = blocksY * blocksZ;
47 |     stride1 = blocksZ;
48 |   }
49 |   else
50 |   {
51 |     THError("1D, 2D, 3D or 4D tensor expected");
52 |   }
53 | 
54 |   // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
55 |   if (blocksY * blocksZ < 65536)
56 |   {
57 |     blocksY *= blocksZ;
58 |     blocksZ = 1;
59 |     if (input->nDimension == 3 || input->nDimension == 4) {
60 |       stride0 = blocksY * blocksZ;
61 |       stride1 = blocksZ;
62 |     }
63 |   }
64 | 
65 |   dim3 blocks(batchSize, blocksY, blocksZ);
66 |   dim3 threads(SOFTMAX_THREADS);
67 |   cunn_SoftMax_updateOutput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
68 |     THCTensor_(data)(state, output),
69 |     THCTensor_(data)(state, input),
70 |     batchSize, dim, stride0, stride1
71 |   );
72 |   THCudaCheck(cudaGetLastError());
73 | 
74 |   THCTensor_(free)(state, input);
75 | }
76 | 
77 | void THNN_(SoftMax_updateGradInput)(
78 |   THCState *state,
79 |   THCTensor *input,
80 |   THCTensor *gradOutput,
81 |   THCTensor *gradInput,
82 |   THCTensor *output)
83 | {
84 |   THCUNN_check_nElement(state, input, gradOutput);
85 |   THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
86 | 
87 |   output = THCTensor_(newContiguous)(state, output);
88 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
89 | 
90 |   THCTensor_(resizeAs)(state, gradInput, output);
91 |   long batchSize, dim, stride0, stride1 = 1;
92 |   long blocksY = 1, blocksZ = 1;
93 | 
94 |   if (gradInput->nDimension == 1)
95 |   {
96 |     batchSize = 1;
97 |     dim = gradInput->size[0];
98 |     stride0 = 1;
99 |   }
100 |   else if (gradInput->nDimension == 2)
101 |   {
102 |     batchSize = gradInput->size[0];
103 |     dim = gradInput->size[1];
104 |     stride0 = 1;
105 |   }
106 |   else if (gradInput->nDimension == 3)
107 |   {
108 |     batchSize = 1;
109 |     dim = gradInput->size[0];
110 |     blocksY = gradInput->size[1];
111 |     blocksZ = gradInput->size[2];
112 |     stride0 = blocksY * blocksZ;
113 |     stride1 = blocksZ;
114 |   }
115 |   else if (gradInput->nDimension == 4)
116 |   {
117 |     batchSize = gradInput->size[0];
118 |     dim = gradInput->size[1];
119 |     blocksY = gradInput->size[2];
120 |     blocksZ = gradInput->size[3];
121 |     stride0 = blocksY * blocksZ;
122 |     stride1 = blocksZ;
123 |   }
124 |   else
125 |   {
126 |     THError("1D, 2D, 3D or 4D tensor expected");
127 |   }
128 | 
129 |   // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
130 |   if (blocksY * blocksZ < 65536)
131 |   {
132 |     blocksY *= blocksZ;
133 |     blocksZ = 1;
134 |     if (input->nDimension == 3 || input->nDimension == 4) {
135 |       stride0 = blocksY * blocksZ;
136 |       stride1 = blocksZ;
137 |     }
138 |   }
139 | 
140 |   dim3 blocks(batchSize, blocksY, blocksZ);
141 |   dim3 threads(SOFTMAX_THREADS);
142 |   cunn_SoftMax_updateGradInput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
143 |     THCTensor_(data)(state, gradInput),
144 |     THCTensor_(data)(state, output),
145 |     THCTensor_(data)(state, gradOutput),
146 |     batchSize, dim, stride0, stride1
147 |   );
148 |   THCudaCheck(cudaGetLastError());
149 | 
150 |   THCTensor_(free)(state, gradOutput);
151 |   THCTensor_(free)(state, output);
152 | }
153 | 
154 | #endif
155 | 
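Both launches flatten the input into (batchSize, dim, stride) coordinates: softmax is taken over `dim` (the channel axis for 3D/4D inputs) while stride0/stride1 locate the spatial position within a plane. A host-side sketch of the same indexing, with the max-subtraction softmax kernels conventionally use for numerical stability (illustrative, not the kernel code):

#include <algorithm>
#include <cmath>

// For a contiguous (N, C, H, W) input: stride0 = H*W, stride1 = W,
// blocksY = H, blocksZ = W; element (n, c, y, z) sits at
// n*dim*stride0 + c*stride0 + y*stride1 + z, matching the values computed above.
void softmax_ref(float* out, const float* in, long batchSize, long dim,
                 long stride0, long stride1, long blocksY, long blocksZ) {
  for (long n = 0; n < batchSize; ++n)
    for (long y = 0; y < blocksY; ++y)
      for (long z = 0; z < blocksZ; ++z) {
        const float* x = in + n * dim * stride0 + y * stride1 + z;
        float* o = out + n * dim * stride0 + y * stride1 + z;
        float mx = x[0];
        for (long c = 1; c < dim; ++c) mx = std::max(mx, x[c * stride0]);
        float sum = 0.f;
        for (long c = 0; c < dim; ++c) sum += std::exp(x[c * stride0] - mx);
        for (long c = 0; c < dim; ++c) o[c * stride0] = std::exp(x[c * stride0] - mx) / sum;
      }
}

After the 2D-grid fallback collapses blocksY and blocksZ, stride1 becomes 1 and y simply ranges over the whole plane, so the same formula still addresses every element.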
--------------------------------------------------------------------------------
/lib/THCUNN/SpatialUpSamplingBilinear.cu:
--------------------------------------------------------------------------------
1 | // Adapted from interp.cpp from Caffe util by Pauline Luc
2 | // Originally developed by George Papandreou
3 | #include "THCUNN.h"
4 | #include "common.h"
5 | #include "THCDeviceTensor.cuh"
6 | #include "THCDeviceTensorUtils.cuh"
7 | #include "THCDeviceUtils.cuh"
8 | #include "THCHalf.h"
9 | #include "THCHalfAutoNumerics.cuh"
10 | #include "THCAtomics.cuh"
11 | 
12 | template <typename Dtype, typename Acctype>
13 | __global__ void caffe_gpu_interp2_kernel(const int n,
14 |     const Acctype rheight, const Acctype rwidth,
15 |     const THCDeviceTensor<Dtype, 4> data1, THCDeviceTensor<Dtype, 4> data2) {
16 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
17 |   const int batchsize = data1.getSize(0);
18 |   const int channels = data1.getSize(1);
19 |   const int height1 = data1.getSize(2);
20 |   const int width1 = data1.getSize(3);
21 |   const int height2 = data2.getSize(2);
22 |   const int width2 = data2.getSize(3);
23 | 
24 |   if (index < n) {
25 |     const int w2 = index % width2; // 0:width2-1
26 |     const int h2 = index / width2; // 0:height2-1
27 |     // special case: just copy
28 |     if (height1 == height2 && width1 == width2) {
29 |       const int h1 = h2;
30 |       const int w1 = w2;
31 |       for (int n = 0; n < batchsize ; n++){
32 |         for (int c = 0; c < channels; ++c) {
33 |           const Dtype val = data1[n][c][h1][w1];
34 |           data2[n][c][h2][w2] = val;
35 |         }
36 |       }
37 |       return;
38 |     }
39 |     //
40 |     const Acctype h1r = rheight * h2;
41 |     const int h1 = h1r;
42 |     const int h1p = (h1 < height1 - 1) ? 1 : 0;
43 |     const Acctype h1lambda = h1r - h1;
44 |     const Acctype h0lambda = Acctype(1) - h1lambda;
45 |     //
46 |     const Acctype w1r = rwidth * w2;
47 |     const int w1 = w1r;
48 |     const int w1p = (w1 < width1 - 1) ? 1 : 0;
49 |     const Acctype w1lambda = w1r - w1;
50 |     const Acctype w0lambda = Acctype(1) - w1lambda;
51 |     //
52 |     for (int n = 0; n < batchsize ; n++){
53 |       for (int c = 0; c < channels; ++c) {
54 |         const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1]
55 |                             + w1lambda * data1[n][c][h1][w1+w1p])
56 |                             + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1]
57 |                             + w1lambda * data1[n][c][h1+h1p][w1+w1p]);
58 |         data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
59 |       }
60 |     }
61 |   }
62 | }
63 | 
64 | // Backward (adjoint) operation 1 <- 2 (accumulates)
65 | template <typename Dtype, typename Acctype>
66 | __global__ void caffe_gpu_interp2_kernel_backward(const int n,
67 |     const Acctype rheight, const Acctype rwidth,
68 |     THCDeviceTensor<Dtype, 4> data1, const THCDeviceTensor<Dtype, 4> data2){
69 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
70 |   const int batchsize = data1.getSize(0);
71 |   const int channels = data1.getSize(1);
72 |   const int height1 = data1.getSize(2);
73 |   const int width1 = data1.getSize(3);
74 |   const int height2 = data2.getSize(2);
75 |   const int width2 = data2.getSize(3);
76 |   if (index < n) {
77 |     const int w2 = index % width2; // 0:width2-1
78 |     const int h2 = index / width2; // 0:height2-1
79 |     // special case: just copy
80 |     if (height1 == height2 && width1 == width2) {
81 |       const int h1 = h2;
82 |       const int w1 = w2;
83 |       for (int n = 0; n < batchsize ; n++){
84 |         for (int c = 0; c < channels; ++c) {
85 |           const Dtype val = data2[n][c][h1][w1];
86 |           data1[n][c][h2][w2] += val;
87 |         }
88 |       }
89 |       return;
90 |     }
91 |     //
92 |     const Acctype h1r = rheight * h2;
93 |     const int h1 = h1r;
94 |     const int h1p = (h1 < height1 - 1) ? 1 : 0;
95 |     const Acctype h1lambda = h1r - h1;
96 |     const Acctype h0lambda = Acctype(1) - h1lambda;
97 |     //
98 |     const Acctype w1r = rwidth * w2;
99 |     const int w1 = w1r;
100 |     const int w1p = (w1 < width1 - 1) ? 1 : 0;
101 |     const Acctype w1lambda = w1r - w1;
102 |     const Acctype w0lambda = Acctype(1) - w1lambda;
103 |     //
104 |     for (int n = 0; n < batchsize ; n++){
105 |       for (int c = 0; c < channels; ++c) {
106 |         const Dtype d2val = data2[n][c][h2][w2];
107 |         atomicAdd(data1[n][c][h1][w1].data(),
108 |                   ScalarConvert<Acctype, Dtype>::to(h0lambda * w0lambda * d2val));
109 |         atomicAdd(data1[n][c][h1][w1+w1p].data(),
110 |                   ScalarConvert<Acctype, Dtype>::to(h0lambda * w1lambda * d2val));
111 |         atomicAdd(data1[n][c][h1+h1p][w1].data(),
112 |                   ScalarConvert<Acctype, Dtype>::to(h1lambda * w0lambda * d2val));
113 |         atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(),
114 |                   ScalarConvert<Acctype, Dtype>::to(h1lambda * w1lambda * d2val));
115 |       }
116 |     }
117 |   }
118 | }
119 | 
120 | 
121 | #include "generic/SpatialUpSamplingBilinear.cu"
122 | #include "THCGenerateFloatTypes.h"
123 | 
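The h1/h1p/lambda arithmetic above implements align-corners bilinear sampling: output pixel h2 maps to the real-valued source row rheight * h2, and the two neighboring rows are blended by the fractional part, with h1p guarding the border. A standalone sketch of the row weights (sizes are illustrative):

#include <cstdio>

int main() {
  int height1 = 4, height2 = 7;                      // upsample 4 rows -> 7 rows
  double rheight = height2 > 1 ? (double)(height1 - 1) / (height2 - 1) : 0.0;
  for (int h2 = 0; h2 < height2; ++h2) {
    double h1r = rheight * h2;
    int h1 = (int)h1r;                               // lower source row
    int h1p = (h1 < height1 - 1) ? 1 : 0;            // 0 on the last row, else 1
    double h1lambda = h1r - h1;                      // weight of row h1 + h1p
    printf("out %d <- %.3f*in[%d] + %.3f*in[%d]\n",
           h2, 1 - h1lambda, h1, h1lambda, h1 + h1p);
  }
  return 0;
}

Because rheight = (height1 - 1) / (height2 - 1), the first and last output rows land exactly on the first and last input rows (lambda is 0 there), which is the "corners map to corners" property this kernel family guarantees.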
--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialReplicationPadding.cu:
--------------------------------------------------------------------------------
1 | #ifndef THC_GENERIC_FILE
2 | #define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu"
3 | #else
4 | 
5 | void THNN_(SpatialReplicationPadding_updateOutput)(
6 |            THCState *state,
7 |            THCTensor *input,
8 |            THCTensor *output,
9 |            int padL, int padR,
10 |            int padT, int padB) {
11 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
12 |              "input tensor must fit into 32-bit index math");
13 | 
14 |   int planeDim = 0;
15 |   int dimh = 1;
16 |   int dimw = 2;
17 |   int numBatch = 1;
18 | 
19 |   int numInputDims = THCTensor_(nDimension)(state, input);
20 |   THCUNN_argCheck(state, numInputDims == 3 || numInputDims == 4, 2, input,
21 |                   "3D or 4D (batch mode) tensor expected for input, but got: %s")
22 | 
23 |   if (numInputDims == 4) {
24 |     numBatch = THCTensor_(size)(state, input, 0);
25 |     planeDim++;
26 |     dimh++;
27 |     dimw++;
28 |   }
29 | 
30 |   int numPlanes = THCTensor_(size)(state, input, planeDim);
31 |   int inputH = THCTensor_(size)(state, input, dimh);
32 |   int inputW = THCTensor_(size)(state, input, dimw);
33 |   int outputH = inputH + padT + padB;
34 |   int outputW = inputW + padL + padR;
35 | 
36 |   THArgCheck(outputW >= 1 || outputH >= 1 , 2,
37 |              "input (H: %d, W: %d) is too small."
38 |              " Calculated output H: %d W: %d",
39 |              inputH, inputW, outputH, outputW);
40 | 
41 |   THCDeviceTensor<real, 4> devInput;
42 |   THCDeviceTensor<real, 4> devOutput;
43 | 
44 |   if (numInputDims == 3) {
45 |     THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
46 | 
47 |     devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
48 |     devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
49 |   } else {
50 |     THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
51 | 
52 |     devInput = toDeviceTensor<real, 4>(state, input);
53 |     devOutput = toDeviceTensor<real, 4>(state, output);
54 |   }
55 | 
56 |   int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
57 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
58 |                 devOutput.getSize(1),
59 |                 devOutput.getSize(0));
60 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
61 | 
62 |   SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
63 |     devInput, devOutput, padT, padB, padL, padR);
64 | 
65 | }
66 | 
67 | void THNN_(SpatialReplicationPadding_updateGradInput)(
68 |   THCState *state,
69 |   THCTensor *input,
70 |   THCTensor *gradOutput,
71 |   THCTensor *gradInput,
72 |   int padL, int padR,
73 |   int padT, int padB) {
74 | 
75 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
76 |              "input tensor must fit into 32-bit index math");
77 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
78 |              "output gradient tensor must fit into 32-bit index math");
79 | 
80 |   int planeDim = 0;
81 |   int dimh = 1;
82 |   int dimw = 2;
83 | 
84 |   int numInputDims = THCTensor_(nDimension)(state, input);
85 |   if (numInputDims == 4) {
86 |     planeDim++;
87 |     dimh++;
88 |     dimw++;
89 |   }
90 |   int iheight = input->size[dimh];
91 |   int iwidth = input->size[dimw];
92 |   int oheight = iheight + padT + padB;
93 |   int owidth = iwidth + padL + padR;
94 | 
95 |   THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3,
96 |              "gradOutput width unexpected. Expected: %d, Got: %d",
97 |              owidth, THCTensor_(size)(state, gradOutput, dimw));
98 |   THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3,
99 |              "gradOutput height unexpected. Expected: %d, Got: %d",
100 |              oheight, THCTensor_(size)(state, gradOutput, dimh));
101 | 
102 |   THCTensor_(resizeAs)(state, gradInput, input);
103 |   THCTensor_(zero)(state, gradInput);
104 | 
105 |   THCDeviceTensor<real, 4> devGradInput;
106 |   THCDeviceTensor<real, 4> devGradOutput;
107 | 
108 |   if (numInputDims == 3) {
109 |     devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
110 |     devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
111 |   } else {
112 |     devGradInput = toDeviceTensor<real, 4>(state, gradInput);
113 |     devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
114 |   }
115 | 
116 |   int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
117 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
118 |                 devGradOutput.getSize(1),
119 |                 devGradOutput.getSize(0));
120 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
121 | 
122 |   SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
123 |     devGradInput, devGradOutput, padT, padB, padL, padR);
124 | 
125 | }
126 | 
127 | #endif
128 | 
--------------------------------------------------------------------------------
/THCUNN.lua:
--------------------------------------------------------------------------------
1 | local ffi = require 'ffi'
2 | local THNN = require 'nn.THNN'
3 | 
4 | local THCUNN = {}
5 | 
6 | -- load libTHCUNN
7 | THCUNN.C = ffi.load(package.searchpath('libTHCUNN', package.cpath))
8 | 
9 | -- load THC
10 | local THC = ffi.os == 'Windows' and ffi.load('THC') or ffi.C
11 | 
12 | local THCState_ptr = ffi.typeof('THCState*')
13 | 
14 | function THCUNN.getState()
15 |    return THCState_ptr(cutorch.getState());
16 | end
17 | 
18 | local THCUNN_generic_h = require 'cunn.THCUNN_generic_h'
19 | -- strip all lines starting with #
20 | -- to remove preprocessor directives originally present
21 | -- in THNN.h
22 | THCUNN_generic_h = THCUNN_generic_h:gsub("\n#[^\n]*", "")
23 | THCUNN_generic_h = THCUNN_generic_h:gsub("^#[^\n]*\n", "")
24 | 
25 | local preprocessed_generic = string.gsub(THCUNN_generic_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1')
26 | 
27 | local replacements =
28 | {
29 |    {
30 |       ['THTensor'] = 'THCudaTensor',
31 |       ['THCIndexTensor'] = 'THCudaLongTensor',
32 |       ['THIndex_t'] = 'long',
33 |       ['THInteger_t'] = 'float'
34 |    }
35 | }
36 | 
37 | local cct2lt = {
38 |    ['THCudaFloatTensor'] = 'torch.CudaTensor',
39 |    ['THCudaDoubleTensor'] = 'torch.CudaDoubleTensor',
40 | }
41 | 
42 | local replacements_generic =
43 | {
44 |   {
45 |     ['THCTensor'] = 'THCudaTensor',
46 |     ['THCIndexTensor'] = 'THCudaLongTensor',
47 |     ['TYPE'] = 'Cuda',
48 |     ['accreal'] = 'float',
49 |   },
50 |   {
51 |     ['THCTensor'] = 'THCudaDoubleTensor',
52 |     ['THCIndexTensor'] = 'THCudaLongTensor',
53 |     ['TYPE'] = 'CudaDouble',
54 |     ['accreal'] = 'double',
55 |   }
56 | }
57 | 
58 | if cutorch.hasHalf then
59 |   ffi.cdef("half THC_float2half(float a);")
60 |   ffi.cdef("float THC_half2float(half a);")
61 |   cct2lt['THCudaHalfTensor'] = 'torch.CudaHalfTensor'
62 |   local half_replacement = {
63 |     ['THCTensor'] = 'THCudaHalfTensor',
64 |     ['THCIndexTensor'] = 'THCudaLongTensor',
65 |     ['TYPE'] = 'CudaHalf',
66 |     ['accreal'] = 'float',
67 |   }
68 |   table.insert(replacements_generic, half_replacement)
69 | end
70 | 
71 | for i=1,#replacements_generic do
72 |    local r = replacements_generic[i]
73 |    local s = preprocessed_generic
74 |    for k,v in pairs(r) do
75 |       s = string.gsub(s, k, v)
76 |    end
77 |    ffi.cdef(s)
78 | end
79 | 
80 | local function extract_function_names_generic(s)
81 |    local t = {}
82 |    for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do
83 |       t[#t+1] = n
84 |    end
85 |    return t
86 | end
87 | 
88 | local function find_positions(s, p)
89 |    local begin = 0
90 |    local positions = {}
91 |    while true do
92 |       local start, stop = string.find(s, p, begin)
93 |       if (start == nil) then break end
94 |       positions[#positions+1] = start
95 |       begin = stop + 1
96 |    end
97 |    return positions
98 | end
99 | 
100 | local function extract_function_names_and_real_args(s)
101 |    local t = {}
102 |    for n in string.gmatch(s, 'TH_API ([^;]+)') do
103 |       local func_name = string.match(n, 'void THNN_%(([%a%d_]+)%)')
104 |       local param_positions = find_positions(n, ',')
105 |       local positions = {}
106 |       for x,y in ipairs(find_positions(n, 'real')) do
107 |          local found = false
108 |          for cn,cp in ipairs(param_positions) do
109 |             if cp > y then
110 |                positions[#positions+1] = cn
111 |                found = true
112 |                break
113 |             end
114 |          end
115 |          -- it is the last param
116 |          if not found then positions[#positions+1] = #param_positions + 1 end
117 |       end
118 | 
119 |       t[func_name] = positions
120 |    end
121 |    return t
122 | end
123 | 
124 | local real_args = extract_function_names_and_real_args(THCUNN_generic_h)
125 | 
126 | -- build function table
127 | local function_names_generic = extract_function_names_generic(THCUNN_generic_h)
128 | 
129 | THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'Cuda', THCUNN.getState)
130 | torch.getmetatable('torch.CudaTensor').THNN = THNN.kernels['torch.CudaTensor']
131 | 
132 | THNN.kernels['torch.CudaDoubleTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'CudaDouble', THCUNN.getState)
133 | torch.getmetatable('torch.CudaDoubleTensor').THNN = THNN.kernels['torch.CudaDoubleTensor']
134 | 
135 | if cutorch.hasHalf then
136 |    local raw_half_functions = THNN.bind(THCUNN.C, function_names_generic, 'CudaHalf', THCUNN.getState)
137 |    THNN.kernels['torch.CudaHalfTensor'] = raw_half_functions
138 |    torch.getmetatable('torch.CudaHalfTensor').THNN = THNN.kernels['torch.CudaHalfTensor']
139 | end
140 | 
141 | local function Module__converter(type)
142 |    return function(self)
143 |       return self:type(type)
144 |    end
145 | end
146 | 
147 | rawset(torch.getmetatable('nn.Module'), 'cudaDouble', Module__converter('torch.CudaDoubleTensor'))
148 | if cutorch.hasHalf then
149 |    rawset(torch.getmetatable('nn.Module'), 'cudaHalf', Module__converter('torch.CudaHalfTensor'))
150 | end
151 | return THCUNN
152 | 
--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialReflectionPadding.cu:
--------------------------------------------------------------------------------
1 | #ifndef THC_GENERIC_FILE
2 | #define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu"
3 | #else
4 | 
5 | void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state,
6 |                                                   THCTensor *input,
7 |                                                   THCTensor *output,
8 |                                                   int padL, int padR,
9 |                                                   int padT, int padB) {
10 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
11 |              "input tensor must fit into 32-bit index math");
12 | 
13 |   int planeDim = 0;
14 |   int dimh = 1;
15 |   int dimw = 2;
16 |   int numBatch = 1;
17 | 
18 |   int numInputDims = THCTensor_(nDimension)(state, input);
19 |   THCUNN_argCheck(state, numInputDims == 3 || numInputDims == 4, 2, input,
20 |                   "3D or 4D (batch mode) tensor expected for input, but got: %s")
21 | 
22 |   if (numInputDims == 4) {
23 |     numBatch = THCTensor_(size)(state, input, 0);
24 |     planeDim++;
25 |     dimh++;
26 |     dimw++;
27 |   }
28 | 
29 |   int numPlanes = THCTensor_(size)(state, input, planeDim);
30 |   int inputH = THCTensor_(size)(state, input, dimh);
31 |   int inputW = THCTensor_(size)(state, input, dimw);
32 |   int outputH = inputH + padT + padB;
33 |   int outputW = inputW + padL + padR;
34 | 
35 |   THArgCheck(outputW >= 1 || outputH >= 1 , 2,
36 |              "input (H: %d, W: %d) is too small."
37 |              " Calculated output H: %d W: %d",
38 |              inputH, inputW, outputH, outputW);
39 | 
40 |   THCDeviceTensor<real, 4> devInput;
41 |   THCDeviceTensor<real, 4> devOutput;
42 | 
43 |   if (numInputDims == 3) {
44 |     THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
45 | 
46 |     devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
47 |     devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
48 |   } else {
49 |     THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
50 | 
51 |     devInput = toDeviceTensor<real, 4>(state, input);
52 |     devOutput = toDeviceTensor<real, 4>(state, output);
53 |   }
54 | 
55 |   int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
56 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
57 |                 devOutput.getSize(1),
58 |                 devOutput.getSize(0));
59 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
60 | 
61 |   SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
62 |     devInput, devOutput, padT, padB, padL, padR);
63 |   THCudaCheck(cudaGetLastError());
64 | }
65 | 
66 | void THNN_(SpatialReflectionPadding_updateGradInput)(
67 |   THCState *state,
68 |   THCTensor *input,
69 |   THCTensor *gradOutput,
70 |   THCTensor *gradInput,
71 |   int padL, int padR,
72 |   int padT, int padB) {
73 | 
74 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
75 |              "input tensor must fit into 32-bit index math");
76 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
77 |              "output gradient tensor must fit into 32-bit index math");
78 | 
79 |   int planeDim = 0;
80 |   int dimh = 1;
81 |   int dimw = 2;
82 | 
83 |   int numInputDims = THCTensor_(nDimension)(state, input);
84 |   if (numInputDims == 4) {
85 |     planeDim++;
86 |     dimh++;
87 |     dimw++;
88 |   }
89 |   int iheight = input->size[dimh];
90 |   int iwidth = input->size[dimw];
91 |   int oheight = iheight + padT + padB;
92 |   int owidth = iwidth + padL + padR;
93 | 
94 |   THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3,
95 |              "gradOutput width unexpected. Expected: %d, Got: %d",
96 |              owidth, THCTensor_(size)(state, gradOutput, dimw));
97 |   THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3,
98 |              "gradOutput height unexpected. Expected: %d, Got: %d",
99 |              oheight, THCTensor_(size)(state, gradOutput, dimh));
100 | 
101 |   THCTensor_(resizeAs)(state, gradInput, input);
102 |   THCTensor_(zero)(state, gradInput);
103 | 
104 |   THCDeviceTensor<real, 4> devGradInput;
105 |   THCDeviceTensor<real, 4> devGradOutput;
106 | 
107 |   if (numInputDims == 3) {
108 |     devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
109 |     devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
110 |   } else {
111 |     devGradInput = toDeviceTensor<real, 4>(state, gradInput);
112 |     devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
113 |   }
114 | 
115 |   int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
116 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
117 |                 devGradOutput.getSize(1),
118 |                 devGradOutput.getSize(0));
119 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
120 | 
121 |   SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
122 |     devGradInput, devGradOutput, padT, padB, padL, padR);
123 |   THCudaCheck(cudaGetLastError());
124 | }
125 | 
126 | #endif
127 | 
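The replication and reflection wrappers are identical except for the kernel they launch (defined in lib/THCUNN/SpatialReplicationPadding.cu and SpatialReflectionPadding.cu): the two kernels differ only in how an output coordinate is mapped back to a source coordinate. A host-side sketch of the two index maps, assuming pad < inputSize as the reflection module requires (function names are illustrative, not library code):

#include <cstdio>

int replicate_src(int o, int pad, int inputSize) {
  int i = o - pad;
  return i < 0 ? 0 : (i >= inputSize ? inputSize - 1 : i);  // clamp to the border
}

int reflect_src(int o, int pad, int inputSize) {
  int i = o - pad;
  if (i < 0) i = -i;                                // mirror around index 0
  if (i >= inputSize) i = 2 * (inputSize - 1) - i;  // mirror around the last index
  return i;                                         // border pixel is not repeated
}

int main() {
  for (int o = 0; o < 8; ++o)  // inputSize 4, padL = padR = 2
    printf("out %d: replicate <- %d, reflect <- %d\n",
           o, replicate_src(o, 2, 4), reflect_src(o, 2, 4));
  return 0;
}

The same map runs in both directions: the forward kernel gathers input[src(o)] into output[o], and the backward kernel scatters gradOutput[o] into gradInput[src(o)], which is why gradInput is zeroed first and multiple output pixels may accumulate into one input pixel.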
--------------------------------------------------------------------------------
/lib/THCUNN/generic/MultiLabelMarginCriterion.cu:
--------------------------------------------------------------------------------
1 | #ifndef THC_GENERIC_FILE
2 | #define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu"
3 | #else
4 | 
5 | // TODO: improve error messages
6 | void THNN_(MultiLabelMarginCriterion_updateOutput)(
7 |   THCState *state,
8 |   THCTensor *input,
9 |   THCIndexTensor *target,
10 |   THCTensor *output,
11 |   THCTensor *istarget,
12 |   bool sizeaverage)
13 | {
14 |   input = THCTensor_(newContiguous)(state, input);
15 |   target = THCIndexTensor_(newContiguous)(state, target);
16 |   istarget = THCTensor_(newContiguous)(state, istarget);
17 |   THCTensor_(resizeAs)(state, istarget, input);
18 | 
19 |   if(input->nDimension == 1)
20 |   {
21 |     int dim = input->size[0];
22 |     THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
23 |                "inconsistent target size");
24 |     THCTensor_(resize1d)(state, output, 1);
25 | 
26 |     dim3 blocks(1);
27 |     dim3 threads(MULTILABELMARGIN_THREADS);
28 | 
29 |     cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
30 |       THCTensor_(data)(state, output),
31 |       THCTensor_(data)(state, input),
32 |       THCIndexTensor_(data)(state, target),
33 |       THCTensor_(data)(state, istarget),
34 |       1, dim,
35 |       sizeaverage
36 |       );
37 |     THCudaCheck(cudaGetLastError());
38 |   }
39 |   else if(input->nDimension == 2)
40 |   {
41 |     int nframe = input->size[0];
42 |     int dim = input->size[1];
43 |     THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
44 |                && (target->size[1] == dim), 3, "inconsistent target size");
45 |     THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]);
46 | 
47 |     dim3 blocks(input->size[0]);
48 |     dim3 threads(MULTILABELMARGIN_THREADS);
49 | 
50 |     cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
51 |       THCTensor_(data)(state, output_tmp),
52 |       THCTensor_(data)(state, input),
53 |       THCIndexTensor_(data)(state, target),
54 |       THCTensor_(data)(state, istarget),
55 |       nframe, dim,
56 |       sizeaverage
57 |       );
58 |     THCudaCheck(cudaGetLastError());
59 |     THCTensor_(resize1d)(state, output, 1);
60 |     THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, output_tmp)));
61 |     THCTensor_(free)(state, output_tmp);
62 |   }
63 |   else
64 |     THError("vector or matrix expected");
65 | 
66 |   THCTensor_(free)(state, input);
67 |   THCIndexTensor_(free)(state, target);
68 |   THCTensor_(free)(state, istarget);
69 | }
70 | 
71 | void THNN_(MultiLabelMarginCriterion_updateGradInput)(
72 |   THCState *state,
73 |   THCTensor *input,
74 |   THCIndexTensor *target,
75 |   THCTensor *gradInput,
76 |   THCTensor *istarget,
77 |   bool sizeaverage)
78 | {
79 |   input = THCTensor_(newContiguous)(state, input);
80 |   target = THCIndexTensor_(newContiguous)(state, target);
81 |   istarget = THCTensor_(newContiguous)(state, istarget);
82 |   THCTensor_(resizeAs)(state, gradInput, input);
83 | 
84 |   if(gradInput->nDimension == 1)
85 |   {
86 |     int dim = gradInput->size[0];
87 |     THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
88 |                "inconsistent target size");
89 |     THArgCheck((istarget->nDimension == 1) && (istarget->size[0] == dim), 3,
90 |                "inconsistent isTarget size");
91 |     dim3 blocks(1);
92 |     dim3 threads(MULTILABELMARGIN_THREADS);
93 | 
94 |     cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, gradInput),
95 |       THCTensor_(data)(state, input),
96 |       THCIndexTensor_(data)(state, target),
97 |       THCTensor_(data)(state, istarget),
98 |       1, gradInput->size[0],
99 |       sizeaverage);
100 | 
101 |   }
102 |   else if(gradInput->nDimension == 2)
103 |   {
104 |     int nframe = gradInput->size[0];
105 |     int dim = gradInput->size[1];
106 |     THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
107 |                && (target->size[1] == dim), 3, "inconsistent target size");
108 |     THArgCheck((istarget->nDimension == 2) && (istarget->size[0] == nframe)
109 |                && (istarget->size[1] == dim), 3, "inconsistent isTarget size");
110 |     dim3 blocks(gradInput->size[0]);
111 |     dim3 threads(MULTILABELMARGIN_THREADS);
112 | 
113 |     cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(THCTensor_(data)(state, gradInput),
114 |       THCTensor_(data)(state, input),
115 |       THCIndexTensor_(data)(state, target),
116 |       THCTensor_(data)(state, istarget),
117 |       gradInput->size[0], gradInput->size[1],
118 |       sizeaverage);
119 |   }
120 |   else
121 |     THError("vector or matrix expected");
122 | 
123 |   THCudaCheck(cudaGetLastError());
124 | 
125 |   THCTensor_(free)(state, input);
126 |   THCIndexTensor_(free)(state, target);
127 |   THCTensor_(free)(state, istarget);
128 | }
129 | 
130 | #endif
131 | 
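For reference, the per-sample quantity the kernels above reduce: every pair of a target class t and a non-target class c contributes max(0, 1 - (x[t] - x[c])), and the sum is normalized by the input dimension; with sizeaverage the 2D path additionally divides the summed per-row losses by nframe. A host sketch under the THNN convention that the target list is terminated by a padding entry after the real labels (modelled here as any negative value; illustrative, not library code):

#include <algorithm>
#include <vector>

float multilabel_margin_ref(const std::vector<float>& x, const std::vector<int>& target) {
  int dim = (int)x.size();
  std::vector<bool> isTarget(dim, false);   // the `istarget` buffer the kernel also fills
  int nTargets = 0;
  for (int t : target) { if (t < 0) break; isTarget[t] = true; ++nTargets; }
  float sum = 0.f;
  for (int k = 0; k < nTargets; ++k)
    for (int c = 0; c < dim; ++c)
      if (!isTarget[c]) sum += std::max(0.f, 1.f - (x[target[k]] - x[c]));
  return sum / dim;  // per-sample loss, before any sizeaverage division
}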
--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu:
--------------------------------------------------------------------------------
1 | #ifndef THC_GENERIC_FILE
2 | #define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu"
3 | #else
4 | 
5 | static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
6 |      (THCState *state,
7 |       THCTensor *input, THCTensor *gradOutput,
8 |       int nBatch, int nChannels,
9 |       int inputHeight, int inputWidth,
10 |       int outputHeight, int outputWidth) {
11 |   THArgCheck(inputHeight > 0 && inputWidth > 0
12 |              && outputHeight > 0 && outputWidth > 0, 2,
13 |              "input and output sizes should be greater than 0,"
14 |              " but got input (H: %d, W: %d) output (H: %d, W: %d)",
15 |              inputHeight, inputWidth, outputHeight, outputWidth);
16 |   if (input != NULL) {
17 |     THCUNN_argCheck(state, input->nDimension == 4, 2, input,
18 |                     "4D input tensor expected but got: %s");
19 |   }
20 | 
21 |   if (gradOutput != NULL) {
22 |     THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch);
23 |     THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels);
24 |     THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight);
25 |     THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth);
26 |   }
27 | }
28 | 
29 | void THNN_(SpatialUpSamplingBilinear_updateOutput)(
30 |            THCState *state,
31 |            THCTensor *input,
32 |            THCTensor *output,
33 |            int outputHeight,
34 |            int outputWidth)
35 | {
36 |   int nbatch = THCTensor_(size)(state, input, 0);
37 |   int channels = THCTensor_(size)(state, input, 1);
38 |   int inputHeight = THCTensor_(size)(state, input, 2);
39 |   int inputWidth = THCTensor_(size)(state, input, 3);
40 |   THNN_(SpatialUpSamplingBilinear_shapeCheck)
41 |     (state, input, NULL,
42 |      nbatch, channels,
43 |      inputHeight, inputWidth,
44 |      outputHeight, outputWidth);
45 |   input = THCTensor_(newContiguous)(state, input);
46 |   THCUNN_assertSameGPU(state, 2, input, output);
47 |   THCTensor_(resize4d)(state, output,
48 |                        THCTensor_(size)(state, input, 0),
49 |                        THCTensor_(size)(state, input, 1),
50 |                        outputHeight, outputWidth);
51 |   THCTensor_(zero)(state, output);
52 |   THCDeviceTensor<real, 4> idata = toDeviceTensor<real, 4>(state, input);
53 |   THCDeviceTensor<real, 4> odata = toDeviceTensor<real, 4>(state, output);
54 |   THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
55 |   const accreal rheight = (outputHeight > 1) ? (accreal)(inputHeight - 1)/(outputHeight - 1) : accreal(0);
56 |   const accreal rwidth = (outputWidth > 1) ? (accreal)(inputWidth - 1)/(outputWidth - 1) : accreal(0);
57 |   const int num_kernels = outputHeight * outputWidth;
58 |   const int num_threads =
59 |     THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
60 |   cudaStream_t stream = THCState_getCurrentStream(state);
61 |   caffe_gpu_interp2_kernel<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
62 |     0, stream>>>(num_kernels, rheight, rwidth, idata, odata);
63 |   THCudaCheck(cudaGetLastError());
64 |   THCTensor_(free)(state, input);
65 | }
66 | 
67 | 
68 | void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
69 |            THCState *state,
70 |            THCTensor *gradOutput,
71 |            THCTensor *gradInput,
72 |            int nbatch,
73 |            int nchannels,
74 |            int inputHeight,
75 |            int inputWidth,
76 |            int outputHeight,
77 |            int outputWidth)
78 | {
79 |   THNN_(SpatialUpSamplingBilinear_shapeCheck)
80 |     (state, NULL, gradOutput,
81 |      nbatch, nchannels,
82 |      inputHeight, inputWidth,
83 |      outputHeight, outputWidth);
84 |   gradInput = THCTensor_(newContiguous)(state, gradInput);
85 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
86 |   THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
87 |   THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth);
88 |   THCTensor_(zero)(state, gradInput);
89 |   THCDeviceTensor<real, 4> data1 = toDeviceTensor<real, 4>(state, gradInput);
90 |   THCDeviceTensor<real, 4> data2 = toDeviceTensor<real, 4>(state, gradOutput);
91 |   int height1 = data1.getSize(2);
92 |   int width1 = data1.getSize(3);
93 |   int height2 = data2.getSize(2);
94 |   int width2 = data2.getSize(3);
95 |   assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
96 |   const accreal rheight = (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
97 |   const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1) / (width2 - 1) : accreal(0);
98 |   const int num_kernels = height2 * width2;
99 |   const int num_threads =
100 |     THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
101 |   cudaStream_t stream = THCState_getCurrentStream(state);
102 |   caffe_gpu_interp2_kernel_backward<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
103 |     0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
104 |   THCudaCheck(cudaGetLastError());
105 |   THCTensor_(free)(state, gradInput);
106 |   THCTensor_(free)(state, gradOutput);
107 | }
108 | 
109 | #endif
110 | 
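The backward pass is the exact adjoint of the forward interpolation: each gradOutput pixel splits its value among the four input pixels that produced it, and since neighboring output pixels hit overlapping inputs, the kernel must accumulate with atomicAdd into the gradInput that was zeroed above. A 1D host sketch of the same scatter (illustrative, not library code):

#include <vector>

void upsample1d_backward_ref(std::vector<float>& gradIn, const std::vector<float>& gradOut) {
  int in = (int)gradIn.size(), out = (int)gradOut.size();
  double r = out > 1 ? (double)(in - 1) / (out - 1) : 0.0;  // same align-corners ratio
  for (int o = 0; o < out; ++o) {
    double p = r * o;
    int i = (int)p, ip = (i < in - 1) ? 1 : 0;
    double lambda = p - i;
    gradIn[i]      += (float)((1 - lambda) * gradOut[o]);   // accumulate, never assign
    gradIn[i + ip] += (float)(lambda * gradOut[o]);
  }
}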
--------------------------------------------------------------------------------
/lib/THCUNN/SpatialCrossMapLRN.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "THCHalf.h"
3 | #include "THCHalfAutoNumerics.cuh"
4 | #include "common.h"
5 | 
6 | template <typename Dtype, typename Acctype>
7 | __global__ void
8 | #if __CUDA_ARCH__ >= 320
9 | __launch_bounds__(CUDA_NUM_THREADS)
10 | #endif
11 | LRNFillScale(const int nthreads, const Dtype* const in,
12 |     const int num, const int channels, const int height,
13 |     const int width, const int size, const Dtype alpha_over_size,
14 |     const Dtype k, Dtype* const scale) {
15 |   CUDA_KERNEL_LOOP(index, nthreads) {
16 |     // find out the local offset
17 |     const int w = index % width;
18 |     const int h = (index / width) % height;
19 |     const int n = index / width / height;
20 |     const int offset = (n * channels * height + h) * width + w;
21 |     const int step = height * width;
22 |     const Dtype* const in_off = in + offset;
23 |     Dtype* const scale_off = scale + offset;
24 |     int head = 0;
25 |     const int pre_pad = (size - 1) / 2;
26 |     const int post_pad = size - pre_pad - 1;
27 |     Acctype accum_scale = Acctype(0);
28 |     // fill the scale at [n, :, h, w]
29 |     // accumulate values
30 |     while (head < post_pad && head < channels) {
31 |       accum_scale += in_off[head * step] * in_off[head * step];
32 |       ++head;
33 |     }
34 |     // both add and subtract
35 |     while (head < channels) {
36 |       accum_scale += in_off[head * step] * in_off[head * step];
37 |       if (head - size >= 0) {
38 |         accum_scale -= in_off[(head - size) * step]
39 |                        * in_off[(head - size) * step];
40 |       }
41 |       scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
42 |       ++head;
43 |     }
44 |     // subtract only
45 |     while (head < channels + post_pad) {
46 |       if (head - size >= 0) {
47 |         accum_scale -= in_off[(head - size) * step]
48 |                        * in_off[(head - size) * step];
49 |       }
50 |       scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
51 |       ++head;
52 |     }
53 |   }
54 | }
55 | 
56 | template <typename Dtype>
57 | __global__ void LRNComputeOutput(const int nthreads, const Dtype* in,
58 |     const Dtype* scale, const Dtype negative_beta, Dtype* out) {
59 |   CUDA_KERNEL_LOOP(index, nthreads) {
60 |     out[index] = in[index] * pow(scale[index], negative_beta);
61 |   }
62 | }
63 | 
64 | template <typename Dtype, typename Acctype>
65 | __global__ void LRNComputeDiff(const int nthreads,
66 |     const Dtype* const bottom_data, const Dtype* const top_data,
67 |     const Dtype* const scale, const Dtype* const top_diff,
68 |     const int num, const int channels, const int height,
69 |     const int width, const int size, const Dtype negative_beta,
70 |     const Dtype cache_ratio, Dtype* const bottom_diff) {
71 |   CUDA_KERNEL_LOOP(index, nthreads) {
72 |     // find out the local offset
73 |     const int w = index % width;
74 |     const int h = (index / width) % height;
75 |     const int n = index / width / height;
76 |     const int offset = (n * channels * height + h) * width + w;
77 |     const int step = height * width;
78 |     const Dtype* const bottom_off = bottom_data + offset;
79 |     const Dtype* const top_off = top_data + offset;
80 |     const Dtype* const scale_off = scale + offset;
81 |     const Dtype* const top_diff_off = top_diff + offset;
82 |     Dtype* const bottom_diff_off = bottom_diff + offset;
83 |     int head = 0;
84 |     const int pre_pad = size - (size + 1) / 2;
85 |     const int post_pad = size - pre_pad - 1;
86 |     Acctype accum_ratio = Acctype(0);
87 |     // accumulate values
88 |     while (head < post_pad && head < channels) {
89 |       accum_ratio += top_diff_off[head * step] * top_off[head * step] /
90 |                      scale_off[head * step];
91 |       ++head;
92 |     }
93 |     // both add and subtract
94 |     while (head < channels) {
95 |       accum_ratio += top_diff_off[head * step] * top_off[head * step] /
96 |                      scale_off[head * step];
97 |       if (head - size >= 0) {
98 |         accum_ratio -= top_diff_off[(head - size) * step] *
99 |                        top_off[(head - size) * step] / scale_off[(head - size) * step];
100 |       }
101 |       bottom_diff_off[(head - post_pad) * step] =
102 |         ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
103 |           * pow(scale_off[(head - post_pad) * step], negative_beta)
104 |           - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
105 |       ++head;
106 |     }
107 |     // subtract only
108 |     while (head < channels + post_pad) {
109 |       if (head - size >= 0) {
110 |         accum_ratio -= top_diff_off[(head - size) * step] *
111 |                        top_off[(head - size) * step] / scale_off[(head - size) * step];
112 |       }
113 |       bottom_diff_off[(head - post_pad) * step] =
114 |         ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
115 |           * pow(scale_off[(head - post_pad) * step], negative_beta)
116 |           - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
117 |       ++head;
118 |     }
119 |   }
120 | }
121 | 
122 | 
123 | #include "generic/SpatialCrossMapLRN.cu"
124 | #include "THCGenerateFloatTypes.h"
125 | 
--------------------------------------------------------------------------------