├── .gitignore
├── lib
    ├── THCUNN
    │   ├── SpatialMaxPooling.cu
    │   ├── TemporalConvolution.cu
    │   ├── SpatialFullConvolution.cu
    │   ├── VolumetricFullConvolution.cu
    │   ├── SpatialFullDilatedConvolution.cu
    │   ├── SpatialConvolutionMM.cu
    │   ├── VolumetricDilatedConvolution.cu
    │   ├── SpatialConvolutionLocal.cu
    │   ├── SpatialDilatedConvolution.cu
    │   ├── TemporalRowConvolution.cu
    │   ├── VolumetricFullDilatedConvolution.cu
    │   ├── SpatialDepthWiseConvolution.cu
    │   ├── VolumetricMaxPooling.cu
    │   ├── THCUNN.h
    │   ├── Abs.cu
    │   ├── Square.cu
    │   ├── SharedMem.cuh
    │   ├── Sqrt.cu
    │   ├── generic
    │   │   ├── Sigmoid.cu
    │   │   ├── Abs.cu
    │   │   ├── Tanh.cu
    │   │   ├── Square.cu
    │   │   ├── LogSigmoid.cu
    │   │   ├── Sqrt.cu
    │   │   ├── SpatialMaxPooling.cu
    │   │   ├── SoftShrink.cu
    │   │   ├── VolumetricMaxPooling.cu
    │   │   ├── SoftPlus.cu
    │   │   ├── L1Cost.cu
    │   │   ├── ELU.cu
    │   │   ├── LeakyReLU.cu
    │   │   ├── HardTanh.cu
    │   │   ├── Threshold.cu
    │   │   ├── SpatialFullConvolution.cu
    │   │   ├── VolumetricFullConvolution.cu
    │   │   ├── AbsCriterion.cu
    │   │   ├── SoftMarginCriterion.cu
    │   │   ├── MarginCriterion.cu
    │   │   ├── MSECriterion.cu
    │   │   ├── DistKLDivCriterion.cu
    │   │   ├── GatedLinearUnit.cu
    │   │   ├── SmoothL1Criterion.cu
    │   │   ├── RReLU.cu
    │   │   ├── SpatialMaxUnpooling.cu
    │   │   ├── BCECriterion.cu
    │   │   ├── SpatialCrossMapLRN.cu
    │   │   ├── BatchNormalization.cu
    │   │   ├── SoftMax.cu
    │   │   ├── SpatialReplicationPadding.cu
    │   │   ├── SpatialReflectionPadding.cu
    │   │   ├── MultiLabelMarginCriterion.cu
    │   │   └── SpatialUpSamplingBilinear.cu
    │   ├── L1Cost.cu
    │   ├── AbsCriterion.cu
    │   ├── Sigmoid.cu
    │   ├── DistKLDivCriterion.cu
    │   ├── SoftShrink.cu
    │   ├── Tanh.cu
    │   ├── GatedLinearUnit.cu
    │   ├── SoftMarginCriterion.cu
    │   ├── SoftPlus.cu
    │   ├── MSECriterion.cu
    │   ├── MarginCriterion.cu
    │   ├── SpatialMaxUnpooling.cu
    │   ├── FusedRNNKernel.cu
    │   ├── SmoothL1Criterion.cu
    │   ├── ELU.cu
    │   ├── HardTanh.cu
    │   ├── LeakyReLU.cu
    │   ├── Threshold.cu
    │   ├── SpatialUpSamplingNearest.cu
    │   ├── SpatialReplicationPadding.cu
    │   ├── BCECriterion.cu
    │   ├── VolumetricMaxUnpooling.cu
    │   ├── PReLU.cu
    │   ├── VolumetricUpSamplingNearest.cu
    │   ├── SparseLinear.cu
    │   ├── LogSigmoid.cu
    │   ├── MultiMarginCriterion.cu
    │   ├── SpatialReflectionPadding.cu
    │   ├── SpatialClassNLLCriterion.cu
    │   ├── RReLU.cu
    │   ├── VolumetricReplicationPadding.cu
    │   ├── SoftMax.cu
    │   ├── SpatialDilatedMaxPooling.cu
    │   ├── common.h
    │   ├── TemporalMaxPooling.cu
    │   ├── CMakeLists.txt
    │   ├── row2col.h
    │   ├── SpatialAveragePooling.cu
    │   ├── SpatialFractionalMaxPooling.cu
    │   ├── SpatialUpSamplingBilinear.cu
    │   └── SpatialCrossMapLRN.cu
    └── CMakeLists.txt
├── init.lua
├── rocks
    ├── version.sh
    ├── cunn-scm-1.rockspec
    └── cunn-1.0-0.rockspec
├── CMakeLists.txt
├── LICENSE
├── doc
    └── cunnmodules.md
├── README.md
└── THCUNN.lua


/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | THCUNN_h.lua
3 | THCUNN_generic_h.lua
4 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialMaxPooling.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | 
3 | #include "generic/SpatialMaxPooling.cu"
4 | #include "THCGenerateFloatTypes.h"
5 | 


--------------------------------------------------------------------------------
/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
2 | SET(THCUNN_INSTALL_LIB_SUBDIR "${Torch_INSTALL_LUA_CPATH_SUBDIR}")
3 | SET(THCUNN_INSTALL_INCLUDE_SUBDIR "${Torch_INSTALL_INCLUDE_SUBDIR}")
4 | ADD_SUBDIRECTORY(THCUNN)


--------------------------------------------------------------------------------
/lib/THCUNN/TemporalConvolution.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "common.h"
3 | #include "THCHalf.h"
4 | #include "THCHalfAutoNumerics.cuh"
5 | 
6 | #include "generic/TemporalConvolution.cu"
7 | #include "THCGenerateFloatTypes.h"
8 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialFullConvolution.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "im2col.h"
3 | 
4 | #include "THCHalf.h"
5 | #include "THCHalfAutoNumerics.cuh"
6 | 
7 | #include "generic/SpatialFullConvolution.cu"
8 | #include "THCGenerateFloatTypes.h"
9 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricFullConvolution.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "common.h"
3 | #include "THCHalf.h"
4 | #include "THCHalfAutoNumerics.cuh"
5 | 
6 | #include "generic/VolumetricFullConvolution.cu"
7 | #include "THCGenerateFloatTypes.h"
8 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialFullDilatedConvolution.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "im2col.h"
3 | 
4 | #include "THCHalf.h"
5 | #include "THCHalfAutoNumerics.cuh"
6 | 
7 | #include "generic/SpatialFullDilatedConvolution.cu"
8 | #include "THCGenerateFloatTypes.h"
9 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialConvolutionMM.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "im2col.h"
 4 | 
 5 | #include "THCHalf.h"
 6 | #include "THCHalfAutoNumerics.cuh"
 7 | 
 8 | #include "generic/SpatialConvolutionMM.cu"
 9 | #include "THCGenerateFloatTypes.h"
10 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricDilatedConvolution.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "common.h"
3 | #include "vol2col.h"
4 | #include "THCHalf.h"
5 | #include "THCHalfAutoNumerics.cuh"
6 | 
7 | #include "generic/VolumetricDilatedConvolution.cu"
8 | #include "THCGenerateFloatTypes.h"
9 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialConvolutionLocal.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "im2col.h"
 4 | 
 5 | #include "THCHalf.h"
 6 | #include "THCHalfAutoNumerics.cuh"
 7 | 
 8 | #include "generic/SpatialConvolutionLocal.cu"
 9 | #include "THCGenerateFloatTypes.h"
10 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialDilatedConvolution.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "im2col.h"
 4 | 
 5 | #include "THCHalf.h"
 6 | #include "THCHalfAutoNumerics.cuh"
 7 | 
 8 | #include "generic/SpatialDilatedConvolution.cu"
 9 | #include "THCGenerateFloatTypes.h"
10 | 


--------------------------------------------------------------------------------
/lib/THCUNN/TemporalRowConvolution.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "row2col.h"
 4 | 
 5 | #include "THCHalf.h"
 6 | #include "THCHalfAutoNumerics.cuh"
 7 | 
 8 | #include "generic/TemporalRowConvolution.cu"
 9 | 
10 | #include "THCGenerateFloatTypes.h"
11 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricFullDilatedConvolution.cu:
--------------------------------------------------------------------------------
1 | #include "THCUNN.h"
2 | #include "common.h"
3 | #include "vol2col.h"
4 | #include "THCHalf.h"
5 | #include "THCHalfAutoNumerics.cuh"
6 | 
7 | #include "generic/VolumetricFullDilatedConvolution.cu"
8 | #include "THCGenerateFloatTypes.h"
9 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialDepthWiseConvolution.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "im2col.h"
 4 | 
 5 | #include "THCHalf.h"
 6 | #include "THCHalfAutoNumerics.cuh"
 7 | 
 8 | #include "generic/SpatialDepthWiseConvolution.cu"
 9 | #include "THCGenerateFloatTypes.h"
10 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricMaxPooling.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCDeviceTensor.cuh"
 4 | #include "THCDeviceTensorUtils.cuh"
 5 | #include "THCDeviceUtils.cuh"
 6 | 
 7 | #include <cfloat>
 8 | 
 9 | #include "generic/VolumetricMaxPooling.cu"
10 | #include "THCGenerateFloatTypes.h"
11 | 


--------------------------------------------------------------------------------
/lib/THCUNN/THCUNN.h:
--------------------------------------------------------------------------------
 1 | #include <THC/THC.h>
 2 | 
 3 | #define THCIndexTensor THCudaLongTensor
 4 | #define THCIndexTensor_(NAME) THCudaLongTensor_ ## NAME
 5 | typedef long THCIndex_t;
 6 | 
 7 | #define THNN_(NAME) TH_CONCAT_3(THNN_, CReal, NAME)
 8 | 
 9 | #include "generic/THCUNN.h"
10 | #include <THC/THCGenerateFloatTypes.h>
11 | 


--------------------------------------------------------------------------------
/init.lua:
--------------------------------------------------------------------------------
 1 | cunn = nil
 2 | 
 3 | require "cutorch"
 4 | require "nn"
 5 | require "cunn.THCUNN"
 6 | 
 7 | require('cunn.test')
 8 | require('cunn.DataParallelTable')
 9 | 
10 | nn.Module._flattenTensorBuffer['torch.CudaTensor'] = torch.FloatTensor.new
11 | nn.Module._flattenTensorBuffer['torch.CudaDoubleTensor'] = torch.DoubleTensor.new
12 | -- FIXME: change this to torch.HalfTensor when available
13 | nn.Module._flattenTensorBuffer['torch.CudaHalfTensor'] = torch.FloatTensor.new
14 | 


--------------------------------------------------------------------------------
/lib/THCUNN/Abs.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct absupdateOutput_functor
 8 | {
 9 |   __device__ void operator()(T* output, const T* input) const
10 |   {
11 |     *output = abs(*input);
12 |   }
13 | };
14 | 
15 | template <typename T>
16 | struct absupdateGradInput_functor
17 | {
18 |   __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
19 |   {
20 |     *gradInput = *input < 0 ? - *gradOutput : *gradOutput;
21 |   }
22 | };
23 | 
24 | #include "generic/Abs.cu"
25 | #include "THCGenerateFloatTypes.h"
26 | 


--------------------------------------------------------------------------------
/lib/THCUNN/Square.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct squareupdateOutput_functor
 8 | {
 9 |   __device__ void operator()(T* output, const T* input) const
10 |   {
11 |     *output = (*input) * (*input);
12 |   }
13 | };
14 | 
15 | template <typename T>
16 | struct squareupdateGradInput_functor
17 | {
18 |   __device__ void operator()(T* gradInput, const T* input, const T* gradOutput) const
19 |   {
20 |     *gradInput = ScalarConvert<double, T>::to(2.0) * (*gradOutput) * (*input);
21 |   }
22 | };
23 | 
24 | #include "generic/Square.cu"
25 | #include "THCGenerateFloatTypes.h"
26 | 


--------------------------------------------------------------------------------
/rocks/version.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd "$(dirname "$0")"
 3 | fname=$(ls|grep rockspec|grep -v scm | sort -r -V|head -n1)
 4 | echo "Last known version: $fname" 
 5 | luarocks new_version $fname
 6 | 
 7 | new_fname=$(ls|grep rockspec|grep -v scm | sort -r -V|head -n1)
 8 | new_version=$(echo $new_fname | cut -f2,3,4,5 -d'-'|sed -e 's/.rockspec//g')
 9 | echo "new rockspec: $new_fname"
10 | echo "new version: $new_version"
11 | git add $new_fname
12 | git commit -m "Cutting version $new_version"
13 | git branch $new_version
14 | 
15 | git push origin master:master
16 | git push origin $new_version:$new_version
17 | 
18 | git clone https://github.com/torch/rocks
19 | cp $new_fname rocks/
20 | cd rocks
21 | th make-manifest.lua
22 | git add $new_fname
23 | git commit -am "adding rockspec $new_fname"
24 | git push
25 | cd ..
26 | rm -rf rocks
27 | cd ..
28 | 
29 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SharedMem.cuh:
--------------------------------------------------------------------------------
 1 | // Based on the simpleTempltes CUDA example
 2 | 
 3 | #ifndef THCUNN_SHAREDMEM_H
 4 | #define THCUNN_SHAREDMEM_H
 5 | 
 6 | template <typename T>
 7 | struct SharedMem {
 8 |   __device__ T *getPointer()
 9 |   {
10 |     extern __device__ void error(void);
11 |     error();
12 |     return NULL;
13 |   }
14 | };
15 | 
16 | #ifdef CUDA_HALF_TENSOR
17 | template <>
18 | struct SharedMem<half>
19 | {
20 |   __device__ half *getPointer() {
21 |     extern __shared__ half s_half[];
22 |     return s_half;
23 |   }
24 | };
25 | #endif
26 | 
27 | template <>
28 | struct SharedMem<float>
29 | {
30 |   __device__ float *getPointer() {
31 |     extern __shared__ float s_float[];
32 |     return s_float;
33 |   }
34 | };
35 | 
36 | template <>
37 | struct SharedMem<double>
38 | {
39 |   __device__ double *getPointer() {
40 |     extern __shared__ double s_double[];
41 |     return s_double;
42 |   }
43 | };
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/lib/THCUNN/Sqrt.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct sqrtupdateOutput_functor
 8 | {
 9 |   const T bias;
10 | 
11 |   sqrtupdateOutput_functor(T bias_)
12 |     : bias(bias_)
13 |   {}
14 | 
15 |   __device__ void operator()(T *output, const T *input) const
16 |   {
17 |     *output = sqrt(*input + bias);
18 |   }
19 | };
20 | 
21 | template <typename T>
22 | struct sqrtupdateGradInput_functor
23 | {
24 |   sqrtupdateGradInput_functor() {}
25 | 
26 |   __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
27 |   {
28 |     *gradInput = (THCNumerics<T>::eq(*output,ScalarConvert<float, T>::to(0.0f))) ? ScalarConvert<float, T>::to(0.0f) : ((ScalarConvert<float, T>::to(0.5f) * *gradOutput) / *output);
29 |   }
30 | };
31 | 
32 | #include "generic/Sqrt.cu"
33 | #include "THCGenerateFloatTypes.h"
34 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/Sigmoid.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/Sigmoid.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(Sigmoid_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output)
11 | {
12 |   THCUNN_assertSameGPU(state, 2, input, output);
13 |   THCTensor_(sigmoid)(state, output, input);
14 | }
15 | 
16 | void THNN_(Sigmoid_updateGradInput)(
17 |            THCState *state,
18 |            THCTensor *input,
19 |            THCTensor *gradOutput,
20 |            THCTensor *gradInput,
21 |            THCTensor *output)
22 | {
23 |   THCUNN_check_nElement(state, output, gradOutput);
24 |   THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
25 |   THCTensor_(resizeAs)(state, gradInput, output);
26 |   THC_pointwiseApply3(state, gradInput, output, gradOutput, sigmoid_updateGradInput_functor<real>());
27 | }
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/lib/THCUNN/L1Cost.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | 
 6 | #include <thrust/device_ptr.h>
 7 | #include <thrust/reduce.h>
 8 | #include <thrust/transform_reduce.h>
 9 | 
10 | template <typename Dtype, typename Acctype>
11 | struct l1cost_functor
12 | {
13 |   __host__ __device__ Acctype operator()(Dtype x) const
14 |   {
15 |     return THCNumerics<Acctype>::abs(ScalarConvert<Dtype, Acctype>::to(x));
16 |   }
17 | };
18 | 
19 | template <typename Dtype>
20 | struct l1cost_updateGradInput_functor
21 | {
22 |   __host__ __device__ Dtype operator()(Dtype x) const
23 |   {
24 |     if (x > 0)
25 |       return ScalarConvert<int, Dtype>::to(1);
26 |     else if (x < 0)
27 |       return ScalarConvert<int, Dtype>::to(-1);
28 |     else
29 |       return ScalarConvert<int, Dtype>::to(0);
30 |   }
31 | };
32 | 
33 | #include "generic/L1Cost.cu"
34 | #include "THCGenerateFloatTypes.h"
35 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/Abs.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/Abs.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(Abs_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output)
11 | {
12 |   THCUNN_assertSameGPU(state, 2, input, output);
13 |   THCTensor_(resizeAs)(state, output, input);
14 |   THC_pointwiseApply2(state, output, input, absupdateOutput_functor<real>());
15 | }
16 | 
17 | void THNN_(Abs_updateGradInput)(
18 |            THCState *state,
19 |            THCTensor *input,
20 |            THCTensor *gradOutput,
21 |            THCTensor *gradInput)
22 | {
23 |   THCUNN_check_nElement(state, input, gradOutput);
24 |   THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
25 |   THCTensor_(resizeAs)(state, gradInput, input);
26 |   THC_pointwiseApply3(state, gradInput, input, gradOutput, absupdateGradInput_functor<real>());
27 | }
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/Tanh.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/Tanh.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(Tanh_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output)
11 | {
12 |   THCUNN_assertSameGPU(state, 2, input, output);
13 |   THCTensor_(resizeAs)(state, output, input);
14 |   THCTensor_(tanh)(state, output, input);
15 | }
16 | 
17 | void THNN_(Tanh_updateGradInput)(
18 |            THCState *state,
19 |            THCTensor *input,
20 |            THCTensor *gradOutput,
21 |            THCTensor *gradInput,
22 |            THCTensor *output)
23 | {
24 |   THCUNN_check_shape(state, output, gradOutput);
25 |   THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
26 |   THCTensor_(resizeAs)(state, gradInput, output);
27 |   THC_pointwiseApply3(state, gradInput, output, gradOutput, tanh_updateGradInput_functor<real>());
28 | }
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/Square.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/Square.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(Square_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output)
11 | {
12 |   THCUNN_assertSameGPU(state, 2, input, output);
13 |   THCTensor_(resizeAs)(state, output, input);
14 |   THC_pointwiseApply2(state, output, input, squareupdateOutput_functor<real>());
15 | }
16 | 
17 | void THNN_(Square_updateGradInput)(
18 |            THCState *state,
19 |            THCTensor *input,
20 |            THCTensor *gradOutput,
21 |            THCTensor *gradInput)
22 | {
23 |   THCUNN_check_shape(state, input, gradOutput);
24 |   THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
25 |   THCTensor_(resizeAs)(state, gradInput, input);
26 |   THC_pointwiseApply3(state, gradInput, input, gradOutput, squareupdateGradInput_functor<real>());
27 | }
28 | 
29 | #endif
30 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
 2 | CMAKE_POLICY(VERSION 2.8)
 3 | 
 4 | SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
 5 | 
 6 | FIND_PACKAGE(Torch REQUIRED)
 7 | FIND_PACKAGE(CUDA 6.5 REQUIRED)
 8 | 
 9 | INCLUDE_DIRECTORIES("${Torch_INSTALL_INCLUDE}/THC")
10 | LINK_DIRECTORIES("${Torch_INSTALL_LIB}")
11 | 
12 | FILE(STRINGS lib/THCUNN/THCUNN.h THCUNN_headers NEWLINE_CONSUME)
13 | FILE(WRITE THCUNN_h.lua "return [[")
14 | FILE(APPEND THCUNN_h.lua ${THCUNN_headers})
15 | FILE(APPEND THCUNN_h.lua "]]")
16 | 
17 | FILE(STRINGS lib/THCUNN/generic/THCUNN.h THCUNN_generic_headers NEWLINE_CONSUME)
18 | FILE(WRITE THCUNN_generic_h.lua "return [[")
19 | FILE(APPEND THCUNN_generic_h.lua ${THCUNN_generic_headers})
20 | FILE(APPEND THCUNN_generic_h.lua "]]")
21 | 
22 | FILE(GLOB luasrc *.lua)
23 | 
24 | ADD_SUBDIRECTORY(lib)
25 | 
26 | INSTALL(
27 |   FILES
28 |   ${luasrc}
29 |   DESTINATION "${Torch_INSTALL_LUA_PATH_SUBDIR}/cunn")
30 | 


--------------------------------------------------------------------------------
/lib/THCUNN/AbsCriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | 
 6 | #include <thrust/fill.h>
 7 | #include <thrust/functional.h>
 8 | #include <thrust/device_ptr.h>
 9 | #include <thrust/reduce.h>
10 | #include <thrust/inner_product.h>
11 | 
12 | template <typename Dtype, typename Acctype>
13 | struct abs_functor
14 | {
15 |   __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
16 |   {
17 |     Dtype z = x-y;
18 |     return ScalarConvert<Dtype, Acctype>::to(z >= 0 ? z : -z);
19 |   }
20 | };
21 | 
22 | template <typename Dtype>
23 | struct abs_updateGradInput_functor
24 | {
25 |   const Dtype norm;
26 | 
27 |   abs_updateGradInput_functor(Dtype norm_)
28 |     : norm(norm_)
29 |   {}
30 | 
31 |   __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
32 |   {
33 |     return (x - y) >= 0 ? norm : -norm;
34 |   }
35 | };
36 | 
37 | #include "generic/AbsCriterion.cu"
38 | #include "THCGenerateFloatTypes.h"
39 | 


--------------------------------------------------------------------------------
/lib/THCUNN/Sigmoid.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct sigmoid_updateGradInput_functor {
 8 |   __device__ __forceinline__ void operator()(T* gradInput, const T *output, const T *gradOutput) const {
 9 |     *gradInput = *gradOutput * (1.f - *output) * (*output);
10 |   }
11 | };
12 | 
13 | #ifdef CUDA_HALF_TENSOR
14 | template <>
15 | struct sigmoid_updateGradInput_functor<half> {
16 |   __device__ __forceinline__ void operator()(half* gradInput, const half *output, const half *gradOutput) const {
17 | #ifdef CUDA_HALF_INSTRUCTIONS
18 |     const half one = __float2half(1.f);
19 |     *gradInput = __hmul(*gradOutput, __hmul(__hadd(one, __hneg(*output)), *output));
20 | #else
21 |     const float out = __half2float(*output);
22 |     const float go = __half2float(*gradOutput);
23 |     *gradInput = __float2half(go * (1.f - out) * out);
24 | #endif
25 |   }
26 | };
27 | #endif
28 | 
29 | #include "generic/Sigmoid.cu"
30 | #include "THCGenerateFloatTypes.h"
31 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/LogSigmoid.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/LogSigmoid.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(LogSigmoid_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            THCTensor *buffer)
12 | {
13 |   THCUNN_assertSameGPU(state, 2, input, output);
14 |   THCTensor_(resizeAs)(state, output, input);
15 |   THC_pointwiseApply2(state, output, input, logSigmoid_updateOutput_functor<real>());
16 | }
17 | 
18 | void THNN_(LogSigmoid_updateGradInput)(
19 |            THCState *state,
20 |            THCTensor *input,
21 |            THCTensor *gradOutput,
22 |            THCTensor *gradInput,
23 |            THCTensor *buffer)
24 | {
25 |   THCUNN_check_nElement(state, input, gradOutput);
26 |   THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
27 |   THCTensor_(resizeAs)(state, gradInput, input);
28 |   THC_pointwiseApply3(state, gradInput, input, gradOutput, logSigmoid_updateGradInput_functor<real>());
29 | }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/Sqrt.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/Sqrt.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(Sqrt_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            accreal eps_)
12 | {
13 |   real eps = ScalarConvert<accreal, real>::to(eps_);
14 |   THCUNN_assertSameGPU(state, 2, input, output);
15 |   THCTensor_(resizeAs)(state, output, input);
16 |   THC_pointwiseApply2(state, output, input, sqrtupdateOutput_functor<real>(eps));
17 | }
18 | 
19 | void THNN_(Sqrt_updateGradInput)(
20 |            THCState *state,
21 |            THCTensor *input,
22 |            THCTensor *gradOutput,
23 |            THCTensor *gradInput,
24 |            THCTensor *output)
25 | {
26 |   THCUNN_check_shape(state, output, gradOutput);
27 |   THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
28 |   THCTensor_(resizeAs)(state, gradInput, output);
29 |   THC_pointwiseApply3(state, gradInput, output, gradOutput, sqrtupdateGradInput_functor<real>());
30 | }
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/lib/THCUNN/DistKLDivCriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | 
 6 | #include <thrust/fill.h>
 7 | #include <thrust/functional.h>
 8 | #include <thrust/device_ptr.h>
 9 | #include <thrust/reduce.h>
10 | #include <thrust/inner_product.h>
11 | 
12 | template <typename Dtype, typename Acctype>
13 | struct kl_functor
14 | {
15 |   __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
16 |   {
17 |       Acctype yAcc = ScalarConvert<Dtype, Acctype>::to(y);
18 |       return y > 0 ? yAcc * (THCNumerics<Acctype>::log(yAcc) - x) : Acctype(0);
19 |   }
20 | };
21 | 
22 | template <typename Dtype>
23 | struct kl_updateGradInput_functor
24 | {
25 |   const Dtype norm;
26 | 
27 |   kl_updateGradInput_functor(Dtype norm_)
28 |     : norm(norm_)
29 |   {}
30 | 
31 |   __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
32 |   {
33 |       return y > 0 ? norm * (-y) : ScalarConvert<int, Dtype>::to(0);
34 |   }
35 | };
36 | 
37 | #include "generic/DistKLDivCriterion.cu"
38 | #include "THCGenerateFloatTypes.h"
39 | 


--------------------------------------------------------------------------------
/rocks/cunn-scm-1.rockspec:
--------------------------------------------------------------------------------
 1 | package = "cunn"
 2 | version = "scm-1"
 3 | 
 4 | source = {
 5 |    url = "git://github.com/torch/cunn.git",
 6 | }
 7 | 
 8 | description = {
 9 |    summary = "Torch CUDA Neural Network Implementation",
10 |    detailed = [[
11 |    ]],
12 |    homepage = "https://github.com/torch/cunn",
13 |    license = "BSD"
14 | }
15 | 
16 | dependencies = {
17 |    "torch >= 7.0",
18 |    "nn >= 1.0",
19 |    "cutorch >= 1.0"
20 | }
21 | 
22 | build = {
23 |    type = "command",
24 |    build_command = [[
25 | cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install
26 | ]],
27 | 	platforms = {
28 |       windows = {
29 |    build_command = [[
30 | cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
31 | ]]
32 | 	  }
33 |    },
34 |    install_command = "cd build"
35 | }
36 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SoftShrink.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct SoftShrinkUpdateOutput
 8 | {
 9 |   const T lambda_;
10 | 
11 |   SoftShrinkUpdateOutput(T lambda)
12 |     : lambda_(lambda)
13 |   {}
14 | 
15 |   __device__ __forceinline__ void operator()(T *out, T *in)
16 |   {
17 |     T x = *in;
18 |     if (x > lambda_) *out = x - lambda_;
19 |     else if (x < -lambda_) *out = x + lambda_;
20 |     else *out = ScalarConvert<int, T>::to(0);
21 |   }
22 | };
23 | 
24 | template <typename T>
25 | struct SoftShrinkUpdateGradInput
26 | {
27 |   const T lambda_;
28 | 
29 |   SoftShrinkUpdateGradInput(T lambda)
30 |     : lambda_(lambda)
31 |   {}
32 | 
33 |   __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput) const
34 |   {
35 |     T x = *input;
36 |     if (x > lambda_ || x < -lambda_)
37 |       *gradInput = *gradOutput;
38 |     else
39 |       *gradInput = ScalarConvert<int, T>::to(0);
40 |   }
41 | };
42 | 
43 | #include "generic/SoftShrink.cu"
44 | #include "THCGenerateFloatTypes.h"
45 | 


--------------------------------------------------------------------------------
/lib/THCUNN/Tanh.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct tanh_updateGradInput_functor
 8 | {
 9 |   __device__ __forceinline__ void operator()(T *gradInput,
10 |           const T *output, const T *gradOutput) const {
11 |     *gradInput = *gradOutput * (1.f - *output * *output);
12 |   }
13 | };
14 | 
15 | #ifdef CUDA_HALF_TENSOR
16 | template <>
17 | struct tanh_updateGradInput_functor<half>
18 | {
19 |   __device__ __forceinline__ void operator()(half *gradInput,
20 |           const half *output, const half *gradOutput) const {
21 | #ifdef CUDA_HALF_INSTRUCTIONS
22 |     const half one = __float2half(1.f);
23 |     const half out_square = __hmul(*output, *output);
24 |     *gradInput = __hmul(*gradOutput, __hadd(one, __hneg(out_square)));
25 | #else
26 |     const float out = __half2float(*output);
27 |     const float go = __half2float(*gradOutput);
28 |     *gradInput = __float2half(go * (1.f - out * out));
29 | #endif
30 |   }
31 | };
32 | #endif
33 | 
34 | #include "generic/Tanh.cu"
35 | #include "THCGenerateFloatTypes.h"
36 | 


--------------------------------------------------------------------------------
/rocks/cunn-1.0-0.rockspec:
--------------------------------------------------------------------------------
 1 | package = "cunn"
 2 | version = "1.0-0"
 3 | 
 4 | source = {
 5 |    url = "git://github.com/torch/cunn.git",
 6 |    tag = "1.0-0"
 7 | }
 8 | 
 9 | description = {
10 |    summary = "Torch CUDA Neural Network Implementation",
11 |    detailed = [[
12 |    ]],
13 |    homepage = "https://github.com/torch/cunn",
14 |    license = "BSD"
15 | }
16 | 
17 | dependencies = {
18 |    "torch >= 7.0",
19 |    "nn >= 1.0",
20 |    "cutorch == 1.0-0"
21 | }
22 | 
23 | build = {
24 |    type = "command",
25 |    build_command = [[
26 | cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install
27 | ]],
28 | 	platforms = {
29 |       windows = {
30 |    build_command = [[
31 | cmake -E make_directory build && cd build && cmake .. -DLUALIB=$(LUALIB) -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install
32 | ]]
33 | 	  }
34 |    },
35 |    install_command = "cd build"
36 | }
37 | 


--------------------------------------------------------------------------------
/lib/THCUNN/GatedLinearUnit.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | #include "common.h"
 6 | 
 7 | template <typename Dtype, typename Acctype>
 8 | struct gatedLinearCSigMul_functor
 9 | {
10 |   __device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const
11 |   {
12 |     const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert<Dtype, Acctype>::to(-*sigTensor)));
13 |     const Dtype mulNum = *mulTensor;
14 |     *target = ScalarConvert<Acctype, Dtype>::to(sigNum * mulNum);
15 |   }
16 | };
17 | 
18 | template <typename Dtype, typename Acctype>
19 | struct gatedLinearDerivativeSecondHalf_functor
20 | {
21 |   __device__ void operator()(Dtype *target, const Dtype *sigTensor, const Dtype *mulTensor) const
22 |   {
23 |     const Acctype sigNum = Acctype(1)/(Acctype(1)+ exp(ScalarConvert<Dtype, Acctype>::to(-*sigTensor)));
24 |     const Dtype mulNum = *mulTensor;
25 |     *target *= ScalarConvert<Acctype, Dtype>::to((Acctype(1) - sigNum) * sigNum * mulNum);
26 |   }
27 | };
28 | 
29 | #include "generic/GatedLinearUnit.cu"
30 | #include "THCGenerateFloatTypes.h"


--------------------------------------------------------------------------------
/lib/THCUNN/SoftMarginCriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | 
 6 | #include <thrust/fill.h>
 7 | #include <thrust/functional.h>
 8 | #include <thrust/device_ptr.h>
 9 | #include <thrust/reduce.h>
10 | #include <thrust/inner_product.h>
11 | 
12 | template <typename Dtype, typename Acctype>
13 | struct softmargin_functor
14 | {
15 |   __host__ __device__ Acctype operator()(const Dtype& x, const Dtype& y) const
16 |   {
17 |     return log(1 + exp(ScalarConvert<Dtype, Acctype>::to(-x)*y));
18 |   }
19 | };
20 | 
21 | template <typename Dtype, typename Acctype>
22 | struct softmargin_updateGradInput_functor
23 | {
24 |   const Acctype norm;
25 | 
26 |   softmargin_updateGradInput_functor(Acctype norm_) :
27 |     norm(norm_) {}
28 | 
29 |   __host__ __device__ Dtype operator()(const Dtype& x, const Dtype& y) const
30 |     {
31 |       Acctype temp = exp(ScalarConvert<Dtype, Acctype>::to(-x)*y);
32 |       return ScalarConvert<Acctype, Dtype>::to(-y*temp*norm/(ScalarConvert<int, Acctype>::to(1) + temp));
33 |     }
34 | };
35 | 
36 | #include "generic/SoftMarginCriterion.cu"
37 | #include "THCGenerateFloatTypes.h"
38 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialMaxPooling.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/SpatialMaxPooling.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(SpatialMaxPooling_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            THCIndexTensor *indices,
12 |            int kW, int kH,
13 |            int dW, int dH,
14 |            int padW, int padH,
15 |            bool ceil_mode)
16 | {
17 |   THNN_(SpatialDilatedMaxPooling_updateOutput)(
18 |     state, input, output, indices,
19 |     kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
20 | 
21 | }
22 | 
23 | void THNN_(SpatialMaxPooling_updateGradInput)(
24 |            THCState *state,
25 |            THCTensor *input,
26 |            THCTensor *gradOutput,
27 |            THCTensor *gradInput,
28 |            THCIndexTensor *indices,
29 |            int kW, int kH,
30 |            int dW, int dH,
31 |            int padW, int padH,
32 |            bool ceil_mode)
33 | {
34 |   THNN_(SpatialDilatedMaxPooling_updateGradInput)(
35 |     state, input, gradOutput, gradInput, indices,
36 |     kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode);
37 | 
38 | }
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SoftPlus.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct softPlusupdateOutput_functor
 8 | {
 9 |   const T threshold;
10 |   const T beta;
11 | 
12 |   softPlusupdateOutput_functor(T threshold_, T beta_)
13 |     : threshold(threshold_)
14 |     , beta(beta_)
15 |   {}
16 | 
17 |   __device__ void operator()(T *output, const T *input) const {
18 |     T betain = beta * (*input);
19 |     *output = ((betain) > threshold) ? *input : (1/beta) * log1p(exp(betain));
20 |   }
21 | };
22 | 
23 | template <typename T>
24 | struct softPlusupdateGradInput_functor
25 | {
26 |   const T threshold;
27 |   const T beta;
28 | 
29 |   softPlusupdateGradInput_functor(T threshold_, T beta_)
30 |     : threshold(threshold_)
31 |     , beta(beta_)
32 |   {}
33 | 
34 |   __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
35 |   {
36 |     T betaout = beta * (*output);
37 |     T exp_bo = exp(betaout);
38 |     *gradInput = ((betaout) > threshold) ? *gradOutput : *gradOutput * (exp_bo - 1) / exp_bo;
39 |   }
40 | };
41 | 
42 | #include "generic/SoftPlus.cu"
43 | #include "THCGenerateFloatTypes.h"
44 | 


--------------------------------------------------------------------------------
/lib/THCUNN/MSECriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | #include "THCThrustAllocator.cuh"
 6 | 
 7 | #include <thrust/fill.h>
 8 | #include <thrust/functional.h>
 9 | #include <thrust/device_ptr.h>
10 | #include <thrust/reduce.h>
11 | #include <thrust/inner_product.h>
12 | #if CUDA_VERSION >= 7000
13 | #include <thrust/system/cuda/execution_policy.h>
14 | #endif
15 | 
16 | template <typename Dtype, typename Acctype>
17 | struct mse_functor
18 | {
19 |   mse_functor() {}
20 | 
21 |   __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
22 |   {
23 |     Acctype z = ScalarConvert<Dtype, Acctype>::to(x)-y;
24 |     return z*z;
25 |   }
26 | };
27 | 
28 | template <typename Dtype, typename Acctype>
29 | struct mse_updateGradInput_functor
30 | {
31 |   const Acctype norm;
32 | 
33 |   mse_updateGradInput_functor(Acctype norm_)
34 |     : norm(norm_)
35 |   {}
36 | 
37 |   __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
38 |   {
39 |     return ScalarConvert<Acctype, Dtype>::to(norm * (ScalarConvert<Dtype, Acctype>::to(x) - y));
40 |   }
41 | };
42 | 
43 | #include "generic/MSECriterion.cu"
44 | #include "THCGenerateFloatTypes.h"
45 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftShrink.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/SoftShrink.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(SoftShrink_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            accreal lambda_)
12 | {
13 |   real lambda = ScalarConvert<accreal, real>::to(lambda_);
14 |   THCUNN_assertSameGPU(state, 2, input, output);
15 |   THCTensor_(resizeAs)(state, output, input);
16 |   THC_pointwiseApply2(state, output, input, SoftShrinkUpdateOutput<real>(lambda));
17 |   THCudaCheck(cudaGetLastError());
18 | }
19 | 
20 | void THNN_(SoftShrink_updateGradInput)(
21 |            THCState *state,
22 |            THCTensor *input,
23 |            THCTensor *gradOutput,
24 |            THCTensor *gradInput,
25 |            accreal lambda_)
26 | {
27 |   real lambda = ScalarConvert<accreal, real>::to(lambda_);
28 |   THCUNN_check_nElement(state, input, gradOutput);
29 |   THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
30 |   THCTensor_(resizeAs)(state, gradInput, input);
31 |   THC_pointwiseApply3(state, gradInput, input, gradOutput, SoftShrinkUpdateGradInput<real>(lambda));
32 |   THCudaCheck(cudaGetLastError());
33 | }
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/lib/THCUNN/MarginCriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | 
 6 | #include <thrust/fill.h>
 7 | #include <thrust/functional.h>
 8 | #include <thrust/device_ptr.h>
 9 | #include <thrust/reduce.h>
10 | #include <thrust/inner_product.h>
11 | 
12 | template <typename Dtype, typename Acctype>
13 | struct margin_functor
14 | {
15 |   margin_functor(Acctype margin)
16 |     : margin(margin)
17 |   {}
18 | 
19 |   __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
20 |   {
21 |     Acctype z = margin - ScalarConvert<Dtype, Acctype>::to(x) * y;
22 |     return z >= 0 ? z : 0;
23 |   }
24 | 
25 |   const Acctype margin;
26 | };
27 | 
28 | template <typename Dtype, typename Acctype>
29 | struct margin_updateGradInput_functor
30 | {
31 |   const Acctype margin, norm;
32 | 
33 |   margin_updateGradInput_functor(Acctype margin_, Acctype norm_)
34 |     : margin(margin_)
35 |     , norm(norm_)
36 |   {}
37 | 
38 |   __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
39 |   {
40 |     return ScalarConvert<Acctype, Dtype>::to((ScalarConvert<Dtype, Acctype>::to(x) * y) < margin ? -norm * y : 0);
41 |   }
42 | };
43 | 
44 | #include "generic/MarginCriterion.cu"
45 | #include "THCGenerateFloatTypes.h"
46 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/VolumetricMaxPooling.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/VolumetricMaxPooling.cu"
 3 | #else
 4 | 
 5 | void THNN_(VolumetricMaxPooling_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *output,
 9 |            THCIndexTensor *indices,
10 |            int kT, int kW, int kH,
11 |            int dT, int dW, int dH,
12 |            int padT, int padW, int padH,
13 |            bool ceilMode)
14 | {
15 |   THNN_(VolumetricDilatedMaxPooling_updateOutput)(
16 |         state, input, output, indices,
17 |         kT, kW, kH, dT, dW, dH, padT, padW, padH,
18 |         1, 1, 1, ceilMode);
19 | 
20 | }
21 | 
22 | void THNN_(VolumetricMaxPooling_updateGradInput)(
23 |            THCState *state,
24 |            THCTensor *input,
25 |            THCTensor *gradOutput,
26 |            THCTensor *gradInput,
27 |            THCIndexTensor *indices,
28 |            int kT, int kW, int kH,
29 |            int dT, int dW, int dH,
30 |            int padT, int padW, int padH,
31 |            bool ceilMode)
32 | {
33 |   THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
34 |         state, input, gradOutput, gradInput, indices,
35 |         kT, kW, kH, dT, dW, dH, padT, padW, padH,
36 |         1, 1, 1, ceilMode);
37 | 
38 | }
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialMaxUnpooling.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | 
 4 | template <typename Dtype>
 5 | __global__ void MaxUnpoolForward(const int nthreads, const Dtype* bottom_data, const long* bottom_mask,
 6 |     const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* top_data) {
 7 |   CUDA_KERNEL_LOOP(index, nthreads) { //index here indices the input pixels
 8 |     int c = (index / iwidth / iheight) % channels;
 9 |     int n = index / iwidth / iheight / channels;
10 |     top_data += (n*channels + c)*oheight*owidth;
11 |     int maxind = bottom_mask[index] - TH_INDEX_BASE;
12 | 
13 |     top_data[maxind] = bottom_data[index];
14 |   }
15 | }
16 | 
17 | template <typename Dtype>
18 | __global__ void MaxUnpoolBackward(const int nthreads, const Dtype* top_diff, const long* bottom_mask,
19 |     const int num, const int channels, const int iheight, const int iwidth, const int oheight, const int owidth, Dtype* bottom_diff) {
20 |   CUDA_KERNEL_LOOP(index, nthreads) {
21 |     int c = (index / iwidth / iheight) % channels;
22 |     int n = index / iwidth / iheight / channels;
23 |     top_diff += (n*channels + c)*oheight*owidth;
24 |     int maxind = bottom_mask[index] - TH_INDEX_BASE;
25 | 
26 |     bottom_diff[index] = top_diff[maxind];
27 |   }
28 | }
29 | 
30 | #include "generic/SpatialMaxUnpooling.cu"
31 | #include "THCGenerateFloatTypes.h"
32 | 


--------------------------------------------------------------------------------
/lib/THCUNN/FusedRNNKernel.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include "THCNumerics.cuh"
 5 | #include <THC/THCApply.cuh>
 6 | 
 7 | template <typename T>
 8 | struct TensorSigmoidOp {
 9 |   __device__ __forceinline__ void operator()(T* out, T* in) const {
10 |     T one = (T) 1.0;
11 |     *out = one / (one + THCNumerics<T>::exp(- *in));
12 |   }
13 | 
14 |   __device__ __forceinline__ void operator()(T* v) const {
15 |     T one = (T) 1.0;
16 |     *v = one / (one + THCNumerics<T>::exp(- *v));
17 |   }
18 | };
19 | 
20 | #ifdef CUDA_HALF_TENSOR
21 | template <>
22 | struct TensorSigmoidOp<half> {
23 |   __device__ __forceinline__ void operator()(half* out, half* in) const {
24 | #ifdef CUDA_HALF_INSTRUCTIONS
25 |     half one = ScalarConvert<int, half>::to(1);
26 |     *out = hdiv(one, __hadd(one, hexp(__hneg(*in))));
27 | #else
28 |     float fin = __half2float(*in);
29 |     *out = __float2half(1.0f / (1.0f + expf(- fin)));
30 | #endif
31 |   }
32 | 
33 |   __device__ __forceinline__ void operator()(half* v) const {
34 | #ifdef CUDA_HALF_INSTRUCTIONS
35 |     half one = ScalarConvert<int, half>::to(1);
36 |     *v = hdiv(one, __hadd(one, hexp(__hneg(*v))));
37 | #else
38 |     float fv = __half2float(*v);
39 |     *v = __float2half(1.0f / (1.0f + expf(- fv)));
40 | #endif
41 |   }
42 | };
43 | #endif
44 | 
45 | #include "generic/FusedRNNKernel.cu"
46 | #include "THCGenerateFloatTypes.h"
47 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SmoothL1Criterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | #include "THCThrustAllocator.cuh"
 6 | 
 7 | #include <thrust/fill.h>
 8 | #include <thrust/functional.h>
 9 | #include <thrust/device_ptr.h>
10 | #include <thrust/reduce.h>
11 | #include <thrust/inner_product.h>
12 | #if CUDA_VERSION >= 7000
13 | #include <thrust/system/cuda/execution_policy.h>
14 | #endif
15 | 
16 | template <typename Dtype, typename Acctype>
17 | struct smoothl1_functor
18 | {
19 |   smoothl1_functor() {}
20 | 
21 |   __host__ __device__ Acctype operator()(const Dtype &x, const Dtype &y) const
22 |   {
23 |     Acctype z = ScalarConvert<Dtype, Acctype>::to(THCNumerics<Dtype>::abs(x-y));
24 |     return z < Acctype(1) ? 0.5f*z*z : z - 0.5f;
25 |   }
26 | };
27 | 
28 | template <typename Dtype>
29 | struct smoothl1_updateGradInput_functor
30 | {
31 |   const Dtype norm;
32 | 
33 |   smoothl1_updateGradInput_functor(Dtype norm_)
34 |     : norm(norm_)
35 |   {}
36 | 
37 |   __host__ __device__ Dtype operator()(const Dtype &x, const Dtype &y) const
38 |   {
39 |     Dtype z = x - y;
40 |     if (z < ScalarConvert<int, Dtype>::to(-1))
41 |       return -norm;
42 |     else if (z > ScalarConvert<int, Dtype>::to(1))
43 |       return norm;
44 |     else
45 |       return norm * z;
46 |   }
47 | };
48 | 
49 | #include "generic/SmoothL1Criterion.cu"
50 | #include "THCGenerateFloatTypes.h"
51 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftPlus.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/SoftPlus.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(SoftPlus_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            accreal beta_,
12 |            accreal threshold_)
13 | {
14 |   real beta = ScalarConvert<accreal, real>::to(beta_);
15 |   real threshold = ScalarConvert<accreal, real>::to(threshold_);
16 |   THCUNN_assertSameGPU(state, 2, input, output);
17 |   THCTensor_(resizeAs)(state, output, input);
18 |   THC_pointwiseApply2(state, output, input, softPlusupdateOutput_functor<real>(threshold, beta));
19 | }
20 | 
21 | void THNN_(SoftPlus_updateGradInput)(
22 |            THCState *state,
23 |            THCTensor *input,
24 |            THCTensor *gradOutput,
25 |            THCTensor *gradInput,
26 |            THCTensor *output,
27 |            accreal beta_,
28 |            accreal threshold_)
29 | {
30 |   real beta = ScalarConvert<accreal, real>::to(beta_);
31 |   real threshold = ScalarConvert<accreal, real>::to(threshold_);
32 |   THCUNN_check_nElement(state, input, gradOutput);
33 |   THCUNN_assertSameGPU(state, 4, input, output, gradOutput, gradInput);
34 |   THCTensor_(resizeAs)(state, gradInput, output);
35 |   THC_pointwiseApply3(state, gradInput, output, gradOutput, softPlusupdateGradInput_functor<real>(threshold, beta));
36 | }
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/lib/THCUNN/ELU.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct ELUupdateOutput_functor
 8 | {
 9 |   const T alpha_;
10 | 
11 |   ELUupdateOutput_functor(T alpha)
12 |     : alpha_(alpha)
13 |   {}
14 | 
15 |   __device__ void operator()(T *output, const T *input) const
16 |   {
17 |     *output = *input <= 0 ? (exp(*input) - 1) * alpha_ : *input;
18 |   }
19 | };
20 | 
21 | // in-place variant
22 | template <typename T>
23 | struct ELUupdateOutputIP_functor
24 | {
25 |   const T alpha_;
26 | 
27 |   ELUupdateOutputIP_functor(T alpha)
28 |     : alpha_(alpha)
29 |   {}
30 | 
31 |   __device__ void operator()(T *x) const
32 |   {
33 |     *x = *x <= 0 ? (exp(*x) - 1) * alpha_ : *x;
34 |   }
35 | };
36 | 
37 | template <typename T>
38 | struct ELUupdateGradInput_functor
39 | {
40 |   const T alpha_;
41 | 
42 |   ELUupdateGradInput_functor(T alpha)
43 |     : alpha_(alpha)
44 |   {}
45 | 
46 |   __device__ void operator()(T *gradInput, const T *output, const T *gradOutput) const
47 |   {
48 |     *gradInput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
49 |   }
50 | };
51 | 
52 | template <typename T>
53 | struct ELUupdateGradInputIP_functor
54 | {
55 |   const T alpha_;
56 | 
57 |   ELUupdateGradInputIP_functor(T alpha)
58 |     : alpha_(alpha)
59 |   {}
60 | 
61 |   __device__ void operator()(T *gradOutput, const T *output) const
62 |   {
63 |     *gradOutput = (*output) <= 0 ? (*gradOutput * (*output + alpha_)) : (*gradOutput);
64 |   }
65 | };
66 | 
67 | #include "generic/ELU.cu"
68 | #include "THCGenerateFloatTypes.h"
69 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/L1Cost.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/L1Cost.cu"
 3 | #else
 4 | 
 5 | void THNN_(L1Cost_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *output)
 9 | {
10 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
11 |   THCUNN_assertSameGPU(state, 1, input);
12 |   accreal sum;
13 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
14 |   input = THCTensor_(newContiguous)(state, input);
15 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
16 |   sum = thrust::transform_reduce(input_data, input_data+size, l1cost_functor<real, accreal>(), accreal(0), thrust::plus<accreal>());
17 | 
18 |   THCTensor_(free)(state, input);
19 | 
20 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
21 | }
22 | 
23 | void THNN_(L1Cost_updateGradInput)(
24 |            THCState *state,
25 |            THCTensor *input,
26 |            THCTensor *gradOutput,
27 |            THCTensor *gradInput)
28 | {
29 |   THCUNN_check_nElement(state, input, gradOutput);
30 |   THCUNN_assertSameGPU(state, 2, input, gradInput);
31 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
32 | 
33 |   input = THCTensor_(newContiguous)(state, input);
34 |   THCTensor_(resizeAs)(state, gradInput, input);
35 | 
36 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
37 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
38 | 
39 |   thrust::transform(input_data, input_data+size, gradInput_data, l1cost_updateGradInput_functor<real>());
40 | 
41 |   THCTensor_(free)(state, input);
42 | }
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/lib/THCUNN/HardTanh.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct hardtanhupdateOutput_functor
 8 | {
 9 |   const T max_val_;
10 |   const T min_val_;
11 | 
12 |   hardtanhupdateOutput_functor(T min_val, T max_val)
13 |     : min_val_(min_val)
14 |     , max_val_(max_val)
15 |   {}
16 | 
17 |   __device__ void operator()(T *output, const T *input) const
18 |   {
19 |     if (*input < min_val_)
20 |       *output = min_val_;
21 |     else if (*input <= max_val_)
22 |       *output = *input;
23 |     else
24 |       *output = max_val_;
25 |   }
26 | 
27 |   __device__ void operator()(T *input) const
28 |   {
29 |     if (*input < min_val_)
30 |       *input = min_val_;
31 |     else if (*input > max_val_)
32 |       *input = max_val_;
33 |   }
34 | };
35 | 
36 | template <typename T>
37 | struct hardtanhupdateGradInput_functor
38 | {
39 |   const T max_val_;
40 |   const T min_val_;
41 | 
42 |   hardtanhupdateGradInput_functor(T min_val, T max_val)
43 |     : min_val_(min_val)
44 |     , max_val_(max_val)
45 |   {}
46 | 
47 |   __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const
48 |   {
49 |     if (*input <= min_val_ || *input >= max_val_)
50 |       *gradInput = ScalarConvert<int, T>::to(0);
51 |     else
52 |       *gradInput = *gradOutput;
53 |   }
54 | 
55 |   __device__ void operator()(T *gradInput, const T *input) const
56 |   {
57 |     if (*input <= min_val_ || *input >= max_val_)
58 |       *gradInput = ScalarConvert<int, T>::to(0);
59 |   }
60 | };
61 | 
62 | #include "generic/HardTanh.cu"
63 | #include "THCGenerateFloatTypes.h"
64 | 


--------------------------------------------------------------------------------
/lib/THCUNN/LeakyReLU.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct LeakyReLUUpdateOutput
 8 | {
 9 |   const T negval_;
10 | 
11 |   LeakyReLUUpdateOutput(T negval)
12 |     : negval_(negval)
13 |   {}
14 | 
15 |   __device__ __forceinline__ void operator()(T *out, T *in)
16 |   {
17 |     T x = *in;
18 |     *out = (x > 0) ? x : x * negval_;
19 |   }
20 | };
21 | 
22 | // in-place variant
23 | template <typename T>
24 | struct LeakyReLUUpdateOutputIP
25 | {
26 |   const T negval_;
27 | 
28 |   LeakyReLUUpdateOutputIP(T negval)
29 |     : negval_(negval)
30 |   {}
31 | 
32 |   __device__ __forceinline__ void operator()(T *x)
33 |   {
34 |     *x = (*x > 0) ? *x : negval_ * (*x);
35 |   }
36 | };
37 | 
38 | template <typename T>
39 | struct LeakyReLUUpdateGradInput
40 | {
41 |   const T negval_;
42 | 
43 |   LeakyReLUUpdateGradInput(T negval)
44 |     : negval_(negval)
45 |   {}
46 | 
47 |   __device__ __forceinline__ void operator()(
48 |     T* gradInput,
49 |     T* input,
50 |     T* gradOutput) const
51 |   {
52 |     *gradInput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
53 |   }
54 | };
55 | 
56 | template <typename T>
57 | struct LeakyReLUUpdateGradInputIP
58 | {
59 |   const T negval_;
60 | 
61 |   LeakyReLUUpdateGradInputIP(T negval)
62 |     : negval_(negval)
63 |   {}
64 | 
65 |   __device__ __forceinline__ void operator()(
66 |     T* gradOutput,
67 |     T* input) const
68 |   {
69 |     *gradOutput = (*input > 0) ? *gradOutput : (*gradOutput) * negval_;
70 |   }
71 | };
72 | 
73 | #include "generic/LeakyReLU.cu"
74 | #include "THCGenerateFloatTypes.h"
75 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/ELU.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/ELU.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | 
 8 | void THNN_(ELU_updateOutput)(
 9 |            THCState *state,
10 |            THCTensor *input,
11 |            THCTensor *output,
12 |            accreal alpha_,
13 |            bool inplace)
14 | {
15 |   real alpha = ScalarConvert<accreal, real>::to(alpha_);
16 |   THCUNN_assertSameGPU(state, 2, input, output);
17 | 
18 |   if (inplace)
19 |   {
20 |     THC_pointwiseApply1(state, input, ELUupdateOutputIP_functor<real>(alpha));
21 |     THCTensor_(set)(state, output, input);
22 |   }
23 |   else
24 |   {
25 |     THCTensor_(resizeAs)(state, output, input);
26 |     THC_pointwiseApply2(state, output, input, ELUupdateOutput_functor<real>(alpha));
27 |   }
28 | }
29 | 
30 | 
31 | void THNN_(ELU_updateGradInput)(
32 |            THCState *state,
33 |            THCTensor *input,
34 |            THCTensor *gradOutput,
35 |            THCTensor *gradInput,
36 |            THCTensor *output,
37 |            accreal alpha_,
38 |            bool inplace)
39 | {
40 |   real alpha = ScalarConvert<accreal, real>::to(alpha_);
41 |   THCUNN_check_nElement(state, input, gradOutput);
42 |   THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
43 | 
44 |   if (inplace)
45 |   {
46 |     THC_pointwiseApply2(state, gradOutput, output, ELUupdateGradInputIP_functor<real>(alpha));
47 |     THCTensor_(set)(state, gradInput, gradOutput);
48 |   }
49 |   else
50 |   {
51 |     THCTensor_(resizeAs)(state, gradInput, output);
52 |     THC_pointwiseApply3(state, gradInput, output, gradOutput, ELUupdateGradInput_functor<real>(alpha));
53 |   }
54 | }
55 | 
56 | #endif
57 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/LeakyReLU.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/LeakyReLU.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(LeakyReLU_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            accreal negval_,
12 |            bool inplace)
13 | {
14 |   real negval = ScalarConvert<accreal, real>::to(negval_);
15 | 
16 |   THCUNN_assertSameGPU(state, 2, input, output);
17 | 
18 |   if (inplace)
19 |   {
20 |     THC_pointwiseApply1(state, input, LeakyReLUUpdateOutputIP<real>(negval));
21 |     THCTensor_(set)(state, output, input);
22 |   }
23 |   else
24 |   {
25 |     THCTensor_(resizeAs)(state, output, input);
26 |     THC_pointwiseApply2(state, output, input, LeakyReLUUpdateOutput<real>(negval));
27 |   }
28 | 
29 |   THCudaCheck(cudaGetLastError());
30 | }
31 | 
32 | void THNN_(LeakyReLU_updateGradInput)(
33 |            THCState *state,
34 |            THCTensor *input,
35 |            THCTensor *gradOutput,
36 |            THCTensor *gradInput,
37 |            accreal negval_,
38 |            bool inplace)
39 | {
40 |   real negval = ScalarConvert<accreal, real>::to(negval_);
41 | 
42 |   THCUNN_check_nElement(state, input, gradOutput);
43 |   THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
44 | 
45 |   if (inplace)
46 |   {
47 |     THC_pointwiseApply2(state, gradOutput, input, LeakyReLUUpdateGradInputIP<real>(negval));
48 |     THCTensor_(set)(state, gradInput, gradOutput);
49 |   }
50 |   else
51 |   {
52 |     THCTensor_(resizeAs)(state, gradInput, input);
53 |     THC_pointwiseApply3(state, gradInput, input, gradOutput, LeakyReLUUpdateGradInput<real>(negval));
54 |   }
55 | 
56 |   THCudaCheck(cudaGetLastError());
57 | }
58 | 
59 | #endif
60 | 


--------------------------------------------------------------------------------
/lib/THCUNN/Threshold.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct ThresholdUpdateOutput
 8 | {
 9 |   const T threshold_;
10 |   const T val_;
11 | 
12 |   ThresholdUpdateOutput(T threshold, T val)
13 |     : threshold_(threshold)
14 |     , val_(val)
15 |   {}
16 | 
17 |   __device__ __forceinline__ void operator()(T *out, T *in)
18 |   {
19 |     T x = *in;
20 |     *out = (x > threshold_) ? x : val_;
21 |   }
22 | };
23 | 
24 | // in-place variant
25 | template <typename T>
26 | struct ThresholdUpdateOutputIP
27 | {
28 |   const T threshold_;
29 |   const T val_;
30 | 
31 |   ThresholdUpdateOutputIP(T threshold, T val)
32 |     : threshold_(threshold)
33 |     , val_(val)
34 |   {}
35 | 
36 |   __device__ __forceinline__ void operator()(T *x)
37 |   {
38 |     *x = (*x > threshold_) ? *x : val_;
39 |   }
40 | };
41 | 
42 | template <typename T>
43 | struct ThresholdUpdateGradInput
44 | {
45 |   const T threshold_;
46 | 
47 |   ThresholdUpdateGradInput(T threshold)
48 |     : threshold_(threshold)
49 |   {}
50 | 
51 |   __device__ __forceinline__ void operator()(
52 |     T *gradInput, T *input, T *gradOutput) const
53 |   {
54 |     *gradInput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
55 |   }
56 | };
57 | 
58 | template <typename T>
59 | struct ThresholdUpdateGradInputIP
60 | {
61 |   const T threshold_;
62 | 
63 |   ThresholdUpdateGradInputIP(T threshold)
64 |     : threshold_(threshold)
65 |   {}
66 | 
67 |   __device__ __forceinline__ void operator()(
68 |     T *gradOutput, T *input) const
69 |   {
70 |     *gradOutput = (*input > threshold_) ? *gradOutput : ScalarConvert<int, T>::to(0);
71 |   }
72 | };
73 | 
74 | #include "generic/Threshold.cu"
75 | #include "THCGenerateFloatTypes.h"
76 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/HardTanh.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/HardTanh.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(HardTanh_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            accreal min_val_,
12 |            accreal max_val_,
13 |            bool inplace)
14 | {
15 |   real min_val = ScalarConvert<accreal, real>::to(min_val_);
16 |   real max_val = ScalarConvert<accreal, real>::to(max_val_);
17 | 
18 |   THCUNN_assertSameGPU(state, 2, input, output);
19 |   if(inplace)
20 |   {
21 |     THCTensor_(set)(state, output, input);
22 |     THC_pointwiseApply1(state, output, hardtanhupdateOutput_functor<real>(min_val, max_val));
23 |   }
24 |   else
25 |   {
26 |     THCTensor_(resizeAs)(state, output, input);
27 |     THC_pointwiseApply2(state, output, input,
28 |                                hardtanhupdateOutput_functor<real>(min_val, max_val));
29 |   }
30 | }
31 | 
32 | void THNN_(HardTanh_updateGradInput)(
33 |            THCState *state,
34 |            THCTensor *input,
35 |            THCTensor *gradOutput,
36 |            THCTensor *gradInput,
37 |            accreal min_val_,
38 |            accreal max_val_,
39 |            bool inplace)
40 | {
41 |   real min_val = ScalarConvert<accreal, real>::to(min_val_);
42 |   real max_val = ScalarConvert<accreal, real>::to(max_val_);
43 | 
44 |   THCUNN_check_nElement(state, input, gradOutput);
45 |   THCUNN_assertSameGPU(state, 3, input, gradOutput, gradInput);
46 | 
47 |   if (inplace)
48 |   {
49 |     THCTensor_(set)(state, gradInput, gradOutput);
50 |     THC_pointwiseApply2(state, gradInput, input,
51 |                                  hardtanhupdateGradInput_functor<real>(min_val, max_val));
52 |   }
53 |   else
54 |   {
55 |     THCTensor_(resizeAs)(state, gradInput, input);
56 |     THC_pointwiseApply3(state, gradInput, input, gradOutput,
57 |                                  hardtanhupdateGradInput_functor<real>(min_val, max_val));
58 |   }
59 | }
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/Threshold.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/Threshold.cu"
 3 | #else
 4 | 
 5 | #include "../common.h"
 6 | 
 7 | void THNN_(Threshold_updateOutput)(
 8 |            THCState *state,
 9 |            THCTensor *input,
10 |            THCTensor *output,
11 |            accreal threshold_,
12 |            accreal val_,
13 |            bool inplace)
14 | {
15 |   real threshold = ScalarConvert<accreal, real>::to(threshold_);
16 |   real val = ScalarConvert<accreal, real>::to(val_);
17 |   THCUNN_assertSameGPU(state, 2, input, output);
18 | 
19 |   if (inplace)
20 |   {
21 |     THC_pointwiseApply1(state, input,
22 |       ThresholdUpdateOutputIP<real>(threshold, val)
23 |     );
24 |     THCTensor_(set)(state, output, input);
25 |   }
26 |   else
27 |   {
28 |     THCTensor_(resizeAs)(state, output, input);
29 |     THC_pointwiseApply2(state, output, input,
30 |       ThresholdUpdateOutput<real>(threshold, val)
31 |     );
32 |   }
33 | 
34 |   THCudaCheck(cudaGetLastError());
35 | }
36 | 
37 | void THNN_(Threshold_updateGradInput)(
38 |            THCState *state,
39 |            THCTensor *input,
40 |            THCTensor *gradOutput,
41 |            THCTensor *gradInput,
42 |            accreal threshold_,
43 |            accreal val_,
44 |            bool inplace)
45 | {
46 |   real threshold = ScalarConvert<accreal, real>::to(threshold_);
47 |   real val = ScalarConvert<accreal, real>::to(val_);
48 |   THCUNN_check_nElement(state, input, gradOutput);
49 |   THCUNN_assertSameGPU(state, 3, input, gradInput, gradOutput);
50 | 
51 |   if (inplace)
52 |   {
53 |     THC_pointwiseApply2(state, gradOutput, input,
54 |       ThresholdUpdateGradInputIP<real>(threshold)
55 |     );
56 |     THCTensor_(set)(state, gradInput, gradOutput);
57 |   }
58 |   else
59 |   {
60 |     THCTensor_(resizeAs)(state, gradInput, input);
61 |     THC_pointwiseApply3(state, gradInput, input, gradOutput,
62 |        ThresholdUpdateGradInput<real>(threshold)
63 |     );
64 |   }
65 | 
66 |   THCudaCheck(cudaGetLastError());
67 | }
68 | 
69 | #endif
70 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
 2 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
 3 | Copyright (c) 2011-2013 NYU (Clement Farabet)
 4 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
 5 | Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
 6 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
 7 | 
 8 | All rights reserved.
 9 | 
10 | Redistribution and use in source and binary forms, with or without
11 | modification, are permitted provided that the following conditions are met:
12 | 
13 | 1. Redistributions of source code must retain the above copyright
14 |    notice, this list of conditions and the following disclaimer.
15 | 
16 | 2. Redistributions in binary form must reproduce the above copyright
17 |    notice, this list of conditions and the following disclaimer in the
18 |    documentation and/or other materials provided with the distribution.
19 | 
20 | 3. Neither the names of NEC Laboratories American and IDIAP Research
21 |    Institute nor the names of its contributors may be used to endorse or
22 |    promote products derived from this software without specific prior
23 |    written permission.
24 | 
25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
26 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 | POSSIBILITY OF SUCH DAMAGE.
36 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialFullConvolution.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/SpatialFullConvolution.cu"
 3 | #else
 4 | 
 5 | void THNN_(SpatialFullConvolution_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *output,
 9 |            THCTensor *weight,
10 |            THCTensor *bias,
11 |            THCTensor *columns,
12 |            THCTensor *ones,
13 |            int kW, int kH,
14 |            int dW, int dH,
15 |            int padW, int padH,
16 |            int adjW, int adjH)
17 | {
18 |   THNN_(SpatialFullDilatedConvolution_updateOutput)(
19 |       state, input, output, weight, bias, columns, ones,
20 |       kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH);
21 | }
22 | 
23 | void THNN_(SpatialFullConvolution_updateGradInput)(
24 |            THCState *state,
25 |            THCTensor *input,
26 |            THCTensor *gradOutput,
27 |            THCTensor *gradInput,
28 |            THCTensor *weight,
29 |            THCTensor *gradColumns,
30 |            int kW, int kH,
31 |            int dW, int dH,
32 |            int padW, int padH,
33 |            int adjW, int adjH)
34 | {
35 |   THNN_(SpatialFullDilatedConvolution_updateGradInput)(
36 |       state, input, gradOutput, gradInput, weight, gradColumns,
37 |       kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH);
38 | }
39 | 
40 | 
41 | void THNN_(SpatialFullConvolution_accGradParameters)(
42 |            THCState *state,
43 |            THCTensor *input,
44 |            THCTensor *gradOutput,
45 |            THCTensor *gradWeight,
46 |            THCTensor *gradBias,
47 |            THCTensor *columns,
48 |            THCTensor *ones,
49 |            int kW, int kH,
50 |            int dW, int dH,
51 |            int padW, int padH,
52 |            int adjW, int adjH,
53 |            accreal scale_)
54 | {
55 |   THNN_(SpatialFullDilatedConvolution_accGradParameters)(
56 |       state, input, gradOutput, gradWeight, gradBias,
57 |       columns, ones,
58 |       kW, kH, dW, dH, padW, padH, 1, 1, adjW, adjH, scale_);
59 | }
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/VolumetricFullConvolution.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/VolumetricFullConvolution.cu"
 3 | #else
 4 | 
 5 | void THNN_(VolumetricFullConvolution_updateOutput)(
 6 |        THCState *state,
 7 |        THCTensor  *input,
 8 |        THCTensor  *output,
 9 |        THCTensor  *weight,
10 |        THCTensor  *bias,
11 |        THCTensor  *finput,
12 |        THCTensor  *fgradInput,
13 |        int dT, int dW, int dH,
14 |        int padT, int padW, int padH,
15 |        int adjT, int adjW, int adjH)
16 | {
17 |   THNN_(VolumetricFullDilatedConvolution_updateOutput)(
18 |        state, input, output, weight, bias, finput, fgradInput,
19 |        dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH);
20 | }
21 | 
22 | void THNN_(VolumetricFullConvolution_updateGradInput)(
23 |        THCState *state,
24 |        THCTensor  *input,
25 |        THCTensor  *gradOutput,
26 |        THCTensor  *gradInput,
27 |        THCTensor  *weight,
28 |        THCTensor  *finput,
29 |        THCTensor  *fgradInput,
30 |        int dT, int dW, int dH,
31 |        int padT, int padW, int padH,
32 |        int adjT, int adjW, int adjH)
33 | {
34 |   THNN_(VolumetricFullDilatedConvolution_updateGradInput)(
35 |        state, input, gradOutput, gradInput, weight, finput, fgradInput,
36 |        dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH);
37 | }
38 | 
39 | 
40 | void THNN_(VolumetricFullConvolution_accGradParameters)(
41 |            THCState *state,
42 |            THCTensor  *input,
43 |            THCTensor  *gradOutput,
44 |            THCTensor  *gradWeight,
45 |            THCTensor  *gradBias,
46 |            THCTensor  *finput,
47 |            THCTensor  *fgradInput,
48 |            int dT, int dW, int dH,
49 |            int padT, int padW, int padH,
50 |            int adjT, int adjW, int adjH,
51 |            accreal scale_)
52 | {
53 |   THNN_(VolumetricFullDilatedConvolution_accGradParameters)(
54 |        state, input, gradOutput, gradWeight, gradBias, finput, fgradInput,
55 |        dT, dW, dH, padT, padW, padH, 1, 1, 1, adjT, adjW, adjH, scale_);
56 | }
57 | 
58 | #endif
59 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/AbsCriterion.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/AbsCriterion.cu"
 3 | #else
 4 | 
 5 | void THNN_(AbsCriterion_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *target,
 9 |            THCTensor *output,
10 |            bool sizeAverage)
11 | {
12 |   THCUNN_check_nElement(state, input, target);
13 |   THCUNN_assertSameGPU(state, 2, input, target);
14 | 
15 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
16 | 
17 |   input = THCTensor_(newContiguous)(state, input);
18 |   target = THCTensor_(newContiguous)(state, target);
19 | 
20 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
21 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
22 |   accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal)0, thrust::plus<accreal>(), abs_functor<real, accreal>());
23 | 
24 |   if (sizeAverage)
25 |     sum /= size;
26 | 
27 |   THCTensor_(free)(state, input);
28 |   THCTensor_(free)(state, target);
29 | 
30 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
31 | }
32 | 
33 | void THNN_(AbsCriterion_updateGradInput)(
34 |            THCState *state,
35 |            THCTensor *input,
36 |            THCTensor *target,
37 |            THCTensor *gradInput,
38 |            bool sizeAverage)
39 | {
40 |   THCUNN_check_nElement(state, input, target);
41 |   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
42 | 
43 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
44 |   real norm = ScalarConvert<double, real>::to(sizeAverage ? 1./size : 1.);
45 | 
46 |   input = THCTensor_(newContiguous)(state, input);
47 |   target = THCTensor_(newContiguous)(state, target);
48 | 
49 |   THCTensor_(resizeAs)(state, gradInput, input);
50 | 
51 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
52 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
53 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
54 | 
55 |   thrust::transform(input_data, input_data+size, target_data, gradInput_data, abs_updateGradInput_functor<real>(norm));
56 | 
57 |   THCTensor_(free)(state, input);
58 |   THCTensor_(free)(state, target);
59 | }
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftMarginCriterion.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/SoftMarginCriterion.cu"
 3 | #else
 4 | 
 5 | void THNN_(SoftMarginCriterion_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *target,
 9 |            THCTensor *output,
10 |            bool sizeAverage)
11 | {
12 |   THCUNN_check_nElement(state, input, target);
13 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
14 |   THCUNN_assertSameGPU(state, 2, input, target);
15 |   accreal sum;
16 | 
17 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
18 | 
19 |   input = THCTensor_(newContiguous)(state, input);
20 |   target = THCTensor_(newContiguous)(state, target);
21 | 
22 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
23 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
24 |   sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), softmargin_functor<real, accreal>());
25 | 
26 |   if(sizeAverage)
27 |     sum /= size;
28 | 
29 |   THCTensor_(free)(state, input);
30 |   THCTensor_(free)(state, target);
31 | 
32 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
33 | }
34 | 
35 | void THNN_(SoftMarginCriterion_updateGradInput)(
36 |            THCState *state,
37 |            THCTensor *input,
38 |            THCTensor *target,
39 |            THCTensor *gradInput,
40 |            bool sizeAverage)
41 | {
42 |   THCUNN_check_nElement(state, input, target);
43 |   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
44 | 
45 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
46 |   accreal norm = (sizeAverage ? 1./size : 1.);
47 | 
48 |   input = THCTensor_(newContiguous)(state, input);
49 |   target = THCTensor_(newContiguous)(state, target);
50 | 
51 |   THCTensor_(resizeAs)(state, gradInput, input);
52 | 
53 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
54 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
55 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
56 | 
57 |   thrust::transform(input_data, input_data+size, target_data, gradInput_data, softmargin_updateGradInput_functor<real, accreal>(norm));
58 | 
59 |   THCTensor_(free)(state, input);
60 |   THCTensor_(free)(state, target);
61 | }
62 | 
63 | #endif
64 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialUpSamplingNearest.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | 
 4 | #include <thrust/transform.h>
 5 | #include <thrust/reduce.h>
 6 | #include <thrust/transform_reduce.h>
 7 | #include <thrust/functional.h>
 8 | 
 9 | #include "THCHalf.h"
10 | #include "THCHalfAutoNumerics.cuh"
11 | 
12 | /*
13 |  * Description:
14 |  */
15 | 
16 | __device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor)
17 | {
18 |   int x, y, z, w;
19 |   w = ii % d3;
20 |   ii = ii/d3;
21 |   z = ii % d2;
22 |   ii = ii/d2;
23 |   y = ii % d1;
24 |   ii = ii/d1;
25 |   x = ii;
26 |   w = w/scale_factor;
27 |   z = z/scale_factor;
28 |   d2 /= scale_factor;
29 |   d3 /= scale_factor;
30 |   return (((x*d1+y)*d2)+z)*d3+w;
31 | 
32 | }
33 | __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y)
34 | {
35 |   int x, y, z, w;
36 |   w = ii % d3;
37 |   ii = ii/d3;
38 |   z = ii % d2;
39 |   ii = ii/d2;
40 |   y = ii % d1;
41 |   ii = ii/d1;
42 |   x = ii;
43 |   w = w*scale_factor+off_x;
44 |   z = z*scale_factor+off_y;
45 |   d2 *= scale_factor;
46 |   d3 *= scale_factor;
47 |   return (((x*d1+y)*d2)+z)*d3+w;
48 | 
49 | }
50 | 
51 | template <typename Dtype>
52 | __global__ void upscale(Dtype *input, Dtype *output, long no_elements,
53 |                         int scale_factor, int d1, int d2, int d3)
54 | {
55 |   // output offset:
56 |   long ii = threadIdx.x + blockDim.x * blockIdx.x;
57 |   ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
58 |   if (ii >= no_elements) return;
59 |   int ipidx = translate_idx(ii, d1, d2, d3, scale_factor);
60 |   output[ii]=input[ipidx];
61 | }
62 | 
63 | /*
64 |  * Description:
65 |  */
66 | template <typename Dtype, typename Acctype>
67 | __global__ void downscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements,
68 |                               int scale_factor, int d1, int d2, int d3)
69 | {
70 |   // output offset:
71 |   long ii = threadIdx.x + blockDim.x * blockIdx.x;
72 |   ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
73 |   if (ii >= no_elements) return;
74 |   Acctype sum = Acctype(0);
75 |   for (int i=0; i < scale_factor; i++){
76 |     for(int j=0; j < scale_factor; j++){
77 |       int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j);
78 |       sum += gradOutput_data[ipidx];
79 |     }
80 |   }
81 |   gradInput_data[ii] += ScalarConvert<Acctype, Dtype>::to(sum);
82 | }
83 | 
84 | #include "generic/SpatialUpSamplingNearest.cu"
85 | #include "THCGenerateFloatTypes.h"
86 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/MarginCriterion.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/MarginCriterion.cu"
 3 | #else
 4 | 
 5 | void THNN_(MarginCriterion_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *target,
 9 |            THCTensor *output,
10 |            bool sizeAverage,
11 |            accreal margin_)
12 | {
13 |   real margin = ScalarConvert<accreal, real>::to(margin_);
14 |   THCUNN_check_nElement(state, input, target);
15 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
16 |   THCUNN_assertSameGPU(state, 2, input, target);
17 | 
18 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
19 | 
20 |   input = THCTensor_(newContiguous)(state, input);
21 |   target = THCTensor_(newContiguous)(state, target);
22 | 
23 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
24 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
25 |   accreal sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(),
26 |       margin_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin)));
27 | 
28 |   if (sizeAverage)
29 |     sum /= size;
30 | 
31 |   THCTensor_(free)(state, input);
32 |   THCTensor_(free)(state, target);
33 | 
34 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
35 | }
36 | 
37 | 
38 | void THNN_(MarginCriterion_updateGradInput)(
39 |            THCState *state,
40 |            THCTensor *input,
41 |            THCTensor *target,
42 |            THCTensor *gradInput,
43 |            bool sizeAverage,
44 |            accreal margin_)
45 | {
46 |   real margin = ScalarConvert<accreal, real>::to(margin_);
47 | 
48 |   THCUNN_check_nElement(state, input, target);
49 |   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
50 | 
51 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
52 |   accreal norm = sizeAverage ? 1.f/size : 1;
53 | 
54 |   input = THCTensor_(newContiguous)(state, input);
55 |   target = THCTensor_(newContiguous)(state, target);
56 | 
57 |   THCTensor_(resizeAs)(state, gradInput, input);
58 | 
59 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
60 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
61 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
62 | 
63 |   thrust::transform(input_data, input_data+size, target_data, gradInput_data,
64 |       margin_updateGradInput_functor<real, accreal>(ScalarConvert<real, accreal>::to(margin), norm));
65 | 
66 |   THCTensor_(free)(state, input);
67 |   THCTensor_(free)(state, target);
68 | }
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialReplicationPadding.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCDeviceTensor.cuh"
 4 | #include "THCDeviceTensorUtils.cuh"
 5 | #include "THCDeviceUtils.cuh"
 6 | #include "THCReduceApplyUtils.cuh"
 7 | #include <THC/THCApply.cuh>
 8 | 
 9 | #include "THCHalf.h"
10 | #include "THCHalfAutoNumerics.cuh"
11 | #include "THCAtomics.cuh"
12 | 
13 | template <typename Dtype>
14 | __global__ void SpatialReplicationPadding_updateOutput(
15 |   THCDeviceTensor<Dtype, 4> input,
16 |   THCDeviceTensor<Dtype, 4> output,
17 |   int padT, int padB, int padL, int padR) {
18 | 
19 |   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
20 |   int plane = blockIdx.y;
21 |   int batch = blockIdx.z;
22 |   if (outputPointId >= output.getSize(2) * output.getSize(3)) {
23 |     return;
24 |   }
25 |   int outputPointX = outputPointId % output.getSize(3);
26 |   int outputPointY = outputPointId / output.getSize(3);
27 | 
28 |   int iStartX = max(0, -padL);
29 |   int iStartY = max(0, -padT);
30 |   int oStartX = max(0, padL);
31 |   int oStartY = max(0, padT);
32 | 
33 |   int inputPointX = min(max(padL, outputPointX), input.getSize(3) + padL - 1) - oStartX + iStartX;
34 |   int inputPointY = min(max(padT, outputPointY), input.getSize(2) + padT - 1) - oStartY + iStartY;
35 | 
36 |   Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX];
37 |   output[batch][plane][outputPointY][outputPointX] = valueToCopy;
38 | }
39 | 
40 | template <typename Dtype>
41 | __global__ void SpatialReplicationPadding_updateGradInput(
42 |   THCDeviceTensor<Dtype, 4> gradInput,
43 |   THCDeviceTensor<Dtype, 4> gradOutput,
44 |   int padT, int padB, int padL, int padR) {
45 | 
46 |   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
47 |   int plane = blockIdx.y;
48 |   int batch = blockIdx.z;
49 |   if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) {
50 |     return;
51 |   }
52 |   int outputPointX = outputPointId % gradOutput.getSize(3);
53 |   int outputPointY = outputPointId / gradOutput.getSize(3);
54 | 
55 |   int iStartX = max(0, -padL);
56 |   int iStartY = max(0, -padT);
57 |   int oStartX = max(0, padL);
58 |   int oStartY = max(0, padT);
59 | 
60 |   int inputPointX = min(max(padL, outputPointX), gradInput.getSize(3) + padL - 1) - oStartX + iStartX;
61 |   int inputPointY = min(max(padT, outputPointY), gradInput.getSize(2) + padT - 1) - oStartY + iStartY;
62 | 
63 |   Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
64 |   atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
65 | }
66 | 
67 | 
68 | #include "generic/SpatialReplicationPadding.cu"
69 | #include "THCGenerateFloatTypes.h"
70 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/MSECriterion.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/MSECriterion.cu"
 3 | #else
 4 | 
 5 | void THNN_(MSECriterion_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *target,
 9 |            THCTensor *output,
10 |            bool sizeAverage)
11 | {
12 |   THCUNN_check_nElement(state, input, target);
13 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
14 |   THCUNN_assertSameGPU(state, 2, input, target);
15 | 
16 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
17 | 
18 |   input = THCTensor_(newContiguous)(state, input);
19 |   target = THCTensor_(newContiguous)(state, target);
20 | 
21 |   THCThrustAllocator thrustAlloc(state);
22 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
23 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
24 |   accreal sum = thrust::inner_product(
25 | #if CUDA_VERSION >= 7000
26 |     thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
27 | #endif
28 |     input_data, input_data+size, target_data, (accreal) 0,
29 |     thrust::plus<accreal>(), mse_functor<real, accreal>());
30 | 
31 |   if (sizeAverage)
32 |     sum /= size;
33 | 
34 |   THCTensor_(free)(state, input);
35 |   THCTensor_(free)(state, target);
36 | 
37 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
38 | }
39 | 
40 | void THNN_(MSECriterion_updateGradInput)(
41 |            THCState *state,
42 |            THCTensor *input,
43 |            THCTensor *target,
44 |            THCTensor *gradInput,
45 |            bool sizeAverage)
46 | {
47 |   THCUNN_check_nElement(state, input, target);
48 |   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
49 | 
50 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
51 |   accreal norm = sizeAverage ? (accreal)(2)/size : (accreal)(2);
52 | 
53 |   input = THCTensor_(newContiguous)(state, input);
54 |   target = THCTensor_(newContiguous)(state, target);
55 | 
56 |   THCTensor_(resizeAs)(state, gradInput, input);
57 | 
58 |   THCThrustAllocator thrustAlloc(state);
59 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
60 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
61 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
62 | 
63 |   thrust::transform(
64 | #if CUDA_VERSION >= 7000
65 |     thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
66 | #endif
67 |     input_data, input_data+size, target_data, gradInput_data,
68 |     mse_updateGradInput_functor<real, accreal>(norm));
69 | 
70 |   THCTensor_(free)(state, input);
71 |   THCTensor_(free)(state, target);
72 | }
73 | 
74 | #endif
75 | 


--------------------------------------------------------------------------------
/lib/THCUNN/BCECriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | 
 6 | #include <thrust/functional.h>
 7 | #include <thrust/device_ptr.h>
 8 | #include <thrust/iterator/zip_iterator.h>
 9 | #include <thrust/transform.h>
10 | #include <thrust/transform_reduce.h>
11 | 
12 | template <typename T>
13 | inline __device__ T eps();
14 | 
15 | template <>
16 | inline __device__ float eps() { return 1e-12f; }
17 | 
18 | template <>
19 | inline __device__ double eps() { return 1e-12; }
20 | 
21 | template <typename Dtype, typename Acctype>
22 | struct bce_functor
23 | {
24 |   template <class Tuple>
25 |   __host__ __device__
26 |   Acctype operator()(Tuple x)
27 |   {
28 |     Dtype input = thrust::get<0>(x);
29 |     Dtype t = thrust::get<1>(x);
30 |     assert(input >= 0. && input <= 1.);
31 |     return - (t * THCNumerics<Acctype>::log(input + eps<Acctype>()) + (Acctype(1)- t) * THCNumerics<Acctype>::log(Acctype(1) - input + eps<Acctype>()));
32 |   }
33 | };
34 | 
35 | template <typename Dtype, typename Acctype>
36 | struct bce_functor_weights
37 | {
38 |   template <class Tuple>
39 |   __host__ __device__
40 |   Acctype operator()(Tuple x)
41 |   {
42 |     Dtype input = thrust::get<0>(x);
43 |     Dtype t = thrust::get<1>(x);
44 |     Dtype w = thrust::get<2>(x);
45 |     assert(input >= 0. && input <= 1.);
46 |     return - w * (t * THCNumerics<Acctype>::log(input + eps<Acctype>()) + (Acctype(1) - t) * THCNumerics<Acctype>::log(Acctype(1) - input + eps<Acctype>()));
47 |   }
48 | };
49 | 
50 | template <typename Dtype, typename Acctype>
51 | struct bce_updateGradInput_functor
52 | {
53 |   const Dtype norm;
54 | 
55 |   bce_updateGradInput_functor(Dtype norm_)
56 |     : norm(norm_)
57 |   {}
58 | 
59 |   template <class Tuple>
60 |   __host__ __device__
61 |   Dtype operator()(Tuple x)
62 |   {
63 |     Dtype o = thrust::get<0>(x);
64 |     Dtype t = thrust::get<1>(x);
65 |     return ScalarConvert<Acctype,Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm);
66 |   }
67 | };
68 | 
69 | template <typename Dtype, typename Acctype>
70 | struct bce_updateGradInput_functor_weights
71 | {
72 |   const Dtype norm;
73 | 
74 |   bce_updateGradInput_functor_weights(Dtype norm_)
75 |     : norm(norm_)
76 |   {}
77 | 
78 |   template <class Tuple>
79 |   __host__ __device__
80 |   Dtype operator()(Tuple x)
81 |   {
82 |     Dtype o = thrust::get<0>(x);
83 |     Dtype t = thrust::get<1>(x);
84 |     Dtype w = thrust::get<2>(x);
85 |     return ScalarConvert<Acctype, Dtype>::to(- (t - o) / ((Acctype(1) - o + eps<Acctype>()) * (o + eps<Acctype>())) * norm * w);
86 |   }
87 | };
88 | 
89 | #include "generic/BCECriterion.cu"
90 | #include "THCGenerateFloatTypes.h"
91 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricMaxUnpooling.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCDeviceTensor.cuh"
 4 | #include "THCDeviceTensorUtils.cuh"
 5 | #include "THCDeviceUtils.cuh"
 6 | #include "THCHalf.h"
 7 | #include "THCHalfAutoNumerics.cuh"
 8 | 
 9 | #include <cfloat>
10 | 
11 | template <typename Dtype>
12 | __global__ void cuda_VolumetricMaxUnpooling_updateOutput(
13 |   THCDeviceTensor<Dtype, 4> input,
14 |   THCDeviceTensor<THCIndex_t, 4> indices,
15 |   THCDeviceTensor<Dtype, 4> output,
16 |   int dT, int dH, int dW,
17 |   int padT, int padH, int padW, int offsetZ)
18 | {
19 |   long iColumn = blockIdx.x * blockDim.x + threadIdx.x;
20 |   long iRow    = blockIdx.y * blockDim.y + threadIdx.y;
21 |   long iFrame  = (blockIdx.z + offsetZ) % input.getSize(1); // intput frame/time
22 |   long slice   = (blockIdx.z + offsetZ) / input.getSize(1); // intput slice/feature
23 | 
24 |   if (iRow < input.getSize(2) && iColumn < input.getSize(3))
25 |   {
26 |     long start_t = iFrame * dT - padT;
27 |     long start_h = iRow * dH - padH;
28 |     long start_w = iColumn * dW - padW;
29 | 
30 |     Dtype val = input[slice][iFrame][iRow][iColumn];
31 | 
32 |     THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
33 |     long maxz = ((unsigned char*)(idx))[0];
34 |     long maxy = ((unsigned char*)(idx))[1];
35 |     long maxx = ((unsigned char*)(idx))[2];
36 |     output[slice][start_t + maxz][start_h + maxy][start_w + maxx] = val;
37 |   }
38 | }
39 | 
40 | template <typename Dtype>
41 | __global__ void cuda_VolumetricMaxUnpooling_updateGradInput(
42 |   THCDeviceTensor<Dtype, 4> gradOutput,
43 |   THCDeviceTensor<THCIndex_t, 4> indices,
44 |   THCDeviceTensor<Dtype, 4> gradInput,
45 |   int dT, int dH, int dW,
46 |   int padT, int padH, int padW, int offsetZ)
47 | {
48 |   int iColumn = blockIdx.x * blockDim.x + threadIdx.x;
49 |   int iRow    = blockIdx.y * blockDim.y + threadIdx.y;
50 |   int iFrame  = (blockIdx.z + offsetZ) % gradInput.getSize(1); // output frame/time
51 |   int slice   = (blockIdx.z + offsetZ) / gradInput.getSize(1); // output slice/feature
52 | 
53 |   if (iRow < gradInput.getSize(2) && iColumn < gradInput.getSize(3))
54 |   {
55 | 
56 |     long start_t = iFrame * dT - padT;
57 |     long start_h = iRow * dH - padH;
58 |     long start_w = iColumn * dW - padW;
59 | 
60 |     THCIndex_t *idx = &indices[slice][iFrame][iRow][iColumn];
61 |     long maxz = ((unsigned char*)(idx))[0];
62 |     long maxy = ((unsigned char*)(idx))[1];
63 |     long maxx = ((unsigned char*)(idx))[2];
64 | 
65 |     Dtype grad_val = gradOutput[slice][start_t + maxz][start_h + maxy][start_w + maxx];
66 | 
67 |     gradInput[slice][iFrame][iRow][iColumn] = grad_val;
68 |   }
69 | }
70 | 
71 | #include "generic/VolumetricMaxUnpooling.cu"
72 | #include "THCGenerateFloatTypes.h"
73 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/DistKLDivCriterion.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/DistKLDivCriterion.cu"
 3 | #else
 4 | 
 5 | void THNN_(DistKLDivCriterion_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *target,
 9 |            THCTensor *output,
10 |            bool sizeAverage)
11 | {
12 |   THCUNN_check_nElement(state, input, target);
13 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
14 |   THCUNN_assertSameGPU(state, 2, input, target);
15 | 
16 |   THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
17 |              "input and target need to have the same number of elements");
18 | 
19 |   accreal sum;
20 | 
21 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
22 | 
23 |   input = THCTensor_(newContiguous)(state, input);
24 |   target = THCTensor_(newContiguous)(state, target);
25 | 
26 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
27 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
28 |   sum = thrust::inner_product(input_data, input_data+size, target_data, (accreal) 0, thrust::plus<accreal>(), kl_functor<real, accreal>());
29 | 
30 |   if (sizeAverage)
31 |     sum /= size;
32 | 
33 |   THCTensor_(free)(state, input);
34 |   THCTensor_(free)(state, target);
35 | 
36 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
37 | }
38 | 
39 | void THNN_(DistKLDivCriterion_updateGradInput)(
40 |            THCState *state,
41 |            THCTensor *input,
42 |            THCTensor *target,
43 |            THCTensor *gradInput,
44 |            bool sizeAverage)
45 | {
46 |   THCUNN_check_nElement(state, input, target);
47 |   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
48 | 
49 |   THArgCheck(THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
50 |              "input and target need to have the same number of elements");
51 | 
52 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
53 |   real norm = (sizeAverage ? ScalarConvert<accreal, real>::to(accreal(1)/size) : ScalarConvert<int, real>::to(1));
54 | 
55 |   input = THCTensor_(newContiguous)(state, input);
56 |   target = THCTensor_(newContiguous)(state, target);
57 | 
58 |   THCTensor_(resizeAs)(state, gradInput, input);
59 | 
60 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
61 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
62 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
63 | 
64 |   thrust::transform(input_data, input_data+size, target_data, gradInput_data, kl_updateGradInput_functor<real>(norm));
65 | 
66 |   THCTensor_(free)(state, input);
67 |   THCTensor_(free)(state, target);
68 | }
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/lib/THCUNN/PReLU.cu:
--------------------------------------------------------------------------------
  1 | #include "THCUNN.h"
  2 | #include "THCHalf.h"
  3 | #include "THCHalfAutoNumerics.cuh"
  4 | #include <THC/THCApply.cuh>
  5 | 
  6 | #include "common.h"
  7 | 
  8 | template <typename T>
  9 | struct PReLUUpdateOutput
 10 | {
 11 |   T* weight_;
 12 | 
 13 |   PReLUUpdateOutput(T* weight)
 14 |     : weight_(weight)
 15 |   {}
 16 | 
 17 |   __device__ __forceinline__ void operator()(T *out, T *in)
 18 |   {
 19 |     T x = *in;
 20 |     *out = (x > 0) ? x : weight_[0] * x;
 21 |   }
 22 | };
 23 | 
 24 | template <typename T>
 25 | __global__ void preluForward(T *output, const T *input, const T *weight, int n, int nElemsPerSample, int mapSize)
 26 | {
 27 |   CUDA_KERNEL_LOOP(i, n)
 28 |   {
 29 |     int positionInSample = i % nElemsPerSample;
 30 |     int mapNumber = positionInSample / mapSize;
 31 |     output[i] = input[i] > 0 ? input[i] : input[i] * weight[mapNumber];
 32 |   }
 33 | }
 34 | 
 35 | template <typename T>
 36 | struct PReLUUpdateGradInput
 37 | {
 38 |   T *weight_;
 39 | 
 40 |   PReLUUpdateGradInput(T *weight)
 41 |     : weight_(weight)
 42 |   {}
 43 | 
 44 |   __device__ __forceinline__ void operator()(T *gradInput, T *gradOutput, T *input)
 45 |   {
 46 |     *gradInput = *input > 0 ? *gradOutput : *gradOutput * *weight_;
 47 |   }
 48 | };
 49 | 
 50 | template <typename T>
 51 | __global__ void preluBackward(
 52 |   T *gradInput,
 53 |   const T *input,
 54 |   const T *weight,
 55 |   const T *gradOutput,
 56 |   int n, int nElemsPerSample, int mapSize)
 57 | {
 58 |   CUDA_KERNEL_LOOP(i, n)
 59 |   {
 60 |     int positionInSample = i % nElemsPerSample;
 61 |     int mapNumber = positionInSample / mapSize;
 62 |     gradInput[i] = input[i] > 0 ? gradOutput[i] : gradOutput[i] * weight[mapNumber];
 63 |   }
 64 | }
 65 | 
 66 | template <typename T>
 67 | struct PReLUAccGradParametersShared
 68 | {
 69 |   __device__ __forceinline__ void operator()(T *gradInput, T  *input, T *gradOutput)
 70 |   {
 71 |     *gradInput = (*input) * (*gradOutput) * (*input <= 0);
 72 |   }
 73 | };
 74 | 
 75 | template <typename T>
 76 | struct PReLUAccGradParameters
 77 | {
 78 |   T scale;
 79 | 
 80 |   PReLUAccGradParameters(T scale)
 81 |     : scale(scale)
 82 |   {}
 83 | 
 84 |   __device__ __forceinline__ void operator()(T *gradInput, T *input, T *gradOutput)
 85 |   {
 86 |     *gradInput = (*input) * (*gradOutput) * scale * (*input <= 0);
 87 |   }
 88 | };
 89 | 
 90 | template <typename T>
 91 | struct PReLUAccGradParameters1to1
 92 | {
 93 |   T scale;
 94 | 
 95 |   PReLUAccGradParameters1to1(T scale)
 96 |     : scale(scale)
 97 |   {}
 98 | 
 99 |   __device__ __forceinline__ void operator()(T *gradWeight, T *input, T *gradOutput)
100 |   {
101 |     *gradWeight += (*input) * (*gradOutput) * scale * (*input <= 0);
102 |   }
103 | };
104 | 
105 | #include "generic/PReLU.cu"
106 | #include "THCGenerateFloatTypes.h"
107 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricUpSamplingNearest.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | 
 4 | #include <thrust/transform.h>
 5 | #include <thrust/reduce.h>
 6 | #include <thrust/transform_reduce.h>
 7 | #include <thrust/functional.h>
 8 | 
 9 | #include "THCHalf.h"
10 | #include "THCHalfAutoNumerics.cuh"
11 | 
12 | /*
13 |  * Description:
14 |  */
15 | 
16 | __device__ int translate_idx(int ii, int d1, int d2, int d3, int d4, int scale_factor)
17 | {
18 |   int x, y, z, w, v;
19 |   v = ii % d4;
20 |   ii = ii/d4;
21 |   w = ii % d3;
22 |   ii = ii/d3;
23 |   z = ii % d2;
24 |   ii = ii/d2;
25 |   y = ii % d1;
26 |   ii = ii/d1;
27 |   x = ii;
28 |   v = v/scale_factor;
29 |   w = w/scale_factor;
30 |   z = z/scale_factor;
31 |   d2 /= scale_factor;
32 |   d3 /= scale_factor;
33 |   d4 /= scale_factor;
34 |   return ((((x*d1+y)*d2)+z)*d3+w)*d4+v;
35 | 
36 | }
37 | __device__ int translate_idx_inv(int ii, int d1, int d2, int d3, int d4, int scale_factor, int off_x, int off_y, int off_z)
38 | {
39 |   int x, y, z, w, v;
40 |   v = ii % d4;
41 |   ii = ii/d4;
42 |   w = ii % d3;
43 |   ii = ii/d3;
44 |   z = ii % d2;
45 |   ii = ii/d2;
46 |   y = ii % d1;
47 |   ii = ii/d1;
48 |   x = ii;
49 |   v = v*scale_factor+off_x;
50 |   w = w*scale_factor+off_y;
51 |   z = z*scale_factor+off_z;
52 |   d2 *= scale_factor;
53 |   d3 *= scale_factor;
54 |   d4 *= scale_factor;
55 |   return ((((x*d1+y)*d2)+z)*d3+w)*d4+v;
56 | 
57 | }
58 | 
59 | template <typename Dtype>
60 | __global__ void vupscale(Dtype *input, Dtype *output, long no_elements,
61 |                          int scale_factor, int d1, int d2, int d3, int d4)
62 | {
63 |   // output offset:
64 |   long ii = threadIdx.x + blockDim.x * blockIdx.x;
65 |   ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
66 |   if (ii >= no_elements) return;
67 |   int ipidx = translate_idx(ii, d1, d2, d3, d4, scale_factor);
68 |   output[ii]=input[ipidx];
69 | }
70 | 
71 | /*
72 |  * Description:
73 |  */
74 | template <typename Dtype, typename Acctype>
75 | __global__ void vdownscale(Dtype *gradInput_data, Dtype *gradOutput_data, long no_elements,
76 |                               int scale_factor, int d1, int d2, int d3, int d4)
77 | {
78 |   // output offset:
79 |   long ii = threadIdx.x + blockDim.x * blockIdx.x;
80 |   ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y;
81 |   if (ii >= no_elements) return;
82 |   Acctype sum = Acctype(0);
83 |   for (int i=0; i < scale_factor; i++){
84 |     for(int j=0; j < scale_factor; j++){
85 |       for(int k=0; k < scale_factor; k++){
86 |         int ipidx = translate_idx_inv(ii, d1, d2, d3, d4, scale_factor, i, j, k);
87 |         sum += gradOutput_data[ipidx];
88 |       }
89 |     }
90 |   }
91 |   gradInput_data[ii] += ScalarConvert<Acctype, Dtype>::to(sum);
92 | }
93 | 
94 | #include "generic/VolumetricUpSamplingNearest.cu"
95 | #include "THCGenerateFloatTypes.h"
96 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/GatedLinearUnit.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/GatedLinearUnit.cu"
 3 | #else
 4 | 
 5 | void THNN_(GatedLinear_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *output,
 9 |            int dim)
10 | {
11 |   THCUNN_assertSameGPU(state, 2, input, output);
12 | 
13 |   // size output to half of input
14 |   dim = dim - TH_INDEX_BASE;
15 |   const long nIn = THCTensor_(size)(state, input, dim);
16 |   THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
17 |       dim + TH_INDEX_BASE, nIn);
18 |   const long inputSize = THCTensor_(size)(state, input, dim) / 2;
19 |   THLongStorage *newSizes = THCTensor_(newSizeOf)(state, input);
20 |   THLongStorage_set(newSizes, dim, inputSize);
21 |   THCTensor_(resize)(state, output, newSizes, NULL);
22 | 
23 |   // halve tensor
24 |   THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize);
25 |   THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize);
26 | 
27 |   // x = x1:cmul( sigmoid(x2) )
28 |   THC_pointwiseApply3(state, output, secondHalf, firstHalf, gatedLinearCSigMul_functor<real, accreal>());
29 | 
30 |   THLongStorage_free(newSizes);
31 |   THCTensor_(free)(state, firstHalf);
32 |   THCTensor_(free)(state, secondHalf);
33 | }
34 | 
35 | void THNN_(GatedLinear_updateGradInput)(
36 |            THCState *state,
37 |            THCTensor *input,
38 |            THCTensor *gradOutput,
39 |            THCTensor *gradInput,
40 |            int dim)
41 | {
42 |   THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
43 |   dim = dim - TH_INDEX_BASE;
44 |   const long nIn = THCTensor_(size)(state, input, dim);
45 |   THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
46 |       dim + TH_INDEX_BASE, nIn);
47 | 
48 |   THCTensor_(resizeAs)(state, gradInput, input);
49 |   const long inputSize = THCTensor_(size)(state, input, dim) / 2;
50 |   THCTensor *firstHalf = THCTensor_(newNarrow)(state, input, dim, 0, inputSize);
51 |   THCTensor *secondHalf = THCTensor_(newNarrow)(state, input, dim, inputSize, inputSize);
52 |   THCTensor *gradInputfirstHalf = THCTensor_(newNarrow)(state, gradInput, dim, 0, inputSize);
53 |   THCTensor *gradInputsecondHalf = THCTensor_(newNarrow)(state, gradInput, dim, inputSize, inputSize);
54 |   // first half of derivative
55 |   THC_pointwiseApply3(state, gradInputfirstHalf, secondHalf, gradOutput, gatedLinearCSigMul_functor<real, accreal>());
56 |   // second half of derivative
57 |   THCTensor_(copy)(state, gradInputsecondHalf, firstHalf);
58 |   THC_pointwiseApply3(state, gradInputsecondHalf, secondHalf, gradOutput, gatedLinearDerivativeSecondHalf_functor<real, accreal>());
59 | 
60 |   THCTensor_(free)(state, firstHalf);
61 |   THCTensor_(free)(state, secondHalf);
62 |   THCTensor_(free)(state, gradInputfirstHalf);
63 |   THCTensor_(free)(state, gradInputsecondHalf);
64 | }
65 | 
66 | #endif
67 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SparseLinear.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | 
 5 | #include <cusparse.h>
 6 | 
 7 | static cusparseHandle_t cusparse_handle = 0;
 8 | 
 9 | static void init_cusparse() {
10 |   if (cusparse_handle == 0) {
11 |     cusparseStatus_t status = cusparseCreate(&cusparse_handle);
12 |     if (status != CUSPARSE_STATUS_SUCCESS) {
13 |       THError("CUSPARSE Library initialization failed");
14 |     }
15 |   }
16 | }
17 | 
18 | #ifdef CUDA_HALF_TENSOR
19 | void THNN_CudaHalfSparseLinear_updateOutput(
20 |           THCState *state,
21 |           THCudaHalfTensor *input,
22 |           THCudaHalfTensor *output,
23 |           THCudaHalfTensor *weight,
24 |           THCudaHalfTensor *bias) {
25 |   THError("THCudaHalfTensor not supported with SparseLinear");
26 | }
27 | 
28 | void THNN_CudaHalfSparseLinear_accGradParameters(
29 |           THCState *state,
30 |           THCudaHalfTensor *input,
31 |           THCudaHalfTensor *gradOutput,
32 |           THCudaHalfTensor *gradWeight,
33 |           THCudaHalfTensor *gradBias,
34 |           THCudaHalfTensor *weight,
35 |           THCudaHalfTensor *bias,
36 |           float weightDecay,
37 |           float scale) {
38 |   THError("THCudaHalfTensor not supported with SparseLinear");
39 | }
40 | 
41 | void THNN_CudaHalfSparseLinear_legacyUpdateOutput(
42 |           THCState *state,
43 |           THCudaHalfTensor *input,
44 |           THCudaHalfTensor *output,
45 |           THCudaHalfTensor *weight,
46 |           THCudaHalfTensor *bias) {
47 |   THError("THCudaHalfTensor not supported with SparseLinear");
48 | }
49 | 
50 | void THNN_CudaHalfSparseLinear_legacyAccGradParameters(
51 |           THCState *state,
52 |           THCudaHalfTensor *input,
53 |           THCudaHalfTensor *gradOutput,
54 |           THCudaHalfTensor *gradWeight,
55 |           THCudaHalfTensor *gradBias,
56 |           THCudaHalfTensor *weight,
57 |           THCudaHalfTensor *bias,
58 |           float weightDecay,
59 |           float scale) {
60 |   THError("THCudaHalfTensor not supported with SparseLinear");
61 | }
62 | 
63 | void THNN_CudaHalfSparseLinear_zeroGradParameters(
64 |           THCState *state,
65 |           THCudaHalfTensor *gradWeight,
66 |           THCudaHalfTensor *gradBias,
67 |           THCudaHalfTensor *lastInput) {
68 |   THError("THCudaHalfTensor not supported with SparseLinear");
69 | }
70 | 
71 | void THNN_CudaHalfSparseLinear_updateParameters(
72 |           THCState *state,
73 |           THCudaHalfTensor *weight,
74 |           THCudaHalfTensor *bias,
75 |           THCudaHalfTensor *gradWeight,
76 |           THCudaHalfTensor *gradBias,
77 |           THCudaHalfTensor *lastInput,
78 |           float learningRate) {
79 |   THError("THCudaHalfTensor not supported with SparseLinear");
80 | }
81 | #endif
82 | 
83 | #include "generic/SparseLinear.cu"
84 | #include "THCGenerateFloatType.h"
85 | #include "generic/SparseLinear.cu"
86 | #include "THCGenerateDoubleType.h"
87 | 


--------------------------------------------------------------------------------
/doc/cunnmodules.md:
--------------------------------------------------------------------------------
 1 | <a name="nn.cunnmodules.dok"/>
 2 | # Additional Modules #
 3 | 
 4 | The following nn modules are also made available by the cunn package:
 5 |  * [DataParallelTable](#nn.DataParallelTable) : parallelize calls to `forward` and `backward` across multiple-GPUs.
 6 |  * [GPU](https://github.com/torch/nn/blob/master/doc/simple.md#nn.GPU) : decorates a module so that it can be executed on a specific GPU device.
 7 | 
 8 | <a name="nn.DataParallelTable"/>
 9 | ## DataParallelTable ##
10 | 
11 | ```lua
12 | module = nn.DataParallelTable(dim, [flattenParams], [useNCCL])
13 | module:add(net, {gpu1, [gpu2, ...]})
14 | ```
15 | 
16 | DataParallelTable implements data parallelism for Torch modules. The same model
17 | is replicated on multiple GPUs. The input is split, typically into smaller mini-batches.
18 | Each replicated model handles only its portion of the input. The weight updates for 
19 | each replica are summed together on the first replica in accGradParameters.
20 | 
21 | ### DataParallelTable(dim, [flattenParams], [useNCCL]) ###
22 | 
23 | Creates a `DataParallelTable` that splits the input on the dimension `dim`. If `flattenParams` is `true`, [`getParameters()`](https://github.com/torch/nn/blob/master/doc/module.md#nn.Module.getParameters) will be called on the replicated module. If `useNCCL` is `true` and both [NCCL](https://github.com/NVIDIA/nccl) and the [NCCL torch bindings](https://github.com/ngimel/nccl.torch) are installed, NCCL will be used for inter-GPU communication.
24 | 
25 | For best performance, use `flattenParams` and `NCCL`.
26 | 
27 | ### DataParallelTable:add(module, gpus) ###
28 | 
29 | Replicates `module` on the table of `gpus`. For example:
30 | 
31 | ```lua
32 | nn.DataParallelTable(1):add(module, {1, 2, 3, 4})
33 | ```
34 | 
35 | ### DataParallelTable:threads(initFunc) ###
36 | 
37 | Switches the internal implementation to  use a seperate thread for each replica. This may hide the cost of kernel launches by dispatching them in parallel. The `initFunc` is executed in each thread.
38 | 
39 | ```lua
40 | nn.DataParallelTable(1):threads(function()
41 |   require 'cudnn'
42 | end)
43 | ```
44 | 
45 | ### DataParallelTable:syncParameters() ###
46 | 
47 | Copies the model parameters from the first replica to all other replicas. This is automatically called from `updateOutput`, if it has not been called since the last `accGradParameters`.
48 | 
49 | ### Example of training using DataParallelTable ###
50 | 
51 | ```lua
52 | -- CONSTRUCT MODEL:
53 | conv_net = makeConvNet()  -- i.e. create nn.Sequential() and fill it
54 | net = nn.DataParallelTable(1)  -- Split along first (batch) dimension
55 | net:add(conv_net, {1, 2}) -- Use GPUs 1 and 2
56 | -- TRAINING:
57 | for i = 1, num_epochs do
58 |   local output = net:forward(input)
59 |   local err = criterion:forward(output, target)
60 |   net:zeroGradParameters()
61 |   local gradOutput = criterion:backward(output, target)
62 |   local gradInput = net:backward(input, gradOutput)
63 |   net:updateParameters(lr)
64 | end
65 | ```
66 | 
67 | 


--------------------------------------------------------------------------------
/lib/THCUNN/LogSigmoid.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include <THC/THCApply.cuh>
 5 | 
 6 | template <typename T>
 7 | struct logSigmoid_updateOutput_functor
 8 | {
 9 |   __device__ void operator()(T *output, const T *input) const {
10 |     const T max = fmaxType(0.f, - *input);
11 |     const T z = THCNumerics<T>::exp(-max) + THCNumerics<T>::exp(-*input -max);
12 |     *output = -(max + THCNumerics<T>::log(z));
13 |   }
14 | };
15 | 
16 | template <typename T>
17 | struct logSigmoid_updateGradInput_functor
18 | {
19 |   __device__ void operator()(T *gradInput, const T *input, const T *gradOutput) const {
20 |     const T max = fmaxType(0.f, -*input);
21 |     const T z = THCNumerics<T>::exp(-max) + THCNumerics<T>::exp(-*input -max);
22 |     T max_deriv = 0.f;
23 |     T sign = -1.f;
24 |     if (*input < 0.f){
25 |         max_deriv = -1.f;
26 |         sign = 1.f;
27 |     }
28 |     *gradInput = *gradOutput * (-max_deriv - sign*((z - 1.f)/z));
29 |   }
30 | };
31 | 
32 | #ifdef CUDA_HALF_TENSOR
33 | template <>
34 | struct logSigmoid_updateOutput_functor<half> {
35 |   __device__ __forceinline__ void operator()(half* output, const half *input) const {
36 | #ifdef CUDA_HALF_INSTRUCTIONS
37 |     const half max = fmaxType(__float2half(0.f), __hneg(*input));
38 |     const half z = THCNumerics<half>::exp(__hneg(max)) + THCNumerics<half>::exp(__hneg(*input) - max);
39 |     *output = __hneg(max + THCNumerics<half>::log(z));
40 | #else
41 |     float in = __half2float(*input);
42 |     float max = fmaxType(0.f, -in);
43 |     float z = THCNumerics<float>::exp(-max) + THCNumerics<float>::exp(-in - max);
44 |     *output = __float2half(-(max + THCNumerics<float>::log(z)));
45 | #endif
46 |   }
47 | };
48 | 
49 | template <>
50 | struct logSigmoid_updateGradInput_functor<half> {
51 |   __device__ __forceinline__ void operator()(half* gradInput, const half *input, const half *gradOutput) const {
52 | #ifdef CUDA_HALF_INSTRUCTIONS
53 |     const half one = __float2half(1.f);
54 |     const half zero = __float2half(0.f);
55 |     const half max = fmaxType(zero, __hneg(*input));
56 |     const half z = THCNumerics<half>::exp(__hneg(max)) + THCNumerics<half>::exp(__hneg(*input) - max);
57 |     half max_deriv = zero;
58 |     half sign = __hneg(one);
59 |     if(*input < zero){
60 |         max_deriv = __hneg(one);
61 |         sign = one;
62 |     }
63 |     *gradInput = __hmul(*gradOutput, (__hneg(max_deriv) - __hmul(sign, __hdiv(z - one, z))));
64 | #else
65 |     const float in = __half2float(*input);
66 |     const float max = fmaxType(0.f, -in);
67 |     const float z = THCNumerics<float>::exp(-max) + THCNumerics<float>::exp(-in - max);
68 |     const float go = __half2float(*gradOutput);
69 |     float max_deriv = 0.f;
70 |     float sign = -1.f;
71 |     if(in < 0.f){
72 |         max_deriv = -1.f;
73 |         sign = 1.f;
74 |     }
75 |     *gradInput = __float2half(go * (-max_deriv - sign*((z - 1.f)/z)));
76 | #endif
77 |   }
78 | };
79 | #endif
80 | 
81 | #include "generic/LogSigmoid.cu"
82 | #include "THCGenerateFloatTypes.h"
83 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SmoothL1Criterion.cu:
--------------------------------------------------------------------------------
 1 | #ifndef THC_GENERIC_FILE
 2 | #define THC_GENERIC_FILE "generic/SmoothL1Criterion.cu"
 3 | #else
 4 | 
 5 | void THNN_(SmoothL1Criterion_updateOutput)(
 6 |            THCState *state,
 7 |            THCTensor *input,
 8 |            THCTensor *target,
 9 |            THCTensor *output,
10 |            bool sizeAverage)
11 | {
12 |   THCUNN_check_nElement(state, input, target);
13 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
14 |   THCUNN_assertSameGPU(state, 2, input, target);
15 |   THArgCheck(
16 |     THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
17 |     "input and target need to have the same number of elements"
18 |   );
19 | 
20 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
21 | 
22 |   input = THCTensor_(newContiguous)(state, input);
23 |   target = THCTensor_(newContiguous)(state, target);
24 | 
25 |   THCThrustAllocator thrustAlloc(state);
26 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
27 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
28 |   accreal sum = thrust::inner_product(
29 | #if CUDA_VERSION >= 7000
30 |     thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
31 | #endif
32 |     input_data, input_data+size, target_data, (accreal) 0,
33 |     thrust::plus<accreal>(), smoothl1_functor<real, accreal>()
34 |   );
35 | 
36 |   if (sizeAverage)
37 |     sum /= size;
38 | 
39 |   THCTensor_(free)(state, input);
40 |   THCTensor_(free)(state, target);
41 | 
42 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
43 | }
44 | 
45 | void THNN_(SmoothL1Criterion_updateGradInput)(
46 |            THCState *state,
47 |            THCTensor *input,
48 |            THCTensor *target,
49 |            THCTensor *gradInput,
50 |            bool sizeAverage)
51 | {
52 |   THCUNN_check_nElement(state, input, target);
53 |   THCUNN_assertSameGPU(state, 3, input, target, gradInput);
54 |   THArgCheck(
55 |     THCTensor_(nElement)(state, input) == THCTensor_(nElement)(state, target), 2,
56 |     "input and target need to have the same number of elements"
57 |   );
58 | 
59 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
60 |   real norm = ScalarConvert<accreal, real>::to(sizeAverage ? accreal(1)/size : accreal(1));
61 | 
62 |   input = THCTensor_(newContiguous)(state, input);
63 |   target = THCTensor_(newContiguous)(state, target);
64 | 
65 |   THCTensor_(resizeAs)(state, gradInput, input);
66 | 
67 |   THCThrustAllocator thrustAlloc(state);
68 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
69 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
70 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
71 | 
72 |   thrust::transform(
73 | #if CUDA_VERSION >= 7000
74 |     thrust::cuda::par(thrustAlloc).on(THCState_getCurrentStream(state)),
75 | #endif
76 |     input_data, input_data+size, target_data, gradInput_data,
77 |     smoothl1_updateGradInput_functor<real>(norm)
78 |   );
79 | 
80 |   THCTensor_(free)(state, input);
81 |   THCTensor_(free)(state, target);
82 | }
83 | 
84 | #endif
85 | 


--------------------------------------------------------------------------------
/lib/THCUNN/MultiMarginCriterion.cu:
--------------------------------------------------------------------------------
  1 | #include "THCUNN.h"
  2 | #include "common.h"
  3 | #include "THCHalf.h"
  4 | #include "THCHalfAutoNumerics.cuh"
  5 | 
  6 | #define MULTIMARGIN_THREADS 128
  7 | 
  8 | template <int P, typename Dtype, typename Acctype>
  9 | __global__ void cunn_MultiMarginCriterion_updateOutput_kernel(Dtype *output, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
 10 | {
 11 |   __shared__ Acctype buffer[MULTIMARGIN_THREADS];
 12 |   int k = blockIdx.x;
 13 |   Dtype *input_k = input + k*dim;
 14 |   Dtype *output_k = output + k;
 15 |   int target_k = ((int)target[k]) - TH_INDEX_BASE;
 16 |   Dtype input_target_k = input_k[target_k];
 17 | 
 18 |   int i_start = threadIdx.x;
 19 |   int i_end = dim;
 20 |   int i_step = blockDim.x;
 21 | 
 22 |   buffer[threadIdx.x] = 0;
 23 |   for (int i = i_start; i < i_end; i += i_step)
 24 |   {
 25 |     Dtype z = margin - input_target_k + input_k[i];
 26 |     if (i == target_k)
 27 |       continue;
 28 | 
 29 |     if (z > 0) {
 30 |       Dtype h = (P==1) ? z : z*z;
 31 |       if(weights)
 32 |         h *= weights[target_k];
 33 |       buffer[threadIdx.x] += h;
 34 |     }
 35 |   }
 36 |   __syncthreads();
 37 | 
 38 |   // reduce
 39 |   if (threadIdx.x == 0)
 40 |   {
 41 |     Acctype sum = 0;
 42 |     for (int i=0; i < blockDim.x; i++)
 43 |       sum += buffer[i];
 44 | 
 45 |     *output_k = ScalarConvert<Acctype, Dtype>::to(sum/dim);
 46 |     if(sizeAverage)
 47 |       *output_k /= nframe;
 48 |   }
 49 | }
 50 | 
 51 | template <int P, typename Dtype, typename Acctype>
 52 | __global__ void cunn_MultiMarginCriterion_updateGradInput_kernel(Dtype *gradInput, Dtype *input, THCIndex_t *target, Dtype *weights, int nframe, int dim, bool sizeAverage, Dtype margin)
 53 | {
 54 |   __shared__ Acctype buffer[MULTIMARGIN_THREADS];
 55 |   int k = blockIdx.x;
 56 |   Dtype *input_k = input + k*dim;
 57 |   Dtype *gradInput_k = gradInput + k*dim;
 58 |   int target_k = ((int)target[k]) - TH_INDEX_BASE;
 59 |   Dtype input_target_k = input_k[target_k];
 60 |   Acctype g = (sizeAverage ? 1./((Acctype)(nframe*dim)) : 1./((Acctype)dim));
 61 | 
 62 |   int i_start = threadIdx.x;
 63 |   int i_end = dim;
 64 |   int i_step = blockDim.x;
 65 | 
 66 |   buffer[threadIdx.x] = 0;
 67 |   for (int i=i_start; i<i_end; i+=i_step)
 68 |   {
 69 |     Dtype z = margin - input_target_k + input_k[i];
 70 |     if (i == target_k)
 71 |       continue;
 72 | 
 73 |     if (z > 0)
 74 |     {
 75 |       Dtype h = ScalarConvert<Acctype, Dtype>::to((P == 1) ? g : 2*g*z);
 76 |       if(weights)
 77 |         h *= weights[target_k];
 78 |       buffer[threadIdx.x] -= h;
 79 |       gradInput_k[i] = h;
 80 |     }
 81 |     else
 82 |       gradInput_k[i] = ScalarConvert<int, Dtype>::to(0);
 83 |   }
 84 | 
 85 |   __syncthreads();
 86 | 
 87 |   // reduce
 88 |   if (threadIdx.x == 0)
 89 |   {
 90 |     Acctype gradInput_target_k = 0;
 91 |     for (int i=0; i<blockDim.x; i++)
 92 |       gradInput_target_k += buffer[i];
 93 |     gradInput_k[target_k] = ScalarConvert<Acctype, Dtype>::to(gradInput_target_k);
 94 |   }
 95 | }
 96 | 
 97 | #include "generic/MultiMarginCriterion.cu"
 98 | #include "THCGenerateFloatTypes.h"
 99 | 
100 | #undef MULTIMARGIN_THREADS
101 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialReflectionPadding.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCDeviceTensor.cuh"
 4 | #include "THCDeviceTensorUtils.cuh"
 5 | #include "THCDeviceUtils.cuh"
 6 | #include "THCReduceApplyUtils.cuh"
 7 | #include <THC/THCApply.cuh>
 8 | 
 9 | #include "THCHalf.h"
10 | #include "THCHalfAutoNumerics.cuh"
11 | #include "THCAtomics.cuh"
12 | 
13 | template<typename Dtype>
14 | __global__ void SpatialReflectionPadding_updateOutput(
15 |   THCDeviceTensor<Dtype, 4> input,
16 |   THCDeviceTensor<Dtype, 4> output,
17 |   int padT, int padB, int padL, int padR) {
18 | 
19 |   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
20 |   int plane = blockIdx.y;
21 |   int batch = blockIdx.z;
22 |   if (outputPointId >= output.getSize(2) * output.getSize(3)) {
23 |     return;
24 |   }
25 |   int outputPointX = outputPointId % output.getSize(3);
26 |   int outputPointY = outputPointId / output.getSize(3);
27 | 
28 |   int iStartX = max(0, -padL);
29 |   int iStartY = max(0, -padT);
30 |   int oStartX = max(0, padL);
31 |   int oStartY = max(0, padT);
32 | 
33 |   int inputPointX = abs(outputPointX - padL)
34 |                   - abs(outputPointX - (input.getSize(3) + padL - 1))
35 |                   - outputPointX
36 |                   + 2 * padL + input.getSize(3) - 1
37 |                   - oStartX + iStartX;
38 | 
39 |   int inputPointY = abs(outputPointY - padT)
40 |                   - abs(outputPointY - (input.getSize(2) + padT - 1))
41 |                   - outputPointY
42 |                   + 2 * padT + input.getSize(2) - 1
43 |                   - oStartY + iStartY;
44 | 
45 |   Dtype valueToCopy = input[batch][plane][inputPointY][inputPointX];
46 |   output[batch][plane][outputPointY][outputPointX] = valueToCopy;
47 | }
48 | 
49 | template <typename Dtype>
50 | __global__ void SpatialReflectionPadding_updateGradInput(
51 |   THCDeviceTensor<Dtype, 4> gradInput,
52 |   THCDeviceTensor<Dtype, 4> gradOutput,
53 |   int padT, int padB, int padL, int padR) {
54 | 
55 |   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
56 |   int plane = blockIdx.y;
57 |   int batch = blockIdx.z;
58 |   if (outputPointId >= gradOutput.getSize(2) * gradOutput.getSize(3)) {
59 |     return;
60 |   }
61 |   int outputPointX = outputPointId % gradOutput.getSize(3);
62 |   int outputPointY = outputPointId / gradOutput.getSize(3);
63 | 
64 |   int iStartX = max(0, -padL);
65 |   int iStartY = max(0, -padT);
66 |   int oStartX = max(0, padL);
67 |   int oStartY = max(0, padT);
68 | 
69 |   int inputPointX = abs(outputPointX - padL)
70 |                   - abs(outputPointX - (gradInput.getSize(3) + padL - 1))
71 |                   - outputPointX
72 |                   + 2 * padL + gradInput.getSize(3) - 1
73 |                   - oStartX + iStartX;
74 | 
75 |   int inputPointY = abs(outputPointY - padT)
76 |                   - abs(outputPointY - (gradInput.getSize(2) + padT - 1))
77 |                   - outputPointY
78 |                   + 2 * padT + gradInput.getSize(2) - 1
79 |                   - oStartY + iStartY;
80 | 
81 |   Dtype valueToCopy = gradOutput[batch][plane][outputPointY][outputPointX];
82 |   atomicAdd(&gradInput[batch][plane][inputPointY][inputPointX], valueToCopy);
83 | }
84 | 
85 | #include "generic/SpatialReflectionPadding.cu"
86 | #include "THCGenerateFloatTypes.h"
87 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialClassNLLCriterion.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include "THCAtomics.cuh"
 5 | #include "common.h"
 6 | #include <THC/THCApply.cuh>
 7 | 
 8 | #include <thrust/functional.h>
 9 | 
10 | template <typename T, typename AccumT>
11 | __global__ void cunn_SpatialClassNLLCriterion_updateOutput_kernel(
12 |           T *output,
13 |           T *total_weight,
14 |           T *input,
15 |           THCIndex_t *target,
16 |           T *weights,
17 |           int size_average,
18 |           int batch_size,
19 |           int n_classes,
20 |           int map_nelem,
21 |           int blocks_per_sample)
22 | {
23 |   __shared__ AccumT partial_sums[CUDA_NUM_THREADS];
24 | 
25 |   int i, t;
26 |   T cur_weight;
27 |   AccumT input_sum = 0;
28 |   AccumT acc_weight = 0;
29 | 
30 |   int sample = blockIdx.x / blocks_per_sample;
31 |   int toffset = sample * map_nelem;
32 |   int ioffset = sample * map_nelem * n_classes;
33 |   int step = blockDim.x * blocks_per_sample;
34 |   for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x;
35 |        i < map_nelem;
36 |        i += step) {
37 |     t = target[toffset + i] - TH_INDEX_BASE;
38 |     assert(t >= 0 && t < n_classes);
39 |     cur_weight = weights ? weights[t] : ScalarConvert<int, T>::to(1);
40 |     input_sum -= input[ioffset + i + map_nelem * t] * cur_weight;
41 |     acc_weight += cur_weight;
42 |   }
43 | 
44 |   __syncthreads();
45 | 
46 |   input_sum = reduceBlock(partial_sums, blockDim.x, input_sum, thrust::plus<AccumT>(), AccumT(0));
47 |   acc_weight = reduceBlock(partial_sums, blockDim.x, acc_weight, thrust::plus<AccumT>(), AccumT(0));
48 | 
49 |   if (threadIdx.x == 0) {
50 |     atomicAdd(total_weight, ScalarConvert<AccumT, T>::to(acc_weight));
51 |     atomicAdd(output, ScalarConvert<AccumT, T>::to(input_sum));
52 |   }
53 | }
54 | 
55 | template<typename T>
56 | __global__ void cunn_SpatialClassNLLCriterion_sizeAverage_kernel(
57 |           T *output,
58 |           T *total_weight)
59 | {
60 |   if (*total_weight > 0)
61 |     *output = THCNumerics<T>::div(*output, *total_weight);
62 | }
63 | 
64 | template<typename T>
65 | __global__ void cunn_SpatialClassNLLCriterion_updateGradInput_kernel(
66 |           T *gradInput,
67 |           THCIndex_t *target,
68 |           T *weights,
69 |           T *total_weight,
70 |           int size_average,
71 |           int batch_size,
72 |           int n_classes,
73 |           int map_nelem,
74 |           int blocks_per_sample)
75 | {
76 |   if (*total_weight <= 0)
77 |     return;
78 | 
79 |   int i, t;
80 |   T norm = size_average ? (ScalarConvert<int, T>::to(1) / *total_weight) : ScalarConvert<int, T>::to(1);
81 | 
82 |   int sample = blockIdx.x / blocks_per_sample;
83 |   int step = blockDim.x * blocks_per_sample;
84 |   int toffset = sample * map_nelem;
85 |   int ioffset = sample * map_nelem * n_classes;
86 |   for (i = (blockIdx.x % blocks_per_sample) * blockDim.x + threadIdx.x;
87 |        i < map_nelem;
88 |        i += step) {
89 |     t = (int)target[toffset + i] - TH_INDEX_BASE;
90 |     assert(t >= 0 && t < n_classes);
91 |     gradInput[ioffset + i + map_nelem * t] = -(weights ? weights[t] : ScalarConvert<int, T>::to(1)) * norm;
92 |   }
93 | }
94 | 
95 | #include "generic/SpatialClassNLLCriterion.cu"
96 | #include "THCGenerateFloatTypes.h"
97 | 


--------------------------------------------------------------------------------
/lib/THCUNN/RReLU.cu:
--------------------------------------------------------------------------------
  1 | #include "THCUNN.h"
  2 | #include "THCHalf.h"
  3 | #include "THCHalfAutoNumerics.cuh"
  4 | #include <THC/THCApply.cuh>
  5 | #include "common.h"
  6 | #include <curand.h>
  7 | #include <curand_kernel.h>
  8 | 
  9 | // copied from cutorch/lib/THC/THCTensorRandom.cu
 10 | #define MAX_NUM_BLOCKS 64
 11 | #define BLOCK_SIZE 256
 12 | #define NUM_BLOCKS(n) min((int)THCCeilDiv(n, (ptrdiff_t) BLOCK_SIZE), MAX_NUM_BLOCKS)
 13 | 
 14 | template<typename T>
 15 | inline T __device__ curand_uniform_type(curandStateMtgp32 *state);
 16 | 
 17 | #ifdef CUDA_HALF_TENSOR
 18 | template <>
 19 | inline half __device__ curand_uniform_type<half>(curandStateMtgp32 *state) {
 20 |   return ScalarConvert<float, half>::to(curand_uniform(state));
 21 | }
 22 | #endif
 23 | 
 24 | template <>
 25 | inline float __device__ curand_uniform_type<float>(curandStateMtgp32 *state) {
 26 |   return curand_uniform(state);
 27 | }
 28 | 
 29 | template <>
 30 | inline double __device__ curand_uniform_type<double>(curandStateMtgp32 *state) {
 31 |   return curand_uniform_double(state);
 32 | }
 33 | 
 34 | template <typename T>
 35 | __global__ void rreluUpdateOutputTrain(int n, curandStateMtgp32 *state,
 36 |   T *input, T* noise, T *output, double a, double b)
 37 | {
 38 |   CUDA_KERNEL_LOOP(i, n)
 39 |   {
 40 |     if (input[i] <= 0)
 41 |     {
 42 |       T r = curand_uniform_type<T>(&state[blockIdx.x]);
 43 |       r = ScalarConvert<double, T>::to(r * (b-a) + a);
 44 |       output[i] = input[i] * r;
 45 |       noise[i] = r;
 46 |     }
 47 |     else
 48 |     {
 49 |       output[i] = input[i];
 50 |       noise[i] = ScalarConvert<int, T>::to(1);
 51 |     }
 52 |   }
 53 | }
 54 | 
 55 | template <typename T>
 56 | struct RReLUUpdateOutputEval_functor
 57 | {
 58 |   const T negSlope_;
 59 | 
 60 |   RReLUUpdateOutputEval_functor(T negSlope)
 61 |     : negSlope_(negSlope)
 62 |   {}
 63 | 
 64 |   __device__ __forceinline__ void operator()(T *out, T *in)
 65 |   {
 66 |     const T x = *in;
 67 |     const T r = x <= 0 ? negSlope_ : ScalarConvert<int, T>::to(1);
 68 |     *out = x * r;
 69 |   }
 70 | };
 71 | 
 72 | template <typename T>
 73 | struct RReLUUpdateOutputEvalIP_functor
 74 | {
 75 |   const T negSlope_;
 76 | 
 77 |   RReLUUpdateOutputEvalIP_functor(T negSlope)
 78 |     : negSlope_(negSlope)
 79 |   {}
 80 | 
 81 |   __device__ __forceinline__ void operator()(T *x)
 82 |   {
 83 |     if (*x <= 0)
 84 |     {
 85 |       *x = *x * negSlope_;
 86 |     }
 87 |   }
 88 | };
 89 | 
 90 | template <typename T>
 91 | struct RReLUupdateGradInputEval_functor
 92 | {
 93 |   const T negSlope_;
 94 | 
 95 |   RReLUupdateGradInputEval_functor(T negSlope)
 96 |     : negSlope_(negSlope)
 97 |   {}
 98 | 
 99 |   __device__ __forceinline__ void operator()(T *gradIn, T *gradOut, T *in)
100 |   {
101 |     *gradIn = (*in) <= 0 ? (*gradOut) * negSlope_ : (*gradOut);
102 |   }
103 | };
104 | 
105 | template <typename T>
106 | struct RReLUupdateGradInputEvalIP_functor
107 | {
108 |   const T negSlope_;
109 | 
110 |   RReLUupdateGradInputEvalIP_functor(T negSlope)
111 |     : negSlope_(negSlope)
112 |   {}
113 | 
114 |   __device__ __forceinline__ void operator()(T *gradOut, T *in)
115 |   {
116 |     if (*in <= 0)
117 |     {
118 |       *gradOut = (*gradOut) * negSlope_;
119 |     }
120 |   }
121 | };
122 | 
123 | #include "generic/RReLU.cu"
124 | #include "THCGenerateFloatTypes.h"
125 | 


--------------------------------------------------------------------------------
/lib/THCUNN/VolumetricReplicationPadding.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCDeviceTensor.cuh"
 4 | #include "THCDeviceTensorUtils.cuh"
 5 | #include "THCDeviceUtils.cuh"
 6 | #include "THCReduceApplyUtils.cuh"
 7 | #include "THCHalf.h"
 8 | #include "THCHalfAutoNumerics.cuh"
 9 | #include "THCAtomics.cuh"
10 | #include <THC/THCApply.cuh>
11 | 
12 | template <typename Dtype>
13 | __global__ void VolumetricReplicationPadding_updateOutput(
14 |   THCDeviceTensor<Dtype, 5> input,
15 |   THCDeviceTensor<Dtype, 5> output,
16 |   int pfront, int pback, int ptop, int pbottom, int pleft, int pright) {
17 | 
18 |   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
19 |   int plane = blockIdx.y;
20 |   int batch = blockIdx.z;
21 |   if (outputPointId >= (output.getSize(2) * output.getSize(3) *
22 |                         output.getSize(4))) {
23 |     return;
24 |   }
25 |   int outputPointX = outputPointId % output.getSize(4);
26 |   int outputPointY = (outputPointId / output.getSize(4)) % output.getSize(3);
27 |   int outputPointZ = outputPointId / (output.getSize(3) * output.getSize(4));
28 | 
29 |   int iStartX = max(0, -pleft);
30 |   int iStartY = max(0, -ptop);
31 |   int iStartZ = max(0, -pfront);
32 |   int oStartX = max(0, pleft);
33 |   int oStartY = max(0, ptop);
34 |   int oStartZ = max(0, pfront);
35 | 
36 |   int inputPointX = min(max(pleft, outputPointX),
37 |                         input.getSize(4) + pleft - 1) - oStartX + iStartX;
38 |   int inputPointY = min(max(ptop, outputPointY),
39 |                         input.getSize(3) + ptop - 1) - oStartY + iStartY;
40 |   int inputPointZ = min(max(pfront, outputPointZ),
41 |                         input.getSize(2) + pfront - 1) - oStartZ + iStartZ;
42 | 
43 |   Dtype valueToCopy =
44 |       input[batch][plane][inputPointZ][inputPointY][inputPointX];
45 |   output[batch][plane][outputPointZ][outputPointY][outputPointX] = valueToCopy;
46 | }
47 | 
48 | template <typename Dtype>
49 | __global__ void VolumetricReplicationPadding_updateGradInput(
50 |   THCDeviceTensor<Dtype, 5> gradInput,
51 |   THCDeviceTensor<Dtype, 5> gradOutput,
52 |   int pfront, int pback, int ptop, int pbottom, int pleft, int pright) {
53 |   int outputPointId = threadIdx.x + blockIdx.x * blockDim.x;
54 |   int plane = blockIdx.y;
55 |   int batch = blockIdx.z;
56 | 
57 |   if (outputPointId >= (gradOutput.getSize(2) * gradOutput.getSize(3) *
58 |                         gradOutput.getSize(4))) {
59 |     return;
60 |   }
61 |   int outputPointX = outputPointId % gradOutput.getSize(4);
62 |   int outputPointY = (outputPointId / gradOutput.getSize(4)) %
63 |       gradOutput.getSize(3);
64 |   int outputPointZ = outputPointId / (gradOutput.getSize(3) *
65 |       gradOutput.getSize(4));
66 | 
67 |   int iStartX = max(0, -pleft);
68 |   int iStartY = max(0, -ptop);
69 |   int iStartZ = max(0, -pfront);
70 |   int oStartX = max(0, pleft);
71 |   int oStartY = max(0, ptop);
72 |   int oStartZ = max(0, pfront);
73 | 
74 |   int inputPointX = min(max(pleft, outputPointX),
75 |                         gradInput.getSize(4) + pleft - 1) - oStartX + iStartX;
76 |   int inputPointY = min(max(ptop, outputPointY),
77 |                         gradInput.getSize(3) + ptop - 1) - oStartY + iStartY;
78 |   int inputPointZ = min(max(pfront, outputPointZ),
79 |                         gradInput.getSize(2) + pfront - 1) - oStartZ + iStartZ;
80 | 
81 |   Dtype valueToCopy =
82 |       gradOutput[batch][plane][outputPointZ][outputPointY][outputPointX];
83 |   atomicAdd(&gradInput[batch][plane][inputPointZ][inputPointY][inputPointX],
84 |             valueToCopy);
85 | }
86 | 
87 | 
88 | #include "generic/VolumetricReplicationPadding.cu"
89 | #include "THCGenerateFloatTypes.h"
90 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <a name="cunn.dok"/>
  2 | # CUDA backend for the Neural Network Package #
  3 | 
  4 | This package provides a CUDA implementation for many of the modules in the base nn package: [nn](https://github.com/torch/nn/blob/master/README.md)
  5 |  * [Modules](doc/cunnmodules.md#nn.cunnmodules.dok): There are also additional GPU-related modules not found in the nn package.
  6 | 
  7 | ## Installing from source
  8 | ```bash
  9 | git clone https://github.com/torch/cunn
 10 | cd cunn
 11 | luarocks make rocks/cunn-scm-1.rockspec
 12 | ```
 13 | 
 14 | ## To use
 15 | 
 16 | Simply convert your network model to CUDA by calling `:cuda()`:
 17 | ```lua
 18 | local model = nn.Sequential()
 19 | model:add(nn.Linear(2,2))
 20 | model:add(nn.LogSoftMax())
 21 | 
 22 | model:cuda()  -- convert model to CUDA
 23 | ```
 24 | 
 25 | ... and similarly for your tensors:
 26 | ```lua
 27 | local input = torch.Tensor(32,2):uniform()
 28 | input = input:cuda()
 29 | local output = model:forward(input)
 30 | ```
 31 | ... or create them directly as `CudaTensor`s:
 32 | ```lua
 33 | local input = torch.CudaTensor(32,2):uniform()
 34 | local output = model:forward(input)
 35 | ```
 36 | 
 37 | ## To run unit-tests
 38 | 
 39 | ```lua
 40 | luajit -l cunn -e 'cunn.test()'
 41 | ```
 42 | 
 43 | ## GPU Training Concepts
 44 | 
 45 | __Performance__
 46 | 
 47 | * data should be transferred between main memory and gpu in batches, otherwise the transfer time will be dominated
 48 | by latency associated with speed of light, and execution overheads, rather than by bandwidth
 49 | * therefore, train and predict using mini-batches
 50 | * allocating GPU memory causes a sync-point, which will noticeably affect performance
 51 |   * therefore try to allocate any `CudaTensor`s once, at the start of the program,
 52 |   and then simply copy data backwards and forwards
 53 |   between main memory and existing `CudaTensor`s
 54 | * similarly, try to avoid any operations that implicitly allocate new tensors.  For example, if you write:
 55 | ```lua
 56 | require 'cutorch'
 57 | 
 58 | local a = torch.CudaTensor(1000):uniform()
 59 | for it=1,1000 do
 60 |   local b = torch.add(a, 1)
 61 | end
 62 | ```
 63 | ... this will allocate one thousand new `CudaTensor`s, one for each call to `torch.add(a, 1)`.
 64 | 
 65 | Use instead this form:
 66 | ```lua
 67 | require 'cutorch'
 68 | 
 69 | local a = torch.CudaTensor(1000):uniform()
 70 | local b = torch.CudaTensor(1000):uniform()
 71 | for it=1,1000 do
 72 |   b:add(a, 1)
 73 | end
 74 | ```
 75 | In this form, `b` is allocated only once, before the loop.  Then the `b:add(a,1)` operation will perform
 76 | the add inside the GPU kernel, and store the result into the original `b` `CudaTensor`.  This
 77 | will run noticeably faster, in general.  It's also a lot less likely to eat up arbitrary amounts of memory,
 78 | and less likely to need frequent calls to `collectgarbage(); collectgarbage()`.
 79 | 
 80 | __Benchmarking__
 81 | 
 82 | * GPU operations will typically continue after an instruction has been issued
 83 | * eg, if you do:
 84 | ```lua
 85 | require 'cutorch'
 86 | local a = torch.CudaTensor(1000,1000):uniform()
 87 | a:add(1)
 88 | ```
 89 | ... the GPU kernel to add 1 will only be scheduled for launch by `a:add(1)`.  It might not have completed yet, or
 90 | even have reached the GPU, at the time that the `a:add(1)` returns
 91 | * therefore for running wall-clock timings, you should call `cutorch.synchronize()` before each timecheck
 92 | point:
 93 | ```lua
 94 | require 'cutorch'
 95 | require 'sys'
 96 | 
 97 | local a = torch.CudaTensor(1000,1000):uniform()
 98 | cutorch.synchronize()
 99 | start = sys.tic()
100 | a:add(1)
101 | cutorch.synchronize()
102 | print(sys.toc())
103 | ```
104 | 
105 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SoftMax.cu:
--------------------------------------------------------------------------------
  1 | #include "THCUNN.h"
  2 | #include "THCHalf.h"
  3 | #include "THCHalfAutoNumerics.cuh"
  4 | 
  5 | #define SOFTMAX_THREADS 128
  6 | 
  7 | template <typename T, typename AccumT>
  8 | __global__ void cunn_SoftMax_updateOutput_kernel(
  9 |   T *output, T *input, int nframe, int dim, int stride0, int stride1)
 10 | {
 11 |   __shared__ AccumT buffer[SOFTMAX_THREADS+1];
 12 |   T *input_k  = input  + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
 13 |   T *output_k = output + blockIdx.x*dim*stride0 + blockIdx.y*stride1 + blockIdx.z;
 14 | 
 15 |   int i_start = threadIdx.x;
 16 |   int i_end = dim;
 17 |   int i_step = blockDim.x;
 18 | 
 19 |   // max?
 20 |   buffer[threadIdx.x] = -THCNumerics<AccumT>::max();
 21 |   for (int i=i_start; i<i_end; i+=i_step)
 22 |   {
 23 |     T z = input_k[i*stride0];
 24 |     AccumT zAcc = ScalarConvert<T, AccumT>::to(z);
 25 |     if (buffer[threadIdx.x] < zAcc)
 26 |       buffer[threadIdx.x] = zAcc;
 27 |   }
 28 | 
 29 | 
 30 |   __syncthreads();
 31 | 
 32 |   // reduce
 33 |   if (threadIdx.x == 0)
 34 |   {
 35 |     AccumT max_k = -THCNumerics<AccumT>::max();
 36 |     for (int i=0; i<blockDim.x; i++)
 37 |     {
 38 |       if (max_k < buffer[i])
 39 |         max_k = buffer[i];
 40 |     }
 41 |     buffer[SOFTMAX_THREADS] = max_k;
 42 |   }
 43 | 
 44 |   __syncthreads();
 45 | 
 46 |   // sum?
 47 |   T max_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
 48 |   buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
 49 |   for (int i=i_start; i<i_end; i+=i_step) {
 50 |     T z = THCNumerics<T>::exp(input_k[i*stride0]-max_k);
 51 |     buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(z);
 52 |     output_k[i*stride0] = z;
 53 |   }
 54 | 
 55 |   __syncthreads();
 56 | 
 57 |   // reduce
 58 |   if (threadIdx.x == 0)
 59 |   {
 60 |     AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
 61 |     for (int i=0; i<blockDim.x; i++)
 62 |       sum_k += buffer[i];
 63 |     buffer[SOFTMAX_THREADS] = sum_k;
 64 |   }
 65 | 
 66 |   __syncthreads();
 67 | 
 68 |   // softmax
 69 |   T sum_k = ScalarConvert<AccumT, T>::to(buffer[SOFTMAX_THREADS]);
 70 |   for (int i=i_start; i<i_end; i+=i_step)
 71 |     output_k[i*stride0] = output_k[i*stride0] / sum_k;
 72 | }
 73 | 
 74 | template <typename T, typename AccumT>
 75 | __global__ void cunn_SoftMax_updateGradInput_kernel(
 76 |   T *gradInput, T *output, T *gradOutput, int nframe, int dim, int stride0, int stride1)
 77 | {
 78 |   __shared__ AccumT buffer[SOFTMAX_THREADS];
 79 |   T *gradInput_k  = gradInput  + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
 80 |   T *output_k     = output     + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
 81 |   T *gradOutput_k = gradOutput + blockIdx.x*dim*stride0 + blockIdx.y * stride1 + blockIdx.z;
 82 | 
 83 |   int i_start = threadIdx.x;
 84 |   int i_end = dim;
 85 |   int i_step = blockDim.x;
 86 | 
 87 |   // sum?
 88 |   buffer[threadIdx.x] = ScalarConvert<int, AccumT>::to(0);
 89 |   for (int i=i_start; i<i_end; i+=i_step)
 90 |     buffer[threadIdx.x] += ScalarConvert<T, AccumT>::to(gradOutput_k[i*stride0] * output_k[i*stride0]);
 91 | 
 92 |   __syncthreads();
 93 | 
 94 |   // reduce
 95 |   if (threadIdx.x == 0)
 96 |   {
 97 |     AccumT sum_k = ScalarConvert<int, AccumT>::to(0);
 98 |     for (int i=0; i<blockDim.x; i++)
 99 |       sum_k += buffer[i];
100 |     buffer[0] = sum_k;
101 |   }
102 | 
103 |   __syncthreads();
104 | 
105 |   T sum_k = ScalarConvert<AccumT, T>::to(buffer[0]);
106 |   for (int i=i_start; i<i_end; i+=i_step)
107 |     gradInput_k[i*stride0] = output_k[i*stride0] * (gradOutput_k[i*stride0] - sum_k);
108 | }
109 | 
110 | #include "generic/SoftMax.cu"
111 | #include "THCGenerateFloatTypes.h"
112 | 
113 | #undef SOFTMAX_THREADS
114 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialDilatedMaxPooling.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include "common.h"
 5 | 
 6 | // kernels borrowed from Caffe
 7 | template <typename Dtype, typename AccType>
 8 | __global__ void MaxPoolForward(const int nthreads, const Dtype* bottom_data,
 9 |     const int num, const int channels, const int height,
10 |     const int width, const int pooled_height, const int pooled_width,
11 |     const int kernel_h, const int kernel_w, const int stride_h,
12 |     const int stride_w, const int pad_h, const int pad_w,
13 |     const int dilation_h, const int dilation_w, Dtype* top_data,
14 |     long* top_mask) {
15 |   CUDA_KERNEL_LOOP(index, nthreads) {
16 |     int pw = index % pooled_width;
17 |     int ph = (index / pooled_width) % pooled_height;
18 |     int c = (index / pooled_width / pooled_height) % channels;
19 |     int n = index / pooled_width / pooled_height / channels;
20 |     int hstart = ph * stride_h - pad_h;
21 |     int wstart = pw * stride_w - pad_w;
22 |     int hend = min(hstart + (kernel_h - 1) * dilation_h + 1, height);
23 |     int wend = min(wstart + (kernel_w - 1) * dilation_w + 1, width);
24 |     while(hstart < 0)
25 |       hstart += dilation_h;
26 |     while(wstart < 0)
27 |       wstart += dilation_w;
28 |     AccType maxval = THCNumerics<AccType>::min();
29 |     int maxidx = -1;
30 |     bottom_data += (n * channels + c) * height * width;
31 |     for (int h = hstart; h < hend; h += dilation_h) {
32 |       for (int w = wstart; w < wend; w += dilation_w) {
33 |         if (ScalarConvert<Dtype, AccType>::to(bottom_data[h * width + w]) > maxval) {
34 |           maxidx = h * width + w;
35 |           maxval = ScalarConvert<Dtype, AccType>::to(bottom_data[maxidx]);
36 |         }
37 |       }
38 |     }
39 |     top_data[index] = ScalarConvert<AccType, Dtype>::to(maxval);
40 |     top_mask[index] = maxidx + TH_INDEX_BASE;
41 |   }
42 | }
43 | 
44 | 
45 | template <typename Dtype, typename AccType>
46 | __global__ void MaxPoolBackward(const int nthreads, const Dtype* top_diff,
47 |     const long* top_mask, const int num, const int channels,
48 |     const int height, const int width, const int pooled_height,
49 |     const int pooled_width, const int kernel_h, const int kernel_w,
50 |     const int stride_h, const int stride_w, const int pad_h, const int pad_w,
51 |     const int dilation_h, const int dilation_w,
52 |     Dtype* bottom_diff) {
53 |   CUDA_KERNEL_LOOP(index, nthreads) {
54 |     // find out the local index
55 |     // find out the local offset
56 |     int w = index % width;
57 |     int h = (index / width) % height;
58 |     int c = (index / width / height) % channels;
59 |     int n = index / width / height / channels;
60 |     int phstart =
61 |         (h + pad_h < ((kernel_h - 1) * dilation_h + 1)) ? 0 : (h + pad_h - ((kernel_h - 1) * dilation_h + 1)) / stride_h + 1;
62 |     int phend = min((h + pad_h) / stride_h + 1, pooled_height);
63 |     int pwstart =
64 |         (w + pad_w < ((kernel_w - 1) * dilation_w + 1)) ? 0 : (w + pad_w - ((kernel_w - 1) * dilation_w + 1)) / stride_w + 1;
65 |     int pwend = min((w + pad_w) / stride_w + 1, pooled_width);
66 | 
67 |     AccType gradient = AccType(0);
68 |     int offset = (n * channels + c) * pooled_height * pooled_width;
69 |     top_diff += offset;
70 |     top_mask += offset;
71 |     for (int ph = phstart; ph < phend; ++ph) {
72 |       for (int pw = pwstart; pw < pwend; ++pw) {
73 |         if (top_mask[ph * pooled_width + pw] - TH_INDEX_BASE == h * width + w) {
74 |           gradient += ScalarConvert<Dtype, AccType>::to(top_diff[ph * pooled_width + pw]);
75 |         }
76 |       }
77 |     }
78 |     bottom_diff[index] = ScalarConvert<AccType, Dtype>::to(gradient);
79 |   }
80 | }
81 | 
82 | #include "generic/SpatialDilatedMaxPooling.cu"
83 | #include "THCGenerateFloatTypes.h"
84 | 


--------------------------------------------------------------------------------
/lib/THCUNN/common.h:
--------------------------------------------------------------------------------
 1 | #ifndef THCUNN_COMMON_H
 2 | #define THCUNN_COMMON_H
 3 | 
 4 | // CUDA: grid stride looping
 5 | #define CUDA_KERNEL_LOOP(i, n) \
 6 |   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
 7 | 
 8 | #define THCUNN_assertSameGPU(...) THAssertMsg(THCTensor_(checkGPU)(__VA_ARGS__), \
 9 |   "Some of weight/gradient/input tensors are located on different GPUs. Please move them to a single one.")
10 | 
11 | // Use 1024 threads per block, which requires cuda sm_2x or above
12 | const int CUDA_NUM_THREADS = 1024;
13 | 
14 | // CUDA: number of blocks for threads.
15 | inline int GET_BLOCKS(const int N)
16 | {
17 |   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
18 | }
19 | 
20 | #define THCUNN_resizeAs_indices(STATE, I1, I2)              \
21 |   THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2);  \
22 |   if (!THCIndexTensor_(isSize)(STATE, I1, size2))           \
23 |   { \
24 |     THCudaLongTensor_resize(STATE, I1, size2, NULL);        \
25 |   } \
26 |   THLongStorage_free(size2);
27 | 
28 | #define THCUNN_check_shape(STATE, I1, I2)                 \
29 |   if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2))	\
30 |   { \
31 |        THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1);  \
32 |        THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2);  \
33 |        THError(#I1 " and " #I2 " shapes do not match: "   \
34 |                #I1 " %s, " #I2 " %s", s1.str, s2.str);    \
35 |   }
36 | 
37 | 
38 | #define THCUNN_check_shape_indices(STATE, I1, I2)              \
39 |   THLongStorage *size2 = THCTensor_(newSizeOf)(STATE, I2);     \
40 |   if (!THCIndexTensor_(isSize)(STATE, I1, size2))              \
41 |   { \
42 |        THCDescBuff s1 = THCIndexTensor_(sizeDesc)(STATE, I1);  \
43 |        THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2);       \
44 |        THError(#I1 " and " #I2 " shapes do not match: "        \
45 |                #I1 " %s, " #I2 " %s", s1.str, s2.str);         \
46 |   } \
47 |   THLongStorage_free(size2);
48 | 
49 | #define THCUNN_check_nElement(STATE, I1, I2)                \
50 |   if (I1 != NULL && I2 != NULL ) {                          \
51 |     ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1);              \
52 |     ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2);              \
53 |     if (n1 != n2)                                           \
54 |     {	\
55 |       THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1);     \
56 |       THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2);     \
57 |       THError(#I1 " and " #I2 " have different number of elements: "	\
58 |               #I1 "%s has %ld elements, while "             \
59 |               #I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
60 |     }	\
61 |   }
62 | 
63 | #define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \
64 |   if (THCTensor_(nDimension)(STATE, T) != DIM ||             \
65 |       THCTensor_(size)(STATE, T, DIM_SIZE) != SIZE) {        \
66 |       THCDescBuff s1 = THCTensor_(sizeDesc)(state, T);       \
67 |       THError("Need " #T " of dimension %d and " #T ".size[%d] == %d"	\
68 |               " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
69 |   }
70 | 
71 | #define THCUNN_check_dim_size_indices(STATE, T, DIM, DIM_SIZE, SIZE)  \
72 |   if (THCIndexTensor_(nDimension)(STATE, T) != DIM ||                 \
73 |       THCIndexTensor_(size)(STATE, T, DIM_SIZE) != SIZE) {            \
74 |       THCDescBuff s1 = THCIndexTensor_(sizeDesc)(state, T);           \
75 |       THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
76 |               " but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
77 |   }
78 | 
79 | #define THCUNN_argCheck(STATE, COND, ARG, T, FORMAT) \
80 |   if (!(COND)) { \
81 |     THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \
82 |     THArgCheck(COND, ARG, FORMAT, s1.str);           \
83 |   }
84 | 
85 | #endif
86 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/RReLU.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/RReLU.cu"
  3 | #else
  4 | 
  5 | #include "../common.h"
  6 | 
  7 | void THNN_(RReLU_updateOutput)(
  8 |            THCState *state,
  9 |            THCTensor *input,
 10 |            THCTensor *output,
 11 |            THCTensor *noise,
 12 |            double lower,
 13 |            double upper,
 14 |            bool train,
 15 |            bool inplace,
 16 |            void *generator)
 17 | {
 18 |   THCUNN_assertSameGPU(state, 3, input, output, noise);
 19 |   struct curandStateMtgp32* gen_states = THCRandom_generatorStates(state);
 20 | 
 21 |   if (train)
 22 |   {
 23 |     input = THCTensor_(newContiguous)(state, input);
 24 |     THCTensor_(resizeAs)(state, noise, input);
 25 |     real *input_data = THCTensor_(data)(state, input);
 26 |     real *noise_data = THCTensor_(data)(state, noise);
 27 |     ptrdiff_t n = THCTensor_(nElement)(state, input);
 28 |     if (inplace)
 29 |     {
 30 |       rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 31 |         n, gen_states, input_data, noise_data, input_data, lower, upper);
 32 |       THCTensor_(set)(state, output, input);
 33 |     }
 34 |     else
 35 |     {
 36 |       THCTensor_(resizeAs)(state, output, input);
 37 |       real *output_data = THCTensor_(data)(state, output);
 38 |       rreluUpdateOutputTrain<<<NUM_BLOCKS(n), BLOCK_SIZE, 0, THCState_getCurrentStream(state)>>>(
 39 |         n, gen_states, input_data, noise_data, output_data, lower, upper);
 40 |     }
 41 |     THCudaCheck(cudaGetLastError());
 42 |     THCTensor_(free)(state, input);
 43 |   }
 44 |   else
 45 |   {
 46 |     const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
 47 |     if (inplace)
 48 |     {
 49 |       THC_pointwiseApply1(state, input, RReLUUpdateOutputEvalIP_functor<real>(negSlope));
 50 |       THCTensor_(set)(state, output, input);
 51 |     }
 52 |     else
 53 |     {
 54 |       THCTensor_(resizeAs)(state, output, input);
 55 |       THC_pointwiseApply2(state, output, input, RReLUUpdateOutputEval_functor<real>(negSlope));
 56 |     }
 57 |   }
 58 | }
 59 | 
 60 | void THNN_(RReLU_updateGradInput)(
 61 |            THCState *state,
 62 |            THCTensor *input,
 63 |            THCTensor *gradOutput,
 64 |            THCTensor *gradInput,
 65 |            THCTensor *noise,
 66 |            double lower,
 67 |            double upper,
 68 |            bool train,
 69 |            bool inplace)
 70 | {
 71 |   THCUNN_check_nElement(state, input, gradOutput);
 72 |   THCUNN_assertSameGPU(state, 4, input, gradOutput, gradInput, noise);
 73 | 
 74 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
 75 | 
 76 |   if (train && upper - lower > 1E-6)    // e.g. if upper == lower, RReLU behaves like LeakyReLU
 77 |   {
 78 |     // multiply the gradient by the noise tensor
 79 |     if (inplace)
 80 |     {
 81 |       THCTensor_(cmul)(state, gradOutput, gradOutput, noise);
 82 |       THCTensor_(set)(state, gradInput, gradOutput);
 83 |     }
 84 |     else
 85 |     {
 86 |       THCTensor_(resizeAs)(state, gradInput, input);
 87 |       THCTensor_(cmul)(state, gradInput, gradOutput, noise);
 88 |     }
 89 |   }
 90 |   else
 91 |   {
 92 |     // use constant factor for negative input values
 93 |     const real negSlope = ScalarConvert<double, real>::to((lower + upper) / 2);
 94 |     if (inplace)
 95 |     {
 96 |       THC_pointwiseApply2(state, gradOutput, input, RReLUupdateGradInputEvalIP_functor<real>(negSlope));
 97 |       THCTensor_(set)(state, gradInput, gradOutput);
 98 |     }
 99 |     else
100 |     {
101 |       THCTensor_(resizeAs)(state, gradInput, input);
102 |       THC_pointwiseApply3(state, gradInput, gradOutput, input, RReLUupdateGradInputEval_functor<real>(negSlope));
103 |     }
104 |   }
105 | 
106 |   THCTensor_(free)(state, gradOutput);
107 | }
108 | 
109 | #endif
110 | 


--------------------------------------------------------------------------------
/lib/THCUNN/TemporalMaxPooling.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "common.h"
 3 | #include "THCHalf.h"
 4 | #include "THCHalfAutoNumerics.cuh"
 5 | #include "THCAtomics.cuh"
 6 | 
 7 | #define TEMPORAL_MAX_POOLING_THREADS 1024
 8 | 
 9 | template <typename Dtype>
10 | __global__ void cunn_TemporalMaxPooling_updateOutputKernel(Dtype *input, Dtype *output, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
11 |   // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
12 |   Dtype *input_data = input + blockIdx.x * input_w * input_n + (
13 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
14 |   Dtype *output_data = output + blockIdx.x * output_w * input_n + (
15 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
16 |   THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
17 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
18 | 
19 |   int feat = 0;
20 |   int time = 0;
21 |   int max_time = input_n * kW;
22 | 
23 |   Dtype max_value;
24 |   THCIndex_t max_index = 0;
25 | 
26 |   if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
27 |     // For all features
28 |     for (feat = 0; feat < input_n; ++feat) {
29 |       max_value = THCNumerics<Dtype>::min();
30 |       // For all values in the kernel space
31 |       for (time = 0; time < max_time; time += input_n) {
32 |         if (max_value < input_data[time + feat]) {
33 |           max_value = input_data[time + feat];
34 |           max_index = time / input_n;
35 |         }
36 |       }
37 |       output_data[feat] = max_value;
38 |       indices_data[feat] = max_index;
39 |     }
40 |   }
41 | }
42 | 
43 | template <typename Dtype>
44 | __global__ void cunn_TemporalMaxPooling_updateGradInputKernel(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
45 |   // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
46 |   Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
47 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
48 |   Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
49 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
50 |   THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
51 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
52 | 
53 |   int feat = 0;
54 | 
55 |   if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
56 |     // For all features
57 |     for (feat = 0; feat < input_n; ++feat) {
58 |       gradInput_data[indices_data[feat] * input_n + feat] += gradOutput_data[feat];
59 |     }
60 |   }
61 | }
62 | 
63 | template <typename Dtype>
64 | __global__ void cunn_TemporalMaxPooling_updateGradInputKernelAtomic(Dtype *gradInput, Dtype *gradOutput, THCIndex_t *indices, int input_w, int input_n, int output_w, int kW, int dW) {
65 |   // Block idx is the batch index, thread idx + block idx y * MAX_THREADS is the time index
66 |   Dtype *gradInput_data = gradInput + blockIdx.x * input_w * input_n + (
67 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n * dW;
68 |   Dtype *gradOutput_data = gradOutput + blockIdx.x * output_w * input_n + (
69 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
70 |   THCIndex_t *indices_data = indices + blockIdx.x * output_w * input_n + (
71 |       threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS) * input_n;
72 | 
73 |   int feat = 0;
74 | 
75 |   if (threadIdx.x + blockIdx.y * TEMPORAL_MAX_POOLING_THREADS < output_w) {
76 |     // For all features
77 |     for (feat = 0; feat < input_n; ++feat) {
78 |       atomicAdd(&gradInput_data[indices_data[feat] * input_n + feat], gradOutput_data[feat]);
79 |     }
80 |   }
81 | }
82 | 
83 | #include "generic/TemporalMaxPooling.cu"
84 | #include "THCGenerateFloatTypes.h"
85 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialMaxUnpooling.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/SpatialMaxUnpooling.cu"
  3 | #else
  4 | 
  5 | void THNN_(SpatialMaxUnpooling_updateOutput)(
  6 |            THCState *state,
  7 |            THCTensor *input,
  8 |            THCTensor *output,
  9 |            THCIndexTensor *indices,
 10 |            int owidth, int oheight)
 11 | {
 12 |   THCUNN_assertSameGPU(state, 3, input, output, indices);
 13 |   THCUNN_argCheck(state, input->nDimension == 3 || input->nDimension == 4, 2, input,
 14 |                   "3D or 4D (batch mode) tensor expected for input, but got: %s");
 15 |   THCUNN_check_shape_indices(state, indices, input);
 16 | 
 17 |   long nInputCols, nInputRows, nInputPlane, batchSize;
 18 | 
 19 |   if (input->nDimension == 3) {
 20 |     nInputCols = input->size[2];
 21 |     nInputRows = input->size[1];
 22 |     nInputPlane = input->size[0];
 23 |     batchSize = 1;
 24 |   }
 25 |   else
 26 |   {
 27 |     nInputCols = input->size[3];
 28 |     nInputRows = input->size[2];
 29 |     nInputPlane = input->size[1];
 30 |     batchSize = input->size[0];
 31 |   }
 32 | 
 33 |   input = THCTensor_(newContiguous)(state, input);
 34 |   indices = THCIndexTensor_(newContiguous)(state, indices);
 35 |   THCTensor_(resize4d)(state, output, batchSize, nInputPlane, oheight, owidth);
 36 |   THCTensor_(zero)(state, output);
 37 | 
 38 |   int count = THCTensor_(nElement)(state, input);
 39 | 
 40 |   MaxUnpoolForward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
 41 |       (count, THCTensor_(data)(state, input), THCIndexTensor_(data)(state, indices),
 42 |       batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, output));
 43 |   THCudaCheck(cudaGetLastError());
 44 | 
 45 |   if(input->nDimension == 3)
 46 |     THCTensor_(resize3d)(state, output, nInputPlane, oheight, owidth);
 47 | 
 48 |   THCTensor_(free)(state, input);
 49 |   THCIndexTensor_(free)(state, indices);
 50 | }
 51 | 
 52 | void THNN_(SpatialMaxUnpooling_updateGradInput)(
 53 |            THCState *state,
 54 |            THCTensor *input,
 55 |            THCTensor *gradOutput,
 56 |            THCTensor *gradInput,
 57 |            THCIndexTensor *indices,
 58 |            int owidth, int oheight)
 59 | {
 60 |   THCUNN_assertSameGPU(state, 4, input, gradOutput, indices, gradInput);
 61 |   THCUNN_check_shape_indices(state, indices, input);
 62 | 
 63 |   long nInputCols, nInputRows, nInputPlane, batchSize;
 64 |   int dimw = 2;
 65 |   int dimh = 1;
 66 | 
 67 |   if (input->nDimension == 3) {
 68 |     nInputPlane = input->size[0];
 69 |     batchSize = 1;
 70 |   }
 71 |   else
 72 |   {
 73 |     ++dimw;
 74 |     ++dimh;
 75 |     nInputPlane = input->size[1];
 76 |     batchSize = input->size[0];
 77 |   }
 78 |   nInputCols = input->size[dimw];
 79 |   nInputRows = input->size[dimh];
 80 | 
 81 |   if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
 82 |      THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
 83 |              oheight, owidth,gradOutput->size[dimh],gradOutput->size[dimw]);
 84 |   }
 85 | 
 86 |   input = THCTensor_(newContiguous)(state, input);
 87 |   indices = THCIndexTensor_(newContiguous)(state, indices);
 88 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
 89 |   THCTensor_(resizeAs)(state, gradInput, input);
 90 | 
 91 |   int count = THCTensor_(nElement)(state, input);
 92 | 
 93 |   MaxUnpoolBackward <<< GET_BLOCKS(count), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state) >>>
 94 |       (count, THCTensor_(data)(state, gradOutput), THCIndexTensor_(data)(state, indices),
 95 |       batchSize, nInputPlane, nInputRows, nInputCols, oheight, owidth, THCTensor_(data)(state, gradInput));
 96 |   THCudaCheck(cudaGetLastError());
 97 | 
 98 |   // clean
 99 |   THCTensor_(free)(state, input);
100 |   THCIndexTensor_(free)(state, indices);
101 |   THCTensor_(free)(state, gradOutput);
102 | }
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/lib/THCUNN/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8 FATAL_ERROR)
 2 | CMAKE_POLICY(VERSION 2.8)
 3 | 
 4 | OPTION(NDEBUG "disable asserts (WARNING: this may result in silent UB e.g. with out-of-bound indices)")
 5 | IF(NOT NDEBUG)
 6 |   MESSAGE(STATUS "Removing -DNDEBUG from compile flags")
 7 |   STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS "" ${CMAKE_C_FLAGS})
 8 |   STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_DEBUG "" ${CMAKE_C_FLAGS_DEBUG})
 9 |   STRING(REPLACE "-DNDEBUG" "" CMAKE_C_FLAGS_RELEASE "" ${CMAKE_C_FLAGS_RELEASE})
10 |   STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS "" ${CMAKE_CXX_FLAGS})
11 |   STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_DEBUG "" ${CMAKE_CXX_FLAGS_DEBUG})
12 |   STRING(REPLACE "-DNDEBUG" "" CMAKE_CXX_FLAGS_RELEASE "" ${CMAKE_CXX_FLAGS_RELEASE})
13 | ENDIF()
14 | 
15 | IF(NOT Torch_FOUND)
16 |   FIND_PACKAGE(Torch REQUIRED)
17 | ENDIF()
18 | 
19 | IF(NOT TH_LIBRARIES)
20 |   SET(TH_LIBRARIES "TH")
21 | ENDIF(NOT TH_LIBRARIES)
22 | MESSAGE(STATUS "TH_LIBRARIES: ${TH_LIBRARIES}")
23 | IF(NOT THC_LIBRARIES)
24 |   SET(THC_LIBRARIES "THC")
25 | ENDIF(NOT THC_LIBRARIES)
26 | MESSAGE(STATUS "THC_LIBRARIES: ${THC_LIBRARIES}")
27 | 
28 | IF(NOT CUDA_FOUND)
29 |   FIND_PACKAGE(CUDA 6.5 REQUIRED)
30 | ENDIF()
31 | 
32 | IF ($ENV{TH_BINARY_BUILD})
33 |   MESSAGE(STATUS "TH_BINARY_BUILD detected. Statically linking libstdc++")
34 |   SET(CMAKE_CXX_FLAGS "-static-libstdc++ ${CMAKE_CXX_FLAGS}")
35 |   IF (UNIX AND NOT APPLE)
36 |     # hiding statically linked library symbols, this flag is not available for the linker under MACOSX
37 |     SET(CMAKE_CXX_FLAGS "-Wl,--exclude-libs,libstdc++.a ${CMAKE_CXX_FLAGS}")
38 |   ENDIF(UNIX AND NOT APPLE)
39 | ENDIF()
40 | 
41 | # Detect CUDA architecture and get best NVCC flags
42 | IF(NOT COMMAND CUDA_SELECT_NVCC_ARCH_FLAGS OR MSVC)
43 |   INCLUDE(${CMAKE_CURRENT_SOURCE_DIR}/cmake/select_compute_arch.cmake)
44 | ENDIF()
45 | LIST(APPEND CUDA_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
46 | CUDA_SELECT_NVCC_ARCH_FLAGS(NVCC_FLAGS_EXTRA $ENV{TORCH_CUDA_ARCH_LIST})
47 | LIST(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
48 | 
49 | if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
50 |   if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
51 |     if(CUDA_VERSION VERSION_LESS "8.0")
52 |       MESSAGE(STATUS "Found gcc >=5 and CUDA <= 7.5, adding workaround C++ flags")
53 |       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__")
54 |     endif(CUDA_VERSION VERSION_LESS "8.0")
55 |   endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
56 | endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
57 | 
58 | if(CUDA_VERSION VERSION_GREATER "8.0")
59 |   LIST(APPEND CUDA_NVCC_FLAGS "-D__CUDA_NO_HALF_OPERATORS__")
60 | endif(CUDA_VERSION VERSION_GREATER "8.0")
61 | 
62 | IF(MSVC)
63 |   LIST(APPEND CUDA_NVCC_FLAGS "-Xcompiler /wd4819")
64 |   ADD_DEFINITIONS(-DTH_EXPORTS)
65 | ENDIF()
66 | 
67 | IF(NOT THCUNN_INSTALL_LIB_SUBDIR)
68 |   SET(THCUNN_INSTALL_LIB_SUBDIR "lib" CACHE PATH "THCUNN install library directory")
69 |   SET(THCUNN_INSTALL_INCLUDE_SUBDIR "include" CACHE PATH "THCUNN install include subdirectory")
70 | ENDIF()
71 | 
72 | FILE(GLOB src-cuda *.cu)
73 | 
74 | CUDA_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
75 | CUDA_ADD_LIBRARY(THCUNN MODULE ${src-cuda})
76 | 
77 | IF(MSVC)
78 |   SET_TARGET_PROPERTIES(THCUNN PROPERTIES PREFIX "lib" IMPORT_PREFIX "lib")
79 | ENDIF()
80 | 
81 | INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
82 | TARGET_LINK_LIBRARIES(THCUNN ${THC_LIBRARIES} ${TH_LIBRARIES} ${CUDA_cusparse_LIBRARY})
83 | 
84 | # Luarocks bug pre-14.04 prevents us from setting it for Lua-Torch
85 | IF(THCUNN_SO_VERSION)
86 |   MESSAGE(STATUS "THCUNN_SO_VERSION: ${THCUNN_SO_VERSION}")
87 |   SET_TARGET_PROPERTIES(THCUNN PROPERTIES
88 |     VERSION   ${THCUNN_SO_VERSION}
89 |     SOVERSION ${THCUNN_SO_VERSION})
90 | ENDIF(THCUNN_SO_VERSION)
91 | 
92 | INSTALL(TARGETS THCUNN LIBRARY DESTINATION ${THCUNN_INSTALL_LIB_SUBDIR})
93 | INSTALL(FILES THCUNN.h DESTINATION "${THCUNN_INSTALL_INCLUDE_SUBDIR}/THCUNN")
94 | INSTALL(FILES generic/THCUNN.h DESTINATION "${THCUNN_INSTALL_INCLUDE_SUBDIR}/THCUNN/generic")
95 | 


--------------------------------------------------------------------------------
/lib/THCUNN/row2col.h:
--------------------------------------------------------------------------------
 1 | #ifndef THCUNN_ROW2COL_H
 2 | #define THCUNN_ROW2COL_H
 3 | 
 4 | #include "THCNumerics.cuh"
 5 | #include "common.h"
 6 | 
 7 | // Kernel for fast unfold+copy on rows
 8 | template <typename Dtype>
 9 | __global__ void
10 | row2col_kernel(const int n, const Dtype *data_row, const int width,
11 |                const int ksize_w, const int pad_w, const int stride_w,
12 |                const int dilation_w, const int width_col, Dtype *data_col) {
13 |   CUDA_KERNEL_LOOP(index, n) {
14 |     int w_out = index % width_col;
15 |     index /= width_col;
16 |     int channel_in = index;
17 |     int channel_out = channel_in * ksize_w;
18 |     int w_in = w_out * stride_w - pad_w;
19 |     data_col += (channel_out)*width_col + w_out;
20 |     data_row += (channel_in)*width + w_in;
21 |     for (int j = 0; j < ksize_w; ++j) {
22 |       int w = w_in + j * dilation_w;
23 |       *data_col = (w >= 0 && w < width) ? data_row[j * dilation_w]
24 |                                         : ScalarConvert<int, Dtype>::to(0);
25 |       data_col += width_col;
26 |     }
27 |   }
28 | }
29 | 
30 | template <typename Dtype>
31 | void row2col(cudaStream_t stream, const Dtype *data_row, const int channels,
32 |              const int width, const int ksize_w, const int pad_w,
33 |              const int stride_w, const int dilation_w, Dtype *data_col) {
34 |   // We are going to launch channels * width_col kernels, each
35 |   // kernel responsible for copying a single-channel grid.
36 |   int width_col =
37 |       (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
38 |   int num_kernels = channels * width_col;
39 |   // Launch
40 |   row2col_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
41 |       num_kernels, data_row, width, ksize_w, pad_w, stride_w, 1, width_col,
42 |       data_col);
43 |   THCudaCheck(cudaGetLastError());
44 | }
45 | 
46 | template <typename Dtype, typename Acctype>
47 | __global__ void col2row_kernel(const int n, const Dtype *data_col,
48 |                                const int width, const int channels,
49 |                                const int kernel_w, const int pad_w,
50 |                                const int stride_w, const int dilation_w,
51 |                                const int width_col, Dtype *data_row) {
52 |   CUDA_KERNEL_LOOP(index, n) {
53 |     Acctype val = Acctype(0);
54 |     const int w_row = index % width + pad_w;
55 |     const int c_row = index / width;
56 |     int kernel_extent_w = (kernel_w - 1) * dilation_w + 1;
57 |     // compute the start and end of the output
58 |     const int w_col_start = (w_row < kernel_extent_w)
59 |                                 ? 0
60 |                                 : (w_row - kernel_extent_w) / stride_w + 1;
61 |     const int w_col_end = min(w_row / stride_w + 1, width_col);
62 |     for (int w_col = w_col_start; w_col < w_col_end; w_col += 1) {
63 |       int w_k = (w_row - w_col * stride_w);
64 |       if (w_k % dilation_w == 0) {
65 |         w_k /= dilation_w;
66 |         int data_col_index = (c_row * kernel_w + w_k) * width_col + w_col;
67 |         val += data_col[data_col_index];
68 |       }
69 |     }
70 |     data_row[index] = ScalarConvert<Acctype, Dtype>::to(val);
71 |   }
72 |   }
73 | 
74 | template <typename Dtype, typename Acctype>
75 | void col2row(cudaStream_t stream, const Dtype *data_col, const int channels,
76 |              const int width, const int patch_w, const int pad_w,
77 |              const int stride_w, const int dilation_w, Dtype *data_row) {
78 |   int width_col =
79 |       (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1)) / stride_w + 1;
80 |   int num_kernels = channels * width;
81 |   // To avoid involving atomic operations, we will launch one kernel per
82 |   // bottom dimension, and then in the kernel add up the top dimensions.
83 |   col2row_kernel<
84 |       Dtype, Acctype><<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>>(
85 |       num_kernels, data_col, width, channels, patch_w, pad_w, stride_w,
86 |       dilation_w, width_col, data_row);
87 | 
88 |   THCudaCheck(cudaGetLastError());
89 | }
90 | #endif
91 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialAveragePooling.cu:
--------------------------------------------------------------------------------
 1 | #include "THCUNN.h"
 2 | #include "THCHalf.h"
 3 | #include "THCHalfAutoNumerics.cuh"
 4 | #include "common.h"
 5 | 
 6 | template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
 7 | __global__ void AvePoolForward(const int nthreads,
 8 |     const Dtype* const bottom_data, const int num, const int channels,
 9 |     const int height, const int width, const int pooled_height,
10 |     const int pooled_width, const int kernel_h, const int kernel_w,
11 |     const int stride_h, const int stride_w, const int pad_h, const int pad_w,
12 |     Dtype* const top_data) {
13 |   CUDA_KERNEL_LOOP(index, nthreads) {
14 |     const int pw = index % pooled_width;
15 |     const int ph = (index / pooled_width) % pooled_height;
16 |     const int c = (index / pooled_width / pooled_height) % channels;
17 |     const int n = index / pooled_width / pooled_height / channels;
18 |     int hstart = ph * stride_h - pad_h;
19 |     int wstart = pw * stride_w - pad_w;
20 |     int hend = min(hstart + kernel_h, height + pad_h);
21 |     int wend = min(wstart + kernel_w, width + pad_w);
22 |     const int pool_size = (hend - hstart) * (wend - wstart);
23 |     hstart = max(hstart, 0);
24 |     wstart = max(wstart, 0);
25 |     hend = min(hend, height);
26 |     wend = min(wend, width);
27 |     Acctype aveval = Acctype(0);
28 |     const Dtype* const bottom_slice = bottom_data + (n * channels + c) * height * width;
29 |     for (int h = hstart; h < hend; ++h) {
30 |       for (int w = wstart; w < wend; ++w) {
31 |         aveval += bottom_slice[h * width + w];
32 |       }
33 |     }
34 |     if(COUNT_INCLUDE_PAD)
35 |       top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / pool_size);
36 |     else
37 |       top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / ((hend - hstart) * (wend - wstart)));
38 |   }
39 | }
40 | 
41 | template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
42 | __global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
43 |     const int num, const int channels, const int height,
44 |     const int width, const int pooled_height, const int pooled_width,
45 |     const int kernel_h, const int kernel_w, const int stride_h,
46 |     const int stride_w, const int pad_h, const int pad_w,
47 |     Dtype* const bottom_diff) {
48 |   CUDA_KERNEL_LOOP(index, nthreads) {
49 |     // find out the local index
50 |     // find out the local offset
51 |     const int w = index % width + pad_w;
52 |     const int h = (index / width) % height + pad_h;
53 |     const int c = (index / width / height) % channels;
54 |     const int n = index / width / height / channels;
55 |     const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
56 |     const int phend = min(h / stride_h + 1, pooled_height);
57 |     const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
58 |     const int pwend = min(w / stride_w + 1, pooled_width);
59 |     Acctype gradient = Acctype(0);
60 |     const Dtype* const top_diff_slice =
61 |         top_diff + (n * channels + c) * pooled_height * pooled_width;
62 |     for (int ph = phstart; ph < phend; ++ph) {
63 |       for (int pw = pwstart; pw < pwend; ++pw) {
64 |         // figure out the pooling size
65 |         int hstart = ph * stride_h - pad_h;
66 |         int wstart = pw * stride_w - pad_w;
67 |         int hend = min(hstart + kernel_h, height + pad_h);
68 |         int wend = min(wstart + kernel_w, width + pad_w);
69 |         int pool_size = (hend - hstart) * (wend - wstart);
70 |         hstart = max(hstart, 0);
71 |         wstart = max(wstart, 0);
72 |         hend = min(hend, height);
73 |         wend = min(wend, width);
74 |         if(COUNT_INCLUDE_PAD)
75 |           gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
76 |         else
77 |           gradient += top_diff_slice[ph * pooled_width + pw] / ((hend - hstart) * (wend - wstart));
78 |       }
79 |     }
80 |     bottom_diff[index] = ScalarConvert<Acctype, Dtype>::to(gradient);
81 |   }
82 | }
83 | 
84 | #include "generic/SpatialAveragePooling.cu"
85 | #include "THCGenerateFloatTypes.h"
86 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/BCECriterion.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/BCECriterion.cu"
  3 | #else
  4 | 
  5 | void THNN_(BCECriterion_updateOutput)(
  6 |            THCState *state,
  7 |            THCTensor *input,
  8 |            THCTensor *target,
  9 |            THCTensor *output,
 10 |            bool sizeAverage,
 11 |            THCTensor *weights)
 12 | {
 13 |   THCUNN_check_nElement(state, input, target);
 14 |   THCUNN_check_nElement(state, input, weights);
 15 |   THCUNN_check_dim_size(state, output, 1, 0, 1);
 16 |   THCUNN_assertSameGPU(state, 3, input, target, weights);
 17 | 
 18 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
 19 | 
 20 |   input = THCTensor_(newContiguous)(state, input);
 21 |   target = THCTensor_(newContiguous)(state, target);
 22 | 
 23 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
 24 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
 25 | 
 26 |   accreal sum;
 27 |   if (weights) {
 28 |     weights = THCTensor_(newContiguous)(state, weights);
 29 |     thrust::device_ptr<real> weights_data(THCTensor_(data)(state, weights));
 30 |     sum = thrust::transform_reduce(
 31 |       thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
 32 |       thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
 33 |       bce_functor_weights<real, accreal>(),
 34 |       (accreal) 0,
 35 |       thrust::plus<accreal>()
 36 |     );
 37 |     THCTensor_(free)(state, weights);
 38 |   } else {
 39 |     sum = thrust::transform_reduce(
 40 |       thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
 41 |       thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
 42 |       bce_functor<real, accreal>(),
 43 |       (accreal) 0,
 44 |       thrust::plus<accreal>()
 45 |     );
 46 |   }
 47 | 
 48 |   if (sizeAverage)
 49 |     sum /= size;
 50 | 
 51 |   THCTensor_(free)(state, input);
 52 |   THCTensor_(free)(state, target);
 53 | 
 54 |   THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(sum));
 55 | }
 56 | 
 57 | void THNN_(BCECriterion_updateGradInput)(
 58 |            THCState *state,
 59 |            THCTensor *input,
 60 |            THCTensor *target,
 61 |            THCTensor *gradInput,
 62 |            bool sizeAverage,
 63 |            THCTensor *weights)
 64 | {
 65 |   THCUNN_check_nElement(state, input, target);
 66 |   THCUNN_check_nElement(state, input, weights);
 67 |   THCUNN_assertSameGPU(state, 4, input, target, gradInput, weights);
 68 | 
 69 |   ptrdiff_t size = THCTensor_(nElement)(state, input);
 70 |   real norm = ScalarConvert<accreal, real>::to(sizeAverage ? accreal(1)/size : accreal(1));
 71 | 
 72 |   input = THCTensor_(newContiguous)(state, input);
 73 |   target = THCTensor_(newContiguous)(state, target);
 74 | 
 75 |   THCTensor_(resizeAs)(state, gradInput, input);
 76 | 
 77 |   thrust::device_ptr<real> input_data(THCTensor_(data)(state, input));
 78 |   thrust::device_ptr<real> target_data(THCTensor_(data)(state, target));
 79 |   thrust::device_ptr<real> gradInput_data(THCTensor_(data)(state, gradInput));
 80 | 
 81 |   if (weights) {
 82 |     weights = THCTensor_(newContiguous)(state, weights);
 83 |     thrust::device_ptr<real> weights_data(THCTensor_(data)(state, weights));
 84 |     thrust::transform(
 85 |       thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data, weights_data)),
 86 |       thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size, weights_data+size)),
 87 |       gradInput_data,
 88 |       bce_updateGradInput_functor_weights<real, accreal>(norm)
 89 |     );
 90 |     THCTensor_(free)(state, weights);
 91 |   } else {
 92 |     thrust::transform(
 93 |       thrust::make_zip_iterator(thrust::make_tuple(input_data, target_data)),
 94 |       thrust::make_zip_iterator(thrust::make_tuple(input_data+size, target_data+size)),
 95 |       gradInput_data,
 96 |       bce_updateGradInput_functor<real, accreal>(norm)
 97 |     );
 98 |   }
 99 | 
100 |   THCTensor_(free)(state, input);
101 |   THCTensor_(free)(state, target);
102 | }
103 | 
104 | #endif
105 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialCrossMapLRN.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/SpatialCrossMapLRN.cu"
  3 | #else
  4 | 
  5 | void LRNforward(THCState* state, THCTensor* input, THCTensor* output,
  6 |     THCTensor* scale, int local_size, accreal alpha_, accreal beta_, accreal k_)
  7 | {
  8 |   real alpha = ScalarConvert<accreal, real>::to(alpha_);
  9 |   real beta = ScalarConvert<accreal, real>::to(beta_);
 10 |   real k = ScalarConvert<accreal, real>::to(k_);
 11 | 
 12 |   THCTensor_(resizeAs)(state, output, input);
 13 |   THCTensor_(resizeAs)(state, scale, input);
 14 | 
 15 |   int batchSize;
 16 |   int nInputPlane;
 17 |   int imsize_h;
 18 |   int imsize_w;
 19 | 
 20 |   if (input->nDimension == 3) {
 21 |     batchSize = 1;
 22 |     nInputPlane = input->size[0];
 23 |     imsize_h = input->size[1];
 24 |     imsize_w = input->size[2];
 25 |   }
 26 |   else
 27 |   {
 28 |     batchSize = input->size[0];
 29 |     nInputPlane = input->size[1];
 30 |     imsize_h = input->size[2];
 31 |     imsize_w = input->size[3];
 32 |   }
 33 | 
 34 |   input = THCTensor_(newContiguous)(state, input);
 35 | 
 36 |   int n_threads = batchSize * imsize_h * imsize_w;
 37 |   LRNFillScale<real, accreal> <<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
 38 |       n_threads, THCTensor_(data)(state, input), batchSize, nInputPlane, imsize_h, imsize_w, local_size,
 39 |       alpha / local_size, k, THCTensor_(data)(state, scale));
 40 |   n_threads *= nInputPlane;
 41 |   THCudaCheck(cudaGetLastError());
 42 |   LRNComputeOutput<<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
 43 |     n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, scale), -beta, THCTensor_(data)(state, output));
 44 |   THCudaCheck(cudaGetLastError());
 45 | 
 46 |   THCTensor_(free)(state, input);
 47 | }
 48 | 
 49 | 
 50 | void LRNbackward(THCState* state, THCTensor* input, THCTensor* output,
 51 |     THCTensor* gradOutput, THCTensor* gradInput, THCTensor* scale,
 52 |     int local_size, accreal alpha_, accreal beta_, accreal k_)
 53 | {
 54 |   real alpha = ScalarConvert<accreal, real>::to(alpha_);
 55 |   real beta = ScalarConvert<accreal, real>::to(beta_);
 56 |   real k = ScalarConvert<accreal, real>::to(k_);
 57 | 
 58 |   THCTensor_(resizeAs)(state, gradInput, input);
 59 | 
 60 |   int batchSize;
 61 |   int nInputPlane;
 62 |   int imsize_h;
 63 |   int imsize_w;
 64 | 
 65 |   if (input->nDimension == 3) {
 66 |     batchSize = 1;
 67 |     nInputPlane = input->size[0];
 68 |     imsize_h = input->size[1];
 69 |     imsize_w = input->size[2];
 70 |   }
 71 |   else
 72 |   {
 73 |     batchSize = input->size[0];
 74 |     nInputPlane = input->size[1];
 75 |     imsize_h = input->size[2];
 76 |     imsize_w = input->size[3];
 77 |   }
 78 | 
 79 |   input = THCTensor_(newContiguous)(state, input);
 80 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
 81 | 
 82 |   int n_threads = batchSize * imsize_h * imsize_w;
 83 |   LRNComputeDiff<real, accreal> <<<GET_BLOCKS(n_threads), CUDA_NUM_THREADS, 0, THCState_getCurrentStream(state)>>>(
 84 |       n_threads, THCTensor_(data)(state, input), THCTensor_(data)(state, output),
 85 |       THCTensor_(data)(state, scale), THCTensor_(data)(state, gradOutput), batchSize, nInputPlane, imsize_h, imsize_w,
 86 |       local_size, -beta, ScalarConvert<int, real>::to(2) * alpha * beta / local_size,
 87 |       THCTensor_(data)(state, gradInput));
 88 |   THCudaCheck(cudaGetLastError());
 89 | 
 90 |   THCTensor_(free)(state, input);
 91 |   THCTensor_(free)(state, gradOutput);
 92 | }
 93 | 
 94 | void THNN_(SpatialCrossMapLRN_updateOutput)(
 95 |     THCState *state,
 96 |     THCTensor *input,
 97 |     THCTensor *output,
 98 |     THCTensor *scale,
 99 |     int size,
100 |     accreal alpha,
101 |     accreal beta,
102 |     accreal k)
103 | {
104 |   LRNforward(state, input, output, scale, size, alpha, beta, k);
105 | }
106 | 
107 | void THNN_(SpatialCrossMapLRN_updateGradInput)(
108 |     THCState *state,
109 |     THCTensor *input,
110 |     THCTensor *gradOutput,
111 |     THCTensor *gradInput,
112 |     THCTensor *scale,
113 |     THCTensor *output,
114 |     int size,
115 |     accreal alpha,
116 |     accreal beta,
117 |     accreal k)
118 | {
119 |   LRNbackward(state, input, output, gradOutput, gradInput, scale, size, alpha, beta, k);
120 | }
121 | 
122 | #endif
123 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/BatchNormalization.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/BatchNormalization.cu"
  3 | #else
  4 | 
  5 | #define DeviceTensor3 THCDeviceTensor<real, 3>
  6 | #define DeviceTensor1 THCDeviceTensor<real, 1>
  7 | 
  8 | template <int Dim>
  9 | static THCDeviceTensor<real, Dim> devicetensor(THCState *state, THCTensor *t) {
 10 |   if (!t) {
 11 |     return THCDeviceTensor<real, Dim>();
 12 |   }
 13 | 
 14 |   int inDim = THCTensor_(nDimension)(state, t);
 15 |   if (inDim == Dim) {
 16 |     return toDeviceTensor<real, Dim>(state, t);
 17 |   }
 18 | 
 19 |   // View in which the last dimensions are collapsed or expanded as needed
 20 |   THAssert(THCTensor_(isContiguous)(state, t));
 21 |   int size[Dim];
 22 |   for (int i = 0; i < Dim || i < inDim; ++i) {
 23 |     if (i < Dim && i < inDim) {
 24 |       size[i] = t->size[i];
 25 |     } else if (i < Dim) {
 26 |       size[i] = 1;
 27 |     } else {
 28 |       size[Dim - 1] *= t->size[i];
 29 |     }
 30 |   }
 31 |   return THCDeviceTensor<real, Dim>(THCTensor_(data)(state, t), size);
 32 | }
 33 | 
 34 | void THNN_(BatchNormalization_updateOutput)(
 35 |   THCState *state, THCTensor *input_, THCTensor *output_,
 36 |   THCTensor *weight_, THCTensor *bias_, THCTensor *runningMean_,
 37 |   THCTensor *runningVar_, THCTensor *saveMean_, THCTensor *saveStd_,
 38 |   bool train, double momentum, double eps) {
 39 | 
 40 |   THCTensor_(resizeAs)(state, output_, input_);
 41 |   DeviceTensor3 input = devicetensor<3>(state, input_);
 42 |   DeviceTensor3 output = devicetensor<3>(state, output_);
 43 |   DeviceTensor1 weight = devicetensor<1>(state, weight_);
 44 |   DeviceTensor1 bias = devicetensor<1>(state, bias_);
 45 |   DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
 46 |   DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
 47 |   DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
 48 |   DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
 49 | 
 50 |   cudaStream_t s = THCState_getCurrentStream(state);
 51 |   cudaDeviceProp *prop = THCState_getCurrentDeviceProperties(state);
 52 | 
 53 |   if (!train) {
 54 |     dim3 blocks(input.getSize(1));
 55 |     dim3 threads(getNumThreads(input.getSize(2)));
 56 |     BatchNormalizationUpdateOutputInference_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
 57 |       input, output, runningMean, runningVar, weight, bias, eps);
 58 |   } else {
 59 |     dim3 blocks(input.getSize(1));
 60 |     dim3 threads(getNumThreads(input.getSize(2)));
 61 |     BatchNormalizationUpdateOutput_kernel<real, accreal, DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
 62 |       input, output, weight, bias, eps, momentum, runningMean, runningVar,
 63 |       saveMean, saveStd);
 64 |   }
 65 |   THCudaCheck(cudaGetLastError());
 66 | }
 67 | 
 68 | void THNN_(BatchNormalization_backward)(
 69 |   THCState *state, THCTensor *input_, THCTensor *gradOutput_,
 70 |   THCTensor *gradInput_, THCTensor *gradWeight_, THCTensor *gradBias_,
 71 |   THCTensor *weight_, THCTensor *runningMean_, THCTensor *runningVar_,
 72 |   THCTensor *saveMean_, THCTensor *saveStd_, bool train, double scale, double eps) {
 73 | 
 74 |   THCUNN_check_shape(state, input_, gradOutput_);
 75 |   DeviceTensor3 input = devicetensor<3>(state, input_);
 76 |   DeviceTensor3 gradOutput = devicetensor<3>(state, gradOutput_);
 77 |   DeviceTensor3 gradInput = devicetensor<3>(state, gradInput_);
 78 |   DeviceTensor1 gradWeight = devicetensor<1>(state, gradWeight_);
 79 |   DeviceTensor1 gradBias = devicetensor<1>(state, gradBias_);
 80 |   DeviceTensor1 weight = devicetensor<1>(state, weight_);
 81 |   DeviceTensor1 runningMean = devicetensor<1>(state, runningMean_);
 82 |   DeviceTensor1 runningVar = devicetensor<1>(state, runningVar_);
 83 |   DeviceTensor1 saveMean = devicetensor<1>(state, saveMean_);
 84 |   DeviceTensor1 saveStd = devicetensor<1>(state, saveStd_);
 85 | 
 86 |   cudaStream_t s = THCState_getCurrentStream(state);
 87 | 
 88 |   dim3 blocks(gradOutput.getSize(1));
 89 |   dim3 threads(getNumThreads(gradOutput.getSize(2)));
 90 |   BatchNormalizationBackward_kernel<real,  accreal,  DeviceTensor1, DeviceTensor3> <<<blocks, threads, 0, s>>>(
 91 |     input, gradOutput, gradInput, gradWeight, gradBias, weight, runningMean, runningVar,
 92 |     saveMean, saveStd, train, scale, eps);
 93 |   THCudaCheck(cudaGetLastError());
 94 | }
 95 | 
 96 | #undef DeviceTensor3
 97 | #undef DeviceTensor1
 98 | 
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialFractionalMaxPooling.cu:
--------------------------------------------------------------------------------
  1 | #include "THCUNN.h"
  2 | #include "common.h"
  3 | #include "THCDeviceTensor.cuh"
  4 | #include "THCDeviceTensorUtils.cuh"
  5 | #include "THCDeviceUtils.cuh"
  6 | #include "THCHalf.h"
  7 | #include "THCHalfAutoNumerics.cuh"
  8 | #include "THCAtomics.cuh"
  9 | 
 10 | #include <cfloat>
 11 | 
 12 | template <typename Dtype, typename Acctype>
 13 | __device__ inline int getInterval(Acctype sample,
 14 |                                   int index,
 15 |                                   int inputSize,
 16 |                                   int outputSize,
 17 |                                   int poolSize) {
 18 |   Acctype alpha = (Acctype)(inputSize - poolSize) / (Acctype) (outputSize - 1);
 19 |   if (index == outputSize - 1) {
 20 |     return inputSize - poolSize;
 21 |   } else {
 22 |     return (int) ((index + sample) * alpha) - (int) (sample * alpha);
 23 |   }
 24 | }
 25 | 
 26 | // We template on poolSizeW to allow the innermost loop to be unrolled
 27 | template <int PoolSizeWStatic, typename Dtype, typename Acctype>
 28 | __global__ void SpatialFractionalMaxPooling_updateOutput(
 29 |   THCDeviceTensor<Dtype, 4> input,
 30 |   THCDeviceTensor<Dtype, 4> output,
 31 |   THCDeviceTensor<THCIndex_t, 4> indices,
 32 |   THCDeviceTensor<Dtype, 3> samples,
 33 |   int poolSizeW, int poolSizeH) {
 34 | 
 35 |   // Output (h, w) point that this thread is responsible for
 36 |   int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
 37 |   int plane = blockIdx.y;
 38 |   int batch = blockIdx.z;
 39 | 
 40 |   // Each thread generates a specific output point
 41 |   if (ourOutputPoint < output.getSize(2) * output.getSize(3)) {
 42 |     int outputW = ourOutputPoint % output.getSize(3);
 43 |     int outputH = ourOutputPoint / output.getSize(3);
 44 | 
 45 |     int poolW = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][0]), outputW,
 46 |                             input.getSize(3), output.getSize(3), poolSizeW);
 47 |     int poolH = getInterval<Dtype, Acctype>(ScalarConvert<Dtype, Acctype>::to(samples[batch][plane][1]), outputH,
 48 |                             input.getSize(2), output.getSize(2), poolSizeH);
 49 | 
 50 |     Dtype maxVal = THCNumerics<Dtype>::min();
 51 |     int maxIndex = -1;
 52 | 
 53 |     for (int h = poolH; h < poolH + poolSizeH; ++h) {
 54 |       if (PoolSizeWStatic == -1) {
 55 |         for (int w = poolW; w < poolW + poolSizeW; ++w) {
 56 |           Dtype val = input[batch][plane][h][w];
 57 |           // for consistency with THNN, favor the first max
 58 |           if (val > maxVal) {
 59 |             maxIndex = h * input.getSize(3) + w;
 60 |             maxVal = val;
 61 |           }
 62 |         }
 63 |       } else {
 64 | #pragma unroll
 65 |         for (int i = 0; i < PoolSizeWStatic; ++i) {
 66 |           int w = i + poolW;
 67 |           Dtype val = input[batch][plane][h][w];
 68 |           // for consistency with THNN, favor the first max
 69 |           if (val > maxVal) {
 70 |             maxIndex = h * input.getSize(3) + w;
 71 |             maxVal = val;
 72 |           }
 73 |         }
 74 |       }
 75 |     }
 76 | 
 77 |     assert(THCNumerics<Dtype>::ne(maxVal, THCNumerics<Dtype>::min()));
 78 |     assert(maxIndex != -1);
 79 | 
 80 |     // +1 for Lua index
 81 |     indices[batch][plane][outputH][outputW] = maxIndex + TH_INDEX_BASE;
 82 |     output[batch][plane][outputH][outputW] = maxVal;
 83 |   }
 84 | }
 85 | 
 86 | template <typename Dtype>
 87 | __global__ void SpatialFractionalMaxPooling_updateGradInput(
 88 |   THCDeviceTensor<Dtype, 4> gradInput,
 89 |   THCDeviceTensor<Dtype, 4> gradOutput,
 90 |   THCDeviceTensor<THCIndex_t, 4> indices) {
 91 |   // Output (h, w) point that this thread is responsible for
 92 |   int ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x;
 93 |   int plane = blockIdx.y;
 94 |   int batch = blockIdx.z;
 95 | 
 96 |   // Each thread generates a specific output point
 97 |   if (ourOutputPoint < gradOutput.getSize(2) * gradOutput.getSize(3)) {
 98 |     int outputW = ourOutputPoint % gradOutput.getSize(3);
 99 |     int outputH = ourOutputPoint / gradOutput.getSize(3);
100 | 
101 |     int index = indices[batch][plane][outputH][outputW] - TH_INDEX_BASE;
102 |     assert(index >= 0);
103 |     int inputW = index % gradInput.getSize(3);
104 |     int inputH = index / gradInput.getSize(3);
105 |     assert(inputH < gradInput.getSize(2));
106 | 
107 |     atomicAdd(gradInput[batch][plane][inputH][inputW].data(),
108 |               gradOutput[batch][plane][outputH][outputW]);
109 |   }
110 | }
111 | 
112 | #include "generic/SpatialFractionalMaxPooling.cu"
113 | #include "THCGenerateFloatTypes.h"
114 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SoftMax.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/SoftMax.cu"
  3 | #else
  4 | 
  5 | #include "../common.h"
  6 | 
  7 | void THNN_(SoftMax_updateOutput)(
  8 |            THCState *state,
  9 |            THCTensor *input,
 10 |            THCTensor *output)
 11 | {
 12 |   THCUNN_assertSameGPU(state, 2, input, output);
 13 | 
 14 |   input = THCTensor_(newContiguous)(state, input);
 15 |   THCTensor_(resizeAs)(state, output, input);
 16 |   long batchSize, dim, stride0, stride1 = 1;
 17 |   long blocksY = 1, blocksZ = 1;
 18 | 
 19 |   if (input->nDimension == 1)
 20 |   {
 21 |     batchSize = 1;
 22 |     dim = input->size[0];
 23 |     stride0 = 1;
 24 |   }
 25 |   else if (input->nDimension == 2)
 26 |   {
 27 |     batchSize = input->size[0];
 28 |     dim = input->size[1];
 29 |     stride0 = 1;
 30 |   }
 31 |   else if (input->nDimension == 3)
 32 |   {
 33 |     batchSize = 1;
 34 |     dim = input->size[0];
 35 |     blocksY = input->size[1];
 36 |     blocksZ = input->size[2];
 37 |     stride0 = blocksY * blocksZ;
 38 |     stride1 = blocksZ;
 39 |   }
 40 |   else if (input->nDimension == 4)
 41 |   {
 42 |     batchSize = input->size[0];
 43 |     dim = input->size[1];
 44 |     blocksY = input->size[2];
 45 |     blocksZ = input->size[3];
 46 |     stride0 = blocksY * blocksZ;
 47 |     stride1 = blocksZ;
 48 |   }
 49 |   else
 50 |   {
 51 |     THError("1D, 2D, 3D or 4D tensor expected");
 52 |   }
 53 | 
 54 |   // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
 55 |   if (blocksY * blocksZ < 65536)
 56 |   {
 57 |     blocksY *= blocksZ;
 58 |     blocksZ = 1;
 59 |     if (input->nDimension == 3 || input->nDimension == 4) {
 60 |       stride0 = blocksY * blocksZ;
 61 |       stride1 = blocksZ;
 62 |     }
 63 |   }
 64 | 
 65 |   dim3 blocks(batchSize, blocksY, blocksZ);
 66 |   dim3 threads(SOFTMAX_THREADS);
 67 |   cunn_SoftMax_updateOutput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
 68 |     THCTensor_(data)(state, output),
 69 |     THCTensor_(data)(state, input),
 70 |     batchSize, dim, stride0, stride1
 71 |   );
 72 |   THCudaCheck(cudaGetLastError());
 73 | 
 74 |   THCTensor_(free)(state, input);
 75 | }
 76 | 
 77 | void THNN_(SoftMax_updateGradInput)(
 78 |            THCState *state,
 79 |            THCTensor *input,
 80 |            THCTensor *gradOutput,
 81 |            THCTensor *gradInput,
 82 |            THCTensor *output)
 83 | {
 84 |   THCUNN_check_nElement(state, input, gradOutput);
 85 |   THCUNN_assertSameGPU(state, 3, output, gradOutput, gradInput);
 86 | 
 87 |   output = THCTensor_(newContiguous)(state, output);
 88 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
 89 | 
 90 |   THCTensor_(resizeAs)(state, gradInput, output);
 91 |   long batchSize, dim, stride0, stride1 = 1;
 92 |   long blocksY = 1, blocksZ = 1;
 93 | 
 94 |   if (gradInput->nDimension == 1)
 95 |   {
 96 |     batchSize = 1;
 97 |     dim = gradInput->size[0];
 98 |     stride0 = 1;
 99 |   }
100 |   else if (gradInput->nDimension == 2)
101 |   {
102 |     batchSize = gradInput->size[0];
103 |     dim = gradInput->size[1];
104 |     stride0 = 1;
105 |   }
106 |   else if (gradInput->nDimension == 3)
107 |   {
108 |     batchSize = 1;
109 |     dim = gradInput->size[0];
110 |     blocksY = gradInput->size[1];
111 |     blocksZ = gradInput->size[2];
112 |     stride0 = blocksY * blocksZ;
113 |     stride1 = blocksZ;
114 |   }
115 |   else if (gradInput->nDimension == 4)
116 |   {
117 |     batchSize = gradInput->size[0];
118 |     dim = gradInput->size[1];
119 |     blocksY = gradInput->size[2];
120 |     blocksZ = gradInput->size[3];
121 |     stride0 = blocksY * blocksZ;
122 |     stride1 = blocksZ;
123 |   }
124 |   else
125 |   {
126 |     THError("1D, 2D, 3D or 4D tensor expected");
127 |   }
128 | 
129 |   // when possible use only 2d grid of thread blocks to stay compatible with compute capability 2.X devices.
130 |   if (blocksY * blocksZ < 65536)
131 |   {
132 |     blocksY *= blocksZ;
133 |     blocksZ = 1;
134 |     if (input->nDimension == 3 || input->nDimension == 4) {
135 |       stride0 = blocksY * blocksZ;
136 |       stride1 = blocksZ;
137 |     }
138 |   }
139 | 
140 |   dim3 blocks(batchSize, blocksY, blocksZ);
141 |   dim3 threads(SOFTMAX_THREADS);
142 |   cunn_SoftMax_updateGradInput_kernel<real, accreal><<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
143 |     THCTensor_(data)(state, gradInput),
144 |     THCTensor_(data)(state, output),
145 |     THCTensor_(data)(state, gradOutput),
146 |     batchSize, dim, stride0, stride1
147 |   );
148 |   THCudaCheck(cudaGetLastError());
149 | 
150 |   THCTensor_(free)(state, gradOutput);
151 |   THCTensor_(free)(state, output);
152 | }
153 | 
154 | #endif
155 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialUpSamplingBilinear.cu:
--------------------------------------------------------------------------------
  1 | // Adapted from interp.cpp from Caffe util by Pauline Luc
  2 | // Originally developed by George Papandreou
  3 | #include "THCUNN.h"
  4 | #include "common.h"
  5 | #include "THCDeviceTensor.cuh"
  6 | #include "THCDeviceTensorUtils.cuh"
  7 | #include "THCDeviceUtils.cuh"
  8 | #include "THCHalf.h"
  9 | #include "THCHalfAutoNumerics.cuh"
 10 | #include "THCAtomics.cuh"
 11 | 
 12 | template<typename Dtype, typename Acctype>
 13 | __global__ void caffe_gpu_interp2_kernel(const int n,
 14 |     const Acctype rheight, const Acctype rwidth,
 15 |     const THCDeviceTensor<Dtype, 4> data1, THCDeviceTensor<Dtype, 4> data2) {
 16 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
 17 |   const int batchsize = data1.getSize(0);
 18 |   const int channels = data1.getSize(1);
 19 |   const int height1 = data1.getSize(2);
 20 |   const int width1 = data1.getSize(3);
 21 |   const int height2 = data2.getSize(2);
 22 |   const int width2 = data2.getSize(3);
 23 | 
 24 |   if (index < n) {
 25 |     const int w2 = index % width2; // 0:width2-1
 26 |     const int h2 = index / width2; // 0:height2-1
 27 |     // special case: just copy
 28 |     if (height1 == height2 && width1 == width2) {
 29 |       const int h1 = h2;
 30 |       const int w1 = w2;
 31 |       for (int n = 0; n < batchsize ; n++){
 32 |         for (int c = 0; c < channels; ++c) {
 33 |           const Dtype val = data1[n][c][h1][w1];
 34 |           data2[n][c][h2][w2] = val;
 35 |         }
 36 |       }
 37 |       return;
 38 |     }
 39 |     //
 40 |     const Acctype h1r = rheight * h2;
 41 |     const int h1 = h1r;
 42 |     const int h1p = (h1 < height1 - 1) ? 1 : 0;
 43 |     const Acctype h1lambda = h1r - h1;
 44 |     const Acctype h0lambda = Acctype(1) - h1lambda;
 45 |     //
 46 |     const Acctype w1r = rwidth * w2;
 47 |     const int w1 = w1r;
 48 |     const int w1p = (w1 < width1 - 1) ? 1 : 0;
 49 |     const Acctype w1lambda = w1r - w1;
 50 |     const Acctype w0lambda = Acctype(1) - w1lambda;
 51 |     //
 52 |     for (int n = 0; n < batchsize ; n++){
 53 |         for (int c = 0; c < channels; ++c) {
 54 |         const Acctype val = h0lambda * (w0lambda * data1[n][c][h1][w1]
 55 |                             + w1lambda * data1[n][c][h1][w1+w1p])
 56 |                             + h1lambda * (w0lambda * data1[n][c][h1+h1p][w1]
 57 |                             + w1lambda * data1[n][c][h1+h1p][w1+w1p]);
 58 |         data2[n][c][h2][w2] = ScalarConvert<Acctype, Dtype>::to(val);
 59 |       }
 60 |     }
 61 |   }
 62 | }
 63 | 
 64 | // Backward (adjoint) operation 1 <- 2 (accumulates)
 65 | template <typename Dtype, typename Acctype>
 66 | __global__ void caffe_gpu_interp2_kernel_backward(const int n,
 67 |     const Acctype rheight, const Acctype rwidth,
 68 |     THCDeviceTensor<Dtype, 4> data1, const THCDeviceTensor<Dtype, 4> data2){
 69 |   int index = threadIdx.x + blockIdx.x * blockDim.x;
 70 |   const int batchsize = data1.getSize(0);
 71 |   const int channels = data1.getSize(1);
 72 |   const int height1 = data1.getSize(2);
 73 |   const int width1 = data1.getSize(3);
 74 |   const int height2 = data2.getSize(2);
 75 |   const int width2 = data2.getSize(3);
 76 |   if (index < n) {
 77 |     const int w2 = index % width2; // 0:width2-1
 78 |     const int h2 = index / width2; // 0:height2-1
 79 |     // special case: just copy
 80 |     if (height1 == height2 && width1 == width2) {
 81 |       const int h1 = h2;
 82 |       const int w1 = w2;
 83 |       for (int n = 0; n < batchsize ; n++){
 84 |         for (int c = 0; c < channels; ++c) {
 85 |           const Dtype val = data2[n][c][h1][w1];
 86 |           data1[n][c][h2][w2] += val;
 87 |         }
 88 |       }
 89 |       return;
 90 |     }
 91 |     //
 92 |     const Acctype h1r = rheight * h2;
 93 |     const int h1 = h1r;
 94 |     const int h1p = (h1 < height1 - 1) ? 1 : 0;
 95 |     const Acctype h1lambda = h1r - h1;
 96 |     const Acctype h0lambda = Acctype(1) - h1lambda;
 97 |     //
 98 |     const Acctype w1r = rwidth * w2;
 99 |     const int w1 = w1r;
100 |     const int w1p = (w1 < width1 - 1) ? 1 : 0;
101 |     const Acctype w1lambda = w1r - w1;
102 |     const Acctype w0lambda = Acctype(1) - w1lambda;
103 |     //
104 |     for (int n = 0; n < batchsize ; n++){
105 |       for (int c = 0; c < channels; ++c) {
106 |         const Dtype d2val = data2[n][c][h2][w2];
107 |         atomicAdd(data1[n][c][h1][w1].data(),
108 |                   ScalarConvert<Acctype, Dtype>::to(h0lambda * w0lambda * d2val));
109 |         atomicAdd(data1[n][c][h1][w1+w1p].data(),
110 |                   ScalarConvert<Acctype, Dtype>::to(h0lambda * w1lambda * d2val));
111 |         atomicAdd(data1[n][c][h1+h1p][w1].data(),
112 |                   ScalarConvert<Acctype, Dtype>::to(h1lambda * w0lambda * d2val));
113 |         atomicAdd(data1[n][c][h1+h1p][w1+w1p].data(),
114 |                   ScalarConvert<Acctype, Dtype>::to(h1lambda * w1lambda * d2val));
115 |       }
116 |     }
117 |   }
118 | }
119 | 
120 | 
121 | #include "generic/SpatialUpSamplingBilinear.cu"
122 | #include "THCGenerateFloatTypes.h"
123 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialReplicationPadding.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/SpatialReplicationPadding.cu"
  3 | #else
  4 | 
  5 | void THNN_(SpatialReplicationPadding_updateOutput)(
  6 |            THCState *state,
  7 |            THCTensor *input,
  8 |            THCTensor *output,
  9 |            int padL, int padR,
 10 |            int padT, int padB) {
 11 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
 12 |              "input tensor must fit into 32-bit index math");
 13 | 
 14 |   int planeDim = 0;
 15 |   int dimh = 1;
 16 |   int dimw = 2;
 17 |   int numBatch = 1;
 18 | 
 19 |   int numInputDims = THCTensor_(nDimension)(state, input);
 20 |   THCUNN_argCheck(state, numInputDims == 3 || numInputDims == 4, 2, input,
 21 |                   "3D or 4D (batch mode) tensor expected for input, but got: %s")
 22 | 
 23 |   if (numInputDims == 4) {
 24 |     numBatch = THCTensor_(size)(state, input, 0);
 25 |     planeDim++;
 26 |     dimh++;
 27 |     dimw++;
 28 |   }
 29 | 
 30 |   int numPlanes = THCTensor_(size)(state, input, planeDim);
 31 |   int inputH = THCTensor_(size)(state, input, dimh);
 32 |   int inputW = THCTensor_(size)(state, input, dimw);
 33 |   int outputH = inputH + padT + padB;
 34 |   int outputW  = inputW + padL + padR;
 35 | 
 36 |   THArgCheck(outputW >= 1 || outputH >= 1 , 2,
 37 |              "input (H: %d, W: %d)is too small."
 38 |              " Calculated output H: %d W: %d",
 39 |              inputH, inputW, outputH, outputW);
 40 | 
 41 |   THCDeviceTensor<real, 4> devInput;
 42 |   THCDeviceTensor<real, 4> devOutput;
 43 | 
 44 |   if (numInputDims == 3) {
 45 |     THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
 46 | 
 47 |     devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
 48 |     devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
 49 |   } else {
 50 |     THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
 51 | 
 52 |     devInput = toDeviceTensor<real, 4>(state, input);
 53 |     devOutput = toDeviceTensor<real, 4>(state, output);
 54 |   }
 55 | 
 56 |   int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
 57 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
 58 |             devOutput.getSize(1),
 59 |             devOutput.getSize(0));
 60 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
 61 | 
 62 |   SpatialReplicationPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
 63 |     devInput, devOutput, padT, padB, padL, padR);
 64 | 
 65 | }
 66 | 
 67 | void THNN_(SpatialReplicationPadding_updateGradInput)(
 68 |            THCState *state,
 69 |            THCTensor *input,
 70 |            THCTensor *gradOutput,
 71 |            THCTensor *gradInput,
 72 |            int padL, int padR,
 73 |            int padT, int padB) {
 74 | 
 75 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
 76 |                 "input tensor must fit into 32-bit index math");
 77 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
 78 |                 "output gradient tensor must fit into 32-bit index math");
 79 | 
 80 |   int planeDim = 0;
 81 |   int dimh = 1;
 82 |   int dimw = 2;
 83 | 
 84 |   int numInputDims = THCTensor_(nDimension)(state, input);
 85 |   if (numInputDims == 4) {
 86 |     planeDim++;
 87 |     dimh++;
 88 |     dimw++;
 89 |   }
 90 |   int iheight = input->size[dimh];
 91 |   int iwidth = input->size[dimw];
 92 |   int oheight = iheight + padT + padB;
 93 |   int owidth  = iwidth + padL + padR;
 94 | 
 95 |   THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3,
 96 |              "gradOutput width unexpected. Expected: %d, Got: %d",
 97 |              owidth, THCTensor_(size)(state, gradOutput, dimw));
 98 |   THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3,
 99 |              "gradOutput height unexpected. Expected: %d, Got: %d",
100 |              oheight, THCTensor_(size)(state, gradOutput, dimh));
101 | 
102 |   THCTensor_(resizeAs)(state, gradInput, input);
103 |   THCTensor_(zero)(state, gradInput);
104 | 
105 |   THCDeviceTensor<real, 4> devGradInput;
106 |   THCDeviceTensor<real, 4> devGradOutput;
107 | 
108 |   if (numInputDims == 3) {
109 |     devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
110 |     devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
111 |   } else {
112 |     devGradInput = toDeviceTensor<real, 4>(state, gradInput);
113 |     devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
114 |   }
115 | 
116 |   int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
117 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
118 |             devGradOutput.getSize(1),
119 |             devGradOutput.getSize(0));
120 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
121 | 
122 |   SpatialReplicationPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
123 |     devGradInput, devGradOutput, padT, padB, padL, padR);
124 | 
125 | }
126 | 
127 | #endif
128 | 


--------------------------------------------------------------------------------
/THCUNN.lua:
--------------------------------------------------------------------------------
  1 | local ffi = require 'ffi'
  2 | local THNN = require 'nn.THNN'
  3 | 
  4 | local THCUNN = {}
  5 | 
  6 | -- load libTHCUNN
  7 | THCUNN.C = ffi.load(package.searchpath('libTHCUNN', package.cpath))
  8 | 
  9 | -- load THC
 10 | local THC = ffi.os == 'Windows' and ffi.load('THC') or ffi.C
 11 | 
 12 | local THCState_ptr = ffi.typeof('THCState*')
 13 | 
 14 | function THCUNN.getState()
 15 |    return THCState_ptr(cutorch.getState());
 16 | end
 17 | 
 18 | local THCUNN_generic_h = require 'cunn.THCUNN_generic_h'
 19 | -- strip all lines starting with #
 20 | -- to remove preprocessor directives originally present
 21 | -- in THNN.h
 22 | THCUNN_generic_h = THCUNN_generic_h:gsub("\n#[^\n]*", "")
 23 | THCUNN_generic_h = THCUNN_generic_h:gsub("^#[^\n]*\n", "")
 24 | 
 25 | local preprocessed_generic = string.gsub(THCUNN_generic_h, 'TH_API void THNN_%(([%a%d_]+)%)', 'void THNN_TYPE%1')
 26 | 
 27 | local replacements =
 28 | {
 29 |    {
 30 |       ['THTensor'] = 'THCudaTensor',
 31 |       ['THCIndexTensor'] = 'THCudaLongTensor',
 32 |       ['THIndex_t'] = 'long',
 33 |       ['THInteger_t'] = 'float'
 34 |    }
 35 | }
 36 | 
 37 | local cct2lt = {
 38 |    ['THCudaFloatTensor'] = 'torch.CudaTensor',
 39 |    ['THCudaDoubleTensor'] = 'torch.CudaDoubleTensor',
 40 | }
 41 | 
 42 | local replacements_generic =
 43 | {
 44 |   {
 45 |     ['THCTensor'] = 'THCudaTensor',
 46 |     ['THCIndexTensor'] = 'THCudaLongTensor',
 47 |     ['TYPE'] = 'Cuda',
 48 |     ['accreal'] = 'float',
 49 |   },
 50 |   {
 51 |     ['THCTensor'] = 'THCudaDoubleTensor',
 52 |     ['THCIndexTensor'] = 'THCudaLongTensor',
 53 |     ['TYPE'] = 'CudaDouble',
 54 |     ['accreal'] = 'double',
 55 |    }
 56 | }
 57 | 
 58 | if cutorch.hasHalf then
 59 |   ffi.cdef("half THC_float2half(float a);")
 60 |   ffi.cdef("float THC_half2float(half a);")
 61 |   cct2lt['THCudaHalfTensor'] = 'torch.CudaHalfTensor'
 62 |   local half_replacement = {
 63 |     ['THCTensor'] = 'THCudaHalfTensor',
 64 |     ['THCIndexTensor'] = 'THCudaLongTensor',
 65 |     ['TYPE'] = 'CudaHalf',
 66 |     ['accreal'] = 'float',
 67 |   }
 68 |   table.insert(replacements_generic, half_replacement)
 69 | end
 70 | 
 71 | for i=1,#replacements_generic do
 72 |     local r = replacements_generic[i]
 73 |     local s = preprocessed_generic
 74 |     for k,v in pairs(r) do
 75 |         s = string.gsub(s, k, v)
 76 |     end
 77 |     ffi.cdef(s)
 78 | end
 79 | 
 80 | local function extract_function_names_generic(s)
 81 |    local t = {}
 82 |    for n in string.gmatch(s, 'TH_API void THNN_%(([%a%d_]+)%)') do
 83 |        t[#t+1] = n
 84 |    end
 85 |    return t
 86 | end
 87 | 
 88 | local function find_positions(s, p)
 89 |    local begin = 0
 90 |    local positions = {}
 91 |    while true do
 92 |       local start, stop = string.find(s, p, begin)
 93 |       if (start == nil) then break end
 94 |       positions[#positions+1] = start
 95 |       begin = stop + 1
 96 |    end
 97 |    return positions
 98 | end
 99 | 
100 | local function extract_function_names_and_real_args(s)
101 |    local t = {}
102 |    for n in string.gmatch(s, 'TH_API ([^;]+)') do
103 |       local func_name = string.match(n, 'void THNN_%(([%a%d_]+)%)')
104 |       local param_positions = find_positions(n, ',')
105 |       local positions = {}
106 |       for x,y in ipairs(find_positions(n, 'real')) do
107 |           local found = false
108 |           for cn,cp in ipairs(param_positions) do
109 |               if cp > y then
110 |                 positions[#positions+1] = cn
111 |                 found = true
112 |                 break
113 |               end
114 |           end
115 |           -- it is the last param
116 |           if not found then positions[#positions+1] = #param_positions + 1 end
117 |       end
118 | 
119 |    t[func_name] = positions
120 |    end
121 |    return t
122 | end
123 | 
124 | local real_args = extract_function_names_and_real_args(THCUNN_generic_h)
125 | 
126 | -- build function table
127 | local function_names_generic = extract_function_names_generic(THCUNN_generic_h)
128 | 
129 | THNN.kernels['torch.CudaTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'Cuda', THCUNN.getState)
130 | torch.getmetatable('torch.CudaTensor').THNN = THNN.kernels['torch.CudaTensor']
131 | 
132 | THNN.kernels['torch.CudaDoubleTensor'] = THNN.bind(THCUNN.C, function_names_generic, 'CudaDouble', THCUNN.getState)
133 | torch.getmetatable('torch.CudaDoubleTensor').THNN = THNN.kernels['torch.CudaDoubleTensor']
134 | 
135 | if cutorch.hasHalf then
136 |    local raw_half_functions = THNN.bind(THCUNN.C, function_names_generic, 'CudaHalf', THCUNN.getState)
137 |    THNN.kernels['torch.CudaHalfTensor'] = raw_half_functions
138 |    torch.getmetatable('torch.CudaHalfTensor').THNN = THNN.kernels['torch.CudaHalfTensor']
139 | end
140 | 
141 | local function Module__converter(type)
142 |     return function(self)
143 |             return self:type(type)
144 |     end
145 | end
146 | 
147 | rawset(torch.getmetatable('nn.Module'), 'cudaDouble', Module__converter('torch.CudaDoubleTensor'))
148 | if cutorch.hasHalf then
149 |     rawset(torch.getmetatable('nn.Module'), 'cudaHalf', Module__converter('torch.CudaHalfTensor'))
150 | end
151 | return THCUNN
152 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialReflectionPadding.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/SpatialReflectionPadding.cu"
  3 | #else
  4 | 
  5 | void THNN_(SpatialReflectionPadding_updateOutput)(THCState *state,
  6 |            THCTensor *input,
  7 |            THCTensor *output,
  8 |            int padL, int padR,
  9 |            int padT, int padB) {
 10 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
 11 |              "input tensor must fit into 32-bit index math");
 12 | 
 13 |   int planeDim = 0;
 14 |   int dimh = 1;
 15 |   int dimw = 2;
 16 |   int numBatch = 1;
 17 | 
 18 |   int numInputDims = THCTensor_(nDimension)(state, input);
 19 |   THCUNN_argCheck(state, numInputDims == 3 || numInputDims == 4, 2, input,
 20 |                   "3D or 4D (batch mode) tensor expected for input, but got: %s")
 21 | 
 22 |   if (numInputDims == 4) {
 23 |     numBatch = THCTensor_(size)(state, input, 0);
 24 |     planeDim++;
 25 |     dimh++;
 26 |     dimw++;
 27 |   }
 28 | 
 29 |   int numPlanes = THCTensor_(size)(state, input, planeDim);
 30 |   int inputH = THCTensor_(size)(state, input, dimh);
 31 |   int inputW = THCTensor_(size)(state, input, dimw);
 32 |   int outputH = inputH + padT + padB;
 33 |   int outputW  = inputW + padL + padR;
 34 | 
 35 |   THArgCheck(outputW >= 1 || outputH >= 1 , 2,
 36 |              "input (H: %d, W: %d)is too small."
 37 |              " Calculated output H: %d W: %d",
 38 |              inputH, inputW, outputH, outputW);
 39 | 
 40 |   THCDeviceTensor<real, 4> devInput;
 41 |   THCDeviceTensor<real, 4> devOutput;
 42 | 
 43 |   if (numInputDims == 3) {
 44 |     THCTensor_(resize3d)(state, output, numPlanes, outputH, outputW);
 45 | 
 46 |     devInput = toDeviceTensor<real, 3>(state, input).upcastOuter<4>();
 47 |     devOutput = toDeviceTensor<real, 3>(state, output).upcastOuter<4>();
 48 |   } else {
 49 |     THCTensor_(resize4d)(state, output, numBatch, numPlanes, outputH, outputW);
 50 | 
 51 |     devInput = toDeviceTensor<real, 4>(state, input);
 52 |     devOutput = toDeviceTensor<real, 4>(state, output);
 53 |   }
 54 | 
 55 |   int outputPlaneSize = devOutput.getSize(2) * devOutput.getSize(3);
 56 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
 57 |             devOutput.getSize(1),
 58 |             devOutput.getSize(0));
 59 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
 60 | 
 61 |   SpatialReflectionPadding_updateOutput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
 62 |     devInput, devOutput, padT, padB, padL, padR);
 63 |   THCudaCheck(cudaGetLastError());
 64 | }
 65 | 
 66 | void THNN_(SpatialReflectionPadding_updateGradInput)(
 67 |            THCState *state,
 68 |            THCTensor *input,
 69 |            THCTensor *gradOutput,
 70 |            THCTensor *gradInput,
 71 |            int padL, int padR,
 72 |            int padT, int padB) {
 73 | 
 74 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, input), 2,
 75 |                 "input tensor must fit into 32-bit index math");
 76 |   THArgCheck(TensorUtils<THCTensor>::canUse32BitIndexMath(state, gradOutput), 3,
 77 |                 "output gradient tensor must fit into 32-bit index math");
 78 | 
 79 |   int planeDim = 0;
 80 |   int dimh = 1;
 81 |   int dimw = 2;
 82 | 
 83 |   int numInputDims = THCTensor_(nDimension)(state, input);
 84 |   if (numInputDims == 4) {
 85 |     planeDim++;
 86 |     dimh++;
 87 |     dimw++;
 88 |   }
 89 |   int iheight = input->size[dimh];
 90 |   int iwidth = input->size[dimw];
 91 |   int oheight = iheight + padT + padB;
 92 |   int owidth  = iwidth + padL + padR;
 93 | 
 94 |   THArgCheck(owidth == THCTensor_(size)(state, gradOutput, dimw), 3,
 95 |              "gradOutput width unexpected. Expected: %d, Got: %d",
 96 |              owidth, THCTensor_(size)(state, gradOutput, dimw));
 97 |   THArgCheck(oheight == THCTensor_(size)(state, gradOutput, dimh), 3,
 98 |              "gradOutput height unexpected. Expected: %d, Got: %d",
 99 |              oheight, THCTensor_(size)(state, gradOutput, dimh));
100 | 
101 |   THCTensor_(resizeAs)(state, gradInput, input);
102 |   THCTensor_(zero)(state, gradInput);
103 | 
104 |   THCDeviceTensor<real, 4> devGradInput;
105 |   THCDeviceTensor<real, 4> devGradOutput;
106 | 
107 |   if (numInputDims == 3) {
108 |     devGradInput = toDeviceTensor<real, 3>(state, gradInput).upcastOuter<4>();
109 |     devGradOutput = toDeviceTensor<real, 3>(state, gradOutput).upcastOuter<4>();
110 |   } else {
111 |     devGradInput = toDeviceTensor<real, 4>(state, gradInput);
112 |     devGradOutput = toDeviceTensor<real, 4>(state, gradOutput);
113 |   }
114 | 
115 |   int outputPlaneSize = devGradOutput.getSize(2) * devGradOutput.getSize(3);
116 |   dim3 gridSize(THCCeilDiv(outputPlaneSize, 256),
117 |             devGradOutput.getSize(1),
118 |             devGradOutput.getSize(0));
119 |   dim3 blockSize(outputPlaneSize > 256 ? 256 : outputPlaneSize);
120 | 
121 |   SpatialReflectionPadding_updateGradInput<<<gridSize, blockSize, 0, THCState_getCurrentStream(state)>>>(
122 |     devGradInput, devGradOutput, padT, padB, padL, padR);
123 |   THCudaCheck(cudaGetLastError());
124 | }
125 | 
126 | #endif
127 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/MultiLabelMarginCriterion.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/MultiLabelMarginCriterion.cu"
  3 | #else
  4 | 
  5 | // TODO: improve error messages
  6 | void THNN_(MultiLabelMarginCriterion_updateOutput)(
  7 |            THCState *state,
  8 |            THCTensor *input,
  9 |            THCIndexTensor *target,
 10 |            THCTensor *output,
 11 |            THCTensor *istarget,
 12 |            bool sizeaverage)
 13 | {
 14 |   input = THCTensor_(newContiguous)(state, input);
 15 |   target = THCIndexTensor_(newContiguous)(state, target);
 16 |   istarget = THCTensor_(newContiguous)(state, istarget);
 17 |   THCTensor_(resizeAs)(state, istarget, input);
 18 | 
 19 |   if(input->nDimension == 1)
 20 |   {
 21 |     int dim = input->size[0];
 22 |     THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
 23 |         "inconsistent target size");
 24 |     THCTensor_(resize1d)(state, output, 1);
 25 | 
 26 |     dim3 blocks(1);
 27 |     dim3 threads(MULTILABELMARGIN_THREADS);
 28 | 
 29 |     cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks,threads>>>(
 30 |         THCTensor_(data)(state, output),
 31 |         THCTensor_(data)(state, input),
 32 |         THCIndexTensor_(data)(state, target),
 33 |         THCTensor_(data)(state, istarget),
 34 |         1, dim,
 35 |         sizeaverage
 36 |         );
 37 |     THCudaCheck(cudaGetLastError());
 38 |   }
 39 |   else if(input->nDimension == 2)
 40 |   {
 41 |     int nframe = input->size[0];
 42 |     int dim = input->size[1];
 43 |     THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
 44 |                && (target->size[1] == dim), 3, "inconsistent target size");
 45 |     THCTensor *output_tmp = THCTensor_(newWithSize1d)(state, input->size[0]);
 46 | 
 47 |     dim3 blocks(input->size[0]);
 48 |     dim3 threads(MULTILABELMARGIN_THREADS);
 49 | 
 50 |     cunn_MultiLabelMarginCriterion_updateOutput_kernel<real, accreal> <<<blocks,threads>>>(
 51 |         THCTensor_(data)(state, output_tmp),
 52 |         THCTensor_(data)(state, input),
 53 |         THCIndexTensor_(data)(state, target),
 54 |         THCTensor_(data)(state, istarget),
 55 |         nframe, dim,
 56 |         sizeaverage
 57 |         );
 58 |     THCudaCheck(cudaGetLastError());
 59 |     THCTensor_(resize1d)(state, output, 1);
 60 |     THCTensor_(set1d)(state, output, 0, ScalarConvert<accreal, real>::to(THCTensor_(sumall)(state, output_tmp)));
 61 |     THCTensor_(free)(state, output_tmp);
 62 |   }
 63 |   else
 64 |     THError("vector or matrix expected");
 65 | 
 66 |   THCTensor_(free)(state, input);
 67 |   THCIndexTensor_(free)(state, target);
 68 |   THCTensor_(free)(state, istarget);
 69 | }
 70 | 
 71 | void THNN_(MultiLabelMarginCriterion_updateGradInput)(
 72 |             THCState *state,
 73 |             THCTensor *input,
 74 |             THCIndexTensor *target,
 75 |             THCTensor *gradInput,
 76 |             THCTensor *istarget,
 77 |             bool sizeaverage)
 78 | {
 79 |   input = THCTensor_(newContiguous)(state, input);
 80 |   target = THCIndexTensor_(newContiguous)(state, target);
 81 |   istarget = THCTensor_(newContiguous)(state, istarget);
 82 |   THCTensor_(resizeAs)(state, gradInput, input);
 83 | 
 84 |   if(gradInput->nDimension == 1)
 85 |   {
 86 |     int dim = gradInput->size[0];
 87 |     THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
 88 |                "inconsistent target size");
 89 |     THArgCheck((istarget->nDimension == 1) && (istarget->size[0] == dim), 3,
 90 |                "inconsistent isTarget size");
 91 |     dim3 blocks(1);
 92 |     dim3 threads(MULTILABELMARGIN_THREADS);
 93 | 
 94 |     cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks,threads>>>(THCTensor_(data)(state, gradInput),
 95 |         THCTensor_(data)(state, input),
 96 |         THCIndexTensor_(data)(state, target),
 97 |         THCTensor_(data)(state, istarget),
 98 |         1, gradInput->size[0],
 99 |         sizeaverage);
100 | 
101 |   }
102 |   else if(gradInput->nDimension == 2)
103 |   {
104 |     int nframe = gradInput->size[0];
105 |     int dim = gradInput->size[1];
106 |     THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
107 |                && (target->size[1] == dim), 3, "inconsistent target size");
108 |     THArgCheck((istarget->nDimension == 2) && (istarget->size[0] == nframe)
109 |                && (istarget->size[1] == dim), 3, "inconsistent isTarget size");
110 |     dim3 blocks(gradInput->size[0]);
111 |     dim3 threads(MULTILABELMARGIN_THREADS);
112 | 
113 |     cunn_MultiLabelMarginCriterion_updateGradInput_kernel<real, accreal> <<<blocks,threads>>>(THCTensor_(data)(state, gradInput),
114 |         THCTensor_(data)(state, input),
115 |         THCIndexTensor_(data)(state, target),
116 |         THCTensor_(data)(state, istarget),
117 |         gradInput->size[0], gradInput->size[1],
118 |         sizeaverage);
119 |   }
120 |   else
121 |     THError("vector or matrix expected");
122 | 
123 |   THCudaCheck(cudaGetLastError());
124 | 
125 |   THCTensor_(free)(state, input);
126 |   THCIndexTensor_(free)(state, target);
127 |   THCTensor_(free)(state, istarget);
128 | }
129 | 
130 | #endif
131 | 


--------------------------------------------------------------------------------
/lib/THCUNN/generic/SpatialUpSamplingBilinear.cu:
--------------------------------------------------------------------------------
  1 | #ifndef THC_GENERIC_FILE
  2 | #define THC_GENERIC_FILE "generic/SpatialUpSamplingBilinear.cu"
  3 | #else
  4 | 
  5 | static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
  6 |                         (THCState *state,
  7 |                          THCTensor *input, THCTensor *gradOutput,
  8 |                          int nBatch, int nChannels,
  9 |                          int inputHeight, int inputWidth,
 10 |                          int outputHeight, int outputWidth) {
 11 |   THArgCheck(inputHeight > 0 && inputWidth > 0
 12 |              && outputHeight > 0 && outputWidth > 0, 2,
 13 |              "input and output sizes should be greater than 0,"
 14 |              " but got input (H: %d, W: %d) output (H: %d, W: %d)",
 15 |              inputHeight, inputWidth, outputHeight, outputWidth);
 16 |   if (input != NULL) {
 17 |      THCUNN_argCheck(state, input->nDimension == 4, 2, input,
 18 |                      "4D input tensor expected but got: %s");
 19 |   }
 20 | 
 21 |   if (gradOutput != NULL) {
 22 |     THCUNN_check_dim_size(state, gradOutput, 4, 0, nBatch);
 23 |     THCUNN_check_dim_size(state, gradOutput, 4, 1, nChannels);
 24 |     THCUNN_check_dim_size(state, gradOutput, 4, 2, outputHeight);
 25 |     THCUNN_check_dim_size(state, gradOutput, 4, 3, outputWidth);
 26 |   }
 27 | }
 28 | 
 29 | void THNN_(SpatialUpSamplingBilinear_updateOutput)(
 30 |            THCState *state,
 31 |            THCTensor *input,
 32 |            THCTensor *output,
 33 |            int outputHeight,
 34 |            int outputWidth)
 35 | {
 36 |   int nbatch = THCTensor_(size)(state, input, 0);
 37 |   int channels = THCTensor_(size)(state, input, 1);
 38 |   int inputHeight = THCTensor_(size)(state, input, 2);
 39 |   int inputWidth = THCTensor_(size)(state, input, 3);
 40 |   THNN_(SpatialUpSamplingBilinear_shapeCheck)
 41 |        (state, input, NULL,
 42 |         nbatch, channels,
 43 |         inputHeight, inputWidth,
 44 |         outputHeight, outputWidth);
 45 |   input = THCTensor_(newContiguous)(state, input);
 46 |   THCUNN_assertSameGPU(state, 2, input, output);
 47 |   THCTensor_(resize4d)(state, output,
 48 |                        THCTensor_(size)(state, input, 0),
 49 |                        THCTensor_(size)(state, input, 1),
 50 |                        outputHeight, outputWidth);
 51 |   THCTensor_(zero)(state, output);
 52 |   THCDeviceTensor<real, 4> idata = toDeviceTensor<real, 4>(state, input);
 53 |   THCDeviceTensor<real, 4> odata = toDeviceTensor<real, 4>(state, output);
 54 |   THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
 55 |   const accreal rheight= (outputHeight > 1) ? (accreal)(inputHeight - 1)/(outputHeight - 1) : accreal(0);
 56 |   const accreal rwidth = (outputWidth > 1) ? (accreal)(inputWidth - 1)/(outputWidth - 1) : accreal(0);
 57 |   const int num_kernels = outputHeight * outputWidth;
 58 |   const int num_threads =
 59 |     THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
 60 |   cudaStream_t stream = THCState_getCurrentStream(state);
 61 |   caffe_gpu_interp2_kernel<real, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads ,
 62 |    0 , stream>>>(num_kernels, rheight, rwidth, idata, odata);
 63 |   THCudaCheck(cudaGetLastError());
 64 |   THCTensor_(free)(state, input);
 65 | }
 66 | 
 67 | 
 68 | void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
 69 |            THCState *state,
 70 |            THCTensor *gradOutput,
 71 |            THCTensor *gradInput,
 72 |            int nbatch,
 73 |            int nchannels,
 74 |            int inputHeight,
 75 |            int inputWidth,
 76 |            int outputHeight,
 77 |            int outputWidth)
 78 | {
 79 |   THNN_(SpatialUpSamplingBilinear_shapeCheck)
 80 |        (state, NULL, gradOutput,
 81 |         nbatch, nchannels,
 82 |         inputHeight, inputWidth,
 83 |         outputHeight, outputWidth);
 84 |   gradInput = THCTensor_(newContiguous)(state, gradInput);
 85 |   gradOutput = THCTensor_(newContiguous)(state, gradOutput);
 86 |   THCUNN_assertSameGPU(state, 2, gradOutput, gradInput);
 87 |   THCTensor_(resize4d)(state, gradInput, nbatch, nchannels, inputHeight, inputWidth);
 88 |   THCTensor_(zero)(state, gradInput);
 89 |   THCDeviceTensor<real, 4> data1 = toDeviceTensor<real, 4>(state, gradInput);
 90 |   THCDeviceTensor<real, 4> data2 = toDeviceTensor<real, 4>(state, gradOutput);
 91 |   int height1 = data1.getSize(2);
 92 |   int width1 = data1.getSize(3);
 93 |   int height2 = data2.getSize(2);
 94 |   int width2 = data2.getSize(3);
 95 |   assert(height1 > 0 && width1 > 0 && height2 > 0 && width2 > 0);
 96 |   const accreal rheight= (height2 > 1) ? (accreal)(height1 - 1)/(height2 - 1) : accreal(0);
 97 |   const accreal rwidth = (width2 > 1) ? (accreal)(width1 - 1) / (width2 - 1) : accreal(0);
 98 |   const int num_kernels = height2 * width2;
 99 |   const int num_threads =
100 |     THCState_getCurrentDeviceProperties(state)->maxThreadsPerBlock;
101 |   cudaStream_t stream = THCState_getCurrentStream(state);
102 |   caffe_gpu_interp2_kernel_backward<real ,accreal> <<<THCCeilDiv(num_kernels, num_threads),
103 |   num_threads, 0, stream>>>(num_kernels, rheight, rwidth, data1, data2);
104 |   THCudaCheck(cudaGetLastError());
105 |   THCTensor_(free)(state, gradInput);
106 |   THCTensor_(free)(state, gradOutput);
107 | }
108 | 
109 | #endif
110 | 


--------------------------------------------------------------------------------
/lib/THCUNN/SpatialCrossMapLRN.cu:
--------------------------------------------------------------------------------
  1 | #include "THCUNN.h"
  2 | #include "THCHalf.h"
  3 | #include "THCHalfAutoNumerics.cuh"
  4 | #include "common.h"
  5 | 
  6 | template <typename Dtype, typename Acctype>
  7 | __global__ void
  8 | #if __CUDA_ARCH__ >= 320
  9 | __launch_bounds__(CUDA_NUM_THREADS)
 10 | #endif
 11 | LRNFillScale(const int nthreads, const Dtype* const in,
 12 |     const int num, const int channels, const int height,
 13 |     const int width, const int size, const Dtype alpha_over_size,
 14 |     const Dtype k, Dtype* const scale) {
 15 |   CUDA_KERNEL_LOOP(index, nthreads) {
 16 |     // find out the local offset
 17 |     const int w = index % width;
 18 |     const int h = (index / width) % height;
 19 |     const int n = index / width / height;
 20 |     const int offset = (n * channels * height + h) * width + w;
 21 |     const int step = height * width;
 22 |     const Dtype* const in_off = in + offset;
 23 |     Dtype* const scale_off = scale + offset;
 24 |     int head = 0;
 25 |     const int pre_pad = (size - 1) / 2;
 26 |     const int post_pad = size - pre_pad - 1;
 27 |     Acctype accum_scale = Acctype(0);
 28 |     // fill the scale at [n, :, h, w]
 29 |     // accumulate values
 30 |     while (head < post_pad && head < channels) {
 31 |       accum_scale += in_off[head * step] * in_off[head * step];
 32 |       ++head;
 33 |     }
 34 |     // both add and subtract
 35 |     while (head < channels) {
 36 |       accum_scale += in_off[head * step] * in_off[head * step];
 37 |       if (head - size >= 0) {
 38 |         accum_scale -= in_off[(head - size) * step]
 39 |                        * in_off[(head - size) * step];
 40 |       }
 41 |       scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
 42 |       ++head;
 43 |     }
 44 |     // subtract only
 45 |     while (head < channels + post_pad) {
 46 |       if (head - size >= 0) {
 47 |         accum_scale -= in_off[(head - size) * step]
 48 |                        * in_off[(head - size) * step];
 49 |       }
 50 |       scale_off[(head - post_pad) * step] = ScalarConvert<Acctype, Dtype>::to(k + accum_scale * alpha_over_size);
 51 |       ++head;
 52 |     }
 53 |   }
 54 | }
 55 | 
 56 | template <typename Dtype>
 57 | __global__ void LRNComputeOutput(const int nthreads, const Dtype* in,
 58 |     const Dtype* scale, const Dtype negative_beta, Dtype* out) {
 59 |   CUDA_KERNEL_LOOP(index, nthreads) {
 60 |     out[index] = in[index] * pow(scale[index], negative_beta);
 61 |   }
 62 | }
 63 | 
 64 | template <typename Dtype, typename Acctype>
 65 | __global__ void LRNComputeDiff(const int nthreads,
 66 |     const Dtype* const bottom_data, const Dtype* const top_data,
 67 |     const Dtype* const scale, const Dtype* const top_diff,
 68 |     const int num, const int channels, const int height,
 69 |     const int width, const int size, const Dtype negative_beta,
 70 |     const Dtype cache_ratio, Dtype* const bottom_diff) {
 71 |   CUDA_KERNEL_LOOP(index, nthreads) {
 72 |     // find out the local offset
 73 |     const int w = index % width;
 74 |     const int h = (index / width) % height;
 75 |     const int n = index / width / height;
 76 |     const int offset = (n * channels * height + h) * width + w;
 77 |     const int step = height * width;
 78 |     const Dtype* const bottom_off = bottom_data + offset;
 79 |     const Dtype* const top_off = top_data + offset;
 80 |     const Dtype* const scale_off = scale + offset;
 81 |     const Dtype* const top_diff_off = top_diff + offset;
 82 |     Dtype* const bottom_diff_off = bottom_diff + offset;
 83 |     int head = 0;
 84 |     const int pre_pad = size - (size + 1) / 2;
 85 |     const int post_pad = size - pre_pad - 1;
 86 |     Acctype accum_ratio = Acctype(0);
 87 |     // accumulate values
 88 |     while (head < post_pad && head < channels) {
 89 |       accum_ratio += top_diff_off[head * step] * top_off[head * step] /
 90 |           scale_off[head * step];
 91 |       ++head;
 92 |     }
 93 |     // both add and subtract
 94 |     while (head < channels) {
 95 |       accum_ratio += top_diff_off[head * step] * top_off[head * step] /
 96 |           scale_off[head * step];
 97 |       if (head - size >= 0) {
 98 |         accum_ratio -= top_diff_off[(head - size) * step] *
 99 |             top_off[(head - size) * step] / scale_off[(head - size) * step];
100 |       }
101 |       bottom_diff_off[(head - post_pad) * step] =
102 |           ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
103 |             * pow(scale_off[(head - post_pad) * step], negative_beta)
104 |           - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
105 |       ++head;
106 |     }
107 |     // subtract only
108 |     while (head < channels + post_pad) {
109 |       if (head - size >= 0) {
110 |         accum_ratio -= top_diff_off[(head - size) * step] *
111 |             top_off[(head - size) * step] / scale_off[(head - size) * step];
112 |       }
113 |       bottom_diff_off[(head - post_pad) * step] =
114 |           ScalarConvert<Acctype, Dtype>::to(top_diff_off[(head - post_pad) * step]
115 |             * pow(scale_off[(head - post_pad) * step], negative_beta)
116 |           - cache_ratio * bottom_off[(head - post_pad) * step] * accum_ratio);
117 |       ++head;
118 |     }
119 |   }
120 | }
121 | 
122 | 
123 | #include "generic/SpatialCrossMapLRN.cu"
124 | #include "THCGenerateFloatTypes.h"
125 | 


--------------------------------------------------------------------------------