├── ReadMe.md
├── base_conv_layer.cpp
├── base_conv_layer.hpp
├── doc
│   └── ZTE_challenge_2019.pdf
├── math_functions.cpp
├── math_functions.cu
└── math_functions.hpp
/ReadMe.md:
--------------------------------------------------------------------------------
1 | ## ReadMe
2 |
3 | ### `./doc/ZTE_challenge_2019.pdf` documents the methods I used in the ZTE Challenge 2019 preliminary round
4 |
5 |
6 |
7 | ##### First, add all of the `*.cu` and `*.cpp` files to `src/caffe/layers`, and all of the `*.hpp` files to `include/caffe/layers`. Then change `caffe.proto`:
8 |
9 |     message ConvolutionParameter {
10 |       ...
11 |       optional bool combine_relu = 20 [default = false]; // add this line
12 |       ...
13 |     }
14 |
15 | Then recompile Caffe.
16 |
17 | ## How to use
18 |
19 |
20 |
21 | If you want to fuse a ReLU layer into the preceding convolutional layer (TEST stage only, on an existing `.caffemodel`), simply delete the ReLU layer from the prototxt and set `combine_relu: true` (default is `false`) in the convolution layer; a minimal prototxt sketch is given at the end of this document.
22 |
23 | ## Some blogs about how to compile Caffe
24 |
25 | ### On Ubuntu
26 |
27 | > [Installing Caffe with CMake on Ubuntu 16.04](https://blog.csdn.net/Chris_zhangrx/article/details/80867482)
28 |
29 | ### On Windows
30 |
31 | > [Building CPU-only Caffe directly with build_win.cmd on Windows](https://blog.csdn.net/Chris_zhangrx/article/details/79096015)
32 | > [Building GPU Caffe directly with build_win.cmd on Windows](https://blog.csdn.net/Chris_zhangrx/article/details/83339684)
33 |
--------------------------------------------------------------------------------
/base_conv_layer.cpp:
--------------------------------------------------------------------------------
1 | #include <algorithm>
2 | #include <vector>
3 |
4 | #include "caffe/filler.hpp"
5 | #include "caffe/layers/base_conv_layer.hpp"
6 | #include "caffe/util/im2col.hpp"
7 | #include "caffe/util/math_functions.hpp"
8 |
9 | namespace caffe {
10 |
11 | template <typename Dtype>
12 | void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
13 |       const vector<Blob<Dtype>*>& top) {
14 |   // Configure the kernel size, padding, stride, and inputs.
15 |   ConvolutionParameter conv_param = this->layer_param_.convolution_param();
16 |   force_nd_im2col_ = conv_param.force_nd_im2col();
17 |   myflag = conv_param.combine_relu();  // whether to fuse ReLU into the forward pass
18 |   channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis());
19 |   const int first_spatial_axis = channel_axis_ + 1;
20 |   const int num_axes = bottom[0]->num_axes();
21 |   num_spatial_axes_ = num_axes - first_spatial_axis;
22 |   CHECK_GE(num_spatial_axes_, 0);
23 |   vector<int> spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1));
24 |   // Setup filter kernel dimensions (kernel_shape_).
25 |   kernel_shape_.Reshape(spatial_dim_blob_shape);
26 |   int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
27 |   if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) {
28 |     CHECK_EQ(num_spatial_axes_, 2)
29 |         << "kernel_h & kernel_w can only be used for 2D convolution.";
30 |     CHECK_EQ(0, conv_param.kernel_size_size())
31 |         << "Either kernel_size or kernel_h/w should be specified; not both.";
32 |     kernel_shape_data[0] = conv_param.kernel_h();
33 |     kernel_shape_data[1] = conv_param.kernel_w();
34 |   } else {
35 |     const int num_kernel_dims = conv_param.kernel_size_size();
36 |     CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
37 |         << "kernel_size must be specified once, or once per spatial dimension "
38 |         << "(kernel_size specified " << num_kernel_dims << " times; "
39 |         << num_spatial_axes_ << " spatial dims).";
40 |     for (int i = 0; i < num_spatial_axes_; ++i) {
41 |       kernel_shape_data[i] =
42 |           conv_param.kernel_size((num_kernel_dims == 1) ?
0 : i); 43 | } 44 | } 45 | for (int i = 0; i < num_spatial_axes_; ++i) { 46 | CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero."; 47 | } 48 | // Setup stride dimensions (stride_). 49 | stride_.Reshape(spatial_dim_blob_shape); 50 | int* stride_data = stride_.mutable_cpu_data(); 51 | if (conv_param.has_stride_h() || conv_param.has_stride_w()) { 52 | CHECK_EQ(num_spatial_axes_, 2) 53 | << "stride_h & stride_w can only be used for 2D convolution."; 54 | CHECK_EQ(0, conv_param.stride_size()) 55 | << "Either stride or stride_h/w should be specified; not both."; 56 | stride_data[0] = conv_param.stride_h(); 57 | stride_data[1] = conv_param.stride_w(); 58 | } else { 59 | const int num_stride_dims = conv_param.stride_size(); 60 | CHECK(num_stride_dims == 0 || num_stride_dims == 1 || 61 | num_stride_dims == num_spatial_axes_) 62 | << "stride must be specified once, or once per spatial dimension " 63 | << "(stride specified " << num_stride_dims << " times; " 64 | << num_spatial_axes_ << " spatial dims)."; 65 | const int kDefaultStride = 1; 66 | for (int i = 0; i < num_spatial_axes_; ++i) { 67 | stride_data[i] = (num_stride_dims == 0) ? kDefaultStride : 68 | conv_param.stride((num_stride_dims == 1) ? 0 : i); 69 | CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero."; 70 | } 71 | } 72 | // Setup pad dimensions (pad_). 73 | pad_.Reshape(spatial_dim_blob_shape); 74 | int* pad_data = pad_.mutable_cpu_data(); 75 | if (conv_param.has_pad_h() || conv_param.has_pad_w()) { 76 | CHECK_EQ(num_spatial_axes_, 2) 77 | << "pad_h & pad_w can only be used for 2D convolution."; 78 | CHECK_EQ(0, conv_param.pad_size()) 79 | << "Either pad or pad_h/w should be specified; not both."; 80 | pad_data[0] = conv_param.pad_h(); 81 | pad_data[1] = conv_param.pad_w(); 82 | } else { 83 | const int num_pad_dims = conv_param.pad_size(); 84 | CHECK(num_pad_dims == 0 || num_pad_dims == 1 || 85 | num_pad_dims == num_spatial_axes_) 86 | << "pad must be specified once, or once per spatial dimension " 87 | << "(pad specified " << num_pad_dims << " times; " 88 | << num_spatial_axes_ << " spatial dims)."; 89 | const int kDefaultPad = 0; 90 | for (int i = 0; i < num_spatial_axes_; ++i) { 91 | pad_data[i] = (num_pad_dims == 0) ? kDefaultPad : 92 | conv_param.pad((num_pad_dims == 1) ? 0 : i); 93 | } 94 | } 95 | // Setup dilation dimensions (dilation_). 96 | dilation_.Reshape(spatial_dim_blob_shape); 97 | int* dilation_data = dilation_.mutable_cpu_data(); 98 | const int num_dilation_dims = conv_param.dilation_size(); 99 | CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 || 100 | num_dilation_dims == num_spatial_axes_) 101 | << "dilation must be specified once, or once per spatial dimension " 102 | << "(dilation specified " << num_dilation_dims << " times; " 103 | << num_spatial_axes_ << " spatial dims)."; 104 | const int kDefaultDilation = 1; 105 | for (int i = 0; i < num_spatial_axes_; ++i) { 106 | dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation : 107 | conv_param.dilation((num_dilation_dims == 1) ? 0 : i); 108 | } 109 | // Special case: im2col is the identity for 1x1 convolution with stride 1 110 | // and no padding, so flag for skipping the buffer and transformation. 111 | is_1x1_ = true; 112 | for (int i = 0; i < num_spatial_axes_; ++i) { 113 | is_1x1_ &= 114 | kernel_shape_data[i] == 1 && stride_data[i] == 1 && pad_data[i] == 0; 115 | if (!is_1x1_) { break; } 116 | } 117 | // Configure output channels and groups. 
118 | channels_ = bottom[0]->shape(channel_axis_); 119 | num_output_ = this->layer_param_.convolution_param().num_output(); 120 | CHECK_GT(num_output_, 0); 121 | group_ = this->layer_param_.convolution_param().group(); 122 | CHECK_EQ(channels_ % group_, 0); 123 | CHECK_EQ(num_output_ % group_, 0) 124 | << "Number of output should be multiples of group."; 125 | if (reverse_dimensions()) { 126 | conv_out_channels_ = channels_; 127 | conv_in_channels_ = num_output_; 128 | } else { 129 | conv_out_channels_ = num_output_; 130 | conv_in_channels_ = channels_; 131 | } 132 | // Handle the parameters: weights and biases. 133 | // - blobs_[0] holds the filter weights 134 | // - blobs_[1] holds the biases (optional) 135 | vector weight_shape(2); 136 | weight_shape[0] = conv_out_channels_; 137 | weight_shape[1] = conv_in_channels_ / group_; 138 | for (int i = 0; i < num_spatial_axes_; ++i) { 139 | weight_shape.push_back(kernel_shape_data[i]); 140 | } 141 | bias_term_ = this->layer_param_.convolution_param().bias_term(); 142 | vector bias_shape(bias_term_, num_output_); 143 | if (this->blobs_.size() > 0) { 144 | CHECK_EQ(1 + bias_term_, this->blobs_.size()) 145 | << "Incorrect number of weight blobs."; 146 | if (weight_shape != this->blobs_[0]->shape()) { 147 | Blob weight_shaped_blob(weight_shape); 148 | LOG(FATAL) << "Incorrect weight shape: expected shape " 149 | << weight_shaped_blob.shape_string() << "; instead, shape was " 150 | << this->blobs_[0]->shape_string(); 151 | } 152 | if (bias_term_ && bias_shape != this->blobs_[1]->shape()) { 153 | Blob bias_shaped_blob(bias_shape); 154 | LOG(FATAL) << "Incorrect bias shape: expected shape " 155 | << bias_shaped_blob.shape_string() << "; instead, shape was " 156 | << this->blobs_[1]->shape_string(); 157 | } 158 | LOG(INFO) << "Skipping parameter initialization"; 159 | } else { 160 | if (bias_term_) { 161 | this->blobs_.resize(2); 162 | } else { 163 | this->blobs_.resize(1); 164 | } 165 | // Initialize and fill the weights: 166 | // output channels x input channels per-group x kernel height x kernel width 167 | this->blobs_[0].reset(new Blob(weight_shape)); 168 | shared_ptr > weight_filler(GetFiller( 169 | this->layer_param_.convolution_param().weight_filler())); 170 | weight_filler->Fill(this->blobs_[0].get()); 171 | // If necessary, initialize and fill the biases. 172 | if (bias_term_) { 173 | this->blobs_[1].reset(new Blob(bias_shape)); 174 | shared_ptr > bias_filler(GetFiller( 175 | this->layer_param_.convolution_param().bias_filler())); 176 | bias_filler->Fill(this->blobs_[1].get()); 177 | } 178 | } 179 | kernel_dim_ = this->blobs_[0]->count(1); 180 | weight_offset_ = conv_out_channels_ * kernel_dim_ / group_; 181 | // Propagate gradients to the parameters (as directed by backward pass). 182 | this->param_propagate_down_.resize(this->blobs_.size(), true); 183 | } 184 | 185 | template 186 | void BaseConvolutionLayer::Reshape(const vector*>& bottom, 187 | const vector*>& top) { 188 | const int first_spatial_axis = channel_axis_ + 1; 189 | CHECK_EQ(bottom[0]->num_axes(), first_spatial_axis + num_spatial_axes_) 190 | << "bottom num_axes may not change."; 191 | num_ = bottom[0]->count(0, channel_axis_); 192 | CHECK_EQ(bottom[0]->shape(channel_axis_), channels_) 193 | << "Input size incompatible with convolution kernel."; 194 | // TODO: generalize to handle inputs of different shapes. 
195 | for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) { 196 | CHECK(bottom[0]->shape() == bottom[bottom_id]->shape()) 197 | << "All inputs must have the same shape."; 198 | } 199 | // Shape the tops. 200 | bottom_shape_ = &bottom[0]->shape(); 201 | compute_output_shape(); 202 | vector top_shape(bottom[0]->shape().begin(), 203 | bottom[0]->shape().begin() + channel_axis_); 204 | top_shape.push_back(num_output_); 205 | for (int i = 0; i < num_spatial_axes_; ++i) { 206 | top_shape.push_back(output_shape_[i]); 207 | } 208 | for (int top_id = 0; top_id < top.size(); ++top_id) { 209 | top[top_id]->Reshape(top_shape); 210 | } 211 | if (reverse_dimensions()) { 212 | conv_out_spatial_dim_ = bottom[0]->count(first_spatial_axis); 213 | } else { 214 | conv_out_spatial_dim_ = top[0]->count(first_spatial_axis); 215 | } 216 | col_offset_ = kernel_dim_ * conv_out_spatial_dim_; 217 | output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_; 218 | // Setup input dimensions (conv_input_shape_). 219 | vector bottom_dim_blob_shape(1, num_spatial_axes_ + 1); 220 | conv_input_shape_.Reshape(bottom_dim_blob_shape); 221 | int* conv_input_shape_data = conv_input_shape_.mutable_cpu_data(); 222 | for (int i = 0; i < num_spatial_axes_ + 1; ++i) { 223 | if (reverse_dimensions()) { 224 | conv_input_shape_data[i] = top[0]->shape(channel_axis_ + i); 225 | } else { 226 | conv_input_shape_data[i] = bottom[0]->shape(channel_axis_ + i); 227 | } 228 | } 229 | // The im2col result buffer will only hold one image at a time to avoid 230 | // overly large memory usage. In the special case of 1x1 convolution 231 | // it goes lazily unused to save memory. 232 | col_buffer_shape_.clear(); 233 | col_buffer_shape_.push_back(kernel_dim_ * group_); 234 | for (int i = 0; i < num_spatial_axes_; ++i) { 235 | if (reverse_dimensions()) { 236 | col_buffer_shape_.push_back(input_shape(i + 1)); 237 | } else { 238 | col_buffer_shape_.push_back(output_shape_[i]); 239 | } 240 | } 241 | col_buffer_.Reshape(col_buffer_shape_); 242 | bottom_dim_ = bottom[0]->count(channel_axis_); 243 | top_dim_ = top[0]->count(channel_axis_); 244 | num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_; 245 | num_kernels_col2im_ = reverse_dimensions() ? 
top_dim_ : bottom_dim_; 246 | // Set up the all ones "bias multiplier" for adding biases by BLAS 247 | out_spatial_dim_ = top[0]->count(first_spatial_axis); 248 | if (bias_term_) { 249 | vector bias_multiplier_shape(1, out_spatial_dim_); 250 | bias_multiplier_.Reshape(bias_multiplier_shape); 251 | caffe_set(bias_multiplier_.count(), Dtype(1), 252 | bias_multiplier_.mutable_cpu_data()); 253 | } 254 | } 255 | 256 | template 257 | void BaseConvolutionLayer::forward_cpu_gemm(const Dtype* input, 258 | const Dtype* weights, Dtype* output, bool skip_im2col) { 259 | const Dtype* col_buff = input; 260 | if (!is_1x1_) { 261 | if (!skip_im2col) { 262 | conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); 263 | } 264 | col_buff = col_buffer_.cpu_data(); 265 | } 266 | for (int g = 0; g < group_; ++g) { 267 | caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / 268 | group_, conv_out_spatial_dim_, kernel_dim_, 269 | (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, 270 | (Dtype)0., output + output_offset_ * g, myflag); 271 | } 272 | } 273 | 274 | template 275 | void BaseConvolutionLayer::forward_cpu_bias(Dtype* output, 276 | const Dtype* bias) { 277 | caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, 278 | out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(), 279 | (Dtype)1., output, myflag); 280 | } 281 | 282 | template 283 | void BaseConvolutionLayer::backward_cpu_gemm(const Dtype* output, 284 | const Dtype* weights, Dtype* input) { 285 | Dtype* col_buff = col_buffer_.mutable_cpu_data(); 286 | if (is_1x1_) { 287 | col_buff = input; 288 | } 289 | for (int g = 0; g < group_; ++g) { 290 | caffe_cpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_, 291 | conv_out_spatial_dim_, conv_out_channels_ / group_, 292 | (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, 293 | (Dtype)0., col_buff + col_offset_ * g); 294 | } 295 | if (!is_1x1_) { 296 | conv_col2im_cpu(col_buff, input); 297 | } 298 | } 299 | 300 | template 301 | void BaseConvolutionLayer::weight_cpu_gemm(const Dtype* input, 302 | const Dtype* output, Dtype* weights) { 303 | const Dtype* col_buff = input; 304 | if (!is_1x1_) { 305 | conv_im2col_cpu(input, col_buffer_.mutable_cpu_data()); 306 | col_buff = col_buffer_.cpu_data(); 307 | } 308 | for (int g = 0; g < group_; ++g) { 309 | caffe_cpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, 310 | kernel_dim_, conv_out_spatial_dim_, 311 | (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, 312 | (Dtype)1., weights + weight_offset_ * g); 313 | } 314 | } 315 | 316 | template 317 | void BaseConvolutionLayer::backward_cpu_bias(Dtype* bias, 318 | const Dtype* input) { 319 | caffe_cpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., 320 | input, bias_multiplier_.cpu_data(), 1., bias); 321 | } 322 | 323 | #ifndef CPU_ONLY 324 | 325 | template 326 | void BaseConvolutionLayer::forward_gpu_gemm(const Dtype* input, 327 | const Dtype* weights, Dtype* output, bool skip_im2col) { 328 | const Dtype* col_buff = input; 329 | if (!is_1x1_) { 330 | if (!skip_im2col) { 331 | conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); 332 | } 333 | col_buff = col_buffer_.gpu_data(); 334 | } 335 | for (int g = 0; g < group_; ++g) { 336 | caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, conv_out_channels_ / 337 | group_, conv_out_spatial_dim_, kernel_dim_, 338 | (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g, 339 | (Dtype)0., output + output_offset_ * g, myflag); 340 | } 341 | } 342 | 343 | template 344 | void 
BaseConvolutionLayer::forward_gpu_bias(Dtype* output, 345 | const Dtype* bias) { 346 | caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, num_output_, 347 | out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(), 348 | (Dtype)1., output, myflag); 349 | } 350 | 351 | template 352 | void BaseConvolutionLayer::backward_gpu_gemm(const Dtype* output, 353 | const Dtype* weights, Dtype* input) { 354 | Dtype* col_buff = col_buffer_.mutable_gpu_data(); 355 | if (is_1x1_) { 356 | col_buff = input; 357 | } 358 | for (int g = 0; g < group_; ++g) { 359 | caffe_gpu_gemm(CblasTrans, CblasNoTrans, kernel_dim_, 360 | conv_out_spatial_dim_, conv_out_channels_ / group_, 361 | (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g, 362 | (Dtype)0., col_buff + col_offset_ * g); 363 | } 364 | if (!is_1x1_) { 365 | conv_col2im_gpu(col_buff, input); 366 | } 367 | } 368 | 369 | template 370 | void BaseConvolutionLayer::weight_gpu_gemm(const Dtype* input, 371 | const Dtype* output, Dtype* weights) { 372 | const Dtype* col_buff = input; 373 | if (!is_1x1_) { 374 | conv_im2col_gpu(input, col_buffer_.mutable_gpu_data()); 375 | col_buff = col_buffer_.gpu_data(); 376 | } 377 | for (int g = 0; g < group_; ++g) { 378 | caffe_gpu_gemm(CblasNoTrans, CblasTrans, conv_out_channels_ / group_, 379 | kernel_dim_, conv_out_spatial_dim_, 380 | (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g, 381 | (Dtype)1., weights + weight_offset_ * g); 382 | } 383 | } 384 | 385 | template 386 | void BaseConvolutionLayer::backward_gpu_bias(Dtype* bias, 387 | const Dtype* input) { 388 | caffe_gpu_gemv(CblasNoTrans, num_output_, out_spatial_dim_, 1., 389 | input, bias_multiplier_.gpu_data(), 1., bias); 390 | } 391 | 392 | #endif // !CPU_ONLY 393 | 394 | INSTANTIATE_CLASS(BaseConvolutionLayer); 395 | 396 | } // namespace caffe 397 | -------------------------------------------------------------------------------- /base_conv_layer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_BASE_CONVOLUTION_LAYER_HPP_ 2 | #define CAFFE_BASE_CONVOLUTION_LAYER_HPP_ 3 | 4 | #include 5 | 6 | #include "caffe/blob.hpp" 7 | #include "caffe/layer.hpp" 8 | #include "caffe/proto/caffe.pb.h" 9 | #include "caffe/util/im2col.hpp" 10 | 11 | namespace caffe { 12 | 13 | /** 14 | * @brief Abstract base class that factors out the BLAS code common to 15 | * ConvolutionLayer and DeconvolutionLayer. 16 | */ 17 | template 18 | class BaseConvolutionLayer : public Layer { 19 | public: 20 | explicit BaseConvolutionLayer(const LayerParameter& param) 21 | : Layer(param) {} 22 | virtual void LayerSetUp(const vector*>& bottom, 23 | const vector*>& top); 24 | virtual void Reshape(const vector*>& bottom, 25 | const vector*>& top); 26 | 27 | virtual inline int MinBottomBlobs() const { return 1; } 28 | virtual inline int MinTopBlobs() const { return 1; } 29 | virtual inline bool EqualNumBottomTopBlobs() const { return true; } 30 | 31 | protected: 32 | // Helper functions that abstract away the column buffer and gemm arguments. 33 | // The last argument in forward_cpu_gemm is so that we can skip the im2col if 34 | // we just called weight_cpu_gemm with the same input. 
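  // Added note on the combine_relu option: LayerSetUp reads
  // ConvolutionParameter.combine_relu into myflag, and the forward gemm/bias
  // helpers below pass myflag through to caffe_cpu_gemm / caffe_gpu_gemm,
  // which clamp the freshly written output at zero in place. The ReLU that
  // normally follows the convolution is thus fused into the convolution's own
  // forward pass; the backward helpers are unchanged, so this is intended for
  // the TEST phase only.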
35 | void forward_cpu_gemm(const Dtype* input, const Dtype* weights, 36 | Dtype* output, bool skip_im2col = false); 37 | void forward_cpu_bias(Dtype* output, const Dtype* bias); 38 | void backward_cpu_gemm(const Dtype* input, const Dtype* weights, 39 | Dtype* output); 40 | void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype* 41 | weights); 42 | void backward_cpu_bias(Dtype* bias, const Dtype* input); 43 | 44 | #ifndef CPU_ONLY 45 | void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights, 46 | Dtype* output, bool skip_im2col = false); 47 | void forward_gpu_bias(Dtype* output, const Dtype* bias); 48 | void backward_gpu_gemm(const Dtype* input, const Dtype* weights, 49 | Dtype* col_output); 50 | void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype* 51 | weights); 52 | void backward_gpu_bias(Dtype* bias, const Dtype* input); 53 | #endif 54 | 55 | /// @brief The spatial dimensions of the input. 56 | inline int input_shape(int i) { 57 | return (*bottom_shape_)[channel_axis_ + i]; 58 | } 59 | // reverse_dimensions should return true iff we are implementing deconv, so 60 | // that conv helpers know which dimensions are which. 61 | virtual bool reverse_dimensions() = 0; 62 | // Compute height_out_ and width_out_ from other parameters. 63 | virtual void compute_output_shape() = 0; 64 | 65 | /// @brief The spatial dimensions of a filter kernel. 66 | Blob kernel_shape_; 67 | /// @brief The spatial dimensions of the stride. 68 | Blob stride_; 69 | /// @brief The spatial dimensions of the padding. 70 | Blob pad_; 71 | /// @brief The spatial dimensions of the dilation. 72 | Blob dilation_; 73 | /// @brief The spatial dimensions of the convolution input. 74 | Blob conv_input_shape_; 75 | /// @brief The spatial dimensions of the col_buffer. 76 | vector col_buffer_shape_; 77 | /// @brief The spatial dimensions of the output. 
78 | vector output_shape_; 79 | const vector* bottom_shape_; 80 | 81 | int num_spatial_axes_; 82 | int bottom_dim_; 83 | int top_dim_; 84 | 85 | int channel_axis_; 86 | int num_; 87 | int channels_; 88 | int group_; 89 | int out_spatial_dim_; 90 | int weight_offset_; 91 | int num_output_; 92 | bool bias_term_; 93 | bool is_1x1_; 94 | bool force_nd_im2col_; 95 | bool myflag; 96 | 97 | private: 98 | // wrap im2col/col2im so we don't have to remember the (long) argument lists 99 | inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) { 100 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) { 101 | im2col_cpu(data, conv_in_channels_, 102 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], 103 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], 104 | pad_.cpu_data()[0], pad_.cpu_data()[1], 105 | stride_.cpu_data()[0], stride_.cpu_data()[1], 106 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff); 107 | } else { 108 | im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(), 109 | col_buffer_shape_.data(), kernel_shape_.cpu_data(), 110 | pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff); 111 | } 112 | } 113 | inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) { 114 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) { 115 | col2im_cpu(col_buff, conv_in_channels_, 116 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], 117 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], 118 | pad_.cpu_data()[0], pad_.cpu_data()[1], 119 | stride_.cpu_data()[0], stride_.cpu_data()[1], 120 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], data); 121 | } else { 122 | col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(), 123 | col_buffer_shape_.data(), kernel_shape_.cpu_data(), 124 | pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data); 125 | } 126 | } 127 | #ifndef CPU_ONLY 128 | inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) { 129 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) { 130 | im2col_gpu(data, conv_in_channels_, 131 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], 132 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], 133 | pad_.cpu_data()[0], pad_.cpu_data()[1], 134 | stride_.cpu_data()[0], stride_.cpu_data()[1], 135 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff); 136 | } else { 137 | im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_, 138 | conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(), 139 | kernel_shape_.gpu_data(), pad_.gpu_data(), 140 | stride_.gpu_data(), dilation_.gpu_data(), col_buff); 141 | } 142 | } 143 | inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) { 144 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) { 145 | col2im_gpu(col_buff, conv_in_channels_, 146 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2], 147 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1], 148 | pad_.cpu_data()[0], pad_.cpu_data()[1], 149 | stride_.cpu_data()[0], stride_.cpu_data()[1], 150 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], data); 151 | } else { 152 | col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_, 153 | conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(), 154 | kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(), 155 | dilation_.gpu_data(), data); 156 | } 157 | } 158 | #endif 159 | 160 | int num_kernels_im2col_; 161 | int num_kernels_col2im_; 162 | int conv_out_channels_; 163 | int conv_in_channels_; 164 | int 
conv_out_spatial_dim_; 165 | int kernel_dim_; 166 | int col_offset_; 167 | int output_offset_; 168 | 169 | Blob col_buffer_; 170 | Blob bias_multiplier_; 171 | }; 172 | 173 | } // namespace caffe 174 | 175 | #endif // CAFFE_BASE_CONVOLUTION_LAYER_HPP_ 176 | -------------------------------------------------------------------------------- /doc/ZTE_challenge_2019.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anlongstory/caffe_combine_relu/7eb9762b475f1efa9b479b3f16a715101519f905/doc/ZTE_challenge_2019.pdf -------------------------------------------------------------------------------- /math_functions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #include "caffe/common.hpp" 7 | #include "caffe/util/math_functions.hpp" 8 | #include "caffe/util/rng.hpp" 9 | 10 | namespace caffe { 11 | 12 | template<> 13 | void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, 14 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 15 | const float alpha, const float* A, const float* B, const float beta, 16 | float* C,const bool myflag) { 17 | int lda = (TransA == CblasNoTrans) ? K : M; 18 | int ldb = (TransB == CblasNoTrans) ? N : K; 19 | cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, 20 | ldb, beta, C, N); 21 | if(myflag) 22 | { 23 | for (int i = 0; i < M*N; i++) 24 | { 25 | *(C + i) = std::max(*(C + i), float(0.)); 26 | } 27 | } 28 | } 29 | 30 | template<> 31 | void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, 32 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 33 | const double alpha, const double* A, const double* B, const double beta, 34 | double* C,const bool myflag) { 35 | int lda = (TransA == CblasNoTrans) ? K : M; 36 | int ldb = (TransB == CblasNoTrans) ? 
N : K; 37 | cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, 38 | ldb, beta, C, N); 39 | if(myflag) 40 | { 41 | for (int i = 0; i < M*N; i++) 42 | { 43 | *(C + i) = std::max(*(C + i), double(0.)); 44 | } 45 | } 46 | } 47 | 48 | template <> 49 | void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, 50 | const int N, const float alpha, const float* A, const float* x, 51 | const float beta, float* y) { 52 | cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); 53 | } 54 | 55 | template <> 56 | void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, 57 | const int N, const double alpha, const double* A, const double* x, 58 | const double beta, double* y) { 59 | cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); 60 | } 61 | 62 | template <> 63 | void caffe_axpy(const int N, const float alpha, const float* X, 64 | float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); } 65 | 66 | template <> 67 | void caffe_axpy(const int N, const double alpha, const double* X, 68 | double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); } 69 | 70 | template 71 | void caffe_set(const int N, const Dtype alpha, Dtype* Y) { 72 | if (alpha == 0) { 73 | memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) 74 | return; 75 | } 76 | for (int i = 0; i < N; ++i) { 77 | Y[i] = alpha; 78 | } 79 | } 80 | 81 | template void caffe_set(const int N, const int alpha, int* Y); 82 | template void caffe_set(const int N, const float alpha, float* Y); 83 | template void caffe_set(const int N, const double alpha, double* Y); 84 | 85 | template <> 86 | void caffe_add_scalar(const int N, const float alpha, float* Y) { 87 | for (int i = 0; i < N; ++i) { 88 | Y[i] += alpha; 89 | } 90 | } 91 | 92 | template <> 93 | void caffe_add_scalar(const int N, const double alpha, double* Y) { 94 | for (int i = 0; i < N; ++i) { 95 | Y[i] += alpha; 96 | } 97 | } 98 | 99 | template 100 | void caffe_copy(const int N, const Dtype* X, Dtype* Y) { 101 | if (X != Y) { 102 | if (Caffe::mode() == Caffe::GPU) { 103 | #ifndef CPU_ONLY 104 | // NOLINT_NEXT_LINE(caffe/alt_fn) 105 | CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault)); 106 | #else 107 | NO_GPU; 108 | #endif 109 | } else { 110 | memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn) 111 | } 112 | } 113 | } 114 | 115 | template void caffe_copy(const int N, const int* X, int* Y); 116 | template void caffe_copy(const int N, const unsigned int* X, 117 | unsigned int* Y); 118 | template void caffe_copy(const int N, const float* X, float* Y); 119 | template void caffe_copy(const int N, const double* X, double* Y); 120 | 121 | template <> 122 | void caffe_scal(const int N, const float alpha, float *X) { 123 | cblas_sscal(N, alpha, X, 1); 124 | } 125 | 126 | template <> 127 | void caffe_scal(const int N, const double alpha, double *X) { 128 | cblas_dscal(N, alpha, X, 1); 129 | } 130 | 131 | template <> 132 | void caffe_cpu_axpby(const int N, const float alpha, const float* X, 133 | const float beta, float* Y) { 134 | cblas_saxpby(N, alpha, X, 1, beta, Y, 1); 135 | } 136 | 137 | template <> 138 | void caffe_cpu_axpby(const int N, const double alpha, const double* X, 139 | const double beta, double* Y) { 140 | cblas_daxpby(N, alpha, X, 1, beta, Y, 1); 141 | } 142 | 143 | template <> 144 | void caffe_add(const int n, const float* a, const float* b, 145 | float* y) { 146 | vsAdd(n, a, b, y); 147 | } 148 | 149 | template <> 150 | void caffe_add(const int n, const double* a, const double* b, 151 | double* y) { 152 | vdAdd(n, a, b, 
y); 153 | } 154 | 155 | template <> 156 | void caffe_sub(const int n, const float* a, const float* b, 157 | float* y) { 158 | vsSub(n, a, b, y); 159 | } 160 | 161 | template <> 162 | void caffe_sub(const int n, const double* a, const double* b, 163 | double* y) { 164 | vdSub(n, a, b, y); 165 | } 166 | 167 | template <> 168 | void caffe_mul(const int n, const float* a, const float* b, 169 | float* y) { 170 | vsMul(n, a, b, y); 171 | } 172 | 173 | template <> 174 | void caffe_mul(const int n, const double* a, const double* b, 175 | double* y) { 176 | vdMul(n, a, b, y); 177 | } 178 | 179 | template <> 180 | void caffe_div(const int n, const float* a, const float* b, 181 | float* y) { 182 | vsDiv(n, a, b, y); 183 | } 184 | 185 | template <> 186 | void caffe_div(const int n, const double* a, const double* b, 187 | double* y) { 188 | vdDiv(n, a, b, y); 189 | } 190 | 191 | template <> 192 | void caffe_powx(const int n, const float* a, const float b, 193 | float* y) { 194 | vsPowx(n, a, b, y); 195 | } 196 | 197 | template <> 198 | void caffe_powx(const int n, const double* a, const double b, 199 | double* y) { 200 | vdPowx(n, a, b, y); 201 | } 202 | 203 | template <> 204 | void caffe_sqr(const int n, const float* a, float* y) { 205 | vsSqr(n, a, y); 206 | } 207 | 208 | template <> 209 | void caffe_sqr(const int n, const double* a, double* y) { 210 | vdSqr(n, a, y); 211 | } 212 | 213 | template <> 214 | void caffe_sqrt(const int n, const float* a, float* y) { 215 | vsSqrt(n, a, y); 216 | } 217 | 218 | template <> 219 | void caffe_sqrt(const int n, const double* a, double* y) { 220 | vdSqrt(n, a, y); 221 | } 222 | 223 | template <> 224 | void caffe_exp(const int n, const float* a, float* y) { 225 | vsExp(n, a, y); 226 | } 227 | 228 | template <> 229 | void caffe_exp(const int n, const double* a, double* y) { 230 | vdExp(n, a, y); 231 | } 232 | 233 | template <> 234 | void caffe_log(const int n, const float* a, float* y) { 235 | vsLn(n, a, y); 236 | } 237 | 238 | template <> 239 | void caffe_log(const int n, const double* a, double* y) { 240 | vdLn(n, a, y); 241 | } 242 | 243 | template <> 244 | void caffe_abs(const int n, const float* a, float* y) { 245 | vsAbs(n, a, y); 246 | } 247 | 248 | template <> 249 | void caffe_abs(const int n, const double* a, double* y) { 250 | vdAbs(n, a, y); 251 | } 252 | 253 | unsigned int caffe_rng_rand() { 254 | return (*caffe_rng())(); 255 | } 256 | 257 | template 258 | Dtype caffe_nextafter(const Dtype b) { 259 | return boost::math::nextafter( 260 | b, std::numeric_limits::max()); 261 | } 262 | 263 | template 264 | float caffe_nextafter(const float b); 265 | 266 | template 267 | double caffe_nextafter(const double b); 268 | 269 | template 270 | void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) { 271 | CHECK_GE(n, 0); 272 | CHECK(r); 273 | CHECK_LE(a, b); 274 | boost::uniform_real random_distribution(a, caffe_nextafter(b)); 275 | boost::variate_generator > 276 | variate_generator(caffe_rng(), random_distribution); 277 | for (int i = 0; i < n; ++i) { 278 | r[i] = variate_generator(); 279 | } 280 | } 281 | 282 | template 283 | void caffe_rng_uniform(const int n, const float a, const float b, 284 | float* r); 285 | 286 | template 287 | void caffe_rng_uniform(const int n, const double a, const double b, 288 | double* r); 289 | 290 | template 291 | void caffe_rng_gaussian(const int n, const Dtype a, 292 | const Dtype sigma, Dtype* r) { 293 | CHECK_GE(n, 0); 294 | CHECK(r); 295 | CHECK_GT(sigma, 0); 296 | boost::normal_distribution 
random_distribution(a, sigma); 297 | boost::variate_generator > 298 | variate_generator(caffe_rng(), random_distribution); 299 | for (int i = 0; i < n; ++i) { 300 | r[i] = variate_generator(); 301 | } 302 | } 303 | 304 | template 305 | void caffe_rng_gaussian(const int n, const float mu, 306 | const float sigma, float* r); 307 | 308 | template 309 | void caffe_rng_gaussian(const int n, const double mu, 310 | const double sigma, double* r); 311 | 312 | template 313 | void caffe_rng_bernoulli(const int n, const Dtype p, int* r) { 314 | CHECK_GE(n, 0); 315 | CHECK(r); 316 | CHECK_GE(p, 0); 317 | CHECK_LE(p, 1); 318 | boost::bernoulli_distribution random_distribution(p); 319 | boost::variate_generator > 320 | variate_generator(caffe_rng(), random_distribution); 321 | for (int i = 0; i < n; ++i) { 322 | r[i] = variate_generator(); 323 | } 324 | } 325 | 326 | template 327 | void caffe_rng_bernoulli(const int n, const double p, int* r); 328 | 329 | template 330 | void caffe_rng_bernoulli(const int n, const float p, int* r); 331 | 332 | template 333 | void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) { 334 | CHECK_GE(n, 0); 335 | CHECK(r); 336 | CHECK_GE(p, 0); 337 | CHECK_LE(p, 1); 338 | boost::bernoulli_distribution random_distribution(p); 339 | boost::variate_generator > 340 | variate_generator(caffe_rng(), random_distribution); 341 | for (int i = 0; i < n; ++i) { 342 | r[i] = static_cast(variate_generator()); 343 | } 344 | } 345 | 346 | template 347 | void caffe_rng_bernoulli(const int n, const double p, unsigned int* r); 348 | 349 | template 350 | void caffe_rng_bernoulli(const int n, const float p, unsigned int* r); 351 | 352 | template <> 353 | float caffe_cpu_strided_dot(const int n, const float* x, const int incx, 354 | const float* y, const int incy) { 355 | return cblas_sdot(n, x, incx, y, incy); 356 | } 357 | 358 | template <> 359 | double caffe_cpu_strided_dot(const int n, const double* x, 360 | const int incx, const double* y, const int incy) { 361 | return cblas_ddot(n, x, incx, y, incy); 362 | } 363 | 364 | template 365 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) { 366 | return caffe_cpu_strided_dot(n, x, 1, y, 1); 367 | } 368 | 369 | template 370 | float caffe_cpu_dot(const int n, const float* x, const float* y); 371 | 372 | template 373 | double caffe_cpu_dot(const int n, const double* x, const double* y); 374 | 375 | template <> 376 | float caffe_cpu_asum(const int n, const float* x) { 377 | return cblas_sasum(n, x, 1); 378 | } 379 | 380 | template <> 381 | double caffe_cpu_asum(const int n, const double* x) { 382 | return cblas_dasum(n, x, 1); 383 | } 384 | 385 | template <> 386 | void caffe_cpu_scale(const int n, const float alpha, const float *x, 387 | float* y) { 388 | cblas_scopy(n, x, 1, y, 1); 389 | cblas_sscal(n, alpha, y, 1); 390 | } 391 | 392 | template <> 393 | void caffe_cpu_scale(const int n, const double alpha, const double *x, 394 | double* y) { 395 | cblas_dcopy(n, x, 1, y, 1); 396 | cblas_dscal(n, alpha, y, 1); 397 | } 398 | 399 | } // namespace caffe 400 | -------------------------------------------------------------------------------- /math_functions.cu: -------------------------------------------------------------------------------- 1 | #include // CUDA's, not caffe's, for fabs, signbit 2 | #include 3 | #include // thrust::plus 4 | #include 5 | 6 | #include 7 | 8 | #include "caffe/common.hpp" 9 | #include "caffe/util/math_functions.hpp" 10 | 11 | namespace caffe { 12 | 13 | template 14 | __global__ void 
ReLUForward2(const int n, const Dtype* in, Dtype* out) { 15 | CUDA_KERNEL_LOOP(index, n) { 16 | out[index] = in[index] > 0 ? in[index] : 0; 17 | } 18 | } 19 | 20 | template <> 21 | void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, 22 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 23 | const float alpha, const float* A, const float* B, const float beta, 24 | float* C,const bool myflag) { 25 | // Note that cublas follows fortran order. 26 | int lda = (TransA == CblasNoTrans) ? K : M; 27 | int ldb = (TransB == CblasNoTrans) ? N : K; 28 | cublasOperation_t cuTransA = 29 | (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; 30 | cublasOperation_t cuTransB = 31 | (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; 32 | CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA, 33 | N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); 34 | 35 | if (myflag) 36 | { 37 | ReLUForward2 << > >( 38 | M*N, C, C); 39 | CUDA_POST_KERNEL_CHECK; 40 | } 41 | } 42 | 43 | template <> 44 | void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, 45 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 46 | const double alpha, const double* A, const double* B, const double beta, 47 | double* C,const bool myflag) { 48 | // Note that cublas follows fortran order. 49 | int lda = (TransA == CblasNoTrans) ? K : M; 50 | int ldb = (TransB == CblasNoTrans) ? N : K; 51 | cublasOperation_t cuTransA = 52 | (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; 53 | cublasOperation_t cuTransB = 54 | (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; 55 | CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA, 56 | N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); 57 | if (myflag) 58 | { 59 | ReLUForward2 << > >( 60 | M*N, C, C); 61 | CUDA_POST_KERNEL_CHECK; 62 | } 63 | } 64 | 65 | template <> 66 | void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, 67 | const int N, const float alpha, const float* A, const float* x, 68 | const float beta, float* y) { 69 | cublasOperation_t cuTransA = 70 | (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; 71 | CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, 72 | A, N, x, 1, &beta, y, 1)); 73 | } 74 | 75 | template <> 76 | void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, 77 | const int N, const double alpha, const double* A, const double* x, 78 | const double beta, double* y) { 79 | cublasOperation_t cuTransA = 80 | (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; 81 | CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha, 82 | A, N, x, 1, &beta, y, 1)); 83 | } 84 | 85 | template <> 86 | void caffe_gpu_axpy(const int N, const float alpha, const float* X, 87 | float* Y) { 88 | CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); 89 | } 90 | 91 | template <> 92 | void caffe_gpu_axpy(const int N, const double alpha, const double* X, 93 | double* Y) { 94 | CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1)); 95 | } 96 | 97 | void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) { 98 | if (X != Y) { 99 | CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn) 100 | } 101 | } 102 | 103 | template <> 104 | void caffe_gpu_scal(const int N, const float alpha, float *X) { 105 | CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); 106 | } 107 | 108 | template <> 109 | void caffe_gpu_scal(const int N, const double alpha, double *X) { 110 | CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); 111 | } 112 | 113 | template <> 114 | void caffe_gpu_scal(const int N, const float alpha, float* X, 115 | cudaStream_t str) { 116 | cudaStream_t initial_stream; 117 | CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); 118 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); 119 | CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1)); 120 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); 121 | } 122 | 123 | template <> 124 | void caffe_gpu_scal(const int N, const double alpha, double* X, 125 | cudaStream_t str) { 126 | cudaStream_t initial_stream; 127 | CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream)); 128 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str)); 129 | CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1)); 130 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream)); 131 | } 132 | 133 | template <> 134 | void caffe_gpu_axpby(const int N, const float alpha, const float* X, 135 | const float beta, float* Y) { 136 | caffe_gpu_scal(N, beta, Y); 137 | caffe_gpu_axpy(N, alpha, X, Y); 138 | } 139 | 140 | template <> 141 | void caffe_gpu_axpby(const int N, const double alpha, const double* X, 142 | const double beta, double* Y) { 143 | caffe_gpu_scal(N, beta, Y); 144 | caffe_gpu_axpy(N, alpha, X, Y); 145 | } 146 | 147 | template <> 148 | void caffe_gpu_dot(const int n, const float* x, const float* y, 149 | float* out) { 150 | CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); 151 | } 152 | 153 | template <> 154 | void caffe_gpu_dot(const int n, const double* x, const double* y, 155 | double * out) { 156 | CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); 157 | } 158 | 159 | template <> 160 | void caffe_gpu_asum(const int n, const float* x, float* y) { 161 | CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); 162 | } 163 | 164 | template <> 165 | void caffe_gpu_asum(const int n, const double* x, double* y) { 166 | CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y)); 167 | } 168 | 169 | template <> 170 | void caffe_gpu_scale(const int n, const float alpha, const float *x, 171 | float* y) { 172 | CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); 173 | CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); 174 | } 175 | 176 | template <> 177 | void caffe_gpu_scale(const int n, const double alpha, const double *x, 178 | 
double* y) { 179 | CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1)); 180 | CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1)); 181 | } 182 | 183 | template 184 | __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) { 185 | CUDA_KERNEL_LOOP(index, n) { 186 | y[index] = alpha; 187 | } 188 | } 189 | 190 | template 191 | void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) { 192 | if (alpha == 0) { 193 | CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn) 194 | return; 195 | } 196 | // NOLINT_NEXT_LINE(whitespace/operators) 197 | set_kernel<<>>( 198 | N, alpha, Y); 199 | } 200 | 201 | template void caffe_gpu_set(const int N, const int alpha, int* Y); 202 | template void caffe_gpu_set(const int N, const float alpha, float* Y); 203 | template void caffe_gpu_set(const int N, const double alpha, double* Y); 204 | 205 | template 206 | __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) { 207 | CUDA_KERNEL_LOOP(index, n) { 208 | y[index] += alpha; 209 | } 210 | } 211 | 212 | template <> 213 | void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) { 214 | // NOLINT_NEXT_LINE(whitespace/operators) 215 | add_scalar_kernel<<>>( 216 | N, alpha, Y); 217 | } 218 | 219 | template <> 220 | void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) { 221 | // NOLINT_NEXT_LINE(whitespace/operators) 222 | add_scalar_kernel<<>>( 223 | N, alpha, Y); 224 | } 225 | 226 | template 227 | __global__ void add_kernel(const int n, const Dtype* a, 228 | const Dtype* b, Dtype* y) { 229 | CUDA_KERNEL_LOOP(index, n) { 230 | y[index] = a[index] + b[index]; 231 | } 232 | } 233 | 234 | template <> 235 | void caffe_gpu_add(const int N, const float* a, const float* b, 236 | float* y) { 237 | // NOLINT_NEXT_LINE(whitespace/operators) 238 | add_kernel<<>>( 239 | N, a, b, y); 240 | } 241 | 242 | template <> 243 | void caffe_gpu_add(const int N, const double* a, const double* b, 244 | double* y) { 245 | // NOLINT_NEXT_LINE(whitespace/operators) 246 | add_kernel<<>>( 247 | N, a, b, y); 248 | } 249 | 250 | template 251 | __global__ void sub_kernel(const int n, const Dtype* a, 252 | const Dtype* b, Dtype* y) { 253 | CUDA_KERNEL_LOOP(index, n) { 254 | y[index] = a[index] - b[index]; 255 | } 256 | } 257 | 258 | template <> 259 | void caffe_gpu_sub(const int N, const float* a, const float* b, 260 | float* y) { 261 | // NOLINT_NEXT_LINE(whitespace/operators) 262 | sub_kernel<<>>( 263 | N, a, b, y); 264 | } 265 | 266 | template <> 267 | void caffe_gpu_sub(const int N, const double* a, const double* b, 268 | double* y) { 269 | // NOLINT_NEXT_LINE(whitespace/operators) 270 | sub_kernel<<>>( 271 | N, a, b, y); 272 | } 273 | 274 | template 275 | __global__ void mul_kernel(const int n, const Dtype* a, 276 | const Dtype* b, Dtype* y) { 277 | CUDA_KERNEL_LOOP(index, n) { 278 | y[index] = a[index] * b[index]; 279 | } 280 | } 281 | 282 | template <> 283 | void caffe_gpu_mul(const int N, const float* a, 284 | const float* b, float* y) { 285 | // NOLINT_NEXT_LINE(whitespace/operators) 286 | mul_kernel<<>>( 287 | N, a, b, y); 288 | } 289 | 290 | template <> 291 | void caffe_gpu_mul(const int N, const double* a, 292 | const double* b, double* y) { 293 | // NOLINT_NEXT_LINE(whitespace/operators) 294 | mul_kernel<<>>( 295 | N, a, b, y); 296 | } 297 | 298 | template 299 | __global__ void div_kernel(const int n, const Dtype* a, 300 | const Dtype* b, Dtype* y) { 301 | CUDA_KERNEL_LOOP(index, n) { 302 | y[index] = a[index] / b[index]; 
303 | } 304 | } 305 | 306 | template <> 307 | void caffe_gpu_div(const int N, const float* a, 308 | const float* b, float* y) { 309 | // NOLINT_NEXT_LINE(whitespace/operators) 310 | div_kernel<<>>( 311 | N, a, b, y); 312 | } 313 | 314 | template <> 315 | void caffe_gpu_div(const int N, const double* a, 316 | const double* b, double* y) { 317 | // NOLINT_NEXT_LINE(whitespace/operators) 318 | div_kernel<<>>( 319 | N, a, b, y); 320 | } 321 | 322 | template 323 | __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) { 324 | CUDA_KERNEL_LOOP(index, n) { 325 | y[index] = abs(a[index]); 326 | } 327 | } 328 | 329 | template <> 330 | void caffe_gpu_abs(const int N, const float* a, float* y) { 331 | // NOLINT_NEXT_LINE(whitespace/operators) 332 | abs_kernel<<>>( 333 | N, a, y); 334 | } 335 | 336 | template <> 337 | void caffe_gpu_abs(const int N, const double* a, double* y) { 338 | // NOLINT_NEXT_LINE(whitespace/operators) 339 | abs_kernel<<>>( 340 | N, a, y); 341 | } 342 | 343 | 344 | template 345 | __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) { 346 | CUDA_KERNEL_LOOP(index, n) { 347 | y[index] = exp(a[index]); 348 | } 349 | } 350 | 351 | template <> 352 | void caffe_gpu_exp(const int N, const float* a, float* y) { 353 | // NOLINT_NEXT_LINE(whitespace/operators) 354 | exp_kernel<<>>( 355 | N, a, y); 356 | } 357 | 358 | template <> 359 | void caffe_gpu_exp(const int N, const double* a, double* y) { 360 | // NOLINT_NEXT_LINE(whitespace/operators) 361 | exp_kernel<<>>( 362 | N, a, y); 363 | } 364 | 365 | template 366 | __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) { 367 | CUDA_KERNEL_LOOP(index, n) { 368 | y[index] = log(a[index]); 369 | } 370 | } 371 | 372 | template <> 373 | void caffe_gpu_log(const int N, const float* a, float* y) { 374 | // NOLINT_NEXT_LINE(whitespace/operators) 375 | log_kernel<<>>( 376 | N, a, y); 377 | } 378 | 379 | template <> 380 | void caffe_gpu_log(const int N, const double* a, double* y) { 381 | // NOLINT_NEXT_LINE(whitespace/operators) 382 | log_kernel<<>>( 383 | N, a, y); 384 | } 385 | 386 | template 387 | __global__ void powx_kernel(const int n, const Dtype* a, 388 | const Dtype alpha, Dtype* y) { 389 | CUDA_KERNEL_LOOP(index, n) { 390 | y[index] = pow(a[index], alpha); 391 | } 392 | } 393 | 394 | template <> 395 | void caffe_gpu_powx(const int N, const float* a, 396 | const float alpha, float* y) { 397 | // NOLINT_NEXT_LINE(whitespace/operators) 398 | powx_kernel<<>>( 399 | N, a, alpha, y); 400 | } 401 | 402 | template <> 403 | void caffe_gpu_powx(const int N, const double* a, 404 | const double alpha, double* y) { 405 | // NOLINT_NEXT_LINE(whitespace/operators) 406 | powx_kernel<<>>( 407 | N, a, alpha, y); 408 | } 409 | 410 | template 411 | __global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) { 412 | CUDA_KERNEL_LOOP(index, n) { 413 | y[index] = sqrt(a[index]); 414 | } 415 | } 416 | 417 | template <> 418 | void caffe_gpu_sqrt(const int N, const float* a, float* y) { 419 | // NOLINT_NEXT_LINE(whitespace/operators) 420 | sqrt_kernel<<>>( 421 | N, a, y); 422 | } 423 | 424 | template <> 425 | void caffe_gpu_sqrt(const int N, const double* a, double* y) { 426 | // NOLINT_NEXT_LINE(whitespace/operators) 427 | sqrt_kernel<<>>( 428 | N, a, y); 429 | } 430 | 431 | DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sign, y[index] = (Dtype(0) < x[index]) 432 | - (x[index] < Dtype(0))); 433 | DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(sgnbit, y[index] = signbit(x[index])); 434 | 435 | void caffe_gpu_rng_uniform(const int n, 
unsigned int* r) { 436 | CURAND_CHECK(curandGenerate(Caffe::curand_generator(), r, n)); 437 | } 438 | 439 | template <> 440 | void caffe_gpu_rng_uniform(const int n, const float a, const float b, 441 | float* r) { 442 | CURAND_CHECK(curandGenerateUniform(Caffe::curand_generator(), r, n)); 443 | const float range = b - a; 444 | if (range != static_cast(1)) { 445 | caffe_gpu_scal(n, range, r); 446 | } 447 | if (a != static_cast(0)) { 448 | caffe_gpu_add_scalar(n, a, r); 449 | } 450 | } 451 | 452 | template <> 453 | void caffe_gpu_rng_uniform(const int n, const double a, const double b, 454 | double* r) { 455 | CURAND_CHECK(curandGenerateUniformDouble(Caffe::curand_generator(), r, n)); 456 | const double range = b - a; 457 | if (range != static_cast(1)) { 458 | caffe_gpu_scal(n, range, r); 459 | } 460 | if (a != static_cast(0)) { 461 | caffe_gpu_add_scalar(n, a, r); 462 | } 463 | } 464 | 465 | template <> 466 | void caffe_gpu_rng_gaussian(const int n, const float mu, const float sigma, 467 | float* r) { 468 | CURAND_CHECK( 469 | curandGenerateNormal(Caffe::curand_generator(), r, n, mu, sigma)); 470 | } 471 | 472 | template <> 473 | void caffe_gpu_rng_gaussian(const int n, const double mu, const double sigma, 474 | double* r) { 475 | CURAND_CHECK( 476 | curandGenerateNormalDouble(Caffe::curand_generator(), r, n, mu, sigma)); 477 | } 478 | 479 | } // namespace caffe 480 | -------------------------------------------------------------------------------- /math_functions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ 2 | #define CAFFE_UTIL_MATH_FUNCTIONS_H_ 3 | 4 | #include 5 | #include // for std::fabs and std::signbit 6 | 7 | #include "glog/logging.h" 8 | 9 | #include "caffe/common.hpp" 10 | #include "caffe/util/device_alternate.hpp" 11 | #include "caffe/util/mkl_alternate.hpp" 12 | 13 | namespace caffe { 14 | 15 | // Caffe gemm provides a simpler interface to the gemm functions, with the 16 | // limitation that the data has to be contiguous in memory. 
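// The extra trailing argument myflag (default false) is an addition for the
// combine_relu option: when it is true, these gemm wrappers apply a ReLU,
// C[i] = max(C[i], Dtype(0)), over the whole M x N output right after the
// BLAS (or cuBLAS) call, so no separate ReLU pass over the data is needed.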
17 | template 18 | void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, 19 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 20 | const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, 21 | Dtype* C, const bool myflag=false); 22 | 23 | template 24 | void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, 25 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 26 | Dtype* y); 27 | 28 | template 29 | void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, 30 | Dtype* Y); 31 | 32 | template 33 | void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, 34 | const Dtype beta, Dtype* Y); 35 | 36 | template 37 | void caffe_copy(const int N, const Dtype *X, Dtype *Y); 38 | 39 | template 40 | void caffe_set(const int N, const Dtype alpha, Dtype *X); 41 | 42 | inline void caffe_memset(const size_t N, const int alpha, void* X) { 43 | memset(X, alpha, N); // NOLINT(caffe/alt_fn) 44 | } 45 | 46 | template 47 | void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); 48 | 49 | template 50 | void caffe_scal(const int N, const Dtype alpha, Dtype *X); 51 | 52 | template 53 | void caffe_sqr(const int N, const Dtype* a, Dtype* y); 54 | 55 | template 56 | void caffe_sqrt(const int N, const Dtype* a, Dtype* y); 57 | 58 | template 59 | void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); 60 | 61 | template 62 | void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); 63 | 64 | template 65 | void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); 66 | 67 | template 68 | void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); 69 | 70 | template 71 | void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); 72 | 73 | unsigned int caffe_rng_rand(); 74 | 75 | template 76 | Dtype caffe_nextafter(const Dtype b); 77 | 78 | template 79 | void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); 80 | 81 | template 82 | void caffe_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, 83 | Dtype* r); 84 | 85 | template 86 | void caffe_rng_bernoulli(const int n, const Dtype p, int* r); 87 | 88 | template 89 | void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r); 90 | 91 | template 92 | void caffe_exp(const int n, const Dtype* a, Dtype* y); 93 | 94 | template 95 | void caffe_log(const int n, const Dtype* a, Dtype* y); 96 | 97 | template 98 | void caffe_abs(const int n, const Dtype* a, Dtype* y); 99 | 100 | template 101 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); 102 | 103 | template 104 | Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, 105 | const Dtype* y, const int incy); 106 | 107 | // Returns the sum of the absolute values of the elements of vector x 108 | template 109 | Dtype caffe_cpu_asum(const int n, const Dtype* x); 110 | 111 | // the branchless, type-safe version from 112 | // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c 113 | template 114 | inline int8_t caffe_sign(Dtype val) { 115 | return (Dtype(0) < val) - (val < Dtype(0)); 116 | } 117 | 118 | // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC 119 | // in include/caffe/util/mkl_alternate.hpp authored by @Rowland Depp. 120 | // Please refer to commit 7e8ef25c7 of the boost-eigen branch. 121 | // Git cherry picking that commit caused a conflict hard to resolve and 122 | // copying that file in convenient for code reviewing. 
123 | // So they have to be pasted here temporarily. 124 | #define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ 125 | template \ 126 | void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ 127 | CHECK_GT(n, 0); CHECK(x); CHECK(y); \ 128 | for (int i = 0; i < n; ++i) { \ 129 | operation; \ 130 | } \ 131 | } 132 | 133 | // output is 1 for the positives, 0 for zero, and -1 for the negatives 134 | DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) 135 | 136 | // This returns a nonzero value if the input has its sign bit set. 137 | // The name sngbit is meant to avoid conflicts with std::signbit in the macro. 138 | // The extra parens are needed because CUDA < 6.5 defines signbit as a macro, 139 | // and we don't want that to expand here when CUDA headers are also included. 140 | DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ 141 | y[i] = static_cast((std::signbit)(x[i]))) 142 | 143 | DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])) 144 | 145 | template 146 | void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); 147 | 148 | #ifndef CPU_ONLY // GPU 149 | 150 | // Decaf gpu gemm provides an interface that is almost the same as the cpu 151 | // gemm function - following the c convention and calling the fortran-order 152 | // gpu code under the hood. 153 | template 154 | void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, 155 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 156 | const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, 157 | Dtype* C, const bool myflag=false); 158 | 159 | template 160 | void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, 161 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 162 | Dtype* y); 163 | 164 | template 165 | void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, 166 | Dtype* Y); 167 | 168 | template 169 | void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, 170 | const Dtype beta, Dtype* Y); 171 | 172 | void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); 173 | 174 | template 175 | void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); 176 | 177 | inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { 178 | #ifndef CPU_ONLY 179 | CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(caffe/alt_fn) 180 | #else 181 | NO_GPU; 182 | #endif 183 | } 184 | 185 | template 186 | void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X); 187 | 188 | template 189 | void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X); 190 | 191 | #ifndef CPU_ONLY 192 | template 193 | void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cudaStream_t str); 194 | #endif 195 | 196 | template 197 | void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); 198 | 199 | template 200 | void caffe_gpu_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); 201 | 202 | template 203 | void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); 204 | 205 | template 206 | void caffe_gpu_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); 207 | 208 | template 209 | void caffe_gpu_abs(const int n, const Dtype* a, Dtype* y); 210 | 211 | template 212 | void caffe_gpu_exp(const int n, const Dtype* a, Dtype* y); 213 | 214 | template 215 | void caffe_gpu_log(const int n, const Dtype* a, Dtype* y); 216 | 217 | template 218 | void caffe_gpu_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); 219 | 220 | template 221 | void caffe_gpu_sqrt(const int n, 
const Dtype* a, Dtype* y); 222 | 223 | // caffe_gpu_rng_uniform with two arguments generates integers in the range 224 | // [0, UINT_MAX]. 225 | void caffe_gpu_rng_uniform(const int n, unsigned int* r); 226 | 227 | // caffe_gpu_rng_uniform with four arguments generates floats in the range 228 | // (a, b] (strictly greater than a, less than or equal to b) due to the 229 | // specification of curandGenerateUniform. With a = 0, b = 1, just calls 230 | // curandGenerateUniform; with other limits will shift and scale the outputs 231 | // appropriately after calling curandGenerateUniform. 232 | template 233 | void caffe_gpu_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r); 234 | 235 | template 236 | void caffe_gpu_rng_gaussian(const int n, const Dtype mu, const Dtype sigma, 237 | Dtype* r); 238 | 239 | template 240 | void caffe_gpu_rng_bernoulli(const int n, const Dtype p, int* r); 241 | 242 | template 243 | void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out); 244 | 245 | template 246 | void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y); 247 | 248 | template 249 | void caffe_gpu_sign(const int n, const Dtype* x, Dtype* y); 250 | 251 | template 252 | void caffe_gpu_sgnbit(const int n, const Dtype* x, Dtype* y); 253 | 254 | template 255 | void caffe_gpu_fabs(const int n, const Dtype* x, Dtype* y); 256 | 257 | template 258 | void caffe_gpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); 259 | 260 | #define DEFINE_AND_INSTANTIATE_GPU_UNARY_FUNC(name, operation) \ 261 | template \ 262 | __global__ void name##_kernel(const int n, const Dtype* x, Dtype* y) { \ 263 | CUDA_KERNEL_LOOP(index, n) { \ 264 | operation; \ 265 | } \ 266 | } \ 267 | template <> \ 268 | void caffe_gpu_##name(const int n, const float* x, float* y) { \ 269 | /* NOLINT_NEXT_LINE(whitespace/operators) */ \ 270 | name##_kernel<<>>( \ 271 | n, x, y); \ 272 | } \ 273 | template <> \ 274 | void caffe_gpu_##name(const int n, const double* x, double* y) { \ 275 | /* NOLINT_NEXT_LINE(whitespace/operators) */ \ 276 | name##_kernel<<>>( \ 277 | n, x, y); \ 278 | } 279 | 280 | #endif // !CPU_ONLY 281 | 282 | } // namespace caffe 283 | 284 | #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ 285 | --------------------------------------------------------------------------------
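To make the ReadMe's usage instructions concrete, here is a minimal prototxt sketch. The layer and blob names (`conv1`, `relu1`, `data`) are illustrative only, not taken from this repository. The original TEST prototxt has a Convolution layer followed by a ReLU; to fuse them, the ReLU layer is deleted and `combine_relu: true` is set inside `convolution_param`:

```prototxt
# Before: Convolution + ReLU as two layers
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}

# After: ReLU layer deleted, ReLU fused into the convolution (TEST time only)
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 64
    kernel_size: 3
    pad: 1
    stride: 1
    combine_relu: true   # field added to ConvolutionParameter in caffe.proto
  }
}
```

For intuition, the following self-contained C++ sketch mimics, with naive loops instead of BLAS and a hypothetical `gemm_relu` helper, what the modified `caffe_cpu_gemm` does when `myflag` is true: the output matrix is clamped at zero in the same pass that writes it, rather than a separate ReLU layer traversing it afterwards.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Naive stand-in for cblas_sgemm followed by the in-place ReLU that
// caffe_cpu_gemm applies when its myflag argument is true.
void gemm_relu(int M, int N, int K, const float* A, const float* B, float* C,
               bool fuse_relu) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      C[m * N + n] = fuse_relu ? std::max(acc, 0.f) : acc;  // fused ReLU
    }
  }
}

int main() {
  const int M = 2, N = 2, K = 2;
  std::vector<float> A = {1.f, -2.f, 3.f, 4.f};   // row-major 2x2
  std::vector<float> B = {1.f, 0.f, 0.f, 1.f};    // identity
  std::vector<float> C(M * N);
  gemm_relu(M, N, K, A.data(), B.data(), C.data(), /*fuse_relu=*/true);
  for (float v : C) std::printf("%.1f ", v);      // prints: 1.0 0.0 3.0 4.0
  return 0;
}
```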