├── ReadMe.md
├── base_conv_layer.cpp
├── base_conv_layer.hpp
├── doc
│   └── ZTE_challenge_2019.pdf
├── math_functions.cpp
├── math_functions.cu
└── math_functions.hpp
/ReadMe.md:
--------------------------------------------------------------------------------
1 | ## ReadMe
2 |
3 | ### ./doc/ZTE_challenge_2019.pdf documents the methods I used in the ZTE challenge preliminary round
4 |
5 |
6 |
7 | ##### First, add all of the *.cu and *.cpp files to src/caffe/layers and all of the *.hpp files to include/caffe/layers. Then change caffe.proto:
8 |
9 | message ConvolutionParameter {
10 | ...
11 | optional bool combine_relu = 20 [default = false]; // add this line
12 | ...
13 | }
14 |
15 | Then recompile Caffe.
16 |
17 | ## How to use
18 |
19 |
20 |
21 | If you want to combine a ReLU layer with its preceding convolutional layer (TEST phase only, i.e. inference with a trained .caffemodel), just delete the ReLU layer and set `combine_relu: true` (default: `false`) in the convolution layer, as in the sketch below.
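For example, a minimal before/after sketch (layer and blob names here are illustrative, not taken from this repo):

    # before: separate ReLU layer
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
      convolution_param { num_output: 64 kernel_size: 3 }
    }
    layer {
      name: "relu1"
      type: "ReLU"
      bottom: "conv1"
      top: "conv1"
    }

    # after: ReLU folded into the convolution
    layer {
      name: "conv1"
      type: "Convolution"
      bottom: "data"
      top: "conv1"
      convolution_param { num_output: 64 kernel_size: 3 combine_relu: true }
    }

With `combine_relu: true`, the GEMM that computes the convolution clamps its own output at zero, so the separate ReLU pass over the feature map (and its extra memory traffic) disappears.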
22 |
23 | ## Some blogs about how to compile Caffe
24 |
25 | ### On Ubuntu
26 |
27 | > [Installing Caffe with cmake on Ubuntu 16.04](https://blog.csdn.net/Chris_zhangrx/article/details/80867482)
28 |
29 | ### On Windows
30 |
31 | > [Building the CPU-only Caffe on Windows directly with build_win.cmd](https://blog.csdn.net/Chris_zhangrx/article/details/79096015)
32 | > [Building the GPU Caffe on Windows directly with build_win.cmd](https://blog.csdn.net/Chris_zhangrx/article/details/83339684)
33 |
--------------------------------------------------------------------------------
/base_conv_layer.cpp:
--------------------------------------------------------------------------------
1 | #include <algorithm>
2 | #include <vector>
3 |
4 | #include "caffe/filler.hpp"
5 | #include "caffe/layers/base_conv_layer.hpp"
6 | #include "caffe/util/im2col.hpp"
7 | #include "caffe/util/math_functions.hpp"
8 |
9 | namespace caffe {
10 |
11 | template <typename Dtype>
12 | void BaseConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
13 | const vector<Blob<Dtype>*>& top) {
14 | // Configure the kernel size, padding, stride, and inputs.
15 | ConvolutionParameter conv_param = this->layer_param_.convolution_param();
16 | force_nd_im2col_ = conv_param.force_nd_im2col();
17 | myflag = conv_param.combine_relu();  // combine_relu: fuse ReLU into the forward GEMMs
18 | channel_axis_ = bottom[0]->CanonicalAxisIndex(conv_param.axis());
19 | const int first_spatial_axis = channel_axis_ + 1;
20 | const int num_axes = bottom[0]->num_axes();
21 | num_spatial_axes_ = num_axes - first_spatial_axis;
22 | CHECK_GE(num_spatial_axes_, 0);
23 | vector<int> spatial_dim_blob_shape(1, std::max(num_spatial_axes_, 1));
24 | // Setup filter kernel dimensions (kernel_shape_).
25 | kernel_shape_.Reshape(spatial_dim_blob_shape);
26 | int* kernel_shape_data = kernel_shape_.mutable_cpu_data();
27 | if (conv_param.has_kernel_h() || conv_param.has_kernel_w()) {
28 | CHECK_EQ(num_spatial_axes_, 2)
29 | << "kernel_h & kernel_w can only be used for 2D convolution.";
30 | CHECK_EQ(0, conv_param.kernel_size_size())
31 | << "Either kernel_size or kernel_h/w should be specified; not both.";
32 | kernel_shape_data[0] = conv_param.kernel_h();
33 | kernel_shape_data[1] = conv_param.kernel_w();
34 | } else {
35 | const int num_kernel_dims = conv_param.kernel_size_size();
36 | CHECK(num_kernel_dims == 1 || num_kernel_dims == num_spatial_axes_)
37 | << "kernel_size must be specified once, or once per spatial dimension "
38 | << "(kernel_size specified " << num_kernel_dims << " times; "
39 | << num_spatial_axes_ << " spatial dims).";
40 | for (int i = 0; i < num_spatial_axes_; ++i) {
41 | kernel_shape_data[i] =
42 | conv_param.kernel_size((num_kernel_dims == 1) ? 0 : i);
43 | }
44 | }
45 | for (int i = 0; i < num_spatial_axes_; ++i) {
46 | CHECK_GT(kernel_shape_data[i], 0) << "Filter dimensions must be nonzero.";
47 | }
48 | // Setup stride dimensions (stride_).
49 | stride_.Reshape(spatial_dim_blob_shape);
50 | int* stride_data = stride_.mutable_cpu_data();
51 | if (conv_param.has_stride_h() || conv_param.has_stride_w()) {
52 | CHECK_EQ(num_spatial_axes_, 2)
53 | << "stride_h & stride_w can only be used for 2D convolution.";
54 | CHECK_EQ(0, conv_param.stride_size())
55 | << "Either stride or stride_h/w should be specified; not both.";
56 | stride_data[0] = conv_param.stride_h();
57 | stride_data[1] = conv_param.stride_w();
58 | } else {
59 | const int num_stride_dims = conv_param.stride_size();
60 | CHECK(num_stride_dims == 0 || num_stride_dims == 1 ||
61 | num_stride_dims == num_spatial_axes_)
62 | << "stride must be specified once, or once per spatial dimension "
63 | << "(stride specified " << num_stride_dims << " times; "
64 | << num_spatial_axes_ << " spatial dims).";
65 | const int kDefaultStride = 1;
66 | for (int i = 0; i < num_spatial_axes_; ++i) {
67 | stride_data[i] = (num_stride_dims == 0) ? kDefaultStride :
68 | conv_param.stride((num_stride_dims == 1) ? 0 : i);
69 | CHECK_GT(stride_data[i], 0) << "Stride dimensions must be nonzero.";
70 | }
71 | }
72 | // Setup pad dimensions (pad_).
73 | pad_.Reshape(spatial_dim_blob_shape);
74 | int* pad_data = pad_.mutable_cpu_data();
75 | if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
76 | CHECK_EQ(num_spatial_axes_, 2)
77 | << "pad_h & pad_w can only be used for 2D convolution.";
78 | CHECK_EQ(0, conv_param.pad_size())
79 | << "Either pad or pad_h/w should be specified; not both.";
80 | pad_data[0] = conv_param.pad_h();
81 | pad_data[1] = conv_param.pad_w();
82 | } else {
83 | const int num_pad_dims = conv_param.pad_size();
84 | CHECK(num_pad_dims == 0 || num_pad_dims == 1 ||
85 | num_pad_dims == num_spatial_axes_)
86 | << "pad must be specified once, or once per spatial dimension "
87 | << "(pad specified " << num_pad_dims << " times; "
88 | << num_spatial_axes_ << " spatial dims).";
89 | const int kDefaultPad = 0;
90 | for (int i = 0; i < num_spatial_axes_; ++i) {
91 | pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
92 | conv_param.pad((num_pad_dims == 1) ? 0 : i);
93 | }
94 | }
95 | // Setup dilation dimensions (dilation_).
96 | dilation_.Reshape(spatial_dim_blob_shape);
97 | int* dilation_data = dilation_.mutable_cpu_data();
98 | const int num_dilation_dims = conv_param.dilation_size();
99 | CHECK(num_dilation_dims == 0 || num_dilation_dims == 1 ||
100 | num_dilation_dims == num_spatial_axes_)
101 | << "dilation must be specified once, or once per spatial dimension "
102 | << "(dilation specified " << num_dilation_dims << " times; "
103 | << num_spatial_axes_ << " spatial dims).";
104 | const int kDefaultDilation = 1;
105 | for (int i = 0; i < num_spatial_axes_; ++i) {
106 | dilation_data[i] = (num_dilation_dims == 0) ? kDefaultDilation :
107 | conv_param.dilation((num_dilation_dims == 1) ? 0 : i);
108 | }
109 | // Special case: im2col is the identity for 1x1 convolution with stride 1
110 | // and no padding, so flag for skipping the buffer and transformation.
111 | is_1x1_ = true;
112 | for (int i = 0; i < num_spatial_axes_; ++i) {
113 | is_1x1_ &=
114 | kernel_shape_data[i] == 1 && stride_data[i] == 1 && pad_data[i] == 0;
115 | if (!is_1x1_) { break; }
116 | }
117 | // Configure output channels and groups.
118 | channels_ = bottom[0]->shape(channel_axis_);
119 | num_output_ = this->layer_param_.convolution_param().num_output();
120 | CHECK_GT(num_output_, 0);
121 | group_ = this->layer_param_.convolution_param().group();
122 | CHECK_EQ(channels_ % group_, 0);
123 | CHECK_EQ(num_output_ % group_, 0)
124 | << "Number of output should be multiples of group.";
125 | if (reverse_dimensions()) {
126 | conv_out_channels_ = channels_;
127 | conv_in_channels_ = num_output_;
128 | } else {
129 | conv_out_channels_ = num_output_;
130 | conv_in_channels_ = channels_;
131 | }
132 | // Handle the parameters: weights and biases.
133 | // - blobs_[0] holds the filter weights
134 | // - blobs_[1] holds the biases (optional)
135 | vector<int> weight_shape(2);
136 | weight_shape[0] = conv_out_channels_;
137 | weight_shape[1] = conv_in_channels_ / group_;
138 | for (int i = 0; i < num_spatial_axes_; ++i) {
139 | weight_shape.push_back(kernel_shape_data[i]);
140 | }
141 | bias_term_ = this->layer_param_.convolution_param().bias_term();
142 | vector<int> bias_shape(bias_term_, num_output_);
143 | if (this->blobs_.size() > 0) {
144 | CHECK_EQ(1 + bias_term_, this->blobs_.size())
145 | << "Incorrect number of weight blobs.";
146 | if (weight_shape != this->blobs_[0]->shape()) {
147 | Blob<Dtype> weight_shaped_blob(weight_shape);
148 | LOG(FATAL) << "Incorrect weight shape: expected shape "
149 | << weight_shaped_blob.shape_string() << "; instead, shape was "
150 | << this->blobs_[0]->shape_string();
151 | }
152 | if (bias_term_ && bias_shape != this->blobs_[1]->shape()) {
153 | Blob<Dtype> bias_shaped_blob(bias_shape);
154 | LOG(FATAL) << "Incorrect bias shape: expected shape "
155 | << bias_shaped_blob.shape_string() << "; instead, shape was "
156 | << this->blobs_[1]->shape_string();
157 | }
158 | LOG(INFO) << "Skipping parameter initialization";
159 | } else {
160 | if (bias_term_) {
161 | this->blobs_.resize(2);
162 | } else {
163 | this->blobs_.resize(1);
164 | }
165 | // Initialize and fill the weights:
166 | // output channels x input channels per-group x kernel height x kernel width
167 | this->blobs_[0].reset(new Blob<Dtype>(weight_shape));
168 | shared_ptr<Filler<Dtype> > weight_filler(GetFiller<Dtype>(
169 | this->layer_param_.convolution_param().weight_filler()));
170 | weight_filler->Fill(this->blobs_[0].get());
171 | // If necessary, initialize and fill the biases.
172 | if (bias_term_) {
173 | this->blobs_[1].reset(new Blob<Dtype>(bias_shape));
174 | shared_ptr<Filler<Dtype> > bias_filler(GetFiller<Dtype>(
175 | this->layer_param_.convolution_param().bias_filler()));
176 | bias_filler->Fill(this->blobs_[1].get());
177 | }
178 | }
179 | kernel_dim_ = this->blobs_[0]->count(1);
180 | weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
181 | // Propagate gradients to the parameters (as directed by backward pass).
182 | this->param_propagate_down_.resize(this->blobs_.size(), true);
183 | }
184 |
185 | template <typename Dtype>
186 | void BaseConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
187 | const vector<Blob<Dtype>*>& top) {
188 | const int first_spatial_axis = channel_axis_ + 1;
189 | CHECK_EQ(bottom[0]->num_axes(), first_spatial_axis + num_spatial_axes_)
190 | << "bottom num_axes may not change.";
191 | num_ = bottom[0]->count(0, channel_axis_);
192 | CHECK_EQ(bottom[0]->shape(channel_axis_), channels_)
193 | << "Input size incompatible with convolution kernel.";
194 | // TODO: generalize to handle inputs of different shapes.
195 | for (int bottom_id = 1; bottom_id < bottom.size(); ++bottom_id) {
196 | CHECK(bottom[0]->shape() == bottom[bottom_id]->shape())
197 | << "All inputs must have the same shape.";
198 | }
199 | // Shape the tops.
200 | bottom_shape_ = &bottom[0]->shape();
201 | compute_output_shape();
202 | vector<int> top_shape(bottom[0]->shape().begin(),
203 | bottom[0]->shape().begin() + channel_axis_);
204 | top_shape.push_back(num_output_);
205 | for (int i = 0; i < num_spatial_axes_; ++i) {
206 | top_shape.push_back(output_shape_[i]);
207 | }
208 | for (int top_id = 0; top_id < top.size(); ++top_id) {
209 | top[top_id]->Reshape(top_shape);
210 | }
211 | if (reverse_dimensions()) {
212 | conv_out_spatial_dim_ = bottom[0]->count(first_spatial_axis);
213 | } else {
214 | conv_out_spatial_dim_ = top[0]->count(first_spatial_axis);
215 | }
216 | col_offset_ = kernel_dim_ * conv_out_spatial_dim_;
217 | output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
218 | // Setup input dimensions (conv_input_shape_).
219 | vector<int> bottom_dim_blob_shape(1, num_spatial_axes_ + 1);
220 | conv_input_shape_.Reshape(bottom_dim_blob_shape);
221 | int* conv_input_shape_data = conv_input_shape_.mutable_cpu_data();
222 | for (int i = 0; i < num_spatial_axes_ + 1; ++i) {
223 | if (reverse_dimensions()) {
224 | conv_input_shape_data[i] = top[0]->shape(channel_axis_ + i);
225 | } else {
226 | conv_input_shape_data[i] = bottom[0]->shape(channel_axis_ + i);
227 | }
228 | }
229 | // The im2col result buffer will only hold one image at a time to avoid
230 | // overly large memory usage. In the special case of 1x1 convolution
231 | // it goes lazily unused to save memory.
232 | col_buffer_shape_.clear();
233 | col_buffer_shape_.push_back(kernel_dim_ * group_);
234 | for (int i = 0; i < num_spatial_axes_; ++i) {
235 | if (reverse_dimensions()) {
236 | col_buffer_shape_.push_back(input_shape(i + 1));
237 | } else {
238 | col_buffer_shape_.push_back(output_shape_[i]);
239 | }
240 | }
241 | col_buffer_.Reshape(col_buffer_shape_);
242 | bottom_dim_ = bottom[0]->count(channel_axis_);
243 | top_dim_ = top[0]->count(channel_axis_);
244 | num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_;
245 | num_kernels_col2im_ = reverse_dimensions() ? top_dim_ : bottom_dim_;
246 | // Set up the all ones "bias multiplier" for adding biases by BLAS
247 | out_spatial_dim_ = top[0]->count(first_spatial_axis);
248 | if (bias_term_) {
249 | vector<int> bias_multiplier_shape(1, out_spatial_dim_);
250 | bias_multiplier_.Reshape(bias_multiplier_shape);
251 | caffe_set(bias_multiplier_.count(), Dtype(1),
252 | bias_multiplier_.mutable_cpu_data());
253 | }
254 | }
255 |
256 | template <typename Dtype>
257 | void BaseConvolutionLayer<Dtype>::forward_cpu_gemm(const Dtype* input,
258 | const Dtype* weights, Dtype* output, bool skip_im2col) {
259 | const Dtype* col_buff = input;
260 | if (!is_1x1_) {
261 | if (!skip_im2col) {
262 | conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
263 | }
264 | col_buff = col_buffer_.cpu_data();
265 | }
266 | for (int g = 0; g < group_; ++g) {
267 | caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
268 | group_, conv_out_spatial_dim_, kernel_dim_,
269 | (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
270 | (Dtype)0., output + output_offset_ * g, myflag);  // combine_relu: clamp this output tile at zero
271 | }
272 | }
273 |
274 | template <typename Dtype>
275 | void BaseConvolutionLayer<Dtype>::forward_cpu_bias(Dtype* output,
276 | const Dtype* bias) {
277 | caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
278 | out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.cpu_data(),
279 | (Dtype)1., output, myflag);  // combine_relu: clamp again after the bias is added
280 | }
281 |
282 | template <typename Dtype>
283 | void BaseConvolutionLayer<Dtype>::backward_cpu_gemm(const Dtype* output,
284 | const Dtype* weights, Dtype* input) {
285 | Dtype* col_buff = col_buffer_.mutable_cpu_data();
286 | if (is_1x1_) {
287 | col_buff = input;
288 | }
289 | for (int g = 0; g < group_; ++g) {
290 | caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
291 | conv_out_spatial_dim_, conv_out_channels_ / group_,
292 | (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
293 | (Dtype)0., col_buff + col_offset_ * g);
294 | }
295 | if (!is_1x1_) {
296 | conv_col2im_cpu(col_buff, input);
297 | }
298 | }
299 |
300 | template <typename Dtype>
301 | void BaseConvolutionLayer<Dtype>::weight_cpu_gemm(const Dtype* input,
302 | const Dtype* output, Dtype* weights) {
303 | const Dtype* col_buff = input;
304 | if (!is_1x1_) {
305 | conv_im2col_cpu(input, col_buffer_.mutable_cpu_data());
306 | col_buff = col_buffer_.cpu_data();
307 | }
308 | for (int g = 0; g < group_; ++g) {
309 | caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
310 | kernel_dim_, conv_out_spatial_dim_,
311 | (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
312 | (Dtype)1., weights + weight_offset_ * g);
313 | }
314 | }
315 |
316 | template <typename Dtype>
317 | void BaseConvolutionLayer<Dtype>::backward_cpu_bias(Dtype* bias,
318 | const Dtype* input) {
319 | caffe_cpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_, 1.,
320 | input, bias_multiplier_.cpu_data(), 1., bias);
321 | }
322 |
323 | #ifndef CPU_ONLY
324 |
325 | template <typename Dtype>
326 | void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
327 | const Dtype* weights, Dtype* output, bool skip_im2col) {
328 | const Dtype* col_buff = input;
329 | if (!is_1x1_) {
330 | if (!skip_im2col) {
331 | conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
332 | }
333 | col_buff = col_buffer_.gpu_data();
334 | }
335 | for (int g = 0; g < group_; ++g) {
336 | caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, conv_out_channels_ /
337 | group_, conv_out_spatial_dim_, kernel_dim_,
338 | (Dtype)1., weights + weight_offset_ * g, col_buff + col_offset_ * g,
339 | (Dtype)0., output + output_offset_ * g, myflag);
340 | }
341 | }
342 |
343 | template <typename Dtype>
344 | void BaseConvolutionLayer<Dtype>::forward_gpu_bias(Dtype* output,
345 | const Dtype* bias) {
346 | caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num_output_,
347 | out_spatial_dim_, 1, (Dtype)1., bias, bias_multiplier_.gpu_data(),
348 | (Dtype)1., output, myflag);
349 | }
350 |
351 | template <typename Dtype>
352 | void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
353 | const Dtype* weights, Dtype* input) {
354 | Dtype* col_buff = col_buffer_.mutable_gpu_data();
355 | if (is_1x1_) {
356 | col_buff = input;
357 | }
358 | for (int g = 0; g < group_; ++g) {
359 | caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, kernel_dim_,
360 | conv_out_spatial_dim_, conv_out_channels_ / group_,
361 | (Dtype)1., weights + weight_offset_ * g, output + output_offset_ * g,
362 | (Dtype)0., col_buff + col_offset_ * g);
363 | }
364 | if (!is_1x1_) {
365 | conv_col2im_gpu(col_buff, input);
366 | }
367 | }
368 |
369 | template <typename Dtype>
370 | void BaseConvolutionLayer<Dtype>::weight_gpu_gemm(const Dtype* input,
371 | const Dtype* output, Dtype* weights) {
372 | const Dtype* col_buff = input;
373 | if (!is_1x1_) {
374 | conv_im2col_gpu(input, col_buffer_.mutable_gpu_data());
375 | col_buff = col_buffer_.gpu_data();
376 | }
377 | for (int g = 0; g < group_; ++g) {
378 | caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, conv_out_channels_ / group_,
379 | kernel_dim_, conv_out_spatial_dim_,
380 | (Dtype)1., output + output_offset_ * g, col_buff + col_offset_ * g,
381 | (Dtype)1., weights + weight_offset_ * g);
382 | }
383 | }
384 |
385 | template <typename Dtype>
386 | void BaseConvolutionLayer<Dtype>::backward_gpu_bias(Dtype* bias,
387 | const Dtype* input) {
388 | caffe_gpu_gemv<Dtype>(CblasNoTrans, num_output_, out_spatial_dim_, 1.,
389 | input, bias_multiplier_.gpu_data(), 1., bias);
390 | }
391 |
392 | #endif // !CPU_ONLY
393 |
394 | INSTANTIATE_CLASS(BaseConvolutionLayer);
395 |
396 | } // namespace caffe
397 |
--------------------------------------------------------------------------------
/base_conv_layer.hpp:
--------------------------------------------------------------------------------
1 | #ifndef CAFFE_BASE_CONVOLUTION_LAYER_HPP_
2 | #define CAFFE_BASE_CONVOLUTION_LAYER_HPP_
3 |
4 | #include <vector>
5 |
6 | #include "caffe/blob.hpp"
7 | #include "caffe/layer.hpp"
8 | #include "caffe/proto/caffe.pb.h"
9 | #include "caffe/util/im2col.hpp"
10 |
11 | namespace caffe {
12 |
13 | /**
14 | * @brief Abstract base class that factors out the BLAS code common to
15 | * ConvolutionLayer and DeconvolutionLayer.
16 | */
17 | template <typename Dtype>
18 | class BaseConvolutionLayer : public Layer<Dtype> {
19 | public:
20 | explicit BaseConvolutionLayer(const LayerParameter& param)
21 | : Layer<Dtype>(param) {}
22 | virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
23 | const vector<Blob<Dtype>*>& top);
24 | virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
25 | const vector<Blob<Dtype>*>& top);
26 |
27 | virtual inline int MinBottomBlobs() const { return 1; }
28 | virtual inline int MinTopBlobs() const { return 1; }
29 | virtual inline bool EqualNumBottomTopBlobs() const { return true; }
30 |
31 | protected:
32 | // Helper functions that abstract away the column buffer and gemm arguments.
33 | // The last argument in forward_cpu_gemm is so that we can skip the im2col if
34 | // we just called weight_cpu_gemm with the same input.
35 | void forward_cpu_gemm(const Dtype* input, const Dtype* weights,
36 | Dtype* output, bool skip_im2col = false);
37 | void forward_cpu_bias(Dtype* output, const Dtype* bias);
38 | void backward_cpu_gemm(const Dtype* input, const Dtype* weights,
39 | Dtype* output);
40 | void weight_cpu_gemm(const Dtype* input, const Dtype* output, Dtype*
41 | weights);
42 | void backward_cpu_bias(Dtype* bias, const Dtype* input);
43 |
44 | #ifndef CPU_ONLY
45 | void forward_gpu_gemm(const Dtype* col_input, const Dtype* weights,
46 | Dtype* output, bool skip_im2col = false);
47 | void forward_gpu_bias(Dtype* output, const Dtype* bias);
48 | void backward_gpu_gemm(const Dtype* input, const Dtype* weights,
49 | Dtype* col_output);
50 | void weight_gpu_gemm(const Dtype* col_input, const Dtype* output, Dtype*
51 | weights);
52 | void backward_gpu_bias(Dtype* bias, const Dtype* input);
53 | #endif
54 |
55 | /// @brief The spatial dimensions of the input.
56 | inline int input_shape(int i) {
57 | return (*bottom_shape_)[channel_axis_ + i];
58 | }
59 | // reverse_dimensions should return true iff we are implementing deconv, so
60 | // that conv helpers know which dimensions are which.
61 | virtual bool reverse_dimensions() = 0;
62 | // Compute height_out_ and width_out_ from other parameters.
63 | virtual void compute_output_shape() = 0;
64 |
65 | /// @brief The spatial dimensions of a filter kernel.
66 | Blob<int> kernel_shape_;
67 | /// @brief The spatial dimensions of the stride.
68 | Blob<int> stride_;
69 | /// @brief The spatial dimensions of the padding.
70 | Blob<int> pad_;
71 | /// @brief The spatial dimensions of the dilation.
72 | Blob<int> dilation_;
73 | /// @brief The spatial dimensions of the convolution input.
74 | Blob<int> conv_input_shape_;
75 | /// @brief The spatial dimensions of the col_buffer.
76 | vector<int> col_buffer_shape_;
77 | /// @brief The spatial dimensions of the output.
78 | vector<int> output_shape_;
79 | const vector<int>* bottom_shape_;
80 |
81 | int num_spatial_axes_;
82 | int bottom_dim_;
83 | int top_dim_;
84 |
85 | int channel_axis_;
86 | int num_;
87 | int channels_;
88 | int group_;
89 | int out_spatial_dim_;
90 | int weight_offset_;
91 | int num_output_;
92 | bool bias_term_;
93 | bool is_1x1_;
94 | bool force_nd_im2col_;
95 | bool myflag;  // combine_relu from ConvolutionParameter: fuse ReLU into forward GEMMs
96 |
97 | private:
98 | // wrap im2col/col2im so we don't have to remember the (long) argument lists
99 | inline void conv_im2col_cpu(const Dtype* data, Dtype* col_buff) {
100 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
101 | im2col_cpu(data, conv_in_channels_,
102 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
103 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
104 | pad_.cpu_data()[0], pad_.cpu_data()[1],
105 | stride_.cpu_data()[0], stride_.cpu_data()[1],
106 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
107 | } else {
108 | im2col_nd_cpu(data, num_spatial_axes_, conv_input_shape_.cpu_data(),
109 | col_buffer_shape_.data(), kernel_shape_.cpu_data(),
110 | pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), col_buff);
111 | }
112 | }
113 | inline void conv_col2im_cpu(const Dtype* col_buff, Dtype* data) {
114 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
115 | col2im_cpu(col_buff, conv_in_channels_,
116 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
117 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
118 | pad_.cpu_data()[0], pad_.cpu_data()[1],
119 | stride_.cpu_data()[0], stride_.cpu_data()[1],
120 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
121 | } else {
122 | col2im_nd_cpu(col_buff, num_spatial_axes_, conv_input_shape_.cpu_data(),
123 | col_buffer_shape_.data(), kernel_shape_.cpu_data(),
124 | pad_.cpu_data(), stride_.cpu_data(), dilation_.cpu_data(), data);
125 | }
126 | }
127 | #ifndef CPU_ONLY
128 | inline void conv_im2col_gpu(const Dtype* data, Dtype* col_buff) {
129 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
130 | im2col_gpu(data, conv_in_channels_,
131 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
132 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
133 | pad_.cpu_data()[0], pad_.cpu_data()[1],
134 | stride_.cpu_data()[0], stride_.cpu_data()[1],
135 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], col_buff);
136 | } else {
137 | im2col_nd_gpu(data, num_spatial_axes_, num_kernels_im2col_,
138 | conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
139 | kernel_shape_.gpu_data(), pad_.gpu_data(),
140 | stride_.gpu_data(), dilation_.gpu_data(), col_buff);
141 | }
142 | }
143 | inline void conv_col2im_gpu(const Dtype* col_buff, Dtype* data) {
144 | if (!force_nd_im2col_ && num_spatial_axes_ == 2) {
145 | col2im_gpu(col_buff, conv_in_channels_,
146 | conv_input_shape_.cpu_data()[1], conv_input_shape_.cpu_data()[2],
147 | kernel_shape_.cpu_data()[0], kernel_shape_.cpu_data()[1],
148 | pad_.cpu_data()[0], pad_.cpu_data()[1],
149 | stride_.cpu_data()[0], stride_.cpu_data()[1],
150 | dilation_.cpu_data()[0], dilation_.cpu_data()[1], data);
151 | } else {
152 | col2im_nd_gpu(col_buff, num_spatial_axes_, num_kernels_col2im_,
153 | conv_input_shape_.gpu_data(), col_buffer_.gpu_shape(),
154 | kernel_shape_.gpu_data(), pad_.gpu_data(), stride_.gpu_data(),
155 | dilation_.gpu_data(), data);
156 | }
157 | }
158 | #endif
159 |
160 | int num_kernels_im2col_;
161 | int num_kernels_col2im_;
162 | int conv_out_channels_;
163 | int conv_in_channels_;
164 | int conv_out_spatial_dim_;
165 | int kernel_dim_;
166 | int col_offset_;
167 | int output_offset_;
168 |
169 | Blob<Dtype> col_buffer_;
170 | Blob<Dtype> bias_multiplier_;
171 | };
172 |
173 | } // namespace caffe
174 |
175 | #endif // CAFFE_BASE_CONVOLUTION_LAYER_HPP_
176 |
--------------------------------------------------------------------------------
/doc/ZTE_challenge_2019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anlongstory/caffe_combine_relu/7eb9762b475f1efa9b479b3f16a715101519f905/doc/ZTE_challenge_2019.pdf
--------------------------------------------------------------------------------
/math_functions.cpp:
--------------------------------------------------------------------------------
1 | #include <boost/math/special_functions/next.hpp>
2 | #include <boost/random.hpp>
3 |
4 | #include <limits>
5 |
6 | #include "caffe/common.hpp"
7 | #include "caffe/util/math_functions.hpp"
8 | #include "caffe/util/rng.hpp"
9 |
10 | namespace caffe {
11 |
12 | template<>
13 | void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
14 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
15 | const float alpha, const float* A, const float* B, const float beta,
16 | float* C, const bool myflag) {
17 | int lda = (TransA == CblasNoTrans) ? K : M;
18 | int ldb = (TransB == CblasNoTrans) ? N : K;
19 | cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
20 | ldb, beta, C, N);
21 | if (myflag)  // combine_relu: clamp the GEMM result at zero (fused ReLU)
22 | {
23 | for (int i = 0; i < M*N; i++)
24 | {
25 | *(C + i) = std::max(*(C + i), float(0.));
26 | }
27 | }
28 | }
29 |
30 | template<>
31 | void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
32 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
33 | const double alpha, const double* A, const double* B, const double beta,
34 | double* C, const bool myflag) {
35 | int lda = (TransA == CblasNoTrans) ? K : M;
36 | int ldb = (TransB == CblasNoTrans) ? N : K;
37 | cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
38 | ldb, beta, C, N);
39 | if (myflag)
40 | {
41 | for (int i = 0; i < M*N; i++)
42 | {
43 | *(C + i) = std::max(*(C + i), double(0.));
44 | }
45 | }
46 | }
47 |
48 | template <>
49 | void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
50 | const int N, const float alpha, const float* A, const float* x,
51 | const float beta, float* y) {
52 | cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
53 | }
54 |
55 | template <>
56 | void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
57 | const int N, const double alpha, const double* A, const double* x,
58 | const double beta, double* y) {
59 | cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
60 | }
61 |
62 | template <>
63 | void caffe_axpy<float>(const int N, const float alpha, const float* X,
64 | float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); }
65 |
66 | template <>
67 | void caffe_axpy<double>(const int N, const double alpha, const double* X,
68 | double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
69 |
70 | template <typename Dtype>
71 | void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
72 | if (alpha == 0) {
73 | memset(Y, 0, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn)
74 | return;
75 | }
76 | for (int i = 0; i < N; ++i) {
77 | Y[i] = alpha;
78 | }
79 | }
80 |
81 | template void caffe_set<int>(const int N, const int alpha, int* Y);
82 | template void caffe_set<float>(const int N, const float alpha, float* Y);
83 | template void caffe_set<double>(const int N, const double alpha, double* Y);
84 |
85 | template <>
86 | void caffe_add_scalar(const int N, const float alpha, float* Y) {
87 | for (int i = 0; i < N; ++i) {
88 | Y[i] += alpha;
89 | }
90 | }
91 |
92 | template <>
93 | void caffe_add_scalar(const int N, const double alpha, double* Y) {
94 | for (int i = 0; i < N; ++i) {
95 | Y[i] += alpha;
96 | }
97 | }
98 |
99 | template <typename Dtype>
100 | void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
101 | if (X != Y) {
102 | if (Caffe::mode() == Caffe::GPU) {
103 | #ifndef CPU_ONLY
104 | // NOLINT_NEXT_LINE(caffe/alt_fn)
105 | CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
106 | #else
107 | NO_GPU;
108 | #endif
109 | } else {
110 | memcpy(Y, X, sizeof(Dtype) * N); // NOLINT(caffe/alt_fn)
111 | }
112 | }
113 | }
114 |
115 | template void caffe_copy<int>(const int N, const int* X, int* Y);
116 | template void caffe_copy<unsigned int>(const int N, const unsigned int* X,
117 | unsigned int* Y);
118 | template void caffe_copy<float>(const int N, const float* X, float* Y);
119 | template void caffe_copy<double>(const int N, const double* X, double* Y);
120 |
121 | template <>
122 | void caffe_scal<float>(const int N, const float alpha, float *X) {
123 | cblas_sscal(N, alpha, X, 1);
124 | }
125 |
126 | template <>
127 | void caffe_scal<double>(const int N, const double alpha, double *X) {
128 | cblas_dscal(N, alpha, X, 1);
129 | }
130 |
131 | template <>
132 | void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
133 | const float beta, float* Y) {
134 | cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
135 | }
136 |
137 | template <>
138 | void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
139 | const double beta, double* Y) {
140 | cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
141 | }
142 |
143 | template <>
144 | void caffe_add<float>(const int n, const float* a, const float* b,
145 | float* y) {
146 | vsAdd(n, a, b, y);
147 | }
148 |
149 | template <>
150 | void caffe_add<double>(const int n, const double* a, const double* b,
151 | double* y) {
152 | vdAdd(n, a, b, y);
153 | }
154 |
155 | template <>
156 | void caffe_sub<float>(const int n, const float* a, const float* b,
157 | float* y) {
158 | vsSub(n, a, b, y);
159 | }
160 |
161 | template <>
162 | void caffe_sub<double>(const int n, const double* a, const double* b,
163 | double* y) {
164 | vdSub(n, a, b, y);
165 | }
166 |
167 | template <>
168 | void caffe_mul<float>(const int n, const float* a, const float* b,
169 | float* y) {
170 | vsMul(n, a, b, y);
171 | }
172 |
173 | template <>
174 | void caffe_mul<double>(const int n, const double* a, const double* b,
175 | double* y) {
176 | vdMul(n, a, b, y);
177 | }
178 |
179 | template <>
180 | void caffe_div<float>(const int n, const float* a, const float* b,
181 | float* y) {
182 | vsDiv(n, a, b, y);
183 | }
184 |
185 | template <>
186 | void caffe_div<double>(const int n, const double* a, const double* b,
187 | double* y) {
188 | vdDiv(n, a, b, y);
189 | }
190 |
191 | template <>
192 | void caffe_powx<float>(const int n, const float* a, const float b,
193 | float* y) {
194 | vsPowx(n, a, b, y);
195 | }
196 |
197 | template <>
198 | void caffe_powx<double>(const int n, const double* a, const double b,
199 | double* y) {
200 | vdPowx(n, a, b, y);
201 | }
202 |
203 | template <>
204 | void caffe_sqr<float>(const int n, const float* a, float* y) {
205 | vsSqr(n, a, y);
206 | }
207 |
208 | template <>
209 | void caffe_sqr<double>(const int n, const double* a, double* y) {
210 | vdSqr(n, a, y);
211 | }
212 |
213 | template <>
214 | void caffe_sqrt<float>(const int n, const float* a, float* y) {
215 | vsSqrt(n, a, y);
216 | }
217 |
218 | template <>
219 | void caffe_sqrt<double>(const int n, const double* a, double* y) {
220 | vdSqrt(n, a, y);
221 | }
222 |
223 | template <>
224 | void caffe_exp<float>(const int n, const float* a, float* y) {
225 | vsExp(n, a, y);
226 | }
227 |
228 | template <>
229 | void caffe_exp<double>(const int n, const double* a, double* y) {
230 | vdExp(n, a, y);
231 | }
232 |
233 | template <>
234 | void caffe_log<float>(const int n, const float* a, float* y) {
235 | vsLn(n, a, y);
236 | }
237 |
238 | template <>
239 | void caffe_log<double>(const int n, const double* a, double* y) {
240 | vdLn(n, a, y);
241 | }
242 |
243 | template <>
244 | void caffe_abs<float>(const int n, const float* a, float* y) {
245 | vsAbs(n, a, y);
246 | }
247 |
248 | template <>
249 | void caffe_abs<double>(const int n, const double* a, double* y) {
250 | vdAbs(n, a, y);
251 | }
252 |
253 | unsigned int caffe_rng_rand() {
254 | return (*caffe_rng())();
255 | }
256 |
257 | template <typename Dtype>
258 | Dtype caffe_nextafter(const Dtype b) {
259 | return boost::math::nextafter<Dtype>(
260 | b, std::numeric_limits<Dtype>::max());
261 | }
262 |
263 | template
264 | float caffe_nextafter(const float b);
265 |
266 | template
267 | double caffe_nextafter(const double b);
268 |
269 | template <typename Dtype>
270 | void caffe_rng_uniform(const int n, const Dtype a, const Dtype b, Dtype* r) {
271 | CHECK_GE(n, 0);
272 | CHECK(r);
273 | CHECK_LE(a, b);
274 | boost::uniform_real<Dtype> random_distribution(a, caffe_nextafter<Dtype>(b));
275 | boost::variate_generator<caffe::rng_t*, boost::uniform_real<Dtype> >
276 | variate_generator(caffe_rng(), random_distribution);
277 | for (int i = 0; i < n; ++i) {
278 | r[i] = variate_generator();
279 | }
280 | }
281 |
282 | template
283 | void caffe_rng_uniform<float>(const int n, const float a, const float b,
284 | float* r);
285 |
286 | template
287 | void caffe_rng_uniform<double>(const int n, const double a, const double b,
288 | double* r);
289 |
290 | template <typename Dtype>
291 | void caffe_rng_gaussian(const int n, const Dtype a,
292 | const Dtype sigma, Dtype* r) {
293 | CHECK_GE(n, 0);
294 | CHECK(r);
295 | CHECK_GT(sigma, 0);
296 | boost::normal_distribution<Dtype> random_distribution(a, sigma);
297 | boost::variate_generator<caffe::rng_t*, boost::normal_distribution<Dtype> >
298 | variate_generator(caffe_rng(), random_distribution);
299 | for (int i = 0; i < n; ++i) {
300 | r[i] = variate_generator();
301 | }
302 | }
303 |
304 | template
305 | void caffe_rng_gaussian<float>(const int n, const float mu,
306 | const float sigma, float* r);
307 |
308 | template
309 | void caffe_rng_gaussian<double>(const int n, const double mu,
310 | const double sigma, double* r);
311 |
312 | template <typename Dtype>
313 | void caffe_rng_bernoulli(const int n, const Dtype p, int* r) {
314 | CHECK_GE(n, 0);
315 | CHECK(r);
316 | CHECK_GE(p, 0);
317 | CHECK_LE(p, 1);
318 | boost::bernoulli_distribution<Dtype> random_distribution(p);
319 | boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
320 | variate_generator(caffe_rng(), random_distribution);
321 | for (int i = 0; i < n; ++i) {
322 | r[i] = variate_generator();
323 | }
324 | }
325 |
326 | template
327 | void caffe_rng_bernoulli<double>(const int n, const double p, int* r);
328 |
329 | template
330 | void caffe_rng_bernoulli<float>(const int n, const float p, int* r);
331 |
332 | template <typename Dtype>
333 | void caffe_rng_bernoulli(const int n, const Dtype p, unsigned int* r) {
334 | CHECK_GE(n, 0);
335 | CHECK(r);
336 | CHECK_GE(p, 0);
337 | CHECK_LE(p, 1);
338 | boost::bernoulli_distribution<Dtype> random_distribution(p);
339 | boost::variate_generator<caffe::rng_t*, boost::bernoulli_distribution<Dtype> >
340 | variate_generator(caffe_rng(), random_distribution);
341 | for (int i = 0; i < n; ++i) {
342 | r[i] = static_cast<unsigned int>(variate_generator());
343 | }
344 | }
345 |
346 | template
347 | void caffe_rng_bernoulli<double>(const int n, const double p, unsigned int* r);
348 |
349 | template
350 | void caffe_rng_bernoulli<float>(const int n, const float p, unsigned int* r);
351 |
352 | template <>
353 | float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
354 | const float* y, const int incy) {
355 | return cblas_sdot(n, x, incx, y, incy);
356 | }
357 |
358 | template <>
359 | double caffe_cpu_strided_dot<double>(const int n, const double* x,
360 | const int incx, const double* y, const int incy) {
361 | return cblas_ddot(n, x, incx, y, incy);
362 | }
363 |
364 | template <typename Dtype>
365 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) {
366 | return caffe_cpu_strided_dot(n, x, 1, y, 1);
367 | }
368 |
369 | template
370 | float caffe_cpu_dot<float>(const int n, const float* x, const float* y);
371 |
372 | template
373 | double caffe_cpu_dot<double>(const int n, const double* x, const double* y);
374 |
375 | template <>
376 | float caffe_cpu_asum<float>(const int n, const float* x) {
377 | return cblas_sasum(n, x, 1);
378 | }
379 |
380 | template <>
381 | double caffe_cpu_asum<double>(const int n, const double* x) {
382 | return cblas_dasum(n, x, 1);
383 | }
384 |
385 | template <>
386 | void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
387 | float* y) {
388 | cblas_scopy(n, x, 1, y, 1);
389 | cblas_sscal(n, alpha, y, 1);
390 | }
391 |
392 | template <>
393 | void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
394 | double* y) {
395 | cblas_dcopy(n, x, 1, y, 1);
396 | cblas_dscal(n, alpha, y, 1);
397 | }
398 |
399 | } // namespace caffe
400 |
--------------------------------------------------------------------------------
/math_functions.cu:
--------------------------------------------------------------------------------
1 | #include <math_functions.h>  // CUDA's, not caffe's, for fabs, signbit
2 | #include <thrust/device_vector.h>
3 | #include <thrust/functional.h>  // thrust::plus
4 | #include <thrust/reduce.h>
5 |
6 | #include <vector>
7 |
8 | #include "caffe/common.hpp"
9 | #include "caffe/util/math_functions.hpp"
10 |
11 | namespace caffe {
12 |
13 | template <typename Dtype>
14 | __global__ void ReLUForward2(const int n, const Dtype* in, Dtype* out) {
15 | CUDA_KERNEL_LOOP(index, n) {
16 | out[index] = in[index] > 0 ? in[index] : 0;
17 | }
18 | }
19 |
20 | template <>
21 | void caffe_gpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
22 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
23 | const float alpha, const float* A, const float* B, const float beta,
24 | float* C, const bool myflag) {
25 | // Note that cublas follows fortran order.
26 | int lda = (TransA == CblasNoTrans) ? K : M;
27 | int ldb = (TransB == CblasNoTrans) ? N : K;
28 | cublasOperation_t cuTransA =
29 | (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
30 | cublasOperation_t cuTransB =
31 | (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
32 | CUBLAS_CHECK(cublasSgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
33 | N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
34 |
35 | if (myflag)  // combine_relu: clamp the GEMM output at zero, in place
36 | {
37 | ReLUForward2<float><<<CAFFE_GET_BLOCKS(M*N), CAFFE_CUDA_NUM_THREADS>>>(
38 | M*N, C, C);
39 | CUDA_POST_KERNEL_CHECK;
40 | }
41 | }
42 |
43 | template <>
44 | void caffe_gpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
45 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
46 | const double alpha, const double* A, const double* B, const double beta,
47 | double* C, const bool myflag) {
48 | // Note that cublas follows fortran order.
49 | int lda = (TransA == CblasNoTrans) ? K : M;
50 | int ldb = (TransB == CblasNoTrans) ? N : K;
51 | cublasOperation_t cuTransA =
52 | (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
53 | cublasOperation_t cuTransB =
54 | (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
55 | CUBLAS_CHECK(cublasDgemm(Caffe::cublas_handle(), cuTransB, cuTransA,
56 | N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
57 | if (myflag)
58 | {
59 | ReLUForward2<double><<<CAFFE_GET_BLOCKS(M*N), CAFFE_CUDA_NUM_THREADS>>>(
60 | M*N, C, C);
61 | CUDA_POST_KERNEL_CHECK;
62 | }
63 | }
64 |
65 | template <>
66 | void caffe_gpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
67 | const int N, const float alpha, const float* A, const float* x,
68 | const float beta, float* y) {
69 | cublasOperation_t cuTransA =
70 | (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
71 | CUBLAS_CHECK(cublasSgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
72 | A, N, x, 1, &beta, y, 1));
73 | }
74 |
75 | template <>
76 | void caffe_gpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
77 | const int N, const double alpha, const double* A, const double* x,
78 | const double beta, double* y) {
79 | cublasOperation_t cuTransA =
80 | (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
81 | CUBLAS_CHECK(cublasDgemv(Caffe::cublas_handle(), cuTransA, N, M, &alpha,
82 | A, N, x, 1, &beta, y, 1));
83 | }
84 |
85 | template <>
86 | void caffe_gpu_axpy<float>(const int N, const float alpha, const float* X,
87 | float* Y) {
88 | CUBLAS_CHECK(cublasSaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
89 | }
90 |
91 | template <>
92 | void caffe_gpu_axpy<double>(const int N, const double alpha, const double* X,
93 | double* Y) {
94 | CUBLAS_CHECK(cublasDaxpy(Caffe::cublas_handle(), N, &alpha, X, 1, Y, 1));
95 | }
96 |
97 | void caffe_gpu_memcpy(const size_t N, const void* X, void* Y) {
98 | if (X != Y) {
99 | CUDA_CHECK(cudaMemcpy(Y, X, N, cudaMemcpyDefault)); // NOLINT(caffe/alt_fn)
100 | }
101 | }
102 |
103 | template <>
104 | void caffe_gpu_scal<float>(const int N, const float alpha, float *X) {
105 | CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
106 | }
107 |
108 | template <>
109 | void caffe_gpu_scal<double>(const int N, const double alpha, double *X) {
110 | CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
111 | }
112 |
113 | template <>
114 | void caffe_gpu_scal<float>(const int N, const float alpha, float* X,
115 | cudaStream_t str) {
116 | cudaStream_t initial_stream;
117 | CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream));
118 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str));
119 | CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), N, &alpha, X, 1));
120 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream));
121 | }
122 |
123 | template <>
124 | void caffe_gpu_scal<double>(const int N, const double alpha, double* X,
125 | cudaStream_t str) {
126 | cudaStream_t initial_stream;
127 | CUBLAS_CHECK(cublasGetStream(Caffe::cublas_handle(), &initial_stream));
128 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), str));
129 | CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), N, &alpha, X, 1));
130 | CUBLAS_CHECK(cublasSetStream(Caffe::cublas_handle(), initial_stream));
131 | }
132 |
133 | template <>
134 | void caffe_gpu_axpby<float>(const int N, const float alpha, const float* X,
135 | const float beta, float* Y) {
136 | caffe_gpu_scal<float>(N, beta, Y);
137 | caffe_gpu_axpy<float>(N, alpha, X, Y);
138 | }
139 |
140 | template <>
141 | void caffe_gpu_axpby<double>(const int N, const double alpha, const double* X,
142 | const double beta, double* Y) {
143 | caffe_gpu_scal<double>(N, beta, Y);
144 | caffe_gpu_axpy<double>(N, alpha, X, Y);
145 | }
146 |
147 | template <>
148 | void caffe_gpu_dot<float>(const int n, const float* x, const float* y,
149 | float* out) {
150 | CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
151 | }
152 |
153 | template <>
154 | void caffe_gpu_dot<double>(const int n, const double* x, const double* y,
155 | double * out) {
156 | CUBLAS_CHECK(cublasDdot(Caffe::cublas_handle(), n, x, 1, y, 1, out));
157 | }
158 |
159 | template <>
160 | void caffe_gpu_asum<float>(const int n, const float* x, float* y) {
161 | CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y));
162 | }
163 |
164 | template <>
165 | void caffe_gpu_asum<double>(const int n, const double* x, double* y) {
166 | CUBLAS_CHECK(cublasDasum(Caffe::cublas_handle(), n, x, 1, y));
167 | }
168 |
169 | template <>
170 | void caffe_gpu_scale<float>(const int n, const float alpha, const float *x,
171 | float* y) {
172 | CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1));
173 | CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1));
174 | }
175 |
176 | template <>
177 | void caffe_gpu_scale<double>(const int n, const double alpha, const double *x,
178 | double* y) {
179 | CUBLAS_CHECK(cublasDcopy(Caffe::cublas_handle(), n, x, 1, y, 1));
180 | CUBLAS_CHECK(cublasDscal(Caffe::cublas_handle(), n, &alpha, y, 1));
181 | }
182 |
183 | template <typename Dtype>
184 | __global__ void set_kernel(const int n, const Dtype alpha, Dtype* y) {
185 | CUDA_KERNEL_LOOP(index, n) {
186 | y[index] = alpha;
187 | }
188 | }
189 |
190 | template <typename Dtype>
191 | void caffe_gpu_set(const int N, const Dtype alpha, Dtype* Y) {
192 | if (alpha == 0) {
193 | CUDA_CHECK(cudaMemset(Y, 0, sizeof(Dtype) * N)); // NOLINT(caffe/alt_fn)
194 | return;
195 | }
196 | // NOLINT_NEXT_LINE(whitespace/operators)
197 | set_kernel<Dtype><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
198 | N, alpha, Y);
199 | }
200 |
201 | template void caffe_gpu_set<int>(const int N, const int alpha, int* Y);
202 | template void caffe_gpu_set<float>(const int N, const float alpha, float* Y);
203 | template void caffe_gpu_set<double>(const int N, const double alpha, double* Y);
204 |
205 | template <typename Dtype>
206 | __global__ void add_scalar_kernel(const int n, const Dtype alpha, Dtype* y) {
207 | CUDA_KERNEL_LOOP(index, n) {
208 | y[index] += alpha;
209 | }
210 | }
211 |
212 | template <>
213 | void caffe_gpu_add_scalar(const int N, const float alpha, float* Y) {
214 | // NOLINT_NEXT_LINE(whitespace/operators)
215 | add_scalar_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
216 | N, alpha, Y);
217 | }
218 |
219 | template <>
220 | void caffe_gpu_add_scalar(const int N, const double alpha, double* Y) {
221 | // NOLINT_NEXT_LINE(whitespace/operators)
222 | add_scalar_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
223 | N, alpha, Y);
224 | }
225 |
226 | template <typename Dtype>
227 | __global__ void add_kernel(const int n, const Dtype* a,
228 | const Dtype* b, Dtype* y) {
229 | CUDA_KERNEL_LOOP(index, n) {
230 | y[index] = a[index] + b[index];
231 | }
232 | }
233 |
234 | template <>
235 | void caffe_gpu_add<float>(const int N, const float* a, const float* b,
236 | float* y) {
237 | // NOLINT_NEXT_LINE(whitespace/operators)
238 | add_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
239 | N, a, b, y);
240 | }
241 |
242 | template <>
243 | void caffe_gpu_add<double>(const int N, const double* a, const double* b,
244 | double* y) {
245 | // NOLINT_NEXT_LINE(whitespace/operators)
246 | add_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
247 | N, a, b, y);
248 | }
249 |
250 | template <typename Dtype>
251 | __global__ void sub_kernel(const int n, const Dtype* a,
252 | const Dtype* b, Dtype* y) {
253 | CUDA_KERNEL_LOOP(index, n) {
254 | y[index] = a[index] - b[index];
255 | }
256 | }
257 |
258 | template <>
259 | void caffe_gpu_sub<float>(const int N, const float* a, const float* b,
260 | float* y) {
261 | // NOLINT_NEXT_LINE(whitespace/operators)
262 | sub_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
263 | N, a, b, y);
264 | }
265 |
266 | template <>
267 | void caffe_gpu_sub<double>(const int N, const double* a, const double* b,
268 | double* y) {
269 | // NOLINT_NEXT_LINE(whitespace/operators)
270 | sub_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
271 | N, a, b, y);
272 | }
273 |
274 | template <typename Dtype>
275 | __global__ void mul_kernel(const int n, const Dtype* a,
276 | const Dtype* b, Dtype* y) {
277 | CUDA_KERNEL_LOOP(index, n) {
278 | y[index] = a[index] * b[index];
279 | }
280 | }
281 |
282 | template <>
283 | void caffe_gpu_mul<float>(const int N, const float* a,
284 | const float* b, float* y) {
285 | // NOLINT_NEXT_LINE(whitespace/operators)
286 | mul_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
287 | N, a, b, y);
288 | }
289 |
290 | template <>
291 | void caffe_gpu_mul<double>(const int N, const double* a,
292 | const double* b, double* y) {
293 | // NOLINT_NEXT_LINE(whitespace/operators)
294 | mul_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
295 | N, a, b, y);
296 | }
297 |
298 | template <typename Dtype>
299 | __global__ void div_kernel(const int n, const Dtype* a,
300 | const Dtype* b, Dtype* y) {
301 | CUDA_KERNEL_LOOP(index, n) {
302 | y[index] = a[index] / b[index];
303 | }
304 | }
305 |
306 | template <>
307 | void caffe_gpu_div<float>(const int N, const float* a,
308 | const float* b, float* y) {
309 | // NOLINT_NEXT_LINE(whitespace/operators)
310 | div_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
311 | N, a, b, y);
312 | }
313 |
314 | template <>
315 | void caffe_gpu_div<double>(const int N, const double* a,
316 | const double* b, double* y) {
317 | // NOLINT_NEXT_LINE(whitespace/operators)
318 | div_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
319 | N, a, b, y);
320 | }
321 |
322 | template <typename Dtype>
323 | __global__ void abs_kernel(const int n, const Dtype* a, Dtype* y) {
324 | CUDA_KERNEL_LOOP(index, n) {
325 | y[index] = abs(a[index]);
326 | }
327 | }
328 |
329 | template <>
330 | void caffe_gpu_abs<float>(const int N, const float* a, float* y) {
331 | // NOLINT_NEXT_LINE(whitespace/operators)
332 | abs_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
333 | N, a, y);
334 | }
335 |
336 | template <>
337 | void caffe_gpu_abs<double>(const int N, const double* a, double* y) {
338 | // NOLINT_NEXT_LINE(whitespace/operators)
339 | abs_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
340 | N, a, y);
341 | }
342 |
343 |
344 | template <typename Dtype>
345 | __global__ void exp_kernel(const int n, const Dtype* a, Dtype* y) {
346 | CUDA_KERNEL_LOOP(index, n) {
347 | y[index] = exp(a[index]);
348 | }
349 | }
350 |
351 | template <>
352 | void caffe_gpu_exp<float>(const int N, const float* a, float* y) {
353 | // NOLINT_NEXT_LINE(whitespace/operators)
354 | exp_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
355 | N, a, y);
356 | }
357 |
358 | template <>
359 | void caffe_gpu_exp<double>(const int N, const double* a, double* y) {
360 | // NOLINT_NEXT_LINE(whitespace/operators)
361 | exp_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
362 | N, a, y);
363 | }
364 |
365 | template <typename Dtype>
366 | __global__ void log_kernel(const int n, const Dtype* a, Dtype* y) {
367 | CUDA_KERNEL_LOOP(index, n) {
368 | y[index] = log(a[index]);
369 | }
370 | }
371 |
372 | template <>
373 | void caffe_gpu_log<float>(const int N, const float* a, float* y) {
374 | // NOLINT_NEXT_LINE(whitespace/operators)
375 | log_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
376 | N, a, y);
377 | }
378 |
379 | template <>
380 | void caffe_gpu_log<double>(const int N, const double* a, double* y) {
381 | // NOLINT_NEXT_LINE(whitespace/operators)
382 | log_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
383 | N, a, y);
384 | }
385 |
386 | template <typename Dtype>
387 | __global__ void powx_kernel(const int n, const Dtype* a,
388 | const Dtype alpha, Dtype* y) {
389 | CUDA_KERNEL_LOOP(index, n) {
390 | y[index] = pow(a[index], alpha);
391 | }
392 | }
393 |
394 | template <>
395 | void caffe_gpu_powx<float>(const int N, const float* a,
396 | const float alpha, float* y) {
397 | // NOLINT_NEXT_LINE(whitespace/operators)
398 | powx_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
399 | N, a, alpha, y);
400 | }
401 |
402 | template <>
403 | void caffe_gpu_powx<double>(const int N, const double* a,
404 | const double alpha, double* y) {
405 | // NOLINT_NEXT_LINE(whitespace/operators)
406 | powx_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
407 | N, a, alpha, y);
408 | }
409 |
410 | template <typename Dtype>
411 | __global__ void sqrt_kernel(const int n, const Dtype* a, Dtype* y) {
412 | CUDA_KERNEL_LOOP(index, n) {
413 | y[index] = sqrt(a[index]);
414 | }
415 | }
416 |
417 | template <>
418 | void caffe_gpu_sqrt<float>(const int N, const float* a, float* y) {
419 | // NOLINT_NEXT_LINE(whitespace/operators)
420 | sqrt_kernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(
421 | N, a, y);
422 | }
423 |
424 | template <>
425 | void caffe_gpu_sqrt<double>(const int N, const double* a, double* y) {
426 | // NOLINT_NEXT_LINE(whitespace/operators)
427 | sqrt_kernel<double><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS>>>(