├── README.md ├── src ├── utils.cu ├── user_iface.cu ├── mnist_test.cu ├── solver.cu ├── slow_conv.cu ├── vgg_test.cu ├── main.cu ├── alexnet_test.cu ├── neural_net_time.cu └── layer_params.cu ├── CMakeLists.txt ├── cnmem ├── LICENSE ├── CMakeLists.txt ├── README.md └── include │ └── cnmem.h └── include ├── solver.h ├── neural_net.h ├── user_iface.h ├── utils.h ├── layer_params.h └── cnmem.h /README.md: -------------------------------------------------------------------------------- 1 | # vDNN 2 | My implementation of the paper titled **vDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efficient Neural Network Design** (https://arxiv.org/abs/1602.08124). Supports only linear networks currently. 3 | 4 | cnmem/ is a software-side memory manager by Nvidia (https://github.com/NVIDIA/cnmem). The original source has been modified to use heuristics other than best-fit. 5 | 6 | ### Instructions to set up 7 | Run cmake and make inside ./cnmem/ as well as in the repository root (./). Look at vgg_test.cu for an example of specifying and training a neural network. A new program has to be added to ./CMakeLists.txt; see how vgg_test.cu has been added there for an example. Essential API function declarations for training are in include/user_iface.h and include/solver.h. 8 | -------------------------------------------------------------------------------- /src/utils.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | int LayerDimension::getTotalSize() { 4 | return N * C * H * W; 5 | } 6 | 7 | void outOfMemory() { 8 | std::cout << "Out of Memory\n"; 9 | exit(0); 10 | } 11 | 12 | CnmemSpace::CnmemSpace(size_t free_bytes) { 13 | this->free_bytes = free_bytes; 14 | this->initial_free_bytes = free_bytes; 15 | this->out_of_memory = false; 16 | } 17 | 18 | void CnmemSpace::updateSpace(CnmemSpace::Op op, size_t size) { 19 | 20 | if (op == ADD) 21 | free_bytes += ceil(1.0 * size / CNMEM_GRANULARITY) * CNMEM_GRANULARITY; 22 | else if (op == SUB) { 23 | size_t required_space = ceil(1.0 * size / CNMEM_GRANULARITY) * CNMEM_GRANULARITY; 24 | if (required_space > free_bytes) 25 | this->out_of_memory = true; 26 | free_bytes -= required_space; 27 | } 28 | } 29 | 30 | bool CnmemSpace::isAvailable() { 31 | return !out_of_memory; 32 | } 33 | 34 | size_t CnmemSpace::getConsumed() { 35 | return (initial_free_bytes - free_bytes); 36 | } 37 | 38 | void CnmemSpace::updateMaxConsume(size_t &max_consume) { 39 | max_consume = max_consume > (initial_free_bytes - free_bytes) ?
max_consume : (initial_free_bytes - free_bytes); 40 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project( vDNNNeuralNet ) 3 | include_directories( include /usr/include /usr/local/cuda/include /usr/include/x86_64-linux-gnu /usr/local/cuda/samples/common/inc ) 4 | 5 | link_directories( /usr/local/cuda/lib64/ ) 6 | 7 | find_package(CUDA) 8 | 9 | cuda_add_executable( neural_net_vdnn.out src/main.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 10 | target_link_libraries( neural_net_vdnn.out -lcudnn -lcublas -lcurand) 11 | 12 | cuda_add_executable( mnist_test.out src/mnist_test.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 13 | target_link_libraries( mnist_test.out -lcudnn -lcublas -lcurand ) 14 | 15 | cuda_add_executable( alexnet_test.out src/alexnet_test.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 16 | target_link_libraries( alexnet_test.out -lcudnn -lcublas -lcurand ) 17 | 18 | cuda_add_executable( vgg_test.out src/vgg_test.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 19 | target_link_libraries( vgg_test.out -lcudnn -lcublas -lcurand) 20 | 21 | cuda_add_executable( slow_conv.out src/slow_conv.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 22 | target_link_libraries( slow_conv.out -lcudnn -lcublas -lcurand) 23 | 24 | 25 | -------------------------------------------------------------------------------- /cnmem/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | -------------------------------------------------------------------------------- /include/solver.h: -------------------------------------------------------------------------------- 1 | #include "neural_net.h" 2 | 3 | class Solver { 4 | public: 5 | NeuralNet *model; 6 | void *X_train, *X_val; 7 | int *y_train, *y_val; 8 | int num_epoch; 9 | UpdateRule update_rule; 10 | double learning_rate, learning_rate_decay; 11 | int num_train, num_val; 12 | int num_train_batches; 13 | int num_features; 14 | cudaEvent_t start, stop; 15 | 16 | Solver(NeuralNet *model, void *X_train, int *y_train, void *X_val, int *y_val, int num_epoch, UpdateRule update_rule, 17 | double learning_rate, double learning_rate_decay, int num_train, int num_val); 18 | void train(std::vector &loss, std::vector &val_acc); 19 | float step(int start_X, int start_y, std::vector &fwd_vdnn_lag, std::vector &bwd_vdnn_lag); 20 | float step(int start_X, int start_y); 21 | void checkAccuracy(void *X, int *y, int num_samples, int *num_correct); 22 | 23 | void getTrainTime(std::vector &loss, std::vector &time, int num_epoch, 24 | std::vector > &fwd_vdnn_lag, std::vector > &bwd_vdnn_lag); 25 | 26 | void getComputationTime(long num_epoch, std::vector > &fwd_computation_time, std::vector > &bwd_computation_time); 27 | void stepComputationTime(int start_X, int start_y, std::vector &fwd_computation_time, std::vector &bwd_computation_time); 28 | 29 | void getTransferTime(long num_epoch, std::vector > &fwd_transfer_time, std::vector > &bwd_transfer_time); 30 | void stepTransferTime(int start_X, int start_y, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time); 31 | 32 | }; -------------------------------------------------------------------------------- /cnmem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CMakeLists to build the cnmem library. 2 | cmake_minimum_required(VERSION 2.8.8) 3 | project(cnmem) 4 | 5 | # We need CUDA to build that library. 6 | find_package(CUDA QUIET REQUIRED) 7 | include_directories(${CUDA_INCLUDE_DIRS}) 8 | 9 | # Rules to build the cnmem library. 10 | include_directories(include) 11 | add_definitions(-DCNMEM_DLLEXPORT) 12 | add_library(cnmem SHARED src/cnmem.cpp) 13 | set_target_properties(cnmem PROPERTIES VERSION 1.0.0 SOVERSION 1) 14 | target_link_libraries(cnmem LINK_PUBLIC ${CUDA_LIBRARIES}) 15 | install(TARGETS cnmem RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) 16 | install(FILES include/cnmem.h DESTINATION include) 17 | 18 | # Add the tests. 19 | if(WITH_TESTS) 20 | 21 | # Get Google tests. 22 | find_package(GTest QUIET REQUIRED) 23 | include_directories(${GTEST_INCLUDE_DIRS}) 24 | 25 | # Build the executable. 26 | add_executable(cnmem_tests tests/cnmem_tests.cpp) 27 | if(MSVC) 28 | if(MSVC_VERSION GREATER 1700) # Visual Studio 11 or more. 
29 | add_definitions(-DUSE_CPP_11) 30 | endif(MSVC_VERSION GREATER 1700) 31 | endif(MSVC) 32 | if(CMAKE_COMPILER_IS_GNUCC) 33 | add_definitions(-std=c++11 -DUSE_CPP_11) 34 | endif(CMAKE_COMPILER_IS_GNUCC) 35 | target_link_libraries(cnmem_tests LINK_PUBLIC cnmem ${CUDA_LIBRARIES} ${GTEST_LIBRARIES} -lpthread) 36 | install(TARGETS cnmem_tests RUNTIME DESTINATION bin) 37 | 38 | # On Windows, we copy the Google test DLL to the bin folder. 39 | if(MSVC) 40 | get_filename_component(gtest_dll_path ${GTEST_LIBRARIES} DIRECTORY) 41 | install(FILES ${gtest_dll_path}/gtest.dll DESTINATION bin) 42 | endif(MSVC) 43 | 44 | endif(WITH_TESTS) 45 | 46 | -------------------------------------------------------------------------------- /cnmem/README.md: -------------------------------------------------------------------------------- 1 | # CNMeM Library 2 | 3 | Simple library to help the Deep Learning frameworks manage CUDA memory. 4 | 5 | CNMeM is not intended to be a general purpose memory management library. It was designed as a simple 6 | tool for applications which work on a limited number of large memory buffers. 7 | 8 | CNMeM is mostly developed on Ubuntu Linux. It should support other operating systems as well. If you 9 | encounter an issue with the library on other operating systems, please submit a bug (or a fix). 10 | 11 | # Prerequisites 12 | 13 | CNMeM relies on the CUDA toolkit. It uses C++ STL and the Pthread library on Linux. On Windows, it uses 14 | the native Win32 threading library. The build system uses CMake. The unit tests are written using 15 | Google tests (but are not mandatory). 16 | 17 | ## CUDA 18 | 19 | The CUDA toolkit is required. We recommend using CUDA >= 7.0 even if earlier versions will work. 20 | * Download from the [CUDA website](https://developer.nvidia.com/cuda-downloads) 21 | * Follow the installation instructions 22 | * Don't forget to set your path. For example: 23 | * `CUDA_HOME=/usr/local/cuda` 24 | * `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH` 25 | 26 | # Build CNMeM 27 | 28 | ## Grab the source 29 | 30 | % cd $HOME 31 | % git clone https://github.com/NVIDIA/cnmem.git cnmem 32 | 33 | ## Build CNMeM without the unit tests 34 | 35 | % cd cnmem 36 | % mkdir build 37 | % cd build 38 | % cmake .. 39 | % make 40 | 41 | ## Build CNMeM with the unit tests 42 | 43 | To build the tests, you need to add an extra option to the cmake command. 44 | 45 | % cd cnmem 46 | % mkdir build 47 | % cd build 48 | % cmake -DWITH_TESTS=True .. 49 | % make 50 | 51 | ## Link with CNMeM 52 | 53 | The source folder contains a header file 'include/cnmem.h' and the build directory contains the 54 | library 'libcnmem.so', 'cnmem.lib/cnmem.dll' or 'libcnmem.dylib', depending on your operating 55 | system. 
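To sanity-check the build and the linking step, a minimal program along the lines of the sketch below can be compiled against include/cnmem.h and linked with the built library (e.g. with `-lcnmem -lcudart` on Linux). This is an illustrative sketch only; the file name and the sizes are arbitrary, and it uses nothing beyond the functions declared in cnmem.h.

    // check_cnmem.cpp -- hypothetical smoke test for a CNMeM build
    #include <cstdio>
    #include <cstring>
    #include "cnmem.h"

    int main() {
        cnmemDevice_t device;
        std::memset(&device, 0, sizeof(device));
        device.device = 0;                        // use GPU 0
        device.size = 512 * 1024 * 1024;          // reserve 512 MB up front
        cnmemStatus_t status = cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);
        if (status != CNMEM_STATUS_SUCCESS) {
            std::printf("cnmemInit failed: %s\n", cnmemGetErrorString(status));
            return 1;
        }
        void *buffer = NULL;
        // grab and release 1 MB from the NULL-stream pool
        status = cnmemMalloc(&buffer, 1 << 20, NULL);
        std::printf("cnmemMalloc: %s\n", cnmemGetErrorString(status));
        if (status == CNMEM_STATUS_SUCCESS)
            cnmemFree(buffer, NULL);
        cnmemFinalize();
        return 0;
    }

Note that the vDNN build at the top of this repository does not link against libcnmem at all; it compiles src/cnmem.cpp directly into each executable (see the top-level CMakeLists.txt).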
56 | 57 | -------------------------------------------------------------------------------- /include/neural_net.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "user_iface.h" 10 | #include "layer_params.h" 11 | #include "utils.h" 12 | 13 | // ---------------------- vDNN start ---------------------- 14 | #include 15 | // ---------------------- vDNN emd ------------------------ 16 | 17 | #ifndef NEURAL_NET 18 | #define NEURAL_NET 19 | class NeuralNet { 20 | public: 21 | void **layer_input, **dlayer_input, **params; 22 | int *layer_input_size; 23 | int *y, *pred_y; 24 | float *loss; 25 | float softmax_eps; 26 | void *one_vec; 27 | float init_std_dev; 28 | 29 | std::vector layer_type; 30 | int num_layers; 31 | cudnnHandle_t cudnn_handle; 32 | cublasHandle_t cublas_handle; 33 | curandGenerator_t curand_gen; 34 | 35 | cudnnDataType_t data_type; 36 | size_t data_type_size; 37 | cudnnTensorFormat_t tensor_format; 38 | int batch_size; 39 | 40 | size_t init_free_bytes, free_bytes, total_bytes; 41 | size_t workspace_size; 42 | void *workspace; 43 | 44 | int input_channels, input_h, input_w; 45 | int num_classes; 46 | 47 | float *h_loss; 48 | int *h_pred_y; 49 | 50 | // vDNN 51 | vDNNType vdnn_type; 52 | vDNNConvAlgo vdnn_conv_algo; 53 | cudaStream_t stream_compute, stream_memory; 54 | 55 | bool pre_alloc_conv_derivative, pre_alloc_fc_derivative, pre_alloc_batch_norm_derivative; 56 | 57 | void **h_layer_input; 58 | bool *to_offload, *prefetched; 59 | 60 | enum OffloadType {OFFLOAD_ALL, OFFLOAD_NONE, OFFLOAD_CONV, OFFLOAD_ALTERNATE_CONV}; 61 | 62 | NeuralNet(std::vector &layers, DataType data_type, int batch_size, TensorFormat tensor_format, 63 | long long dropout_seed, float softmax_eps, float init_std_dev, vDNNType vdnn_type, vDNNConvAlgo vdnn_conv_algo, 64 | UpdateRule update_rule); 65 | 66 | void getLoss(void *X, int *y, double learning_rate, std::vector &fwd_vdnn_lag, std::vector &bwd_vdnn_lag, bool train = true, int *correct_count = NULL, float *loss = NULL); 67 | void getLoss(void *X, int *y, double learning_rate, bool train = true, int *correct_count = NULL, float *loss = NULL); 68 | 69 | void compareOutputCorrect(int *correct_count, int *y); 70 | 71 | float computeLoss(); 72 | 73 | int findPrefetchLayer(int cur_layer); 74 | 75 | bool simulateNeuralNetworkMemory(vDNNConvAlgoPref algo_pref, bool hard, size_t &exp_max_consume, size_t &max_consume); 76 | bool simulateCNMEMMemory(size_t &max_consume); 77 | void vDNNOptimize(size_t &exp_max_consume, size_t &max_consume); 78 | void setOffload(OffloadType offload_type); 79 | void resetPrefetched(); 80 | 81 | // data of time 82 | cudaEvent_t start_compute, stop_compute; 83 | void getComputationTime(void *X, int *y, double learning_rate, std::vector &fwd_computation_time, std::vector &bwd_computation_time); 84 | cudaEvent_t start_transfer, stop_transfer; 85 | void getTransferTime(void *X, int *y, double learning_rate, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time); 86 | }; 87 | 88 | #endif -------------------------------------------------------------------------------- /include/user_iface.h: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #ifndef USER_IFACE 4 | #define USER_IFACE 5 | 6 | enum LayerOp {CONV, FULLY_CONNECTED, BATCHNORM, DROPOUT, POOLING, ACTV, SOFTMAX, CROSS_ENTROPY, SVM}; 7 | enum SoftmaxAlgorithm {SOFTMAX_FAST, 
SOFTMAX_ACCURATE}; 8 | enum SoftmaxMode {SOFTMAX_MODE_INSTANCE, SOFTMAX_MODE_CHANNEL}; 9 | enum DataType {DATA_FLOAT, DATA_DOUBLE}; 10 | enum TensorFormat {TENSOR_NCHW, TENSOR_NHWC}; 11 | enum BatchNormMode {BATCHNORM_PER_ACTIVATION, BATCHNORM_SPATIAL}; 12 | enum PoolingMode {POOLING_MAX, POOLING_AVERAGE_COUNT_INCLUDE_PADDING, POOLING_AVERAGE_COUNT_EXCLUDE_PADDING}; 13 | enum ActivationMode {SIGMOID, RELU, TANH, CLIPPED_RELU, ELU, ACTIVATION_NONE}; 14 | enum UpdateRule {SGD}; 15 | enum vDNNType {vDNN_ALL, vDNN_CONV, vDNN_NONE, vDNN_DYN, vDNN_ALTERNATE_CONV}; 16 | enum vDNNConvAlgo {vDNN_PERFORMANCE_OPTIMAL, vDNN_MEMORY_OPTIMAL}; 17 | 18 | struct ConvDescriptor { 19 | int input_channels, output_channels, kernel_h, kernel_w; // define kernel parameters 20 | int input_h, input_w; // output width can be inferred 21 | int pad_h, pad_w, stride_y, stride_x; 22 | ActivationMode activation_mode; 23 | double actv_coef; 24 | 25 | void initializeValues(int input_channels, int output_channels, int kernel_h, int kernel_w, int input_h, int input_w, 26 | int pad_h, int pad_w, int stride_x, int stride_y, ActivationMode activation_mode = ACTIVATION_NONE, double actv_coef = 1.0); 27 | 28 | }; 29 | 30 | struct PoolingDescriptor { 31 | int input_channels, kernel_h, kernel_w; 32 | int input_h, input_w; 33 | int pad_h, pad_w, stride_y, stride_x; 34 | PoolingMode mode; 35 | 36 | void initializeValues(int input_channels, int kernel_h, int kernel_w, 37 | int input_h, int input_w, int pad_h, int pad_w, int stride_x, int stride_y, PoolingMode mode); 38 | }; 39 | 40 | struct DropoutDescriptor { 41 | double dropout_value; 42 | int channels, h, w; 43 | 44 | void initializeValues(double dropout_value, int channels, int h, int w); 45 | }; 46 | 47 | struct FCDescriptor { 48 | int input_channels, output_channels; 49 | ActivationMode activation_mode; 50 | double actv_coef; 51 | 52 | void initializeValues(int input_channels, int output_channels, ActivationMode activation_mode = ACTIVATION_NONE, double actv_coef = 1.0); 53 | 54 | }; 55 | 56 | struct BatchNormDescriptor { 57 | BatchNormMode mode; 58 | double epsilon, factor; 59 | int channels, h, w; 60 | 61 | void initializeValues(BatchNormMode mode, double epsilon, double factor, int channels, int h, int w); 62 | }; 63 | 64 | struct ActivationDescriptor { 65 | ActivationMode mode; 66 | int channels, h, w; 67 | double coef; 68 | void initializeValues(ActivationMode mode, int channels, int h, int w, double coef = 1.0); 69 | }; 70 | 71 | struct SoftmaxDescriptor { 72 | int channels, h, w; 73 | SoftmaxAlgorithm algo; 74 | SoftmaxMode mode; 75 | 76 | void initializeValues(SoftmaxAlgorithm algo, SoftmaxMode mode, int channels, int h, int w); 77 | }; 78 | 79 | struct LayerSpecifier { 80 | LayerOp type; 81 | void *params; 82 | 83 | void initPointer(LayerOp type); 84 | 85 | void freePointer(); 86 | 87 | }; 88 | 89 | #endif -------------------------------------------------------------------------------- /src/user_iface.cu: -------------------------------------------------------------------------------- 1 | #include "user_iface.h" 2 | 3 | void ConvDescriptor::initializeValues(int input_channels, int output_channels, int kernel_h, int kernel_w, int input_h, int input_w, 4 | int pad_h, int pad_w, int stride_x, int stride_y, ActivationMode activation_mode, double actv_coef) { 5 | this->input_channels = input_channels, this->output_channels = output_channels, this->kernel_h = kernel_h, this->kernel_w = kernel_w; 6 | this->input_h = input_h, this->input_w = input_w; 7 | this->pad_h = pad_h, 
this->pad_w = pad_w, this->stride_y = stride_y, this->stride_x = stride_x; 8 | this->activation_mode = activation_mode; 9 | this->actv_coef = actv_coef; 10 | } 11 | 12 | void PoolingDescriptor::initializeValues(int input_channels, int kernel_h, int kernel_w, 13 | int input_h, int input_w, int pad_h, int pad_w, int stride_x, int stride_y, PoolingMode mode) { 14 | this->input_channels = input_channels, this->kernel_h = kernel_h, this->kernel_w = kernel_w; 15 | this->input_h = input_h, this->input_w = input_w; 16 | this->pad_h = pad_h, this->pad_w = pad_w, this->stride_y = stride_y, this->stride_x = stride_x; 17 | this->mode = mode; 18 | } 19 | 20 | void DropoutDescriptor::initializeValues(double dropout_value, int channels, int h, int w) { 21 | this->dropout_value = dropout_value; 22 | this->channels = channels; 23 | this->h = h; 24 | this->w = w; 25 | } 26 | 27 | void FCDescriptor::initializeValues(int input_channels, int output_channels, ActivationMode activation_mode, double actv_coef) { 28 | this->input_channels = input_channels; 29 | this->output_channels = output_channels; 30 | this->activation_mode = activation_mode; 31 | this->actv_coef = actv_coef; 32 | } 33 | 34 | 35 | void BatchNormDescriptor::initializeValues(BatchNormMode mode, double epsilon, double factor, int channels, int h, int w) { 36 | this->mode = mode; 37 | this->epsilon = epsilon, this->factor = factor; 38 | this->channels = channels, this->h = h, this->w = w; 39 | } 40 | 41 | void ActivationDescriptor::initializeValues(ActivationMode mode, int channels, int h, int w, double coef) { 42 | this->mode = mode; 43 | this->channels = channels; 44 | this->h = h; 45 | this->w = w; 46 | this->coef = coef; 47 | } 48 | 49 | void SoftmaxDescriptor::initializeValues(SoftmaxAlgorithm algo, SoftmaxMode mode, int channels, int h, int w) { 50 | this->algo = algo; 51 | this->mode = mode; 52 | this->channels = channels; 53 | this->h = h; 54 | this->w = w; 55 | } 56 | 57 | void LayerSpecifier::initPointer(LayerOp type) { 58 | this->type = type; 59 | if (type == CONV) 60 | params = malloc(sizeof(ConvDescriptor)); 61 | else if (type == FULLY_CONNECTED) 62 | params = malloc(sizeof(FCDescriptor)); 63 | else if (type == BATCHNORM) 64 | params = malloc(sizeof(BatchNormDescriptor)); 65 | else if (type == DROPOUT) 66 | params = malloc(sizeof(DropoutDescriptor)); 67 | else if (type == POOLING) 68 | params = malloc(sizeof(PoolingDescriptor)); 69 | else if (type == ACTV) 70 | params = malloc(sizeof(ActivationDescriptor)); 71 | else if (type == SOFTMAX) 72 | params = malloc(sizeof(SoftmaxDescriptor)); 73 | } 74 | 75 | void LayerSpecifier::freePointer() { 76 | free(params); 77 | } 78 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef UTILS 9 | #define UTILS 10 | 11 | #define BW (16 * 16) 12 | #define CNMEM_GRANULARITY 512 13 | 14 | #define FatalError(s) do { \ 15 | std::stringstream _where, _message; \ 16 | _where << __FILE__ << ':' << __LINE__; \ 17 | _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \ 18 | std::cerr << _message.str() << "\nAborting...\n"; \ 19 | cudaDeviceReset(); \ 20 | exit(1); \ 21 | } while(0) 22 | 23 | #define checkCUDNN(expression) \ 24 | { \ 25 | cudnnStatus_t status = (expression); \ 26 | if (status != CUDNN_STATUS_SUCCESS) { \ 27 | std::cerr << "Error in file " << __FILE__ << " on line " 
<< __LINE__ << ": " \ 28 | << cudnnGetErrorString(status) << std::endl; \ 29 | std::exit(EXIT_FAILURE); \ 30 | } \ 31 | } 32 | 33 | #define checkCUBLAS(expression) \ 34 | { \ 35 | cublasStatus_t status = (expression); \ 36 | if (status != CUBLAS_STATUS_SUCCESS) { \ 37 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 38 | << _cudaGetErrorEnum(status) << std::endl; \ 39 | std::exit(EXIT_FAILURE); \ 40 | } \ 41 | } 42 | 43 | #define checkCURAND(expression) \ 44 | { \ 45 | curandStatus_t status = (expression); \ 46 | if (status != CURAND_STATUS_SUCCESS) { \ 47 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 48 | << _cudaGetErrorEnum(status) << std::endl; \ 49 | std::exit(EXIT_FAILURE); \ 50 | } \ 51 | } 52 | 53 | #define checkCNMEM(expression) \ 54 | { \ 55 | cnmemStatus_t status = (expression); \ 56 | if (status != CNMEM_STATUS_SUCCESS) { \ 57 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 58 | << cnmemGetErrorString(status) << std::endl; \ 59 | std::exit(EXIT_FAILURE); \ 60 | } \ 61 | } 62 | 63 | #define checkCNMEMRet(expression) \ 64 | { \ 65 | cnmemStatus_t status = (expression); \ 66 | if (status != CNMEM_STATUS_SUCCESS) { \ 67 | if (status == CNMEM_STATUS_OUT_OF_MEMORY) { \ 68 | return false; \ 69 | } \ 70 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 71 | << cnmemGetErrorString(status) << std::endl; \ 72 | std::exit(EXIT_FAILURE); \ 73 | } \ 74 | } 75 | 76 | #define checkCNMEMSim(expression, req_size, max_consume, free_bytes, action, flag) \ 77 | { \ 78 | cnmemStatus_t status = (expression); \ 79 | if (status != CNMEM_STATUS_SUCCESS) { \ 80 | if (status == CNMEM_STATUS_OUT_OF_MEMORY) { \ 81 | flag = true; \ 82 | size_t largest_free_block_size = 0; \ 83 | cnmemGetLargestFreeBlockSize(largest_free_block_size, NULL); \ 84 | max_consume = req_size - largest_free_block_size + max_consume; \ 85 | max_consume = (max_consume > free_bytes) ? 
free_bytes : max_consume; \ 86 | action; \ 87 | } \ 88 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 89 | << cnmemGetErrorString(status) << std::endl; \ 90 | std::exit(EXIT_FAILURE); \ 91 | } \ 92 | } 93 | 94 | struct LayerDimension { 95 | int N, C, H, W; 96 | 97 | int getTotalSize(); 98 | }; 99 | 100 | template 101 | __global__ void fillValue(T *v, int size, int value) { 102 | int i = blockIdx.x * blockDim.x + threadIdx.x; 103 | if (i >= size) 104 | return; 105 | v[i] = value; 106 | } 107 | 108 | void outOfMemory(); 109 | 110 | struct CnmemSpace { 111 | size_t free_bytes; 112 | size_t initial_free_bytes; 113 | bool out_of_memory; 114 | 115 | enum Op {ADD, SUB}; 116 | 117 | CnmemSpace(size_t free_bytes); 118 | 119 | void updateSpace(Op op, size_t size); 120 | 121 | bool isAvailable(); 122 | 123 | size_t getConsumed(); 124 | 125 | void updateMaxConsume(size_t &max_consume); 126 | 127 | }; 128 | 129 | #endif -------------------------------------------------------------------------------- /src/mnist_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "solver.h" 6 | 7 | using namespace std; 8 | 9 | typedef unsigned char uchar; 10 | 11 | int num_train = 60000, num_test = 10000; 12 | 13 | int reverseInt(int n) { 14 | int bytes = 4; 15 | unsigned char ch[bytes]; 16 | for (int i = 0; i < bytes; i++) { 17 | ch[i] = (n >> i * 8) & 255; 18 | } 19 | int p = 0; 20 | for (int i = 0; i < bytes; i++) { 21 | p += (int) ch[i] << (bytes - i - 1) * 8; 22 | } 23 | return p; 24 | } 25 | 26 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 27 | string filename_train_images = "data/train-images.idx3-ubyte"; 28 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 29 | 30 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 31 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 32 | 33 | // read train/test images 34 | for (int i = 0; i < 2; i++) { 35 | string filename; 36 | if (i == 0) 37 | filename = filename_train_images; 38 | else 39 | filename = filename_test_images; 40 | 41 | ifstream f(filename.c_str(), ios::binary); 42 | if (!f.is_open()) 43 | printf("Cannot read MNIST from %s\n", filename.c_str()); 44 | 45 | // read metadata 46 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 47 | f.read((char *) &magic_number, sizeof(magic_number)); 48 | magic_number = reverseInt(magic_number); 49 | f.read((char *) &n_images, sizeof(n_images)); 50 | n_images = reverseInt(n_images); 51 | f.read((char *) &n_rows, sizeof(n_rows)); 52 | n_rows = reverseInt(n_rows); 53 | f.read((char *) &n_cols, sizeof(n_cols)); 54 | n_cols = reverseInt(n_cols); 55 | 56 | for (int k = 0; k < n_images; k++) { 57 | vector temp; 58 | temp.reserve(n_rows * n_cols); 59 | for (int j = 0; j < n_rows * n_cols; j++) { 60 | uchar t = 0; 61 | f.read((char *)&t, sizeof(t)); 62 | temp.push_back(t); 63 | } 64 | if (i == 0) 65 | train_images.push_back(temp); 66 | else 67 | test_images.push_back(temp); 68 | } 69 | f.close(); 70 | 71 | } 72 | 73 | // read train/test labels 74 | for (int i = 0; i < 2; i++) { 75 | string filename; 76 | if (i == 0) 77 | filename = filename_train_labels; 78 | else 79 | filename = filename_test_labels; 80 | 81 | ifstream f(filename.c_str(), ios::binary); 82 | if (!f.is_open()) 83 | printf("Cannot read MNIST from %s\n", filename.c_str()); 84 | 85 | // read metadata 86 | int magic_number = 0, n_labels = 
0; 87 | f.read((char *) &magic_number, sizeof(magic_number)); 88 | magic_number = reverseInt(magic_number); 89 | f.read((char *) &n_labels, sizeof(n_labels)); 90 | n_labels = reverseInt(n_labels); 91 | 92 | for (int k = 0; k < n_labels; k++) { 93 | uchar t = 0; 94 | f.read((char *)&t, sizeof(t)); 95 | if (i == 0) 96 | train_labels.push_back(t); 97 | else 98 | test_labels.push_back(t); 99 | } 100 | 101 | f.close(); 102 | 103 | } 104 | } 105 | 106 | int main() { 107 | 108 | int rows = 28, cols = 28, channels = 1; 109 | float *f_train_images, *f_test_images; 110 | int *f_train_labels, *f_test_labels; 111 | // int rows = 28, cols = 28, channels = 1; 112 | int input_size = rows * cols * channels; 113 | f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 114 | f_train_labels = (int *)malloc(num_train * sizeof(int)); 115 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 116 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 117 | 118 | { 119 | vector > train_images, test_images; 120 | vector train_labels, test_labels; 121 | readMNIST(train_images, test_images, train_labels, test_labels); 122 | 123 | for (int k = 0; k < num_train; k++) { 124 | for (int j = 0; j < rows * cols; j++) { 125 | f_train_images[k * input_size + j] = (float)train_images[k][j]; 126 | } 127 | f_train_labels[k] = (int)train_labels[k]; 128 | } 129 | 130 | for (int k = 0; k < num_test; k++) { 131 | for (int j = 0; j < rows * cols; j++) { 132 | f_test_images[k * input_size + j] = (float)test_images[k][j]; 133 | } 134 | f_test_labels[k] = (int)test_labels[k]; 135 | } 136 | } 137 | 138 | 139 | 140 | float *mean_image; 141 | mean_image = (float *)malloc(input_size * sizeof(float)); 142 | 143 | for (int i = 0; i < input_size; i++) { 144 | mean_image[i] = 0; 145 | for (int k = 0; k < num_train; k++) { 146 | mean_image[i] += f_train_images[k * input_size + i]; 147 | } 148 | mean_image[i] /= num_train; 149 | } 150 | 151 | 152 | for (int i = 0; i < num_train; i++) { 153 | for (int j = 0; j < input_size; j++) { 154 | f_train_images[i * input_size + j] -= mean_image[j]; 155 | } 156 | } 157 | 158 | for (int i = 0; i < num_test; i++) { 159 | for (int j = 0; j < input_size; j++) { 160 | f_test_images[i * input_size + j] -= mean_image[j]; 161 | } 162 | 163 | } 164 | 165 | vector layer_specifier; 166 | { 167 | ConvDescriptor layer0; 168 | layer0.initializeValues(1, 3, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 169 | LayerSpecifier temp; 170 | temp.initPointer(CONV); 171 | *((ConvDescriptor *)temp.params) = layer0; 172 | layer_specifier.push_back(temp); 173 | } 174 | { 175 | FCDescriptor layer1; 176 | layer1.initializeValues(3 * 28 * 28, 50, RELU); 177 | LayerSpecifier temp; 178 | temp.initPointer(FULLY_CONNECTED); 179 | *((FCDescriptor *)temp.params) = layer1; 180 | layer_specifier.push_back(temp); 181 | } 182 | { 183 | FCDescriptor layer2; 184 | layer2.initializeValues(50, 10); 185 | LayerSpecifier temp; 186 | temp.initPointer(FULLY_CONNECTED); 187 | *((FCDescriptor *)temp.params) = layer2; 188 | layer_specifier.push_back(temp); 189 | } 190 | { 191 | SoftmaxDescriptor layer2_smax; 192 | layer2_smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 10, 1, 1); 193 | LayerSpecifier temp; 194 | temp.initPointer(SOFTMAX); 195 | *((SoftmaxDescriptor *)temp.params) = layer2_smax; 196 | layer_specifier.push_back(temp); 197 | } 198 | 199 | int batch_size = 128; 200 | long long dropout_seed = 1; 201 | float softmax_eps = 1e-8; 202 | float init_std_dev = 0.01; 203 | NeuralNet net(layer_specifier, 
DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vDNN_ALL, vDNN_MEMORY_OPTIMAL, SGD); 204 | 205 | int num_epoch = 1000; 206 | double learning_rate = 1e-4; 207 | double learning_rate_decay = 0.9; 208 | 209 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 210 | vector loss; 211 | vector val_acc; 212 | solver.train(loss, val_acc); 213 | int num_correct; 214 | solver.checkAccuracy(f_train_images, f_train_labels, num_train, &num_correct); 215 | cout << num_correct << endl; 216 | 217 | 218 | 219 | } -------------------------------------------------------------------------------- /include/layer_params.h: -------------------------------------------------------------------------------- 1 | #include "user_iface.h" 2 | #include "utils.h" 3 | #include 4 | #include 5 | 6 | #ifndef LAYER_PARAMS 7 | #define LAYER_PARAMS 8 | 9 | enum vDNNConvAlgoPref {PREFER_MEMORY_OPTIMAL, PREFER_PERFORMANCE_OPTIMAL}; 10 | 11 | enum workspaceStatus_t {WORKSPACE_STATUS_SUCCESS, WORKSPACE_STATUS_OUT_OF_MEMORY}; 12 | 13 | #define checkWORKSPACE(expression) \ 14 | { \ 15 | workspaceStatus_t status = (expression); \ 16 | if (status != WORKSPACE_STATUS_SUCCESS) { \ 17 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 18 | << std::endl; \ 19 | std::exit(EXIT_FAILURE); \ 20 | } \ 21 | } 22 | 23 | struct ConvLayerParams { 24 | void *W, *b; 25 | void *dW, *db; 26 | cudnnTensorDescriptor_t input_tensor, output_tensor, bias_desc; 27 | cudnnFilterDescriptor_t filter_desc; 28 | cudnnConvolutionDescriptor_t conv_desc; 29 | cudnnConvolutionFwdAlgo_t fwd_algo; 30 | cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo; 31 | cudnnConvolutionBwdDataAlgo_t bwd_data_algo; 32 | size_t fwd_workspace_size, bwd_filter_workspace_size, bwd_data_workspace_size; 33 | int C_in, C_out, filter_h, filter_w; 34 | int kernel_size; 35 | enum ConvDirection {FWD, BWD_FILTER, BWD_DATA}; 36 | UpdateRule update_rule; 37 | cudnnDataType_t data_type; 38 | ActivationMode activation_mode; 39 | cudnnActivationDescriptor_t actv_desc; 40 | 41 | 42 | int fwd_req_count, fwd_ret_count; 43 | int bwd_filter_req_count, bwd_filter_ret_count; 44 | int bwd_data_req_count, bwd_data_ret_count; 45 | cudnnConvolutionFwdAlgoPerf_t *fwd_perf; 46 | cudnnConvolutionBwdFilterAlgoPerf_t *bwd_filter_perf; 47 | cudnnConvolutionBwdDataAlgoPerf_t *bwd_data_perf; 48 | 49 | void initializeValues(cudnnHandle_t cudnn_handle, ConvDescriptor *user_params, cudnnDataType_t data_type, 50 | int batch_size, cudnnTensorFormat_t tensor_format, size_t data_type_size, LayerDimension &output_size, 51 | UpdateRule update_rule); 52 | 53 | void allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size, float std_dev, size_t &free_bytes, 54 | bool alloc_derivative); 55 | 56 | size_t getWorkspaceSize(size_t &free_bytes, ConvDirection conv_direction, vDNNConvAlgo vdnn_conv_algo); 57 | workspaceStatus_t getWorkspaceSize(size_t &free_bytes, ConvDirection conv_direction, vDNNConvAlgoPref algo_pref, bool hard_pref, size_t &workspace_size); 58 | 59 | void cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream); 60 | bool cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, size_t &max_consume, size_t free_bytes, bool &out_of_memory); 61 | void stepParams(cublasHandle_t cublas_handle, double learning_rate); 62 | void cnmemFreeDerivatives(cudaStream_t stream); 63 | }; 64 | 65 | 
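// Illustrative sketch (not part of the original header): how a hypothetical caller with a
// limited memory budget might use the workspaceStatus_t-returning overload of
// ConvLayerParams::getWorkspaceSize declared above -- first requesting the
// performance-optimal forward algorithm as a soft preference, then falling back to the
// memory-optimal one as a hard requirement. Whether vDNN's own code follows exactly this
// pattern is not asserted here; the names and enums are taken from this header.
//
//   size_t workspace_size = 0;
//   if (conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                     PREFER_PERFORMANCE_OPTIMAL, /*hard_pref=*/false,
//                                     workspace_size) != WORKSPACE_STATUS_SUCCESS) {
//       checkWORKSPACE(conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                                    PREFER_MEMORY_OPTIMAL, /*hard_pref=*/true,
//                                                    workspace_size));
//   }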
struct FCLayerParams { 66 | void *W, *b; 67 | void *dW, *db; 68 | int C_in, C_out; 69 | int weight_matrix_size; 70 | UpdateRule update_rule; 71 | cudnnDataType_t data_type; 72 | ActivationMode activation_mode; 73 | cudnnActivationDescriptor_t actv_desc; 74 | cudnnTensorDescriptor_t output_tensor; 75 | 76 | void initializeValues(FCDescriptor *user_params, int batch_size, cudnnTensorFormat_t tensor_format, cudnnDataType_t data_type, 77 | LayerDimension &output_size, UpdateRule update_rule); 78 | void allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size, 79 | float std_dev, size_t &free_bytes, bool alloc_derivative); 80 | 81 | void cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream); 82 | bool cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, size_t &max_consume, size_t free_bytes, bool &out_of_memory); 83 | void stepParams(cublasHandle_t cublas_handle, double learning_rate); 84 | void cnmemFreeDerivatives(cudaStream_t stream); 85 | }; 86 | 87 | struct DropoutLayerParams { 88 | cudnnDropoutDescriptor_t dropout_desc; 89 | cudnnTensorDescriptor_t input_tensor; 90 | void *reserved_space; 91 | void *state; 92 | size_t reserved_space_size; 93 | size_t state_size; 94 | 95 | void initializeValues(cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, cudnnDataType_t data_type, int batch_size, 96 | cudnnTensorFormat_t tensor_format, LayerDimension &output_size); 97 | 98 | void allocateSpace(size_t &free_bytes, cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, long long seed); 99 | }; 100 | 101 | struct BatchNormLayerParams { 102 | cudnnTensorDescriptor_t input_tensor; 103 | cudnnTensorDescriptor_t sbmv_desc; 104 | void *scale, *bias; 105 | void *dscale, *dbias; 106 | void *running_mean, *running_variance; 107 | void *result_save_mean, *result_save_inv_var; 108 | double factor, epsilon; 109 | cudnnBatchNormMode_t mode; 110 | int h, w, c; 111 | int sbmv_size; 112 | UpdateRule update_rule; 113 | size_t allocation_size; 114 | cudnnDataType_t data_type; 115 | 116 | void initializeValues(BatchNormDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 117 | int batch_size, LayerDimension &output_size, UpdateRule update_rule); 118 | void allocateSpace(cudnnDataType_t data_type, size_t data_type_size, size_t &free_bytes, bool alloc_derivative); 119 | 120 | void cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream); 121 | bool cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, size_t &max_consume, size_t free_bytes, bool &out_of_memory); 122 | void stepParams(cublasHandle_t cublas_handle, double learning_rate); 123 | void cnmemFreeDerivatives(cudaStream_t stream); 124 | }; 125 | 126 | struct PoolingLayerParams { 127 | cudnnTensorDescriptor_t input_tensor; 128 | cudnnTensorDescriptor_t output_tensor; 129 | 130 | cudnnPoolingDescriptor_t pool_desc; 131 | 132 | void initializeValues(PoolingDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 133 | int batch_size, LayerDimension &output_size); 134 | void allocateSpace(size_t &free_bytes); 135 | }; 136 | 137 | struct ActivationLayerParams { 138 | cudnnActivationDescriptor_t actv_desc; 139 | cudnnTensorDescriptor_t input_tensor; 140 | 141 | void initializeValues(ActivationDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 142 | int batch_size, LayerDimension &output_size); 143 | void allocateSpace(size_t &free_bytes); 144 | }; 145 | 146 | struct 
SoftmaxLayerParams { 147 | cudnnTensorDescriptor_t input_tensor; 148 | cudnnSoftmaxAlgorithm_t algo; 149 | cudnnSoftmaxMode_t mode; 150 | 151 | void initializeValues(SoftmaxDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 152 | int batch_size, LayerDimension &output_size); 153 | void allocateSpace(size_t &free_bytes); 154 | }; 155 | 156 | #endif 157 | -------------------------------------------------------------------------------- /src/solver.cu: -------------------------------------------------------------------------------- 1 | #include "solver.h" 2 | 3 | Solver::Solver(NeuralNet *model, void *X_train, int *y_train, void *X_val, int *y_val, int num_epoch, UpdateRule update_rule, 4 | double learning_rate, double learning_rate_decay, int num_train, int num_val) { 5 | this->model = model; 6 | this->X_train = X_train, this->X_val = X_val; 7 | this->y_train = y_train, this->y_val = y_val; 8 | this->num_epoch = num_epoch; 9 | this->update_rule = update_rule; 10 | this->learning_rate = learning_rate, this->learning_rate_decay = learning_rate_decay; 11 | 12 | this->num_train = num_train, this->num_val = num_val; 13 | this->num_features = model->input_channels * model->input_h * model->input_w; 14 | 15 | checkCudaErrors(cudaEventCreate(&start)); 16 | checkCudaErrors(cudaEventCreate(&stop)); 17 | 18 | 19 | } 20 | 21 | float Solver::step(int start_X, int start_y) { 22 | std::vector t1, t2; 23 | return this->step(start_X, start_y, t1, t2); 24 | } 25 | 26 | float Solver::step(int start_X, int start_y, std::vector &fwd_vdnn_lag, std::vector &bwd_vdnn_lag) { 27 | float temp_loss; 28 | // std::cout << "start_X: " << start_X << std::endl; 29 | if (model->data_type == CUDNN_DATA_FLOAT) 30 | model->getLoss(&(((float *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_vdnn_lag, bwd_vdnn_lag, true, NULL, &temp_loss); 31 | else if (model->data_type == CUDNN_DATA_DOUBLE) 32 | model->getLoss(&(((double *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_vdnn_lag, bwd_vdnn_lag, true, NULL, &temp_loss); 33 | 34 | // float Salpha = -learning_rate; 35 | // double Dalpha = -learning_rate; 36 | // if (update_rule == SGD) { 37 | // for (int i = 0; i < model->num_layers; i++) { 38 | // if (model->layer_type[i] == CONV) { 39 | // ConvLayerParams *cur_params = (ConvLayerParams *)model->params[i]; 40 | // int kernel_size = cur_params->C_in * cur_params->C_out * cur_params->filter_h * cur_params->filter_w; 41 | // if (model->data_type == CUDNN_DATA_FLOAT) { 42 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, kernel_size, 43 | // &Salpha, 44 | // (float *)cur_params->dW, 1, 45 | // (float *)cur_params->W, 1)); 46 | 47 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->C_out, 48 | // &Salpha, 49 | // (float *)cur_params->db, 1, 50 | // (float *)cur_params->b, 1)); 51 | // } 52 | // else if (model->data_type == CUDNN_DATA_DOUBLE) { 53 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, kernel_size, 54 | // &Dalpha, 55 | // (double *)cur_params->dW, 1, 56 | // (double *)cur_params->W, 1)); 57 | 58 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->C_out, 59 | // &Dalpha, 60 | // (double *)cur_params->db, 1, 61 | // (double *)cur_params->b, 1)); 62 | // } 63 | 64 | // } 65 | 66 | // else if (model->layer_type[i] == FULLY_CONNECTED) { 67 | // FCLayerParams *cur_params = (FCLayerParams *)model->params[i]; 68 | // if (model->data_type == CUDNN_DATA_FLOAT) { 69 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->C_in * cur_params->C_out, 70 | 
// &Salpha, 71 | // (float *)cur_params->dW, 1, 72 | // (float *)cur_params->W, 1)); 73 | 74 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->C_out, 75 | // &Salpha, 76 | // (float *)cur_params->db, 1, 77 | // (float *)cur_params->b, 1)); 78 | // } 79 | // else if (model->data_type == CUDNN_DATA_DOUBLE) { 80 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->C_in * cur_params->C_out, 81 | // &Dalpha, 82 | // (double *)cur_params->dW, 1, 83 | // (double *)cur_params->W, 1)); 84 | 85 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->C_out, 86 | // &Dalpha, 87 | // (double *)cur_params->db, 1, 88 | // (double *)cur_params->b, 1)); 89 | // } 90 | // } 91 | 92 | // else if (model->layer_type[i] == BATCHNORM) { 93 | // BatchNormLayerParams *cur_params = (BatchNormLayerParams *)model->params[i]; 94 | // if (model->data_type == CUDNN_DATA_FLOAT) { 95 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->sbmv_size, 96 | // &Salpha, 97 | // (float *)cur_params->dscale, 1, 98 | // (float *)cur_params->scale, 1)); 99 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->sbmv_size, 100 | // &Salpha, 101 | // (float *)cur_params->dbias, 1, 102 | // (float *)cur_params->bias, 1)); 103 | 104 | // } 105 | // else if (model->data_type == CUDNN_DATA_DOUBLE) { 106 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->sbmv_size, 107 | // &Dalpha, 108 | // (double *)cur_params->dscale, 1, 109 | // (double *)cur_params->scale, 1)); 110 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->sbmv_size, 111 | // &Dalpha, 112 | // (double *)cur_params->dbias, 1, 113 | // (double *)cur_params->bias, 1)); 114 | 115 | // } 116 | // } 117 | // } 118 | // } 119 | checkCudaErrors(cudaDeviceSynchronize()); 120 | return temp_loss; 121 | 122 | } 123 | 124 | void Solver::train(std::vector &loss, std::vector &val_acc) { 125 | 126 | int batch_size = model->batch_size; 127 | int num_train_batches = num_train / model->batch_size; 128 | int num_val_batches = num_val / model->batch_size; 129 | for (int i = 0; i < num_epoch; i++) { 130 | for (int j = 0; j < num_train_batches; j++) { 131 | int start_sample = j * num_features * batch_size; 132 | 133 | float milli = 0; 134 | checkCudaErrors(cudaEventRecord(start, model->stream_compute)); 135 | 136 | float temp_loss = step(start_sample, j * batch_size); 137 | 138 | checkCudaErrors(cudaEventRecord(stop, model->stream_compute)); 139 | checkCudaErrors(cudaEventSynchronize(stop)); 140 | checkCudaErrors(cudaEventElapsedTime(&milli, start, stop)); 141 | std::cout << "One forward, backward pass time(ms): " << milli << std::endl; 142 | 143 | loss.push_back(temp_loss); 144 | std::cout << "loss: " << temp_loss << std::endl; 145 | } 146 | int correct_count = 0; 147 | for (int j = 0; j < num_val_batches; j++) { 148 | 149 | int start_sample = j * num_features * batch_size; 150 | int temp_correct_count; 151 | if (model->data_type == CUDNN_DATA_FLOAT) 152 | model->getLoss(&(((float *)X_val)[start_sample]), &y_val[j * batch_size], learning_rate, false, &temp_correct_count, NULL); 153 | else if (model->data_type == CUDNN_DATA_DOUBLE) 154 | model->getLoss(&(((double *)X_val)[start_sample]), &y_val[j * batch_size], learning_rate, false, &temp_correct_count, NULL); 155 | correct_count += temp_correct_count; 156 | } 157 | val_acc.push_back(correct_count); 158 | std::cout << "val_acc: " << val_acc[i] << std::endl; 159 | // learning_rate *= learning_rate_decay; 160 | // std::cout << "learning_rate: " << learning_rate << std::endl; 161 | 
} 162 | learning_rate *= learning_rate_decay; 163 | 164 | } 165 | 166 | void Solver::checkAccuracy(void *X, int *y, int num_samples, int *num_correct) { 167 | int batch_size = model->batch_size; 168 | int num_iter = num_samples / batch_size; 169 | *num_correct = 0; 170 | for (int i = 0; i < num_iter; i++) { 171 | int start_sample = i * num_features * batch_size; 172 | int temp_correct_count; 173 | if (model->data_type == CUDNN_DATA_FLOAT) 174 | model->getLoss(&(((float *)X)[start_sample]), &y[i * batch_size], learning_rate, false, &temp_correct_count, NULL); 175 | else if (model->data_type == CUDNN_DATA_DOUBLE) 176 | model->getLoss(&(((double *)X)[start_sample]), &y[i * batch_size], learning_rate, false, &temp_correct_count, NULL); 177 | *num_correct = *num_correct + temp_correct_count; 178 | } 179 | } 180 | 181 | void Solver::getTrainTime(std::vector &loss, std::vector &time, int num_epoch, 182 | std::vector > &fwd_vdnn_lag, std::vector > &bwd_vdnn_lag) { 183 | int batch_size = model->batch_size; 184 | int num_train_batches = num_train / model->batch_size; 185 | for (int i = 0; i < num_epoch; i++) { 186 | for (int j = 0; j < num_train_batches; j++) { 187 | int start_sample = j * num_features * batch_size; 188 | 189 | checkCudaErrors(cudaEventRecord(start)); 190 | float milli; 191 | 192 | std::vector cur_fwd_vdnn_lag, cur_bwd_vdnn_lag; 193 | float temp_loss = step(start_sample, j * batch_size, cur_fwd_vdnn_lag, cur_bwd_vdnn_lag); 194 | 195 | checkCudaErrors(cudaEventRecord(stop)); 196 | checkCudaErrors(cudaEventSynchronize(stop)); 197 | checkCudaErrors(cudaEventElapsedTime(&milli, start, stop)); 198 | // std::cout << "One forward, backward pass time(ms): " << milli << std::endl; 199 | 200 | fwd_vdnn_lag.push_back(cur_fwd_vdnn_lag); 201 | bwd_vdnn_lag.push_back(cur_bwd_vdnn_lag); 202 | 203 | loss.push_back(temp_loss); 204 | time.push_back(milli); 205 | // std::cout << "loss: " << temp_loss << std::endl; 206 | // for (int i = 0; i < cur_fwd_vdnn_lag.size(); i++) { 207 | // std::cout << "fwd_lag " << i << ":" << cur_fwd_vdnn_lag[i] << std::endl; 208 | // } 209 | // for (int i = 0; i < cur_bwd_vdnn_lag.size(); i++) { 210 | // std::cout << "bwd_lag " << i << ":" << cur_bwd_vdnn_lag[i] << std::endl; 211 | // } 212 | } 213 | } 214 | learning_rate *= learning_rate_decay; 215 | } 216 | 217 | void Solver::getComputationTime(long num_epoch, std::vector > &fwd_computation_time, std::vector > &bwd_computation_time) { 218 | int batch_size = model->batch_size; 219 | int num_train_batches = num_train / model->batch_size; 220 | for (int i = 0; i < num_epoch; i++) { 221 | for (int j = 0; j < num_train_batches; j++) { 222 | int start_sample = j * num_features * batch_size; 223 | 224 | float milli; 225 | 226 | std::vector cur_fwd_computation_time, cur_bwd_computation_time; 227 | stepComputationTime(start_sample, j * batch_size, cur_fwd_computation_time, cur_bwd_computation_time); 228 | 229 | fwd_computation_time.push_back(cur_fwd_computation_time); 230 | bwd_computation_time.push_back(cur_bwd_computation_time); 231 | 232 | } 233 | learning_rate *= learning_rate_decay; 234 | } 235 | } 236 | 237 | void Solver::getTransferTime(long num_epoch, std::vector > &fwd_transfer_time, std::vector > &bwd_transfer_time) { 238 | int batch_size = model->batch_size; 239 | int num_train_batches = num_train / model->batch_size; 240 | for (int i = 0; i < num_epoch; i++) { 241 | for (int j = 0; j < num_train_batches; j++) { 242 | int start_sample = j * num_features * batch_size; 243 | 244 | float milli; 245 | 246 | std::vector 
cur_fwd_transfer_time, cur_bwd_transfer_time; 247 | stepTransferTime(start_sample, j * batch_size, cur_fwd_transfer_time, cur_bwd_transfer_time); 248 | 249 | fwd_transfer_time.push_back(cur_fwd_transfer_time); 250 | bwd_transfer_time.push_back(cur_bwd_transfer_time); 251 | 252 | } 253 | learning_rate *= learning_rate_decay; 254 | } 255 | } 256 | 257 | void Solver::stepComputationTime(int start_X, int start_y, std::vector &fwd_computation_time, std::vector &bwd_computation_time) { 258 | if (model->data_type == CUDNN_DATA_FLOAT) 259 | model->getComputationTime(&(((float *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_computation_time, bwd_computation_time); 260 | else if (model->data_type == CUDNN_DATA_DOUBLE) 261 | model->getComputationTime(&(((double *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_computation_time, bwd_computation_time); 262 | } 263 | 264 | void Solver::stepTransferTime(int start_X, int start_y, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time) { 265 | if (model->data_type == CUDNN_DATA_FLOAT) 266 | model->getTransferTime(&(((float *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_transfer_time, bwd_transfer_time); 267 | else if (model->data_type == CUDNN_DATA_DOUBLE) 268 | model->getTransferTime(&(((double *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_transfer_time, bwd_transfer_time); 269 | } 270 | -------------------------------------------------------------------------------- /cnmem/include/cnmem.h: -------------------------------------------------------------------------------- 1 | /* ********************************************************************** 2 | * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | * ********************************************************************** */ 28 | #pragma once 29 | 30 | #ifdef __cplusplus 31 | #include "cstdio" 32 | #else 33 | #include "stdio.h" 34 | #endif 35 | #include "cuda_runtime_api.h" 36 | 37 | #if defined(_MSC_VER) || defined(WIN32) 38 | #ifdef CNMEM_DLLEXPORT 39 | #define CNMEM_API __declspec(dllexport) 40 | #else 41 | #define CNMEM_API __declspec(dllimport) 42 | #endif 43 | #else 44 | #ifdef CNMEM_DLLEXPORT 45 | #define CNMEM_API __attribute__((visibility ("default"))) 46 | #else 47 | #define CNMEM_API 48 | #endif 49 | #endif 50 | 51 | #define CNMEM_VERSION 100 // It corresponds to 1.0.0 52 | 53 | #ifdef __cplusplus 54 | extern "C" { 55 | #endif 56 | 57 | /* ********************************************************************************************* */ 58 | 59 | typedef enum 60 | { 61 | CNMEM_STATUS_SUCCESS = 0, 62 | CNMEM_STATUS_CUDA_ERROR, 63 | CNMEM_STATUS_INVALID_ARGUMENT, 64 | CNMEM_STATUS_NOT_INITIALIZED, 65 | CNMEM_STATUS_OUT_OF_MEMORY, 66 | CNMEM_STATUS_UNKNOWN_ERROR 67 | } cnmemStatus_t; 68 | 69 | /* ********************************************************************************************* */ 70 | 71 | typedef enum 72 | { 73 | CNMEM_FLAGS_DEFAULT = 0, /// Default flags. 74 | CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. 75 | CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. 76 | } cnmemManagerFlags_t; 77 | 78 | /* ********************************************************************************************* */ 79 | 80 | typedef struct cnmemDevice_t_ 81 | { 82 | /** The device number. */ 83 | int device; 84 | /** The size to allocate for that device. If 0, the implementation chooses the size. */ 85 | size_t size; 86 | /** The number of named streams associated with the device. The NULL stream is not counted. */ 87 | int numStreams; 88 | /** The streams associated with the device. It can be NULL. The NULL stream is managed. */ 89 | cudaStream_t *streams; 90 | /** The size reserved for each streams. It can be 0. */ 91 | size_t *streamSizes; 92 | 93 | } cnmemDevice_t; 94 | 95 | /** 96 | * \brief Initialize the library and allocate memory on the listed devices. 97 | * 98 | * For each device, an internal memory manager is created and the specified amount of memory is 99 | * allocated (it is the size defined in device[i].size). For each, named stream an additional 100 | * memory manager is created. Currently, it is implemented as a tree of memory managers: A root 101 | * manager for the device and a list of children, one for each named stream. 102 | * 103 | * This function must be called before any other function in the library. It has to be called 104 | * by a single thread since it is not thread-safe. 105 | * 106 | * \return 107 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 108 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 109 | * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, 110 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. 111 | */ 112 | cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags); 113 | 114 | /** 115 | * \brief Release all the allocated memory. 116 | * 117 | * This function must be called by a single thread and after all threads that called 118 | * cnmemMalloc/cnmemFree have joined. This function is not thread-safe. 
119 | * 120 | * \return 121 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 122 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 123 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 124 | */ 125 | cnmemStatus_t CNMEM_API cnmemFinalize(); 126 | 127 | /** 128 | * \brief Increase the internal reference counter of the context object. 129 | * 130 | * This function increases the internal reference counter of the library. The purpose of that 131 | * reference counting mechanism is to give more control to the user over the lifetime of the 132 | * library. It is useful with scoped memory allocation which may be destroyed in a final 133 | * memory collection after the end of main(). That function is thread-safe. 134 | * 135 | * \return 136 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 137 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 138 | */ 139 | cnmemStatus_t CNMEM_API cnmemRetain(); 140 | 141 | /** 142 | * \brief Decrease the internal reference counter of the context object. 143 | * 144 | * This function decreases the internal reference counter of the library. The purpose of that 145 | * reference counting mechanism is to give more control to the user over the lifetime of the 146 | * library. It is useful with scoped memory allocation which may be destroyed in a final 147 | * memory collection after the end of main(). That function is thread-safe. 148 | * 149 | * You can use \c cnmemRelease to explicitly finalize the library. 150 | * 151 | * \return 152 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 153 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 154 | */ 155 | cnmemStatus_t CNMEM_API cnmemRelease(); 156 | 157 | /** 158 | * \brief Add a new stream to the pool of managed streams on a device. 159 | * 160 | * This function registers a new stream into a device memory manager. It is thread-safe. 161 | * 162 | * \return 163 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 164 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 165 | */ 166 | cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream); 167 | 168 | /** 169 | * \brief Allocate memory. 170 | * 171 | * This function allocates memory and initializes a pointer to device memory. If no memory 172 | * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. 173 | * 174 | * The behavior of that function is the following: 175 | * 176 | * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device 177 | * memory. If there's a buffer of size larger or equal to the requested size in the list of 178 | * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow 179 | * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls 180 | * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not 181 | * allowed to grow, the manager attempts to steal memory from one of its children (unless 182 | * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns 183 | * CNMEM_STATUS_OUT_OF_MEMORY. 184 | * 185 | * - If the stream is a named stream, the initial request goes to the memory manager associated 186 | * with that stream. If a free node is available in the lists of that manager, it is returned. 
187 | * Otherwise, the request is passed to the root node and works as if the request were made on 188 | * the NULL stream. 189 | * 190 | * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the 191 | * mechanism to steal memory from the children induces GPU synchronizations (the manager has to 192 | * make sure no kernel uses a given buffer before stealing it) and it the execution is 193 | * sequential (in a multi-threaded context, the code is executed in a critical section inside 194 | * the cnmem library - no need for the user to wrap cnmemMalloc with locks). 195 | * 196 | * \return 197 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 198 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 199 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, 200 | * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, 201 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 202 | */ 203 | cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream); 204 | 205 | /** 206 | * \brief Release memory. 207 | * 208 | * This function releases memory and recycles a memory block in the manager. This function is 209 | * thread safe. 210 | * 211 | * \return 212 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 213 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 214 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, 215 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 216 | */ 217 | cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream); 218 | 219 | /* ********************************************************************************************* */ 220 | /* Utility functions. */ 221 | /* ********************************************************************************************* */ 222 | 223 | /** 224 | * \brief Returns the amount of memory managed by the memory manager associated with a stream. 225 | * 226 | * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple- 227 | * xity linear in the number of allocated blocks so do not call it in performance critical 228 | * sections. 229 | * 230 | * \return 231 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 232 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 233 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 234 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 235 | */ 236 | cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream); 237 | 238 | /** 239 | * \brief Print a list of nodes to a file. 240 | * 241 | * This function is intended to be used in case of complex scenarios to help understand the 242 | * behaviour of the memory managers/application. It is thread safe. 243 | * 244 | * \return 245 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 246 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 247 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 248 | * or free_mem == 0, 249 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 250 | */ 251 | cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream); 252 | 253 | /** 254 | * \brief Converts a cnmemStatus_t value to a string. 
255 | */ 256 | const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status); 257 | 258 | /* ********************************************************************************************* */ 259 | 260 | #ifdef __cplusplus 261 | } // extern "C" 262 | #endif 263 | 264 | -------------------------------------------------------------------------------- /include/cnmem.h: -------------------------------------------------------------------------------- 1 | /* ********************************************************************** 2 | * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | * ********************************************************************** */ 28 | #pragma once 29 | 30 | #ifdef __cplusplus 31 | #include "cstdio" 32 | #else 33 | #include "stdio.h" 34 | #endif 35 | #include "cuda_runtime_api.h" 36 | 37 | #if defined(_MSC_VER) || defined(WIN32) 38 | #ifdef CNMEM_DLLEXPORT 39 | #define CNMEM_API __declspec(dllexport) 40 | #else 41 | #define CNMEM_API __declspec(dllimport) 42 | #endif 43 | #else 44 | #ifdef CNMEM_DLLEXPORT 45 | #define CNMEM_API __attribute__((visibility ("default"))) 46 | #else 47 | #define CNMEM_API 48 | #endif 49 | #endif 50 | 51 | #define CNMEM_VERSION 100 // It corresponds to 1.0.0 52 | 53 | #ifdef __cplusplus 54 | extern "C" { 55 | #endif 56 | 57 | /* ********************************************************************************************* */ 58 | 59 | typedef enum 60 | { 61 | CNMEM_STATUS_SUCCESS = 0, 62 | CNMEM_STATUS_CUDA_ERROR, 63 | CNMEM_STATUS_INVALID_ARGUMENT, 64 | CNMEM_STATUS_NOT_INITIALIZED, 65 | CNMEM_STATUS_OUT_OF_MEMORY, 66 | CNMEM_STATUS_UNKNOWN_ERROR 67 | } cnmemStatus_t; 68 | 69 | /* ********************************************************************************************* */ 70 | 71 | typedef enum 72 | { 73 | CNMEM_FLAGS_DEFAULT = 0, /// Default flags. 74 | CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. 
75 | CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. 76 | } cnmemManagerFlags_t; 77 | 78 | /* ********************************************************************************************* */ 79 | 80 | typedef struct cnmemDevice_t_ 81 | { 82 | /** The device number. */ 83 | int device; 84 | /** The size to allocate for that device. If 0, the implementation chooses the size. */ 85 | size_t size; 86 | /** The number of named streams associated with the device. The NULL stream is not counted. */ 87 | int numStreams; 88 | /** The streams associated with the device. It can be NULL. The NULL stream is managed. */ 89 | cudaStream_t *streams; 90 | /** The size reserved for each streams. It can be 0. */ 91 | size_t *streamSizes; 92 | 93 | } cnmemDevice_t; 94 | 95 | /** 96 | * \brief Initialize the library and allocate memory on the listed devices. 97 | * 98 | * For each device, an internal memory manager is created and the specified amount of memory is 99 | * allocated (it is the size defined in device[i].size). For each, named stream an additional 100 | * memory manager is created. Currently, it is implemented as a tree of memory managers: A root 101 | * manager for the device and a list of children, one for each named stream. 102 | * 103 | * This function must be called before any other function in the library. It has to be called 104 | * by a single thread since it is not thread-safe. 105 | * 106 | * \return 107 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 108 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 109 | * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, 110 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. 111 | */ 112 | cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags); 113 | 114 | /** 115 | * \brief Release all the allocated memory. 116 | * 117 | * This function must be called by a single thread and after all threads that called 118 | * cnmemMalloc/cnmemFree have joined. This function is not thread-safe. 119 | * 120 | * \return 121 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 122 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 123 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 124 | */ 125 | cnmemStatus_t CNMEM_API cnmemFinalize(); 126 | 127 | /** 128 | * \brief Increase the internal reference counter of the context object. 129 | * 130 | * This function increases the internal reference counter of the library. The purpose of that 131 | * reference counting mechanism is to give more control to the user over the lifetime of the 132 | * library. It is useful with scoped memory allocation which may be destroyed in a final 133 | * memory collection after the end of main(). That function is thread-safe. 134 | * 135 | * \return 136 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 137 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 138 | */ 139 | cnmemStatus_t CNMEM_API cnmemRetain(); 140 | 141 | /** 142 | * \brief Decrease the internal reference counter of the context object. 143 | * 144 | * This function decreases the internal reference counter of the library. The purpose of that 145 | * reference counting mechanism is to give more control to the user over the lifetime of the 146 | * library. It is useful with scoped memory allocation which may be destroyed in a final 147 | * memory collection after the end of main(). 
That function is thread-safe. 148 | * 149 | * You can use \c cnmemRelease to explicitly finalize the library. 150 | * 151 | * \return 152 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 153 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 154 | */ 155 | cnmemStatus_t CNMEM_API cnmemRelease(); 156 | 157 | /** 158 | * \brief Add a new stream to the pool of managed streams on a device. 159 | * 160 | * This function registers a new stream into a device memory manager. It is thread-safe. 161 | * 162 | * \return 163 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 164 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 165 | */ 166 | cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream); 167 | 168 | /** 169 | * \brief Allocate memory. 170 | * 171 | * This function allocates memory and initializes a pointer to device memory. If no memory 172 | * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. 173 | * 174 | * The behavior of that function is the following: 175 | * 176 | * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device 177 | * memory. If there's a buffer of size larger or equal to the requested size in the list of 178 | * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow 179 | * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls 180 | * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not 181 | * allowed to grow, the manager attempts to steal memory from one of its children (unless 182 | * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns 183 | * CNMEM_STATUS_OUT_OF_MEMORY. 184 | * 185 | * - If the stream is a named stream, the initial request goes to the memory manager associated 186 | * with that stream. If a free node is available in the lists of that manager, it is returned. 187 | * Otherwise, the request is passed to the root node and works as if the request were made on 188 | * the NULL stream. 189 | * 190 | * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the 191 | * mechanism to steal memory from the children induces GPU synchronizations (the manager has to 192 | * make sure no kernel uses a given buffer before stealing it) and it the execution is 193 | * sequential (in a multi-threaded context, the code is executed in a critical section inside 194 | * the cnmem library - no need for the user to wrap cnmemMalloc with locks). 195 | * 196 | * \return 197 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 198 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 199 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, 200 | * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, 201 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 202 | */ 203 | cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream); 204 | 205 | /** 206 | * \brief Release memory. 207 | * 208 | * This function releases memory and recycles a memory block in the manager. This function is 209 | * thread safe. 210 | * 211 | * \return 212 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 213 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 214 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. 
For example, ptr == 0, 215 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 216 | */ 217 | cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream); 218 | 219 | /* ********************************************************************************************* */ 220 | /* Utility functions. */ 221 | /* ********************************************************************************************* */ 222 | 223 | /** 224 | * \brief Returns the amount of memory managed by the memory manager associated with a stream. 225 | * 226 | * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple- 227 | * xity linear in the number of allocated blocks so do not call it in performance critical 228 | * sections. 229 | * 230 | * \return 231 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 232 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 233 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 234 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 235 | */ 236 | cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream); 237 | 238 | /** 239 | * \brief Print a list of nodes to a file. 240 | * 241 | * This function is intended to be used in case of complex scenarios to help understand the 242 | * behaviour of the memory managers/application. It is thread safe. 243 | * 244 | * \return 245 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 246 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 247 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 248 | * or free_mem == 0, 249 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 250 | */ 251 | cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream); 252 | 253 | /** 254 | * \brief Print a list of nodes to a file with used and free blocks together in ascending order of address. 255 | * 256 | * This function is intended to be used in case of complex scenarios to help understand the 257 | * behaviour of the memory managers/application. It is thread safe. 258 | * 259 | * \return 260 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 261 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 262 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 263 | * or free_mem == 0, 264 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 265 | */ 266 | cnmemStatus_t CNMEM_API cnmemPrintMemoryStateTogether(FILE *file, cudaStream_t stream); 267 | 268 | /** 269 | * \brief Converts a cnmemStatus_t value to a string. 
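 *
 * Illustrative error-reporting use (not from the original source; "ptr" and "bytes" are
 * assumed to be defined by the caller):
 *
 * \code
 * cnmemStatus_t status = cnmemMalloc(&ptr, bytes, NULL);
 * if (status != CNMEM_STATUS_SUCCESS)
 *     fprintf(stderr, "cnmem error: %s\n", cnmemGetErrorString(status));
 * \endcode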
270 | */ 271 | const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status); 272 | 273 | /** 274 | * \brief Gets the size of last block, according to address if free, otherwise gives 0 275 | * 276 | * Always returns CNMEM_STATUS_SUCCESS 277 | */ 278 | cnmemStatus_t CNMEM_API cnmemGetLastFreeBlockSize(std::size_t &size, cudaStream_t stream); 279 | 280 | /** 281 | * \brief Gets the size of largest free block, if exists, otherwise gives 0 282 | * 283 | * Always returns CNMEM_STATUS_SUCCESS 284 | */ 285 | cnmemStatus_t CNMEM_API cnmemGetLargestFreeBlockSize(std::size_t &size, cudaStream_t stream); 286 | 287 | /* ********************************************************************************************* */ 288 | 289 | #ifdef __cplusplus 290 | } // extern "C" 291 | #endif 292 | 293 | -------------------------------------------------------------------------------- /src/slow_conv.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 1000, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) 
&n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | 111 | int main(int argc, char *argv[]) { 112 | 113 | // int num_train = 100 * batch_size, num_val = batch_size; 114 | // void *X_train = malloc(num_train * input_channels * sizeof(float)); 115 | // int *y_train = (int *)malloc(num_train * sizeof(int)); 116 | // void *X_val = malloc(num_val * input_channels * sizeof(float)); 117 | // int *y_val = (int *)malloc(num_val * sizeof(int)); 118 | // for (int i = 0; i < num_train; i++) { 119 | // for (int j = 0; j < input_channels; j++) 120 | // ((float *)X_train)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 121 | // y_train[i] = 0; 122 | // } 123 | 124 | // for (int i = 0; i < num_val; i++) { 125 | // for (int j = 0; j < input_channels; j++) 126 | // ((float *)X_val)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 127 | // y_val[i] = rand() % 2; 128 | // } 129 | 130 | // int rows = 28, cols = 28, channels = 1; 131 | // vector > train_images, test_images; 132 | // vector train_labels, test_labels; 133 | // readMNIST(train_images, test_images, train_labels, test_labels); 134 | // float *f_train_images, *f_train_labels, *f_test_images, *f_test_labels; 135 | float *f_train_images, *f_test_images; 136 | int *f_train_labels, *f_test_labels; 137 | int rows = 156, cols = 156, channels = 3; 138 | int input_size = rows * cols * channels; 139 | f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 140 | f_train_labels = (int *)malloc(num_train * sizeof(int)); 141 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 142 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 143 | 144 | float *mean_image; 145 | mean_image = (float *)malloc(input_size * sizeof(float)); 146 | 147 | for (int i = 0; i < input_size; i++) { 148 | mean_image[i] = 0; 149 | for (int k = 0; k < num_train; k++) { 150 | mean_image[i] += f_train_images[k * input_size + i]; 151 | } 152 | mean_image[i] /= num_train; 153 | } 154 | 155 | 156 | for (int i = 0; i < num_train; i++) { 157 | for (int j = 0; j < input_size; j++) { 158 | f_train_images[i * input_size + j] -= mean_image[j]; 159 | } 160 | } 161 | 162 | for (int i = 0; i < num_test; i++) { 163 | for (int j = 0; j < input_size; j++) { 164 | f_test_images[i * input_size + j] -= mean_image[j]; 165 | } 166 | 167 | } 168 | 169 | // int input_channels = rows * cols * channels * 3, hidden_channels1 = 50, hidden_channels2 = 100, output_channels = 10; 170 | // vector layer_specifier; 171 | // ConvDescriptor layer0; 172 | // LayerSpecifier temp; 173 | // layer0.initializeValues(1, 3, 3, 3, rows, cols, 1, 1, 1, 1); 174 | // temp.initPointer(CONV); 175 | // *((ConvDescriptor *)temp.params) = layer0; 176 | // layer_specifier.push_back(temp); 177 | // ActivationDescriptor layer0_actv; 178 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 179 | // temp.initPointer(ACTV); 180 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 181 | // layer_specifier.push_back(temp); 182 | 183 | // BatchNormDescriptor layer0_bn; 184 | 185 | // for (int i = 0; i < 200; i++) { 186 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 
0.1, 3, rows, cols); 187 | // temp.initPointer(BATCHNORM); 188 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 189 | // layer_specifier.push_back(temp); 190 | 191 | // layer0.initializeValues(3, 3, 3, 3, rows, cols, 1, 1, 1, 1); 192 | // temp.initPointer(CONV); 193 | // *((ConvDescriptor *)temp.params) = layer0; 194 | // layer_specifier.push_back(temp); 195 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 196 | // temp.initPointer(ACTV); 197 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 198 | // layer_specifier.push_back(temp); 199 | // } 200 | 201 | // PoolingDescriptor layer0_pool; 202 | // layer0_pool.initializeValues(3, 2, 2, rows, cols, 0, 0, 2, 2, POOLING_MAX); 203 | // temp.initPointer(POOLING); 204 | // *((PoolingDescriptor *)temp.params) = layer0_pool; 205 | // layer_specifier.push_back(temp); 206 | 207 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 208 | // temp.initPointer(BATCHNORM); 209 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 210 | // layer_specifier.push_back(temp); 211 | 212 | // // DropoutDescriptor layer0_dropout; 213 | // // layer0_dropout.initializeValues(0.2, 3, rows / 2, cols / 2); 214 | // // temp.initPointer(DROPOUT); 215 | // // *((DropoutDescriptor *)temp.params) = layer0_dropout; 216 | // // layer_specifier.push_back(temp); 217 | 218 | // layer0.initializeValues(3, 3, 3, 3, rows / 2, cols / 2, 1, 1, 1, 1); 219 | // temp.initPointer(CONV); 220 | // *((ConvDescriptor *)temp.params) = layer0; 221 | // layer_specifier.push_back(temp); 222 | // layer0_actv.initializeValues(RELU, 3, rows / 2, cols / 2); 223 | // temp.initPointer(ACTV); 224 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 225 | // layer_specifier.push_back(temp); 226 | 227 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 228 | // temp.initPointer(BATCHNORM); 229 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 230 | // layer_specifier.push_back(temp); 231 | 232 | // FCDescriptor layer1; 233 | // layer1.initializeValues(input_channels, hidden_channels1); 234 | // temp.initPointer(FULLY_CONNECTED); 235 | // *((FCDescriptor *)(temp.params)) = layer1; 236 | // layer_specifier.push_back(temp); 237 | 238 | // temp.initPointer(ACTV); 239 | // ActivationDescriptor layer1_actv; 240 | // layer1_actv.initializeValues(RELU, hidden_channels1, 1, 1); 241 | // *((ActivationDescriptor *)temp.params) = layer1_actv; 242 | // layer_specifier.push_back(temp); 243 | 244 | // layer0_bn.initializeValues(BATCHNORM_PER_ACTIVATION, 1e-5, 0.1, hidden_channels1, 1, 1); 245 | // temp.initPointer(BATCHNORM); 246 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 247 | // layer_specifier.push_back(temp); 248 | 249 | // temp.initPointer(FULLY_CONNECTED); 250 | // FCDescriptor layer2; 251 | // layer2.initializeValues(hidden_channels1, output_channels); 252 | // *((FCDescriptor *)temp.params) = layer2; 253 | // layer_specifier.push_back(temp); 254 | 255 | // // temp.initPointer(FULLY_CONNECTED); 256 | // // FCDescriptor layer3; 257 | // // layer3.initializeValues(hidden_channels2, output_channels); 258 | // // *((FCDescriptor *)temp.params) = layer3; 259 | // // layer_specifier.push_back(temp); 260 | 261 | // temp.initPointer(SOFTMAX); 262 | // SoftmaxDescriptor smax; 263 | // smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, output_channels, 1, 1); 264 | // *((SoftmaxDescriptor *)(temp.params)) = smax; 265 | // layer_specifier.push_back(temp); 266 | 267 | // AlexNet 268 | vector 
layer_specifier; 269 | { 270 | ConvDescriptor layer0; 271 | layer0.initializeValues(3, 32, 1, 1, 156, 156, 0, 0, 1, 1, RELU); 272 | LayerSpecifier temp; 273 | temp.initPointer(CONV); 274 | *((ConvDescriptor *)temp.params) = layer0; 275 | layer_specifier.push_back(temp); 276 | } 277 | { 278 | ConvDescriptor layer0; 279 | layer0.initializeValues(32, 32, 1, 1, 156, 156, 0, 0, 1, 1, RELU); 280 | LayerSpecifier temp; 281 | temp.initPointer(CONV); 282 | *((ConvDescriptor *)temp.params) = layer0; 283 | layer_specifier.push_back(temp); 284 | } 285 | { 286 | ConvDescriptor layer0; 287 | layer0.initializeValues(32, 3, 1, 1, 156, 156, 0, 0, 1, 1, RELU); 288 | LayerSpecifier temp; 289 | temp.initPointer(CONV); 290 | *((ConvDescriptor *)temp.params) = layer0; 291 | layer_specifier.push_back(temp); 292 | } 293 | { 294 | FCDescriptor layer8; 295 | layer8.initializeValues(156 * 156 * 3, 10); 296 | LayerSpecifier temp; 297 | temp.initPointer(FULLY_CONNECTED); 298 | *((FCDescriptor *)temp.params) = layer8; 299 | layer_specifier.push_back(temp); 300 | } 301 | { 302 | SoftmaxDescriptor layer11; 303 | layer11.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 10, 1, 1); 304 | LayerSpecifier temp; 305 | temp.initPointer(SOFTMAX); 306 | *((SoftmaxDescriptor *)temp.params) = layer11; 307 | layer_specifier.push_back(temp); 308 | } 309 | 310 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 311 | vDNNType vdnn_type = vDNN_DYN; 312 | string filename("vdnn_dyn"); 313 | if (argc == 3) { 314 | filename.assign("vdnn"); 315 | // argv[1] - layers to offload, argv[2] - conv algo to use 316 | if (strcmp(argv[1], "dyn") == 0) { 317 | vdnn_type = vDNN_DYN; 318 | filename.append("_dyn"); 319 | } 320 | else if (strcmp(argv[1], "conv") == 0) { 321 | vdnn_type = vDNN_CONV; 322 | filename.append("_conv"); 323 | } 324 | else if (strcmp(argv[1], "all") == 0) { 325 | vdnn_type = vDNN_ALL; 326 | filename.append("_all"); 327 | } 328 | else { 329 | printf("invalid argument.. using vdnn dynamic\n"); 330 | filename.assign("vdnn_dyn"); 331 | } 332 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0)) { 333 | if (strcmp(argv[2], "p") == 0) { 334 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 335 | filename.append("_p"); 336 | } 337 | else if (strcmp(argv[2], "m") == 0) { 338 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 339 | filename.append("_m"); 340 | } 341 | else { 342 | printf("invalid argument.. 
using vdnn dynamic\n"); 343 | filename.assign("vdnn_dyn"); 344 | } 345 | } 346 | } 347 | 348 | int batch_size = 128; 349 | long long dropout_seed = 1; 350 | float softmax_eps = 1e-8; 351 | float init_std_dev = 0.1; 352 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 353 | 354 | int num_epoch = 1000; 355 | double learning_rate = 1e-3; 356 | double learning_rate_decay = 0.9; 357 | 358 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 359 | vector loss; 360 | vector time; 361 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 362 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 363 | printTimes(time, filename); 364 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 365 | 366 | } 367 | 368 | void printTimes(vector &time, string filename) { 369 | float mean_time = 0.0; 370 | float std_dev = 0.0; 371 | int N = time.size(); 372 | for (int i = 0; i < N; i++) { 373 | mean_time += time[i]; 374 | } 375 | mean_time /= N; 376 | for (int i = 0; i < N; i++) { 377 | std_dev += pow(time[i] - mean_time, 2); 378 | } 379 | std_dev /= N; 380 | pow(std_dev, 0.5); 381 | cout << "Average time: " << mean_time << endl; 382 | cout << "Standard deviation: " << std_dev << endl; 383 | 384 | filename.append(".dat"); 385 | fstream f; 386 | f.open(filename.c_str(), ios_base::out); 387 | 388 | for (int i = 0; i < N; i++) { 389 | f << time[i] << endl; 390 | } 391 | f << "mean_time: " << mean_time << endl; 392 | f << "standard_deviation: " << std_dev << endl; 393 | f.close(); 394 | 395 | } 396 | 397 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 398 | filename.append("_lag.dat"); 399 | 400 | fstream f; 401 | f.open(filename.c_str(), ios_base::out); 402 | 403 | int N = fwd_vdnn_lag.size(); 404 | for (int i = 0; i < N; i++) { 405 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 406 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 407 | } 408 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 409 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 410 | } 411 | f << endl; 412 | } 413 | f.close(); 414 | } -------------------------------------------------------------------------------- /src/vgg_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 128, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = 
filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) &n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename); 111 | 112 | int main(int argc, char *argv[]) { 113 | 114 | 115 | // Allocate space in memory and read images 116 | float *f_train_images, *f_test_images; 117 | int *f_train_labels, *f_test_labels; 118 | int rows = 224, cols = 224, channels = 3; 119 | int input_size = rows * cols * channels; 120 | checkCudaErrors(cudaMallocHost(&f_train_images, num_train * input_size * sizeof(float))); 121 | checkCudaErrors(cudaMallocHost(&f_train_labels, num_train * sizeof(int))); 122 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 123 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 124 | 125 | // read images here 126 | // ... 127 | // ... 128 | // ... 
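	// The image loading above is intentionally left elided in this repository. For a pure
	// timing run, one possible (illustrative, not original) way to populate the pinned
	// buffers is with random values, mirroring the commented-out generator in main.cu:
	//
	//   srand(42);
	//   for (int i = 0; i < num_train * input_size; i++)
	//       f_train_images[i] = (rand() % 1000) * 1.0f / 1000;   // pixel values in [0, 1)
	//   for (int i = 0; i < num_train; i++)
	//       f_train_labels[i] = rand() % 1000;                    // 1000 ImageNet-style classes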
129 | 130 | float *mean_image; 131 | mean_image = (float *)malloc(input_size * sizeof(float)); 132 | 133 | for (int i = 0; i < input_size; i++) { 134 | mean_image[i] = 0; 135 | for (int k = 0; k < num_train; k++) { 136 | mean_image[i] += f_train_images[k * input_size + i]; 137 | } 138 | mean_image[i] /= num_train; 139 | } 140 | 141 | 142 | for (int i = 0; i < num_train; i++) { 143 | for (int j = 0; j < input_size; j++) { 144 | f_train_images[i * input_size + j] -= mean_image[j]; 145 | } 146 | } 147 | 148 | for (int i = 0; i < num_test; i++) { 149 | for (int j = 0; j < input_size; j++) { 150 | f_test_images[i * input_size + j] -= mean_image[j]; 151 | } 152 | 153 | } 154 | 155 | 156 | // VGG specification 157 | // Look at user_iface.h for function declaration to initialize values 158 | vector layer_specifier; 159 | { 160 | ConvDescriptor part0_conv0; 161 | part0_conv0.initializeValues(3, 64, 3, 3, 224, 224, 1, 1, 1, 1, RELU); 162 | LayerSpecifier temp; 163 | temp.initPointer(CONV); 164 | *((ConvDescriptor *)temp.params) = part0_conv0; 165 | layer_specifier.push_back(temp); 166 | } 167 | { 168 | ConvDescriptor part0_conv1; 169 | part0_conv1.initializeValues(64, 64, 3, 3, 224, 224, 1, 1, 1, 1, RELU); 170 | LayerSpecifier temp; 171 | temp.initPointer(CONV); 172 | *((ConvDescriptor *)temp.params) = part0_conv1; 173 | layer_specifier.push_back(temp); 174 | } 175 | { 176 | PoolingDescriptor pool0; 177 | pool0.initializeValues(64, 2, 2, 224, 224, 0, 0, 2, 2, POOLING_MAX); 178 | LayerSpecifier temp; 179 | temp.initPointer(POOLING); 180 | *((PoolingDescriptor *)temp.params) = pool0; 181 | layer_specifier.push_back(temp); 182 | } 183 | { 184 | ConvDescriptor part1_conv0; 185 | part1_conv0.initializeValues(64, 128, 3, 3, 112, 112, 1, 1, 1, 1, RELU); 186 | LayerSpecifier temp; 187 | temp.initPointer(CONV); 188 | *((ConvDescriptor *)temp.params) = part1_conv0; 189 | layer_specifier.push_back(temp); 190 | } 191 | { 192 | ConvDescriptor part1_conv1; 193 | part1_conv1.initializeValues(128, 128, 3, 3, 112, 112, 1, 1, 1, 1, RELU); 194 | LayerSpecifier temp; 195 | temp.initPointer(CONV); 196 | *((ConvDescriptor *)temp.params) = part1_conv1; 197 | layer_specifier.push_back(temp); 198 | } 199 | { 200 | PoolingDescriptor pool1; 201 | pool1.initializeValues(128, 2, 2, 112, 112, 0, 0, 2, 2, POOLING_MAX); 202 | LayerSpecifier temp; 203 | temp.initPointer(POOLING); 204 | *((PoolingDescriptor *)temp.params) = pool1; 205 | layer_specifier.push_back(temp); 206 | } 207 | { 208 | ConvDescriptor part2_conv0; 209 | part2_conv0.initializeValues(128, 256, 3, 3, 56, 56, 1, 1, 1, 1, RELU); 210 | LayerSpecifier temp; 211 | temp.initPointer(CONV); 212 | *((ConvDescriptor *)temp.params) = part2_conv0; 213 | layer_specifier.push_back(temp); 214 | } 215 | { 216 | ConvDescriptor part2_conv1; 217 | part2_conv1.initializeValues(256, 256, 3, 3, 56, 56, 1, 1, 1, 1, RELU); 218 | LayerSpecifier temp; 219 | temp.initPointer(CONV); 220 | *((ConvDescriptor *)temp.params) = part2_conv1; 221 | layer_specifier.push_back(temp); 222 | } 223 | { 224 | ConvDescriptor part2_conv2; 225 | part2_conv2.initializeValues(256, 256, 3, 3, 56, 56, 1, 1, 1, 1, RELU); 226 | LayerSpecifier temp; 227 | temp.initPointer(CONV); 228 | *((ConvDescriptor *)temp.params) = part2_conv2; 229 | layer_specifier.push_back(temp); 230 | } 231 | { 232 | PoolingDescriptor pool2; 233 | pool2.initializeValues(256, 2, 2, 56, 56, 0, 0, 2, 2, POOLING_MAX); 234 | LayerSpecifier temp; 235 | temp.initPointer(POOLING); 236 | *((PoolingDescriptor *)temp.params) = pool2; 237 | 
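	// Spatial bookkeeping (each 2x2, stride-2 max pool halves H and W):
	// 224 -> 112 -> 56 -> 28 -> 14 -> 7 across the five pooling stages, which is why the
	// first fully-connected layer further below takes 7 * 7 * 512 = 25088 inputs.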
layer_specifier.push_back(temp); 238 | } 239 | { 240 | ConvDescriptor part3_conv0; 241 | part3_conv0.initializeValues(256, 512, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 242 | LayerSpecifier temp; 243 | temp.initPointer(CONV); 244 | *((ConvDescriptor *)temp.params) = part3_conv0; 245 | layer_specifier.push_back(temp); 246 | } 247 | { 248 | ConvDescriptor part3_conv1; 249 | part3_conv1.initializeValues(512, 512, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 250 | LayerSpecifier temp; 251 | temp.initPointer(CONV); 252 | *((ConvDescriptor *)temp.params) = part3_conv1; 253 | layer_specifier.push_back(temp); 254 | } 255 | { 256 | ConvDescriptor part3_conv2; 257 | part3_conv2.initializeValues(512, 512, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 258 | LayerSpecifier temp; 259 | temp.initPointer(CONV); 260 | *((ConvDescriptor *)temp.params) = part3_conv2; 261 | layer_specifier.push_back(temp); 262 | } 263 | { 264 | PoolingDescriptor pool3; 265 | pool3.initializeValues(512, 2, 2, 28, 28, 0, 0, 2, 2, POOLING_MAX); 266 | LayerSpecifier temp; 267 | temp.initPointer(POOLING); 268 | *((PoolingDescriptor *)temp.params) = pool3; 269 | layer_specifier.push_back(temp); 270 | } 271 | { 272 | ConvDescriptor part4_conv0; 273 | part4_conv0.initializeValues(512, 512, 3, 3, 14, 14, 1, 1, 1, 1, RELU); 274 | LayerSpecifier temp; 275 | temp.initPointer(CONV); 276 | *((ConvDescriptor *)temp.params) = part4_conv0; 277 | layer_specifier.push_back(temp); 278 | } 279 | { 280 | ConvDescriptor part4_conv1; 281 | part4_conv1.initializeValues(512, 512, 3, 3, 14, 14, 1, 1, 1, 1, RELU); 282 | LayerSpecifier temp; 283 | temp.initPointer(CONV); 284 | *((ConvDescriptor *)temp.params) = part4_conv1; 285 | layer_specifier.push_back(temp); 286 | } 287 | { 288 | ConvDescriptor part4_conv2; 289 | part4_conv2.initializeValues(512, 512, 3, 3, 14, 14, 1, 1, 1, 1, RELU); 290 | LayerSpecifier temp; 291 | temp.initPointer(CONV); 292 | *((ConvDescriptor *)temp.params) = part4_conv2; 293 | layer_specifier.push_back(temp); 294 | } 295 | { 296 | PoolingDescriptor pool3; 297 | pool3.initializeValues(512, 2, 2, 14, 14, 0, 0, 2, 2, POOLING_MAX); 298 | LayerSpecifier temp; 299 | temp.initPointer(POOLING); 300 | *((PoolingDescriptor *)temp.params) = pool3; 301 | layer_specifier.push_back(temp); 302 | } 303 | 304 | { 305 | FCDescriptor part5_fc0; 306 | part5_fc0.initializeValues(7 * 7 * 512, 4096, RELU); 307 | LayerSpecifier temp; 308 | temp.initPointer(FULLY_CONNECTED); 309 | *((FCDescriptor *)temp.params) = part5_fc0; 310 | layer_specifier.push_back(temp); 311 | } 312 | { 313 | FCDescriptor part5_fc1; 314 | part5_fc1.initializeValues(4096, 4096, RELU); 315 | LayerSpecifier temp; 316 | temp.initPointer(FULLY_CONNECTED); 317 | *((FCDescriptor *)temp.params) = part5_fc1; 318 | layer_specifier.push_back(temp); 319 | } 320 | { 321 | FCDescriptor part5_fc2; 322 | part5_fc2.initializeValues(4096, 1000); 323 | LayerSpecifier temp; 324 | temp.initPointer(FULLY_CONNECTED); 325 | *((FCDescriptor *)temp.params) = part5_fc2; 326 | layer_specifier.push_back(temp); 327 | } 328 | { 329 | SoftmaxDescriptor s_max; 330 | s_max.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 1000, 1, 1); 331 | LayerSpecifier temp; 332 | temp.initPointer(SOFTMAX); 333 | *((SoftmaxDescriptor *)temp.params) = s_max; 334 | layer_specifier.push_back(temp); 335 | } 336 | 337 | 338 | // reading command line input 339 | // argv[1] - vDNN scheme - dyn, all, conv, alternate_conv, argv[2] - performance_optimal or memory_optimal 340 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 341 | vDNNType vdnn_type = 
vDNN_DYN; 342 | string filename("vdnn_dyn"); 343 | if (argc == 3) { 344 | filename.assign("vdnn"); 345 | if (strcmp(argv[1], "dyn") == 0) { 346 | vdnn_type = vDNN_DYN; 347 | filename.append("_dyn"); 348 | } 349 | else if (strcmp(argv[1], "conv") == 0) { 350 | vdnn_type = vDNN_CONV; 351 | filename.append("_conv"); 352 | } 353 | else if (strcmp(argv[1], "all") == 0) { 354 | vdnn_type = vDNN_ALL; 355 | filename.append("_all"); 356 | } 357 | else if (strcmp(argv[1], "alternate_conv") == 0) { 358 | vdnn_type = vDNN_ALTERNATE_CONV; 359 | filename.append("_alternate_conv"); 360 | } 361 | else { 362 | printf("invalid argument.. using vdnn dynamic\n"); 363 | filename.assign("vdnn_dyn"); 364 | } 365 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0 or strcmp(argv[1], "alternate_conv") == 0)) { 366 | if (strcmp(argv[2], "p") == 0) { 367 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 368 | filename.append("_p"); 369 | } 370 | else if (strcmp(argv[2], "m") == 0) { 371 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 372 | filename.append("_m"); 373 | } 374 | else { 375 | printf("invalid argument.. using vdnn dynamic\n"); 376 | filename.assign("vdnn_dyn"); 377 | } 378 | } 379 | } 380 | 381 | 382 | int batch_size = 64; 383 | long long dropout_seed = 1; 384 | float softmax_eps = 1e-8; 385 | float init_std_dev = 0.1; 386 | // instantiating network object 387 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 388 | 389 | int num_epoch = 1000; 390 | double learning_rate = 1e-3; 391 | double learning_rate_decay = 0.9; 392 | 393 | // solver, which takes a network object and runs SGD on it 394 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 395 | vector loss; 396 | vector time; 397 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 398 | // trains for given number of steps (here 100). 
and gets computation/transfer times of each layer for each iteration 399 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 400 | printTimes(time, filename); 401 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 402 | 403 | vector > fwd_computation_time, bwd_computation_time; 404 | solver.getComputationTime(1, fwd_computation_time, bwd_computation_time); 405 | 406 | vector > fwd_transfer_time, bwd_transfer_time; 407 | solver.getTransferTime(1, fwd_transfer_time, bwd_transfer_time); 408 | 409 | printComputationTransferTimes(fwd_computation_time, bwd_computation_time, true, filename); 410 | printComputationTransferTimes(fwd_transfer_time, bwd_transfer_time, false, filename); 411 | 412 | } 413 | 414 | void printTimes(vector &time, string filename) { 415 | float mean_time = 0.0; 416 | float std_dev = 0.0; 417 | int N = time.size(); 418 | for (int i = 0; i < N; i++) { 419 | mean_time += time[i]; 420 | } 421 | mean_time /= N; 422 | for (int i = 0; i < N; i++) { 423 | std_dev += pow(time[i] - mean_time, 2); 424 | } 425 | std_dev /= N; 426 | std_dev = pow(std_dev, 0.5); 427 | cout << "Average time: " << mean_time << endl; 428 | cout << "Standard deviation: " << std_dev << endl; 429 | 430 | filename.append(".dat"); 431 | fstream f; 432 | f.open(filename.c_str(), ios_base::out); 433 | 434 | for (int i = 0; i < N; i++) { 435 | f << time[i] << endl; 436 | } 437 | f << "mean_time: " << mean_time << endl; 438 | f << "standard_deviation: " << std_dev << endl; 439 | f.close(); 440 | 441 | filename.append(".bin"); 442 | fstream f_bin; 443 | f_bin.open(filename.c_str(), ios_base::out); 444 | f_bin.write((char *)&N, sizeof(N)); 445 | for (int i = 0; i < N; i++) { 446 | f_bin.write((char *)&time[i], sizeof(time[i])); 447 | } 448 | f_bin.close(); 449 | 450 | } 451 | 452 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 453 | filename.append("_lag.dat"); 454 | 455 | fstream f; 456 | f.open(filename.c_str(), ios_base::out); 457 | 458 | int N = fwd_vdnn_lag.size(); 459 | for (int i = 0; i < N; i++) { 460 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 461 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 462 | } 463 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 464 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 465 | } 466 | f << endl; 467 | } 468 | f.close(); 469 | } 470 | 471 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename) { 472 | if (computation) 473 | filename.append("_compute_time.dat"); 474 | else 475 | filename.append("_transfer_time.dat"); 476 | 477 | fstream f; 478 | f.open(filename.c_str(), ios_base::out); 479 | 480 | int N = fwd_times.size(); 481 | for (int i = 0; i < N; i++) { 482 | for (int j = 0; j < fwd_times[i].size(); j++) { 483 | f << "fwd" << j << ": " << fwd_times[i][j] << endl; 484 | } 485 | for (int j = 0; j < bwd_times[i].size(); j++) { 486 | f << "bwd" << j << ": " << bwd_times[i][j] << endl; 487 | } 488 | f << endl; 489 | } 490 | f.close(); 491 | } -------------------------------------------------------------------------------- /src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 1000, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; 
i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) &n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | 111 | int main(int argc, char *argv[]) { 112 | 113 | // int num_train = 100 * batch_size, num_val = batch_size; 114 | // void *X_train = malloc(num_train * input_channels * sizeof(float)); 115 | // int *y_train = (int *)malloc(num_train * sizeof(int)); 116 | // void *X_val = malloc(num_val * input_channels * sizeof(float)); 117 | // int *y_val = (int *)malloc(num_val * sizeof(int)); 118 | // for (int i = 0; i < num_train; i++) { 119 | // for (int j = 0; j < input_channels; j++) 120 | // ((float *)X_train)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 121 | // y_train[i] = 0; 122 | // } 123 | 124 | // for (int i = 0; i < num_val; i++) { 125 | // for (int j = 0; j < input_channels; j++) 126 | // ((float *)X_val)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 127 | // y_val[i] = rand() % 2; 128 | // } 129 | 130 | 
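	// Note: the commented-out block above generates random inputs and the block below
	// would read 28x28 MNIST; the active path further down instead allocates 227x227x3
	// buffers sized for AlexNet and, as written, never fills them with data, so the run
	// measures time rather than accuracy.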
// int rows = 28, cols = 28, channels = 1; 131 | // vector > train_images, test_images; 132 | // vector train_labels, test_labels; 133 | // readMNIST(train_images, test_images, train_labels, test_labels); 134 | // float *f_train_images, *f_train_labels, *f_test_images, *f_test_labels; 135 | float *f_train_images, *f_test_images; 136 | int *f_train_labels, *f_test_labels; 137 | int rows = 227, cols = 227, channels = 3; 138 | int input_size = rows * cols * channels; 139 | f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 140 | f_train_labels = (int *)malloc(num_train * sizeof(int)); 141 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 142 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 143 | 144 | float *mean_image; 145 | mean_image = (float *)malloc(input_size * sizeof(float)); 146 | 147 | for (int i = 0; i < input_size; i++) { 148 | mean_image[i] = 0; 149 | for (int k = 0; k < num_train; k++) { 150 | mean_image[i] += f_train_images[k * input_size + i]; 151 | } 152 | mean_image[i] /= num_train; 153 | } 154 | 155 | 156 | for (int i = 0; i < num_train; i++) { 157 | for (int j = 0; j < input_size; j++) { 158 | f_train_images[i * input_size + j] -= mean_image[j]; 159 | } 160 | } 161 | 162 | for (int i = 0; i < num_test; i++) { 163 | for (int j = 0; j < input_size; j++) { 164 | f_test_images[i * input_size + j] -= mean_image[j]; 165 | } 166 | 167 | } 168 | 169 | // int input_channels = rows * cols * channels * 3, hidden_channels1 = 50, hidden_channels2 = 100, output_channels = 10; 170 | // vector layer_specifier; 171 | // ConvDescriptor layer0; 172 | // LayerSpecifier temp; 173 | // layer0.initializeValues(1, 3, 3, 3, rows, cols, 1, 1, 1, 1); 174 | // temp.initPointer(CONV); 175 | // *((ConvDescriptor *)temp.params) = layer0; 176 | // layer_specifier.push_back(temp); 177 | // ActivationDescriptor layer0_actv; 178 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 179 | // temp.initPointer(ACTV); 180 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 181 | // layer_specifier.push_back(temp); 182 | 183 | // BatchNormDescriptor layer0_bn; 184 | 185 | // for (int i = 0; i < 200; i++) { 186 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows, cols); 187 | // temp.initPointer(BATCHNORM); 188 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 189 | // layer_specifier.push_back(temp); 190 | 191 | // layer0.initializeValues(3, 3, 3, 3, rows, cols, 1, 1, 1, 1); 192 | // temp.initPointer(CONV); 193 | // *((ConvDescriptor *)temp.params) = layer0; 194 | // layer_specifier.push_back(temp); 195 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 196 | // temp.initPointer(ACTV); 197 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 198 | // layer_specifier.push_back(temp); 199 | // } 200 | 201 | // PoolingDescriptor layer0_pool; 202 | // layer0_pool.initializeValues(3, 2, 2, rows, cols, 0, 0, 2, 2, POOLING_MAX); 203 | // temp.initPointer(POOLING); 204 | // *((PoolingDescriptor *)temp.params) = layer0_pool; 205 | // layer_specifier.push_back(temp); 206 | 207 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 208 | // temp.initPointer(BATCHNORM); 209 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 210 | // layer_specifier.push_back(temp); 211 | 212 | // // DropoutDescriptor layer0_dropout; 213 | // // layer0_dropout.initializeValues(0.2, 3, rows / 2, cols / 2); 214 | // // temp.initPointer(DROPOUT); 215 | // // *((DropoutDescriptor *)temp.params) = layer0_dropout; 
216 | // // layer_specifier.push_back(temp); 217 | 218 | // layer0.initializeValues(3, 3, 3, 3, rows / 2, cols / 2, 1, 1, 1, 1); 219 | // temp.initPointer(CONV); 220 | // *((ConvDescriptor *)temp.params) = layer0; 221 | // layer_specifier.push_back(temp); 222 | // layer0_actv.initializeValues(RELU, 3, rows / 2, cols / 2); 223 | // temp.initPointer(ACTV); 224 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 225 | // layer_specifier.push_back(temp); 226 | 227 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 228 | // temp.initPointer(BATCHNORM); 229 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 230 | // layer_specifier.push_back(temp); 231 | 232 | // FCDescriptor layer1; 233 | // layer1.initializeValues(input_channels, hidden_channels1); 234 | // temp.initPointer(FULLY_CONNECTED); 235 | // *((FCDescriptor *)(temp.params)) = layer1; 236 | // layer_specifier.push_back(temp); 237 | 238 | // temp.initPointer(ACTV); 239 | // ActivationDescriptor layer1_actv; 240 | // layer1_actv.initializeValues(RELU, hidden_channels1, 1, 1); 241 | // *((ActivationDescriptor *)temp.params) = layer1_actv; 242 | // layer_specifier.push_back(temp); 243 | 244 | // layer0_bn.initializeValues(BATCHNORM_PER_ACTIVATION, 1e-5, 0.1, hidden_channels1, 1, 1); 245 | // temp.initPointer(BATCHNORM); 246 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 247 | // layer_specifier.push_back(temp); 248 | 249 | // temp.initPointer(FULLY_CONNECTED); 250 | // FCDescriptor layer2; 251 | // layer2.initializeValues(hidden_channels1, output_channels); 252 | // *((FCDescriptor *)temp.params) = layer2; 253 | // layer_specifier.push_back(temp); 254 | 255 | // // temp.initPointer(FULLY_CONNECTED); 256 | // // FCDescriptor layer3; 257 | // // layer3.initializeValues(hidden_channels2, output_channels); 258 | // // *((FCDescriptor *)temp.params) = layer3; 259 | // // layer_specifier.push_back(temp); 260 | 261 | // temp.initPointer(SOFTMAX); 262 | // SoftmaxDescriptor smax; 263 | // smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, output_channels, 1, 1); 264 | // *((SoftmaxDescriptor *)(temp.params)) = smax; 265 | // layer_specifier.push_back(temp); 266 | 267 | // AlexNet 268 | vector layer_specifier; 269 | { 270 | ConvDescriptor layer0; 271 | layer0.initializeValues(3, 96, 11, 11, 227, 227, 0, 0, 4, 4); 272 | LayerSpecifier temp; 273 | temp.initPointer(CONV); 274 | *((ConvDescriptor *)temp.params) = layer0; 275 | layer_specifier.push_back(temp); 276 | } 277 | { 278 | PoolingDescriptor layer1; 279 | layer1.initializeValues(96, 3, 3, 55, 55, 0, 0, 2, 2, POOLING_MAX); 280 | LayerSpecifier temp; 281 | temp.initPointer(POOLING); 282 | *((PoolingDescriptor *)temp.params) = layer1; 283 | layer_specifier.push_back(temp); 284 | } 285 | { 286 | ConvDescriptor layer2; 287 | layer2.initializeValues(96, 256, 5, 5, 27, 27, 2, 2, 1, 1); 288 | LayerSpecifier temp; 289 | temp.initPointer(CONV); 290 | *((ConvDescriptor *)temp.params) = layer2; 291 | layer_specifier.push_back(temp); 292 | } 293 | { 294 | PoolingDescriptor layer3; 295 | layer3.initializeValues(256, 3, 3, 27, 27, 0, 0, 2, 2, POOLING_MAX); 296 | LayerSpecifier temp; 297 | temp.initPointer(POOLING); 298 | *((PoolingDescriptor *)temp.params) = layer3; 299 | layer_specifier.push_back(temp); 300 | } 301 | { 302 | ConvDescriptor layer4; 303 | layer4.initializeValues(256, 384, 3, 3, 13, 13, 1, 1, 1, 1); 304 | LayerSpecifier temp; 305 | temp.initPointer(CONV); 306 | *((ConvDescriptor *)temp.params) = layer4; 307 | 
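	// Shape check (out = (in + 2*pad - kernel) / stride + 1):
	// 227 -> (227-11)/4+1 = 55 after the 11x11/stride-4 conv, 55 -> 27 after the 3x3/stride-2 pool,
	// 27 -> 13 after the next pool, and 13 -> 6 after the last pool, giving 6*6*256 = 9216
	// inputs for the first fully-connected layer below.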
layer_specifier.push_back(temp); 308 | } 309 | { 310 | ConvDescriptor layer5; 311 | layer5.initializeValues(384, 384, 3, 3, 13, 13, 1, 1, 1, 1); 312 | LayerSpecifier temp; 313 | temp.initPointer(CONV); 314 | *((ConvDescriptor *)temp.params) = layer5; 315 | layer_specifier.push_back(temp); 316 | } 317 | { 318 | ConvDescriptor layer6; 319 | layer6.initializeValues(384, 256, 3, 3, 13, 13, 1, 1, 1, 1); 320 | LayerSpecifier temp; 321 | temp.initPointer(CONV); 322 | *((ConvDescriptor *)temp.params) = layer6; 323 | layer_specifier.push_back(temp); 324 | } 325 | { 326 | PoolingDescriptor layer7; 327 | layer7.initializeValues(256, 3, 3, 13, 13, 0, 0, 2, 2, POOLING_MAX); 328 | LayerSpecifier temp; 329 | temp.initPointer(POOLING); 330 | *((PoolingDescriptor *)temp.params) = layer7; 331 | layer_specifier.push_back(temp); 332 | } 333 | { 334 | FCDescriptor layer8; 335 | layer8.initializeValues(9216, 4096); 336 | LayerSpecifier temp; 337 | temp.initPointer(FULLY_CONNECTED); 338 | *((FCDescriptor *)temp.params) = layer8; 339 | layer_specifier.push_back(temp); 340 | } 341 | { 342 | FCDescriptor layer9; 343 | layer9.initializeValues(4096, 4096); 344 | LayerSpecifier temp; 345 | temp.initPointer(FULLY_CONNECTED); 346 | *((FCDescriptor *)temp.params) = layer9; 347 | layer_specifier.push_back(temp); 348 | } 349 | { 350 | FCDescriptor layer10; 351 | layer10.initializeValues(4096, 1000); 352 | LayerSpecifier temp; 353 | temp.initPointer(FULLY_CONNECTED); 354 | *((FCDescriptor *)temp.params) = layer10; 355 | layer_specifier.push_back(temp); 356 | } 357 | { 358 | SoftmaxDescriptor layer11; 359 | layer11.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 1000, 1, 1); 360 | LayerSpecifier temp; 361 | temp.initPointer(SOFTMAX); 362 | *((SoftmaxDescriptor *)temp.params) = layer11; 363 | layer_specifier.push_back(temp); 364 | } 365 | 366 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 367 | vDNNType vdnn_type = vDNN_DYN; 368 | string filename("vdnn_dyn"); 369 | if (argc == 3) { 370 | filename.assign("vdnn"); 371 | // argv[1] - layers to offload, argv[2] - conv algo to use 372 | if (strcmp(argv[1], "dyn") == 0) { 373 | vdnn_type = vDNN_DYN; 374 | filename.append("_dyn"); 375 | } 376 | else if (strcmp(argv[1], "conv") == 0) { 377 | vdnn_type = vDNN_CONV; 378 | filename.append("_conv"); 379 | } 380 | else if (strcmp(argv[1], "all") == 0) { 381 | vdnn_type = vDNN_ALL; 382 | filename.append("_all"); 383 | } 384 | else { 385 | printf("invalid argument.. using vdnn dynamic\n"); 386 | filename.assign("vdnn_dyn"); 387 | } 388 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0)) { 389 | if (strcmp(argv[2], "p") == 0) { 390 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 391 | filename.append("_p"); 392 | } 393 | else if (strcmp(argv[2], "m") == 0) { 394 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 395 | filename.append("_m"); 396 | } 397 | else { 398 | printf("invalid argument.. 
using vdnn dynamic\n"); 399 | filename.assign("vdnn_dyn"); 400 | } 401 | } 402 | } 403 | 404 | int batch_size = 128; 405 | long long dropout_seed = 1; 406 | float softmax_eps = 1e-8; 407 | float init_std_dev = 0.1; 408 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 409 | 410 | int num_epoch = 1000; 411 | double learning_rate = 1e-15; 412 | double learning_rate_decay = 0.9; 413 | 414 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 415 | vector loss; 416 | vector time; 417 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 418 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 419 | printTimes(time, filename); 420 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 421 | 422 | } 423 | 424 | void printTimes(vector &time, string filename) { 425 | float mean_time = 0.0; 426 | float std_dev = 0.0; 427 | int N = time.size(); 428 | for (int i = 0; i < N; i++) { 429 | mean_time += time[i]; 430 | } 431 | mean_time /= N; 432 | for (int i = 0; i < N; i++) { 433 | std_dev += pow(time[i] - mean_time, 2); 434 | } 435 | std_dev /= N; 436 | pow(std_dev, 0.5); 437 | cout << "Average time: " << mean_time << endl; 438 | cout << "Standard deviation: " << std_dev << endl; 439 | 440 | filename.append(".dat"); 441 | fstream f; 442 | f.open(filename.c_str(), ios_base::out); 443 | 444 | for (int i = 0; i < N; i++) { 445 | f << time[i] << endl; 446 | } 447 | f << "mean_time: " << mean_time << endl; 448 | f << "standard_deviation: " << std_dev << endl; 449 | f.close(); 450 | 451 | } 452 | 453 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 454 | filename.append("_lag.dat"); 455 | 456 | fstream f; 457 | f.open(filename.c_str(), ios_base::out); 458 | 459 | int N = fwd_vdnn_lag.size(); 460 | for (int i = 0; i < N; i++) { 461 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 462 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 463 | } 464 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 465 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 466 | } 467 | f << endl; 468 | } 469 | f.close(); 470 | } -------------------------------------------------------------------------------- /src/alexnet_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 512, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = 
filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) &n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename); 111 | 112 | int main(int argc, char *argv[]) { 113 | 114 | // int num_train = 100 * batch_size, num_val = batch_size; 115 | // void *X_train = malloc(num_train * input_channels * sizeof(float)); 116 | // int *y_train = (int *)malloc(num_train * sizeof(int)); 117 | // void *X_val = malloc(num_val * input_channels * sizeof(float)); 118 | // int *y_val = (int *)malloc(num_val * sizeof(int)); 119 | // for (int i = 0; i < num_train; i++) { 120 | // for (int j = 0; j < input_channels; j++) 121 | // ((float *)X_train)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 122 | // y_train[i] = 0; 123 | // } 124 | 125 | // for (int i = 0; i < num_val; i++) { 126 | // for (int j = 0; j < input_channels; j++) 127 | // ((float *)X_val)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 128 | // y_val[i] = rand() % 2; 129 | // } 130 | 131 | // int rows = 28, cols = 28, channels = 1; 132 | // vector > train_images, test_images; 133 | // vector train_labels, test_labels; 134 | // readMNIST(train_images, test_images, train_labels, test_labels); 135 | // float *f_train_images, *f_train_labels, *f_test_images, *f_test_labels; 136 | float *f_train_images, *f_test_images; 137 | int *f_train_labels, *f_test_labels; 138 | int rows = 227, cols = 227, channels = 3; 139 | int input_size = rows * cols * channels; 140 | // f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 141 | // f_train_labels = (int *)malloc(num_train * sizeof(int)); 142 | 
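// f_train_images / f_train_labels are allocated with cudaMallocHost (pinned,
// page-locked host memory) rather than malloc so that the vDNN offload and
// prefetch copies issued with cudaMemcpyAsync can overlap with compute;
// pageable malloc'd buffers would make those copies effectively synchronous.
// If these buffers were ever released, the matching call would be
// cudaFreeHost, not free.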
checkCudaErrors(cudaMallocHost(&f_train_images, num_train * input_size * sizeof(float))); 143 | checkCudaErrors(cudaMallocHost(&f_train_labels, num_train * sizeof(int))); 144 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 145 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 146 | 147 | float *mean_image; 148 | mean_image = (float *)malloc(input_size * sizeof(float)); 149 | 150 | for (int i = 0; i < input_size; i++) { 151 | mean_image[i] = 0; 152 | for (int k = 0; k < num_train; k++) { 153 | mean_image[i] += f_train_images[k * input_size + i]; 154 | } 155 | mean_image[i] /= num_train; 156 | } 157 | 158 | 159 | for (int i = 0; i < num_train; i++) { 160 | for (int j = 0; j < input_size; j++) { 161 | f_train_images[i * input_size + j] -= mean_image[j]; 162 | } 163 | } 164 | 165 | for (int i = 0; i < num_test; i++) { 166 | for (int j = 0; j < input_size; j++) { 167 | f_test_images[i * input_size + j] -= mean_image[j]; 168 | } 169 | 170 | } 171 | 172 | // int input_channels = rows * cols * channels * 3, hidden_channels1 = 50, hidden_channels2 = 100, output_channels = 10; 173 | // vector layer_specifier; 174 | // ConvDescriptor layer0; 175 | // LayerSpecifier temp; 176 | // layer0.initializeValues(1, 3, 3, 3, rows, cols, 1, 1, 1, 1); 177 | // temp.initPointer(CONV); 178 | // *((ConvDescriptor *)temp.params) = layer0; 179 | // layer_specifier.push_back(temp); 180 | // ActivationDescriptor layer0_actv; 181 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 182 | // temp.initPointer(ACTV); 183 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 184 | // layer_specifier.push_back(temp); 185 | 186 | // BatchNormDescriptor layer0_bn; 187 | 188 | // for (int i = 0; i < 200; i++) { 189 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows, cols); 190 | // temp.initPointer(BATCHNORM); 191 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 192 | // layer_specifier.push_back(temp); 193 | 194 | // layer0.initializeValues(3, 3, 3, 3, rows, cols, 1, 1, 1, 1); 195 | // temp.initPointer(CONV); 196 | // *((ConvDescriptor *)temp.params) = layer0; 197 | // layer_specifier.push_back(temp); 198 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 199 | // temp.initPointer(ACTV); 200 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 201 | // layer_specifier.push_back(temp); 202 | // } 203 | 204 | // PoolingDescriptor layer0_pool; 205 | // layer0_pool.initializeValues(3, 2, 2, rows, cols, 0, 0, 2, 2, POOLING_MAX); 206 | // temp.initPointer(POOLING); 207 | // *((PoolingDescriptor *)temp.params) = layer0_pool; 208 | // layer_specifier.push_back(temp); 209 | 210 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 211 | // temp.initPointer(BATCHNORM); 212 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 213 | // layer_specifier.push_back(temp); 214 | 215 | // // DropoutDescriptor layer0_dropout; 216 | // // layer0_dropout.initializeValues(0.2, 3, rows / 2, cols / 2); 217 | // // temp.initPointer(DROPOUT); 218 | // // *((DropoutDescriptor *)temp.params) = layer0_dropout; 219 | // // layer_specifier.push_back(temp); 220 | 221 | // layer0.initializeValues(3, 3, 3, 3, rows / 2, cols / 2, 1, 1, 1, 1); 222 | // temp.initPointer(CONV); 223 | // *((ConvDescriptor *)temp.params) = layer0; 224 | // layer_specifier.push_back(temp); 225 | // layer0_actv.initializeValues(RELU, 3, rows / 2, cols / 2); 226 | // temp.initPointer(ACTV); 227 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 228 | // 
layer_specifier.push_back(temp); 229 | 230 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 231 | // temp.initPointer(BATCHNORM); 232 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 233 | // layer_specifier.push_back(temp); 234 | 235 | // FCDescriptor layer1; 236 | // layer1.initializeValues(input_channels, hidden_channels1); 237 | // temp.initPointer(FULLY_CONNECTED); 238 | // *((FCDescriptor *)(temp.params)) = layer1; 239 | // layer_specifier.push_back(temp); 240 | 241 | // temp.initPointer(ACTV); 242 | // ActivationDescriptor layer1_actv; 243 | // layer1_actv.initializeValues(RELU, hidden_channels1, 1, 1); 244 | // *((ActivationDescriptor *)temp.params) = layer1_actv; 245 | // layer_specifier.push_back(temp); 246 | 247 | // layer0_bn.initializeValues(BATCHNORM_PER_ACTIVATION, 1e-5, 0.1, hidden_channels1, 1, 1); 248 | // temp.initPointer(BATCHNORM); 249 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 250 | // layer_specifier.push_back(temp); 251 | 252 | // temp.initPointer(FULLY_CONNECTED); 253 | // FCDescriptor layer2; 254 | // layer2.initializeValues(hidden_channels1, output_channels); 255 | // *((FCDescriptor *)temp.params) = layer2; 256 | // layer_specifier.push_back(temp); 257 | 258 | // // temp.initPointer(FULLY_CONNECTED); 259 | // // FCDescriptor layer3; 260 | // // layer3.initializeValues(hidden_channels2, output_channels); 261 | // // *((FCDescriptor *)temp.params) = layer3; 262 | // // layer_specifier.push_back(temp); 263 | 264 | // temp.initPointer(SOFTMAX); 265 | // SoftmaxDescriptor smax; 266 | // smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, output_channels, 1, 1); 267 | // *((SoftmaxDescriptor *)(temp.params)) = smax; 268 | // layer_specifier.push_back(temp); 269 | 270 | // AlexNet 271 | vector layer_specifier; 272 | { 273 | ConvDescriptor layer0; 274 | layer0.initializeValues(3, 96, 11, 11, 227, 227, 0, 0, 4, 4, RELU); 275 | LayerSpecifier temp; 276 | temp.initPointer(CONV); 277 | *((ConvDescriptor *)temp.params) = layer0; 278 | layer_specifier.push_back(temp); 279 | } 280 | { 281 | PoolingDescriptor layer1; 282 | layer1.initializeValues(96, 3, 3, 55, 55, 0, 0, 2, 2, POOLING_MAX); 283 | LayerSpecifier temp; 284 | temp.initPointer(POOLING); 285 | *((PoolingDescriptor *)temp.params) = layer1; 286 | layer_specifier.push_back(temp); 287 | } 288 | { 289 | ConvDescriptor layer2; 290 | layer2.initializeValues(96, 256, 5, 5, 27, 27, 2, 2, 1, 1, RELU); 291 | LayerSpecifier temp; 292 | temp.initPointer(CONV); 293 | *((ConvDescriptor *)temp.params) = layer2; 294 | layer_specifier.push_back(temp); 295 | } 296 | { 297 | PoolingDescriptor layer3; 298 | layer3.initializeValues(256, 3, 3, 27, 27, 0, 0, 2, 2, POOLING_MAX); 299 | LayerSpecifier temp; 300 | temp.initPointer(POOLING); 301 | *((PoolingDescriptor *)temp.params) = layer3; 302 | layer_specifier.push_back(temp); 303 | } 304 | { 305 | ConvDescriptor layer4; 306 | layer4.initializeValues(256, 384, 3, 3, 13, 13, 1, 1, 1, 1, RELU); 307 | LayerSpecifier temp; 308 | temp.initPointer(CONV); 309 | *((ConvDescriptor *)temp.params) = layer4; 310 | layer_specifier.push_back(temp); 311 | } 312 | { 313 | ConvDescriptor layer5; 314 | layer5.initializeValues(384, 384, 3, 3, 13, 13, 1, 1, 1, 1, RELU); 315 | LayerSpecifier temp; 316 | temp.initPointer(CONV); 317 | *((ConvDescriptor *)temp.params) = layer5; 318 | layer_specifier.push_back(temp); 319 | } 320 | { 321 | ConvDescriptor layer6; 322 | layer6.initializeValues(384, 256, 3, 3, 13, 13, 1, 1, 1, 1, RELU); 323 | 
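// Unlike the earlier AlexNet spec in this repo (which passes no activation
// argument), this test fuses the activation into the layer descriptor: the
// trailing RELU argument sets activation_mode, and ConvLayerParams /
// FCLayerParams then apply cudnnActivationForward in-place on the layer
// output instead of inserting a separate ACTV layer. The final FC layer and
// the SOFTMAX layer are left without a fused activation.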
LayerSpecifier temp; 324 | temp.initPointer(CONV); 325 | *((ConvDescriptor *)temp.params) = layer6; 326 | layer_specifier.push_back(temp); 327 | } 328 | { 329 | PoolingDescriptor layer7; 330 | layer7.initializeValues(256, 3, 3, 13, 13, 0, 0, 2, 2, POOLING_MAX); 331 | LayerSpecifier temp; 332 | temp.initPointer(POOLING); 333 | *((PoolingDescriptor *)temp.params) = layer7; 334 | layer_specifier.push_back(temp); 335 | } 336 | { 337 | FCDescriptor layer8; 338 | layer8.initializeValues(9216, 4096, RELU); 339 | LayerSpecifier temp; 340 | temp.initPointer(FULLY_CONNECTED); 341 | *((FCDescriptor *)temp.params) = layer8; 342 | layer_specifier.push_back(temp); 343 | } 344 | { 345 | FCDescriptor layer9; 346 | layer9.initializeValues(4096, 4096, RELU); 347 | LayerSpecifier temp; 348 | temp.initPointer(FULLY_CONNECTED); 349 | *((FCDescriptor *)temp.params) = layer9; 350 | layer_specifier.push_back(temp); 351 | } 352 | { 353 | FCDescriptor layer10; 354 | layer10.initializeValues(4096, 1000); 355 | LayerSpecifier temp; 356 | temp.initPointer(FULLY_CONNECTED); 357 | *((FCDescriptor *)temp.params) = layer10; 358 | layer_specifier.push_back(temp); 359 | } 360 | { 361 | SoftmaxDescriptor layer11; 362 | layer11.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 1000, 1, 1); 363 | LayerSpecifier temp; 364 | temp.initPointer(SOFTMAX); 365 | *((SoftmaxDescriptor *)temp.params) = layer11; 366 | layer_specifier.push_back(temp); 367 | } 368 | 369 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 370 | vDNNType vdnn_type = vDNN_DYN; 371 | string filename("vdnn_dyn"); 372 | if (argc == 3) { 373 | filename.assign("vdnn"); 374 | // argv[1] - layers to offload, argv[2] - conv algo to use 375 | if (strcmp(argv[1], "dyn") == 0) { 376 | vdnn_type = vDNN_DYN; 377 | filename.append("_dyn"); 378 | } 379 | else if (strcmp(argv[1], "conv") == 0) { 380 | vdnn_type = vDNN_CONV; 381 | filename.append("_conv"); 382 | } 383 | else if (strcmp(argv[1], "all") == 0) { 384 | vdnn_type = vDNN_ALL; 385 | filename.append("_all"); 386 | } 387 | else if (strcmp(argv[1], "alternate_conv") == 0) { 388 | vdnn_type = vDNN_ALTERNATE_CONV; 389 | filename.append("_alternate_conv"); 390 | } 391 | else { 392 | printf("invalid argument.. using vdnn dynamic\n"); 393 | filename.assign("vdnn_dyn"); 394 | } 395 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0 or strcmp(argv[1], "alternate_conv") == 0)) { 396 | if (strcmp(argv[2], "p") == 0) { 397 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 398 | filename.append("_p"); 399 | } 400 | else if (strcmp(argv[2], "m") == 0) { 401 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 402 | filename.append("_m"); 403 | } 404 | else { 405 | printf("invalid argument.. 
using vdnn dynamic\n"); 406 | filename.assign("vdnn_dyn"); 407 | } 408 | } 409 | } 410 | 411 | int batch_size = 256; 412 | long long dropout_seed = 1; 413 | float softmax_eps = 1e-8; 414 | float init_std_dev = 0.1; 415 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 416 | 417 | int num_epoch = 1000; 418 | double learning_rate = 1e-3; 419 | double learning_rate_decay = 0.9; 420 | 421 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 422 | vector loss; 423 | vector time; 424 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 425 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 426 | printTimes(time, filename); 427 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 428 | 429 | vector > fwd_computation_time, bwd_computation_time; 430 | solver.getComputationTime(1, fwd_computation_time, bwd_computation_time); 431 | 432 | vector > fwd_transfer_time, bwd_transfer_time; 433 | solver.getTransferTime(1, fwd_transfer_time, bwd_transfer_time); 434 | 435 | printComputationTransferTimes(fwd_computation_time, bwd_computation_time, true, filename); 436 | printComputationTransferTimes(fwd_transfer_time, bwd_transfer_time, false, filename); 437 | 438 | } 439 | 440 | void printTimes(vector &time, string filename) { 441 | float mean_time = 0.0; 442 | float std_dev = 0.0; 443 | int N = time.size(); 444 | for (int i = 0; i < N; i++) { 445 | mean_time += time[i]; 446 | } 447 | mean_time /= N; 448 | for (int i = 0; i < N; i++) { 449 | std_dev += pow(time[i] - mean_time, 2); 450 | } 451 | std_dev /= N; 452 | std_dev = pow(std_dev, 0.5); 453 | cout << "Average time: " << mean_time << endl; 454 | cout << "Standard deviation: " << std_dev << endl; 455 | 456 | filename.append(".dat"); 457 | fstream f; 458 | f.open(filename.c_str(), ios_base::out); 459 | 460 | for (int i = 0; i < N; i++) { 461 | f << time[i] << endl; 462 | } 463 | f << "mean_time: " << mean_time << endl; 464 | f << "standard_deviation: " << std_dev << endl; 465 | f.close(); 466 | 467 | filename.append(".bin"); 468 | fstream f_bin; 469 | f_bin.open(filename.c_str(), ios_base::out); 470 | f_bin.write((char *)&N, sizeof(N)); 471 | for (int i = 0; i < N; i++) { 472 | f_bin.write((char *)&time[i], sizeof(time[i])); 473 | } 474 | f_bin.close(); 475 | 476 | } 477 | 478 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 479 | filename.append("_lag.dat"); 480 | 481 | fstream f; 482 | f.open(filename.c_str(), ios_base::out); 483 | 484 | int N = fwd_vdnn_lag.size(); 485 | for (int i = 0; i < N; i++) { 486 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 487 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 488 | } 489 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 490 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 491 | } 492 | f << endl; 493 | } 494 | f.close(); 495 | } 496 | 497 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename) { 498 | if (computation) 499 | filename.append("_compute_time.dat"); 500 | else 501 | filename.append("_transfer_time.dat"); 502 | 503 | fstream f; 504 | f.open(filename.c_str(), ios_base::out); 505 | 506 | int N = fwd_times.size(); 507 | for (int i = 0; i < N; i++) { 508 | for (int j = 0; j < fwd_times[i].size(); j++) { 509 | f << "fwd" << j << ": " << fwd_times[i][j] << endl; 510 | } 
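// Output format: each outer record (one per profiled iteration) lists the
// per-layer forward times first, then (below) the per-layer backward times,
// followed by a blank line, so the resulting *_compute_time.dat /
// *_transfer_time.dat files can be split on empty lines when post-processing.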
511 | for (int j = 0; j < bwd_times[i].size(); j++) { 512 | f << "bwd" << j << ": " << bwd_times[i][j] << endl; 513 | } 514 | f << endl; 515 | } 516 | f.close(); 517 | } -------------------------------------------------------------------------------- /src/neural_net_time.cu: -------------------------------------------------------------------------------- 1 | #include "neural_net.h" 2 | 3 | void NeuralNet::getComputationTime(void *X, int *y, double learning_rate, 4 | std::vector &fwd_computation_time, std::vector &bwd_computation_time) { 5 | for (int i = 0; i < num_layers; i++) 6 | prefetched[i] = false; 7 | 8 | // checkCNMEM(cnmemMalloc(&layer_input[0], layer_input_size[0] * data_type_size, NULL)); 9 | // checkCudaErrors(cudaMemcpy(layer_input[0], X, batch_size * input_channels * input_h * input_w * data_type_size, cudaMemcpyHostToDevice)); 10 | // checkCudaErrors(cudaMemcpy(this->y, y, batch_size * data_type_size, cudaMemcpyHostToDevice)); 11 | 12 | float alpha = 1.0, beta = 0.0; 13 | float Salpha = 1.0, Sbeta = 0.0; 14 | double Dalpha = 1.0, Dbeta = 0.0; 15 | 16 | // forward propagate 17 | for (int i = 0; i < num_layers; i++) { 18 | size_t cur_workspace_size; 19 | void *cur_workspace; 20 | 21 | checkCNMEM(cnmemMalloc(&layer_input[i], layer_input_size[i] * data_type_size, NULL)); 22 | checkCNMEM(cnmemMalloc(&layer_input[i + 1], layer_input_size[i + 1] * data_type_size, NULL)); 23 | if (layer_type[i] == CONV) { 24 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 25 | 26 | cur_workspace_size = cur_params->fwd_workspace_size; 27 | checkCNMEM(cnmemMalloc(&cur_workspace, cur_workspace_size, NULL)); 28 | } 29 | 30 | checkCudaErrors(cudaEventRecord(start_compute, stream_compute)); 31 | if (layer_type[i] == CONV) { 32 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 33 | 34 | cur_workspace_size = cur_params->fwd_workspace_size; 35 | // computation 36 | checkCUDNN(cudnnConvolutionForward(cudnn_handle, &alpha, 37 | cur_params->input_tensor, layer_input[i], 38 | cur_params->filter_desc, cur_params->W, 39 | cur_params->conv_desc, cur_params->fwd_algo, 40 | cur_workspace, cur_workspace_size, 41 | &beta, 42 | cur_params->output_tensor, layer_input[i + 1])); 43 | checkCUDNN(cudnnAddTensor(cudnn_handle, &alpha, 44 | cur_params->bias_desc, cur_params->b, 45 | &alpha, 46 | cur_params->output_tensor, layer_input[i + 1])); 47 | 48 | // if activation required 49 | if (cur_params->activation_mode != ACTIVATION_NONE) { 50 | checkCUDNN(cudnnActivationForward(cudnn_handle, cur_params->actv_desc, 51 | &alpha, 52 | cur_params->output_tensor, layer_input[i + 1], 53 | &beta, 54 | cur_params->output_tensor, layer_input[i + 1])); 55 | } 56 | 57 | } 58 | 59 | else if (layer_type[i] == FULLY_CONNECTED) { 60 | // std::cout << "FC\n"; 61 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 62 | // std::cout << "FChere" << i << std::endl; 63 | 64 | if (data_type == CUDNN_DATA_FLOAT) { 65 | checkCUBLAS(cublasSgemm(cublas_handle, 66 | CUBLAS_OP_N, CUBLAS_OP_N, 67 | cur_params->C_out, batch_size, cur_params->C_in, 68 | &Salpha, 69 | (float *)cur_params->W, cur_params->C_out, 70 | (float *)layer_input[i], cur_params->C_in, 71 | &Sbeta, 72 | (float *)layer_input[i + 1], cur_params->C_out)); 73 | checkCUBLAS(cublasSgemm(cublas_handle, 74 | CUBLAS_OP_N, CUBLAS_OP_N, 75 | cur_params->C_out, batch_size, 1, 76 | &Salpha, 77 | (float *)cur_params->b, cur_params->C_out, 78 | (float *)one_vec, 1, 79 | &Salpha, 80 | (float *)layer_input[i + 1], cur_params->C_out)); 81 | } 82 | else if (data_type == 
CUDNN_DATA_DOUBLE) { 83 | checkCUBLAS(cublasDgemm(cublas_handle, 84 | CUBLAS_OP_N, CUBLAS_OP_N, 85 | cur_params->C_out, batch_size, cur_params->C_in, 86 | &Dalpha, 87 | (double *)cur_params->W, cur_params->C_out, 88 | (double *)layer_input[i], cur_params->C_in, 89 | &Dbeta, 90 | (double *)layer_input[i + 1], cur_params->C_out)); 91 | checkCUBLAS(cublasDgemm(cublas_handle, 92 | CUBLAS_OP_N, CUBLAS_OP_N, 93 | cur_params->C_out, batch_size, 1, 94 | &Dalpha, 95 | (double *)cur_params->b, cur_params->C_out, 96 | (double *)one_vec, 1, 97 | &Dalpha, 98 | (double *)layer_input[i + 1], cur_params->C_out)); 99 | } 100 | if (cur_params->activation_mode != ACTIVATION_NONE) { 101 | checkCUDNN(cudnnActivationForward(cudnn_handle, cur_params->actv_desc, 102 | &alpha, 103 | cur_params->output_tensor, layer_input[i + 1], 104 | &beta, 105 | cur_params->output_tensor, layer_input[i + 1])); 106 | } 107 | } 108 | else if (layer_type[i] == DROPOUT) { 109 | DropoutLayerParams *cur_params = (DropoutLayerParams *)params[i]; 110 | checkCUDNN(cudnnDropoutForward(cudnn_handle, cur_params->dropout_desc, 111 | cur_params->input_tensor, layer_input[i], 112 | cur_params->input_tensor, layer_input[i + 1], 113 | cur_params->reserved_space, 114 | cur_params->reserved_space_size)); 115 | } 116 | else if (layer_type[i] == BATCHNORM) { 117 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 118 | 119 | checkCUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle, cur_params->mode, 120 | &alpha, &beta, 121 | cur_params->input_tensor, layer_input[i], 122 | cur_params->input_tensor, layer_input[i + 1], 123 | cur_params->sbmv_desc, 124 | cur_params->scale, cur_params->bias, 125 | cur_params->factor, 126 | cur_params->running_mean, cur_params->running_variance, 127 | cur_params->epsilon, 128 | cur_params->result_save_mean, cur_params->result_save_inv_var)); 129 | 130 | } 131 | else if (layer_type[i] == POOLING) { 132 | PoolingLayerParams *cur_params = (PoolingLayerParams *)params[i]; 133 | checkCUDNN(cudnnPoolingForward(cudnn_handle, cur_params->pool_desc, 134 | &alpha, 135 | cur_params->input_tensor, layer_input[i], 136 | &beta, 137 | cur_params->output_tensor, layer_input[i + 1])); 138 | } 139 | else if (layer_type[i] == ACTV) { 140 | std::cout << "Panic!! ACTV wrong place\n"; 141 | exit(0); 142 | ActivationLayerParams *cur_params = (ActivationLayerParams *)params[i]; 143 | checkCUDNN(cudnnActivationForward(cudnn_handle, cur_params->actv_desc, 144 | &alpha, 145 | cur_params->input_tensor, layer_input[i], 146 | &beta, 147 | cur_params->input_tensor, layer_input[i + 1])); 148 | } 149 | else if (layer_type[i] == SOFTMAX) { 150 | // std::cout << "Softmax\n"; 151 | std::cout << "Panic!! 
SOFTMAX wrong place\n"; 152 | exit(0); 153 | SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[i]; 154 | checkCUDNN(cudnnSoftmaxForward(cudnn_handle, cur_params->algo, cur_params->mode, 155 | &alpha, 156 | cur_params->input_tensor, layer_input[i], 157 | &beta, 158 | cur_params->input_tensor, layer_input[i + 1])); 159 | } 160 | 161 | // ---------------------- vDNN start ---------------------- 162 | // synchronization 163 | // checkCudaErrors(cudaDeviceSynchronize()); 164 | 165 | // if next layer is ACTV or SOFTMAX, complete that and come to synchronization 166 | // the case in above if for ACTV and SOFTMAX never occurs 167 | if (layer_type[i + 1] == SOFTMAX) { 168 | i++; 169 | layer_input[i + 1] = layer_input[i]; 170 | SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[i]; 171 | checkCUDNN(cudnnSoftmaxForward(cudnn_handle, cur_params->algo, cur_params->mode, 172 | &alpha, 173 | cur_params->input_tensor, layer_input[i], 174 | &beta, 175 | cur_params->input_tensor, layer_input[i + 1])); 176 | i--; 177 | } 178 | 179 | // sync with stream_compute guaranteed 180 | checkCudaErrors(cudaEventRecord(stop_compute, stream_compute)); 181 | checkCudaErrors(cudaEventSynchronize(stop_compute)); 182 | float compute_time = 0; 183 | checkCudaErrors(cudaEventElapsedTime(&compute_time, start_compute, stop_compute)); 184 | 185 | fwd_computation_time.push_back(compute_time); 186 | 187 | if (layer_type[i] == CONV) { 188 | checkCNMEM(cnmemFree(cur_workspace, NULL)); 189 | } 190 | 191 | checkCNMEM(cnmemFree(layer_input[i], NULL)); 192 | checkCNMEM(cnmemFree(layer_input[i + 1], NULL)); 193 | 194 | if (layer_type[i + 1] == ACTV or layer_type[i + 1] == SOFTMAX) { 195 | i = i + 1; 196 | } 197 | 198 | // ---------------------- vDNN end ------------------------ 199 | } 200 | 201 | // time for loss compute ignored 202 | // *scalar_loss = computeLoss(); 203 | 204 | // time for softmax backward ignored 205 | // ---------------------- vDNN start ---------------------- 206 | // checkCNMEM(cnmemMalloc(&dlayer_input[num_layers], batch_size * num_classes * data_type_size, NULL)); 207 | // space_tracker.updateSpace(CnmemSpace::SUB, layer_input_size[num_layers] * data_type_size); 208 | // // std::cout << "Free bytes: " << free_bytes << std::endl; 209 | // // ---------------------- vDNN end ------------------------ 210 | // if (layer_type[num_layers - 1] == SOFTMAX) { 211 | // // SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[num_layers - 1]; 212 | // if (data_type == CUDNN_DATA_FLOAT) { 213 | // checkCudaErrors(cudaMemset(dlayer_input[num_layers], 0, batch_size * num_classes * sizeof(float))); 214 | // softmaxLossBackProp<<>>(this->y, (float *)layer_input[num_layers], 215 | // (float *)dlayer_input[num_layers], batch_size, num_classes, softmax_eps); 216 | // } 217 | // else if (data_type == CUDNN_DATA_DOUBLE) { 218 | // checkCudaErrors(cudaMemset(dlayer_input[num_layers], 0, batch_size * num_classes * sizeof(double))); 219 | // softmaxLossBackProp<<>>(this->y, (double *)layer_input[num_layers], 220 | // (double *)dlayer_input[num_layers], batch_size, num_classes, softmax_eps); 221 | // } 222 | // } 223 | 224 | for (int i = num_layers - 1; i >= 0; i--) { 225 | // ---------------------- vDNN start ---------------------- 226 | size_t cur_filter_workspace_size, cur_data_workspace_size, cur_workspace_size; 227 | void *cur_workspace; 228 | 229 | checkCNMEM(cnmemMalloc(&layer_input[i + 1], layer_input_size[i + 1] * data_type_size, NULL)); 230 | checkCNMEM(cnmemMalloc(&layer_input[i], 
layer_input_size[i] * data_type_size, NULL)); 231 | checkCNMEM(cnmemMalloc(&dlayer_input[i + 1], layer_input_size[i] * data_type_size, NULL)); 232 | 233 | if (i > 0) { 234 | if (layer_type[i] == ACTV or layer_type[i] == SOFTMAX) { 235 | dlayer_input[i] = dlayer_input[i + 1]; 236 | } 237 | else { 238 | checkCNMEM(cnmemMalloc(&dlayer_input[i], layer_input_size[i] * data_type_size, NULL)); 239 | } 240 | } 241 | // ---------------------- vDNN end ------------------------ 242 | 243 | if (layer_type[i] == CONV) { 244 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 245 | 246 | // allocate space for derivative 247 | if (!pre_alloc_conv_derivative) { 248 | cur_params->cnmemAllocDerivatives(data_type_size, NULL); 249 | } 250 | 251 | cur_filter_workspace_size = cur_params->bwd_filter_workspace_size; 252 | if (i > 0) 253 | cur_data_workspace_size = cur_params->bwd_data_workspace_size; 254 | else 255 | cur_data_workspace_size = 0; 256 | // std::cout << "bwd cur_workspace_size: " << cur_workspace_size << std::endl; 257 | cur_workspace_size = (cur_filter_workspace_size > cur_data_workspace_size) ? cur_filter_workspace_size : cur_data_workspace_size; 258 | checkCNMEM(cnmemMalloc(&cur_workspace, cur_workspace_size, NULL)); 259 | 260 | } 261 | 262 | else if (layer_type[i] == FULLY_CONNECTED) { 263 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 264 | 265 | if (!pre_alloc_fc_derivative) { 266 | cur_params->cnmemAllocDerivatives(data_type_size, NULL); 267 | } 268 | } 269 | 270 | else if (layer_type[i] == BATCHNORM) { 271 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 272 | 273 | if (!pre_alloc_batch_norm_derivative) { 274 | cur_params->cnmemAllocDerivatives(data_type_size, NULL); 275 | } 276 | } 277 | 278 | 279 | if (!(i + 1 < num_layers && layer_type[i + 1] == SOFTMAX)) 280 | checkCudaErrors(cudaEventRecord(start_compute, stream_compute)); 281 | 282 | if (layer_type[i] == CONV) { 283 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 284 | 285 | if (cur_params->activation_mode != ACTIVATION_NONE) { 286 | checkCUDNN(cudnnActivationBackward(cudnn_handle, cur_params->actv_desc, &alpha, 287 | cur_params->output_tensor, layer_input[i + 1], 288 | cur_params->output_tensor, dlayer_input[i + 1], 289 | cur_params->output_tensor, layer_input[i + 1], 290 | &beta, 291 | cur_params->output_tensor, dlayer_input[i + 1])); 292 | } 293 | 294 | cur_filter_workspace_size = cur_params->bwd_filter_workspace_size; 295 | if (i > 0) 296 | cur_data_workspace_size = cur_params->bwd_data_workspace_size; 297 | else 298 | cur_data_workspace_size = 0; 299 | // std::cout << "bwd cur_workspace_size: " << cur_workspace_size << std::endl; 300 | cur_workspace_size = (cur_filter_workspace_size > cur_data_workspace_size) ? 
cur_filter_workspace_size : cur_data_workspace_size; 301 | 302 | checkCUDNN(cudnnConvolutionBackwardBias(cudnn_handle, &alpha, 303 | cur_params->output_tensor, dlayer_input[i + 1], 304 | &beta, 305 | cur_params->bias_desc, cur_params->db)); 306 | 307 | // std::cout << "neural_net: backward conv i:" << i << std::endl; 308 | 309 | checkCUDNN(cudnnConvolutionBackwardFilter(cudnn_handle, &alpha, 310 | cur_params->input_tensor, layer_input[i], 311 | cur_params->output_tensor, dlayer_input[i + 1], 312 | cur_params->conv_desc, cur_params->bwd_filter_algo, 313 | cur_workspace, cur_workspace_size, 314 | &beta, 315 | cur_params->filter_desc, 316 | cur_params->dW)); 317 | if (i > 0) 318 | checkCUDNN(cudnnConvolutionBackwardData(cudnn_handle, &alpha, 319 | cur_params->filter_desc, cur_params->W, 320 | cur_params->output_tensor, dlayer_input[i + 1], 321 | cur_params->conv_desc, cur_params->bwd_data_algo, 322 | cur_workspace, cur_workspace_size, 323 | &beta, 324 | cur_params->input_tensor, dlayer_input[i])); 325 | 326 | // std::cout << "Free bytes: " << free_bytes << std::endl; 327 | // std::cout << "here\n"; 328 | cur_params->stepParams(cublas_handle, learning_rate); 329 | } 330 | 331 | else if (layer_type[i] == FULLY_CONNECTED) { 332 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 333 | 334 | if (cur_params->activation_mode != ACTIVATION_NONE) { 335 | checkCUDNN(cudnnActivationBackward(cudnn_handle, cur_params->actv_desc, &alpha, 336 | cur_params->output_tensor, layer_input[i + 1], 337 | cur_params->output_tensor, dlayer_input[i + 1], 338 | cur_params->output_tensor, layer_input[i + 1], 339 | &beta, 340 | cur_params->output_tensor, dlayer_input[i + 1])); 341 | } 342 | 343 | if (data_type == CUDNN_DATA_FLOAT) { 344 | // bias backward 345 | checkCUBLAS(cublasSgemm(cublas_handle, 346 | CUBLAS_OP_N, CUBLAS_OP_N, 347 | cur_params->C_out, 1, batch_size, 348 | &Salpha, 349 | (float *)dlayer_input[i + 1], cur_params->C_out, 350 | (float *)one_vec, batch_size, 351 | &Sbeta, 352 | (float *)cur_params->db, cur_params->C_out)); 353 | 354 | // weight backward 355 | checkCUBLAS(cublasSgemm(cublas_handle, 356 | CUBLAS_OP_N, CUBLAS_OP_T, 357 | cur_params->C_out, cur_params->C_in, batch_size, 358 | &Salpha, 359 | (float *)dlayer_input[i + 1], cur_params->C_out, 360 | (float *)layer_input[i], cur_params->C_in, 361 | &Sbeta, 362 | (float *)cur_params->dW, cur_params->C_out)); 363 | 364 | // data backward 365 | if (i > 0) 366 | checkCUBLAS(cublasSgemm(cublas_handle, 367 | CUBLAS_OP_T, CUBLAS_OP_N, 368 | cur_params->C_in, batch_size, cur_params->C_out, 369 | &Salpha, 370 | (float *)cur_params->W, cur_params->C_out, 371 | (float *)dlayer_input[i + 1], cur_params->C_out, 372 | &Sbeta, 373 | (float *)dlayer_input[i], cur_params->C_in)); 374 | } 375 | 376 | else if (data_type == CUDNN_DATA_DOUBLE) { 377 | // bias backward 378 | checkCUBLAS(cublasDgemm(cublas_handle, 379 | CUBLAS_OP_N, CUBLAS_OP_N, 380 | cur_params->C_out, 1, batch_size, 381 | &Dalpha, 382 | (double *)dlayer_input[i + 1], cur_params->C_out, 383 | (double *)one_vec, batch_size, 384 | &Dbeta, 385 | (double *)cur_params->db, cur_params->C_out)); 386 | 387 | // weight backward 388 | checkCUBLAS(cublasDgemm(cublas_handle, 389 | CUBLAS_OP_N, CUBLAS_OP_T, 390 | cur_params->C_out, cur_params->C_in, batch_size, 391 | &Dalpha, 392 | (double *)dlayer_input[i + 1], cur_params->C_out, 393 | (double *)layer_input[i], cur_params->C_in, 394 | &Dbeta, 395 | (double *)cur_params->dW, cur_params->C_out)); 396 | 397 | // data backward 398 | if (i > 0) 399 | 
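// Backward-data GEMM, guarded by (i > 0): the gradient w.r.t. the layer input
// is only needed when a previous layer exists to receive it; for the first
// layer the input is the data batch, so this GEMM is skipped.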
checkCUBLAS(cublasDgemm(cublas_handle, 400 | CUBLAS_OP_T, CUBLAS_OP_N, 401 | cur_params->C_in, batch_size, cur_params->C_out, 402 | &Dalpha, 403 | (double *)cur_params->W, cur_params->C_out, 404 | (double *)dlayer_input[i + 1], cur_params->C_out, 405 | &Dbeta, 406 | (double *)dlayer_input[i], cur_params->C_in)); 407 | } 408 | cur_params->stepParams(cublas_handle, learning_rate); 409 | } 410 | 411 | else if (layer_type[i] == DROPOUT) { 412 | DropoutLayerParams *cur_params = (DropoutLayerParams *)params[i]; 413 | checkCUDNN(cudnnDropoutBackward(cudnn_handle, cur_params->dropout_desc, 414 | cur_params->input_tensor, dlayer_input[i + 1], 415 | cur_params->input_tensor, dlayer_input[i], 416 | cur_params->reserved_space, cur_params->reserved_space_size)); 417 | } 418 | 419 | else if (layer_type[i] == BATCHNORM) { 420 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 421 | 422 | checkCUDNN(cudnnBatchNormalizationBackward(cudnn_handle, cur_params->mode, 423 | &alpha, &beta, 424 | &alpha, &beta, 425 | cur_params->input_tensor, layer_input[i], 426 | cur_params->input_tensor, dlayer_input[i + 1], 427 | cur_params->input_tensor, dlayer_input[i], 428 | cur_params->sbmv_desc, cur_params->scale, 429 | cur_params->dscale, cur_params->dbias, 430 | cur_params->epsilon, 431 | cur_params->result_save_mean, cur_params->result_save_inv_var)); 432 | 433 | cur_params->stepParams(cublas_handle, learning_rate); 434 | } 435 | 436 | else if (layer_type[i] == POOLING) { 437 | PoolingLayerParams *cur_params = (PoolingLayerParams *)params[i]; 438 | checkCUDNN(cudnnPoolingBackward(cudnn_handle, cur_params->pool_desc, &alpha, 439 | cur_params->output_tensor, layer_input[i + 1], 440 | cur_params->output_tensor, dlayer_input[i + 1], 441 | cur_params->input_tensor, layer_input[i], 442 | &beta, 443 | cur_params->input_tensor, dlayer_input[i])); 444 | } 445 | 446 | else if (layer_type[i] == ACTV) { 447 | ActivationLayerParams *cur_params = (ActivationLayerParams *)params[i]; 448 | checkCUDNN(cudnnActivationBackward(cudnn_handle, cur_params->actv_desc, &alpha, 449 | cur_params->input_tensor, layer_input[i + 1], 450 | cur_params->input_tensor, dlayer_input[i + 1], 451 | cur_params->input_tensor, layer_input[i], 452 | &beta, 453 | cur_params->input_tensor, dlayer_input[i])); 454 | continue; 455 | } 456 | 457 | else if (layer_type[i] == SOFTMAX) { 458 | // std::cout << "compute here\n"; 459 | SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[i]; 460 | checkCUDNN(cudnnSoftmaxBackward(cudnn_handle, cur_params->algo, cur_params->mode, &alpha, 461 | cur_params->input_tensor, layer_input[i + 1], 462 | cur_params->input_tensor, dlayer_input[i + 1], 463 | &beta, 464 | cur_params->input_tensor, dlayer_input[i])); 465 | // std::cout << "compute here\n"; 466 | continue; 467 | } 468 | 469 | // ---------------------- vDNN start ---------------------- 470 | 471 | // checkCudaErrors(cudaDeviceSynchronize()); 472 | 473 | checkCudaErrors(cudaEventRecord(stop_compute, stream_compute)); 474 | checkCudaErrors(cudaEventSynchronize(stop_compute)); 475 | float compute_time; 476 | checkCudaErrors(cudaEventElapsedTime(&compute_time, start_compute, stop_compute)); 477 | 478 | bwd_computation_time.insert(bwd_computation_time.begin(), compute_time); 479 | 480 | if (layer_type[i] == CONV) { 481 | checkCNMEM(cnmemFree(cur_workspace, NULL)); 482 | if (!pre_alloc_conv_derivative) { 483 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 484 | cur_params->cnmemFreeDerivatives(NULL); 485 | } 486 | } 487 | else if 
(layer_type[i] == FULLY_CONNECTED) { 488 | if (!pre_alloc_fc_derivative) { 489 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 490 | cur_params->cnmemFreeDerivatives(NULL); 491 | } 492 | } 493 | else if (layer_type[i] == BATCHNORM) { 494 | if (!pre_alloc_batch_norm_derivative) { 495 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 496 | cur_params->cnmemFreeDerivatives(NULL); 497 | } 498 | } 499 | 500 | checkCNMEM(cnmemFree(layer_input[i + 1], NULL)); 501 | checkCNMEM(cnmemFree(dlayer_input[i + 1], NULL)); 502 | checkCNMEM(cnmemFree(layer_input[i], NULL)); 503 | if (i > 0 && layer_type[i] != SOFTMAX) 504 | checkCNMEM(cnmemFree(dlayer_input[i], NULL)); 505 | } 506 | } 507 | 508 | 509 | void NeuralNet::getTransferTime(void *X, int *y, double learning_rate, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time) { 510 | for (int i = 0; i < num_layers; i++) { 511 | if (layer_type[i] == SOFTMAX) 512 | continue; 513 | 514 | void *device_data; 515 | void *host_data; 516 | 517 | checkCNMEM(cnmemMalloc(&device_data, layer_input_size[i] * data_type_size, NULL)); 518 | checkCudaErrors(cudaMallocHost(&host_data, layer_input_size[i] * data_type_size)); 519 | 520 | checkCudaErrors(cudaEventRecord(start_transfer, stream_memory)); 521 | 522 | checkCudaErrors(cudaMemcpyAsync(host_data, device_data, layer_input_size[i] * data_type_size, cudaMemcpyDeviceToHost, stream_memory)); 523 | 524 | checkCudaErrors(cudaEventRecord(stop_transfer, stream_memory)); 525 | checkCudaErrors(cudaEventSynchronize(stop_transfer)); 526 | float transfer_time; 527 | checkCudaErrors(cudaEventElapsedTime(&transfer_time, start_transfer, stop_transfer)); 528 | fwd_transfer_time.push_back(transfer_time); 529 | 530 | checkCudaErrors(cudaEventRecord(start_transfer, stream_memory)); 531 | 532 | checkCudaErrors(cudaMemcpyAsync(device_data, host_data, layer_input_size[i] * data_type_size, cudaMemcpyHostToDevice, stream_memory)); 533 | 534 | checkCudaErrors(cudaEventRecord(stop_transfer, stream_memory)); 535 | checkCudaErrors(cudaEventSynchronize(stop_transfer)); 536 | checkCudaErrors(cudaEventElapsedTime(&transfer_time, start_transfer, stop_transfer)); 537 | bwd_transfer_time.push_back(transfer_time); 538 | } 539 | } -------------------------------------------------------------------------------- /src/layer_params.cu: -------------------------------------------------------------------------------- 1 | #include "layer_params.h" 2 | 3 | void ConvLayerParams::initializeValues(cudnnHandle_t cudnn_handle, ConvDescriptor *user_params, cudnnDataType_t data_type, 4 | int batch_size, cudnnTensorFormat_t tensor_format, size_t data_type_size, LayerDimension &output_size, 5 | UpdateRule update_rule) { 6 | // create tensor, filter, conv descriptor 7 | checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor)); 8 | checkCUDNN(cudnnCreateTensorDescriptor(&output_tensor)); 9 | checkCUDNN(cudnnCreateTensorDescriptor(&bias_desc)); 10 | checkCUDNN(cudnnCreateFilterDescriptor(&filter_desc)); 11 | checkCUDNN(cudnnCreateConvolutionDescriptor(&conv_desc)); 12 | 13 | C_in = user_params->input_channels; 14 | C_out = user_params->output_channels; 15 | filter_h = user_params->kernel_h; 16 | filter_w = user_params->kernel_w; 17 | kernel_size = C_out * C_in * filter_h * filter_w; 18 | this->data_type = data_type; 19 | this->activation_mode = user_params->activation_mode; 20 | 21 | checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type, 22 | batch_size, user_params->input_channels, user_params->input_h, 
user_params->input_w)); 23 | 24 | 25 | checkCUDNN(cudnnSetFilter4dDescriptor(filter_desc, data_type, tensor_format, 26 | user_params->output_channels, user_params->input_channels, user_params->kernel_h, user_params->kernel_w)); 27 | 28 | int dilation_h = 1, dilation_w = 1; 29 | checkCUDNN(cudnnSetConvolution2dDescriptor(conv_desc, user_params->pad_h, user_params->pad_w, 30 | user_params->stride_y, user_params->stride_x, 31 | dilation_h, dilation_w, 32 | CUDNN_CROSS_CORRELATION, data_type)); 33 | 34 | int output_batch_size, output_channels, output_h, output_w; 35 | checkCUDNN(cudnnGetConvolution2dForwardOutputDim(conv_desc, input_tensor, filter_desc, 36 | &output_batch_size, &output_channels, &output_h, &output_w)); 37 | 38 | checkCUDNN(cudnnSetTensor4dDescriptor(output_tensor, tensor_format, data_type, 39 | output_batch_size, output_channels, output_h, output_w)); 40 | checkCUDNN(cudnnSetTensor4dDescriptor(bias_desc, tensor_format, data_type, 41 | 1, output_channels, 1, 1)); 42 | 43 | fwd_req_count = 10; 44 | fwd_perf = (cudnnConvolutionFwdAlgoPerf_t *)malloc(fwd_req_count * sizeof(cudnnConvolutionFwdAlgoPerf_t)); 45 | checkCUDNN(cudnnFindConvolutionForwardAlgorithm(cudnn_handle, 46 | input_tensor, filter_desc, conv_desc, output_tensor, 47 | fwd_req_count, &fwd_ret_count, fwd_perf)); 48 | 49 | // std::cout << "Printing forward conv algo perf\n"; 50 | // std::cout << "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM: " << CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM << std::endl; 51 | // for (int i = 0; i < fwd_ret_count; i++) { 52 | // std::cout << i << std::endl; 53 | // std::cout << "algo: " << fwd_perf[i].algo << std::endl; 54 | // std::cout << "status: " << cudnnGetErrorString(fwd_perf[i].status) << std::endl; 55 | // std::cout << "time(ms): " << fwd_perf[i].time << std::endl; 56 | // std::cout << "memory(MB): " << fwd_perf[i].memory * 1.0 / 1024 / 1024 << std::endl; 57 | // std::cout << "mathType: " << fwd_perf[i].mathType << std::endl; 58 | // std::cout << std::endl; 59 | // } 60 | 61 | bwd_filter_req_count = 10; 62 | bwd_filter_perf = (cudnnConvolutionBwdFilterAlgoPerf_t *)malloc(bwd_filter_req_count * sizeof(cudnnConvolutionBwdFilterAlgoPerf_t)); 63 | checkCUDNN(cudnnFindConvolutionBackwardFilterAlgorithm(cudnn_handle, 64 | input_tensor, output_tensor, conv_desc, filter_desc, 65 | bwd_filter_req_count, &bwd_filter_ret_count, bwd_filter_perf)); 66 | 67 | // std::cout << "Printing bwdfilter conv algo perf\n"; 68 | // std::cout << "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 " << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 << std::endl; 69 | // for (int i = 0; i < bwd_filter_ret_count; i++) { 70 | // std::cout << i << std::endl; 71 | // std::cout << "algo: " << bwd_filter_perf[i].algo << std::endl; 72 | // std::cout << "status: " << cudnnGetErrorString(bwd_filter_perf[i].status) << std::endl; 73 | // std::cout << "time(ms): " << bwd_filter_perf[i].time << std::endl; 74 | // std::cout << "memory(MB): " << bwd_filter_perf[i].memory * 1.0 / 1024 / 1024 << std::endl; 75 | // std::cout << "mathType: " << bwd_filter_perf[i].mathType << std::endl; 76 | // std::cout << std::endl; 77 | // } 78 | bwd_data_req_count = 10; 79 | bwd_data_perf = (cudnnConvolutionBwdDataAlgoPerf_t *)malloc(bwd_data_req_count * sizeof(cudnnConvolutionBwdDataAlgoPerf_t)); 80 | checkCUDNN(cudnnFindConvolutionBackwardDataAlgorithm(cudnn_handle, 81 | filter_desc, output_tensor, conv_desc, input_tensor, 82 | bwd_data_req_count, &bwd_data_ret_count, bwd_data_perf)); 83 | 84 | // std::cout << "Printing bwddata conv algo perf\n"; 85 | // for (int i = 
0; i < bwd_data_ret_count; i++) { 86 | // std::cout << i << std::endl; 87 | // std::cout << "algo: " << bwd_data_perf[i].algo << std::endl; 88 | // std::cout << "status: " << cudnnGetErrorString(bwd_data_perf[i].status) << std::endl; 89 | // std::cout << "time(ms): " << bwd_data_perf[i].time << std::endl; 90 | // std::cout << "memory(MB): " << bwd_data_perf[i].memory * 1.0 / 1024 / 1024 << std::endl; 91 | // std::cout << "mathType: " << bwd_data_perf[i].mathType << std::endl; 92 | // std::cout << std::endl; 93 | // } 94 | 95 | this->update_rule = update_rule; 96 | 97 | cudnnActivationMode_t mode; 98 | if (activation_mode == SIGMOID) 99 | mode = CUDNN_ACTIVATION_SIGMOID; 100 | else if (activation_mode == RELU) 101 | mode = CUDNN_ACTIVATION_RELU; 102 | else if (activation_mode == TANH) 103 | mode = CUDNN_ACTIVATION_TANH; 104 | else if (activation_mode == CLIPPED_RELU) 105 | mode = CUDNN_ACTIVATION_CLIPPED_RELU; 106 | else if (activation_mode == ELU) 107 | mode = CUDNN_ACTIVATION_ELU; 108 | 109 | if (activation_mode != ACTIVATION_NONE) { 110 | checkCUDNN(cudnnCreateActivationDescriptor(&actv_desc)); 111 | checkCUDNN(cudnnSetActivationDescriptor(actv_desc, mode, CUDNN_PROPAGATE_NAN, user_params->actv_coef)); 112 | } 113 | 114 | output_size.N = output_batch_size, output_size.C = output_channels, output_size.H = output_h, output_size.W = output_w; 115 | 116 | } 117 | 118 | void ConvLayerParams::allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size, 119 | float std_dev, size_t &free_bytes, bool alloc_derivative) { 120 | 121 | if (kernel_size % 2 != 0) 122 | kernel_size += 1; 123 | checkCudaErrors(cudaMalloc(&W, kernel_size * data_type_size)); 124 | checkCudaErrors(cudaMalloc(&b, C_out * data_type_size)); 125 | 126 | if (alloc_derivative) { 127 | checkCudaErrors(cudaMalloc(&dW, kernel_size * data_type_size)); 128 | checkCudaErrors(cudaMalloc(&db, C_out * data_type_size)); 129 | } 130 | 131 | if (data_type == CUDNN_DATA_FLOAT) { 132 | checkCURAND(curandGenerateNormal(curand_gen, (float *)W, kernel_size, 0, std_dev)); 133 | fillValue<<>>((float *)b, C_out, 0); 134 | } 135 | else { 136 | checkCURAND(curandGenerateNormalDouble(curand_gen, (double *)W, kernel_size, 0, std_dev)); 137 | fillValue<<>>((double *)b, C_out, 0); 138 | } 139 | 140 | free_bytes = free_bytes - 2 * (kernel_size + C_out) * data_type_size; 141 | 142 | } 143 | 144 | void ConvLayerParams::cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream) { 145 | checkCNMEM(cnmemMalloc(&dW, kernel_size * data_type_size, stream)); 146 | checkCNMEM(cnmemMalloc(&db, C_out * data_type_size, stream)); 147 | } 148 | 149 | bool ConvLayerParams::cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, 150 | size_t &max_consume, size_t free_bytes, bool &out_of_memory) { 151 | checkCNMEMSim(cnmemMalloc(&dW, kernel_size * data_type_size, stream), 152 | kernel_size * data_type_size, max_consume, free_bytes, return false, out_of_memory); 153 | checkCNMEMSim(cnmemMalloc(&db, C_out * data_type_size, stream), 154 | C_out * data_type_size, max_consume, free_bytes, return false, out_of_memory); 155 | 156 | return true; 157 | } 158 | 159 | void ConvLayerParams::stepParams(cublasHandle_t cublas_handle, double learning_rate) { 160 | float Salpha = -learning_rate; 161 | double Dalpha = -learning_rate; 162 | 163 | if (update_rule == SGD) { 164 | if (data_type == CUDNN_DATA_FLOAT) { 165 | checkCUBLAS(cublasSaxpy(cublas_handle, kernel_size, 166 | &Salpha, 167 | (float *)dW, 1, 168 | (float *)W, 1)); 169 | 
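// Vanilla SGD step expressed as BLAS axpy: W <- W + (-learning_rate) * dW
// above and b <- b + (-learning_rate) * db below. Salpha / Dalpha were set to
// -learning_rate, so a single cublasSaxpy / cublasDaxpy per parameter tensor
// performs the whole update.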
170 | checkCUBLAS(cublasSaxpy(cublas_handle, C_out, 171 | &Salpha, 172 | (float *)db, 1, 173 | (float *)b, 1)); 174 | } 175 | else if (data_type == CUDNN_DATA_DOUBLE) { 176 | checkCUBLAS(cublasDaxpy(cublas_handle, kernel_size, 177 | &Dalpha, 178 | (double *)dW, 1, 179 | (double *)W, 1)); 180 | 181 | checkCUBLAS(cublasDaxpy(cublas_handle, C_out, 182 | &Dalpha, 183 | (double *)db, 1, 184 | (double *)b, 1)); 185 | } 186 | } 187 | } 188 | 189 | void ConvLayerParams::cnmemFreeDerivatives(cudaStream_t stream) { 190 | checkCNMEM(cnmemFree(dW, stream)); 191 | checkCNMEM(cnmemFree(db, stream)); 192 | } 193 | 194 | size_t ConvLayerParams::getWorkspaceSize(size_t &free_bytes, ConvLayerParams::ConvDirection conv_direction, vDNNConvAlgo vdnn_conv_algo) { 195 | if (vdnn_conv_algo == vDNN_PERFORMANCE_OPTIMAL) { 196 | if (conv_direction == FWD) { 197 | if (fwd_perf[0].memory > free_bytes) 198 | outOfMemory(); 199 | fwd_algo = fwd_perf[0].algo; 200 | return fwd_perf[0].memory; 201 | } 202 | else if (conv_direction == BWD_FILTER) { 203 | if (bwd_filter_perf[0].memory > free_bytes) 204 | outOfMemory(); 205 | bwd_filter_algo = bwd_filter_perf[0].algo; 206 | return bwd_filter_perf[0].memory; 207 | } 208 | else if (conv_direction == BWD_DATA) { 209 | if (bwd_data_perf[0].memory > free_bytes) 210 | outOfMemory(); 211 | bwd_data_algo = bwd_data_perf[0].algo; 212 | return bwd_data_perf[0].memory; 213 | } 214 | } 215 | else if (vdnn_conv_algo == vDNN_MEMORY_OPTIMAL) { 216 | if (conv_direction == FWD) { 217 | for (int i = 0; i < fwd_ret_count; i++) { 218 | if (fwd_perf[i].algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && fwd_perf[i].status == CUDNN_STATUS_SUCCESS && 219 | fwd_perf[i].memory < free_bytes) { 220 | fwd_algo = fwd_perf[i].algo; 221 | return fwd_perf[i].memory; 222 | } 223 | } 224 | } 225 | else if (conv_direction == BWD_FILTER) { 226 | for (int i = 0; i < bwd_filter_ret_count; i++) { 227 | if (bwd_filter_perf[i].algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS && 228 | bwd_filter_perf[i].memory < free_bytes) { 229 | bwd_filter_algo = bwd_filter_perf[i].algo; 230 | // std::cout << "Free bytes " << free_bytes << std::endl; 231 | // std::cout << "bwd_filter_perf[i].memory " << bwd_filter_perf[i].memory << std::endl; 232 | return bwd_filter_perf[i].memory; 233 | } 234 | } 235 | } 236 | else if (conv_direction == BWD_DATA) { 237 | for (int i = 0; i < bwd_data_ret_count; i++) { 238 | if (bwd_data_perf[i].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS && 239 | bwd_data_perf[i].memory < free_bytes) { 240 | bwd_data_algo = bwd_data_perf[i].algo; 241 | return bwd_data_perf[i].memory; 242 | } 243 | } 244 | } 245 | std::cout << "Error in getWorkspaceSize" << std::endl; 246 | exit(0); 247 | } 248 | return 0; 249 | } 250 | 251 | workspaceStatus_t ConvLayerParams::getWorkspaceSize(size_t &free_bytes, ConvLayerParams::ConvDirection conv_direction, vDNNConvAlgoPref algo_pref, 252 | bool hard_pref, size_t &workspace_size) { 253 | if (hard_pref) { 254 | if (algo_pref == PREFER_PERFORMANCE_OPTIMAL) { 255 | if (conv_direction == FWD) { 256 | if (fwd_perf[0].memory > free_bytes && fwd_perf[0].status == CUDNN_STATUS_SUCCESS) 257 | return WORKSPACE_STATUS_OUT_OF_MEMORY; 258 | fwd_algo = fwd_perf[0].algo; 259 | fwd_workspace_size = fwd_perf[0].memory; 260 | workspace_size = fwd_workspace_size; 261 | return WORKSPACE_STATUS_SUCCESS; 262 | } 263 | else if (conv_direction == BWD_FILTER) { 264 | if (bwd_filter_perf[0].memory > free_bytes 
workspaceStatus_t ConvLayerParams::getWorkspaceSize(size_t &free_bytes, ConvLayerParams::ConvDirection conv_direction, vDNNConvAlgoPref algo_pref,
                                                    bool hard_pref, size_t &workspace_size) {
    if (hard_pref) {
        if (algo_pref == PREFER_PERFORMANCE_OPTIMAL) {
            if (conv_direction == FWD) {
                if (fwd_perf[0].memory > free_bytes && fwd_perf[0].status == CUDNN_STATUS_SUCCESS)
                    return WORKSPACE_STATUS_OUT_OF_MEMORY;
                fwd_algo = fwd_perf[0].algo;
                fwd_workspace_size = fwd_perf[0].memory;
                workspace_size = fwd_workspace_size;
                return WORKSPACE_STATUS_SUCCESS;
            }
            else if (conv_direction == BWD_FILTER) {
                if (bwd_filter_perf[0].memory > free_bytes && bwd_filter_perf[0].status == CUDNN_STATUS_SUCCESS)
                    return WORKSPACE_STATUS_OUT_OF_MEMORY;
                bwd_filter_algo = bwd_filter_perf[0].algo;
                bwd_filter_workspace_size = bwd_filter_perf[0].memory;
                workspace_size = bwd_filter_workspace_size;
                return WORKSPACE_STATUS_SUCCESS;
            }
            else if (conv_direction == BWD_DATA) {
                if (bwd_data_perf[0].memory > free_bytes && bwd_data_perf[0].status == CUDNN_STATUS_SUCCESS)
                    return WORKSPACE_STATUS_OUT_OF_MEMORY;
                bwd_data_algo = bwd_data_perf[0].algo;
                bwd_data_workspace_size = bwd_data_perf[0].memory;
                workspace_size = bwd_data_workspace_size;
                return WORKSPACE_STATUS_SUCCESS;
            }
        }
        else if (algo_pref == PREFER_MEMORY_OPTIMAL) {
            if (conv_direction == FWD) {
                for (int i = 0; i < fwd_ret_count; i++) {
                    if (fwd_perf[i].algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
                        if (fwd_perf[i].memory < free_bytes && fwd_perf[i].status == CUDNN_STATUS_SUCCESS) {
                            fwd_algo = fwd_perf[i].algo;
                            fwd_workspace_size = fwd_perf[i].memory;
                            workspace_size = fwd_workspace_size;
                            return WORKSPACE_STATUS_SUCCESS;
                        }
                        else
                            return WORKSPACE_STATUS_OUT_OF_MEMORY;
                }
            }
            else if (conv_direction == BWD_FILTER) {
                for (int i = 0; i < bwd_filter_ret_count; i++) {
                    if (bwd_filter_perf[i].algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1)
                        if (bwd_filter_perf[i].memory < free_bytes && bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS) {
                            bwd_filter_algo = bwd_filter_perf[i].algo;
                            // std::cout << "Free bytes " << free_bytes << std::endl;
                            // std::cout << "bwd_filter_perf[i].memory " << bwd_filter_perf[i].memory << std::endl;
                            bwd_filter_workspace_size = bwd_filter_perf[i].memory;
                            workspace_size = bwd_filter_workspace_size;
                            return WORKSPACE_STATUS_SUCCESS;
                        }
                        else
                            return WORKSPACE_STATUS_OUT_OF_MEMORY;
                }
            }
            else if (conv_direction == BWD_DATA) {
                for (int i = 0; i < bwd_data_ret_count; i++) {
                    if (bwd_data_perf[i].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)
                        if (bwd_data_perf[i].memory < free_bytes && bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS) {
                            bwd_data_algo = bwd_data_perf[i].algo;
                            bwd_data_workspace_size = bwd_data_perf[i].memory;
                            workspace_size = bwd_data_workspace_size;
                            return WORKSPACE_STATUS_SUCCESS;
                        }
                        else
                            return WORKSPACE_STATUS_OUT_OF_MEMORY;
                }
            }
        }
    }
    else {
        // only performance optimal is possible
        if (algo_pref == PREFER_PERFORMANCE_OPTIMAL) {
            if (conv_direction == FWD) {
                for (int i = 0; i < fwd_ret_count; i++) {
                    if (fwd_perf[i].memory < free_bytes && fwd_perf[i].status == CUDNN_STATUS_SUCCESS) {
                        fwd_algo = fwd_perf[i].algo;
                        fwd_workspace_size = fwd_perf[i].memory;
                        workspace_size = fwd_workspace_size;
                        return WORKSPACE_STATUS_SUCCESS;
                    }
                }
            }
            else if (conv_direction == BWD_FILTER) {
                for (int i = 0; i < bwd_filter_ret_count; i++) {
                    if (bwd_filter_perf[i].memory < free_bytes && bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS) {
                        bwd_filter_algo = bwd_filter_perf[i].algo;
                        // std::cout << "Free bytes " << free_bytes << std::endl;
                        // std::cout << "bwd_filter_perf[i].memory " << bwd_filter_perf[i].memory << std::endl;
                        bwd_filter_workspace_size = bwd_filter_perf[i].memory;
                        workspace_size = bwd_filter_workspace_size;
                        return WORKSPACE_STATUS_SUCCESS;
                    }
                }
            }
            else if (conv_direction == BWD_DATA) {
                for (int i = 0; i < bwd_data_ret_count; i++) {
                    if (bwd_data_perf[i].memory < free_bytes && bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS) {
                        bwd_data_algo = bwd_data_perf[i].algo;
                        bwd_data_workspace_size = bwd_data_perf[i].memory;
                        workspace_size = bwd_data_workspace_size;
                        return WORKSPACE_STATUS_SUCCESS;
                    }
                }
            }
        }
    }
    return WORKSPACE_STATUS_OUT_OF_MEMORY;
}
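// Illustrative call sequence (hypothetical caller code, not part of this file; it
// assumes the cudnnFind*Algorithm results in fwd_perf / fwd_ret_count have already
// been collected when the layer was set up):
//
//   size_t workspace_size;
//   workspaceStatus_t ws_status =
//       conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                     PREFER_PERFORMANCE_OPTIMAL, true /* hard_pref */,
//                                     workspace_size);
//   if (ws_status != WORKSPACE_STATUS_SUCCESS)
//       ws_status = conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                                 PREFER_MEMORY_OPTIMAL, true, workspace_size);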
void FCLayerParams::initializeValues(FCDescriptor *user_params, int batch_size, cudnnTensorFormat_t tensor_format, cudnnDataType_t data_type,
                                     LayerDimension &output_size, UpdateRule update_rule) {
    C_in = user_params->input_channels;
    C_out = user_params->output_channels;
    weight_matrix_size = C_in * C_out;
    this->data_type = data_type;
    this->activation_mode = user_params->activation_mode;

    this->update_rule = update_rule;

    cudnnActivationMode_t mode;
    if (activation_mode == SIGMOID)
        mode = CUDNN_ACTIVATION_SIGMOID;
    else if (activation_mode == RELU)
        mode = CUDNN_ACTIVATION_RELU;
    else if (activation_mode == TANH)
        mode = CUDNN_ACTIVATION_TANH;
    else if (activation_mode == CLIPPED_RELU)
        mode = CUDNN_ACTIVATION_CLIPPED_RELU;
    else if (activation_mode == ELU)
        mode = CUDNN_ACTIVATION_ELU;

    if (activation_mode != ACTIVATION_NONE) {
        checkCUDNN(cudnnCreateActivationDescriptor(&actv_desc));
        checkCUDNN(cudnnSetActivationDescriptor(actv_desc, mode, CUDNN_PROPAGATE_NAN, user_params->actv_coef));
        checkCUDNN(cudnnCreateTensorDescriptor(&output_tensor));
        checkCUDNN(cudnnSetTensor4dDescriptor(output_tensor, tensor_format, data_type,
                                              batch_size, user_params->output_channels, 1, 1));
    }

    output_size.N = batch_size, output_size.C = C_out, output_size.H = output_size.W = 1;
}

void FCLayerParams::allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size,
                                  float std_dev, size_t &free_bytes, bool alloc_derivative) {
    // round the weight count up to an even number of elements before drawing the
    // normally distributed initial values
    int wt_alloc_size = weight_matrix_size;
    if (wt_alloc_size % 2 != 0)
        wt_alloc_size += 1;
    checkCudaErrors(cudaMalloc(&W, wt_alloc_size * data_type_size));
    checkCudaErrors(cudaMalloc(&b, C_out * data_type_size));
    if (alloc_derivative) {
        checkCudaErrors(cudaMalloc(&dW, wt_alloc_size * data_type_size));
        checkCudaErrors(cudaMalloc(&db, C_out * data_type_size));
    }

    // NOTE: the fillValue launch configurations were garbled in this copy of the file;
    // the <<<grid, block>>> arguments below are a reconstruction that assumes the
    // block-size constant BW from utils.h
    if (data_type == CUDNN_DATA_FLOAT) {
        checkCURAND(curandGenerateNormal(curand_gen, (float *)W, wt_alloc_size, 0, std_dev));
        fillValue<float><<<ceil(1.0 * C_out / BW), BW>>>((float *)b, C_out, 0);
    }
    else if (data_type == CUDNN_DATA_DOUBLE) {
        checkCURAND(curandGenerateNormalDouble(curand_gen, (double *)W, wt_alloc_size, 0, std_dev));
        fillValue<double><<<ceil(1.0 * C_out / BW), BW>>>((double *)b, C_out, 0);
    }
    free_bytes = free_bytes - 2 * (C_in * C_out + C_out) * data_type_size;
}

void FCLayerParams::cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream) {
    checkCNMEM(cnmemMalloc(&dW, weight_matrix_size * data_type_size, stream));
    checkCNMEM(cnmemMalloc(&db, C_out * data_type_size, stream));
}

bool FCLayerParams::cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream,
                                               size_t &max_consume, size_t free_bytes, bool &out_of_memory) {
    checkCNMEMSim(cnmemMalloc(&dW, weight_matrix_size * data_type_size, stream),
                  weight_matrix_size * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    checkCNMEMSim(cnmemMalloc(&db, C_out * data_type_size, stream),
                  C_out * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    return true;
}
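// stepParams below applies one plain SGD step with cuBLAS axpy:
//   W <- W - learning_rate * dW,   b <- b - learning_rate * db
// (the negated learning rate is passed as the axpy alpha).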
void FCLayerParams::stepParams(cublasHandle_t cublas_handle, double learning_rate) {
    float Salpha = -learning_rate;
    double Dalpha = -learning_rate;

    // {
    // 	float *db_h = (float *)malloc(C_out * sizeof(float));
    // 	checkCudaErrors(cudaMemcpy(db_h, db, C_out * sizeof(float), cudaMemcpyDeviceToHost));
    // 	for (int i = 0; i < C_out; i++) {
    // 		std::cout << db_h[i] << ' ';
    // 	}
    // 	std::cout << "\n";
    // 	int n;
    // 	std::cin >> n;
    // }

    if (update_rule == SGD) {
        if (data_type == CUDNN_DATA_FLOAT) {
            checkCUBLAS(cublasSaxpy(cublas_handle, weight_matrix_size,
                                    &Salpha,
                                    (float *)dW, 1,
                                    (float *)W, 1));

            checkCUBLAS(cublasSaxpy(cublas_handle, C_out,
                                    &Salpha,
                                    (float *)db, 1,
                                    (float *)b, 1));
        }
        else if (data_type == CUDNN_DATA_DOUBLE) {
            checkCUBLAS(cublasDaxpy(cublas_handle, weight_matrix_size,
                                    &Dalpha,
                                    (double *)dW, 1,
                                    (double *)W, 1));

            checkCUBLAS(cublasDaxpy(cublas_handle, C_out,
                                    &Dalpha,
                                    (double *)db, 1,
                                    (double *)b, 1));
        }
    }
    // {
    // 	float *db_h = (float *)malloc(C_out * sizeof(float));
    // 	checkCudaErrors(cudaMemcpy(db_h, b, C_out * sizeof(float), cudaMemcpyDeviceToHost));
    // 	for (int i = 0; i < C_out; i++) {
    // 		std::cout << db_h[i] << ' ';
    // 	}
    // 	std::cout << "\n";
    // 	int n;
    // 	std::cin >> n;
    // }
}

void FCLayerParams::cnmemFreeDerivatives(cudaStream_t stream) {
    checkCNMEM(cnmemFree(dW, stream));
    checkCNMEM(cnmemFree(db, stream));
}

void DropoutLayerParams::initializeValues(cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, cudnnDataType_t data_type, int batch_size,
                                          cudnnTensorFormat_t tensor_format, LayerDimension &output_size) {
    checkCUDNN(cudnnCreateDropoutDescriptor(&dropout_desc));
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    checkCUDNN(cudnnDropoutGetStatesSize(cudnn_handle, &state_size));

    checkCUDNN(cudnnDropoutGetReserveSpaceSize(input_tensor, &reserved_space_size));

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void DropoutLayerParams::allocateSpace(size_t &free_bytes, cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, long long seed) {
    checkCudaErrors(cudaMalloc(&state, state_size));
    checkCudaErrors(cudaMalloc(&reserved_space, reserved_space_size));
    checkCUDNN(cudnnSetDropoutDescriptor(dropout_desc, cudnn_handle, user_params->dropout_value, state, state_size, seed));

    free_bytes = free_bytes - (state_size + reserved_space_size);
}
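// BatchNormLayerParams: with CUDNN_BATCHNORM_PER_ACTIVATION the scale/bias/mean/variance
// tensors have one entry per activation (C*H*W); with CUDNN_BATCHNORM_SPATIAL they have
// one entry per channel (C). initializeValues records this count as sbmv_size /
// allocation_size for the allocations and updates that follow.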
void BatchNormLayerParams::initializeValues(BatchNormDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format,
                                            int batch_size, LayerDimension &output_size, UpdateRule update_rule) {
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));
    checkCUDNN(cudnnCreateTensorDescriptor(&sbmv_desc));
    c = user_params->channels, h = user_params->h, w = user_params->w;
    if (user_params->mode == BATCHNORM_PER_ACTIVATION) {
        mode = CUDNN_BATCHNORM_PER_ACTIVATION;
        checkCUDNN(cudnnSetTensor4dDescriptor(sbmv_desc, tensor_format, data_type,
                                              1, user_params->channels, user_params->h, user_params->w));
        sbmv_size = c * h * w;
    }
    else if (user_params->mode == BATCHNORM_SPATIAL) {
        mode = CUDNN_BATCHNORM_SPATIAL;
        checkCUDNN(cudnnSetTensor4dDescriptor(sbmv_desc, tensor_format, data_type,
                                              1, user_params->channels, 1, 1));
        sbmv_size = c;
    }

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    factor = user_params->factor;
    epsilon = user_params->epsilon;

    this->update_rule = update_rule;
    this->data_type = data_type;

    if (mode == CUDNN_BATCHNORM_PER_ACTIVATION)
        allocation_size = c * h * w;
    else
        allocation_size = c;

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void BatchNormLayerParams::allocateSpace(cudnnDataType_t data_type, size_t data_type_size, size_t &free_bytes, bool alloc_derivative) {

    size_t allocation_size_bytes = allocation_size * data_type_size;

    checkCudaErrors(cudaMalloc(&scale, allocation_size_bytes));
    checkCudaErrors(cudaMalloc(&bias, allocation_size_bytes));
    if (alloc_derivative) {
        checkCudaErrors(cudaMalloc(&dscale, allocation_size_bytes));
        checkCudaErrors(cudaMalloc(&dbias, allocation_size_bytes));
    }

    checkCudaErrors(cudaMalloc(&running_mean, allocation_size_bytes));
    checkCudaErrors(cudaMalloc(&running_variance, allocation_size_bytes));

    checkCudaErrors(cudaMalloc(&result_save_mean, allocation_size_bytes));
    checkCudaErrors(cudaMalloc(&result_save_inv_var, allocation_size_bytes));

    // NOTE: the fillValue launch configurations were garbled in this copy of the file;
    // the <<<grid, block>>> arguments below are a reconstruction that assumes the
    // block-size constant BW from utils.h
    if (data_type == CUDNN_DATA_FLOAT) {
        fillValue<float><<<ceil(1.0 * allocation_size / BW), BW>>>((float *)scale, allocation_size, 1);
        fillValue<float><<<ceil(1.0 * allocation_size / BW), BW>>>((float *)bias, allocation_size, 1);
    }
    else if (data_type == CUDNN_DATA_DOUBLE) {
        fillValue<double><<<ceil(1.0 * allocation_size / BW), BW>>>((double *)scale, allocation_size, 1);
        fillValue<double><<<ceil(1.0 * allocation_size / BW), BW>>>((double *)bias, allocation_size, 1);
    }
    free_bytes = free_bytes - 6 * allocation_size_bytes;

}

void BatchNormLayerParams::cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream) {
    checkCNMEM(cnmemMalloc(&dscale, allocation_size * data_type_size, stream));
    checkCNMEM(cnmemMalloc(&dbias, allocation_size * data_type_size, stream));
}

bool BatchNormLayerParams::cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream,
                                                      size_t &max_consume, size_t free_bytes, bool &out_of_memory) {
    checkCNMEMSim(cnmemMalloc(&dscale, allocation_size * data_type_size, stream),
                  allocation_size * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    checkCNMEMSim(cnmemMalloc(&dbias, allocation_size * data_type_size, stream),
                  allocation_size * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    return true;
}
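// stepParams below mirrors the convolution/FC update: one SGD axpy step on the
// per-channel (or per-activation) scale and bias vectors.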
void BatchNormLayerParams::stepParams(cublasHandle_t cublas_handle, double learning_rate) {
    float Salpha = -learning_rate;
    double Dalpha = -learning_rate;

    if (update_rule == SGD) {
        if (data_type == CUDNN_DATA_FLOAT) {
            checkCUBLAS(cublasSaxpy(cublas_handle, sbmv_size,
                                    &Salpha,
                                    (float *)dscale, 1,
                                    (float *)scale, 1));
            checkCUBLAS(cublasSaxpy(cublas_handle, sbmv_size,
                                    &Salpha,
                                    (float *)dbias, 1,
                                    (float *)bias, 1));
        }
        else if (data_type == CUDNN_DATA_DOUBLE) {
            checkCUBLAS(cublasDaxpy(cublas_handle, sbmv_size,
                                    &Dalpha,
                                    (double *)dscale, 1,
                                    (double *)scale, 1));
            checkCUBLAS(cublasDaxpy(cublas_handle, sbmv_size,
                                    &Dalpha,
                                    (double *)dbias, 1,
                                    (double *)bias, 1));
        }
    }
}

void BatchNormLayerParams::cnmemFreeDerivatives(cudaStream_t stream) {
    checkCNMEM(cnmemFree(dscale, stream));
    checkCNMEM(cnmemFree(dbias, stream));
}

void PoolingLayerParams::initializeValues(PoolingDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format,
                                          int batch_size, LayerDimension &output_size) {
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));
    checkCUDNN(cudnnCreateTensorDescriptor(&output_tensor));

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->input_channels, user_params->input_h, user_params->input_w));

    checkCUDNN(cudnnCreatePoolingDescriptor(&pool_desc));

    cudnnPoolingMode_t mode;
    if (user_params->mode == POOLING_MAX)
        mode = CUDNN_POOLING_MAX;
    else if (user_params->mode == POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
        mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    else if (user_params->mode == POOLING_AVERAGE_COUNT_EXCLUDE_PADDING)
        mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;

    checkCUDNN(cudnnSetPooling2dDescriptor(pool_desc, mode, CUDNN_PROPAGATE_NAN,
                                           user_params->kernel_h, user_params->kernel_w,
                                           user_params->pad_h, user_params->pad_w,
                                           user_params->stride_y, user_params->stride_x));

    int output_batch_size, output_channels, output_h, output_w;
    checkCUDNN(cudnnGetPooling2dForwardOutputDim(pool_desc, input_tensor,
                                                 &output_batch_size, &output_channels, &output_h, &output_w));

    checkCUDNN(cudnnSetTensor4dDescriptor(output_tensor, tensor_format, data_type,
                                          output_batch_size, output_channels, output_h, output_w));

    output_size.N = output_batch_size, output_size.C = output_channels, output_size.H = output_h, output_size.W = output_w;
}

void PoolingLayerParams::allocateSpace(size_t &free_bytes) {

}
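// The remaining layer types (activation, softmax) carry no trainable parameters, so
// their allocateSpace methods are intentionally empty; initializeValues only builds
// the cuDNN descriptors and reports the (unchanged) output dimensions.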
void ActivationLayerParams::initializeValues(ActivationDescriptor *user_params, cudnnDataType_t data_type,
                                             cudnnTensorFormat_t tensor_format, int batch_size, LayerDimension &output_size) {
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    cudnnActivationMode_t mode;
    if (user_params->mode == SIGMOID)
        mode = CUDNN_ACTIVATION_SIGMOID;
    else if (user_params->mode == RELU)
        mode = CUDNN_ACTIVATION_RELU;
    else if (user_params->mode == TANH)
        mode = CUDNN_ACTIVATION_TANH;
    else if (user_params->mode == CLIPPED_RELU)
        mode = CUDNN_ACTIVATION_CLIPPED_RELU;
    else if (user_params->mode == ELU)
        mode = CUDNN_ACTIVATION_ELU;

    checkCUDNN(cudnnCreateActivationDescriptor(&actv_desc));
    checkCUDNN(cudnnSetActivationDescriptor(actv_desc, mode, CUDNN_PROPAGATE_NAN, user_params->coef));

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void ActivationLayerParams::allocateSpace(size_t &free_bytes) {

}

void SoftmaxLayerParams::initializeValues(SoftmaxDescriptor *user_params, cudnnDataType_t data_type,
                                          cudnnTensorFormat_t tensor_format, int batch_size, LayerDimension &output_size) {
    if (user_params->algo == SOFTMAX_FAST)
        algo = CUDNN_SOFTMAX_FAST;
    else if (user_params->algo == SOFTMAX_ACCURATE)
        algo = CUDNN_SOFTMAX_ACCURATE;

    if (user_params->mode == SOFTMAX_MODE_INSTANCE)
        mode = CUDNN_SOFTMAX_MODE_INSTANCE;
    else if (user_params->mode == SOFTMAX_MODE_CHANNEL) {
        mode = CUDNN_SOFTMAX_MODE_CHANNEL;
    }

    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));
    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void SoftmaxLayerParams::allocateSpace(size_t &free_bytes) {

}
--------------------------------------------------------------------------------