├── README.md ├── src ├── utils.cu ├── user_iface.cu ├── mnist_test.cu ├── solver.cu ├── slow_conv.cu ├── vgg_test.cu ├── main.cu ├── alexnet_test.cu ├── neural_net_time.cu └── layer_params.cu ├── CMakeLists.txt ├── cnmem ├── LICENSE ├── CMakeLists.txt ├── README.md └── include │ └── cnmem.h └── include ├── solver.h ├── neural_net.h ├── user_iface.h ├── utils.h ├── layer_params.h └── cnmem.h /README.md: -------------------------------------------------------------------------------- 1 | # vDNN 2 | My implementation of the paper titled **vDNN: Virtualized Deep Neural Networks for Scalable, Memory-Efficient Neural Network Design** (https://arxiv.org/abs/1602.08124). Supports only linear networks currently. 3 | 4 | cnmem/ is a software-side memory manager by Nvidia (https://github.com/NVIDIA/cnmem). The original source has been modified to use heuristics other than best-fit. 5 | 6 | ### Instructions to set up 7 | Run cmake and make inside ./cnmem/ as well as in the repository root (./). Look at vgg_test.cu for an example of specifying and training a neural network. A new program has to be added to ./CMakeLists.txt; see how vgg_test.cu has been added there for an example. Essential API function declarations for training are in include/user_iface.h and include/solver.h. 8 | -------------------------------------------------------------------------------- /src/utils.cu: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | int LayerDimension::getTotalSize() { 4 | return N * C * H * W; 5 | } 6 | 7 | void outOfMemory() { 8 | std::cout << "Out of Memory\n"; 9 | exit(0); 10 | } 11 | 12 | CnmemSpace::CnmemSpace(size_t free_bytes) { 13 | this->free_bytes = free_bytes; 14 | this->initial_free_bytes = free_bytes; 15 | this->out_of_memory = false; 16 | } 17 | 18 | void CnmemSpace::updateSpace(CnmemSpace::Op op, size_t size) { 19 | 20 | if (op == ADD) 21 | free_bytes += ceil(1.0 * size / CNMEM_GRANULARITY) * CNMEM_GRANULARITY; 22 | else if (op == SUB) { 23 | size_t required_space = ceil(1.0 * size / CNMEM_GRANULARITY) * CNMEM_GRANULARITY; 24 | if (required_space > free_bytes) 25 | this->out_of_memory = true; 26 | free_bytes -= required_space; 27 | } 28 | } 29 | 30 | bool CnmemSpace::isAvailable() { 31 | return !out_of_memory; 32 | } 33 | 34 | size_t CnmemSpace::getConsumed() { 35 | return (initial_free_bytes - free_bytes); 36 | } 37 | 38 | void CnmemSpace::updateMaxConsume(size_t &max_consume) { 39 | max_consume = max_consume > (initial_free_bytes - free_bytes) ?
max_consume : (initial_free_bytes - free_bytes); 40 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project( vDNNNeuralNet ) 3 | include_directories( include /usr/include /usr/local/cuda/include /usr/include/x86_64-linux-gnu /usr/local/cuda/samples/common/inc ) 4 | 5 | link_directories( /usr/local/cuda/lib64/ ) 6 | 7 | find_package(CUDA) 8 | 9 | cuda_add_executable( neural_net_vdnn.out src/main.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 10 | target_link_libraries( neural_net_vdnn.out -lcudnn -lcublas -lcurand) 11 | 12 | cuda_add_executable( mnist_test.out src/mnist_test.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 13 | target_link_libraries( mnist_test.out -lcudnn -lcublas -lcurand ) 14 | 15 | cuda_add_executable( alexnet_test.out src/alexnet_test.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 16 | target_link_libraries( alexnet_test.out -lcudnn -lcublas -lcurand ) 17 | 18 | cuda_add_executable( vgg_test.out src/vgg_test.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 19 | target_link_libraries( vgg_test.out -lcudnn -lcublas -lcurand) 20 | 21 | cuda_add_executable( slow_conv.out src/slow_conv.cu src/solver.cu src/neural_net.cu src/neural_net_time.cu src/layer_params.cu src/user_iface.cu src/utils.cu src/cnmem.cpp) 22 | target_link_libraries( slow_conv.out -lcudnn -lcublas -lcurand) 23 | 24 | 25 | -------------------------------------------------------------------------------- /cnmem/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of NVIDIA CORPORATION nor the names of its 12 | contributors may be used to endorse or promote products derived 13 | from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | 27 | -------------------------------------------------------------------------------- /include/solver.h: -------------------------------------------------------------------------------- 1 | #include "neural_net.h" 2 | 3 | class Solver { 4 | public: 5 | NeuralNet *model; 6 | void *X_train, *X_val; 7 | int *y_train, *y_val; 8 | int num_epoch; 9 | UpdateRule update_rule; 10 | double learning_rate, learning_rate_decay; 11 | int num_train, num_val; 12 | int num_train_batches; 13 | int num_features; 14 | cudaEvent_t start, stop; 15 | 16 | Solver(NeuralNet *model, void *X_train, int *y_train, void *X_val, int *y_val, int num_epoch, UpdateRule update_rule, 17 | double learning_rate, double learning_rate_decay, int num_train, int num_val); 18 | void train(std::vector &loss, std::vector &val_acc); 19 | float step(int start_X, int start_y, std::vector &fwd_vdnn_lag, std::vector &bwd_vdnn_lag); 20 | float step(int start_X, int start_y); 21 | void checkAccuracy(void *X, int *y, int num_samples, int *num_correct); 22 | 23 | void getTrainTime(std::vector &loss, std::vector &time, int num_epoch, 24 | std::vector > &fwd_vdnn_lag, std::vector > &bwd_vdnn_lag); 25 | 26 | void getComputationTime(long num_epoch, std::vector > &fwd_computation_time, std::vector > &bwd_computation_time); 27 | void stepComputationTime(int start_X, int start_y, std::vector &fwd_computation_time, std::vector &bwd_computation_time); 28 | 29 | void getTransferTime(long num_epoch, std::vector > &fwd_transfer_time, std::vector > &bwd_transfer_time); 30 | void stepTransferTime(int start_X, int start_y, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time); 31 | 32 | }; -------------------------------------------------------------------------------- /cnmem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # CMakeLists to build the cnmem library. 2 | cmake_minimum_required(VERSION 2.8.8) 3 | project(cnmem) 4 | 5 | # We need CUDA to build that library. 6 | find_package(CUDA QUIET REQUIRED) 7 | include_directories(${CUDA_INCLUDE_DIRS}) 8 | 9 | # Rules to build the cnmem library. 10 | include_directories(include) 11 | add_definitions(-DCNMEM_DLLEXPORT) 12 | add_library(cnmem SHARED src/cnmem.cpp) 13 | set_target_properties(cnmem PROPERTIES VERSION 1.0.0 SOVERSION 1) 14 | target_link_libraries(cnmem LINK_PUBLIC ${CUDA_LIBRARIES}) 15 | install(TARGETS cnmem RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib) 16 | install(FILES include/cnmem.h DESTINATION include) 17 | 18 | # Add the tests. 19 | if(WITH_TESTS) 20 | 21 | # Get Google tests. 22 | find_package(GTest QUIET REQUIRED) 23 | include_directories(${GTEST_INCLUDE_DIRS}) 24 | 25 | # Build the executable. 26 | add_executable(cnmem_tests tests/cnmem_tests.cpp) 27 | if(MSVC) 28 | if(MSVC_VERSION GREATER 1700) # Visual Studio 11 or more. 
29 | add_definitions(-DUSE_CPP_11) 30 | endif(MSVC_VERSION GREATER 1700) 31 | endif(MSVC) 32 | if(CMAKE_COMPILER_IS_GNUCC) 33 | add_definitions(-std=c++11 -DUSE_CPP_11) 34 | endif(CMAKE_COMPILER_IS_GNUCC) 35 | target_link_libraries(cnmem_tests LINK_PUBLIC cnmem ${CUDA_LIBRARIES} ${GTEST_LIBRARIES} -lpthread) 36 | install(TARGETS cnmem_tests RUNTIME DESTINATION bin) 37 | 38 | # On Windows, we copy the Google test DLL to the bin folder. 39 | if(MSVC) 40 | get_filename_component(gtest_dll_path ${GTEST_LIBRARIES} DIRECTORY) 41 | install(FILES ${gtest_dll_path}/gtest.dll DESTINATION bin) 42 | endif(MSVC) 43 | 44 | endif(WITH_TESTS) 45 | 46 | -------------------------------------------------------------------------------- /cnmem/README.md: -------------------------------------------------------------------------------- 1 | # CNMeM Library 2 | 3 | Simple library to help the Deep Learning frameworks manage CUDA memory. 4 | 5 | CNMeM is not intended to be a general purpose memory management library. It was designed as a simple 6 | tool for applications which work on a limited number of large memory buffers. 7 | 8 | CNMeM is mostly developed on Ubuntu Linux. It should support other operating systems as well. If you 9 | encounter an issue with the library on other operating systems, please submit a bug (or a fix). 10 | 11 | # Prerequisites 12 | 13 | CNMeM relies on the CUDA toolkit. It uses C++ STL and the Pthread library on Linux. On Windows, it uses 14 | the native Win32 threading library. The build system uses CMake. The unit tests are written using 15 | Google tests (but are not mandatory). 16 | 17 | ## CUDA 18 | 19 | The CUDA toolkit is required. We recommend using CUDA >= 7.0 even if earlier versions will work. 20 | * Download from the [CUDA website](https://developer.nvidia.com/cuda-downloads) 21 | * Follow the installation instructions 22 | * Don't forget to set your path. For example: 23 | * `CUDA_HOME=/usr/local/cuda` 24 | * `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH` 25 | 26 | # Build CNMeM 27 | 28 | ## Grab the source 29 | 30 | % cd $HOME 31 | % git clone https://github.com/NVIDIA/cnmem.git cnmem 32 | 33 | ## Build CNMeM without the unit tests 34 | 35 | % cd cnmem 36 | % mkdir build 37 | % cd build 38 | % cmake .. 39 | % make 40 | 41 | ## Build CNMeM with the unit tests 42 | 43 | To build the tests, you need to add an extra option to the cmake command. 44 | 45 | % cd cnmem 46 | % mkdir build 47 | % cd build 48 | % cmake -DWITH_TESTS=True .. 49 | % make 50 | 51 | ## Link with CNMeM 52 | 53 | The source folder contains a header file 'include/cnmem.h' and the build directory contains the 54 | library 'libcnmem.so', 'cnmem.lib/cnmem.dll' or 'libcnmem.dylib', depending on your operating 55 | system. 
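To sanity-check the build and the linking step, a minimal program along the lines of the sketch below can be compiled against include/cnmem.h and linked with the built library (e.g. with `-lcnmem -lcudart` on Linux). This is an illustrative sketch only; the file name and the sizes are arbitrary, and it uses nothing beyond the functions declared in cnmem.h.

    // check_cnmem.cpp -- hypothetical smoke test for a CNMeM build
    #include <cstdio>
    #include <cstring>
    #include "cnmem.h"

    int main() {
        cnmemDevice_t device;
        std::memset(&device, 0, sizeof(device));
        device.device = 0;                        // use GPU 0
        device.size = 512 * 1024 * 1024;          // reserve 512 MB up front
        cnmemStatus_t status = cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);
        if (status != CNMEM_STATUS_SUCCESS) {
            std::printf("cnmemInit failed: %s\n", cnmemGetErrorString(status));
            return 1;
        }
        void *buffer = NULL;
        // grab and release 1 MB from the NULL-stream pool
        status = cnmemMalloc(&buffer, 1 << 20, NULL);
        std::printf("cnmemMalloc: %s\n", cnmemGetErrorString(status));
        if (status == CNMEM_STATUS_SUCCESS)
            cnmemFree(buffer, NULL);
        cnmemFinalize();
        return 0;
    }

Note that the vDNN build at the top of this repository does not link against libcnmem at all; it compiles src/cnmem.cpp directly into each executable (see the top-level CMakeLists.txt).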
56 | 57 | -------------------------------------------------------------------------------- /include/neural_net.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "user_iface.h" 10 | #include "layer_params.h" 11 | #include "utils.h" 12 | 13 | // ---------------------- vDNN start ---------------------- 14 | #include 15 | // ---------------------- vDNN emd ------------------------ 16 | 17 | #ifndef NEURAL_NET 18 | #define NEURAL_NET 19 | class NeuralNet { 20 | public: 21 | void **layer_input, **dlayer_input, **params; 22 | int *layer_input_size; 23 | int *y, *pred_y; 24 | float *loss; 25 | float softmax_eps; 26 | void *one_vec; 27 | float init_std_dev; 28 | 29 | std::vector layer_type; 30 | int num_layers; 31 | cudnnHandle_t cudnn_handle; 32 | cublasHandle_t cublas_handle; 33 | curandGenerator_t curand_gen; 34 | 35 | cudnnDataType_t data_type; 36 | size_t data_type_size; 37 | cudnnTensorFormat_t tensor_format; 38 | int batch_size; 39 | 40 | size_t init_free_bytes, free_bytes, total_bytes; 41 | size_t workspace_size; 42 | void *workspace; 43 | 44 | int input_channels, input_h, input_w; 45 | int num_classes; 46 | 47 | float *h_loss; 48 | int *h_pred_y; 49 | 50 | // vDNN 51 | vDNNType vdnn_type; 52 | vDNNConvAlgo vdnn_conv_algo; 53 | cudaStream_t stream_compute, stream_memory; 54 | 55 | bool pre_alloc_conv_derivative, pre_alloc_fc_derivative, pre_alloc_batch_norm_derivative; 56 | 57 | void **h_layer_input; 58 | bool *to_offload, *prefetched; 59 | 60 | enum OffloadType {OFFLOAD_ALL, OFFLOAD_NONE, OFFLOAD_CONV, OFFLOAD_ALTERNATE_CONV}; 61 | 62 | NeuralNet(std::vector &layers, DataType data_type, int batch_size, TensorFormat tensor_format, 63 | long long dropout_seed, float softmax_eps, float init_std_dev, vDNNType vdnn_type, vDNNConvAlgo vdnn_conv_algo, 64 | UpdateRule update_rule); 65 | 66 | void getLoss(void *X, int *y, double learning_rate, std::vector &fwd_vdnn_lag, std::vector &bwd_vdnn_lag, bool train = true, int *correct_count = NULL, float *loss = NULL); 67 | void getLoss(void *X, int *y, double learning_rate, bool train = true, int *correct_count = NULL, float *loss = NULL); 68 | 69 | void compareOutputCorrect(int *correct_count, int *y); 70 | 71 | float computeLoss(); 72 | 73 | int findPrefetchLayer(int cur_layer); 74 | 75 | bool simulateNeuralNetworkMemory(vDNNConvAlgoPref algo_pref, bool hard, size_t &exp_max_consume, size_t &max_consume); 76 | bool simulateCNMEMMemory(size_t &max_consume); 77 | void vDNNOptimize(size_t &exp_max_consume, size_t &max_consume); 78 | void setOffload(OffloadType offload_type); 79 | void resetPrefetched(); 80 | 81 | // data of time 82 | cudaEvent_t start_compute, stop_compute; 83 | void getComputationTime(void *X, int *y, double learning_rate, std::vector &fwd_computation_time, std::vector &bwd_computation_time); 84 | cudaEvent_t start_transfer, stop_transfer; 85 | void getTransferTime(void *X, int *y, double learning_rate, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time); 86 | }; 87 | 88 | #endif -------------------------------------------------------------------------------- /include/user_iface.h: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #ifndef USER_IFACE 4 | #define USER_IFACE 5 | 6 | enum LayerOp {CONV, FULLY_CONNECTED, BATCHNORM, DROPOUT, POOLING, ACTV, SOFTMAX, CROSS_ENTROPY, SVM}; 7 | enum SoftmaxAlgorithm {SOFTMAX_FAST, 
SOFTMAX_ACCURATE}; 8 | enum SoftmaxMode {SOFTMAX_MODE_INSTANCE, SOFTMAX_MODE_CHANNEL}; 9 | enum DataType {DATA_FLOAT, DATA_DOUBLE}; 10 | enum TensorFormat {TENSOR_NCHW, TENSOR_NHWC}; 11 | enum BatchNormMode {BATCHNORM_PER_ACTIVATION, BATCHNORM_SPATIAL}; 12 | enum PoolingMode {POOLING_MAX, POOLING_AVERAGE_COUNT_INCLUDE_PADDING, POOLING_AVERAGE_COUNT_EXCLUDE_PADDING}; 13 | enum ActivationMode {SIGMOID, RELU, TANH, CLIPPED_RELU, ELU, ACTIVATION_NONE}; 14 | enum UpdateRule {SGD}; 15 | enum vDNNType {vDNN_ALL, vDNN_CONV, vDNN_NONE, vDNN_DYN, vDNN_ALTERNATE_CONV}; 16 | enum vDNNConvAlgo {vDNN_PERFORMANCE_OPTIMAL, vDNN_MEMORY_OPTIMAL}; 17 | 18 | struct ConvDescriptor { 19 | int input_channels, output_channels, kernel_h, kernel_w; // define kernel parameters 20 | int input_h, input_w; // output width can be inferred 21 | int pad_h, pad_w, stride_y, stride_x; 22 | ActivationMode activation_mode; 23 | double actv_coef; 24 | 25 | void initializeValues(int input_channels, int output_channels, int kernel_h, int kernel_w, int input_h, int input_w, 26 | int pad_h, int pad_w, int stride_x, int stride_y, ActivationMode activation_mode = ACTIVATION_NONE, double actv_coef = 1.0); 27 | 28 | }; 29 | 30 | struct PoolingDescriptor { 31 | int input_channels, kernel_h, kernel_w; 32 | int input_h, input_w; 33 | int pad_h, pad_w, stride_y, stride_x; 34 | PoolingMode mode; 35 | 36 | void initializeValues(int input_channels, int kernel_h, int kernel_w, 37 | int input_h, int input_w, int pad_h, int pad_w, int stride_x, int stride_y, PoolingMode mode); 38 | }; 39 | 40 | struct DropoutDescriptor { 41 | double dropout_value; 42 | int channels, h, w; 43 | 44 | void initializeValues(double dropout_value, int channels, int h, int w); 45 | }; 46 | 47 | struct FCDescriptor { 48 | int input_channels, output_channels; 49 | ActivationMode activation_mode; 50 | double actv_coef; 51 | 52 | void initializeValues(int input_channels, int output_channels, ActivationMode activation_mode = ACTIVATION_NONE, double actv_coef = 1.0); 53 | 54 | }; 55 | 56 | struct BatchNormDescriptor { 57 | BatchNormMode mode; 58 | double epsilon, factor; 59 | int channels, h, w; 60 | 61 | void initializeValues(BatchNormMode mode, double epsilon, double factor, int channels, int h, int w); 62 | }; 63 | 64 | struct ActivationDescriptor { 65 | ActivationMode mode; 66 | int channels, h, w; 67 | double coef; 68 | void initializeValues(ActivationMode mode, int channels, int h, int w, double coef = 1.0); 69 | }; 70 | 71 | struct SoftmaxDescriptor { 72 | int channels, h, w; 73 | SoftmaxAlgorithm algo; 74 | SoftmaxMode mode; 75 | 76 | void initializeValues(SoftmaxAlgorithm algo, SoftmaxMode mode, int channels, int h, int w); 77 | }; 78 | 79 | struct LayerSpecifier { 80 | LayerOp type; 81 | void *params; 82 | 83 | void initPointer(LayerOp type); 84 | 85 | void freePointer(); 86 | 87 | }; 88 | 89 | #endif -------------------------------------------------------------------------------- /src/user_iface.cu: -------------------------------------------------------------------------------- 1 | #include "user_iface.h" 2 | 3 | void ConvDescriptor::initializeValues(int input_channels, int output_channels, int kernel_h, int kernel_w, int input_h, int input_w, 4 | int pad_h, int pad_w, int stride_x, int stride_y, ActivationMode activation_mode, double actv_coef) { 5 | this->input_channels = input_channels, this->output_channels = output_channels, this->kernel_h = kernel_h, this->kernel_w = kernel_w; 6 | this->input_h = input_h, this->input_w = input_w; 7 | this->pad_h = pad_h, 
this->pad_w = pad_w, this->stride_y = stride_y, this->stride_x = stride_x; 8 | this->activation_mode = activation_mode; 9 | this->actv_coef = actv_coef; 10 | } 11 | 12 | void PoolingDescriptor::initializeValues(int input_channels, int kernel_h, int kernel_w, 13 | int input_h, int input_w, int pad_h, int pad_w, int stride_x, int stride_y, PoolingMode mode) { 14 | this->input_channels = input_channels, this->kernel_h = kernel_h, this->kernel_w = kernel_w; 15 | this->input_h = input_h, this->input_w = input_w; 16 | this->pad_h = pad_h, this->pad_w = pad_w, this->stride_y = stride_y, this->stride_x = stride_x; 17 | this->mode = mode; 18 | } 19 | 20 | void DropoutDescriptor::initializeValues(double dropout_value, int channels, int h, int w) { 21 | this->dropout_value = dropout_value; 22 | this->channels = channels; 23 | this->h = h; 24 | this->w = w; 25 | } 26 | 27 | void FCDescriptor::initializeValues(int input_channels, int output_channels, ActivationMode activation_mode, double actv_coef) { 28 | this->input_channels = input_channels; 29 | this->output_channels = output_channels; 30 | this->activation_mode = activation_mode; 31 | this->actv_coef = actv_coef; 32 | } 33 | 34 | 35 | void BatchNormDescriptor::initializeValues(BatchNormMode mode, double epsilon, double factor, int channels, int h, int w) { 36 | this->mode = mode; 37 | this->epsilon = epsilon, this->factor = factor; 38 | this->channels = channels, this->h = h, this->w = w; 39 | } 40 | 41 | void ActivationDescriptor::initializeValues(ActivationMode mode, int channels, int h, int w, double coef) { 42 | this->mode = mode; 43 | this->channels = channels; 44 | this->h = h; 45 | this->w = w; 46 | this->coef = coef; 47 | } 48 | 49 | void SoftmaxDescriptor::initializeValues(SoftmaxAlgorithm algo, SoftmaxMode mode, int channels, int h, int w) { 50 | this->algo = algo; 51 | this->mode = mode; 52 | this->channels = channels; 53 | this->h = h; 54 | this->w = w; 55 | } 56 | 57 | void LayerSpecifier::initPointer(LayerOp type) { 58 | this->type = type; 59 | if (type == CONV) 60 | params = malloc(sizeof(ConvDescriptor)); 61 | else if (type == FULLY_CONNECTED) 62 | params = malloc(sizeof(FCDescriptor)); 63 | else if (type == BATCHNORM) 64 | params = malloc(sizeof(BatchNormDescriptor)); 65 | else if (type == DROPOUT) 66 | params = malloc(sizeof(DropoutDescriptor)); 67 | else if (type == POOLING) 68 | params = malloc(sizeof(PoolingDescriptor)); 69 | else if (type == ACTV) 70 | params = malloc(sizeof(ActivationDescriptor)); 71 | else if (type == SOFTMAX) 72 | params = malloc(sizeof(SoftmaxDescriptor)); 73 | } 74 | 75 | void LayerSpecifier::freePointer() { 76 | free(params); 77 | } 78 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #ifndef UTILS 9 | #define UTILS 10 | 11 | #define BW (16 * 16) 12 | #define CNMEM_GRANULARITY 512 13 | 14 | #define FatalError(s) do { \ 15 | std::stringstream _where, _message; \ 16 | _where << __FILE__ << ':' << __LINE__; \ 17 | _message << std::string(s) + "\n" << __FILE__ << ':' << __LINE__; \ 18 | std::cerr << _message.str() << "\nAborting...\n"; \ 19 | cudaDeviceReset(); \ 20 | exit(1); \ 21 | } while(0) 22 | 23 | #define checkCUDNN(expression) \ 24 | { \ 25 | cudnnStatus_t status = (expression); \ 26 | if (status != CUDNN_STATUS_SUCCESS) { \ 27 | std::cerr << "Error in file " << __FILE__ << " on line " 
<< __LINE__ << ": " \ 28 | << cudnnGetErrorString(status) << std::endl; \ 29 | std::exit(EXIT_FAILURE); \ 30 | } \ 31 | } 32 | 33 | #define checkCUBLAS(expression) \ 34 | { \ 35 | cublasStatus_t status = (expression); \ 36 | if (status != CUBLAS_STATUS_SUCCESS) { \ 37 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 38 | << _cudaGetErrorEnum(status) << std::endl; \ 39 | std::exit(EXIT_FAILURE); \ 40 | } \ 41 | } 42 | 43 | #define checkCURAND(expression) \ 44 | { \ 45 | curandStatus_t status = (expression); \ 46 | if (status != CURAND_STATUS_SUCCESS) { \ 47 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 48 | << _cudaGetErrorEnum(status) << std::endl; \ 49 | std::exit(EXIT_FAILURE); \ 50 | } \ 51 | } 52 | 53 | #define checkCNMEM(expression) \ 54 | { \ 55 | cnmemStatus_t status = (expression); \ 56 | if (status != CNMEM_STATUS_SUCCESS) { \ 57 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 58 | << cnmemGetErrorString(status) << std::endl; \ 59 | std::exit(EXIT_FAILURE); \ 60 | } \ 61 | } 62 | 63 | #define checkCNMEMRet(expression) \ 64 | { \ 65 | cnmemStatus_t status = (expression); \ 66 | if (status != CNMEM_STATUS_SUCCESS) { \ 67 | if (status == CNMEM_STATUS_OUT_OF_MEMORY) { \ 68 | return false; \ 69 | } \ 70 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 71 | << cnmemGetErrorString(status) << std::endl; \ 72 | std::exit(EXIT_FAILURE); \ 73 | } \ 74 | } 75 | 76 | #define checkCNMEMSim(expression, req_size, max_consume, free_bytes, action, flag) \ 77 | { \ 78 | cnmemStatus_t status = (expression); \ 79 | if (status != CNMEM_STATUS_SUCCESS) { \ 80 | if (status == CNMEM_STATUS_OUT_OF_MEMORY) { \ 81 | flag = true; \ 82 | size_t largest_free_block_size = 0; \ 83 | cnmemGetLargestFreeBlockSize(largest_free_block_size, NULL); \ 84 | max_consume = req_size - largest_free_block_size + max_consume; \ 85 | max_consume = (max_consume > free_bytes) ? 
free_bytes : max_consume; \ 86 | action; \ 87 | } \ 88 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 89 | << cnmemGetErrorString(status) << std::endl; \ 90 | std::exit(EXIT_FAILURE); \ 91 | } \ 92 | } 93 | 94 | struct LayerDimension { 95 | int N, C, H, W; 96 | 97 | int getTotalSize(); 98 | }; 99 | 100 | template 101 | __global__ void fillValue(T *v, int size, int value) { 102 | int i = blockIdx.x * blockDim.x + threadIdx.x; 103 | if (i >= size) 104 | return; 105 | v[i] = value; 106 | } 107 | 108 | void outOfMemory(); 109 | 110 | struct CnmemSpace { 111 | size_t free_bytes; 112 | size_t initial_free_bytes; 113 | bool out_of_memory; 114 | 115 | enum Op {ADD, SUB}; 116 | 117 | CnmemSpace(size_t free_bytes); 118 | 119 | void updateSpace(Op op, size_t size); 120 | 121 | bool isAvailable(); 122 | 123 | size_t getConsumed(); 124 | 125 | void updateMaxConsume(size_t &max_consume); 126 | 127 | }; 128 | 129 | #endif -------------------------------------------------------------------------------- /src/mnist_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "solver.h" 6 | 7 | using namespace std; 8 | 9 | typedef unsigned char uchar; 10 | 11 | int num_train = 60000, num_test = 10000; 12 | 13 | int reverseInt(int n) { 14 | int bytes = 4; 15 | unsigned char ch[bytes]; 16 | for (int i = 0; i < bytes; i++) { 17 | ch[i] = (n >> i * 8) & 255; 18 | } 19 | int p = 0; 20 | for (int i = 0; i < bytes; i++) { 21 | p += (int) ch[i] << (bytes - i - 1) * 8; 22 | } 23 | return p; 24 | } 25 | 26 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 27 | string filename_train_images = "data/train-images.idx3-ubyte"; 28 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 29 | 30 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 31 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 32 | 33 | // read train/test images 34 | for (int i = 0; i < 2; i++) { 35 | string filename; 36 | if (i == 0) 37 | filename = filename_train_images; 38 | else 39 | filename = filename_test_images; 40 | 41 | ifstream f(filename.c_str(), ios::binary); 42 | if (!f.is_open()) 43 | printf("Cannot read MNIST from %s\n", filename.c_str()); 44 | 45 | // read metadata 46 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 47 | f.read((char *) &magic_number, sizeof(magic_number)); 48 | magic_number = reverseInt(magic_number); 49 | f.read((char *) &n_images, sizeof(n_images)); 50 | n_images = reverseInt(n_images); 51 | f.read((char *) &n_rows, sizeof(n_rows)); 52 | n_rows = reverseInt(n_rows); 53 | f.read((char *) &n_cols, sizeof(n_cols)); 54 | n_cols = reverseInt(n_cols); 55 | 56 | for (int k = 0; k < n_images; k++) { 57 | vector temp; 58 | temp.reserve(n_rows * n_cols); 59 | for (int j = 0; j < n_rows * n_cols; j++) { 60 | uchar t = 0; 61 | f.read((char *)&t, sizeof(t)); 62 | temp.push_back(t); 63 | } 64 | if (i == 0) 65 | train_images.push_back(temp); 66 | else 67 | test_images.push_back(temp); 68 | } 69 | f.close(); 70 | 71 | } 72 | 73 | // read train/test labels 74 | for (int i = 0; i < 2; i++) { 75 | string filename; 76 | if (i == 0) 77 | filename = filename_train_labels; 78 | else 79 | filename = filename_test_labels; 80 | 81 | ifstream f(filename.c_str(), ios::binary); 82 | if (!f.is_open()) 83 | printf("Cannot read MNIST from %s\n", filename.c_str()); 84 | 85 | // read metadata 86 | int magic_number = 0, n_labels = 
0; 87 | f.read((char *) &magic_number, sizeof(magic_number)); 88 | magic_number = reverseInt(magic_number); 89 | f.read((char *) &n_labels, sizeof(n_labels)); 90 | n_labels = reverseInt(n_labels); 91 | 92 | for (int k = 0; k < n_labels; k++) { 93 | uchar t = 0; 94 | f.read((char *)&t, sizeof(t)); 95 | if (i == 0) 96 | train_labels.push_back(t); 97 | else 98 | test_labels.push_back(t); 99 | } 100 | 101 | f.close(); 102 | 103 | } 104 | } 105 | 106 | int main() { 107 | 108 | int rows = 28, cols = 28, channels = 1; 109 | float *f_train_images, *f_test_images; 110 | int *f_train_labels, *f_test_labels; 111 | // int rows = 28, cols = 28, channels = 1; 112 | int input_size = rows * cols * channels; 113 | f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 114 | f_train_labels = (int *)malloc(num_train * sizeof(int)); 115 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 116 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 117 | 118 | { 119 | vector > train_images, test_images; 120 | vector train_labels, test_labels; 121 | readMNIST(train_images, test_images, train_labels, test_labels); 122 | 123 | for (int k = 0; k < num_train; k++) { 124 | for (int j = 0; j < rows * cols; j++) { 125 | f_train_images[k * input_size + j] = (float)train_images[k][j]; 126 | } 127 | f_train_labels[k] = (int)train_labels[k]; 128 | } 129 | 130 | for (int k = 0; k < num_test; k++) { 131 | for (int j = 0; j < rows * cols; j++) { 132 | f_test_images[k * input_size + j] = (float)test_images[k][j]; 133 | } 134 | f_test_labels[k] = (int)test_labels[k]; 135 | } 136 | } 137 | 138 | 139 | 140 | float *mean_image; 141 | mean_image = (float *)malloc(input_size * sizeof(float)); 142 | 143 | for (int i = 0; i < input_size; i++) { 144 | mean_image[i] = 0; 145 | for (int k = 0; k < num_train; k++) { 146 | mean_image[i] += f_train_images[k * input_size + i]; 147 | } 148 | mean_image[i] /= num_train; 149 | } 150 | 151 | 152 | for (int i = 0; i < num_train; i++) { 153 | for (int j = 0; j < input_size; j++) { 154 | f_train_images[i * input_size + j] -= mean_image[j]; 155 | } 156 | } 157 | 158 | for (int i = 0; i < num_test; i++) { 159 | for (int j = 0; j < input_size; j++) { 160 | f_test_images[i * input_size + j] -= mean_image[j]; 161 | } 162 | 163 | } 164 | 165 | vector layer_specifier; 166 | { 167 | ConvDescriptor layer0; 168 | layer0.initializeValues(1, 3, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 169 | LayerSpecifier temp; 170 | temp.initPointer(CONV); 171 | *((ConvDescriptor *)temp.params) = layer0; 172 | layer_specifier.push_back(temp); 173 | } 174 | { 175 | FCDescriptor layer1; 176 | layer1.initializeValues(3 * 28 * 28, 50, RELU); 177 | LayerSpecifier temp; 178 | temp.initPointer(FULLY_CONNECTED); 179 | *((FCDescriptor *)temp.params) = layer1; 180 | layer_specifier.push_back(temp); 181 | } 182 | { 183 | FCDescriptor layer2; 184 | layer2.initializeValues(50, 10); 185 | LayerSpecifier temp; 186 | temp.initPointer(FULLY_CONNECTED); 187 | *((FCDescriptor *)temp.params) = layer2; 188 | layer_specifier.push_back(temp); 189 | } 190 | { 191 | SoftmaxDescriptor layer2_smax; 192 | layer2_smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 10, 1, 1); 193 | LayerSpecifier temp; 194 | temp.initPointer(SOFTMAX); 195 | *((SoftmaxDescriptor *)temp.params) = layer2_smax; 196 | layer_specifier.push_back(temp); 197 | } 198 | 199 | int batch_size = 128; 200 | long long dropout_seed = 1; 201 | float softmax_eps = 1e-8; 202 | float init_std_dev = 0.01; 203 | NeuralNet net(layer_specifier, 
DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vDNN_ALL, vDNN_MEMORY_OPTIMAL, SGD); 204 | 205 | int num_epoch = 1000; 206 | double learning_rate = 1e-4; 207 | double learning_rate_decay = 0.9; 208 | 209 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 210 | vector loss; 211 | vector val_acc; 212 | solver.train(loss, val_acc); 213 | int num_correct; 214 | solver.checkAccuracy(f_train_images, f_train_labels, num_train, &num_correct); 215 | cout << num_correct << endl; 216 | 217 | 218 | 219 | } -------------------------------------------------------------------------------- /include/layer_params.h: -------------------------------------------------------------------------------- 1 | #include "user_iface.h" 2 | #include "utils.h" 3 | #include 4 | #include 5 | 6 | #ifndef LAYER_PARAMS 7 | #define LAYER_PARAMS 8 | 9 | enum vDNNConvAlgoPref {PREFER_MEMORY_OPTIMAL, PREFER_PERFORMANCE_OPTIMAL}; 10 | 11 | enum workspaceStatus_t {WORKSPACE_STATUS_SUCCESS, WORKSPACE_STATUS_OUT_OF_MEMORY}; 12 | 13 | #define checkWORKSPACE(expression) \ 14 | { \ 15 | workspaceStatus_t status = (expression); \ 16 | if (status != WORKSPACE_STATUS_SUCCESS) { \ 17 | std::cerr << "Error in file " << __FILE__ << " on line " << __LINE__ << ": " \ 18 | << std::endl; \ 19 | std::exit(EXIT_FAILURE); \ 20 | } \ 21 | } 22 | 23 | struct ConvLayerParams { 24 | void *W, *b; 25 | void *dW, *db; 26 | cudnnTensorDescriptor_t input_tensor, output_tensor, bias_desc; 27 | cudnnFilterDescriptor_t filter_desc; 28 | cudnnConvolutionDescriptor_t conv_desc; 29 | cudnnConvolutionFwdAlgo_t fwd_algo; 30 | cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo; 31 | cudnnConvolutionBwdDataAlgo_t bwd_data_algo; 32 | size_t fwd_workspace_size, bwd_filter_workspace_size, bwd_data_workspace_size; 33 | int C_in, C_out, filter_h, filter_w; 34 | int kernel_size; 35 | enum ConvDirection {FWD, BWD_FILTER, BWD_DATA}; 36 | UpdateRule update_rule; 37 | cudnnDataType_t data_type; 38 | ActivationMode activation_mode; 39 | cudnnActivationDescriptor_t actv_desc; 40 | 41 | 42 | int fwd_req_count, fwd_ret_count; 43 | int bwd_filter_req_count, bwd_filter_ret_count; 44 | int bwd_data_req_count, bwd_data_ret_count; 45 | cudnnConvolutionFwdAlgoPerf_t *fwd_perf; 46 | cudnnConvolutionBwdFilterAlgoPerf_t *bwd_filter_perf; 47 | cudnnConvolutionBwdDataAlgoPerf_t *bwd_data_perf; 48 | 49 | void initializeValues(cudnnHandle_t cudnn_handle, ConvDescriptor *user_params, cudnnDataType_t data_type, 50 | int batch_size, cudnnTensorFormat_t tensor_format, size_t data_type_size, LayerDimension &output_size, 51 | UpdateRule update_rule); 52 | 53 | void allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size, float std_dev, size_t &free_bytes, 54 | bool alloc_derivative); 55 | 56 | size_t getWorkspaceSize(size_t &free_bytes, ConvDirection conv_direction, vDNNConvAlgo vdnn_conv_algo); 57 | workspaceStatus_t getWorkspaceSize(size_t &free_bytes, ConvDirection conv_direction, vDNNConvAlgoPref algo_pref, bool hard_pref, size_t &workspace_size); 58 | 59 | void cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream); 60 | bool cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, size_t &max_consume, size_t free_bytes, bool &out_of_memory); 61 | void stepParams(cublasHandle_t cublas_handle, double learning_rate); 62 | void cnmemFreeDerivatives(cudaStream_t stream); 63 | }; 64 | 65 | 
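// Illustrative sketch (not part of the original header): how a hypothetical caller with a
// limited memory budget might use the workspaceStatus_t-returning overload of
// ConvLayerParams::getWorkspaceSize declared above -- first requesting the
// performance-optimal forward algorithm as a soft preference, then falling back to the
// memory-optimal one as a hard requirement. Whether vDNN's own code follows exactly this
// pattern is not asserted here; the names and enums are taken from this header.
//
//   size_t workspace_size = 0;
//   if (conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                     PREFER_PERFORMANCE_OPTIMAL, /*hard_pref=*/false,
//                                     workspace_size) != WORKSPACE_STATUS_SUCCESS) {
//       checkWORKSPACE(conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                                    PREFER_MEMORY_OPTIMAL, /*hard_pref=*/true,
//                                                    workspace_size));
//   }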
struct FCLayerParams { 66 | void *W, *b; 67 | void *dW, *db; 68 | int C_in, C_out; 69 | int weight_matrix_size; 70 | UpdateRule update_rule; 71 | cudnnDataType_t data_type; 72 | ActivationMode activation_mode; 73 | cudnnActivationDescriptor_t actv_desc; 74 | cudnnTensorDescriptor_t output_tensor; 75 | 76 | void initializeValues(FCDescriptor *user_params, int batch_size, cudnnTensorFormat_t tensor_format, cudnnDataType_t data_type, 77 | LayerDimension &output_size, UpdateRule update_rule); 78 | void allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size, 79 | float std_dev, size_t &free_bytes, bool alloc_derivative); 80 | 81 | void cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream); 82 | bool cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, size_t &max_consume, size_t free_bytes, bool &out_of_memory); 83 | void stepParams(cublasHandle_t cublas_handle, double learning_rate); 84 | void cnmemFreeDerivatives(cudaStream_t stream); 85 | }; 86 | 87 | struct DropoutLayerParams { 88 | cudnnDropoutDescriptor_t dropout_desc; 89 | cudnnTensorDescriptor_t input_tensor; 90 | void *reserved_space; 91 | void *state; 92 | size_t reserved_space_size; 93 | size_t state_size; 94 | 95 | void initializeValues(cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, cudnnDataType_t data_type, int batch_size, 96 | cudnnTensorFormat_t tensor_format, LayerDimension &output_size); 97 | 98 | void allocateSpace(size_t &free_bytes, cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, long long seed); 99 | }; 100 | 101 | struct BatchNormLayerParams { 102 | cudnnTensorDescriptor_t input_tensor; 103 | cudnnTensorDescriptor_t sbmv_desc; 104 | void *scale, *bias; 105 | void *dscale, *dbias; 106 | void *running_mean, *running_variance; 107 | void *result_save_mean, *result_save_inv_var; 108 | double factor, epsilon; 109 | cudnnBatchNormMode_t mode; 110 | int h, w, c; 111 | int sbmv_size; 112 | UpdateRule update_rule; 113 | size_t allocation_size; 114 | cudnnDataType_t data_type; 115 | 116 | void initializeValues(BatchNormDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 117 | int batch_size, LayerDimension &output_size, UpdateRule update_rule); 118 | void allocateSpace(cudnnDataType_t data_type, size_t data_type_size, size_t &free_bytes, bool alloc_derivative); 119 | 120 | void cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream); 121 | bool cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, size_t &max_consume, size_t free_bytes, bool &out_of_memory); 122 | void stepParams(cublasHandle_t cublas_handle, double learning_rate); 123 | void cnmemFreeDerivatives(cudaStream_t stream); 124 | }; 125 | 126 | struct PoolingLayerParams { 127 | cudnnTensorDescriptor_t input_tensor; 128 | cudnnTensorDescriptor_t output_tensor; 129 | 130 | cudnnPoolingDescriptor_t pool_desc; 131 | 132 | void initializeValues(PoolingDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 133 | int batch_size, LayerDimension &output_size); 134 | void allocateSpace(size_t &free_bytes); 135 | }; 136 | 137 | struct ActivationLayerParams { 138 | cudnnActivationDescriptor_t actv_desc; 139 | cudnnTensorDescriptor_t input_tensor; 140 | 141 | void initializeValues(ActivationDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 142 | int batch_size, LayerDimension &output_size); 143 | void allocateSpace(size_t &free_bytes); 144 | }; 145 | 146 | struct 
SoftmaxLayerParams { 147 | cudnnTensorDescriptor_t input_tensor; 148 | cudnnSoftmaxAlgorithm_t algo; 149 | cudnnSoftmaxMode_t mode; 150 | 151 | void initializeValues(SoftmaxDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format, 152 | int batch_size, LayerDimension &output_size); 153 | void allocateSpace(size_t &free_bytes); 154 | }; 155 | 156 | #endif 157 | -------------------------------------------------------------------------------- /src/solver.cu: -------------------------------------------------------------------------------- 1 | #include "solver.h" 2 | 3 | Solver::Solver(NeuralNet *model, void *X_train, int *y_train, void *X_val, int *y_val, int num_epoch, UpdateRule update_rule, 4 | double learning_rate, double learning_rate_decay, int num_train, int num_val) { 5 | this->model = model; 6 | this->X_train = X_train, this->X_val = X_val; 7 | this->y_train = y_train, this->y_val = y_val; 8 | this->num_epoch = num_epoch; 9 | this->update_rule = update_rule; 10 | this->learning_rate = learning_rate, this->learning_rate_decay = learning_rate_decay; 11 | 12 | this->num_train = num_train, this->num_val = num_val; 13 | this->num_features = model->input_channels * model->input_h * model->input_w; 14 | 15 | checkCudaErrors(cudaEventCreate(&start)); 16 | checkCudaErrors(cudaEventCreate(&stop)); 17 | 18 | 19 | } 20 | 21 | float Solver::step(int start_X, int start_y) { 22 | std::vector t1, t2; 23 | return this->step(start_X, start_y, t1, t2); 24 | } 25 | 26 | float Solver::step(int start_X, int start_y, std::vector &fwd_vdnn_lag, std::vector &bwd_vdnn_lag) { 27 | float temp_loss; 28 | // std::cout << "start_X: " << start_X << std::endl; 29 | if (model->data_type == CUDNN_DATA_FLOAT) 30 | model->getLoss(&(((float *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_vdnn_lag, bwd_vdnn_lag, true, NULL, &temp_loss); 31 | else if (model->data_type == CUDNN_DATA_DOUBLE) 32 | model->getLoss(&(((double *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_vdnn_lag, bwd_vdnn_lag, true, NULL, &temp_loss); 33 | 34 | // float Salpha = -learning_rate; 35 | // double Dalpha = -learning_rate; 36 | // if (update_rule == SGD) { 37 | // for (int i = 0; i < model->num_layers; i++) { 38 | // if (model->layer_type[i] == CONV) { 39 | // ConvLayerParams *cur_params = (ConvLayerParams *)model->params[i]; 40 | // int kernel_size = cur_params->C_in * cur_params->C_out * cur_params->filter_h * cur_params->filter_w; 41 | // if (model->data_type == CUDNN_DATA_FLOAT) { 42 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, kernel_size, 43 | // &Salpha, 44 | // (float *)cur_params->dW, 1, 45 | // (float *)cur_params->W, 1)); 46 | 47 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->C_out, 48 | // &Salpha, 49 | // (float *)cur_params->db, 1, 50 | // (float *)cur_params->b, 1)); 51 | // } 52 | // else if (model->data_type == CUDNN_DATA_DOUBLE) { 53 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, kernel_size, 54 | // &Dalpha, 55 | // (double *)cur_params->dW, 1, 56 | // (double *)cur_params->W, 1)); 57 | 58 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->C_out, 59 | // &Dalpha, 60 | // (double *)cur_params->db, 1, 61 | // (double *)cur_params->b, 1)); 62 | // } 63 | 64 | // } 65 | 66 | // else if (model->layer_type[i] == FULLY_CONNECTED) { 67 | // FCLayerParams *cur_params = (FCLayerParams *)model->params[i]; 68 | // if (model->data_type == CUDNN_DATA_FLOAT) { 69 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->C_in * cur_params->C_out, 70 | 
// &Salpha, 71 | // (float *)cur_params->dW, 1, 72 | // (float *)cur_params->W, 1)); 73 | 74 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->C_out, 75 | // &Salpha, 76 | // (float *)cur_params->db, 1, 77 | // (float *)cur_params->b, 1)); 78 | // } 79 | // else if (model->data_type == CUDNN_DATA_DOUBLE) { 80 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->C_in * cur_params->C_out, 81 | // &Dalpha, 82 | // (double *)cur_params->dW, 1, 83 | // (double *)cur_params->W, 1)); 84 | 85 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->C_out, 86 | // &Dalpha, 87 | // (double *)cur_params->db, 1, 88 | // (double *)cur_params->b, 1)); 89 | // } 90 | // } 91 | 92 | // else if (model->layer_type[i] == BATCHNORM) { 93 | // BatchNormLayerParams *cur_params = (BatchNormLayerParams *)model->params[i]; 94 | // if (model->data_type == CUDNN_DATA_FLOAT) { 95 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->sbmv_size, 96 | // &Salpha, 97 | // (float *)cur_params->dscale, 1, 98 | // (float *)cur_params->scale, 1)); 99 | // checkCUBLAS(cublasSaxpy(model->cublas_handle, cur_params->sbmv_size, 100 | // &Salpha, 101 | // (float *)cur_params->dbias, 1, 102 | // (float *)cur_params->bias, 1)); 103 | 104 | // } 105 | // else if (model->data_type == CUDNN_DATA_DOUBLE) { 106 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->sbmv_size, 107 | // &Dalpha, 108 | // (double *)cur_params->dscale, 1, 109 | // (double *)cur_params->scale, 1)); 110 | // checkCUBLAS(cublasDaxpy(model->cublas_handle, cur_params->sbmv_size, 111 | // &Dalpha, 112 | // (double *)cur_params->dbias, 1, 113 | // (double *)cur_params->bias, 1)); 114 | 115 | // } 116 | // } 117 | // } 118 | // } 119 | checkCudaErrors(cudaDeviceSynchronize()); 120 | return temp_loss; 121 | 122 | } 123 | 124 | void Solver::train(std::vector &loss, std::vector &val_acc) { 125 | 126 | int batch_size = model->batch_size; 127 | int num_train_batches = num_train / model->batch_size; 128 | int num_val_batches = num_val / model->batch_size; 129 | for (int i = 0; i < num_epoch; i++) { 130 | for (int j = 0; j < num_train_batches; j++) { 131 | int start_sample = j * num_features * batch_size; 132 | 133 | float milli = 0; 134 | checkCudaErrors(cudaEventRecord(start, model->stream_compute)); 135 | 136 | float temp_loss = step(start_sample, j * batch_size); 137 | 138 | checkCudaErrors(cudaEventRecord(stop, model->stream_compute)); 139 | checkCudaErrors(cudaEventSynchronize(stop)); 140 | checkCudaErrors(cudaEventElapsedTime(&milli, start, stop)); 141 | std::cout << "One forward, backward pass time(ms): " << milli << std::endl; 142 | 143 | loss.push_back(temp_loss); 144 | std::cout << "loss: " << temp_loss << std::endl; 145 | } 146 | int correct_count = 0; 147 | for (int j = 0; j < num_val_batches; j++) { 148 | 149 | int start_sample = j * num_features * batch_size; 150 | int temp_correct_count; 151 | if (model->data_type == CUDNN_DATA_FLOAT) 152 | model->getLoss(&(((float *)X_val)[start_sample]), &y_val[j * batch_size], learning_rate, false, &temp_correct_count, NULL); 153 | else if (model->data_type == CUDNN_DATA_DOUBLE) 154 | model->getLoss(&(((double *)X_val)[start_sample]), &y_val[j * batch_size], learning_rate, false, &temp_correct_count, NULL); 155 | correct_count += temp_correct_count; 156 | } 157 | val_acc.push_back(correct_count); 158 | std::cout << "val_acc: " << val_acc[i] << std::endl; 159 | // learning_rate *= learning_rate_decay; 160 | // std::cout << "learning_rate: " << learning_rate << std::endl; 161 | 
} 162 | learning_rate *= learning_rate_decay; 163 | 164 | } 165 | 166 | void Solver::checkAccuracy(void *X, int *y, int num_samples, int *num_correct) { 167 | int batch_size = model->batch_size; 168 | int num_iter = num_samples / batch_size; 169 | *num_correct = 0; 170 | for (int i = 0; i < num_iter; i++) { 171 | int start_sample = i * num_features * batch_size; 172 | int temp_correct_count; 173 | if (model->data_type == CUDNN_DATA_FLOAT) 174 | model->getLoss(&(((float *)X)[start_sample]), &y[i * batch_size], learning_rate, false, &temp_correct_count, NULL); 175 | else if (model->data_type == CUDNN_DATA_DOUBLE) 176 | model->getLoss(&(((double *)X)[start_sample]), &y[i * batch_size], learning_rate, false, &temp_correct_count, NULL); 177 | *num_correct = *num_correct + temp_correct_count; 178 | } 179 | } 180 | 181 | void Solver::getTrainTime(std::vector &loss, std::vector &time, int num_epoch, 182 | std::vector > &fwd_vdnn_lag, std::vector > &bwd_vdnn_lag) { 183 | int batch_size = model->batch_size; 184 | int num_train_batches = num_train / model->batch_size; 185 | for (int i = 0; i < num_epoch; i++) { 186 | for (int j = 0; j < num_train_batches; j++) { 187 | int start_sample = j * num_features * batch_size; 188 | 189 | checkCudaErrors(cudaEventRecord(start)); 190 | float milli; 191 | 192 | std::vector cur_fwd_vdnn_lag, cur_bwd_vdnn_lag; 193 | float temp_loss = step(start_sample, j * batch_size, cur_fwd_vdnn_lag, cur_bwd_vdnn_lag); 194 | 195 | checkCudaErrors(cudaEventRecord(stop)); 196 | checkCudaErrors(cudaEventSynchronize(stop)); 197 | checkCudaErrors(cudaEventElapsedTime(&milli, start, stop)); 198 | // std::cout << "One forward, backward pass time(ms): " << milli << std::endl; 199 | 200 | fwd_vdnn_lag.push_back(cur_fwd_vdnn_lag); 201 | bwd_vdnn_lag.push_back(cur_bwd_vdnn_lag); 202 | 203 | loss.push_back(temp_loss); 204 | time.push_back(milli); 205 | // std::cout << "loss: " << temp_loss << std::endl; 206 | // for (int i = 0; i < cur_fwd_vdnn_lag.size(); i++) { 207 | // std::cout << "fwd_lag " << i << ":" << cur_fwd_vdnn_lag[i] << std::endl; 208 | // } 209 | // for (int i = 0; i < cur_bwd_vdnn_lag.size(); i++) { 210 | // std::cout << "bwd_lag " << i << ":" << cur_bwd_vdnn_lag[i] << std::endl; 211 | // } 212 | } 213 | } 214 | learning_rate *= learning_rate_decay; 215 | } 216 | 217 | void Solver::getComputationTime(long num_epoch, std::vector > &fwd_computation_time, std::vector > &bwd_computation_time) { 218 | int batch_size = model->batch_size; 219 | int num_train_batches = num_train / model->batch_size; 220 | for (int i = 0; i < num_epoch; i++) { 221 | for (int j = 0; j < num_train_batches; j++) { 222 | int start_sample = j * num_features * batch_size; 223 | 224 | float milli; 225 | 226 | std::vector cur_fwd_computation_time, cur_bwd_computation_time; 227 | stepComputationTime(start_sample, j * batch_size, cur_fwd_computation_time, cur_bwd_computation_time); 228 | 229 | fwd_computation_time.push_back(cur_fwd_computation_time); 230 | bwd_computation_time.push_back(cur_bwd_computation_time); 231 | 232 | } 233 | learning_rate *= learning_rate_decay; 234 | } 235 | } 236 | 237 | void Solver::getTransferTime(long num_epoch, std::vector > &fwd_transfer_time, std::vector > &bwd_transfer_time) { 238 | int batch_size = model->batch_size; 239 | int num_train_batches = num_train / model->batch_size; 240 | for (int i = 0; i < num_epoch; i++) { 241 | for (int j = 0; j < num_train_batches; j++) { 242 | int start_sample = j * num_features * batch_size; 243 | 244 | float milli; 245 | 246 | std::vector 
cur_fwd_transfer_time, cur_bwd_transfer_time; 247 | stepTransferTime(start_sample, j * batch_size, cur_fwd_transfer_time, cur_bwd_transfer_time); 248 | 249 | fwd_transfer_time.push_back(cur_fwd_transfer_time); 250 | bwd_transfer_time.push_back(cur_bwd_transfer_time); 251 | 252 | } 253 | learning_rate *= learning_rate_decay; 254 | } 255 | } 256 | 257 | void Solver::stepComputationTime(int start_X, int start_y, std::vector &fwd_computation_time, std::vector &bwd_computation_time) { 258 | if (model->data_type == CUDNN_DATA_FLOAT) 259 | model->getComputationTime(&(((float *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_computation_time, bwd_computation_time); 260 | else if (model->data_type == CUDNN_DATA_DOUBLE) 261 | model->getComputationTime(&(((double *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_computation_time, bwd_computation_time); 262 | } 263 | 264 | void Solver::stepTransferTime(int start_X, int start_y, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time) { 265 | if (model->data_type == CUDNN_DATA_FLOAT) 266 | model->getTransferTime(&(((float *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_transfer_time, bwd_transfer_time); 267 | else if (model->data_type == CUDNN_DATA_DOUBLE) 268 | model->getTransferTime(&(((double *)X_train)[start_X]), &y_train[start_y], learning_rate, fwd_transfer_time, bwd_transfer_time); 269 | } 270 | -------------------------------------------------------------------------------- /cnmem/include/cnmem.h: -------------------------------------------------------------------------------- 1 | /* ********************************************************************** 2 | * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | * ********************************************************************** */ 28 | #pragma once 29 | 30 | #ifdef __cplusplus 31 | #include "cstdio" 32 | #else 33 | #include "stdio.h" 34 | #endif 35 | #include "cuda_runtime_api.h" 36 | 37 | #if defined(_MSC_VER) || defined(WIN32) 38 | #ifdef CNMEM_DLLEXPORT 39 | #define CNMEM_API __declspec(dllexport) 40 | #else 41 | #define CNMEM_API __declspec(dllimport) 42 | #endif 43 | #else 44 | #ifdef CNMEM_DLLEXPORT 45 | #define CNMEM_API __attribute__((visibility ("default"))) 46 | #else 47 | #define CNMEM_API 48 | #endif 49 | #endif 50 | 51 | #define CNMEM_VERSION 100 // It corresponds to 1.0.0 52 | 53 | #ifdef __cplusplus 54 | extern "C" { 55 | #endif 56 | 57 | /* ********************************************************************************************* */ 58 | 59 | typedef enum 60 | { 61 | CNMEM_STATUS_SUCCESS = 0, 62 | CNMEM_STATUS_CUDA_ERROR, 63 | CNMEM_STATUS_INVALID_ARGUMENT, 64 | CNMEM_STATUS_NOT_INITIALIZED, 65 | CNMEM_STATUS_OUT_OF_MEMORY, 66 | CNMEM_STATUS_UNKNOWN_ERROR 67 | } cnmemStatus_t; 68 | 69 | /* ********************************************************************************************* */ 70 | 71 | typedef enum 72 | { 73 | CNMEM_FLAGS_DEFAULT = 0, /// Default flags. 74 | CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. 75 | CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. 76 | } cnmemManagerFlags_t; 77 | 78 | /* ********************************************************************************************* */ 79 | 80 | typedef struct cnmemDevice_t_ 81 | { 82 | /** The device number. */ 83 | int device; 84 | /** The size to allocate for that device. If 0, the implementation chooses the size. */ 85 | size_t size; 86 | /** The number of named streams associated with the device. The NULL stream is not counted. */ 87 | int numStreams; 88 | /** The streams associated with the device. It can be NULL. The NULL stream is managed. */ 89 | cudaStream_t *streams; 90 | /** The size reserved for each streams. It can be 0. */ 91 | size_t *streamSizes; 92 | 93 | } cnmemDevice_t; 94 | 95 | /** 96 | * \brief Initialize the library and allocate memory on the listed devices. 97 | * 98 | * For each device, an internal memory manager is created and the specified amount of memory is 99 | * allocated (it is the size defined in device[i].size). For each, named stream an additional 100 | * memory manager is created. Currently, it is implemented as a tree of memory managers: A root 101 | * manager for the device and a list of children, one for each named stream. 102 | * 103 | * This function must be called before any other function in the library. It has to be called 104 | * by a single thread since it is not thread-safe. 105 | * 106 | * \return 107 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 108 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 109 | * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, 110 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. 111 | */ 112 | cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags); 113 | 114 | /** 115 | * \brief Release all the allocated memory. 116 | * 117 | * This function must be called by a single thread and after all threads that called 118 | * cnmemMalloc/cnmemFree have joined. This function is not thread-safe. 
119 | * 120 | * \return 121 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 122 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 123 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 124 | */ 125 | cnmemStatus_t CNMEM_API cnmemFinalize(); 126 | 127 | /** 128 | * \brief Increase the internal reference counter of the context object. 129 | * 130 | * This function increases the internal reference counter of the library. The purpose of that 131 | * reference counting mechanism is to give more control to the user over the lifetime of the 132 | * library. It is useful with scoped memory allocation which may be destroyed in a final 133 | * memory collection after the end of main(). That function is thread-safe. 134 | * 135 | * \return 136 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 137 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 138 | */ 139 | cnmemStatus_t CNMEM_API cnmemRetain(); 140 | 141 | /** 142 | * \brief Decrease the internal reference counter of the context object. 143 | * 144 | * This function decreases the internal reference counter of the library. The purpose of that 145 | * reference counting mechanism is to give more control to the user over the lifetime of the 146 | * library. It is useful with scoped memory allocation which may be destroyed in a final 147 | * memory collection after the end of main(). That function is thread-safe. 148 | * 149 | * You can use \c cnmemRelease to explicitly finalize the library. 150 | * 151 | * \return 152 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 153 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 154 | */ 155 | cnmemStatus_t CNMEM_API cnmemRelease(); 156 | 157 | /** 158 | * \brief Add a new stream to the pool of managed streams on a device. 159 | * 160 | * This function registers a new stream into a device memory manager. It is thread-safe. 161 | * 162 | * \return 163 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 164 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 165 | */ 166 | cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream); 167 | 168 | /** 169 | * \brief Allocate memory. 170 | * 171 | * This function allocates memory and initializes a pointer to device memory. If no memory 172 | * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. 173 | * 174 | * The behavior of that function is the following: 175 | * 176 | * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device 177 | * memory. If there's a buffer of size larger or equal to the requested size in the list of 178 | * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow 179 | * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls 180 | * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not 181 | * allowed to grow, the manager attempts to steal memory from one of its children (unless 182 | * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns 183 | * CNMEM_STATUS_OUT_OF_MEMORY. 184 | * 185 | * - If the stream is a named stream, the initial request goes to the memory manager associated 186 | * with that stream. If a free node is available in the lists of that manager, it is returned. 
187 | * Otherwise, the request is passed to the root node and works as if the request were made on 188 | * the NULL stream. 189 | * 190 | * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the 191 | * mechanism to steal memory from the children induces GPU synchronizations (the manager has to 192 | * make sure no kernel uses a given buffer before stealing it) and it the execution is 193 | * sequential (in a multi-threaded context, the code is executed in a critical section inside 194 | * the cnmem library - no need for the user to wrap cnmemMalloc with locks). 195 | * 196 | * \return 197 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 198 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 199 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, 200 | * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, 201 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 202 | */ 203 | cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream); 204 | 205 | /** 206 | * \brief Release memory. 207 | * 208 | * This function releases memory and recycles a memory block in the manager. This function is 209 | * thread safe. 210 | * 211 | * \return 212 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 213 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 214 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, 215 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 216 | */ 217 | cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream); 218 | 219 | /* ********************************************************************************************* */ 220 | /* Utility functions. */ 221 | /* ********************************************************************************************* */ 222 | 223 | /** 224 | * \brief Returns the amount of memory managed by the memory manager associated with a stream. 225 | * 226 | * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple- 227 | * xity linear in the number of allocated blocks so do not call it in performance critical 228 | * sections. 229 | * 230 | * \return 231 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 232 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 233 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 234 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 235 | */ 236 | cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream); 237 | 238 | /** 239 | * \brief Print a list of nodes to a file. 240 | * 241 | * This function is intended to be used in case of complex scenarios to help understand the 242 | * behaviour of the memory managers/application. It is thread safe. 243 | * 244 | * \return 245 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 246 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 247 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 248 | * or free_mem == 0, 249 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 250 | */ 251 | cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream); 252 | 253 | /** 254 | * \brief Converts a cnmemStatus_t value to a string. 
255 | */ 256 | const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status); 257 | 258 | /* ********************************************************************************************* */ 259 | 260 | #ifdef __cplusplus 261 | } // extern "C" 262 | #endif 263 | 264 | -------------------------------------------------------------------------------- /include/cnmem.h: -------------------------------------------------------------------------------- 1 | /* ********************************************************************** 2 | * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | * ********************************************************************** */ 28 | #pragma once 29 | 30 | #ifdef __cplusplus 31 | #include "cstdio" 32 | #else 33 | #include "stdio.h" 34 | #endif 35 | #include "cuda_runtime_api.h" 36 | 37 | #if defined(_MSC_VER) || defined(WIN32) 38 | #ifdef CNMEM_DLLEXPORT 39 | #define CNMEM_API __declspec(dllexport) 40 | #else 41 | #define CNMEM_API __declspec(dllimport) 42 | #endif 43 | #else 44 | #ifdef CNMEM_DLLEXPORT 45 | #define CNMEM_API __attribute__((visibility ("default"))) 46 | #else 47 | #define CNMEM_API 48 | #endif 49 | #endif 50 | 51 | #define CNMEM_VERSION 100 // It corresponds to 1.0.0 52 | 53 | #ifdef __cplusplus 54 | extern "C" { 55 | #endif 56 | 57 | /* ********************************************************************************************* */ 58 | 59 | typedef enum 60 | { 61 | CNMEM_STATUS_SUCCESS = 0, 62 | CNMEM_STATUS_CUDA_ERROR, 63 | CNMEM_STATUS_INVALID_ARGUMENT, 64 | CNMEM_STATUS_NOT_INITIALIZED, 65 | CNMEM_STATUS_OUT_OF_MEMORY, 66 | CNMEM_STATUS_UNKNOWN_ERROR 67 | } cnmemStatus_t; 68 | 69 | /* ********************************************************************************************* */ 70 | 71 | typedef enum 72 | { 73 | CNMEM_FLAGS_DEFAULT = 0, /// Default flags. 74 | CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. 
75 | CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory. 76 | } cnmemManagerFlags_t; 77 | 78 | /* ********************************************************************************************* */ 79 | 80 | typedef struct cnmemDevice_t_ 81 | { 82 | /** The device number. */ 83 | int device; 84 | /** The size to allocate for that device. If 0, the implementation chooses the size. */ 85 | size_t size; 86 | /** The number of named streams associated with the device. The NULL stream is not counted. */ 87 | int numStreams; 88 | /** The streams associated with the device. It can be NULL. The NULL stream is managed. */ 89 | cudaStream_t *streams; 90 | /** The size reserved for each streams. It can be 0. */ 91 | size_t *streamSizes; 92 | 93 | } cnmemDevice_t; 94 | 95 | /** 96 | * \brief Initialize the library and allocate memory on the listed devices. 97 | * 98 | * For each device, an internal memory manager is created and the specified amount of memory is 99 | * allocated (it is the size defined in device[i].size). For each, named stream an additional 100 | * memory manager is created. Currently, it is implemented as a tree of memory managers: A root 101 | * manager for the device and a list of children, one for each named stream. 102 | * 103 | * This function must be called before any other function in the library. It has to be called 104 | * by a single thread since it is not thread-safe. 105 | * 106 | * \return 107 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 108 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 109 | * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory, 110 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function. 111 | */ 112 | cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags); 113 | 114 | /** 115 | * \brief Release all the allocated memory. 116 | * 117 | * This function must be called by a single thread and after all threads that called 118 | * cnmemMalloc/cnmemFree have joined. This function is not thread-safe. 119 | * 120 | * \return 121 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 122 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 123 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 124 | */ 125 | cnmemStatus_t CNMEM_API cnmemFinalize(); 126 | 127 | /** 128 | * \brief Increase the internal reference counter of the context object. 129 | * 130 | * This function increases the internal reference counter of the library. The purpose of that 131 | * reference counting mechanism is to give more control to the user over the lifetime of the 132 | * library. It is useful with scoped memory allocation which may be destroyed in a final 133 | * memory collection after the end of main(). That function is thread-safe. 134 | * 135 | * \return 136 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 137 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 138 | */ 139 | cnmemStatus_t CNMEM_API cnmemRetain(); 140 | 141 | /** 142 | * \brief Decrease the internal reference counter of the context object. 143 | * 144 | * This function decreases the internal reference counter of the library. The purpose of that 145 | * reference counting mechanism is to give more control to the user over the lifetime of the 146 | * library. It is useful with scoped memory allocation which may be destroyed in a final 147 | * memory collection after the end of main(). 
That function is thread-safe. 148 | * 149 | * You can use \c cnmemRelease to explicitly finalize the library. 150 | * 151 | * \return 152 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 153 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 154 | */ 155 | cnmemStatus_t CNMEM_API cnmemRelease(); 156 | 157 | /** 158 | * \brief Add a new stream to the pool of managed streams on a device. 159 | * 160 | * This function registers a new stream into a device memory manager. It is thread-safe. 161 | * 162 | * \return 163 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 164 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 165 | */ 166 | cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream); 167 | 168 | /** 169 | * \brief Allocate memory. 170 | * 171 | * This function allocates memory and initializes a pointer to device memory. If no memory 172 | * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe. 173 | * 174 | * The behavior of that function is the following: 175 | * 176 | * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device 177 | * memory. If there's a buffer of size larger or equal to the requested size in the list of 178 | * free blocks, it is returned. If there's no such buffer but the manager is allowed to grow 179 | * its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls 180 | * cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not 181 | * allowed to grow, the manager attempts to steal memory from one of its children (unless 182 | * CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns 183 | * CNMEM_STATUS_OUT_OF_MEMORY. 184 | * 185 | * - If the stream is a named stream, the initial request goes to the memory manager associated 186 | * with that stream. If a free node is available in the lists of that manager, it is returned. 187 | * Otherwise, the request is passed to the root node and works as if the request were made on 188 | * the NULL stream. 189 | * 190 | * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the 191 | * mechanism to steal memory from the children induces GPU synchronizations (the manager has to 192 | * make sure no kernel uses a given buffer before stealing it) and it the execution is 193 | * sequential (in a multi-threaded context, the code is executed in a critical section inside 194 | * the cnmem library - no need for the user to wrap cnmemMalloc with locks). 195 | * 196 | * \return 197 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 198 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 199 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0, 200 | * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available, 201 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 202 | */ 203 | cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream); 204 | 205 | /** 206 | * \brief Release memory. 207 | * 208 | * This function releases memory and recycles a memory block in the manager. This function is 209 | * thread safe. 210 | * 211 | * \return 212 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 213 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 214 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. 
For example, ptr == 0, 215 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 216 | */ 217 | cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream); 218 | 219 | /* ********************************************************************************************* */ 220 | /* Utility functions. */ 221 | /* ********************************************************************************************* */ 222 | 223 | /** 224 | * \brief Returns the amount of memory managed by the memory manager associated with a stream. 225 | * 226 | * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple- 227 | * xity linear in the number of allocated blocks so do not call it in performance critical 228 | * sections. 229 | * 230 | * \return 231 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 232 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 233 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid, 234 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 235 | */ 236 | cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream); 237 | 238 | /** 239 | * \brief Print a list of nodes to a file. 240 | * 241 | * This function is intended to be used in case of complex scenarios to help understand the 242 | * behaviour of the memory managers/application. It is thread safe. 243 | * 244 | * \return 245 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 246 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 247 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 248 | * or free_mem == 0, 249 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 250 | */ 251 | cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream); 252 | 253 | /** 254 | * \brief Print a list of nodes to a file with used and free blocks together in ascending order of address. 255 | * 256 | * This function is intended to be used in case of complex scenarios to help understand the 257 | * behaviour of the memory managers/application. It is thread safe. 258 | * 259 | * \return 260 | * CNMEM_STATUS_SUCCESS, if everything goes fine, 261 | * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called, 262 | * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0 263 | * or free_mem == 0, 264 | * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions. 265 | */ 266 | cnmemStatus_t CNMEM_API cnmemPrintMemoryStateTogether(FILE *file, cudaStream_t stream); 267 | 268 | /** 269 | * \brief Converts a cnmemStatus_t value to a string. 
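 *
 * Illustrative error-reporting use (not from the original source; "ptr" and "bytes" are
 * assumed to be defined by the caller):
 *
 * \code
 * cnmemStatus_t status = cnmemMalloc(&ptr, bytes, NULL);
 * if (status != CNMEM_STATUS_SUCCESS)
 *     fprintf(stderr, "cnmem error: %s\n", cnmemGetErrorString(status));
 * \endcode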
270 | */ 271 | const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status); 272 | 273 | /** 274 | * \brief Gets the size of last block, according to address if free, otherwise gives 0 275 | * 276 | * Always returns CNMEM_STATUS_SUCCESS 277 | */ 278 | cnmemStatus_t CNMEM_API cnmemGetLastFreeBlockSize(std::size_t &size, cudaStream_t stream); 279 | 280 | /** 281 | * \brief Gets the size of largest free block, if exists, otherwise gives 0 282 | * 283 | * Always returns CNMEM_STATUS_SUCCESS 284 | */ 285 | cnmemStatus_t CNMEM_API cnmemGetLargestFreeBlockSize(std::size_t &size, cudaStream_t stream); 286 | 287 | /* ********************************************************************************************* */ 288 | 289 | #ifdef __cplusplus 290 | } // extern "C" 291 | #endif 292 | 293 | -------------------------------------------------------------------------------- /src/slow_conv.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 1000, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) 
&n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | 111 | int main(int argc, char *argv[]) { 112 | 113 | // int num_train = 100 * batch_size, num_val = batch_size; 114 | // void *X_train = malloc(num_train * input_channels * sizeof(float)); 115 | // int *y_train = (int *)malloc(num_train * sizeof(int)); 116 | // void *X_val = malloc(num_val * input_channels * sizeof(float)); 117 | // int *y_val = (int *)malloc(num_val * sizeof(int)); 118 | // for (int i = 0; i < num_train; i++) { 119 | // for (int j = 0; j < input_channels; j++) 120 | // ((float *)X_train)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 121 | // y_train[i] = 0; 122 | // } 123 | 124 | // for (int i = 0; i < num_val; i++) { 125 | // for (int j = 0; j < input_channels; j++) 126 | // ((float *)X_val)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 127 | // y_val[i] = rand() % 2; 128 | // } 129 | 130 | // int rows = 28, cols = 28, channels = 1; 131 | // vector > train_images, test_images; 132 | // vector train_labels, test_labels; 133 | // readMNIST(train_images, test_images, train_labels, test_labels); 134 | // float *f_train_images, *f_train_labels, *f_test_images, *f_test_labels; 135 | float *f_train_images, *f_test_images; 136 | int *f_train_labels, *f_test_labels; 137 | int rows = 156, cols = 156, channels = 3; 138 | int input_size = rows * cols * channels; 139 | f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 140 | f_train_labels = (int *)malloc(num_train * sizeof(int)); 141 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 142 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 143 | 144 | float *mean_image; 145 | mean_image = (float *)malloc(input_size * sizeof(float)); 146 | 147 | for (int i = 0; i < input_size; i++) { 148 | mean_image[i] = 0; 149 | for (int k = 0; k < num_train; k++) { 150 | mean_image[i] += f_train_images[k * input_size + i]; 151 | } 152 | mean_image[i] /= num_train; 153 | } 154 | 155 | 156 | for (int i = 0; i < num_train; i++) { 157 | for (int j = 0; j < input_size; j++) { 158 | f_train_images[i * input_size + j] -= mean_image[j]; 159 | } 160 | } 161 | 162 | for (int i = 0; i < num_test; i++) { 163 | for (int j = 0; j < input_size; j++) { 164 | f_test_images[i * input_size + j] -= mean_image[j]; 165 | } 166 | 167 | } 168 | 169 | // int input_channels = rows * cols * channels * 3, hidden_channels1 = 50, hidden_channels2 = 100, output_channels = 10; 170 | // vector layer_specifier; 171 | // ConvDescriptor layer0; 172 | // LayerSpecifier temp; 173 | // layer0.initializeValues(1, 3, 3, 3, rows, cols, 1, 1, 1, 1); 174 | // temp.initPointer(CONV); 175 | // *((ConvDescriptor *)temp.params) = layer0; 176 | // layer_specifier.push_back(temp); 177 | // ActivationDescriptor layer0_actv; 178 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 179 | // temp.initPointer(ACTV); 180 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 181 | // layer_specifier.push_back(temp); 182 | 183 | // BatchNormDescriptor layer0_bn; 184 | 185 | // for (int i = 0; i < 200; i++) { 186 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 
0.1, 3, rows, cols); 187 | // temp.initPointer(BATCHNORM); 188 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 189 | // layer_specifier.push_back(temp); 190 | 191 | // layer0.initializeValues(3, 3, 3, 3, rows, cols, 1, 1, 1, 1); 192 | // temp.initPointer(CONV); 193 | // *((ConvDescriptor *)temp.params) = layer0; 194 | // layer_specifier.push_back(temp); 195 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 196 | // temp.initPointer(ACTV); 197 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 198 | // layer_specifier.push_back(temp); 199 | // } 200 | 201 | // PoolingDescriptor layer0_pool; 202 | // layer0_pool.initializeValues(3, 2, 2, rows, cols, 0, 0, 2, 2, POOLING_MAX); 203 | // temp.initPointer(POOLING); 204 | // *((PoolingDescriptor *)temp.params) = layer0_pool; 205 | // layer_specifier.push_back(temp); 206 | 207 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 208 | // temp.initPointer(BATCHNORM); 209 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 210 | // layer_specifier.push_back(temp); 211 | 212 | // // DropoutDescriptor layer0_dropout; 213 | // // layer0_dropout.initializeValues(0.2, 3, rows / 2, cols / 2); 214 | // // temp.initPointer(DROPOUT); 215 | // // *((DropoutDescriptor *)temp.params) = layer0_dropout; 216 | // // layer_specifier.push_back(temp); 217 | 218 | // layer0.initializeValues(3, 3, 3, 3, rows / 2, cols / 2, 1, 1, 1, 1); 219 | // temp.initPointer(CONV); 220 | // *((ConvDescriptor *)temp.params) = layer0; 221 | // layer_specifier.push_back(temp); 222 | // layer0_actv.initializeValues(RELU, 3, rows / 2, cols / 2); 223 | // temp.initPointer(ACTV); 224 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 225 | // layer_specifier.push_back(temp); 226 | 227 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 228 | // temp.initPointer(BATCHNORM); 229 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 230 | // layer_specifier.push_back(temp); 231 | 232 | // FCDescriptor layer1; 233 | // layer1.initializeValues(input_channels, hidden_channels1); 234 | // temp.initPointer(FULLY_CONNECTED); 235 | // *((FCDescriptor *)(temp.params)) = layer1; 236 | // layer_specifier.push_back(temp); 237 | 238 | // temp.initPointer(ACTV); 239 | // ActivationDescriptor layer1_actv; 240 | // layer1_actv.initializeValues(RELU, hidden_channels1, 1, 1); 241 | // *((ActivationDescriptor *)temp.params) = layer1_actv; 242 | // layer_specifier.push_back(temp); 243 | 244 | // layer0_bn.initializeValues(BATCHNORM_PER_ACTIVATION, 1e-5, 0.1, hidden_channels1, 1, 1); 245 | // temp.initPointer(BATCHNORM); 246 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 247 | // layer_specifier.push_back(temp); 248 | 249 | // temp.initPointer(FULLY_CONNECTED); 250 | // FCDescriptor layer2; 251 | // layer2.initializeValues(hidden_channels1, output_channels); 252 | // *((FCDescriptor *)temp.params) = layer2; 253 | // layer_specifier.push_back(temp); 254 | 255 | // // temp.initPointer(FULLY_CONNECTED); 256 | // // FCDescriptor layer3; 257 | // // layer3.initializeValues(hidden_channels2, output_channels); 258 | // // *((FCDescriptor *)temp.params) = layer3; 259 | // // layer_specifier.push_back(temp); 260 | 261 | // temp.initPointer(SOFTMAX); 262 | // SoftmaxDescriptor smax; 263 | // smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, output_channels, 1, 1); 264 | // *((SoftmaxDescriptor *)(temp.params)) = smax; 265 | // layer_specifier.push_back(temp); 266 | 267 | // AlexNet 268 | vector 
layer_specifier; 269 | { 270 | ConvDescriptor layer0; 271 | layer0.initializeValues(3, 32, 1, 1, 156, 156, 0, 0, 1, 1, RELU); 272 | LayerSpecifier temp; 273 | temp.initPointer(CONV); 274 | *((ConvDescriptor *)temp.params) = layer0; 275 | layer_specifier.push_back(temp); 276 | } 277 | { 278 | ConvDescriptor layer0; 279 | layer0.initializeValues(32, 32, 1, 1, 156, 156, 0, 0, 1, 1, RELU); 280 | LayerSpecifier temp; 281 | temp.initPointer(CONV); 282 | *((ConvDescriptor *)temp.params) = layer0; 283 | layer_specifier.push_back(temp); 284 | } 285 | { 286 | ConvDescriptor layer0; 287 | layer0.initializeValues(32, 3, 1, 1, 156, 156, 0, 0, 1, 1, RELU); 288 | LayerSpecifier temp; 289 | temp.initPointer(CONV); 290 | *((ConvDescriptor *)temp.params) = layer0; 291 | layer_specifier.push_back(temp); 292 | } 293 | { 294 | FCDescriptor layer8; 295 | layer8.initializeValues(156 * 156 * 3, 10); 296 | LayerSpecifier temp; 297 | temp.initPointer(FULLY_CONNECTED); 298 | *((FCDescriptor *)temp.params) = layer8; 299 | layer_specifier.push_back(temp); 300 | } 301 | { 302 | SoftmaxDescriptor layer11; 303 | layer11.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 10, 1, 1); 304 | LayerSpecifier temp; 305 | temp.initPointer(SOFTMAX); 306 | *((SoftmaxDescriptor *)temp.params) = layer11; 307 | layer_specifier.push_back(temp); 308 | } 309 | 310 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 311 | vDNNType vdnn_type = vDNN_DYN; 312 | string filename("vdnn_dyn"); 313 | if (argc == 3) { 314 | filename.assign("vdnn"); 315 | // argv[1] - layers to offload, argv[2] - conv algo to use 316 | if (strcmp(argv[1], "dyn") == 0) { 317 | vdnn_type = vDNN_DYN; 318 | filename.append("_dyn"); 319 | } 320 | else if (strcmp(argv[1], "conv") == 0) { 321 | vdnn_type = vDNN_CONV; 322 | filename.append("_conv"); 323 | } 324 | else if (strcmp(argv[1], "all") == 0) { 325 | vdnn_type = vDNN_ALL; 326 | filename.append("_all"); 327 | } 328 | else { 329 | printf("invalid argument.. using vdnn dynamic\n"); 330 | filename.assign("vdnn_dyn"); 331 | } 332 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0)) { 333 | if (strcmp(argv[2], "p") == 0) { 334 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 335 | filename.append("_p"); 336 | } 337 | else if (strcmp(argv[2], "m") == 0) { 338 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 339 | filename.append("_m"); 340 | } 341 | else { 342 | printf("invalid argument.. 
using vdnn dynamic\n"); 343 | filename.assign("vdnn_dyn"); 344 | } 345 | } 346 | } 347 | 348 | int batch_size = 128; 349 | long long dropout_seed = 1; 350 | float softmax_eps = 1e-8; 351 | float init_std_dev = 0.1; 352 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 353 | 354 | int num_epoch = 1000; 355 | double learning_rate = 1e-3; 356 | double learning_rate_decay = 0.9; 357 | 358 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 359 | vector loss; 360 | vector time; 361 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 362 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 363 | printTimes(time, filename); 364 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 365 | 366 | } 367 | 368 | void printTimes(vector &time, string filename) { 369 | float mean_time = 0.0; 370 | float std_dev = 0.0; 371 | int N = time.size(); 372 | for (int i = 0; i < N; i++) { 373 | mean_time += time[i]; 374 | } 375 | mean_time /= N; 376 | for (int i = 0; i < N; i++) { 377 | std_dev += pow(time[i] - mean_time, 2); 378 | } 379 | std_dev /= N; 380 | pow(std_dev, 0.5); 381 | cout << "Average time: " << mean_time << endl; 382 | cout << "Standard deviation: " << std_dev << endl; 383 | 384 | filename.append(".dat"); 385 | fstream f; 386 | f.open(filename.c_str(), ios_base::out); 387 | 388 | for (int i = 0; i < N; i++) { 389 | f << time[i] << endl; 390 | } 391 | f << "mean_time: " << mean_time << endl; 392 | f << "standard_deviation: " << std_dev << endl; 393 | f.close(); 394 | 395 | } 396 | 397 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 398 | filename.append("_lag.dat"); 399 | 400 | fstream f; 401 | f.open(filename.c_str(), ios_base::out); 402 | 403 | int N = fwd_vdnn_lag.size(); 404 | for (int i = 0; i < N; i++) { 405 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 406 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 407 | } 408 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 409 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 410 | } 411 | f << endl; 412 | } 413 | f.close(); 414 | } -------------------------------------------------------------------------------- /src/vgg_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 128, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = 
filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) &n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename); 111 | 112 | int main(int argc, char *argv[]) { 113 | 114 | 115 | // Allocate space in memory and read images 116 | float *f_train_images, *f_test_images; 117 | int *f_train_labels, *f_test_labels; 118 | int rows = 224, cols = 224, channels = 3; 119 | int input_size = rows * cols * channels; 120 | checkCudaErrors(cudaMallocHost(&f_train_images, num_train * input_size * sizeof(float))); 121 | checkCudaErrors(cudaMallocHost(&f_train_labels, num_train * sizeof(int))); 122 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 123 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 124 | 125 | // read images here 126 | // ... 127 | // ... 128 | // ... 
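	// The image loading above is intentionally left elided in this repository. For a pure
	// timing run, one possible (illustrative, not original) way to populate the pinned
	// buffers is with random values, mirroring the commented-out generator in main.cu:
	//
	//   srand(42);
	//   for (int i = 0; i < num_train * input_size; i++)
	//       f_train_images[i] = (rand() % 1000) * 1.0f / 1000;   // pixel values in [0, 1)
	//   for (int i = 0; i < num_train; i++)
	//       f_train_labels[i] = rand() % 1000;                    // 1000 ImageNet-style classes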
129 | 130 | float *mean_image; 131 | mean_image = (float *)malloc(input_size * sizeof(float)); 132 | 133 | for (int i = 0; i < input_size; i++) { 134 | mean_image[i] = 0; 135 | for (int k = 0; k < num_train; k++) { 136 | mean_image[i] += f_train_images[k * input_size + i]; 137 | } 138 | mean_image[i] /= num_train; 139 | } 140 | 141 | 142 | for (int i = 0; i < num_train; i++) { 143 | for (int j = 0; j < input_size; j++) { 144 | f_train_images[i * input_size + j] -= mean_image[j]; 145 | } 146 | } 147 | 148 | for (int i = 0; i < num_test; i++) { 149 | for (int j = 0; j < input_size; j++) { 150 | f_test_images[i * input_size + j] -= mean_image[j]; 151 | } 152 | 153 | } 154 | 155 | 156 | // VGG specification 157 | // Look at user_iface.h for function declaration to initialize values 158 | vector layer_specifier; 159 | { 160 | ConvDescriptor part0_conv0; 161 | part0_conv0.initializeValues(3, 64, 3, 3, 224, 224, 1, 1, 1, 1, RELU); 162 | LayerSpecifier temp; 163 | temp.initPointer(CONV); 164 | *((ConvDescriptor *)temp.params) = part0_conv0; 165 | layer_specifier.push_back(temp); 166 | } 167 | { 168 | ConvDescriptor part0_conv1; 169 | part0_conv1.initializeValues(64, 64, 3, 3, 224, 224, 1, 1, 1, 1, RELU); 170 | LayerSpecifier temp; 171 | temp.initPointer(CONV); 172 | *((ConvDescriptor *)temp.params) = part0_conv1; 173 | layer_specifier.push_back(temp); 174 | } 175 | { 176 | PoolingDescriptor pool0; 177 | pool0.initializeValues(64, 2, 2, 224, 224, 0, 0, 2, 2, POOLING_MAX); 178 | LayerSpecifier temp; 179 | temp.initPointer(POOLING); 180 | *((PoolingDescriptor *)temp.params) = pool0; 181 | layer_specifier.push_back(temp); 182 | } 183 | { 184 | ConvDescriptor part1_conv0; 185 | part1_conv0.initializeValues(64, 128, 3, 3, 112, 112, 1, 1, 1, 1, RELU); 186 | LayerSpecifier temp; 187 | temp.initPointer(CONV); 188 | *((ConvDescriptor *)temp.params) = part1_conv0; 189 | layer_specifier.push_back(temp); 190 | } 191 | { 192 | ConvDescriptor part1_conv1; 193 | part1_conv1.initializeValues(128, 128, 3, 3, 112, 112, 1, 1, 1, 1, RELU); 194 | LayerSpecifier temp; 195 | temp.initPointer(CONV); 196 | *((ConvDescriptor *)temp.params) = part1_conv1; 197 | layer_specifier.push_back(temp); 198 | } 199 | { 200 | PoolingDescriptor pool1; 201 | pool1.initializeValues(128, 2, 2, 112, 112, 0, 0, 2, 2, POOLING_MAX); 202 | LayerSpecifier temp; 203 | temp.initPointer(POOLING); 204 | *((PoolingDescriptor *)temp.params) = pool1; 205 | layer_specifier.push_back(temp); 206 | } 207 | { 208 | ConvDescriptor part2_conv0; 209 | part2_conv0.initializeValues(128, 256, 3, 3, 56, 56, 1, 1, 1, 1, RELU); 210 | LayerSpecifier temp; 211 | temp.initPointer(CONV); 212 | *((ConvDescriptor *)temp.params) = part2_conv0; 213 | layer_specifier.push_back(temp); 214 | } 215 | { 216 | ConvDescriptor part2_conv1; 217 | part2_conv1.initializeValues(256, 256, 3, 3, 56, 56, 1, 1, 1, 1, RELU); 218 | LayerSpecifier temp; 219 | temp.initPointer(CONV); 220 | *((ConvDescriptor *)temp.params) = part2_conv1; 221 | layer_specifier.push_back(temp); 222 | } 223 | { 224 | ConvDescriptor part2_conv2; 225 | part2_conv2.initializeValues(256, 256, 3, 3, 56, 56, 1, 1, 1, 1, RELU); 226 | LayerSpecifier temp; 227 | temp.initPointer(CONV); 228 | *((ConvDescriptor *)temp.params) = part2_conv2; 229 | layer_specifier.push_back(temp); 230 | } 231 | { 232 | PoolingDescriptor pool2; 233 | pool2.initializeValues(256, 2, 2, 56, 56, 0, 0, 2, 2, POOLING_MAX); 234 | LayerSpecifier temp; 235 | temp.initPointer(POOLING); 236 | *((PoolingDescriptor *)temp.params) = pool2; 237 | 
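	// Spatial bookkeeping (each 2x2, stride-2 max pool halves H and W):
	// 224 -> 112 -> 56 -> 28 -> 14 -> 7 across the five pooling stages, which is why the
	// first fully-connected layer further below takes 7 * 7 * 512 = 25088 inputs.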
layer_specifier.push_back(temp); 238 | } 239 | { 240 | ConvDescriptor part3_conv0; 241 | part3_conv0.initializeValues(256, 512, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 242 | LayerSpecifier temp; 243 | temp.initPointer(CONV); 244 | *((ConvDescriptor *)temp.params) = part3_conv0; 245 | layer_specifier.push_back(temp); 246 | } 247 | { 248 | ConvDescriptor part3_conv1; 249 | part3_conv1.initializeValues(512, 512, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 250 | LayerSpecifier temp; 251 | temp.initPointer(CONV); 252 | *((ConvDescriptor *)temp.params) = part3_conv1; 253 | layer_specifier.push_back(temp); 254 | } 255 | { 256 | ConvDescriptor part3_conv2; 257 | part3_conv2.initializeValues(512, 512, 3, 3, 28, 28, 1, 1, 1, 1, RELU); 258 | LayerSpecifier temp; 259 | temp.initPointer(CONV); 260 | *((ConvDescriptor *)temp.params) = part3_conv2; 261 | layer_specifier.push_back(temp); 262 | } 263 | { 264 | PoolingDescriptor pool3; 265 | pool3.initializeValues(512, 2, 2, 28, 28, 0, 0, 2, 2, POOLING_MAX); 266 | LayerSpecifier temp; 267 | temp.initPointer(POOLING); 268 | *((PoolingDescriptor *)temp.params) = pool3; 269 | layer_specifier.push_back(temp); 270 | } 271 | { 272 | ConvDescriptor part4_conv0; 273 | part4_conv0.initializeValues(512, 512, 3, 3, 14, 14, 1, 1, 1, 1, RELU); 274 | LayerSpecifier temp; 275 | temp.initPointer(CONV); 276 | *((ConvDescriptor *)temp.params) = part4_conv0; 277 | layer_specifier.push_back(temp); 278 | } 279 | { 280 | ConvDescriptor part4_conv1; 281 | part4_conv1.initializeValues(512, 512, 3, 3, 14, 14, 1, 1, 1, 1, RELU); 282 | LayerSpecifier temp; 283 | temp.initPointer(CONV); 284 | *((ConvDescriptor *)temp.params) = part4_conv1; 285 | layer_specifier.push_back(temp); 286 | } 287 | { 288 | ConvDescriptor part4_conv2; 289 | part4_conv2.initializeValues(512, 512, 3, 3, 14, 14, 1, 1, 1, 1, RELU); 290 | LayerSpecifier temp; 291 | temp.initPointer(CONV); 292 | *((ConvDescriptor *)temp.params) = part4_conv2; 293 | layer_specifier.push_back(temp); 294 | } 295 | { 296 | PoolingDescriptor pool3; 297 | pool3.initializeValues(512, 2, 2, 14, 14, 0, 0, 2, 2, POOLING_MAX); 298 | LayerSpecifier temp; 299 | temp.initPointer(POOLING); 300 | *((PoolingDescriptor *)temp.params) = pool3; 301 | layer_specifier.push_back(temp); 302 | } 303 | 304 | { 305 | FCDescriptor part5_fc0; 306 | part5_fc0.initializeValues(7 * 7 * 512, 4096, RELU); 307 | LayerSpecifier temp; 308 | temp.initPointer(FULLY_CONNECTED); 309 | *((FCDescriptor *)temp.params) = part5_fc0; 310 | layer_specifier.push_back(temp); 311 | } 312 | { 313 | FCDescriptor part5_fc1; 314 | part5_fc1.initializeValues(4096, 4096, RELU); 315 | LayerSpecifier temp; 316 | temp.initPointer(FULLY_CONNECTED); 317 | *((FCDescriptor *)temp.params) = part5_fc1; 318 | layer_specifier.push_back(temp); 319 | } 320 | { 321 | FCDescriptor part5_fc2; 322 | part5_fc2.initializeValues(4096, 1000); 323 | LayerSpecifier temp; 324 | temp.initPointer(FULLY_CONNECTED); 325 | *((FCDescriptor *)temp.params) = part5_fc2; 326 | layer_specifier.push_back(temp); 327 | } 328 | { 329 | SoftmaxDescriptor s_max; 330 | s_max.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 1000, 1, 1); 331 | LayerSpecifier temp; 332 | temp.initPointer(SOFTMAX); 333 | *((SoftmaxDescriptor *)temp.params) = s_max; 334 | layer_specifier.push_back(temp); 335 | } 336 | 337 | 338 | // reading command line input 339 | // argv[1] - vDNN scheme - dyn, all, conv, alternate_conv, argv[2] - performance_optimal or memory_optimal 340 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 341 | vDNNType vdnn_type = 
vDNN_DYN; 342 | string filename("vdnn_dyn"); 343 | if (argc == 3) { 344 | filename.assign("vdnn"); 345 | if (strcmp(argv[1], "dyn") == 0) { 346 | vdnn_type = vDNN_DYN; 347 | filename.append("_dyn"); 348 | } 349 | else if (strcmp(argv[1], "conv") == 0) { 350 | vdnn_type = vDNN_CONV; 351 | filename.append("_conv"); 352 | } 353 | else if (strcmp(argv[1], "all") == 0) { 354 | vdnn_type = vDNN_ALL; 355 | filename.append("_all"); 356 | } 357 | else if (strcmp(argv[1], "alternate_conv") == 0) { 358 | vdnn_type = vDNN_ALTERNATE_CONV; 359 | filename.append("_alternate_conv"); 360 | } 361 | else { 362 | printf("invalid argument.. using vdnn dynamic\n"); 363 | filename.assign("vdnn_dyn"); 364 | } 365 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0 or strcmp(argv[1], "alternate_conv") == 0)) { 366 | if (strcmp(argv[2], "p") == 0) { 367 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 368 | filename.append("_p"); 369 | } 370 | else if (strcmp(argv[2], "m") == 0) { 371 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 372 | filename.append("_m"); 373 | } 374 | else { 375 | printf("invalid argument.. using vdnn dynamic\n"); 376 | filename.assign("vdnn_dyn"); 377 | } 378 | } 379 | } 380 | 381 | 382 | int batch_size = 64; 383 | long long dropout_seed = 1; 384 | float softmax_eps = 1e-8; 385 | float init_std_dev = 0.1; 386 | // instantiating network object 387 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 388 | 389 | int num_epoch = 1000; 390 | double learning_rate = 1e-3; 391 | double learning_rate_decay = 0.9; 392 | 393 | // solver, which takes a network object and runs SGD on it 394 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 395 | vector loss; 396 | vector time; 397 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 398 | // trains for given number of steps (here 100). 
and gets computation/transfer times of each layer for each iteration 399 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 400 | printTimes(time, filename); 401 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 402 | 403 | vector > fwd_computation_time, bwd_computation_time; 404 | solver.getComputationTime(1, fwd_computation_time, bwd_computation_time); 405 | 406 | vector > fwd_transfer_time, bwd_transfer_time; 407 | solver.getTransferTime(1, fwd_transfer_time, bwd_transfer_time); 408 | 409 | printComputationTransferTimes(fwd_computation_time, bwd_computation_time, true, filename); 410 | printComputationTransferTimes(fwd_transfer_time, bwd_transfer_time, false, filename); 411 | 412 | } 413 | 414 | void printTimes(vector &time, string filename) { 415 | float mean_time = 0.0; 416 | float std_dev = 0.0; 417 | int N = time.size(); 418 | for (int i = 0; i < N; i++) { 419 | mean_time += time[i]; 420 | } 421 | mean_time /= N; 422 | for (int i = 0; i < N; i++) { 423 | std_dev += pow(time[i] - mean_time, 2); 424 | } 425 | std_dev /= N; 426 | std_dev = pow(std_dev, 0.5); 427 | cout << "Average time: " << mean_time << endl; 428 | cout << "Standard deviation: " << std_dev << endl; 429 | 430 | filename.append(".dat"); 431 | fstream f; 432 | f.open(filename.c_str(), ios_base::out); 433 | 434 | for (int i = 0; i < N; i++) { 435 | f << time[i] << endl; 436 | } 437 | f << "mean_time: " << mean_time << endl; 438 | f << "standard_deviation: " << std_dev << endl; 439 | f.close(); 440 | 441 | filename.append(".bin"); 442 | fstream f_bin; 443 | f_bin.open(filename.c_str(), ios_base::out); 444 | f_bin.write((char *)&N, sizeof(N)); 445 | for (int i = 0; i < N; i++) { 446 | f_bin.write((char *)&time[i], sizeof(time[i])); 447 | } 448 | f_bin.close(); 449 | 450 | } 451 | 452 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 453 | filename.append("_lag.dat"); 454 | 455 | fstream f; 456 | f.open(filename.c_str(), ios_base::out); 457 | 458 | int N = fwd_vdnn_lag.size(); 459 | for (int i = 0; i < N; i++) { 460 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 461 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 462 | } 463 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 464 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 465 | } 466 | f << endl; 467 | } 468 | f.close(); 469 | } 470 | 471 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename) { 472 | if (computation) 473 | filename.append("_compute_time.dat"); 474 | else 475 | filename.append("_transfer_time.dat"); 476 | 477 | fstream f; 478 | f.open(filename.c_str(), ios_base::out); 479 | 480 | int N = fwd_times.size(); 481 | for (int i = 0; i < N; i++) { 482 | for (int j = 0; j < fwd_times[i].size(); j++) { 483 | f << "fwd" << j << ": " << fwd_times[i][j] << endl; 484 | } 485 | for (int j = 0; j < bwd_times[i].size(); j++) { 486 | f << "bwd" << j << ": " << bwd_times[i][j] << endl; 487 | } 488 | f << endl; 489 | } 490 | f.close(); 491 | } -------------------------------------------------------------------------------- /src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 1000, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; 
i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) &n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | 111 | int main(int argc, char *argv[]) { 112 | 113 | // int num_train = 100 * batch_size, num_val = batch_size; 114 | // void *X_train = malloc(num_train * input_channels * sizeof(float)); 115 | // int *y_train = (int *)malloc(num_train * sizeof(int)); 116 | // void *X_val = malloc(num_val * input_channels * sizeof(float)); 117 | // int *y_val = (int *)malloc(num_val * sizeof(int)); 118 | // for (int i = 0; i < num_train; i++) { 119 | // for (int j = 0; j < input_channels; j++) 120 | // ((float *)X_train)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 121 | // y_train[i] = 0; 122 | // } 123 | 124 | // for (int i = 0; i < num_val; i++) { 125 | // for (int j = 0; j < input_channels; j++) 126 | // ((float *)X_val)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 127 | // y_val[i] = rand() % 2; 128 | // } 129 | 130 | 
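	// Note: the commented-out block above generates random inputs and the block below
	// would read 28x28 MNIST; the active path further down instead allocates 227x227x3
	// buffers sized for AlexNet and, as written, never fills them with data, so the run
	// measures time rather than accuracy.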
// int rows = 28, cols = 28, channels = 1; 131 | // vector > train_images, test_images; 132 | // vector train_labels, test_labels; 133 | // readMNIST(train_images, test_images, train_labels, test_labels); 134 | // float *f_train_images, *f_train_labels, *f_test_images, *f_test_labels; 135 | float *f_train_images, *f_test_images; 136 | int *f_train_labels, *f_test_labels; 137 | int rows = 227, cols = 227, channels = 3; 138 | int input_size = rows * cols * channels; 139 | f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 140 | f_train_labels = (int *)malloc(num_train * sizeof(int)); 141 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 142 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 143 | 144 | float *mean_image; 145 | mean_image = (float *)malloc(input_size * sizeof(float)); 146 | 147 | for (int i = 0; i < input_size; i++) { 148 | mean_image[i] = 0; 149 | for (int k = 0; k < num_train; k++) { 150 | mean_image[i] += f_train_images[k * input_size + i]; 151 | } 152 | mean_image[i] /= num_train; 153 | } 154 | 155 | 156 | for (int i = 0; i < num_train; i++) { 157 | for (int j = 0; j < input_size; j++) { 158 | f_train_images[i * input_size + j] -= mean_image[j]; 159 | } 160 | } 161 | 162 | for (int i = 0; i < num_test; i++) { 163 | for (int j = 0; j < input_size; j++) { 164 | f_test_images[i * input_size + j] -= mean_image[j]; 165 | } 166 | 167 | } 168 | 169 | // int input_channels = rows * cols * channels * 3, hidden_channels1 = 50, hidden_channels2 = 100, output_channels = 10; 170 | // vector layer_specifier; 171 | // ConvDescriptor layer0; 172 | // LayerSpecifier temp; 173 | // layer0.initializeValues(1, 3, 3, 3, rows, cols, 1, 1, 1, 1); 174 | // temp.initPointer(CONV); 175 | // *((ConvDescriptor *)temp.params) = layer0; 176 | // layer_specifier.push_back(temp); 177 | // ActivationDescriptor layer0_actv; 178 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 179 | // temp.initPointer(ACTV); 180 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 181 | // layer_specifier.push_back(temp); 182 | 183 | // BatchNormDescriptor layer0_bn; 184 | 185 | // for (int i = 0; i < 200; i++) { 186 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows, cols); 187 | // temp.initPointer(BATCHNORM); 188 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 189 | // layer_specifier.push_back(temp); 190 | 191 | // layer0.initializeValues(3, 3, 3, 3, rows, cols, 1, 1, 1, 1); 192 | // temp.initPointer(CONV); 193 | // *((ConvDescriptor *)temp.params) = layer0; 194 | // layer_specifier.push_back(temp); 195 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 196 | // temp.initPointer(ACTV); 197 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 198 | // layer_specifier.push_back(temp); 199 | // } 200 | 201 | // PoolingDescriptor layer0_pool; 202 | // layer0_pool.initializeValues(3, 2, 2, rows, cols, 0, 0, 2, 2, POOLING_MAX); 203 | // temp.initPointer(POOLING); 204 | // *((PoolingDescriptor *)temp.params) = layer0_pool; 205 | // layer_specifier.push_back(temp); 206 | 207 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 208 | // temp.initPointer(BATCHNORM); 209 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 210 | // layer_specifier.push_back(temp); 211 | 212 | // // DropoutDescriptor layer0_dropout; 213 | // // layer0_dropout.initializeValues(0.2, 3, rows / 2, cols / 2); 214 | // // temp.initPointer(DROPOUT); 215 | // // *((DropoutDescriptor *)temp.params) = layer0_dropout; 
216 | // // layer_specifier.push_back(temp); 217 | 218 | // layer0.initializeValues(3, 3, 3, 3, rows / 2, cols / 2, 1, 1, 1, 1); 219 | // temp.initPointer(CONV); 220 | // *((ConvDescriptor *)temp.params) = layer0; 221 | // layer_specifier.push_back(temp); 222 | // layer0_actv.initializeValues(RELU, 3, rows / 2, cols / 2); 223 | // temp.initPointer(ACTV); 224 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 225 | // layer_specifier.push_back(temp); 226 | 227 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 228 | // temp.initPointer(BATCHNORM); 229 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 230 | // layer_specifier.push_back(temp); 231 | 232 | // FCDescriptor layer1; 233 | // layer1.initializeValues(input_channels, hidden_channels1); 234 | // temp.initPointer(FULLY_CONNECTED); 235 | // *((FCDescriptor *)(temp.params)) = layer1; 236 | // layer_specifier.push_back(temp); 237 | 238 | // temp.initPointer(ACTV); 239 | // ActivationDescriptor layer1_actv; 240 | // layer1_actv.initializeValues(RELU, hidden_channels1, 1, 1); 241 | // *((ActivationDescriptor *)temp.params) = layer1_actv; 242 | // layer_specifier.push_back(temp); 243 | 244 | // layer0_bn.initializeValues(BATCHNORM_PER_ACTIVATION, 1e-5, 0.1, hidden_channels1, 1, 1); 245 | // temp.initPointer(BATCHNORM); 246 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 247 | // layer_specifier.push_back(temp); 248 | 249 | // temp.initPointer(FULLY_CONNECTED); 250 | // FCDescriptor layer2; 251 | // layer2.initializeValues(hidden_channels1, output_channels); 252 | // *((FCDescriptor *)temp.params) = layer2; 253 | // layer_specifier.push_back(temp); 254 | 255 | // // temp.initPointer(FULLY_CONNECTED); 256 | // // FCDescriptor layer3; 257 | // // layer3.initializeValues(hidden_channels2, output_channels); 258 | // // *((FCDescriptor *)temp.params) = layer3; 259 | // // layer_specifier.push_back(temp); 260 | 261 | // temp.initPointer(SOFTMAX); 262 | // SoftmaxDescriptor smax; 263 | // smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, output_channels, 1, 1); 264 | // *((SoftmaxDescriptor *)(temp.params)) = smax; 265 | // layer_specifier.push_back(temp); 266 | 267 | // AlexNet 268 | vector layer_specifier; 269 | { 270 | ConvDescriptor layer0; 271 | layer0.initializeValues(3, 96, 11, 11, 227, 227, 0, 0, 4, 4); 272 | LayerSpecifier temp; 273 | temp.initPointer(CONV); 274 | *((ConvDescriptor *)temp.params) = layer0; 275 | layer_specifier.push_back(temp); 276 | } 277 | { 278 | PoolingDescriptor layer1; 279 | layer1.initializeValues(96, 3, 3, 55, 55, 0, 0, 2, 2, POOLING_MAX); 280 | LayerSpecifier temp; 281 | temp.initPointer(POOLING); 282 | *((PoolingDescriptor *)temp.params) = layer1; 283 | layer_specifier.push_back(temp); 284 | } 285 | { 286 | ConvDescriptor layer2; 287 | layer2.initializeValues(96, 256, 5, 5, 27, 27, 2, 2, 1, 1); 288 | LayerSpecifier temp; 289 | temp.initPointer(CONV); 290 | *((ConvDescriptor *)temp.params) = layer2; 291 | layer_specifier.push_back(temp); 292 | } 293 | { 294 | PoolingDescriptor layer3; 295 | layer3.initializeValues(256, 3, 3, 27, 27, 0, 0, 2, 2, POOLING_MAX); 296 | LayerSpecifier temp; 297 | temp.initPointer(POOLING); 298 | *((PoolingDescriptor *)temp.params) = layer3; 299 | layer_specifier.push_back(temp); 300 | } 301 | { 302 | ConvDescriptor layer4; 303 | layer4.initializeValues(256, 384, 3, 3, 13, 13, 1, 1, 1, 1); 304 | LayerSpecifier temp; 305 | temp.initPointer(CONV); 306 | *((ConvDescriptor *)temp.params) = layer4; 307 | 
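	// Shape check (out = (in + 2*pad - kernel) / stride + 1):
	// 227 -> (227-11)/4+1 = 55 after the 11x11/stride-4 conv, 55 -> 27 after the 3x3/stride-2 pool,
	// 27 -> 13 after the next pool, and 13 -> 6 after the last pool, giving 6*6*256 = 9216
	// inputs for the first fully-connected layer below.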
layer_specifier.push_back(temp); 308 | } 309 | { 310 | ConvDescriptor layer5; 311 | layer5.initializeValues(384, 384, 3, 3, 13, 13, 1, 1, 1, 1); 312 | LayerSpecifier temp; 313 | temp.initPointer(CONV); 314 | *((ConvDescriptor *)temp.params) = layer5; 315 | layer_specifier.push_back(temp); 316 | } 317 | { 318 | ConvDescriptor layer6; 319 | layer6.initializeValues(384, 256, 3, 3, 13, 13, 1, 1, 1, 1); 320 | LayerSpecifier temp; 321 | temp.initPointer(CONV); 322 | *((ConvDescriptor *)temp.params) = layer6; 323 | layer_specifier.push_back(temp); 324 | } 325 | { 326 | PoolingDescriptor layer7; 327 | layer7.initializeValues(256, 3, 3, 13, 13, 0, 0, 2, 2, POOLING_MAX); 328 | LayerSpecifier temp; 329 | temp.initPointer(POOLING); 330 | *((PoolingDescriptor *)temp.params) = layer7; 331 | layer_specifier.push_back(temp); 332 | } 333 | { 334 | FCDescriptor layer8; 335 | layer8.initializeValues(9216, 4096); 336 | LayerSpecifier temp; 337 | temp.initPointer(FULLY_CONNECTED); 338 | *((FCDescriptor *)temp.params) = layer8; 339 | layer_specifier.push_back(temp); 340 | } 341 | { 342 | FCDescriptor layer9; 343 | layer9.initializeValues(4096, 4096); 344 | LayerSpecifier temp; 345 | temp.initPointer(FULLY_CONNECTED); 346 | *((FCDescriptor *)temp.params) = layer9; 347 | layer_specifier.push_back(temp); 348 | } 349 | { 350 | FCDescriptor layer10; 351 | layer10.initializeValues(4096, 1000); 352 | LayerSpecifier temp; 353 | temp.initPointer(FULLY_CONNECTED); 354 | *((FCDescriptor *)temp.params) = layer10; 355 | layer_specifier.push_back(temp); 356 | } 357 | { 358 | SoftmaxDescriptor layer11; 359 | layer11.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 1000, 1, 1); 360 | LayerSpecifier temp; 361 | temp.initPointer(SOFTMAX); 362 | *((SoftmaxDescriptor *)temp.params) = layer11; 363 | layer_specifier.push_back(temp); 364 | } 365 | 366 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 367 | vDNNType vdnn_type = vDNN_DYN; 368 | string filename("vdnn_dyn"); 369 | if (argc == 3) { 370 | filename.assign("vdnn"); 371 | // argv[1] - layers to offload, argv[2] - conv algo to use 372 | if (strcmp(argv[1], "dyn") == 0) { 373 | vdnn_type = vDNN_DYN; 374 | filename.append("_dyn"); 375 | } 376 | else if (strcmp(argv[1], "conv") == 0) { 377 | vdnn_type = vDNN_CONV; 378 | filename.append("_conv"); 379 | } 380 | else if (strcmp(argv[1], "all") == 0) { 381 | vdnn_type = vDNN_ALL; 382 | filename.append("_all"); 383 | } 384 | else { 385 | printf("invalid argument.. using vdnn dynamic\n"); 386 | filename.assign("vdnn_dyn"); 387 | } 388 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0)) { 389 | if (strcmp(argv[2], "p") == 0) { 390 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 391 | filename.append("_p"); 392 | } 393 | else if (strcmp(argv[2], "m") == 0) { 394 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 395 | filename.append("_m"); 396 | } 397 | else { 398 | printf("invalid argument.. 
using vdnn dynamic\n"); 399 | filename.assign("vdnn_dyn"); 400 | } 401 | } 402 | } 403 | 404 | int batch_size = 128; 405 | long long dropout_seed = 1; 406 | float softmax_eps = 1e-8; 407 | float init_std_dev = 0.1; 408 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 409 | 410 | int num_epoch = 1000; 411 | double learning_rate = 1e-15; 412 | double learning_rate_decay = 0.9; 413 | 414 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 415 | vector loss; 416 | vector time; 417 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 418 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 419 | printTimes(time, filename); 420 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 421 | 422 | } 423 | 424 | void printTimes(vector &time, string filename) { 425 | float mean_time = 0.0; 426 | float std_dev = 0.0; 427 | int N = time.size(); 428 | for (int i = 0; i < N; i++) { 429 | mean_time += time[i]; 430 | } 431 | mean_time /= N; 432 | for (int i = 0; i < N; i++) { 433 | std_dev += pow(time[i] - mean_time, 2); 434 | } 435 | std_dev /= N; 436 | pow(std_dev, 0.5); 437 | cout << "Average time: " << mean_time << endl; 438 | cout << "Standard deviation: " << std_dev << endl; 439 | 440 | filename.append(".dat"); 441 | fstream f; 442 | f.open(filename.c_str(), ios_base::out); 443 | 444 | for (int i = 0; i < N; i++) { 445 | f << time[i] << endl; 446 | } 447 | f << "mean_time: " << mean_time << endl; 448 | f << "standard_deviation: " << std_dev << endl; 449 | f.close(); 450 | 451 | } 452 | 453 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 454 | filename.append("_lag.dat"); 455 | 456 | fstream f; 457 | f.open(filename.c_str(), ios_base::out); 458 | 459 | int N = fwd_vdnn_lag.size(); 460 | for (int i = 0; i < N; i++) { 461 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 462 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 463 | } 464 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 465 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 466 | } 467 | f << endl; 468 | } 469 | f.close(); 470 | } -------------------------------------------------------------------------------- /src/alexnet_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "solver.h" 8 | 9 | using namespace std; 10 | 11 | typedef unsigned char uchar; 12 | 13 | int num_train = 512, num_test = 500; 14 | 15 | int reverseInt(int n) { 16 | int bytes = 4; 17 | unsigned char ch[bytes]; 18 | for (int i = 0; i < bytes; i++) { 19 | ch[i] = (n >> i * 8) & 255; 20 | } 21 | int p = 0; 22 | for (int i = 0; i < bytes; i++) { 23 | p += (int) ch[i] << (bytes - i - 1) * 8; 24 | } 25 | return p; 26 | } 27 | 28 | void readMNIST(vector > &train_images, vector > &test_images, vector &train_labels, vector &test_labels) { 29 | string filename_train_images = "data/train-images.idx3-ubyte"; 30 | string filename_train_labels = "data/train-labels.idx1-ubyte"; 31 | 32 | string filename_test_images = "data/t10k-images.idx3-ubyte"; 33 | string filename_test_labels = "data/t10k-labels.idx1-ubyte"; 34 | 35 | // read train/test images 36 | for (int i = 0; i < 2; i++) { 37 | string filename; 38 | if (i == 0) 39 | filename = filename_train_images; 40 | else 41 | filename = 
filename_test_images; 42 | 43 | ifstream f(filename.c_str(), ios::binary); 44 | if (!f.is_open()) 45 | printf("Cannot read MNIST from %s\n", filename.c_str()); 46 | 47 | // read metadata 48 | int magic_number = 0, n_images = 0, n_rows = 0, n_cols = 0; 49 | f.read((char *) &magic_number, sizeof(magic_number)); 50 | magic_number = reverseInt(magic_number); 51 | f.read((char *) &n_images, sizeof(n_images)); 52 | n_images = reverseInt(n_images); 53 | f.read((char *) &n_rows, sizeof(n_rows)); 54 | n_rows = reverseInt(n_rows); 55 | f.read((char *) &n_cols, sizeof(n_cols)); 56 | n_cols = reverseInt(n_cols); 57 | 58 | for (int k = 0; k < n_images; k++) { 59 | vector temp; 60 | temp.reserve(n_rows * n_cols); 61 | for (int j = 0; j < n_rows * n_cols; j++) { 62 | uchar t = 0; 63 | f.read((char *)&t, sizeof(t)); 64 | temp.push_back(t); 65 | } 66 | if (i == 0) 67 | train_images.push_back(temp); 68 | else 69 | test_images.push_back(temp); 70 | } 71 | f.close(); 72 | 73 | } 74 | 75 | // read train/test labels 76 | for (int i = 0; i < 2; i++) { 77 | string filename; 78 | if (i == 0) 79 | filename = filename_train_labels; 80 | else 81 | filename = filename_test_labels; 82 | 83 | ifstream f(filename.c_str(), ios::binary); 84 | if (!f.is_open()) 85 | printf("Cannot read MNIST from %s\n", filename.c_str()); 86 | 87 | // read metadata 88 | int magic_number = 0, n_labels = 0; 89 | f.read((char *) &magic_number, sizeof(magic_number)); 90 | magic_number = reverseInt(magic_number); 91 | f.read((char *) &n_labels, sizeof(n_labels)); 92 | n_labels = reverseInt(n_labels); 93 | 94 | for (int k = 0; k < n_labels; k++) { 95 | uchar t = 0; 96 | f.read((char *)&t, sizeof(t)); 97 | if (i == 0) 98 | train_labels.push_back(t); 99 | else 100 | test_labels.push_back(t); 101 | } 102 | 103 | f.close(); 104 | 105 | } 106 | } 107 | 108 | void printTimes(vector &time, string filename); 109 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename); 110 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename); 111 | 112 | int main(int argc, char *argv[]) { 113 | 114 | // int num_train = 100 * batch_size, num_val = batch_size; 115 | // void *X_train = malloc(num_train * input_channels * sizeof(float)); 116 | // int *y_train = (int *)malloc(num_train * sizeof(int)); 117 | // void *X_val = malloc(num_val * input_channels * sizeof(float)); 118 | // int *y_val = (int *)malloc(num_val * sizeof(int)); 119 | // for (int i = 0; i < num_train; i++) { 120 | // for (int j = 0; j < input_channels; j++) 121 | // ((float *)X_train)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 122 | // y_train[i] = 0; 123 | // } 124 | 125 | // for (int i = 0; i < num_val; i++) { 126 | // for (int j = 0; j < input_channels; j++) 127 | // ((float *)X_val)[i * input_channels + j] = (rand() % 1000) * 1.0 / 1000; 128 | // y_val[i] = rand() % 2; 129 | // } 130 | 131 | // int rows = 28, cols = 28, channels = 1; 132 | // vector > train_images, test_images; 133 | // vector train_labels, test_labels; 134 | // readMNIST(train_images, test_images, train_labels, test_labels); 135 | // float *f_train_images, *f_train_labels, *f_test_images, *f_test_labels; 136 | float *f_train_images, *f_test_images; 137 | int *f_train_labels, *f_test_labels; 138 | int rows = 227, cols = 227, channels = 3; 139 | int input_size = rows * cols * channels; 140 | // f_train_images = (float *)malloc(num_train * input_size * sizeof(float)); 141 | // f_train_labels = (int *)malloc(num_train * sizeof(int)); 142 | 
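// f_train_images / f_train_labels are allocated with cudaMallocHost (pinned,
// page-locked host memory) rather than malloc so that the vDNN offload and
// prefetch copies issued with cudaMemcpyAsync can overlap with compute;
// pageable malloc'd buffers would make those copies effectively synchronous.
// If these buffers were ever released, the matching call would be
// cudaFreeHost, not free.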
checkCudaErrors(cudaMallocHost(&f_train_images, num_train * input_size * sizeof(float))); 143 | checkCudaErrors(cudaMallocHost(&f_train_labels, num_train * sizeof(int))); 144 | f_test_images = (float *)malloc(num_test * input_size * sizeof(float)); 145 | f_test_labels = (int *)malloc(num_test * sizeof(int)); 146 | 147 | float *mean_image; 148 | mean_image = (float *)malloc(input_size * sizeof(float)); 149 | 150 | for (int i = 0; i < input_size; i++) { 151 | mean_image[i] = 0; 152 | for (int k = 0; k < num_train; k++) { 153 | mean_image[i] += f_train_images[k * input_size + i]; 154 | } 155 | mean_image[i] /= num_train; 156 | } 157 | 158 | 159 | for (int i = 0; i < num_train; i++) { 160 | for (int j = 0; j < input_size; j++) { 161 | f_train_images[i * input_size + j] -= mean_image[j]; 162 | } 163 | } 164 | 165 | for (int i = 0; i < num_test; i++) { 166 | for (int j = 0; j < input_size; j++) { 167 | f_test_images[i * input_size + j] -= mean_image[j]; 168 | } 169 | 170 | } 171 | 172 | // int input_channels = rows * cols * channels * 3, hidden_channels1 = 50, hidden_channels2 = 100, output_channels = 10; 173 | // vector layer_specifier; 174 | // ConvDescriptor layer0; 175 | // LayerSpecifier temp; 176 | // layer0.initializeValues(1, 3, 3, 3, rows, cols, 1, 1, 1, 1); 177 | // temp.initPointer(CONV); 178 | // *((ConvDescriptor *)temp.params) = layer0; 179 | // layer_specifier.push_back(temp); 180 | // ActivationDescriptor layer0_actv; 181 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 182 | // temp.initPointer(ACTV); 183 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 184 | // layer_specifier.push_back(temp); 185 | 186 | // BatchNormDescriptor layer0_bn; 187 | 188 | // for (int i = 0; i < 200; i++) { 189 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows, cols); 190 | // temp.initPointer(BATCHNORM); 191 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 192 | // layer_specifier.push_back(temp); 193 | 194 | // layer0.initializeValues(3, 3, 3, 3, rows, cols, 1, 1, 1, 1); 195 | // temp.initPointer(CONV); 196 | // *((ConvDescriptor *)temp.params) = layer0; 197 | // layer_specifier.push_back(temp); 198 | // layer0_actv.initializeValues(RELU, 3, rows, cols); 199 | // temp.initPointer(ACTV); 200 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 201 | // layer_specifier.push_back(temp); 202 | // } 203 | 204 | // PoolingDescriptor layer0_pool; 205 | // layer0_pool.initializeValues(3, 2, 2, rows, cols, 0, 0, 2, 2, POOLING_MAX); 206 | // temp.initPointer(POOLING); 207 | // *((PoolingDescriptor *)temp.params) = layer0_pool; 208 | // layer_specifier.push_back(temp); 209 | 210 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 211 | // temp.initPointer(BATCHNORM); 212 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 213 | // layer_specifier.push_back(temp); 214 | 215 | // // DropoutDescriptor layer0_dropout; 216 | // // layer0_dropout.initializeValues(0.2, 3, rows / 2, cols / 2); 217 | // // temp.initPointer(DROPOUT); 218 | // // *((DropoutDescriptor *)temp.params) = layer0_dropout; 219 | // // layer_specifier.push_back(temp); 220 | 221 | // layer0.initializeValues(3, 3, 3, 3, rows / 2, cols / 2, 1, 1, 1, 1); 222 | // temp.initPointer(CONV); 223 | // *((ConvDescriptor *)temp.params) = layer0; 224 | // layer_specifier.push_back(temp); 225 | // layer0_actv.initializeValues(RELU, 3, rows / 2, cols / 2); 226 | // temp.initPointer(ACTV); 227 | // *((ActivationDescriptor *)temp.params) = layer0_actv; 228 | // 
layer_specifier.push_back(temp); 229 | 230 | // layer0_bn.initializeValues(BATCHNORM_SPATIAL, 1e-5, 0.1, 3, rows / 2, cols / 2); 231 | // temp.initPointer(BATCHNORM); 232 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 233 | // layer_specifier.push_back(temp); 234 | 235 | // FCDescriptor layer1; 236 | // layer1.initializeValues(input_channels, hidden_channels1); 237 | // temp.initPointer(FULLY_CONNECTED); 238 | // *((FCDescriptor *)(temp.params)) = layer1; 239 | // layer_specifier.push_back(temp); 240 | 241 | // temp.initPointer(ACTV); 242 | // ActivationDescriptor layer1_actv; 243 | // layer1_actv.initializeValues(RELU, hidden_channels1, 1, 1); 244 | // *((ActivationDescriptor *)temp.params) = layer1_actv; 245 | // layer_specifier.push_back(temp); 246 | 247 | // layer0_bn.initializeValues(BATCHNORM_PER_ACTIVATION, 1e-5, 0.1, hidden_channels1, 1, 1); 248 | // temp.initPointer(BATCHNORM); 249 | // *((BatchNormDescriptor *)temp.params) = layer0_bn; 250 | // layer_specifier.push_back(temp); 251 | 252 | // temp.initPointer(FULLY_CONNECTED); 253 | // FCDescriptor layer2; 254 | // layer2.initializeValues(hidden_channels1, output_channels); 255 | // *((FCDescriptor *)temp.params) = layer2; 256 | // layer_specifier.push_back(temp); 257 | 258 | // // temp.initPointer(FULLY_CONNECTED); 259 | // // FCDescriptor layer3; 260 | // // layer3.initializeValues(hidden_channels2, output_channels); 261 | // // *((FCDescriptor *)temp.params) = layer3; 262 | // // layer_specifier.push_back(temp); 263 | 264 | // temp.initPointer(SOFTMAX); 265 | // SoftmaxDescriptor smax; 266 | // smax.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, output_channels, 1, 1); 267 | // *((SoftmaxDescriptor *)(temp.params)) = smax; 268 | // layer_specifier.push_back(temp); 269 | 270 | // AlexNet 271 | vector layer_specifier; 272 | { 273 | ConvDescriptor layer0; 274 | layer0.initializeValues(3, 96, 11, 11, 227, 227, 0, 0, 4, 4, RELU); 275 | LayerSpecifier temp; 276 | temp.initPointer(CONV); 277 | *((ConvDescriptor *)temp.params) = layer0; 278 | layer_specifier.push_back(temp); 279 | } 280 | { 281 | PoolingDescriptor layer1; 282 | layer1.initializeValues(96, 3, 3, 55, 55, 0, 0, 2, 2, POOLING_MAX); 283 | LayerSpecifier temp; 284 | temp.initPointer(POOLING); 285 | *((PoolingDescriptor *)temp.params) = layer1; 286 | layer_specifier.push_back(temp); 287 | } 288 | { 289 | ConvDescriptor layer2; 290 | layer2.initializeValues(96, 256, 5, 5, 27, 27, 2, 2, 1, 1, RELU); 291 | LayerSpecifier temp; 292 | temp.initPointer(CONV); 293 | *((ConvDescriptor *)temp.params) = layer2; 294 | layer_specifier.push_back(temp); 295 | } 296 | { 297 | PoolingDescriptor layer3; 298 | layer3.initializeValues(256, 3, 3, 27, 27, 0, 0, 2, 2, POOLING_MAX); 299 | LayerSpecifier temp; 300 | temp.initPointer(POOLING); 301 | *((PoolingDescriptor *)temp.params) = layer3; 302 | layer_specifier.push_back(temp); 303 | } 304 | { 305 | ConvDescriptor layer4; 306 | layer4.initializeValues(256, 384, 3, 3, 13, 13, 1, 1, 1, 1, RELU); 307 | LayerSpecifier temp; 308 | temp.initPointer(CONV); 309 | *((ConvDescriptor *)temp.params) = layer4; 310 | layer_specifier.push_back(temp); 311 | } 312 | { 313 | ConvDescriptor layer5; 314 | layer5.initializeValues(384, 384, 3, 3, 13, 13, 1, 1, 1, 1, RELU); 315 | LayerSpecifier temp; 316 | temp.initPointer(CONV); 317 | *((ConvDescriptor *)temp.params) = layer5; 318 | layer_specifier.push_back(temp); 319 | } 320 | { 321 | ConvDescriptor layer6; 322 | layer6.initializeValues(384, 256, 3, 3, 13, 13, 1, 1, 1, 1, RELU); 323 | 
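// Unlike the earlier AlexNet spec in this repo (which passes no activation
// argument), this test fuses the activation into the layer descriptor: the
// trailing RELU argument sets activation_mode, and ConvLayerParams /
// FCLayerParams then apply cudnnActivationForward in-place on the layer
// output instead of inserting a separate ACTV layer. The final FC layer and
// the SOFTMAX layer are left without a fused activation.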
LayerSpecifier temp; 324 | temp.initPointer(CONV); 325 | *((ConvDescriptor *)temp.params) = layer6; 326 | layer_specifier.push_back(temp); 327 | } 328 | { 329 | PoolingDescriptor layer7; 330 | layer7.initializeValues(256, 3, 3, 13, 13, 0, 0, 2, 2, POOLING_MAX); 331 | LayerSpecifier temp; 332 | temp.initPointer(POOLING); 333 | *((PoolingDescriptor *)temp.params) = layer7; 334 | layer_specifier.push_back(temp); 335 | } 336 | { 337 | FCDescriptor layer8; 338 | layer8.initializeValues(9216, 4096, RELU); 339 | LayerSpecifier temp; 340 | temp.initPointer(FULLY_CONNECTED); 341 | *((FCDescriptor *)temp.params) = layer8; 342 | layer_specifier.push_back(temp); 343 | } 344 | { 345 | FCDescriptor layer9; 346 | layer9.initializeValues(4096, 4096, RELU); 347 | LayerSpecifier temp; 348 | temp.initPointer(FULLY_CONNECTED); 349 | *((FCDescriptor *)temp.params) = layer9; 350 | layer_specifier.push_back(temp); 351 | } 352 | { 353 | FCDescriptor layer10; 354 | layer10.initializeValues(4096, 1000); 355 | LayerSpecifier temp; 356 | temp.initPointer(FULLY_CONNECTED); 357 | *((FCDescriptor *)temp.params) = layer10; 358 | layer_specifier.push_back(temp); 359 | } 360 | { 361 | SoftmaxDescriptor layer11; 362 | layer11.initializeValues(SOFTMAX_ACCURATE, SOFTMAX_MODE_INSTANCE, 1000, 1, 1); 363 | LayerSpecifier temp; 364 | temp.initPointer(SOFTMAX); 365 | *((SoftmaxDescriptor *)temp.params) = layer11; 366 | layer_specifier.push_back(temp); 367 | } 368 | 369 | vDNNConvAlgo vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 370 | vDNNType vdnn_type = vDNN_DYN; 371 | string filename("vdnn_dyn"); 372 | if (argc == 3) { 373 | filename.assign("vdnn"); 374 | // argv[1] - layers to offload, argv[2] - conv algo to use 375 | if (strcmp(argv[1], "dyn") == 0) { 376 | vdnn_type = vDNN_DYN; 377 | filename.append("_dyn"); 378 | } 379 | else if (strcmp(argv[1], "conv") == 0) { 380 | vdnn_type = vDNN_CONV; 381 | filename.append("_conv"); 382 | } 383 | else if (strcmp(argv[1], "all") == 0) { 384 | vdnn_type = vDNN_ALL; 385 | filename.append("_all"); 386 | } 387 | else if (strcmp(argv[1], "alternate_conv") == 0) { 388 | vdnn_type = vDNN_ALTERNATE_CONV; 389 | filename.append("_alternate_conv"); 390 | } 391 | else { 392 | printf("invalid argument.. using vdnn dynamic\n"); 393 | filename.assign("vdnn_dyn"); 394 | } 395 | if ((strcmp(argv[1], "conv") == 0 or strcmp(argv[1], "all") == 0 or strcmp(argv[1], "alternate_conv") == 0)) { 396 | if (strcmp(argv[2], "p") == 0) { 397 | vdnn_conv_algo = vDNN_PERFORMANCE_OPTIMAL; 398 | filename.append("_p"); 399 | } 400 | else if (strcmp(argv[2], "m") == 0) { 401 | vdnn_conv_algo = vDNN_MEMORY_OPTIMAL; 402 | filename.append("_m"); 403 | } 404 | else { 405 | printf("invalid argument.. 
using vdnn dynamic\n"); 406 | filename.assign("vdnn_dyn"); 407 | } 408 | } 409 | } 410 | 411 | int batch_size = 256; 412 | long long dropout_seed = 1; 413 | float softmax_eps = 1e-8; 414 | float init_std_dev = 0.1; 415 | NeuralNet net(layer_specifier, DATA_FLOAT, batch_size, TENSOR_NCHW, dropout_seed, softmax_eps, init_std_dev, vdnn_type, vdnn_conv_algo, SGD); 416 | 417 | int num_epoch = 1000; 418 | double learning_rate = 1e-3; 419 | double learning_rate_decay = 0.9; 420 | 421 | Solver solver(&net, (void *)f_train_images, f_train_labels, (void *)f_train_images, f_train_labels, num_epoch, SGD, learning_rate, learning_rate_decay, num_train, num_train); 422 | vector loss; 423 | vector time; 424 | vector > fwd_vdnn_lag, bwd_vdnn_lag; 425 | solver.getTrainTime(loss, time, 100, fwd_vdnn_lag, bwd_vdnn_lag); 426 | printTimes(time, filename); 427 | printvDNNLag(fwd_vdnn_lag, bwd_vdnn_lag, filename); 428 | 429 | vector > fwd_computation_time, bwd_computation_time; 430 | solver.getComputationTime(1, fwd_computation_time, bwd_computation_time); 431 | 432 | vector > fwd_transfer_time, bwd_transfer_time; 433 | solver.getTransferTime(1, fwd_transfer_time, bwd_transfer_time); 434 | 435 | printComputationTransferTimes(fwd_computation_time, bwd_computation_time, true, filename); 436 | printComputationTransferTimes(fwd_transfer_time, bwd_transfer_time, false, filename); 437 | 438 | } 439 | 440 | void printTimes(vector &time, string filename) { 441 | float mean_time = 0.0; 442 | float std_dev = 0.0; 443 | int N = time.size(); 444 | for (int i = 0; i < N; i++) { 445 | mean_time += time[i]; 446 | } 447 | mean_time /= N; 448 | for (int i = 0; i < N; i++) { 449 | std_dev += pow(time[i] - mean_time, 2); 450 | } 451 | std_dev /= N; 452 | std_dev = pow(std_dev, 0.5); 453 | cout << "Average time: " << mean_time << endl; 454 | cout << "Standard deviation: " << std_dev << endl; 455 | 456 | filename.append(".dat"); 457 | fstream f; 458 | f.open(filename.c_str(), ios_base::out); 459 | 460 | for (int i = 0; i < N; i++) { 461 | f << time[i] << endl; 462 | } 463 | f << "mean_time: " << mean_time << endl; 464 | f << "standard_deviation: " << std_dev << endl; 465 | f.close(); 466 | 467 | filename.append(".bin"); 468 | fstream f_bin; 469 | f_bin.open(filename.c_str(), ios_base::out); 470 | f_bin.write((char *)&N, sizeof(N)); 471 | for (int i = 0; i < N; i++) { 472 | f_bin.write((char *)&time[i], sizeof(time[i])); 473 | } 474 | f_bin.close(); 475 | 476 | } 477 | 478 | void printvDNNLag(vector > &fwd_vdnn_lag, vector > &bwd_vdnn_lag, string filename) { 479 | filename.append("_lag.dat"); 480 | 481 | fstream f; 482 | f.open(filename.c_str(), ios_base::out); 483 | 484 | int N = fwd_vdnn_lag.size(); 485 | for (int i = 0; i < N; i++) { 486 | for (int j = 0; j < fwd_vdnn_lag[i].size(); j++) { 487 | f << "fwd" << j << ": " << fwd_vdnn_lag[i][j] << endl; 488 | } 489 | for (int j = 0; j < bwd_vdnn_lag[i].size(); j++) { 490 | f << "bwd" << j << ": " << bwd_vdnn_lag[i][j] << endl; 491 | } 492 | f << endl; 493 | } 494 | f.close(); 495 | } 496 | 497 | void printComputationTransferTimes(vector > &fwd_times, vector >&bwd_times, bool computation, string filename) { 498 | if (computation) 499 | filename.append("_compute_time.dat"); 500 | else 501 | filename.append("_transfer_time.dat"); 502 | 503 | fstream f; 504 | f.open(filename.c_str(), ios_base::out); 505 | 506 | int N = fwd_times.size(); 507 | for (int i = 0; i < N; i++) { 508 | for (int j = 0; j < fwd_times[i].size(); j++) { 509 | f << "fwd" << j << ": " << fwd_times[i][j] << endl; 510 | } 
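// Output format: each outer record (one per profiled iteration) lists the
// per-layer forward times first, then (below) the per-layer backward times,
// followed by a blank line, so the resulting *_compute_time.dat /
// *_transfer_time.dat files can be split on empty lines when post-processing.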
511 | for (int j = 0; j < bwd_times[i].size(); j++) { 512 | f << "bwd" << j << ": " << bwd_times[i][j] << endl; 513 | } 514 | f << endl; 515 | } 516 | f.close(); 517 | } -------------------------------------------------------------------------------- /src/neural_net_time.cu: -------------------------------------------------------------------------------- 1 | #include "neural_net.h" 2 | 3 | void NeuralNet::getComputationTime(void *X, int *y, double learning_rate, 4 | std::vector &fwd_computation_time, std::vector &bwd_computation_time) { 5 | for (int i = 0; i < num_layers; i++) 6 | prefetched[i] = false; 7 | 8 | // checkCNMEM(cnmemMalloc(&layer_input[0], layer_input_size[0] * data_type_size, NULL)); 9 | // checkCudaErrors(cudaMemcpy(layer_input[0], X, batch_size * input_channels * input_h * input_w * data_type_size, cudaMemcpyHostToDevice)); 10 | // checkCudaErrors(cudaMemcpy(this->y, y, batch_size * data_type_size, cudaMemcpyHostToDevice)); 11 | 12 | float alpha = 1.0, beta = 0.0; 13 | float Salpha = 1.0, Sbeta = 0.0; 14 | double Dalpha = 1.0, Dbeta = 0.0; 15 | 16 | // forward propagate 17 | for (int i = 0; i < num_layers; i++) { 18 | size_t cur_workspace_size; 19 | void *cur_workspace; 20 | 21 | checkCNMEM(cnmemMalloc(&layer_input[i], layer_input_size[i] * data_type_size, NULL)); 22 | checkCNMEM(cnmemMalloc(&layer_input[i + 1], layer_input_size[i + 1] * data_type_size, NULL)); 23 | if (layer_type[i] == CONV) { 24 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 25 | 26 | cur_workspace_size = cur_params->fwd_workspace_size; 27 | checkCNMEM(cnmemMalloc(&cur_workspace, cur_workspace_size, NULL)); 28 | } 29 | 30 | checkCudaErrors(cudaEventRecord(start_compute, stream_compute)); 31 | if (layer_type[i] == CONV) { 32 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 33 | 34 | cur_workspace_size = cur_params->fwd_workspace_size; 35 | // computation 36 | checkCUDNN(cudnnConvolutionForward(cudnn_handle, &alpha, 37 | cur_params->input_tensor, layer_input[i], 38 | cur_params->filter_desc, cur_params->W, 39 | cur_params->conv_desc, cur_params->fwd_algo, 40 | cur_workspace, cur_workspace_size, 41 | &beta, 42 | cur_params->output_tensor, layer_input[i + 1])); 43 | checkCUDNN(cudnnAddTensor(cudnn_handle, &alpha, 44 | cur_params->bias_desc, cur_params->b, 45 | &alpha, 46 | cur_params->output_tensor, layer_input[i + 1])); 47 | 48 | // if activation required 49 | if (cur_params->activation_mode != ACTIVATION_NONE) { 50 | checkCUDNN(cudnnActivationForward(cudnn_handle, cur_params->actv_desc, 51 | &alpha, 52 | cur_params->output_tensor, layer_input[i + 1], 53 | &beta, 54 | cur_params->output_tensor, layer_input[i + 1])); 55 | } 56 | 57 | } 58 | 59 | else if (layer_type[i] == FULLY_CONNECTED) { 60 | // std::cout << "FC\n"; 61 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 62 | // std::cout << "FChere" << i << std::endl; 63 | 64 | if (data_type == CUDNN_DATA_FLOAT) { 65 | checkCUBLAS(cublasSgemm(cublas_handle, 66 | CUBLAS_OP_N, CUBLAS_OP_N, 67 | cur_params->C_out, batch_size, cur_params->C_in, 68 | &Salpha, 69 | (float *)cur_params->W, cur_params->C_out, 70 | (float *)layer_input[i], cur_params->C_in, 71 | &Sbeta, 72 | (float *)layer_input[i + 1], cur_params->C_out)); 73 | checkCUBLAS(cublasSgemm(cublas_handle, 74 | CUBLAS_OP_N, CUBLAS_OP_N, 75 | cur_params->C_out, batch_size, 1, 76 | &Salpha, 77 | (float *)cur_params->b, cur_params->C_out, 78 | (float *)one_vec, 1, 79 | &Salpha, 80 | (float *)layer_input[i + 1], cur_params->C_out)); 81 | } 82 | else if (data_type == 
CUDNN_DATA_DOUBLE) { 83 | checkCUBLAS(cublasDgemm(cublas_handle, 84 | CUBLAS_OP_N, CUBLAS_OP_N, 85 | cur_params->C_out, batch_size, cur_params->C_in, 86 | &Dalpha, 87 | (double *)cur_params->W, cur_params->C_out, 88 | (double *)layer_input[i], cur_params->C_in, 89 | &Dbeta, 90 | (double *)layer_input[i + 1], cur_params->C_out)); 91 | checkCUBLAS(cublasDgemm(cublas_handle, 92 | CUBLAS_OP_N, CUBLAS_OP_N, 93 | cur_params->C_out, batch_size, 1, 94 | &Dalpha, 95 | (double *)cur_params->b, cur_params->C_out, 96 | (double *)one_vec, 1, 97 | &Dalpha, 98 | (double *)layer_input[i + 1], cur_params->C_out)); 99 | } 100 | if (cur_params->activation_mode != ACTIVATION_NONE) { 101 | checkCUDNN(cudnnActivationForward(cudnn_handle, cur_params->actv_desc, 102 | &alpha, 103 | cur_params->output_tensor, layer_input[i + 1], 104 | &beta, 105 | cur_params->output_tensor, layer_input[i + 1])); 106 | } 107 | } 108 | else if (layer_type[i] == DROPOUT) { 109 | DropoutLayerParams *cur_params = (DropoutLayerParams *)params[i]; 110 | checkCUDNN(cudnnDropoutForward(cudnn_handle, cur_params->dropout_desc, 111 | cur_params->input_tensor, layer_input[i], 112 | cur_params->input_tensor, layer_input[i + 1], 113 | cur_params->reserved_space, 114 | cur_params->reserved_space_size)); 115 | } 116 | else if (layer_type[i] == BATCHNORM) { 117 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 118 | 119 | checkCUDNN(cudnnBatchNormalizationForwardTraining(cudnn_handle, cur_params->mode, 120 | &alpha, &beta, 121 | cur_params->input_tensor, layer_input[i], 122 | cur_params->input_tensor, layer_input[i + 1], 123 | cur_params->sbmv_desc, 124 | cur_params->scale, cur_params->bias, 125 | cur_params->factor, 126 | cur_params->running_mean, cur_params->running_variance, 127 | cur_params->epsilon, 128 | cur_params->result_save_mean, cur_params->result_save_inv_var)); 129 | 130 | } 131 | else if (layer_type[i] == POOLING) { 132 | PoolingLayerParams *cur_params = (PoolingLayerParams *)params[i]; 133 | checkCUDNN(cudnnPoolingForward(cudnn_handle, cur_params->pool_desc, 134 | &alpha, 135 | cur_params->input_tensor, layer_input[i], 136 | &beta, 137 | cur_params->output_tensor, layer_input[i + 1])); 138 | } 139 | else if (layer_type[i] == ACTV) { 140 | std::cout << "Panic!! ACTV wrong place\n"; 141 | exit(0); 142 | ActivationLayerParams *cur_params = (ActivationLayerParams *)params[i]; 143 | checkCUDNN(cudnnActivationForward(cudnn_handle, cur_params->actv_desc, 144 | &alpha, 145 | cur_params->input_tensor, layer_input[i], 146 | &beta, 147 | cur_params->input_tensor, layer_input[i + 1])); 148 | } 149 | else if (layer_type[i] == SOFTMAX) { 150 | // std::cout << "Softmax\n"; 151 | std::cout << "Panic!! 
SOFTMAX wrong place\n"; 152 | exit(0); 153 | SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[i]; 154 | checkCUDNN(cudnnSoftmaxForward(cudnn_handle, cur_params->algo, cur_params->mode, 155 | &alpha, 156 | cur_params->input_tensor, layer_input[i], 157 | &beta, 158 | cur_params->input_tensor, layer_input[i + 1])); 159 | } 160 | 161 | // ---------------------- vDNN start ---------------------- 162 | // synchronization 163 | // checkCudaErrors(cudaDeviceSynchronize()); 164 | 165 | // if next layer is ACTV or SOFTMAX, complete that and come to synchronization 166 | // the case in above if for ACTV and SOFTMAX never occurs 167 | if (layer_type[i + 1] == SOFTMAX) { 168 | i++; 169 | layer_input[i + 1] = layer_input[i]; 170 | SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[i]; 171 | checkCUDNN(cudnnSoftmaxForward(cudnn_handle, cur_params->algo, cur_params->mode, 172 | &alpha, 173 | cur_params->input_tensor, layer_input[i], 174 | &beta, 175 | cur_params->input_tensor, layer_input[i + 1])); 176 | i--; 177 | } 178 | 179 | // sync with stream_compute guaranteed 180 | checkCudaErrors(cudaEventRecord(stop_compute, stream_compute)); 181 | checkCudaErrors(cudaEventSynchronize(stop_compute)); 182 | float compute_time = 0; 183 | checkCudaErrors(cudaEventElapsedTime(&compute_time, start_compute, stop_compute)); 184 | 185 | fwd_computation_time.push_back(compute_time); 186 | 187 | if (layer_type[i] == CONV) { 188 | checkCNMEM(cnmemFree(cur_workspace, NULL)); 189 | } 190 | 191 | checkCNMEM(cnmemFree(layer_input[i], NULL)); 192 | checkCNMEM(cnmemFree(layer_input[i + 1], NULL)); 193 | 194 | if (layer_type[i + 1] == ACTV or layer_type[i + 1] == SOFTMAX) { 195 | i = i + 1; 196 | } 197 | 198 | // ---------------------- vDNN end ------------------------ 199 | } 200 | 201 | // time for loss compute ignored 202 | // *scalar_loss = computeLoss(); 203 | 204 | // time for softmax backward ignored 205 | // ---------------------- vDNN start ---------------------- 206 | // checkCNMEM(cnmemMalloc(&dlayer_input[num_layers], batch_size * num_classes * data_type_size, NULL)); 207 | // space_tracker.updateSpace(CnmemSpace::SUB, layer_input_size[num_layers] * data_type_size); 208 | // // std::cout << "Free bytes: " << free_bytes << std::endl; 209 | // // ---------------------- vDNN end ------------------------ 210 | // if (layer_type[num_layers - 1] == SOFTMAX) { 211 | // // SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[num_layers - 1]; 212 | // if (data_type == CUDNN_DATA_FLOAT) { 213 | // checkCudaErrors(cudaMemset(dlayer_input[num_layers], 0, batch_size * num_classes * sizeof(float))); 214 | // softmaxLossBackProp<<>>(this->y, (float *)layer_input[num_layers], 215 | // (float *)dlayer_input[num_layers], batch_size, num_classes, softmax_eps); 216 | // } 217 | // else if (data_type == CUDNN_DATA_DOUBLE) { 218 | // checkCudaErrors(cudaMemset(dlayer_input[num_layers], 0, batch_size * num_classes * sizeof(double))); 219 | // softmaxLossBackProp<<>>(this->y, (double *)layer_input[num_layers], 220 | // (double *)dlayer_input[num_layers], batch_size, num_classes, softmax_eps); 221 | // } 222 | // } 223 | 224 | for (int i = num_layers - 1; i >= 0; i--) { 225 | // ---------------------- vDNN start ---------------------- 226 | size_t cur_filter_workspace_size, cur_data_workspace_size, cur_workspace_size; 227 | void *cur_workspace; 228 | 229 | checkCNMEM(cnmemMalloc(&layer_input[i + 1], layer_input_size[i + 1] * data_type_size, NULL)); 230 | checkCNMEM(cnmemMalloc(&layer_input[i], 
layer_input_size[i] * data_type_size, NULL)); 231 | checkCNMEM(cnmemMalloc(&dlayer_input[i + 1], layer_input_size[i] * data_type_size, NULL)); 232 | 233 | if (i > 0) { 234 | if (layer_type[i] == ACTV or layer_type[i] == SOFTMAX) { 235 | dlayer_input[i] = dlayer_input[i + 1]; 236 | } 237 | else { 238 | checkCNMEM(cnmemMalloc(&dlayer_input[i], layer_input_size[i] * data_type_size, NULL)); 239 | } 240 | } 241 | // ---------------------- vDNN end ------------------------ 242 | 243 | if (layer_type[i] == CONV) { 244 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 245 | 246 | // allocate space for derivative 247 | if (!pre_alloc_conv_derivative) { 248 | cur_params->cnmemAllocDerivatives(data_type_size, NULL); 249 | } 250 | 251 | cur_filter_workspace_size = cur_params->bwd_filter_workspace_size; 252 | if (i > 0) 253 | cur_data_workspace_size = cur_params->bwd_data_workspace_size; 254 | else 255 | cur_data_workspace_size = 0; 256 | // std::cout << "bwd cur_workspace_size: " << cur_workspace_size << std::endl; 257 | cur_workspace_size = (cur_filter_workspace_size > cur_data_workspace_size) ? cur_filter_workspace_size : cur_data_workspace_size; 258 | checkCNMEM(cnmemMalloc(&cur_workspace, cur_workspace_size, NULL)); 259 | 260 | } 261 | 262 | else if (layer_type[i] == FULLY_CONNECTED) { 263 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 264 | 265 | if (!pre_alloc_fc_derivative) { 266 | cur_params->cnmemAllocDerivatives(data_type_size, NULL); 267 | } 268 | } 269 | 270 | else if (layer_type[i] == BATCHNORM) { 271 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 272 | 273 | if (!pre_alloc_batch_norm_derivative) { 274 | cur_params->cnmemAllocDerivatives(data_type_size, NULL); 275 | } 276 | } 277 | 278 | 279 | if (!(i + 1 < num_layers && layer_type[i + 1] == SOFTMAX)) 280 | checkCudaErrors(cudaEventRecord(start_compute, stream_compute)); 281 | 282 | if (layer_type[i] == CONV) { 283 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 284 | 285 | if (cur_params->activation_mode != ACTIVATION_NONE) { 286 | checkCUDNN(cudnnActivationBackward(cudnn_handle, cur_params->actv_desc, &alpha, 287 | cur_params->output_tensor, layer_input[i + 1], 288 | cur_params->output_tensor, dlayer_input[i + 1], 289 | cur_params->output_tensor, layer_input[i + 1], 290 | &beta, 291 | cur_params->output_tensor, dlayer_input[i + 1])); 292 | } 293 | 294 | cur_filter_workspace_size = cur_params->bwd_filter_workspace_size; 295 | if (i > 0) 296 | cur_data_workspace_size = cur_params->bwd_data_workspace_size; 297 | else 298 | cur_data_workspace_size = 0; 299 | // std::cout << "bwd cur_workspace_size: " << cur_workspace_size << std::endl; 300 | cur_workspace_size = (cur_filter_workspace_size > cur_data_workspace_size) ? 
cur_filter_workspace_size : cur_data_workspace_size; 301 | 302 | checkCUDNN(cudnnConvolutionBackwardBias(cudnn_handle, &alpha, 303 | cur_params->output_tensor, dlayer_input[i + 1], 304 | &beta, 305 | cur_params->bias_desc, cur_params->db)); 306 | 307 | // std::cout << "neural_net: backward conv i:" << i << std::endl; 308 | 309 | checkCUDNN(cudnnConvolutionBackwardFilter(cudnn_handle, &alpha, 310 | cur_params->input_tensor, layer_input[i], 311 | cur_params->output_tensor, dlayer_input[i + 1], 312 | cur_params->conv_desc, cur_params->bwd_filter_algo, 313 | cur_workspace, cur_workspace_size, 314 | &beta, 315 | cur_params->filter_desc, 316 | cur_params->dW)); 317 | if (i > 0) 318 | checkCUDNN(cudnnConvolutionBackwardData(cudnn_handle, &alpha, 319 | cur_params->filter_desc, cur_params->W, 320 | cur_params->output_tensor, dlayer_input[i + 1], 321 | cur_params->conv_desc, cur_params->bwd_data_algo, 322 | cur_workspace, cur_workspace_size, 323 | &beta, 324 | cur_params->input_tensor, dlayer_input[i])); 325 | 326 | // std::cout << "Free bytes: " << free_bytes << std::endl; 327 | // std::cout << "here\n"; 328 | cur_params->stepParams(cublas_handle, learning_rate); 329 | } 330 | 331 | else if (layer_type[i] == FULLY_CONNECTED) { 332 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 333 | 334 | if (cur_params->activation_mode != ACTIVATION_NONE) { 335 | checkCUDNN(cudnnActivationBackward(cudnn_handle, cur_params->actv_desc, &alpha, 336 | cur_params->output_tensor, layer_input[i + 1], 337 | cur_params->output_tensor, dlayer_input[i + 1], 338 | cur_params->output_tensor, layer_input[i + 1], 339 | &beta, 340 | cur_params->output_tensor, dlayer_input[i + 1])); 341 | } 342 | 343 | if (data_type == CUDNN_DATA_FLOAT) { 344 | // bias backward 345 | checkCUBLAS(cublasSgemm(cublas_handle, 346 | CUBLAS_OP_N, CUBLAS_OP_N, 347 | cur_params->C_out, 1, batch_size, 348 | &Salpha, 349 | (float *)dlayer_input[i + 1], cur_params->C_out, 350 | (float *)one_vec, batch_size, 351 | &Sbeta, 352 | (float *)cur_params->db, cur_params->C_out)); 353 | 354 | // weight backward 355 | checkCUBLAS(cublasSgemm(cublas_handle, 356 | CUBLAS_OP_N, CUBLAS_OP_T, 357 | cur_params->C_out, cur_params->C_in, batch_size, 358 | &Salpha, 359 | (float *)dlayer_input[i + 1], cur_params->C_out, 360 | (float *)layer_input[i], cur_params->C_in, 361 | &Sbeta, 362 | (float *)cur_params->dW, cur_params->C_out)); 363 | 364 | // data backward 365 | if (i > 0) 366 | checkCUBLAS(cublasSgemm(cublas_handle, 367 | CUBLAS_OP_T, CUBLAS_OP_N, 368 | cur_params->C_in, batch_size, cur_params->C_out, 369 | &Salpha, 370 | (float *)cur_params->W, cur_params->C_out, 371 | (float *)dlayer_input[i + 1], cur_params->C_out, 372 | &Sbeta, 373 | (float *)dlayer_input[i], cur_params->C_in)); 374 | } 375 | 376 | else if (data_type == CUDNN_DATA_DOUBLE) { 377 | // bias backward 378 | checkCUBLAS(cublasDgemm(cublas_handle, 379 | CUBLAS_OP_N, CUBLAS_OP_N, 380 | cur_params->C_out, 1, batch_size, 381 | &Dalpha, 382 | (double *)dlayer_input[i + 1], cur_params->C_out, 383 | (double *)one_vec, batch_size, 384 | &Dbeta, 385 | (double *)cur_params->db, cur_params->C_out)); 386 | 387 | // weight backward 388 | checkCUBLAS(cublasDgemm(cublas_handle, 389 | CUBLAS_OP_N, CUBLAS_OP_T, 390 | cur_params->C_out, cur_params->C_in, batch_size, 391 | &Dalpha, 392 | (double *)dlayer_input[i + 1], cur_params->C_out, 393 | (double *)layer_input[i], cur_params->C_in, 394 | &Dbeta, 395 | (double *)cur_params->dW, cur_params->C_out)); 396 | 397 | // data backward 398 | if (i > 0) 399 | 
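// Backward-data GEMM, guarded by (i > 0): the gradient w.r.t. the layer input
// is only needed when a previous layer exists to receive it; for the first
// layer the input is the data batch, so this GEMM is skipped.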
checkCUBLAS(cublasDgemm(cublas_handle, 400 | CUBLAS_OP_T, CUBLAS_OP_N, 401 | cur_params->C_in, batch_size, cur_params->C_out, 402 | &Dalpha, 403 | (double *)cur_params->W, cur_params->C_out, 404 | (double *)dlayer_input[i + 1], cur_params->C_out, 405 | &Dbeta, 406 | (double *)dlayer_input[i], cur_params->C_in)); 407 | } 408 | cur_params->stepParams(cublas_handle, learning_rate); 409 | } 410 | 411 | else if (layer_type[i] == DROPOUT) { 412 | DropoutLayerParams *cur_params = (DropoutLayerParams *)params[i]; 413 | checkCUDNN(cudnnDropoutBackward(cudnn_handle, cur_params->dropout_desc, 414 | cur_params->input_tensor, dlayer_input[i + 1], 415 | cur_params->input_tensor, dlayer_input[i], 416 | cur_params->reserved_space, cur_params->reserved_space_size)); 417 | } 418 | 419 | else if (layer_type[i] == BATCHNORM) { 420 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 421 | 422 | checkCUDNN(cudnnBatchNormalizationBackward(cudnn_handle, cur_params->mode, 423 | &alpha, &beta, 424 | &alpha, &beta, 425 | cur_params->input_tensor, layer_input[i], 426 | cur_params->input_tensor, dlayer_input[i + 1], 427 | cur_params->input_tensor, dlayer_input[i], 428 | cur_params->sbmv_desc, cur_params->scale, 429 | cur_params->dscale, cur_params->dbias, 430 | cur_params->epsilon, 431 | cur_params->result_save_mean, cur_params->result_save_inv_var)); 432 | 433 | cur_params->stepParams(cublas_handle, learning_rate); 434 | } 435 | 436 | else if (layer_type[i] == POOLING) { 437 | PoolingLayerParams *cur_params = (PoolingLayerParams *)params[i]; 438 | checkCUDNN(cudnnPoolingBackward(cudnn_handle, cur_params->pool_desc, &alpha, 439 | cur_params->output_tensor, layer_input[i + 1], 440 | cur_params->output_tensor, dlayer_input[i + 1], 441 | cur_params->input_tensor, layer_input[i], 442 | &beta, 443 | cur_params->input_tensor, dlayer_input[i])); 444 | } 445 | 446 | else if (layer_type[i] == ACTV) { 447 | ActivationLayerParams *cur_params = (ActivationLayerParams *)params[i]; 448 | checkCUDNN(cudnnActivationBackward(cudnn_handle, cur_params->actv_desc, &alpha, 449 | cur_params->input_tensor, layer_input[i + 1], 450 | cur_params->input_tensor, dlayer_input[i + 1], 451 | cur_params->input_tensor, layer_input[i], 452 | &beta, 453 | cur_params->input_tensor, dlayer_input[i])); 454 | continue; 455 | } 456 | 457 | else if (layer_type[i] == SOFTMAX) { 458 | // std::cout << "compute here\n"; 459 | SoftmaxLayerParams *cur_params = (SoftmaxLayerParams *)params[i]; 460 | checkCUDNN(cudnnSoftmaxBackward(cudnn_handle, cur_params->algo, cur_params->mode, &alpha, 461 | cur_params->input_tensor, layer_input[i + 1], 462 | cur_params->input_tensor, dlayer_input[i + 1], 463 | &beta, 464 | cur_params->input_tensor, dlayer_input[i])); 465 | // std::cout << "compute here\n"; 466 | continue; 467 | } 468 | 469 | // ---------------------- vDNN start ---------------------- 470 | 471 | // checkCudaErrors(cudaDeviceSynchronize()); 472 | 473 | checkCudaErrors(cudaEventRecord(stop_compute, stream_compute)); 474 | checkCudaErrors(cudaEventSynchronize(stop_compute)); 475 | float compute_time; 476 | checkCudaErrors(cudaEventElapsedTime(&compute_time, start_compute, stop_compute)); 477 | 478 | bwd_computation_time.insert(bwd_computation_time.begin(), compute_time); 479 | 480 | if (layer_type[i] == CONV) { 481 | checkCNMEM(cnmemFree(cur_workspace, NULL)); 482 | if (!pre_alloc_conv_derivative) { 483 | ConvLayerParams *cur_params = (ConvLayerParams *)params[i]; 484 | cur_params->cnmemFreeDerivatives(NULL); 485 | } 486 | } 487 | else if 
(layer_type[i] == FULLY_CONNECTED) { 488 | if (!pre_alloc_fc_derivative) { 489 | FCLayerParams *cur_params = (FCLayerParams *)params[i]; 490 | cur_params->cnmemFreeDerivatives(NULL); 491 | } 492 | } 493 | else if (layer_type[i] == BATCHNORM) { 494 | if (!pre_alloc_batch_norm_derivative) { 495 | BatchNormLayerParams *cur_params = (BatchNormLayerParams *)params[i]; 496 | cur_params->cnmemFreeDerivatives(NULL); 497 | } 498 | } 499 | 500 | checkCNMEM(cnmemFree(layer_input[i + 1], NULL)); 501 | checkCNMEM(cnmemFree(dlayer_input[i + 1], NULL)); 502 | checkCNMEM(cnmemFree(layer_input[i], NULL)); 503 | if (i > 0 && layer_type[i] != SOFTMAX) 504 | checkCNMEM(cnmemFree(dlayer_input[i], NULL)); 505 | } 506 | } 507 | 508 | 509 | void NeuralNet::getTransferTime(void *X, int *y, double learning_rate, std::vector &fwd_transfer_time, std::vector &bwd_transfer_time) { 510 | for (int i = 0; i < num_layers; i++) { 511 | if (layer_type[i] == SOFTMAX) 512 | continue; 513 | 514 | void *device_data; 515 | void *host_data; 516 | 517 | checkCNMEM(cnmemMalloc(&device_data, layer_input_size[i] * data_type_size, NULL)); 518 | checkCudaErrors(cudaMallocHost(&host_data, layer_input_size[i] * data_type_size)); 519 | 520 | checkCudaErrors(cudaEventRecord(start_transfer, stream_memory)); 521 | 522 | checkCudaErrors(cudaMemcpyAsync(host_data, device_data, layer_input_size[i] * data_type_size, cudaMemcpyDeviceToHost, stream_memory)); 523 | 524 | checkCudaErrors(cudaEventRecord(stop_transfer, stream_memory)); 525 | checkCudaErrors(cudaEventSynchronize(stop_transfer)); 526 | float transfer_time; 527 | checkCudaErrors(cudaEventElapsedTime(&transfer_time, start_transfer, stop_transfer)); 528 | fwd_transfer_time.push_back(transfer_time); 529 | 530 | checkCudaErrors(cudaEventRecord(start_transfer, stream_memory)); 531 | 532 | checkCudaErrors(cudaMemcpyAsync(device_data, host_data, layer_input_size[i] * data_type_size, cudaMemcpyHostToDevice, stream_memory)); 533 | 534 | checkCudaErrors(cudaEventRecord(stop_transfer, stream_memory)); 535 | checkCudaErrors(cudaEventSynchronize(stop_transfer)); 536 | checkCudaErrors(cudaEventElapsedTime(&transfer_time, start_transfer, stop_transfer)); 537 | bwd_transfer_time.push_back(transfer_time); 538 | } 539 | } -------------------------------------------------------------------------------- /src/layer_params.cu: -------------------------------------------------------------------------------- 1 | #include "layer_params.h" 2 | 3 | void ConvLayerParams::initializeValues(cudnnHandle_t cudnn_handle, ConvDescriptor *user_params, cudnnDataType_t data_type, 4 | int batch_size, cudnnTensorFormat_t tensor_format, size_t data_type_size, LayerDimension &output_size, 5 | UpdateRule update_rule) { 6 | // create tensor, filter, conv descriptor 7 | checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor)); 8 | checkCUDNN(cudnnCreateTensorDescriptor(&output_tensor)); 9 | checkCUDNN(cudnnCreateTensorDescriptor(&bias_desc)); 10 | checkCUDNN(cudnnCreateFilterDescriptor(&filter_desc)); 11 | checkCUDNN(cudnnCreateConvolutionDescriptor(&conv_desc)); 12 | 13 | C_in = user_params->input_channels; 14 | C_out = user_params->output_channels; 15 | filter_h = user_params->kernel_h; 16 | filter_w = user_params->kernel_w; 17 | kernel_size = C_out * C_in * filter_h * filter_w; 18 | this->data_type = data_type; 19 | this->activation_mode = user_params->activation_mode; 20 | 21 | checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type, 22 | batch_size, user_params->input_channels, user_params->input_h, 
user_params->input_w)); 23 | 24 | 25 | checkCUDNN(cudnnSetFilter4dDescriptor(filter_desc, data_type, tensor_format, 26 | user_params->output_channels, user_params->input_channels, user_params->kernel_h, user_params->kernel_w)); 27 | 28 | int dilation_h = 1, dilation_w = 1; 29 | checkCUDNN(cudnnSetConvolution2dDescriptor(conv_desc, user_params->pad_h, user_params->pad_w, 30 | user_params->stride_y, user_params->stride_x, 31 | dilation_h, dilation_w, 32 | CUDNN_CROSS_CORRELATION, data_type)); 33 | 34 | int output_batch_size, output_channels, output_h, output_w; 35 | checkCUDNN(cudnnGetConvolution2dForwardOutputDim(conv_desc, input_tensor, filter_desc, 36 | &output_batch_size, &output_channels, &output_h, &output_w)); 37 | 38 | checkCUDNN(cudnnSetTensor4dDescriptor(output_tensor, tensor_format, data_type, 39 | output_batch_size, output_channels, output_h, output_w)); 40 | checkCUDNN(cudnnSetTensor4dDescriptor(bias_desc, tensor_format, data_type, 41 | 1, output_channels, 1, 1)); 42 | 43 | fwd_req_count = 10; 44 | fwd_perf = (cudnnConvolutionFwdAlgoPerf_t *)malloc(fwd_req_count * sizeof(cudnnConvolutionFwdAlgoPerf_t)); 45 | checkCUDNN(cudnnFindConvolutionForwardAlgorithm(cudnn_handle, 46 | input_tensor, filter_desc, conv_desc, output_tensor, 47 | fwd_req_count, &fwd_ret_count, fwd_perf)); 48 | 49 | // std::cout << "Printing forward conv algo perf\n"; 50 | // std::cout << "CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM: " << CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM << std::endl; 51 | // for (int i = 0; i < fwd_ret_count; i++) { 52 | // std::cout << i << std::endl; 53 | // std::cout << "algo: " << fwd_perf[i].algo << std::endl; 54 | // std::cout << "status: " << cudnnGetErrorString(fwd_perf[i].status) << std::endl; 55 | // std::cout << "time(ms): " << fwd_perf[i].time << std::endl; 56 | // std::cout << "memory(MB): " << fwd_perf[i].memory * 1.0 / 1024 / 1024 << std::endl; 57 | // std::cout << "mathType: " << fwd_perf[i].mathType << std::endl; 58 | // std::cout << std::endl; 59 | // } 60 | 61 | bwd_filter_req_count = 10; 62 | bwd_filter_perf = (cudnnConvolutionBwdFilterAlgoPerf_t *)malloc(bwd_filter_req_count * sizeof(cudnnConvolutionBwdFilterAlgoPerf_t)); 63 | checkCUDNN(cudnnFindConvolutionBackwardFilterAlgorithm(cudnn_handle, 64 | input_tensor, output_tensor, conv_desc, filter_desc, 65 | bwd_filter_req_count, &bwd_filter_ret_count, bwd_filter_perf)); 66 | 67 | // std::cout << "Printing bwdfilter conv algo perf\n"; 68 | // std::cout << "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 " << CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 << std::endl; 69 | // for (int i = 0; i < bwd_filter_ret_count; i++) { 70 | // std::cout << i << std::endl; 71 | // std::cout << "algo: " << bwd_filter_perf[i].algo << std::endl; 72 | // std::cout << "status: " << cudnnGetErrorString(bwd_filter_perf[i].status) << std::endl; 73 | // std::cout << "time(ms): " << bwd_filter_perf[i].time << std::endl; 74 | // std::cout << "memory(MB): " << bwd_filter_perf[i].memory * 1.0 / 1024 / 1024 << std::endl; 75 | // std::cout << "mathType: " << bwd_filter_perf[i].mathType << std::endl; 76 | // std::cout << std::endl; 77 | // } 78 | bwd_data_req_count = 10; 79 | bwd_data_perf = (cudnnConvolutionBwdDataAlgoPerf_t *)malloc(bwd_data_req_count * sizeof(cudnnConvolutionBwdDataAlgoPerf_t)); 80 | checkCUDNN(cudnnFindConvolutionBackwardDataAlgorithm(cudnn_handle, 81 | filter_desc, output_tensor, conv_desc, input_tensor, 82 | bwd_data_req_count, &bwd_data_ret_count, bwd_data_perf)); 83 | 84 | // std::cout << "Printing bwddata conv algo perf\n"; 85 | // for (int i = 
0; i < bwd_data_ret_count; i++) { 86 | // std::cout << i << std::endl; 87 | // std::cout << "algo: " << bwd_data_perf[i].algo << std::endl; 88 | // std::cout << "status: " << cudnnGetErrorString(bwd_data_perf[i].status) << std::endl; 89 | // std::cout << "time(ms): " << bwd_data_perf[i].time << std::endl; 90 | // std::cout << "memory(MB): " << bwd_data_perf[i].memory * 1.0 / 1024 / 1024 << std::endl; 91 | // std::cout << "mathType: " << bwd_data_perf[i].mathType << std::endl; 92 | // std::cout << std::endl; 93 | // } 94 | 95 | this->update_rule = update_rule; 96 | 97 | cudnnActivationMode_t mode; 98 | if (activation_mode == SIGMOID) 99 | mode = CUDNN_ACTIVATION_SIGMOID; 100 | else if (activation_mode == RELU) 101 | mode = CUDNN_ACTIVATION_RELU; 102 | else if (activation_mode == TANH) 103 | mode = CUDNN_ACTIVATION_TANH; 104 | else if (activation_mode == CLIPPED_RELU) 105 | mode = CUDNN_ACTIVATION_CLIPPED_RELU; 106 | else if (activation_mode == ELU) 107 | mode = CUDNN_ACTIVATION_ELU; 108 | 109 | if (activation_mode != ACTIVATION_NONE) { 110 | checkCUDNN(cudnnCreateActivationDescriptor(&actv_desc)); 111 | checkCUDNN(cudnnSetActivationDescriptor(actv_desc, mode, CUDNN_PROPAGATE_NAN, user_params->actv_coef)); 112 | } 113 | 114 | output_size.N = output_batch_size, output_size.C = output_channels, output_size.H = output_h, output_size.W = output_w; 115 | 116 | } 117 | 118 | void ConvLayerParams::allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size, 119 | float std_dev, size_t &free_bytes, bool alloc_derivative) { 120 | 121 | if (kernel_size % 2 != 0) 122 | kernel_size += 1; 123 | checkCudaErrors(cudaMalloc(&W, kernel_size * data_type_size)); 124 | checkCudaErrors(cudaMalloc(&b, C_out * data_type_size)); 125 | 126 | if (alloc_derivative) { 127 | checkCudaErrors(cudaMalloc(&dW, kernel_size * data_type_size)); 128 | checkCudaErrors(cudaMalloc(&db, C_out * data_type_size)); 129 | } 130 | 131 | if (data_type == CUDNN_DATA_FLOAT) { 132 | checkCURAND(curandGenerateNormal(curand_gen, (float *)W, kernel_size, 0, std_dev)); 133 | fillValue<<>>((float *)b, C_out, 0); 134 | } 135 | else { 136 | checkCURAND(curandGenerateNormalDouble(curand_gen, (double *)W, kernel_size, 0, std_dev)); 137 | fillValue<<>>((double *)b, C_out, 0); 138 | } 139 | 140 | free_bytes = free_bytes - 2 * (kernel_size + C_out) * data_type_size; 141 | 142 | } 143 | 144 | void ConvLayerParams::cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream) { 145 | checkCNMEM(cnmemMalloc(&dW, kernel_size * data_type_size, stream)); 146 | checkCNMEM(cnmemMalloc(&db, C_out * data_type_size, stream)); 147 | } 148 | 149 | bool ConvLayerParams::cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream, 150 | size_t &max_consume, size_t free_bytes, bool &out_of_memory) { 151 | checkCNMEMSim(cnmemMalloc(&dW, kernel_size * data_type_size, stream), 152 | kernel_size * data_type_size, max_consume, free_bytes, return false, out_of_memory); 153 | checkCNMEMSim(cnmemMalloc(&db, C_out * data_type_size, stream), 154 | C_out * data_type_size, max_consume, free_bytes, return false, out_of_memory); 155 | 156 | return true; 157 | } 158 | 159 | void ConvLayerParams::stepParams(cublasHandle_t cublas_handle, double learning_rate) { 160 | float Salpha = -learning_rate; 161 | double Dalpha = -learning_rate; 162 | 163 | if (update_rule == SGD) { 164 | if (data_type == CUDNN_DATA_FLOAT) { 165 | checkCUBLAS(cublasSaxpy(cublas_handle, kernel_size, 166 | &Salpha, 167 | (float *)dW, 1, 168 | (float *)W, 1)); 169 | 
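// Vanilla SGD step expressed as BLAS axpy: W <- W + (-learning_rate) * dW
// above and b <- b + (-learning_rate) * db below. Salpha / Dalpha were set to
// -learning_rate, so a single cublasSaxpy / cublasDaxpy per parameter tensor
// performs the whole update.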
170 | checkCUBLAS(cublasSaxpy(cublas_handle, C_out, 171 | &Salpha, 172 | (float *)db, 1, 173 | (float *)b, 1)); 174 | } 175 | else if (data_type == CUDNN_DATA_DOUBLE) { 176 | checkCUBLAS(cublasDaxpy(cublas_handle, kernel_size, 177 | &Dalpha, 178 | (double *)dW, 1, 179 | (double *)W, 1)); 180 | 181 | checkCUBLAS(cublasDaxpy(cublas_handle, C_out, 182 | &Dalpha, 183 | (double *)db, 1, 184 | (double *)b, 1)); 185 | } 186 | } 187 | } 188 | 189 | void ConvLayerParams::cnmemFreeDerivatives(cudaStream_t stream) { 190 | checkCNMEM(cnmemFree(dW, stream)); 191 | checkCNMEM(cnmemFree(db, stream)); 192 | } 193 | 194 | size_t ConvLayerParams::getWorkspaceSize(size_t &free_bytes, ConvLayerParams::ConvDirection conv_direction, vDNNConvAlgo vdnn_conv_algo) { 195 | if (vdnn_conv_algo == vDNN_PERFORMANCE_OPTIMAL) { 196 | if (conv_direction == FWD) { 197 | if (fwd_perf[0].memory > free_bytes) 198 | outOfMemory(); 199 | fwd_algo = fwd_perf[0].algo; 200 | return fwd_perf[0].memory; 201 | } 202 | else if (conv_direction == BWD_FILTER) { 203 | if (bwd_filter_perf[0].memory > free_bytes) 204 | outOfMemory(); 205 | bwd_filter_algo = bwd_filter_perf[0].algo; 206 | return bwd_filter_perf[0].memory; 207 | } 208 | else if (conv_direction == BWD_DATA) { 209 | if (bwd_data_perf[0].memory > free_bytes) 210 | outOfMemory(); 211 | bwd_data_algo = bwd_data_perf[0].algo; 212 | return bwd_data_perf[0].memory; 213 | } 214 | } 215 | else if (vdnn_conv_algo == vDNN_MEMORY_OPTIMAL) { 216 | if (conv_direction == FWD) { 217 | for (int i = 0; i < fwd_ret_count; i++) { 218 | if (fwd_perf[i].algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM && fwd_perf[i].status == CUDNN_STATUS_SUCCESS && 219 | fwd_perf[i].memory < free_bytes) { 220 | fwd_algo = fwd_perf[i].algo; 221 | return fwd_perf[i].memory; 222 | } 223 | } 224 | } 225 | else if (conv_direction == BWD_FILTER) { 226 | for (int i = 0; i < bwd_filter_ret_count; i++) { 227 | if (bwd_filter_perf[i].algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 && bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS && 228 | bwd_filter_perf[i].memory < free_bytes) { 229 | bwd_filter_algo = bwd_filter_perf[i].algo; 230 | // std::cout << "Free bytes " << free_bytes << std::endl; 231 | // std::cout << "bwd_filter_perf[i].memory " << bwd_filter_perf[i].memory << std::endl; 232 | return bwd_filter_perf[i].memory; 233 | } 234 | } 235 | } 236 | else if (conv_direction == BWD_DATA) { 237 | for (int i = 0; i < bwd_data_ret_count; i++) { 238 | if (bwd_data_perf[i].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 && bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS && 239 | bwd_data_perf[i].memory < free_bytes) { 240 | bwd_data_algo = bwd_data_perf[i].algo; 241 | return bwd_data_perf[i].memory; 242 | } 243 | } 244 | } 245 | std::cout << "Error in getWorkspaceSize" << std::endl; 246 | exit(0); 247 | } 248 | return 0; 249 | } 250 | 251 | workspaceStatus_t ConvLayerParams::getWorkspaceSize(size_t &free_bytes, ConvLayerParams::ConvDirection conv_direction, vDNNConvAlgoPref algo_pref, 252 | bool hard_pref, size_t &workspace_size) { 253 | if (hard_pref) { 254 | if (algo_pref == PREFER_PERFORMANCE_OPTIMAL) { 255 | if (conv_direction == FWD) { 256 | if (fwd_perf[0].memory > free_bytes && fwd_perf[0].status == CUDNN_STATUS_SUCCESS) 257 | return WORKSPACE_STATUS_OUT_OF_MEMORY; 258 | fwd_algo = fwd_perf[0].algo; 259 | fwd_workspace_size = fwd_perf[0].memory; 260 | workspace_size = fwd_workspace_size; 261 | return WORKSPACE_STATUS_SUCCESS; 262 | } 263 | else if (conv_direction == BWD_FILTER) { 264 | if (bwd_filter_perf[0].memory > free_bytes 
workspaceStatus_t ConvLayerParams::getWorkspaceSize(size_t &free_bytes, ConvLayerParams::ConvDirection conv_direction, vDNNConvAlgoPref algo_pref,
                                                    bool hard_pref, size_t &workspace_size) {
    if (hard_pref) {
        if (algo_pref == PREFER_PERFORMANCE_OPTIMAL) {
            if (conv_direction == FWD) {
                if (fwd_perf[0].memory > free_bytes && fwd_perf[0].status == CUDNN_STATUS_SUCCESS)
                    return WORKSPACE_STATUS_OUT_OF_MEMORY;
                fwd_algo = fwd_perf[0].algo;
                fwd_workspace_size = fwd_perf[0].memory;
                workspace_size = fwd_workspace_size;
                return WORKSPACE_STATUS_SUCCESS;
            }
            else if (conv_direction == BWD_FILTER) {
                if (bwd_filter_perf[0].memory > free_bytes && bwd_filter_perf[0].status == CUDNN_STATUS_SUCCESS)
                    return WORKSPACE_STATUS_OUT_OF_MEMORY;
                bwd_filter_algo = bwd_filter_perf[0].algo;
                bwd_filter_workspace_size = bwd_filter_perf[0].memory;
                workspace_size = bwd_filter_workspace_size;
                return WORKSPACE_STATUS_SUCCESS;
            }
            else if (conv_direction == BWD_DATA) {
                if (bwd_data_perf[0].memory > free_bytes && bwd_data_perf[0].status == CUDNN_STATUS_SUCCESS)
                    return WORKSPACE_STATUS_OUT_OF_MEMORY;
                bwd_data_algo = bwd_data_perf[0].algo;
                bwd_data_workspace_size = bwd_data_perf[0].memory;
                workspace_size = bwd_data_workspace_size;
                return WORKSPACE_STATUS_SUCCESS;
            }
        }
        else if (algo_pref == PREFER_MEMORY_OPTIMAL) {
            if (conv_direction == FWD) {
                for (int i = 0; i < fwd_ret_count; i++) {
                    if (fwd_perf[i].algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)
                        if (fwd_perf[i].memory < free_bytes && fwd_perf[i].status == CUDNN_STATUS_SUCCESS) {
                            fwd_algo = fwd_perf[i].algo;
                            fwd_workspace_size = fwd_perf[i].memory;
                            workspace_size = fwd_workspace_size;
                            return WORKSPACE_STATUS_SUCCESS;
                        }
                        else
                            return WORKSPACE_STATUS_OUT_OF_MEMORY;
                }
            }
            else if (conv_direction == BWD_FILTER) {
                for (int i = 0; i < bwd_filter_ret_count; i++) {
                    if (bwd_filter_perf[i].algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1)
                        if (bwd_filter_perf[i].memory < free_bytes && bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS) {
                            bwd_filter_algo = bwd_filter_perf[i].algo;
                            // std::cout << "Free bytes " << free_bytes << std::endl;
                            // std::cout << "bwd_filter_perf[i].memory " << bwd_filter_perf[i].memory << std::endl;
                            bwd_filter_workspace_size = bwd_filter_perf[i].memory;
                            workspace_size = bwd_filter_workspace_size;
                            return WORKSPACE_STATUS_SUCCESS;
                        }
                        else
                            return WORKSPACE_STATUS_OUT_OF_MEMORY;
                }
            }
            else if (conv_direction == BWD_DATA) {
                for (int i = 0; i < bwd_data_ret_count; i++) {
                    if (bwd_data_perf[i].algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1)
                        if (bwd_data_perf[i].memory < free_bytes && bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS) {
                            bwd_data_algo = bwd_data_perf[i].algo;
                            bwd_data_workspace_size = bwd_data_perf[i].memory;
                            workspace_size = bwd_data_workspace_size;
                            return WORKSPACE_STATUS_SUCCESS;
                        }
                        else
                            return WORKSPACE_STATUS_OUT_OF_MEMORY;
                }
            }
        }
    }
    else {
        // only performance optimal is possible
        if (algo_pref == PREFER_PERFORMANCE_OPTIMAL) {
            if (conv_direction == FWD) {
                for (int i = 0; i < fwd_ret_count; i++) {
                    if (fwd_perf[i].memory < free_bytes && fwd_perf[i].status == CUDNN_STATUS_SUCCESS) {
                        fwd_algo = fwd_perf[i].algo;
                        fwd_workspace_size = fwd_perf[i].memory;
                        workspace_size = fwd_workspace_size;
                        return WORKSPACE_STATUS_SUCCESS;
                    }
                }
            }
            else if (conv_direction == BWD_FILTER) {
                for (int i = 0; i < bwd_filter_ret_count; i++) {
                    if (bwd_filter_perf[i].memory < free_bytes && bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS) {
                        bwd_filter_algo = bwd_filter_perf[i].algo;
                        // std::cout << "Free bytes " << free_bytes << std::endl;
                        // std::cout << "bwd_filter_perf[i].memory " << bwd_filter_perf[i].memory << std::endl;
                        bwd_filter_workspace_size = bwd_filter_perf[i].memory;
                        workspace_size = bwd_filter_workspace_size;
                        return WORKSPACE_STATUS_SUCCESS;
                    }
                }
            }
            else if (conv_direction == BWD_DATA) {
                for (int i = 0; i < bwd_data_ret_count; i++) {
                    if (bwd_data_perf[i].memory < free_bytes && bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS) {
                        bwd_data_algo = bwd_data_perf[i].algo;
                        bwd_data_workspace_size = bwd_data_perf[i].memory;
                        workspace_size = bwd_data_workspace_size;
                        return WORKSPACE_STATUS_SUCCESS;
                    }
                }
            }
        }
    }
    return WORKSPACE_STATUS_OUT_OF_MEMORY;
}
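// Illustrative call sequence (hypothetical caller code, not part of this file; it
// assumes the cudnnFind*Algorithm results in fwd_perf / fwd_ret_count have already
// been collected when the layer was set up):
//
//   size_t workspace_size;
//   workspaceStatus_t ws_status =
//       conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                     PREFER_PERFORMANCE_OPTIMAL, true /* hard_pref */,
//                                     workspace_size);
//   if (ws_status != WORKSPACE_STATUS_SUCCESS)
//       ws_status = conv_params->getWorkspaceSize(free_bytes, ConvLayerParams::FWD,
//                                                 PREFER_MEMORY_OPTIMAL, true, workspace_size);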
void FCLayerParams::initializeValues(FCDescriptor *user_params, int batch_size, cudnnTensorFormat_t tensor_format, cudnnDataType_t data_type,
                                     LayerDimension &output_size, UpdateRule update_rule) {
    C_in = user_params->input_channels;
    C_out = user_params->output_channels;
    weight_matrix_size = C_in * C_out;
    this->data_type = data_type;
    this->activation_mode = user_params->activation_mode;

    this->update_rule = update_rule;

    cudnnActivationMode_t mode;
    if (activation_mode == SIGMOID)
        mode = CUDNN_ACTIVATION_SIGMOID;
    else if (activation_mode == RELU)
        mode = CUDNN_ACTIVATION_RELU;
    else if (activation_mode == TANH)
        mode = CUDNN_ACTIVATION_TANH;
    else if (activation_mode == CLIPPED_RELU)
        mode = CUDNN_ACTIVATION_CLIPPED_RELU;
    else if (activation_mode == ELU)
        mode = CUDNN_ACTIVATION_ELU;

    if (activation_mode != ACTIVATION_NONE) {
        checkCUDNN(cudnnCreateActivationDescriptor(&actv_desc));
        checkCUDNN(cudnnSetActivationDescriptor(actv_desc, mode, CUDNN_PROPAGATE_NAN, user_params->actv_coef));
        checkCUDNN(cudnnCreateTensorDescriptor(&output_tensor));
        checkCUDNN(cudnnSetTensor4dDescriptor(output_tensor, tensor_format, data_type,
                                              batch_size, user_params->output_channels, 1, 1));
    }

    output_size.N = batch_size, output_size.C = C_out, output_size.H = output_size.W = 1;
}

void FCLayerParams::allocateSpace(curandGenerator_t curand_gen, cudnnDataType_t data_type, size_t data_type_size,
                                  float std_dev, size_t &free_bytes, bool alloc_derivative) {
    // round the weight count up to an even number of elements before drawing the
    // normally distributed initial values
    int wt_alloc_size = weight_matrix_size;
    if (wt_alloc_size % 2 != 0)
        wt_alloc_size += 1;
    checkCudaErrors(cudaMalloc(&W, wt_alloc_size * data_type_size));
    checkCudaErrors(cudaMalloc(&b, C_out * data_type_size));
    if (alloc_derivative) {
        checkCudaErrors(cudaMalloc(&dW, wt_alloc_size * data_type_size));
        checkCudaErrors(cudaMalloc(&db, C_out * data_type_size));
    }

    // NOTE: the fillValue launch configurations were garbled in this copy of the file;
    // the <<<grid, block>>> arguments below are a reconstruction that assumes the
    // block-size constant BW from utils.h
    if (data_type == CUDNN_DATA_FLOAT) {
        checkCURAND(curandGenerateNormal(curand_gen, (float *)W, wt_alloc_size, 0, std_dev));
        fillValue<float><<<ceil(1.0 * C_out / BW), BW>>>((float *)b, C_out, 0);
    }
    else if (data_type == CUDNN_DATA_DOUBLE) {
        checkCURAND(curandGenerateNormalDouble(curand_gen, (double *)W, wt_alloc_size, 0, std_dev));
        fillValue<double><<<ceil(1.0 * C_out / BW), BW>>>((double *)b, C_out, 0);
    }
    free_bytes = free_bytes - 2 * (C_in * C_out + C_out) * data_type_size;
}

void FCLayerParams::cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream) {
    checkCNMEM(cnmemMalloc(&dW, weight_matrix_size * data_type_size, stream));
    checkCNMEM(cnmemMalloc(&db, C_out * data_type_size, stream));
}

bool FCLayerParams::cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream,
                                               size_t &max_consume, size_t free_bytes, bool &out_of_memory) {
    checkCNMEMSim(cnmemMalloc(&dW, weight_matrix_size * data_type_size, stream),
                  weight_matrix_size * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    checkCNMEMSim(cnmemMalloc(&db, C_out * data_type_size, stream),
                  C_out * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    return true;
}
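// stepParams below applies one plain SGD step with cuBLAS axpy:
//   W <- W - learning_rate * dW,   b <- b - learning_rate * db
// (the negated learning rate is passed as the axpy alpha).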
void FCLayerParams::stepParams(cublasHandle_t cublas_handle, double learning_rate) {
    float Salpha = -learning_rate;
    double Dalpha = -learning_rate;

    // {
    // 	float *db_h = (float *)malloc(C_out * sizeof(float));
    // 	checkCudaErrors(cudaMemcpy(db_h, db, C_out * sizeof(float), cudaMemcpyDeviceToHost));
    // 	for (int i = 0; i < C_out; i++) {
    // 		std::cout << db_h[i] << ' ';
    // 	}
    // 	std::cout << "\n";
    // 	int n;
    // 	std::cin >> n;
    // }

    if (update_rule == SGD) {
        if (data_type == CUDNN_DATA_FLOAT) {
            checkCUBLAS(cublasSaxpy(cublas_handle, weight_matrix_size,
                                    &Salpha,
                                    (float *)dW, 1,
                                    (float *)W, 1));

            checkCUBLAS(cublasSaxpy(cublas_handle, C_out,
                                    &Salpha,
                                    (float *)db, 1,
                                    (float *)b, 1));
        }
        else if (data_type == CUDNN_DATA_DOUBLE) {
            checkCUBLAS(cublasDaxpy(cublas_handle, weight_matrix_size,
                                    &Dalpha,
                                    (double *)dW, 1,
                                    (double *)W, 1));

            checkCUBLAS(cublasDaxpy(cublas_handle, C_out,
                                    &Dalpha,
                                    (double *)db, 1,
                                    (double *)b, 1));
        }
    }
    // {
    // 	float *db_h = (float *)malloc(C_out * sizeof(float));
    // 	checkCudaErrors(cudaMemcpy(db_h, b, C_out * sizeof(float), cudaMemcpyDeviceToHost));
    // 	for (int i = 0; i < C_out; i++) {
    // 		std::cout << db_h[i] << ' ';
    // 	}
    // 	std::cout << "\n";
    // 	int n;
    // 	std::cin >> n;
    // }
}

void FCLayerParams::cnmemFreeDerivatives(cudaStream_t stream) {
    checkCNMEM(cnmemFree(dW, stream));
    checkCNMEM(cnmemFree(db, stream));
}

void DropoutLayerParams::initializeValues(cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, cudnnDataType_t data_type, int batch_size,
                                          cudnnTensorFormat_t tensor_format, LayerDimension &output_size) {
    checkCUDNN(cudnnCreateDropoutDescriptor(&dropout_desc));
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    checkCUDNN(cudnnDropoutGetStatesSize(cudnn_handle, &state_size));

    checkCUDNN(cudnnDropoutGetReserveSpaceSize(input_tensor, &reserved_space_size));

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void DropoutLayerParams::allocateSpace(size_t &free_bytes, cudnnHandle_t cudnn_handle, DropoutDescriptor *user_params, long long seed) {
    checkCudaErrors(cudaMalloc(&state, state_size));
    checkCudaErrors(cudaMalloc(&reserved_space, reserved_space_size));
    checkCUDNN(cudnnSetDropoutDescriptor(dropout_desc, cudnn_handle, user_params->dropout_value, state, state_size, seed));

    free_bytes = free_bytes - (state_size + reserved_space_size);
}
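// BatchNormLayerParams: with CUDNN_BATCHNORM_PER_ACTIVATION the scale/bias/mean/variance
// tensors have one entry per activation (C*H*W); with CUDNN_BATCHNORM_SPATIAL they have
// one entry per channel (C). initializeValues records this count as sbmv_size /
// allocation_size for the allocations and updates that follow.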
void BatchNormLayerParams::initializeValues(BatchNormDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format,
                                            int batch_size, LayerDimension &output_size, UpdateRule update_rule) {
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));
    checkCUDNN(cudnnCreateTensorDescriptor(&sbmv_desc));
    c = user_params->channels, h = user_params->h, w = user_params->w;
    if (user_params->mode == BATCHNORM_PER_ACTIVATION) {
        mode = CUDNN_BATCHNORM_PER_ACTIVATION;
        checkCUDNN(cudnnSetTensor4dDescriptor(sbmv_desc, tensor_format, data_type,
                                              1, user_params->channels, user_params->h, user_params->w));
        sbmv_size = c * h * w;
    }
    else if (user_params->mode == BATCHNORM_SPATIAL) {
        mode = CUDNN_BATCHNORM_SPATIAL;
        checkCUDNN(cudnnSetTensor4dDescriptor(sbmv_desc, tensor_format, data_type,
                                              1, user_params->channels, 1, 1));
        sbmv_size = c;
    }

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    factor = user_params->factor;
    epsilon = user_params->epsilon;

    this->update_rule = update_rule;
    this->data_type = data_type;

    if (mode == CUDNN_BATCHNORM_PER_ACTIVATION)
        allocation_size = c * h * w;
    else
        allocation_size = c;

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void BatchNormLayerParams::allocateSpace(cudnnDataType_t data_type, size_t data_type_size, size_t &free_bytes, bool alloc_derivative) {

    size_t allocation_size_bytes = allocation_size * data_type_size;

    checkCudaErrors(cudaMalloc(&scale, allocation_size_bytes));
    checkCudaErrors(cudaMalloc(&bias, allocation_size_bytes));
    if (alloc_derivative) {
        checkCudaErrors(cudaMalloc(&dscale, allocation_size_bytes));
        checkCudaErrors(cudaMalloc(&dbias, allocation_size_bytes));
    }

    checkCudaErrors(cudaMalloc(&running_mean, allocation_size_bytes));
    checkCudaErrors(cudaMalloc(&running_variance, allocation_size_bytes));

    checkCudaErrors(cudaMalloc(&result_save_mean, allocation_size_bytes));
    checkCudaErrors(cudaMalloc(&result_save_inv_var, allocation_size_bytes));

    // NOTE: the fillValue launch configurations were garbled in this copy of the file;
    // the <<<grid, block>>> arguments below are a reconstruction that assumes the
    // block-size constant BW from utils.h
    if (data_type == CUDNN_DATA_FLOAT) {
        fillValue<float><<<ceil(1.0 * allocation_size / BW), BW>>>((float *)scale, allocation_size, 1);
        fillValue<float><<<ceil(1.0 * allocation_size / BW), BW>>>((float *)bias, allocation_size, 1);
    }
    else if (data_type == CUDNN_DATA_DOUBLE) {
        fillValue<double><<<ceil(1.0 * allocation_size / BW), BW>>>((double *)scale, allocation_size, 1);
        fillValue<double><<<ceil(1.0 * allocation_size / BW), BW>>>((double *)bias, allocation_size, 1);
    }
    free_bytes = free_bytes - 6 * allocation_size_bytes;

}

void BatchNormLayerParams::cnmemAllocDerivatives(size_t data_type_size, cudaStream_t stream) {
    checkCNMEM(cnmemMalloc(&dscale, allocation_size * data_type_size, stream));
    checkCNMEM(cnmemMalloc(&dbias, allocation_size * data_type_size, stream));
}

bool BatchNormLayerParams::cnmemAllocDerivativesCheck(size_t data_type_size, cudaStream_t stream,
                                                      size_t &max_consume, size_t free_bytes, bool &out_of_memory) {
    checkCNMEMSim(cnmemMalloc(&dscale, allocation_size * data_type_size, stream),
                  allocation_size * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    checkCNMEMSim(cnmemMalloc(&dbias, allocation_size * data_type_size, stream),
                  allocation_size * data_type_size, max_consume, free_bytes, return false, out_of_memory);
    return true;
}
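// stepParams below mirrors the convolution/FC update: one SGD axpy step on the
// per-channel (or per-activation) scale and bias vectors.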
void BatchNormLayerParams::stepParams(cublasHandle_t cublas_handle, double learning_rate) {
    float Salpha = -learning_rate;
    double Dalpha = -learning_rate;

    if (update_rule == SGD) {
        if (data_type == CUDNN_DATA_FLOAT) {
            checkCUBLAS(cublasSaxpy(cublas_handle, sbmv_size,
                                    &Salpha,
                                    (float *)dscale, 1,
                                    (float *)scale, 1));
            checkCUBLAS(cublasSaxpy(cublas_handle, sbmv_size,
                                    &Salpha,
                                    (float *)dbias, 1,
                                    (float *)bias, 1));
        }
        else if (data_type == CUDNN_DATA_DOUBLE) {
            checkCUBLAS(cublasDaxpy(cublas_handle, sbmv_size,
                                    &Dalpha,
                                    (double *)dscale, 1,
                                    (double *)scale, 1));
            checkCUBLAS(cublasDaxpy(cublas_handle, sbmv_size,
                                    &Dalpha,
                                    (double *)dbias, 1,
                                    (double *)bias, 1));
        }
    }
}

void BatchNormLayerParams::cnmemFreeDerivatives(cudaStream_t stream) {
    checkCNMEM(cnmemFree(dscale, stream));
    checkCNMEM(cnmemFree(dbias, stream));
}

void PoolingLayerParams::initializeValues(PoolingDescriptor *user_params, cudnnDataType_t data_type, cudnnTensorFormat_t tensor_format,
                                          int batch_size, LayerDimension &output_size) {
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));
    checkCUDNN(cudnnCreateTensorDescriptor(&output_tensor));

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->input_channels, user_params->input_h, user_params->input_w));

    checkCUDNN(cudnnCreatePoolingDescriptor(&pool_desc));

    cudnnPoolingMode_t mode;
    if (user_params->mode == POOLING_MAX)
        mode = CUDNN_POOLING_MAX;
    else if (user_params->mode == POOLING_AVERAGE_COUNT_INCLUDE_PADDING)
        mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    else if (user_params->mode == POOLING_AVERAGE_COUNT_EXCLUDE_PADDING)
        mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;

    checkCUDNN(cudnnSetPooling2dDescriptor(pool_desc, mode, CUDNN_PROPAGATE_NAN,
                                           user_params->kernel_h, user_params->kernel_w,
                                           user_params->pad_h, user_params->pad_w,
                                           user_params->stride_y, user_params->stride_x));

    int output_batch_size, output_channels, output_h, output_w;
    checkCUDNN(cudnnGetPooling2dForwardOutputDim(pool_desc, input_tensor,
                                                 &output_batch_size, &output_channels, &output_h, &output_w));

    checkCUDNN(cudnnSetTensor4dDescriptor(output_tensor, tensor_format, data_type,
                                          output_batch_size, output_channels, output_h, output_w));

    output_size.N = output_batch_size, output_size.C = output_channels, output_size.H = output_h, output_size.W = output_w;
}

void PoolingLayerParams::allocateSpace(size_t &free_bytes) {

}
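// The remaining layer types (activation, softmax) carry no trainable parameters, so
// their allocateSpace methods are intentionally empty; initializeValues only builds
// the cuDNN descriptors and reports the (unchanged) output dimensions.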
void ActivationLayerParams::initializeValues(ActivationDescriptor *user_params, cudnnDataType_t data_type,
                                             cudnnTensorFormat_t tensor_format, int batch_size, LayerDimension &output_size) {
    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));

    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    cudnnActivationMode_t mode;
    if (user_params->mode == SIGMOID)
        mode = CUDNN_ACTIVATION_SIGMOID;
    else if (user_params->mode == RELU)
        mode = CUDNN_ACTIVATION_RELU;
    else if (user_params->mode == TANH)
        mode = CUDNN_ACTIVATION_TANH;
    else if (user_params->mode == CLIPPED_RELU)
        mode = CUDNN_ACTIVATION_CLIPPED_RELU;
    else if (user_params->mode == ELU)
        mode = CUDNN_ACTIVATION_ELU;

    checkCUDNN(cudnnCreateActivationDescriptor(&actv_desc));
    checkCUDNN(cudnnSetActivationDescriptor(actv_desc, mode, CUDNN_PROPAGATE_NAN, user_params->coef));

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void ActivationLayerParams::allocateSpace(size_t &free_bytes) {

}

void SoftmaxLayerParams::initializeValues(SoftmaxDescriptor *user_params, cudnnDataType_t data_type,
                                          cudnnTensorFormat_t tensor_format, int batch_size, LayerDimension &output_size) {
    if (user_params->algo == SOFTMAX_FAST)
        algo = CUDNN_SOFTMAX_FAST;
    else if (user_params->algo == SOFTMAX_ACCURATE)
        algo = CUDNN_SOFTMAX_ACCURATE;

    if (user_params->mode == SOFTMAX_MODE_INSTANCE)
        mode = CUDNN_SOFTMAX_MODE_INSTANCE;
    else if (user_params->mode == SOFTMAX_MODE_CHANNEL) {
        mode = CUDNN_SOFTMAX_MODE_CHANNEL;
    }

    checkCUDNN(cudnnCreateTensorDescriptor(&input_tensor));
    checkCUDNN(cudnnSetTensor4dDescriptor(input_tensor, tensor_format, data_type,
                                          batch_size, user_params->channels, user_params->h, user_params->w));

    output_size.N = batch_size, output_size.C = user_params->channels, output_size.H = user_params->h, output_size.W = user_params->w;
}

void SoftmaxLayerParams::allocateSpace(size_t &free_bytes) {

}
--------------------------------------------------------------------------------