├── .gitignore ├── src ├── Makefile ├── single_layer_LSTM.cpp ├── LSTM.h ├── GRU_double.h ├── single_layer_GRU_double.cpp ├── single_layer_GRU_single.cpp ├── GRU_single.h ├── misc.h ├── RNNBase.h ├── LSTM.cu ├── GRU_single.cu └── GRU_double.cu ├── README.md ├── License.txt └── performance_model └── heuristic.py /.gitignore: -------------------------------------------------------------------------------- 1 | src/bin/* 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # Compiler parameters 2 | CXX := nvcc 3 | CPPFLAGS := -O3 -std=c++11 4 | CUDAFLAGS := -arch=compute_70 -code=sm_70 -D_FORCE_INLINES --ptxas-options='-v -warn-lmem-usage -warn-spills' --nvlink-options='-v' 5 | DEBUGFLAGS := -D DEBUG 6 | 7 | ### Regular compilation rules 8 | bin/LSTM.o: LSTM.cu LSTM.h RNNBase.h misc.h 9 | $(CXX) -c $< -o $@ $(CPPFLAGS) $(CUDAFLAGS) 10 | 11 | LSTM: single_layer_LSTM.cpp LSTM.h RNNBase.h misc.h bin/LSTM.o 12 | $(CXX) $< bin/LSTM.o -o bin/$@ $(CPPFLAGS) $(CUDAFLAGS) 13 | 14 | bin/GRU_single.o: GRU_single.cu GRU_single.h RNNBase.h misc.h 15 | $(CXX) -c $< -o $@ $(CPPFLAGS) $(CUDAFLAGS) 16 | 17 | GRU_single: single_layer_GRU_single.cpp GRU_single.h RNNBase.h misc.h bin/GRU_single.o 18 | $(CXX) $< bin/GRU_single.o -o bin/$@ $(CPPFLAGS) $(CUDAFLAGS) 19 | 20 | bin/GRU_double.o: GRU_double.cu GRU_double.h RNNBase.h misc.h 21 | $(CXX) -c $< -o $@ $(CPPFLAGS) $(CUDAFLAGS) 22 | 23 | GRU_double: single_layer_GRU_double.cpp GRU_double.h RNNBase.h misc.h bin/GRU_double.o 24 | $(CXX) $< bin/GRU_double.o -o bin/$@ $(CPPFLAGS) $(CUDAFLAGS) 25 | 26 | clean: 27 | rm bin/* 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GRNN 2 | ## Framework Structure 3 | ### Inference Workflow 4 | In GRNN, models are constructed individually by layer (See caveats in Design Problems) and then incorporated into a Model wrapper that holds collections of layers. Once the model has been defined with its given parameters, tiling parameters are passed to the model and the model is initialized. Layer initialization reorganizes the weight columns to improve locality of gate outputs, transposes the hidden state matrix, and sends these matrices to device memory. Model initialization allocates input, output, and intermediate buffers (this also feeds into design problems) as well as providing the known kernel parameters at this point. 5 | At this point, inputs can be fed to the network. The model assumes a maximum sequence length (needs additional robustness) but otherwise is not constrained by the sequence of the provided batch. The batch elements are assumed to have the same sequence length. 6 | 7 | ## Kernels 8 | The kernels for all cells/types follow the same broad three step process: 9 | 1. Buffer initialization - Initialize arrays in the register file and shared memory for the hidden/cell state and trained parameters. 10 | 2. Data Loading - Load weights and biases to register file, initialize shared cell/hidden states, and calculate offsets into the precompute array. 11 | 3. Recurrent Computation - varies based on cell structure. 
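A condensed usage sketch of the inference workflow, adapted from `src/single_layer_LSTM.cpp` (the sizes and tiling parameters mirror that driver, and the `float` specialization matches the explicit instantiations at the bottom of `src/LSTM.cu`):

```cpp
// Single-layer LSTM inference, condensed from src/single_layer_LSTM.cpp.
#include "LSTM.h"
#include "misc.h"

#include <cuda_runtime.h>
#include <iostream>
#include <vector>

int main() {
    uint32_t input = 256, hidden = 256, batch = 40, seq_len = 100;

    // Stand-in for trained weights: 4 input matrices, 4 hidden matrices, 4 bias vectors
    std::vector<float *> weights;
    create_dummy_weights_lstm(weights, input, hidden);

    LSTMLayer<float> layer(input, hidden, batch, weights);
    LSTMModel<float> model({layer});

    // Tiling parameters; in practice these come from the performance model below
    model.set_configuration(/*x_tile*/ 2, /*y_tile*/ 4, /*num_groups*/ 64, /*reduction_width*/ 8);
    model.initialize();  // packs/transposes weights, copies them to the device, allocates buffers

    // Pinned host input of batch * input * seq_len floats (see the driver for the layout)
    float *x;
    cudaHostAlloc((void **)&x, sizeof(float) * batch * input * seq_len, cudaHostAllocDefault);

    float ms = model.run_input(x, &seq_len);  // GEMM precompute + recurrent kernel
    std::cout << ms << " ms\n";
    return 0;
}
```

`run_input` returns the elapsed kernel time in milliseconds, which is what the benchmark drivers average over repeated runs.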
12 | 13 | ## Performance Model 14 | The performance model takes in model parameters, permutes the parameters to build the configuration space, prunes based on configuration feasibility, and then ranks based on the four-part performance model. 15 | 16 | -------------------------------------------------------------------------------- /License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 Colorado School of Mines. All rights reserved. 2 | 3 | 4 | Developed by: Connor Holmes 5 | Colorado School of Mines 6 | cs.mines.edu 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of 9 | this software and associated documentation files (the "Software"), to deal with 10 | the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 12 | of the Software, and to permit persons to whom the Software is furnished to 13 | do so, subject to the following conditions: 14 | * Redistributions of source code must retain the above copyright notice, 15 | this list of conditions and the following disclaimers. 16 | * Redistributions in binary form must reproduce the above copyright notice, 17 | this list of conditions and the following disclaimers in the documentation 18 | and/or other materials provided with the distribution. 19 | * Neither the names of Connor Holmes, Colorado School of Mines, 20 | nor the names of its contributors may be used to endorse or promote products 21 | derived from this Software without specific prior written permission. 22 | 23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 26 | CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE 29 | SOFTWARE. 
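For reference, the four parts of the performance model in performance_model/heuristic.py are the estimated memory, synchronization, reduction, and FMA costs of a candidate configuration; configurations are ranked by their sum (lower is better):

    cost = mem_cost + sync_cost + reduction_cost + mul_cost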
30 | -------------------------------------------------------------------------------- /src/single_layer_LSTM.cpp: -------------------------------------------------------------------------------- 1 | // Runtime files 2 | #include "LSTM.h" 3 | #include "misc.h" 4 | 5 | // Other includes 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | int main(int argc, char** argv) { 16 | 17 | uint32_t input = 256; 18 | uint32_t hidden = 256; 19 | uint32_t batch = 40; 20 | uint32_t x_tile_size = 2; 21 | uint32_t y_tile_size = 4; 22 | uint32_t num_groups = 64; 23 | uint32_t reduction_width = 8; 24 | uint32_t input_length = 100; 25 | 26 | std::vector weights; 27 | create_dummy_weights_lstm(weights, input, hidden); 28 | 29 | // Create layer 30 | LSTMLayer layer = LSTMLayer(input, hidden, batch, weights); 31 | 32 | // Declare model based on layer 33 | LSTMModel model = LSTMModel( {layer} ); 34 | 35 | model.set_configuration(x_tile_size, y_tile_size, num_groups, reduction_width); 36 | model.initialize(); 37 | 38 | float * testInput; 39 | cudaHostAlloc((void **) &testInput, sizeof(float) * batch * input * input_length, cudaHostAllocDefault); CUDA_ERR; 40 | 41 | for (uint32_t i = 0; i < batch * input * input_length; i++) { 42 | testInput[i] = (i / input) % batch; 43 | } 44 | 45 | #ifdef DEBUG 46 | float temp = model.run_input(testInput, &input_length); 47 | #else 48 | float time = 0.0f; 49 | for (int i = 0; i < 1000; i++) { 50 | float temp = model.run_input(testInput, &input_length); 51 | } 52 | cudaProfilerStart(); 53 | for (int i = 0; i < 1000; i++) { 54 | float run_time = model.run_input(testInput, &input_length); 55 | time += run_time; 56 | } 57 | cudaProfilerStop(); 58 | std::cout << time / 1000 << " ms\n"; 59 | #endif 60 | 61 | return 0; 62 | } 63 | 64 | -------------------------------------------------------------------------------- /src/LSTM.h: -------------------------------------------------------------------------------- 1 | #ifndef LSTMBASE_H 2 | #define LSTMBASE_H 3 | 4 | #include "RNNBase.h" 5 | 6 | #include 7 | 8 | #define WEIGHTS_INPUT_F 0 9 | #define WEIGHTS_INPUT_I 1 10 | #define WEIGHTS_INPUT_C 2 11 | #define WEIGHTS_INPUT_O 3 12 | #define WEIGHTS_HIDDEN_F 4 13 | #define WEIGHTS_HIDDEN_I 5 14 | #define WEIGHTS_HIDDEN_C 6 15 | #define WEIGHTS_HIDDEN_O 7 16 | #define BIAS_F 8 17 | #define BIAS_I 9 18 | #define BIAS_C 10 19 | #define BIAS_O 11 20 | 21 | #define LSTM_GATES 4 22 | 23 | 24 | template 25 | class LSTMLayer : public RNNLayerBase { 26 | 27 | private: 28 | 29 | public: 30 | LSTMLayer(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector l) : 31 | RNNLayerBase(i_s, h_s, b_s, l) {} 32 | 33 | uint32_t initialize(); 34 | void reset(); 35 | 36 | // Total footprint of the input weights (makes initialize code cleaner) 37 | uint32_t input_weight_footprint() { 38 | return this->input_size * LSTM_GATES * this->hidden_size * sizeof(T); 39 | } 40 | 41 | // Excludes intermediaries, used for data copying 42 | uint32_t hidden_weight_footprint() { 43 | return this->hidden_size * LSTM_GATES * this->hidden_size * sizeof(T); 44 | } 45 | 46 | // This function may need to be modified in order to avoid bank conflicts 47 | uint32_t bias_weight_footprint() { 48 | return this->hidden_size * LSTM_GATES * sizeof(T); 49 | } 50 | 51 | }; 52 | 53 | template 54 | class LSTMModel : public RNNBase { 55 | 56 | private: 57 | // Kernel launch parameters 58 | void* paramsLSTM[8]; 59 | 60 | public: 61 | LSTMModel(std::initializer_list< LSTMLayer > l) : 62 | RNNBase(l) 
{} 63 | 64 | void set_configuration(int x, int y, int g, int t); 65 | 66 | uint32_t initialize(); 67 | void reset(); 68 | 69 | float run_input(T* input, uint32_t * length); 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /src/GRU_double.h: -------------------------------------------------------------------------------- 1 | #ifndef GRUBASE_H 2 | #define GRUBASE_H 3 | 4 | #include "RNNBase.h" 5 | 6 | #include 7 | 8 | #define WEIGHTS_INPUT_R 0 9 | #define WEIGHTS_INPUT_Z 1 10 | #define WEIGHTS_INPUT_H 2 11 | #define WEIGHTS_HIDDEN_R 3 12 | #define WEIGHTS_HIDDEN_Z 4 13 | #define WEIGHTS_HIDDEN_H 5 14 | #define BIAS_R 6 15 | #define BIAS_Z 7 16 | #define BIAS_H 8 17 | 18 | #define GRU_GATES 3 19 | 20 | 21 | template 22 | class GRULayerDouble : public RNNLayerBase { 23 | 24 | private: 25 | 26 | public: 27 | GRULayerDouble(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector l) : 28 | RNNLayerBase(i_s, h_s, b_s, l) {} 29 | 30 | uint32_t initialize(); 31 | void reset(); 32 | 33 | // Total footprint of the input weights (makes initialize code cleaner) 34 | uint32_t input_weight_footprint() { 35 | return this->input_size * GRU_GATES * this->hidden_size * sizeof(T); 36 | } 37 | 38 | // Excludes intermediaries, used for data copying 39 | uint32_t hidden_weight_footprint() { 40 | return this->hidden_size * GRU_GATES * this->hidden_size * sizeof(T); 41 | } 42 | 43 | // This function may need to be modified in order to avoid bank conflicts 44 | uint32_t bias_weight_footprint() { 45 | return this->hidden_size * GRU_GATES * sizeof(T); 46 | } 47 | }; 48 | 49 | template 50 | class GRUModelDouble : public RNNBase { 51 | 52 | private: 53 | // Buffer for r intermediates 54 | T * gpu_r; 55 | 56 | // GRU Kernel parameter buffer 57 | void * paramsGRU[9]; 58 | 59 | public: 60 | GRUModelDouble(std::initializer_list< GRULayerDouble > l) : 61 | RNNBase(l) {} 62 | 63 | void set_configuration(int x, int y, int g, int t); 64 | 65 | uint32_t initialize(); 66 | void reset(); 67 | 68 | float run_input(T* input, uint32_t * length); 69 | }; 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /src/single_layer_GRU_double.cpp: -------------------------------------------------------------------------------- 1 | // Runtime files 2 | #include "GRU_double.h" 3 | #include "misc.h" 4 | 5 | // Other includes 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | int main(int argc, char** argv) { 16 | 17 | uint32_t input = 1024; 18 | uint32_t hidden = 1024; 19 | uint32_t x_tile_size = 4; 20 | uint32_t y_tile_size = 5; 21 | uint32_t num_groups = 8; 22 | uint32_t group_threads = 32; 23 | uint32_t batch = 5; 24 | uint32_t input_length = 100; 25 | 26 | std::vector weights; 27 | create_dummy_weights_gru(weights, input, hidden); 28 | 29 | // Create layer 30 | GRULayerDouble layer = GRULayerDouble(input, hidden, batch, weights); 31 | 32 | // Declare model based on layer 33 | GRUModelDouble model = GRUModelDouble( {layer} ); 34 | 35 | // Simple checks 36 | assert(input == model.get_initial_input_size()); 37 | assert(batch == model.get_batch_size()); 38 | assert(hidden == model.get_output_size()); 39 | 40 | model.set_configuration(x_tile_size, y_tile_size, num_groups, group_threads); 41 | model.initialize(); 42 | 43 | float * testInput; 44 | cudaHostAlloc((void **) &testInput, sizeof(float) * batch * input * input_length, cudaHostAllocDefault); CUDA_ERR; 45 | 46 | for (uint32_t 
i = 0; i < batch * input * input_length; i++) { 47 | testInput[i] = 1.; 48 | } 49 | 50 | #ifdef DEBUG 51 | float temp = model.run_input(testInput, &input_length); 52 | #else 53 | float time = 0.0f; 54 | for (int i = 0; i < 1000; i++) { 55 | float temp = model.run_input(testInput, &input_length); 56 | } 57 | cudaProfilerStart(); 58 | for (int i = 0; i < 1000; i++) { 59 | float run_time = model.run_input(testInput, &input_length); 60 | time += run_time; 61 | } 62 | cudaProfilerStop(); 63 | std::cout << time / 1000 << " ms\n"; 64 | #endif 65 | 66 | return 0; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /src/single_layer_GRU_single.cpp: -------------------------------------------------------------------------------- 1 | // Runtime files 2 | #include "GRU_single.h" 3 | #include "misc.h" 4 | 5 | // Other includes 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | int main(int argc, char** argv) { 16 | 17 | uint32_t input = 256; 18 | uint32_t hidden = 256; 19 | uint32_t x_tile_size = 3; 20 | uint32_t y_tile_size = 1; 21 | uint32_t num_groups = 32; 22 | uint32_t group_threads = 8; 23 | uint32_t batch = 10; 24 | uint32_t input_length = 100; 25 | 26 | std::vector weights; 27 | create_dummy_weights_gru(weights, input, hidden); 28 | 29 | // Create layer 30 | GRULayerSingle layer = GRULayerSingle(input, hidden, batch, weights); 31 | 32 | // Declare model based on layer 33 | GRUModelSingle model = GRUModelSingle( {layer} ); 34 | 35 | // Simple checks 36 | assert(input == model.get_initial_input_size()); 37 | assert(batch == model.get_batch_size()); 38 | assert(hidden == model.get_output_size()); 39 | 40 | model.set_configuration(x_tile_size, y_tile_size, num_groups, group_threads); 41 | model.initialize(); 42 | 43 | float * testInput; 44 | cudaHostAlloc((void **) &testInput, sizeof(float) * batch * input * input_length, cudaHostAllocDefault); CUDA_ERR; 45 | 46 | for (uint32_t i = 0; i < batch * input * input_length; i++) { 47 | testInput[i] = (float)(i % input) / (float)input; 48 | } 49 | 50 | #ifdef DEBUG 51 | float temp = model.run_input(testInput, &input_length); 52 | #else 53 | float time = 0.0f; 54 | for (int i = 0; i < 1000; i++) { 55 | float temp = model.run_input(testInput, &input_length); 56 | } 57 | cudaProfilerStart(); 58 | for (int i = 0; i < 1000; i++) { 59 | float run_time = model.run_input(testInput, &input_length); 60 | time += run_time; 61 | } 62 | cudaProfilerStop(); 63 | std::cout << time / 1000 << " ms\n"; 64 | #endif 65 | 66 | return 0; 67 | } 68 | 69 | -------------------------------------------------------------------------------- /src/GRU_single.h: -------------------------------------------------------------------------------- 1 | #ifndef GRUBASE_H 2 | #define GRUBASE_H 3 | 4 | #include "RNNBase.h" 5 | 6 | #include 7 | 8 | #define WEIGHTS_INPUT_R 0 9 | #define WEIGHTS_INPUT_Z 1 10 | #define WEIGHTS_INPUT_H 2 11 | #define WEIGHTS_HIDDEN_R 3 12 | #define WEIGHTS_HIDDEN_Z 4 13 | #define WEIGHTS_HIDDEN_H 5 14 | #define BIAS_R 6 15 | #define BIAS_Z 7 16 | #define BIAS_H 8 17 | 18 | #define GRU_GATES 3 19 | 20 | 21 | template 22 | class GRULayerSingle : public RNNLayerBase { 23 | 24 | private: 25 | T * packed_hidden_weights_r_gpu; 26 | T * packed_biases_r_gpu; 27 | 28 | public: 29 | GRULayerSingle(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector l) : 30 | RNNLayerBase(i_s, h_s, b_s, l) {} 31 | 32 | uint32_t initialize(); 33 | void reset(); 34 | 35 | // Total footprint of 
the input weights (makes initialize code cleaner) 36 | uint32_t input_weight_footprint() { 37 | return this->input_size * GRU_GATES * this->hidden_size * sizeof(T); 38 | } 39 | 40 | // Excludes intermediaries, used for data copying 41 | uint32_t hidden_weight_footprint() { 42 | return this->hidden_size * (GRU_GATES - 1) * this->hidden_size * sizeof(T); 43 | } 44 | 45 | // This function may need to be modified in order to avoid bank conflicts 46 | uint32_t bias_weight_footprint() { 47 | return this->hidden_size * (GRU_GATES - 1) * sizeof(T); 48 | } 49 | 50 | uint32_t hidden_weight_r_footprint() { 51 | return this->hidden_size * this->hidden_size * sizeof(T); 52 | } 53 | 54 | uint32_t bias_weight_r_footprint() { 55 | return this->hidden_size * sizeof(T); 56 | } 57 | 58 | T * get_packed_hidden_weights_r_gpu() { 59 | return this->packed_hidden_weights_r_gpu; 60 | } 61 | 62 | T * get_packed_biases_r_gpu() { 63 | return this->packed_biases_r_gpu; 64 | } 65 | }; 66 | 67 | template 68 | class GRUModelSingle : public RNNBase { 69 | 70 | private: 71 | T * gpu_r; 72 | T * gpu_weights_hidden_r; 73 | T * gpu_biases_r; 74 | 75 | void * paramsGRU[11]; 76 | int num_partials; 77 | 78 | public: 79 | GRUModelSingle(std::initializer_list< GRULayerSingle > l) : 80 | RNNBase(l) {} 81 | 82 | void set_configuration(int x, int y, int g, int t); 83 | 84 | uint32_t initialize(); 85 | void reset(); 86 | 87 | float run_input(T* input, uint32_t * length); 88 | }; 89 | 90 | #endif 91 | -------------------------------------------------------------------------------- /src/misc.h: -------------------------------------------------------------------------------- 1 | #ifndef MISC_H 2 | #define MISC_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define LINE_SIZE 1 11 | 12 | #define CUDA_ERR { \ 13 | cudaError_t err; \ 14 | if ((err = cudaGetLastError()) != cudaSuccess) { \ 15 | printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); \ 16 | exit(1); \ 17 | } \ 18 | } 19 | 20 | #define MAX_SMEM 98304 21 | 22 | template 23 | void create_dummy_weights_lstm(std::vector &weights, uint32_t input, uint32_t hidden) { 24 | // DUMMY INPUT WEIGHTS 25 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 26 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 27 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 28 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 29 | 30 | // DUMMY HIDDEN WEIGHTS 31 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 32 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 33 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 34 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 35 | 36 | // DUMMY BIASES 37 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 38 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 39 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 40 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 41 | 42 | uint32_t i, j; 43 | for (i = 0; i < 4; i++) { 44 | for (j = 0; j < input * hidden; j++) { 45 | weights.at(i)[j] = 1 / 1024.; 46 | } 47 | } 48 | 49 | for (i = 4; i < 8; i++) { 50 | for (j = 0; j < hidden * hidden; j++) { 51 | weights.at(i)[j] = 1 / 1024.; 52 | } 53 | } 54 | 55 | for (i = 8; i < 12; i++) { 56 | for (j = 0; j < hidden; j++) { 57 | weights.at(i)[j] = 0.5; 58 | } 59 | } 60 | } 61 | 62 | template 63 | void create_dummy_weights_gru(std::vector &weights, uint32_t input, uint32_t hidden) { 64 | // DUMMY 
INPUT WEIGHTS 65 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 66 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 67 | weights.push_back((T *)malloc(sizeof(T) * input * hidden)); 68 | 69 | // DUMMY HIDDEN WEIGHTS 70 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 71 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 72 | weights.push_back((T *)malloc(sizeof(T) * hidden * hidden)); 73 | 74 | // DUMMY BIASES 75 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 76 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 77 | weights.push_back((T *)malloc(sizeof(T) * hidden)); 78 | 79 | uint32_t i, j; 80 | for (i = 0; i < 3; i++) { 81 | for (j = 0; j < input * hidden; j++) { 82 | weights.at(i)[j] = 1./256.; 83 | } 84 | } 85 | 86 | for (i = 3; i < 6; i++) { 87 | for (j = 0; j < hidden * hidden; j++) { 88 | weights.at(i)[j] = 1./256.; 89 | } 90 | } 91 | 92 | for (i = 7; i < 9; i++) { 93 | for (j = 0; j < hidden; j++) { 94 | weights.at(i)[j] = 0.5; 95 | } 96 | } 97 | } 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /src/RNNBase.h: -------------------------------------------------------------------------------- 1 | #ifndef RNNBASE_H 2 | #define RNNBASE_H 3 | 4 | #include "misc.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | template 12 | class RNNLayerBase { 13 | 14 | protected: 15 | //LAYER TOPOLOGY 16 | uint32_t hidden_size; 17 | uint32_t input_size; 18 | uint32_t batch_size; 19 | uint32_t block_width; 20 | 21 | // UNMODIFIED HOST WEIGHTS (SHOULD BE POINTERS ON HEAP) 22 | std::vector host_weights; 23 | 24 | // WEIGHTS PACKED INTO SUITABLE CONFIGURATION FOR SHARED MEM 25 | T * packed_input_weights; 26 | T * packed_input_weights_gpu; 27 | T * packed_hidden_weights; 28 | T * packed_hidden_weights_gpu; 29 | T * packed_biases; 30 | T * packed_biases_gpu; 31 | 32 | public: 33 | RNNLayerBase(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector l) : 34 | input_size(i_s), 35 | hidden_size(h_s), 36 | batch_size(b_s), 37 | host_weights(l) { 38 | } 39 | 40 | // PACKS WEIGHTS FOR SHARED MEMORY TRANSFER 41 | virtual uint32_t initialize() =0; 42 | virtual void reset() =0; 43 | 44 | virtual uint32_t input_weight_footprint() =0; 45 | virtual uint32_t hidden_weight_footprint() =0; 46 | virtual uint32_t bias_weight_footprint() = 0; 47 | 48 | // SETTERS 49 | void set_block_width(uint32_t width) { block_width = width; } 50 | 51 | // GETTERS (CHANGING SIZE OF LAYER NOT SUPPORTED) 52 | uint32_t get_hidden_size() { return hidden_size; } 53 | uint32_t get_input_size() { return input_size; } 54 | uint32_t get_batch_size() { return batch_size; } 55 | T * get_packed_input_weights_gpu() { return packed_input_weights_gpu; } 56 | T * get_packed_hidden_weights_gpu() { return packed_hidden_weights_gpu; } 57 | T * get_packed_biases_gpu() { return packed_biases_gpu; } 58 | 59 | }; 60 | 61 | template typename L, typename T> 62 | class RNNBase { 63 | 64 | protected: 65 | // Vector of layers 66 | std::vector< L > layers; 67 | 68 | // Topology 69 | uint32_t initial_input_size; 70 | uint32_t batch_size; 71 | uint32_t output_size; 72 | uint32_t tile_width; 73 | uint32_t tile_height; 74 | uint32_t num_groups; 75 | uint32_t group_threads; 76 | uint32_t mm_m; 77 | uint32_t mm_n; 78 | uint32_t mm_k; 79 | 80 | // Data 81 | T * gpu_inputs; 82 | T * gpu_hidden_initializer; 83 | T * gpu_weights_input; 84 | T * gpu_weights_hidden; 85 | T * gpu_biases; 86 | T * gpu_precompute; 87 | T * gpu_output; 88 | int * 
gpu_syncIn; 89 | int * gpu_syncOut; 90 | T * host_output; 91 | 92 | // Kernel Parameters 93 | void* paramsMM[6]; 94 | 95 | public: 96 | RNNBase(std::initializer_list< L > l) : layers(l) { 97 | this->initial_input_size = layers.front().get_input_size(); 98 | this->batch_size = layers.front().get_batch_size(); 99 | this->output_size = layers.back().get_hidden_size(); 100 | } 101 | 102 | virtual uint32_t initialize() =0; 103 | virtual void reset() =0; 104 | 105 | // Transfers input to the GPU, runs kernel, fetches output 106 | virtual float run_input(T * input, uint32_t * length) =0; 107 | 108 | // Configure tiling parameters 109 | virtual void set_configuration(int x, int y, int g, int t) =0; 110 | 111 | // GETTERS 112 | uint32_t get_initial_input_size() { return initial_input_size; } 113 | uint32_t get_batch_size() { return batch_size; } 114 | uint32_t get_output_size() { return output_size; } 115 | 116 | }; 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /performance_model/heuristic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from enum import Enum 4 | from math import ceil 5 | from math import floor 6 | from math import sqrt 7 | from math import log2 8 | import argparse 9 | import itertools 10 | import numpy as np 11 | 12 | # Hardware parameters 13 | sms = 80 14 | max_regs_thread = 224 # Leave 32 for indexing/pointers 15 | max_regs_sm = 65536 16 | max_threads = 1024 # Per tb 17 | fma_cost = 4 18 | l2_bus = 64 # Bytes / cycle 19 | l2_lat = 50 # Normalized against the FMA cost 20 | l1_lat = 5 # Normalized against the FMA cost 21 | warp_size = 32 22 | 23 | class ModelType(Enum): 24 | LSTM = 0 25 | GRU = 1 26 | 27 | model_gates = {ModelType.LSTM : 4, ModelType.GRU : 2} 28 | model_scale = {ModelType.LSTM : 1, ModelType.GRU : 1.5} ## Weights stored per work group 29 | string_model = {'LSTM' : ModelType.LSTM, 'GRU' : ModelType.GRU} 30 | model_string = {ModelType.LSTM : 'LSTM', ModelType.GRU : 'GRU'} 31 | 32 | class ModelConfig: 33 | 34 | def __init__(self, mt, hs, bs, tw, th, nwg, rw, sy=None): 35 | self.model_type = mt 36 | self.hidden_size = hs 37 | self.batch_size = bs 38 | self.tile_width = tw 39 | self.tile_height = th 40 | self.reduction_width = rw 41 | self.num_work_groups = nwg 42 | self.sub_tile_width = ceil(self.tile_width * model_gates[self.model_type] / self.num_work_groups) 43 | self.num_threads = self.reduction_width * self.num_work_groups 44 | self.num_SMs = ceil(self.hidden_size / self.tile_width) * ceil(self.batch_size / self.tile_height) 45 | if self.model_type is ModelType.GRU: 46 | self.sync = sy 47 | if self.sync is 1: 48 | self.weights_per_thread = self.sub_tile_width * self.hidden_size / self.reduction_width + self.tile_width * ceil(self.hidden_size / self.num_threads) 49 | else: 50 | self.weights_per_thread = self.sub_tile_width * self.hidden_size / self.reduction_width * 1.5 51 | else: 52 | self.weights_per_thread = self.sub_tile_width * self.hidden_size / self.reduction_width 53 | self.cost = self.fitness() 54 | 55 | def __str__(self): 56 | rep = "Model Config:\n" 57 | rep += "\tModel Info:\n" 58 | rep += "\t\tModel Type: " + str(self.model_type) +"\n" 59 | rep += "\t\tHidden Size: " + str(self.hidden_size) +"\n" 60 | rep += "\t\tBatch Size: " + str(self.batch_size) +"\n" 61 | rep += "\tConfiguration Parameters:\n" 62 | rep += "\t\tTile Width: " + str(self.tile_width) +"\n" 63 | rep += "\t\tTile Height: " + str(self.tile_height) +"\n" 64 | rep 
+= "\t\tReduction Width: " + str(self.reduction_width) +"\n" 65 | rep += "\t\tNum Work Groups: " + str(self.num_work_groups) +"\n" 66 | rep += "\tOccupancy Metrics:\n" 67 | rep += "\t\tNumber of SMs: " + str(self.num_SMs) +"\n" 68 | rep += "\t\tWeights Per SM: " + str(self.tile_width * self.hidden_size * model_gates[self.model_type] * model_scale[self.model_type]) +"\n" 69 | rep += "\t\tSub Tile Width: " + str(self.sub_tile_width) +"\n" 70 | rep += "\t\tWeights Per Threads: " + str(self.weights_per_thread) +"\n" 71 | rep += "\tFitness: " + str(self.cost) + "\n" 72 | return rep 73 | 74 | def is_valid(self): 75 | if self.num_SMs > 80: 76 | return False 77 | elif self.weights_per_thread > max_regs_thread: 78 | return False 79 | elif (self.weights_per_thread + 32) * self.num_threads > max_regs_sm: 80 | return False 81 | elif self.sub_tile_width * self.tile_height > self.reduction_width: 82 | return False 83 | elif self.num_threads > max_threads: 84 | return False 85 | elif (model_gates[self.model_type] * self.tile_width % self.num_work_groups) is not 0: 86 | return False 87 | else: 88 | return True 89 | 90 | def fma_heuristic(self): 91 | sequential_length = self.hidden_size / self.reduction_width 92 | self.partition_occupancy = ceil(self.num_threads / 32 / 4) 93 | if self.partition_occupancy * self.sub_tile_width <= 8: 94 | return 1.6 ** log2(self.partition_occupancy) * 1.33 ** log2(self.sub_tile_width) * self.tile_height * sequential_length 95 | else: 96 | return 4.7 * (self.partition_occupancy * self.sub_tile_width / 8) * self.tile_height * sequential_length 97 | 98 | def lstm_fitness(self): 99 | sm_bandwidth = self.hidden_size * self.tile_height * 4 100 | warp_occupancy = ceil(self.num_threads / 128) 101 | 102 | self.mem_cost = round(sm_bandwidth * (1 + floor(self.num_SMs / (sms / 2))) / (fma_cost * l2_bus), 2) 103 | self.sync_cost = 0 104 | if warp_occupancy * self.sub_tile_width * self.tile_height < 12: ## Non-throughput limited 105 | if self.reduction_width <= 16: 106 | self.reduction_cost = (log2(self.reduction_width) + 1) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height) 107 | else: 108 | self.reduction_cost = log2(self.reduction_width) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height) 109 | else: ## Throughput limited 110 | if self.reduction_width <= 16: 111 | self.reduction_cost = (log2(self.reduction_width) + 1) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12) 112 | else: 113 | self.reduction_cost = log2(self.reduction_width) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12) 114 | self.mul_cost = round(self.fma_heuristic(), 2) 115 | 116 | return self.mem_cost + self.sync_cost + self.reduction_cost + self.mul_cost 117 | 118 | def gru_fitness_two(self): 119 | sm_bandwidth = self.hidden_size * self.tile_height * 4 * 2 120 | warp_occupancy = ceil(self.num_threads / 128) 121 | 122 | self.mem_cost = round(sm_bandwidth * (1 + floor(self.num_SMs / (sms / 2))) / (fma_cost * l2_bus), 2) 123 | self.sync_cost = l2_lat * 2 * 2 # ceil(self.num_SMs / 32) * 2 124 | if warp_occupancy * self.sub_tile_width * self.tile_height < 12: ## Non-throughput limited 125 | if self.reduction_width <= 16: 126 | self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height) 127 | else: 128 | self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height) 129 | else: ## Throughput 
limited 130 | if self.reduction_width <= 16: 131 | self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12) 132 | else: 133 | self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12) 134 | self.mul_cost = round(self.fma_heuristic() * 1.5, 2) 135 | return self.mem_cost + self.sync_cost + self.reduction_cost + self.mul_cost 136 | 137 | def gru_fitness_one(self): 138 | sm_bandwidth = self.hidden_size * self.tile_height * 4 + self.hidden_size * (self.hidden_size / self.tile_width) * self.tile_height * 4 139 | warp_occupancy = ceil(self.num_threads / 128) 140 | 141 | self.mem_cost = round(sm_bandwidth * (1 + floor(self.num_SMs / (sms / 2))) / (fma_cost * l2_bus), 2) 142 | self.sync_cost = 0 143 | if warp_occupancy * self.sub_tile_width * self.tile_height < 12: ## Non-throughput limited 144 | if self.reduction_width <= 16: 145 | self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height) 146 | else: 147 | self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height) 148 | else: ## Throughput limited 149 | if self.reduction_width <= 16: 150 | self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12) 151 | else: 152 | self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12) 153 | self.mul_cost = round(self.fma_heuristic() + self.tile_width + self.hidden_size / self.tile_width, 2) 154 | return self.mem_cost + self.sync_cost + self.reduction_cost + self.mul_cost 155 | 156 | def fitness(self): 157 | if self.model_type is ModelType.LSTM: 158 | return self.lstm_fitness() 159 | elif self.model_type is ModelType.GRU: 160 | if self.sync is 2: 161 | return self.gru_fitness_two() 162 | elif self.sync is 1: 163 | return self.gru_fitness_one() 164 | 165 | def to_csv(self): 166 | rep = str(self.tile_width) + "," 167 | rep += str(self.tile_height) + "," 168 | rep += str(self.num_work_groups) + "," 169 | rep += str(self.reduction_width) + "," 170 | if self.model_type is ModelType.GRU: 171 | rep += str(self.sync) + "," 172 | rep += str(self.mem_cost) + "," 173 | rep += str(self.sync_cost) + "," 174 | rep += str(self.reduction_cost) + "," 175 | rep += str(self.mul_cost) + "," 176 | rep += str(self.cost) + "," 177 | rep += str(self.partition_occupancy * self.sub_tile_width < 16) + "\n" 178 | return rep 179 | 180 | def string_to_model(string): 181 | if (string in string_model): 182 | return string_model[string] 183 | else: 184 | msg = string + " is not a valid model type" 185 | raise argparse.ArgumentTypeError(msg) 186 | 187 | 188 | def main(model, input_size, hidden_size, batch_size, k): 189 | 190 | tile_widths = range(1, 65) 191 | tile_heights = range(1, batch_size + 1) 192 | 193 | tile_configurations = list(itertools.product(tile_widths, tile_heights)) 194 | reduction_widths = [2 ** i for i in range(6)] 195 | 196 | configs = list() 197 | # Build dictionary of configurations 198 | for x, y in tile_configurations: 199 | if batch_size % y is 0: 200 | num_gate_elements = x * model_gates[model] 201 | 202 | for i in range(1, num_gate_elements + 1): 203 | if num_gate_elements % i is 0: 204 | for r in reduction_widths: 205 | if model is ModelType.LSTM: 206 | 
configs.append(ModelConfig(model, hidden_size, batch_size, x, y, i, r)) 207 | else: 208 | configs.append(ModelConfig(model, hidden_size, batch_size, x, y, i, r, sy=1)) 209 | configs.append(ModelConfig(model, hidden_size, batch_size, x, y, i, r, sy=2)) 210 | 211 | # Prune 212 | configs = [x for x in configs if x.is_valid()] 213 | # Evaluate 214 | configs.sort(key=lambda config: config.cost) 215 | 216 | # Save chosen configurations 217 | with open("configs_" + str(hidden_size) + "_" + str(batch_size) + "_" + model_string[model] + ".csv", mode='w') as f: 218 | if k is -1: 219 | for entry in configs: 220 | f.write(entry.to_csv()) 221 | else: 222 | for entry in configs[:k]: 223 | f.write(entry.to_csv()) 224 | 225 | if __name__ == '__main__': 226 | parser = argparse.ArgumentParser(description='Use heuristic based analysis of \ 227 | an RNN layer to determine a near optimal \ 228 | configuration for instantiation') 229 | parser.add_argument('-m', '--model_type', default='LSTM', type=string_to_model, required=False, 230 | help='The type of RNN layer for analysis') 231 | parser.add_argument('-i', '--input_size', default=256, type=int, required=False, 232 | help='Length of input vector to layer') 233 | parser.add_argument('-s', '--hidden_size', default=256, type=int, required=False, 234 | help='Length of hidden size/output of layer') 235 | parser.add_argument('-b', '--batch_size', default=1, type=int, required=False, 236 | help='Size of batch to be computed simultaneously') 237 | parser.add_argument('-k', '--top_k', default=-1, type=int, required=False, 238 | help='How many candidate configurations to return') 239 | args = parser.parse_args() 240 | main(args.model_type, args.input_size, args.hidden_size, args.batch_size, args.top_k) 241 | -------------------------------------------------------------------------------- /src/LSTM.cu: -------------------------------------------------------------------------------- 1 | #include "LSTM.h" 2 | #include "misc.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace cooperative_groups; 12 | 13 | __device__ __forceinline__ float sigmoidf(float x) { 14 | return 1.0f / (1.0f + expf(-1.0f * x)); 15 | } 16 | 17 | #define MM_BLOCK_SIZE 16 18 | #define MM_REG_TILE 4 19 | #define MM_TILE_SIZE 64 20 | 21 | // This is a mostly optimized kernel for matrix multiplication 22 | // The kernel uses a two tiered tiling mechanism that first tiles large 23 | // tiles from global memory to shared memory. This shared memory tile is 24 | // then used as the source to stream data into register arrays that perform 25 | // a calculation on a 4x4 tile. 
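// The caller provides both tile buffers as dynamic shared memory at launch:
// 2 * MM_TILE_SIZE * MM_TILE_SIZE floats (32 KB), matching mm_sm_requirement in run_input.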
26 | 27 | __global__ void matmul(float * A, float * B, float * C, 28 | uint32_t M, uint32_t K, uint32_t N) { 29 | 30 | extern __shared__ float base[]; 31 | float* bufferA = base; 32 | float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE]; 33 | 34 | float regA[MM_REG_TILE]; 35 | float regB[MM_REG_TILE]; 36 | float regC[MM_REG_TILE][MM_REG_TILE]; 37 | 38 | uint32_t tidx = threadIdx.x; 39 | uint32_t tidy = threadIdx.y; 40 | uint32_t id = threadIdx.y * blockDim.x + threadIdx.x; 41 | uint32_t bidx = blockIdx.x; 42 | uint32_t bidy = blockIdx.y; 43 | 44 | // Number of rows that are traversed in a single fully coalesced load sequence 45 | constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE); 46 | constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE; 47 | 48 | // Zero the intermediate output 49 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 50 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 51 | regC[y][x] = 0.0f; 52 | } 53 | } 54 | 55 | for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) { 56 | 57 | // Load lhs tile from global memory to shared memory (fully coalesced) 58 | #pragma unroll 59 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 60 | uint32_t index = j * NUM_THREADS + id; 61 | if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) { 62 | bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE]; 63 | } else { 64 | bufferA[index] = 0.0f; 65 | } 66 | } 67 | 68 | // Not necessary for correctness, but improves performance by avoiding thrashing shared memory 69 | __syncthreads(); 70 | 71 | // Load rhs tile from global memory to shared memory (fully coalesced) 72 | #pragma unroll 73 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 74 | uint32_t index = j * NUM_THREADS + id; 75 | if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) { 76 | bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE]; 77 | } else { 78 | bufferB[index] = 0.0f; 79 | } 80 | } 81 | 82 | // Ensures all data is written from global memory to shared memory before it is streamed 83 | // into register arrays. 
84 | __syncthreads(); 85 | 86 | 87 | 88 | // Loop through full tile 89 | for (uint32_t j = 0; j < MM_TILE_SIZE; j++) { 90 | 91 | // Load vector from lhs and rhs 92 | #pragma unroll 93 | for (uint32_t l = 0; l < MM_REG_TILE; l++) { 94 | regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j]; 95 | regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l]; 96 | } 97 | 98 | #pragma unroll 99 | // Perform a narrow matmul 100 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 101 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 102 | regC[y][x] += regA[y] * regB[x]; 103 | } 104 | } 105 | } 106 | 107 | __syncthreads(); 108 | } 109 | 110 | // Write register intermediates to shared memory (possibly unnecessary) 111 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 112 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 113 | bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x]; 114 | } 115 | } 116 | 117 | __syncthreads(); 118 | 119 | 120 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 121 | uint32_t index = j * NUM_THREADS + id; 122 | if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) { 123 | C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)] = bufferA[index]; 124 | } 125 | } 126 | } 127 | 128 | 129 | // This kernel assumes the input multiplications were precomputed in a large matrix-matrix multiplication 130 | template 131 | __global__ void lstm_rnn( const float* precomputed_inputs, 132 | const float* hidden_initializer, 133 | const float* weights, 134 | const float* biases, 135 | float* output, 136 | volatile int* syncIn, 137 | volatile int* syncOut, 138 | uint32_t length) { 139 | 140 | // Indexing helpers 141 | int tid = threadIdx.x; 142 | int bidx = blockIdx.x; 143 | int bidy = blockIdx.y; 144 | int wg_id = tid / GROUP_THREADS; 145 | // LENGTH - How many weights for each gate output does a single thread need to store 146 | constexpr int LENGTH = (HIDDEN_SIZE + GROUP_THREADS - 1) / GROUP_THREADS; 147 | // BUFFER_SIZE - Number of elements to reserve in shared memory for each output. Effectively 148 | // rounds up HIDDEN_SIZE to multiple of GROUP_THREADS 149 | constexpr int BUFFER_SIZE = LENGTH * GROUP_THREADS; 150 | // OUTPUT_TILE_WIDTH - How many full elements are produced by the threadblock. At scheduling time, 151 | // must ensure that launched configuration produces full elements within a single threadblock 152 | constexpr int OUTPUT_TILE_WIDTH = NUM_GROUPS * TILE_WIDTH / LSTM_GATES; 153 | 154 | // Static shared memory allocation 155 | __shared__ float hidden_tile[TILE_HEIGHT][BUFFER_SIZE]; 156 | __shared__ float cell_state[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 157 | __shared__ float forget_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 158 | __shared__ float input_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 159 | __shared__ float cand_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 160 | __shared__ float out_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 161 | 162 | // Weights in the register file 163 | float weights_reg[TILE_WIDTH][LENGTH]; 164 | float outputs_reg[TILE_HEIGHT][TILE_WIDTH]; 165 | float bias = 0.0f; 166 | float precompute = 0.0f; 167 | 168 | // Cooperative group helpers 169 | thread_block bl = this_thread_block(); 170 | thread_block_tile work_group = tiled_partition(bl); 171 | 172 | // Tile width is the number of gate outputs produce by a single warp 173 | for (int i = 0; i < TILE_WIDTH; i++) { 174 | // Global gate id for fetching weights. 
175 | // bidx * TILE_WIDTH * NUM_GROUPS -> first gate index processed by the threadblock 176 | // (tid / GROUP_THREADS) * TILE_WIDTH -> first gate index within processed by a warp within the threadblock 177 | // i -> current gate index within the warp's assigned gates 178 | int gate_id = bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + i; 179 | // HIDDEN_SIZE * LSTM_GATES -> number of total gates that need to be computed 180 | if (gate_id < HIDDEN_SIZE * LSTM_GATES) { 181 | for (int j = 0; j < LENGTH; j++) { 182 | // Better to fully populate and check weight bounds once at loading than during each computation. 183 | if (j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) { 184 | weights_reg[i][j] = weights[gate_id * HIDDEN_SIZE + j * GROUP_THREADS + work_group.thread_rank()]; 185 | } else { 186 | weights_reg[i][j] = 0.0f; 187 | } 188 | } 189 | } 190 | } 191 | 192 | // Assigns correct bias value to specific output. Prunes to only ensure that values are fetched that are necessary for later 193 | // for later computation 194 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 195 | if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH) < HIDDEN_SIZE * LSTM_GATES) { 196 | bias = biases[bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH]; 197 | } else { 198 | bias = 0; 199 | } 200 | } 201 | 202 | // Zero initialize the cell state 203 | if (tid < TILE_HEIGHT * OUTPUT_TILE_WIDTH) { 204 | cell_state[tid / OUTPUT_TILE_WIDTH][tid % OUTPUT_TILE_WIDTH] = 0.0f; 205 | } 206 | 207 | // Initialize hidden state buffer according to input / zero out rest of buffer 208 | for (int j = 0; j < TILE_HEIGHT; j++) { 209 | for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) { 210 | if (i + tid < HIDDEN_SIZE) { 211 | hidden_tile[j][i + tid] = hidden_initializer[(bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid]; 212 | } else if (i + tid < BUFFER_SIZE) { 213 | hidden_tile[j][i + tid] = 0.0f; 214 | } 215 | } 216 | } 217 | __syncthreads(); 218 | 219 | // Zero dot product accumulators 220 | #pragma unroll 221 | for (int j = 0; j < TILE_HEIGHT; j++) { 222 | #pragma unroll 223 | for (int i = 0; i < TILE_WIDTH; i++) { 224 | outputs_reg[j][i] = 0.0f; 225 | } 226 | } 227 | 228 | // Load first time independent values 229 | if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH < HIDDEN_SIZE * LSTM_GATES) 230 | && work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 231 | precompute = precomputed_inputs[bidy * TILE_HEIGHT * HIDDEN_SIZE * LSTM_GATES + 232 | bidx * TILE_WIDTH * NUM_GROUPS + 233 | wg_id * TILE_WIDTH + 234 | work_group.thread_rank() % TILE_WIDTH + 235 | (work_group.thread_rank() / TILE_WIDTH) * HIDDEN_SIZE * LSTM_GATES]; 236 | 237 | } 238 | 239 | // Loop for each iteration of the sequence length 240 | for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) { 241 | 242 | // Dot products 243 | #pragma unroll 244 | for (int k = 0; k < LENGTH; k++) { 245 | #pragma unroll 246 | for (int j = 0; j < TILE_HEIGHT; j++) { 247 | float val = hidden_tile[j][k * GROUP_THREADS + work_group.thread_rank()]; 248 | #pragma unroll 249 | for (int i = 0; i < TILE_WIDTH; i++) { 250 | outputs_reg[j][i] += weights_reg[i][k] * val; 251 | } 252 | } 253 | } 254 | 255 | // Reductions 256 | #pragma unroll 257 | for (int k = 1; k < GROUP_THREADS; k *= 2) { 258 | #pragma unroll 259 | for (int j = 0; j < TILE_HEIGHT; j++) { 260 | #pragma unroll 261 | for (int 
i = 0; i < TILE_WIDTH; i++) { 262 | outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k); 263 | } 264 | } 265 | } 266 | 267 | // Remap work and compute activations 268 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 269 | int reg_y = work_group.thread_rank() / TILE_WIDTH; 270 | int reg_x = work_group.thread_rank() % TILE_WIDTH; 271 | float val = outputs_reg[reg_y][reg_x] + bias + precompute; 272 | 273 | int gate_id = (wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH) % LSTM_GATES; 274 | 275 | if (gate_id != 2) { 276 | val = sigmoidf(val); 277 | } else { 278 | val = tanhf(val); 279 | } 280 | int out_id = (wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH) / LSTM_GATES; 281 | if (gate_id == 0) { 282 | forget_gate[reg_y][out_id] = val; 283 | } else if (gate_id == 1) { 284 | input_gate[reg_y][out_id] = val; 285 | } else if (gate_id == 2) { 286 | cand_gate[reg_y][out_id] = val; 287 | } else { 288 | out_gate[reg_y][out_id] = val; 289 | } 290 | } 291 | 292 | // Synchronization enforces all intermediates are calculated before the data is shared across threads 293 | // for the elementwise operations. 294 | __syncthreads(); 295 | 296 | int x = tid % OUTPUT_TILE_WIDTH; 297 | int y = tid / OUTPUT_TILE_WIDTH; 298 | 299 | // Elementwise operations 300 | if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT && 301 | (bidx * OUTPUT_TILE_WIDTH + x) < HIDDEN_SIZE && 302 | (bidy * TILE_HEIGHT + y) < BATCH_SIZE) { 303 | // Calculates the new cell state 304 | float cell_reg = cell_state[y][x] * forget_gate[y][x] + input_gate[y][x] * cand_gate[y][x]; 305 | // Calculates the new output 306 | float out_reg = tanhf(cell_reg) * out_gate[y][x]; 307 | 308 | // No synchronization necessary between the read and writes of cell state because it is guaranteed that only the 309 | // same thread will read/write to the element. 
310 | cell_state[y][x] = cell_reg; 311 | 312 | // Broadcast output to global memory 313 | output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + bidx * OUTPUT_TILE_WIDTH + x] = out_reg; 314 | } 315 | 316 | // Escape recurrent loop when full sequence has been processed 317 | if (sequence_iteration + 1 == length) break; 318 | 319 | 320 | 321 | // Synchronize between recurrent iterations - signal stage 322 | if (tid == 0 ) { 323 | syncIn[(bidy * gridDim.x + bidx)] = sequence_iteration + 1; 324 | } 325 | __threadfence(); 326 | 327 | // Zero the dot product accumulators 328 | #pragma unroll 329 | for (int j = 0; j < TILE_HEIGHT; j++) { 330 | #pragma unroll 331 | for (int i = 0; i < TILE_WIDTH; i++) { 332 | outputs_reg[j][i] = 0.0f; 333 | } 334 | } 335 | 336 | // Read precomputed value from memory (Since this is a read-only operation that does ot 337 | // use a shared intermediate, this can go before the memory barrier without correctness issues 338 | // Ideally, this will hide some latency, but needs profiling 339 | if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH < HIDDEN_SIZE * LSTM_GATES) 340 | && work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 341 | precompute = precomputed_inputs[bidy * TILE_HEIGHT * HIDDEN_SIZE * LSTM_GATES + 342 | bidx * TILE_WIDTH * NUM_GROUPS + 343 | wg_id * TILE_WIDTH + 344 | work_group.thread_rank() % TILE_WIDTH + 345 | (work_group.thread_rank() / TILE_WIDTH) * HIDDEN_SIZE * LSTM_GATES + 346 | (sequence_iteration + 1) * BATCH_SIZE * HIDDEN_SIZE * LSTM_GATES]; 347 | 348 | } 349 | 350 | // Synchronize between recurrent iterations - spin stage 351 | if (bidx == 0) { 352 | if (tid < gridDim.x) { 353 | while (syncIn[(bidy * gridDim.x + tid)] != sequence_iteration + 1) { 354 | } 355 | } 356 | 357 | __syncthreads(); 358 | 359 | if (tid == 0) { 360 | syncOut[bidy] = sequence_iteration + 1; 361 | } 362 | } else { 363 | if (tid == 0) { 364 | while (syncOut[bidy] != sequence_iteration + 1) { 365 | } 366 | } 367 | __syncthreads(); 368 | } 369 | 370 | // Load the hidden state into the input buffer in shared memory (coalesced) 371 | // Tile height * REDUCTION_WIDTH * SEQUENTIAL_LENGTH is equivalent to the tile height by hidden size 372 | // Reduction_width * TILE_WIDTH * LSTM_GATES is the number of threads launched (allows for loop unrolling) 373 | #pragma unroll 374 | for (int i = 0; i < TILE_HEIGHT; i++) { 375 | if (i + bidy * TILE_HEIGHT < BATCH_SIZE) { 376 | #pragma unroll 377 | for (int j = 0; j < HIDDEN_SIZE; j += NUM_GROUPS * GROUP_THREADS) { 378 | if (j + tid < HIDDEN_SIZE) { 379 | hidden_tile[i][j+tid] = output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + i) * HIDDEN_SIZE + j + tid]; 380 | } else if (j + tid < BUFFER_SIZE) { 381 | hidden_tile[i][j+tid] = 0.0f; 382 | } 383 | } 384 | } 385 | } 386 | 387 | // Enforce loading of data to shared memory before computation 388 | __syncthreads(); 389 | 390 | } 391 | } 392 | 393 | template 394 | void process_input_weights(T * output, std::vector weights, uint32_t input_size, uint32_t hidden_size) { 395 | 396 | // Outside loop is the input size 397 | for (uint32_t j = 0; j < input_size; j++) { 398 | // Width of the input weight matrix 399 | for (uint32_t k = 0; k < hidden_size; k++) { 400 | // Colocate the weights for each element 401 | for (uint32_t i = 0; i < LSTM_GATES; i++) { 402 | output[(j * hidden_size + k) * LSTM_GATES + i] = weights.at(i)[j * hidden_size + k]; 403 | } 404 | } 405 | } 406 | } 407 | 408 | 
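// process_hidden_weights (below) packs the recurrent weights gate-contiguously per output
// element and transposes them from the host layout, so each global gate index owns a
// contiguous HIDDEN_SIZE-long weight row that lstm_rnn can stream into registers with
// coalesced loads.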
template 409 | void process_hidden_weights(T * output, std::vector weights, uint32_t hidden_size) { 410 | 411 | // For each output element 412 | for (uint32_t j = 0; j < hidden_size; j++) { 413 | // For each gate 414 | for (uint32_t k = 0; k < LSTM_GATES; k++) { 415 | // For each element for that gate 416 | for (uint32_t i = 0; i < hidden_size; i++) { 417 | output[j * LSTM_GATES * hidden_size + k * hidden_size + i] = weights.at(4 + k)[i * hidden_size + j]; 418 | } 419 | } 420 | } 421 | } 422 | 423 | template 424 | void process_biases(T * output, std::vector weights, uint32_t hidden_size) { 425 | 426 | // For each output element 427 | for (uint32_t k = 0; k < hidden_size; k++) { 428 | // Colocate the biases for each element 429 | for (uint32_t i = 0; i < LSTM_GATES; i++) { 430 | output[k * LSTM_GATES + i] = weights.at(i + 8)[k]; 431 | } 432 | } 433 | } 434 | 435 | // Initialize all layer weights and send to GPU 436 | template 437 | uint32_t LSTMLayer::initialize() { 438 | 439 | uint32_t input_footprint = input_weight_footprint(); 440 | uint32_t hidden_footprint = hidden_weight_footprint(); 441 | uint32_t bias_footprint = bias_weight_footprint(); 442 | 443 | // Weight buffer allocations 444 | cudaHostAlloc((void **) &(this->packed_input_weights), input_footprint, cudaHostAllocDefault); CUDA_ERR; 445 | cudaHostAlloc((void **) &(this->packed_hidden_weights), hidden_footprint, cudaHostAllocDefault); CUDA_ERR; 446 | cudaHostAlloc((void **) &(this->packed_biases), bias_footprint, cudaHostAllocDefault); CUDA_ERR; 447 | cudaMalloc((void **) &(this->packed_input_weights_gpu), input_footprint); CUDA_ERR; 448 | cudaMalloc((void **) &(this->packed_hidden_weights_gpu), hidden_footprint); CUDA_ERR; 449 | cudaMalloc((void **) &(this->packed_biases_gpu), bias_footprint); CUDA_ERR; 450 | 451 | // Reorganize weights (typically a transpose) 452 | process_input_weights(this->packed_input_weights, this->host_weights, this->input_size, this->hidden_size); 453 | process_hidden_weights(this->packed_hidden_weights, this->host_weights, this->hidden_size); 454 | process_biases(this->packed_biases, this->host_weights, this->hidden_size); 455 | 456 | // Send weights to GPU 457 | cudaMemcpy(this->packed_input_weights_gpu, this->packed_input_weights, input_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 458 | cudaMemcpy(this->packed_hidden_weights_gpu, this->packed_hidden_weights, hidden_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 459 | cudaMemcpy(this->packed_biases_gpu, this->packed_biases, bias_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 460 | 461 | return 0; 462 | 463 | } 464 | 465 | // Free all allocated buffers. Only needed for full sweep benchmarking. 466 | template 467 | void LSTMLayer::reset() { 468 | cudaFreeHost((void *) this->packed_input_weights); 469 | cudaFreeHost((void *) this->packed_hidden_weights); 470 | cudaFreeHost((void *) this->packed_biases); 471 | cudaFree((void *) this->packed_input_weights_gpu); 472 | cudaFree((void *) this->packed_hidden_weights_gpu); 473 | cudaFree((void *) this->packed_biases_gpu); 474 | } 475 | 476 | // Allocate input/output buffers for the layer. 
Currently set up for only single layer models, but can be extended to multi-layer 477 | // without dramatic refactoring 478 | template 479 | uint32_t LSTMModel::initialize() { 480 | 481 | for (auto& l: this->layers) { 482 | uint32_t debug = l.initialize(); 483 | if (debug != 0) { 484 | std::cout << "FAILURE\n"; 485 | return debug; 486 | } 487 | } 488 | 489 | this->gpu_weights_input = this->layers[0].get_packed_input_weights_gpu(); 490 | this->gpu_weights_hidden = this->layers[0].get_packed_hidden_weights_gpu(); 491 | this->gpu_biases = this->layers[0].get_packed_biases_gpu(); 492 | this->mm_k = this->initial_input_size; 493 | this->mm_n = this->output_size * LSTM_GATES; 494 | 495 | // Output allocations, assumes sequence length less than 100 496 | cudaHostAlloc((void **) &(this->host_output), this->output_size * this->batch_size * sizeof(T) * 100, cudaHostAllocDefault); 497 | cudaMalloc((void **) &(this->gpu_output), this->output_size * this->batch_size * sizeof(T) * 100); 498 | 499 | // Input allocations, assumes sequence length less than 100 500 | cudaMalloc((void **) &(this->gpu_inputs), this->initial_input_size * this->batch_size * 100 * sizeof(T)); 501 | cudaMalloc((void **) &(this->gpu_precompute), this->output_size * this->batch_size * LSTM_GATES * 100 * sizeof(T)); 502 | 503 | // Initialize hidden state, for our purposes we use 0's 504 | cudaMalloc((void **) &(this->gpu_hidden_initializer), this->output_size * this->batch_size * sizeof(T)); 505 | cudaMemset((void *)this->gpu_hidden_initializer, 0, this->output_size * this->batch_size * sizeof(T)); 506 | 507 | // Synchronization buffers. Always allocated to full dimensionality so that they may be easily reused from run to run 508 | cudaMalloc((void **) &(this->gpu_syncIn), 80 * sizeof(int) * LINE_SIZE); 509 | cudaMalloc((void **) &(this->gpu_syncOut), 80 * sizeof(int) * LINE_SIZE); 510 | 511 | // GEMM Kernel parameters 512 | this->paramsMM[0] = (void*) &(this->gpu_inputs); 513 | this->paramsMM[1] = (void*) &(this->gpu_weights_input); 514 | this->paramsMM[2] = (void*) &(this->gpu_precompute); 515 | this->paramsMM[4] = (void*) &(this->mm_k); 516 | this->paramsMM[5] = (void*) &(this->mm_n); 517 | 518 | // LSTM Kernel parameters 519 | this->paramsLSTM[0] = (void*) &(this->gpu_precompute); 520 | this->paramsLSTM[1] = (void*) &(this->gpu_hidden_initializer); 521 | this->paramsLSTM[2] = (void*) &(this->gpu_weights_hidden); 522 | this->paramsLSTM[3] = (void*) &(this->gpu_biases); 523 | this->paramsLSTM[4] = (void*) &(this->gpu_output); 524 | this->paramsLSTM[5] = (void*) &(this->gpu_syncIn); 525 | this->paramsLSTM[6] = (void*) &(this->gpu_syncOut); 526 | 527 | return 0; 528 | } 529 | 530 | // Frees model buffers 531 | template 532 | void LSTMModel::reset() { 533 | 534 | for (auto& l: this->layers) { 535 | l.reset(); 536 | } 537 | 538 | cudaFreeHost((void *) this->host_output); 539 | cudaFree((void *) this->gpu_output); 540 | 541 | cudaFree((void *) this->gpu_inputs); 542 | cudaFree((void *) this->gpu_precompute); 543 | } 544 | 545 | // Defines tiling configuration (should be encapsulated elsewhere in the future) 546 | template 547 | void LSTMModel::set_configuration(int x, int y, int g, int t) { 548 | this->tile_width = x; 549 | this->tile_height = y; 550 | this->num_groups = g; 551 | this->group_threads = t; 552 | } 553 | 554 | // Processes input sequence (both independent and dependent) 555 | template 556 | float LSTMModel::run_input(T* input, uint32_t * length) { 557 | 558 | // Define remaining kernel parameters (primarily dependent on 
sequence length) 559 | this->mm_m = this->batch_size * *length; 560 | this->paramsMM[3] = (void *) &(this->mm_m); 561 | this->paramsLSTM[7] = (void *) length; 562 | 563 | // GEMM Kernel dimensioning 564 | dim3 mm_grid = dim3((this->mm_n + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (this->mm_m + MM_TILE_SIZE - 1) / MM_TILE_SIZE); 565 | dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE); 566 | size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float); 567 | 568 | // LSTM Kernel dimensioning 569 | int effective_w = (this->num_groups * this->tile_width) / LSTM_GATES; 570 | dim3 lstm_rnn_grid = dim3((this->output_size + effective_w - 1) / effective_w, (this->batch_size + this->tile_height - 1) / this->tile_height); 571 | dim3 lstm_rnn_block = dim3(this->num_groups * this->group_threads); 572 | unsigned block_size = lstm_rnn_block.x; 573 | unsigned grid_size = lstm_rnn_grid.x * lstm_rnn_grid.y; 574 | 575 | // Kernel instantiation (currently configured for manual application of parameters) 576 | void * kernel = (void*)lstm_rnn<256, 2, 4, 64, 8, 40>; 577 | int numBlocks = 0; 578 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, block_size, 0); 579 | 580 | // Check occupancy prior to launch to prevent program hangs 581 | if (numBlocks == 0 || grid_size > 80) { 582 | printf("numBlocks: %2d grid_size: %3d, block_size: %3d\n", numBlocks, grid_size, block_size); 583 | return -std::numeric_limits::infinity(); 584 | } 585 | 586 | cudaEvent_t start, end; 587 | float elapsed; 588 | 589 | cudaMemcpy(this->gpu_inputs, input, this->initial_input_size * this->batch_size * *length * sizeof(T), cudaMemcpyHostToDevice); 590 | 591 | // Timing info 592 | cudaEventCreate(&start); 593 | cudaEventCreate(&end); 594 | cudaEventRecord(start); 595 | 596 | // Kernel launches 597 | cudaLaunchKernel((void *)matmul, mm_grid, mm_block, this->paramsMM, mm_sm_requirement); 598 | cudaLaunchKernel(kernel, lstm_rnn_grid, lstm_rnn_block, this->paramsLSTM); 599 | 600 | cudaEventRecord(end); 601 | cudaEventSynchronize(end); 602 | cudaEventElapsedTime(&elapsed, start, end); 603 | 604 | cudaMemcpy(this->host_output, this->gpu_output, this->output_size * this->batch_size * sizeof(T), cudaMemcpyDeviceToHost); 605 | 606 | #ifdef DEBUG 607 | // Value checking 608 | for (int i = 0; i < this->batch_size; i++) { 609 | printf("Sequence %2d\n", i); 610 | for (int j = 0; j < this->output_size; j++) { 611 | printf("%f ", this->host_output[i * this->output_size + j]); 612 | } 613 | printf("\n"); 614 | } 615 | printf("\n"); 616 | #endif 617 | 618 | // Check for runtime errors 619 | cudaError_t err; 620 | cudaDeviceSynchronize(); 621 | if ((err = cudaGetLastError()) != cudaSuccess) { 622 | printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); 623 | return std::numeric_limits::infinity(); 624 | } 625 | 626 | return elapsed; 627 | } 628 | 629 | // Explicit template instantiations 630 | template void process_input_weights(float *, std::vector, uint32_t, uint32_t); 631 | template void process_hidden_weights(float *, std::vector, uint32_t); 632 | template void process_biases(float *, std::vector, uint32_t); 633 | template uint32_t LSTMLayer::initialize(); 634 | template uint32_t LSTMModel::initialize(); 635 | template void LSTMModel::set_configuration(int, int, int, int); 636 | template float LSTMModel::run_input(float *, uint32_t *); 637 | template void LSTMModel::reset(); 638 | template void LSTMLayer::reset(); 639 | 
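The recurrent kernels in this repository synchronize all thread blocks between timesteps through the syncIn / syncOut buffers set up above, rather than relaunching a kernel per timestep. The stand-alone sketch below distills that signal/spin pattern as the GRU kernels later in the dump spell it out; the kernel name barrier_sketch and the empty loop body are illustrative only, and it assumes the flag buffers start zeroed and that blockDim.x >= gridDim.x so the first block of each row can poll every flag.

__global__ void barrier_sketch(volatile int * syncIn, volatile int * syncOut, int length) {
    int tid = threadIdx.x;
    int bidx = blockIdx.x;
    int bidy = blockIdx.y;

    for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) {

        // ... per-timestep work for this block would go here ...

        // Signal stage: every block publishes the timestep it has finished
        if (tid == 0) {
            syncIn[bidy * gridDim.x + bidx] = sequence_iteration + 1;
        }
        __threadfence();

        // Spin stage: block 0 of each row gathers all signals for its row, then releases the row
        if (bidx == 0) {
            if (tid < gridDim.x) {
                while (syncIn[bidy * gridDim.x + tid] != sequence_iteration + 1) { }
            }
            __syncthreads();
            if (tid == 0) {
                syncOut[bidy] = sequence_iteration + 1;
            }
        } else {
            if (tid == 0) {
                while (syncOut[bidy] != sequence_iteration + 1) { }
            }
            __syncthreads();
        }
    }
}

Because blocks spin on one another, the whole grid must be resident on the GPU at once, which is why run_input checks cudaOccupancyMaxActiveBlocksPerMultiprocessor and the grid size before launching and bails out instead of hanging.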
-------------------------------------------------------------------------------- /src/GRU_single.cu: -------------------------------------------------------------------------------- 1 | #include "GRU_single.h" 2 | #include "misc.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace cooperative_groups; 12 | 13 | __device__ __forceinline__ float sigmoidf(float x) { 14 | return 1.0f / (1.0f + expf(-1.0f * x)); 15 | } 16 | 17 | #define MM_BLOCK_SIZE 16 18 | #define MM_REG_TILE 4 19 | #define MM_TILE_SIZE 64 20 | 21 | // This is a mostly optimized kernel for matrix multiplication 22 | // The kernel uses a two tiered tiling mechanism that first tiles large 23 | // tiles from global memory to shared memory. This shared memory tile is 24 | // then used as the source to stream data into register arrays that perform 25 | // a calculation on a 8x8 tile. 26 | 27 | __global__ void matmul(float * A, float * B, float * C, 28 | uint32_t M, uint32_t K, uint32_t N) { 29 | 30 | extern __shared__ float base[]; 31 | float* bufferA = base; 32 | float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE]; 33 | 34 | float regA[MM_REG_TILE]; 35 | float regB[MM_REG_TILE]; 36 | float regC[MM_REG_TILE][MM_REG_TILE]; 37 | 38 | uint32_t tidx = threadIdx.x; 39 | uint32_t tidy = threadIdx.y; 40 | uint32_t id = threadIdx.y * blockDim.x + threadIdx.x; 41 | uint32_t bidx = blockIdx.x; 42 | uint32_t bidy = blockIdx.y; 43 | 44 | // Number of rows that are traversed in a single fully coalesced load sequence 45 | constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE); 46 | constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE; 47 | 48 | // Zero the intermediate output 49 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 50 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 51 | regC[y][x] = 0.0f; 52 | } 53 | } 54 | 55 | for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) { 56 | 57 | // Load lhs tile from global memory to shared memory (fully coalesced) 58 | #pragma unroll 59 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 60 | uint32_t index = j * NUM_THREADS + id; 61 | if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) { 62 | bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE]; 63 | } else { 64 | bufferA[index] = 0.0f; 65 | } 66 | } 67 | 68 | // Not necessary for correctness, but improves performance by avoiding thrashing shared memory 69 | __syncthreads(); 70 | 71 | // Load rhs tile from global memory to shared memory (fully coalesced) 72 | #pragma unroll 73 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 74 | uint32_t index = j * NUM_THREADS + id; 75 | if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) { 76 | bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE]; 77 | } else { 78 | bufferB[index] = 0.0f; 79 | } 80 | } 81 | 82 | // Ensures all data is written from global memory to shared memory before it is streamed 83 | // into register arrays. 
84 | __syncthreads(); 85 | 86 | // Loop through full tile 87 | for (uint32_t j = 0; j < MM_TILE_SIZE; j++) { 88 | 89 | // Load vector from lhs and rhs 90 | #pragma unroll 91 | for (uint32_t l = 0; l < MM_REG_TILE; l++) { 92 | regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j]; 93 | regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l]; 94 | } 95 | 96 | #pragma unroll 97 | // Perform a narrow matmul 98 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 99 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 100 | regC[y][x] += regA[y] * regB[x]; 101 | } 102 | } 103 | } 104 | 105 | __syncthreads(); 106 | } 107 | 108 | // Write register intermediates to shared memory (possibly unnecessary) 109 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 110 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 111 | bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x]; 112 | } 113 | } 114 | 115 | __syncthreads(); 116 | 117 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 118 | uint32_t index = j * NUM_THREADS + id; 119 | if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) { 120 | C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)] = bufferA[index]; 121 | } 122 | } 123 | } 124 | 125 | template 126 | __global__ void gru_rnn(const float* precomputed_inputs, 127 | const float* hidden_initializer, 128 | const float* weights_r, 129 | const float* weights_zh, 130 | const float* biases_r, 131 | const float* biases_zh, 132 | float* r_buf, 133 | float* output, 134 | volatile int* syncIn, 135 | volatile int* syncOut, 136 | uint32_t length) { 137 | // Indexing Helpers 138 | int tid = threadIdx.x; 139 | int bidx = blockIdx.x; 140 | int bidy = blockIdx.y; 141 | // Number of weights stored per tile width 142 | constexpr int LENGTH = (HIDDEN_SIZE + GROUP_THREADS - 1) / GROUP_THREADS; 143 | // Number of elements to reserve in shared memory for each output. 
Effectively 144 | // rounds up HIDDEN_SIZE to multiple of GROUP_THREADS 145 | constexpr int BUFFER_SIZE = LENGTH * GROUP_THREADS; 146 | // Number of elements horizontally produced by a single threadblock 147 | constexpr int OUTPUT_TILE_WIDTH = TILE_WIDTH * NUM_GROUPS / (GRU_GATES - 1); 148 | // Number of threads in a launched block 149 | constexpr int NUM_THREADS = NUM_GROUPS * GROUP_THREADS; 150 | // Number of outputs per tile row a single thread must compute for the partial sums of r values 151 | constexpr int ELEMS_PER_THREAD = (HIDDEN_SIZE + NUM_THREADS - 1) / NUM_THREADS; 152 | // Number of partial sums produced by the kernel for each input in the batch for the r gate 153 | constexpr int NUM_PARTIALS = (HIDDEN_SIZE + OUTPUT_TILE_WIDTH - 1) / OUTPUT_TILE_WIDTH; 154 | 155 | // Determines whether a group is the h gate or the z gate 156 | int g_type = 2 * tid / (NUM_THREADS); 157 | int wg_id = (tid % (NUM_THREADS / 2)) / GROUP_THREADS; 158 | 159 | // Shared memory workspaces 160 | __shared__ float h_tile[TILE_HEIGHT][BUFFER_SIZE]; 161 | __shared__ float r_tile[TILE_HEIGHT][BUFFER_SIZE]; 162 | __shared__ float z_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 163 | __shared__ float h_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 164 | 165 | // Tiled weights for z or h gates 166 | float weights_reg[TILE_WIDTH][LENGTH]; 167 | // Weight for the r gate 168 | float weights_reg_r[OUTPUT_TILE_WIDTH][ELEMS_PER_THREAD]; 169 | float outputs_reg[TILE_HEIGHT][TILE_WIDTH]; 170 | float bias = 0.f; 171 | float bias_r[ELEMS_PER_THREAD]; 172 | float precompute = 0.f; 173 | const float* precomputed_offset = precomputed_inputs; 174 | const float* precomputed_offset_r = precomputed_inputs + bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES; 175 | 176 | // Work group declaration 177 | thread_block bl = this_thread_block(); 178 | thread_block_tile work_group = tiled_partition(bl); 179 | 180 | // Stream appropriate weights for element_id and gate_id into the register file 181 | for (int i = 0; i < TILE_WIDTH; i++) { 182 | int group_id = bidx * OUTPUT_TILE_WIDTH + wg_id * TILE_WIDTH + i; 183 | 184 | if (group_id < HIDDEN_SIZE){ 185 | for (int j = 0; j < LENGTH; j++) { 186 | if ( j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) { 187 | weights_reg[i][j] = weights_zh[(group_id * (GRU_GATES - 1) + g_type) * HIDDEN_SIZE + j * GROUP_THREADS + work_group.thread_rank()]; 188 | } else { 189 | weights_reg[i][j] = 0.f; 190 | } 191 | } 192 | } 193 | } 194 | 195 | // Load biases and define time independent offsets 196 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 197 | int group_id = (bidx * OUTPUT_TILE_WIDTH + wg_id * TILE_WIDTH + work_group.thread_rank()) % TILE_WIDTH; 198 | int gate_id = (bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank()) % TILE_WIDTH; 199 | if (group_id < HIDDEN_SIZE) { 200 | bias = biases_zh[group_id * (GRU_GATES - 1) + g_type]; 201 | 202 | precomputed_offset += bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES; 203 | precomputed_offset += group_id * GRU_GATES; 204 | precomputed_offset += g_type + 1; 205 | precomputed_offset += (work_group.thread_rank() / TILE_WIDTH) * HIDDEN_SIZE; 206 | } else { 207 | bias = 0.f; 208 | } 209 | } 210 | 211 | // Stream weights for the r gate into the register file 212 | for (int j = 0; j < OUTPUT_TILE_WIDTH; j++) { 213 | for (int i = 0; i < ELEMS_PER_THREAD; i++) { 214 | if ( i * NUM_THREADS + tid < HIDDEN_SIZE) { 215 | weights_reg_r[j][i] = weights_r[bidx * OUTPUT_TILE_WIDTH * HIDDEN_SIZE + j * HIDDEN_SIZE + i * NUM_THREADS + tid]; 216 | } 
else { 217 | weights_reg_r[j][i] = 0.f; 218 | } 219 | } 220 | } 221 | 222 | // Stream biases for the r_gate into the register file 223 | for (int i = 0; i < ELEMS_PER_THREAD; i++) { 224 | if ( i * NUM_THREADS + tid < HIDDEN_SIZE) { 225 | bias_r[i] = biases_r[i * NUM_THREADS + tid]; 226 | } else { 227 | bias_r[i] = 0.f; 228 | } 229 | } 230 | 231 | // For the first iteration, load initial hidden state into the hidden tile. 232 | // This doesn't need to be repeated because once the recurrent pattern is established 233 | // the loop will populate the hidden_tile as the necessary outputs are produced. 234 | if ( tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT ) { 235 | int x = tid % OUTPUT_TILE_WIDTH; 236 | int y = tid / OUTPUT_TILE_WIDTH; 237 | h_tile[y][x] = hidden_initializer[(bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + bidx * OUTPUT_TILE_WIDTH + x]; 238 | } 239 | 240 | // Hidden state initialization 241 | #pragma unroll 242 | for (int j = 0; j < TILE_HEIGHT; j++) { 243 | #pragma unroll 244 | for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) { 245 | if ( i + tid < HIDDEN_SIZE) { 246 | h_tile[j][i + tid] = hidden_initializer[(bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid]; 247 | } else if (i + tid < BUFFER_SIZE) { 248 | h_tile[j][i + tid] = 0.f; 249 | } 250 | } 251 | } 252 | 253 | __syncthreads(); 254 | 255 | // Main recurrent loop 256 | for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) { 257 | 258 | // Produce partial dot products for the r gate 259 | for (int k = 0; k < TILE_HEIGHT; k++) { 260 | float r_dot_products[ELEMS_PER_THREAD]; 261 | 262 | // Zero initialize partials 263 | for (int i = 0; i < ELEMS_PER_THREAD; i++) { 264 | r_dot_products[i] = 0.f; 265 | } 266 | 267 | // Process hidden_tile elements, getting maximum reuse 268 | for (int j = 0; j < OUTPUT_TILE_WIDTH; j++) { 269 | float rhs = h_tile[k][bidx * OUTPUT_TILE_WIDTH + j]; 270 | for (int i = 0; i < ELEMS_PER_THREAD; i++) { 271 | r_dot_products[i] += weights_reg_r[j][i] * rhs; 272 | } 273 | } 274 | 275 | // Write to the global buffer 276 | for (int i = 0; i < ELEMS_PER_THREAD; i++) { 277 | if ( i * NUM_THREADS + tid < HIDDEN_SIZE) { 278 | r_buf[bidy * TILE_HEIGHT * NUM_PARTIALS * HIDDEN_SIZE + 279 | k * NUM_PARTIALS * HIDDEN_SIZE + 280 | bidx * HIDDEN_SIZE + 281 | i * NUM_THREADS + tid] = r_dot_products[i]; 282 | } 283 | } 284 | } 285 | 286 | // Synchronize between recurrent iterations - signal stage 287 | if (tid == 0) { 288 | syncIn[bidy * gridDim.x + bidx] = sequence_iteration + 1; 289 | } 290 | 291 | // Clear the output buffer 292 | for (int j = 0; j < TILE_HEIGHT; j++) { 293 | for (int i = 0; i < TILE_WIDTH; i++) { 294 | outputs_reg[j][i] = 0.f; 295 | } 296 | } 297 | 298 | // Populate time independent r value 299 | float precompute_r[TILE_HEIGHT][ELEMS_PER_THREAD]; 300 | for (int j = 0; j < TILE_HEIGHT; j++) { 301 | for (int i = 0; i < ELEMS_PER_THREAD; i++) { 302 | if ( i * NUM_THREADS + tid < HIDDEN_SIZE) { 303 | precompute_r[j][i] = precomputed_offset_r[j * HIDDEN_SIZE * GRU_GATES + i * NUM_THREADS + tid]; 304 | } 305 | } 306 | } 307 | precomputed_offset_r += HIDDEN_SIZE * BATCH_SIZE * GRU_GATES; 308 | 309 | // Populate the other time indepedent gate inputs 310 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 311 | precompute = *precomputed_offset; 312 | precomputed_offset += BATCH_SIZE * HIDDEN_SIZE * GRU_GATES; 313 | } 314 | 315 | // Synchronize between recurrent iterations - spin stage 316 | __threadfence(); 317 | 318 | if (bidx == 0) { 319 | if (tid < gridDim.x) { 
320 | while ( syncIn[bidy * gridDim.x + tid] != sequence_iteration + 1) { 321 | } 322 | } 323 | 324 | __syncthreads(); 325 | 326 | if (tid == 0) { 327 | syncOut[bidy] = sequence_iteration + 1; 328 | } 329 | } else { 330 | if (tid == 0) { 331 | while (syncOut[bidy] != sequence_iteration + 1) { 332 | } 333 | } 334 | __syncthreads(); 335 | } 336 | 337 | // Load r gate partial dot products 338 | float r[TILE_HEIGHT][ELEMS_PER_THREAD][NUM_PARTIALS]; 339 | for (int k = 0; k < TILE_HEIGHT; k++) { 340 | for (int i = 0; i < NUM_PARTIALS; i++) { 341 | for (int j = 0; j < ELEMS_PER_THREAD; j++) { 342 | if (j * NUM_THREADS + tid < HIDDEN_SIZE) { 343 | r[k][j][i] = r_buf[bidy * TILE_HEIGHT * NUM_PARTIALS * HIDDEN_SIZE + 344 | k * NUM_PARTIALS * HIDDEN_SIZE + 345 | j * NUM_THREADS + tid + 346 | i * HIDDEN_SIZE]; 347 | } 348 | } 349 | } 350 | } 351 | 352 | // Load h_t-1 into shared memory 353 | if (sequence_iteration != 0) { 354 | for (int j = 0; j < TILE_HEIGHT; j++) { 355 | for (int i = 0; i < HIDDEN_SIZE; i+= NUM_THREADS) { 356 | if (i + tid < HIDDEN_SIZE) { 357 | h_tile[j][i + tid] = output[(sequence_iteration - 1) * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid]; 358 | } 359 | } 360 | } 361 | } 362 | 363 | __syncthreads(); 364 | 365 | // Redundant calculate of r gate calculations (dot product, time independent, activation) and broadcast to shared memory 366 | for (int k = 0; k < TILE_HEIGHT; k++) { 367 | for (int j = 0; j < ELEMS_PER_THREAD; j++) { 368 | if (j * NUM_THREADS + tid < HIDDEN_SIZE) { 369 | float r_val = 0.f; 370 | for (int i = 0; i < NUM_PARTIALS; i++) { 371 | r_val += r[k][j][i]; 372 | } 373 | r_val += bias_r[j]; 374 | r_val += precompute_r[k][j]; 375 | r_val = sigmoidf(r_val); 376 | r_val = r_val * h_tile[k][j * NUM_THREADS + tid]; 377 | r_tile[k][j * NUM_THREADS + tid] = r_val; 378 | } 379 | } 380 | } 381 | 382 | __syncthreads(); 383 | 384 | // R gate computation finished, so gates z and h_cand now perform tiled matrix multiplication 385 | // Note separate codepaths because compiler would otherwise introduce divergence 386 | if (g_type == 0) { 387 | for (int k = 0; k < LENGTH; k++) { 388 | for (int j = 0; j < TILE_HEIGHT; j++) { 389 | float val = r_tile[j][k * GROUP_THREADS + work_group.thread_rank()]; 390 | for (int i = 0; i < TILE_WIDTH; i++) { 391 | outputs_reg[j][i] += weights_reg[i][k] * val; 392 | } 393 | } 394 | } 395 | } else { 396 | for (int k = 0; k < LENGTH; k++) { 397 | for (int j = 0; j < TILE_HEIGHT; j++) { 398 | float val = h_tile[j][k * GROUP_THREADS + work_group.thread_rank()]; 399 | for (int i = 0; i < TILE_WIDTH; i++) { 400 | outputs_reg[j][i] += weights_reg[i][k] * val; 401 | } 402 | } 403 | } 404 | } 405 | 406 | // Reduction 407 | for (int j = 0; j < TILE_HEIGHT; j++) { 408 | for (int i = 0; i < TILE_WIDTH; i++) { 409 | for (int k = 1; k < GROUP_THREADS; k *= 2) { 410 | outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k); 411 | } 412 | } 413 | } 414 | 415 | // Gate activations 416 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 417 | int x = work_group.thread_rank() % TILE_WIDTH; 418 | int y = work_group.thread_rank() / TILE_WIDTH; 419 | 420 | float val = outputs_reg[y][x] + precompute + bias; 421 | 422 | if (g_type == 0) { 423 | val = sigmoidf(val); 424 | z_gate[y][wg_id * TILE_WIDTH + x] = val; 425 | } else { 426 | val = tanh(val); 427 | h_gate[y][wg_id * TILE_WIDTH + x] = val; 428 | } 429 | } 430 | 431 | __syncthreads(); 432 | 433 | // Broadcast outputs 434 | if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) { 435 
| int x = tid % OUTPUT_TILE_WIDTH; 436 | int y = tid / OUTPUT_TILE_WIDTH; 437 | if (bidx * OUTPUT_TILE_WIDTH + x < HIDDEN_SIZE) { 438 | float z_val = z_gate[y][x]; 439 | float h_val = h_gate[y][x]; 440 | float h_old_val = h_tile[y][bidx * OUTPUT_TILE_WIDTH + x]; 441 | 442 | float out_val = (1 - z_val) * h_val + z_val * h_old_val; 443 | h_tile[y][bidx * OUTPUT_TILE_WIDTH + x] = out_val; 444 | output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + bidx * OUTPUT_TILE_WIDTH + x] = out_val; 445 | } 446 | } 447 | 448 | __syncthreads(); 449 | 450 | } 451 | } 452 | 453 | template 454 | void process_input_weights(T * output, std::vector weights, uint32_t input_size, uint32_t hidden_size) { 455 | 456 | // Outside loop is the input size 457 | for (uint32_t j = 0; j < input_size; j++) { 458 | // Width of the input weight matrix 459 | for (uint32_t k = 0; k < hidden_size; k++) { 460 | // Colocate the weights for each element 461 | for (uint32_t i = 0; i < GRU_GATES; i++) { 462 | output[j * hidden_size * GRU_GATES + k * GRU_GATES + i] = weights.at(i)[j * hidden_size + k]; 463 | } 464 | } 465 | } 466 | } 467 | 468 | template 469 | void process_hidden_weights(T * output, std::vector weights, uint32_t hidden_size) { 470 | 471 | // For each output element 472 | for (uint32_t j = 0; j < hidden_size; j++) { 473 | // For gates z and h 474 | for (uint32_t k = 0; k < GRU_GATES - 1; k++) { 475 | // For each element for that gate 476 | for (uint32_t i = 0; i < hidden_size; i++) { 477 | // Indices 4 and 5 correspond to the z and h weights 478 | output[j * (GRU_GATES - 1) * hidden_size + k * hidden_size + i] = weights.at(4 + k)[i * hidden_size + j]; 479 | } 480 | } 481 | } 482 | } 483 | 484 | template 485 | void process_biases(T * output, std::vector weights, uint32_t hidden_size) { 486 | int err = 0; 487 | // For each output element 488 | for (uint32_t k = 0; k < hidden_size; k++) { 489 | // Colocate the biases for each element 490 | for (uint32_t i = 0; i < GRU_GATES - 1; i++) { 491 | output[k * (GRU_GATES - 1) + i] = weights.at(i + 7)[k]; 492 | if (weights.at(i + 7)[k] != 0.5) err++; 493 | } 494 | } 495 | } 496 | 497 | // Free buffers (all tiling dimension dependent) 498 | template 499 | void GRULayerSingle::reset() { 500 | cudaFreeHost((void *) this->packed_input_weights); 501 | cudaFreeHost((void *) this->packed_hidden_weights); 502 | cudaFreeHost((void *) this->packed_biases); 503 | cudaFree((void *) this->packed_hidden_weights_r_gpu); 504 | cudaFree((void *) this->packed_biases_r_gpu); 505 | cudaFree((void *) this->packed_input_weights_gpu); 506 | cudaFree((void *) this->packed_hidden_weights_gpu); 507 | cudaFree((void *) this->packed_biases_gpu); 508 | } 509 | 510 | // Initialize and fill trained parameter buffers 511 | template 512 | uint32_t GRULayerSingle::initialize() { 513 | 514 | uint32_t input_footprint = input_weight_footprint(); 515 | uint32_t hidden_footprint = hidden_weight_footprint(); 516 | uint32_t hidden_r_footprint = hidden_weight_r_footprint(); 517 | uint32_t bias_footprint = bias_weight_footprint(); 518 | uint32_t bias_r_footprint = bias_weight_r_footprint(); 519 | 520 | // Allocate buffers 521 | cudaHostAlloc((void **) &(this->packed_input_weights), input_footprint, cudaHostAllocDefault); CUDA_ERR; 522 | cudaHostAlloc((void **) &(this->packed_hidden_weights), hidden_footprint, cudaHostAllocDefault); CUDA_ERR; 523 | cudaHostAlloc((void **) &(this->packed_biases), bias_footprint, cudaHostAllocDefault); CUDA_ERR; 524 | cudaMalloc((void **) 
&(this->packed_input_weights_gpu), input_footprint); CUDA_ERR; 525 | cudaMalloc((void **) &(this->packed_hidden_weights_gpu), hidden_footprint); CUDA_ERR; 526 | cudaMalloc((void **) &(this->packed_biases_gpu), bias_footprint); CUDA_ERR; 527 | cudaMalloc((void **) &(this->packed_hidden_weights_r_gpu), hidden_r_footprint); CUDA_ERR; 528 | cudaMalloc((void **) &(this->packed_biases_r_gpu), bias_r_footprint); CUDA_ERR; 529 | 530 | // Reorganize weights 531 | process_input_weights(this->packed_input_weights, this->host_weights, this->input_size, this->hidden_size); 532 | process_hidden_weights(this->packed_hidden_weights, this->host_weights, this->hidden_size); 533 | process_biases(this->packed_biases, this->host_weights, this->hidden_size); 534 | 535 | // Send to GPU 536 | cudaMemcpy(this->packed_input_weights_gpu, this->packed_input_weights, input_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 537 | cudaMemcpy(this->packed_hidden_weights_gpu, this->packed_hidden_weights, hidden_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 538 | cudaMemcpy(this->packed_biases_gpu, this->packed_biases, bias_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 539 | cudaMemcpy(this->packed_hidden_weights_r_gpu, this->host_weights.at(WEIGHTS_HIDDEN_R), hidden_r_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 540 | cudaMemcpy(this->packed_biases_r_gpu, this->host_weights.at(BIAS_R), bias_r_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 541 | 542 | return 0; 543 | } 544 | 545 | // Reset model parameters 546 | template 547 | void GRUModelSingle::reset() { 548 | 549 | for (auto& l: this->layers) { 550 | l.reset(); 551 | } 552 | 553 | cudaFreeHost((void *) this->host_output); 554 | cudaFree((void *) this->gpu_output); 555 | 556 | cudaFree((void *) this->gpu_r); 557 | cudaFree((void *) this->gpu_inputs); 558 | cudaFree((void *) this->gpu_precompute); 559 | cudaFree((void *) this->gpu_syncIn); 560 | cudaFree((void *) this->gpu_syncOut); 561 | } 562 | 563 | // Initialize model buffers 564 | template 565 | uint32_t GRUModelSingle::initialize() { 566 | 567 | for (auto& l: this->layers) { 568 | uint32_t debug = l.initialize(); 569 | if (debug != 0) { 570 | std::cout << "FAILURE\n"; 571 | return debug; 572 | } 573 | } 574 | 575 | this->gpu_weights_input = this->layers[0].get_packed_input_weights_gpu(); 576 | this->gpu_weights_hidden = this->layers[0].get_packed_hidden_weights_gpu(); 577 | this->gpu_biases = this->layers[0].get_packed_biases_gpu(); 578 | this->gpu_weights_hidden_r = this->layers[0].get_packed_hidden_weights_r_gpu(); 579 | this->gpu_biases_r = this->layers[0].get_packed_biases_r_gpu(); 580 | 581 | this->mm_k = this->initial_input_size; 582 | this->mm_n = this->output_size * GRU_GATES; 583 | this->num_partials = (this->output_size + this->tile_width - 1) / this->tile_width; 584 | 585 | // Single sized output buffer (Will change for multi-layer networks, one output per iteration networks) 586 | cudaHostAlloc((void **) &(this->host_output), this->output_size * this->batch_size * sizeof(T), cudaHostAllocDefault); 587 | cudaMalloc((void **) &(this->gpu_output), this->output_size * this->batch_size * sizeof(T)); 588 | 589 | // Assume batch size less than 200 590 | cudaMalloc((void **) &(this->gpu_inputs), this->initial_input_size * this->batch_size * 200 * sizeof(T)); 591 | cudaMalloc((void **) &(this->gpu_r), this->output_size * this->batch_size * this->num_partials * sizeof(T)); 592 | cudaMalloc((void **) &(this->gpu_precompute), this->output_size * this->batch_size * GRU_GATES * 200 * sizeof(T)); 593 | 594 | // Hidden 
state initializer allocation 595 | cudaMalloc((void **) &(this->gpu_hidden_initializer), this->output_size * this->batch_size * sizeof(T)); 596 | cudaMemset((void *)this->gpu_hidden_initializer, 0, this->output_size * this->batch_size * sizeof(T)); 597 | 598 | cudaMalloc((void **) &(this->gpu_syncIn), 80 * sizeof(int)); 599 | cudaMalloc((void **) &(this->gpu_syncOut), 80 * sizeof(int)); 600 | 601 | this->paramsMM[0] = (void*) &(this->gpu_inputs); 602 | this->paramsMM[1] = (void*) &(this->gpu_weights_input); 603 | this->paramsMM[2] = (void*) &(this->gpu_precompute); 604 | this->paramsMM[4] = (void*) &(this->mm_k); 605 | this->paramsMM[5] = (void*) &(this->mm_n); 606 | 607 | this->paramsGRU[0] = (void*) &(this->gpu_precompute); 608 | this->paramsGRU[1] = (void*) &(this->gpu_hidden_initializer); 609 | this->paramsGRU[2] = (void*) &(this->gpu_weights_hidden_r); 610 | this->paramsGRU[3] = (void*) &(this->gpu_weights_hidden); 611 | this->paramsGRU[4] = (void*) &(this->gpu_biases_r); 612 | this->paramsGRU[5] = (void*) &(this->gpu_biases); 613 | this->paramsGRU[6] = (void*) &(this->gpu_r); 614 | this->paramsGRU[7] = (void*) &(this->gpu_output); 615 | this->paramsGRU[8] = (void*) &(this->gpu_syncIn); 616 | this->paramsGRU[9] = (void*) &(this->gpu_syncOut); 617 | 618 | return 0; 619 | } 620 | 621 | // Define tiling configuration (should be encapsulated elsewhere) 622 | template 623 | void GRUModelSingle::set_configuration(int x, int y, int g, int t) { 624 | this->tile_width = x; 625 | this->tile_height = y; 626 | this->num_groups = g; 627 | this->group_threads = t; 628 | } 629 | 630 | // Process input sequence batch 631 | template 632 | float GRUModelSingle::run_input(T* input, uint32_t * length) { 633 | 634 | // Define remaining kernel parameters 635 | this->mm_m = this->batch_size * *length; 636 | this->paramsMM[3] = (void *) &(this->mm_m); 637 | this->paramsGRU[10] = (void *) length; 638 | 639 | // GEMM Kernel dimensioning 640 | dim3 mm_grid = dim3((this->mm_n + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (this->mm_m + MM_TILE_SIZE - 1) / MM_TILE_SIZE); 641 | dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE); 642 | size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float); 643 | 644 | // GRU Kernel dimensioning 645 | int effective_w = (this->tile_width * this->num_groups) / 2; 646 | dim3 gru_rnn_grid = dim3((this->output_size + effective_w - 1) / effective_w, (this->batch_size + this->tile_height - 1) / this->tile_height); 647 | // While there are three gates, we use just two work groups per output to satisfy the dependency 648 | dim3 gru_rnn_block = dim3(this->num_groups * this->group_threads); 649 | unsigned block_size = gru_rnn_block.x; 650 | unsigned grid_size = gru_rnn_grid.x * gru_rnn_grid.y; 651 | 652 | // GRU Kernel instantiation (currently only configured for manual tuning) 653 | void * kernel = (void *)gru_rnn<256, 3, 1, 32, 8, 10>; 654 | 655 | // Check occupancy to prevent hangs 656 | int numBlocks = 0; 657 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, block_size, 0); 658 | if (grid_size > 80 * numBlocks) { 659 | printf("grid_size: %3d numBlocks: %3d\n", grid_size, numBlocks); 660 | return -std::numeric_limits::infinity(); 661 | } 662 | 663 | cudaEvent_t start, end; 664 | float elapsed; 665 | 666 | // Send sequence 667 | cudaMemcpy(this->gpu_inputs, input, this->initial_input_size * this->batch_size * *length * sizeof(T), cudaMemcpyHostToDevice); 668 | 669 | // Timing 670 | cudaEventCreate(&start); 671 | cudaEventCreate(&end); 672 | cudaEventRecord(start); 
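// Note: both launches below go to the default stream, so the recurrent kernel only begins once
// the GEMM that fills gpu_precompute has completed, and the start/end events therefore time the
// GEMM and the recurrent kernel together.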
673 | 674 | // Kernel launches 675 | cudaLaunchKernel((void *)matmul, mm_grid, mm_block, this->paramsMM, mm_sm_requirement); 676 | cudaLaunchKernel(kernel, gru_rnn_grid, gru_rnn_block, this->paramsGRU); 677 | 678 | cudaEventRecord(end); 679 | cudaEventSynchronize(end); 680 | cudaEventElapsedTime(&elapsed, start, end); 681 | 682 | cudaMemcpy(this->host_output, this->gpu_output, this->output_size * this->batch_size * sizeof(T), cudaMemcpyDeviceToHost); 683 | 684 | #ifdef DEBUG 685 | // Value checking 686 | for (int i = 0; i < this->batch_size; i++) { 687 | printf("Sequence %2d\n", i); 688 | for (int j = 0; j < this->output_size; j++) { 689 | printf("%f ", this->host_output[i * this->output_size + j]); 690 | } 691 | printf("\n"); 692 | } 693 | printf("\n"); 694 | #endif 695 | 696 | // Runtime error checking 697 | cudaError_t err; 698 | cudaDeviceSynchronize(); 699 | if ((err = cudaGetLastError()) != cudaSuccess) { 700 | printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); 701 | return std::numeric_limits::infinity(); 702 | } 703 | 704 | return elapsed; 705 | } 706 | 707 | // Explicit template instantiations 708 | template void process_input_weights(float *, std::vector, uint32_t, uint32_t); 709 | template void process_hidden_weights(float *, std::vector, uint32_t); 710 | template void process_biases(float *, std::vector, uint32_t); 711 | template uint32_t GRULayerSingle::initialize(); 712 | template uint32_t GRUModelSingle::initialize(); 713 | template void GRULayerSingle::reset(); 714 | template void GRUModelSingle::reset(); 715 | template void GRUModelSingle::set_configuration(int, int, int, int); 716 | template float GRUModelSingle::run_input(float *, uint32_t *); 717 | -------------------------------------------------------------------------------- /src/GRU_double.cu: -------------------------------------------------------------------------------- 1 | #include "GRU_double.h" 2 | #include "misc.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace cooperative_groups; 12 | 13 | __device__ __forceinline__ float sigmoidf(float x) { 14 | return 1.0f / (1.0f + expf(-1.0f * x)); 15 | } 16 | 17 | #define MM_BLOCK_SIZE 16 18 | #define MM_REG_TILE 4 19 | #define MM_TILE_SIZE 64 20 | 21 | // This is a mostly optimized kernel for matrix multiplication 22 | // The kernel uses a two tiered tiling mechanism that first tiles large 23 | // tiles from global memory to shared memory. This shared memory tile is 24 | // then used as the source to stream data into register arrays that perform 25 | // a calculation on a 8x8 tile. 
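// Concretely, with the constants above: a 16x16 thread block (MM_BLOCK_SIZE x MM_BLOCK_SIZE)
// produces one 64x64 (MM_TILE_SIZE) output tile, each thread accumulating an
// MM_REG_TILE x MM_REG_TILE (4x4) register sub-tile, and the two shared-memory staging buffers
// occupy MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float) = 32 KiB of dynamic shared memory,
// which is the mm_sm_requirement passed at launch time.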
26 | 27 | __global__ void matmul(float * A, float * B, float * C, 28 | uint32_t M, uint32_t K, uint32_t N) { 29 | 30 | extern __shared__ float base[]; 31 | float* bufferA = base; 32 | float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE]; 33 | 34 | float regA[MM_REG_TILE]; 35 | float regB[MM_REG_TILE]; 36 | float regC[MM_REG_TILE][MM_REG_TILE]; 37 | 38 | uint32_t tidx = threadIdx.x; 39 | uint32_t tidy = threadIdx.y; 40 | uint32_t id = threadIdx.y * blockDim.x + threadIdx.x; 41 | uint32_t bidx = blockIdx.x; 42 | uint32_t bidy = blockIdx.y; 43 | 44 | // Number of rows that are traversed in a single fully coalesced load sequence 45 | constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE); 46 | constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE; 47 | 48 | // Zero the intermediate output 49 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 50 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 51 | regC[y][x] = 0.0f; 52 | } 53 | } 54 | 55 | for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) { 56 | 57 | // Load lhs tile from global memory to shared memory (fully coalesced) 58 | #pragma unroll 59 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 60 | uint32_t index = j * NUM_THREADS + id; 61 | if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) { 62 | bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE]; 63 | } else { 64 | bufferA[index] = 0.0f; 65 | } 66 | } 67 | 68 | // Not necessary for correctness, but improves performance by avoiding thrashing shared memory 69 | __syncthreads(); 70 | 71 | // Load rhs tile from global memory to shared memory (fully coalesced) 72 | #pragma unroll 73 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 74 | uint32_t index = j * NUM_THREADS + id; 75 | if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) { 76 | bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE]; 77 | } else { 78 | bufferB[index] = 0.0f; 79 | } 80 | } 81 | 82 | // Ensures all data is written from global memory to shared memory before it is streamed 83 | // into register arrays. 
84 | __syncthreads(); 85 | 86 | // Loop through full tile 87 | for (uint32_t j = 0; j < MM_TILE_SIZE; j++) { 88 | 89 | // Load vector from lhs and rhs 90 | #pragma unroll 91 | for (uint32_t l = 0; l < MM_REG_TILE; l++) { 92 | regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j]; 93 | regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l]; 94 | } 95 | 96 | #pragma unroll 97 | // Perform a narrow matmul 98 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 99 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 100 | regC[y][x] += regA[y] * regB[x]; 101 | } 102 | } 103 | } 104 | 105 | __syncthreads(); 106 | } 107 | 108 | // Write register intermediates to shared memory (possibly unnecessary) 109 | for (uint32_t y = 0; y < MM_REG_TILE; y++) { 110 | for (uint32_t x = 0; x < MM_REG_TILE; x++) { 111 | bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x]; 112 | } 113 | } 114 | 115 | __syncthreads(); 116 | 117 | for (uint32_t j = 0; j < LOAD_STEPS; j++) { 118 | uint32_t index = j * NUM_THREADS + id; 119 | if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) { 120 | C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)] = bufferA[index]; 121 | } 122 | } 123 | } 124 | 125 | // This kernel assumes the input multiplications were precomputed in a large matrix-matrix multiplication 126 | template 127 | __global__ void gru_rnn(const float* precomputed_inputs, 128 | const float* hidden_initializer, 129 | const float* weights, 130 | const float* biases, 131 | float* r, 132 | float* output, 133 | volatile int* syncIn, 134 | volatile int* syncOut, 135 | uint32_t length) { 136 | 137 | // Indexing helpers 138 | int tid = threadIdx.x; 139 | int bidx = blockIdx.x; 140 | int bidy = blockIdx.y; 141 | int wg_id = tid / GROUP_THREADS; 142 | int r_id = tid / (2 * GROUP_THREADS); 143 | 144 | // LENGTH - How many weights for each output does a single thread need to store 145 | constexpr int LENGTH = (HIDDEN_SIZE + GROUP_THREADS - 1) / GROUP_THREADS; 146 | // BUFFER_SIZE - Number of elements to reserve in shared memory for each outout. Effectively 147 | // rounds up HIDDEN_SIZE to the next multiple of NUM_THREADS 148 | constexpr int BUFFER_SIZE = LENGTH * GROUP_THREADS; 149 | // OUTPUT_TILE_WIDTH - How many full elements are produced by the threadblock. 
At scheduling time, 150 | // must ensure that the launched configuration produces full elements within a single threadblock 151 | constexpr int OUTPUT_TILE_WIDTH = NUM_GROUPS * TILE_WIDTH / (GRU_GATES - 1); 152 | 153 | 154 | // Static shared memory allocation 155 | __shared__ float buffer_tile[TILE_HEIGHT][BUFFER_SIZE]; 156 | __shared__ float z_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 157 | __shared__ float z_h_res[TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 158 | __shared__ float h_gate[2][TILE_HEIGHT][OUTPUT_TILE_WIDTH]; 159 | 160 | // Weights in the register file 161 | float weights_reg[TILE_WIDTH][LENGTH]; 162 | float h_weights_reg[TILE_WIDTH][LENGTH / 2]; 163 | float outputs_reg[TILE_HEIGHT][TILE_WIDTH]; 164 | float bias = 0.0f; 165 | float bias_h = 0.0f; 166 | float precompute = 0.0f; 167 | float precompute_h = 0.0f; 168 | const float * precomputed_offset; 169 | const float * precomputed_offset_h; 170 | 171 | // Cooperative group helpers 172 | thread_block bl = this_thread_block(); 173 | thread_block_tile<GROUP_THREADS> work_group = tiled_partition<GROUP_THREADS>(bl); 174 | 175 | // Load weights to register array for either z or h gate 176 | for (int i = 0; i < TILE_WIDTH; i++) { 177 | // Global gate id for fetching weights. 178 | // bidx * TILE_WIDTH * NUM_GROUPS -> the first gate index processed by the threadblock 179 | // wg_id * TILE_WIDTH -> the first gate index processed by a given warp within the threadblock 180 | // i -> current gate within the warp's assigned gates 181 | // These gate indices will only refer to gates r and z, not the h gate 182 | int gate_id = bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + i; 183 | // The following lines transform the assigned r/z gate into its real index within the weight array. 184 | // Since we don't assign one of the gates, we undo the indexing to get a valid output_element. 185 | // We then determine which gate within the output the current assigned gate is. 186 | int output_element = (gate_id / (GRU_GATES - 1)) * GRU_GATES; 187 | int gate_index = gate_id % (GRU_GATES - 1); 188 | // Prevent segfaults 189 | if (output_element < HIDDEN_SIZE) { 190 | // Zero-initialize rounded values. Better to have a single check now than on each recurrent iteration. 
191 | for (int j = 0; j < LENGTH; j++) { 192 | if ( j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) { 193 | weights_reg[i][j] = weights[(output_element * GRU_GATES + gate_index) * HIDDEN_SIZE + j * GROUP_THREADS + work_group.thread_rank()]; 194 | } else { 195 | weights_reg[i][j] = 0.f; 196 | } 197 | } 198 | } 199 | } 200 | 201 | // Load weights to register arrays for h gate (weight columns divided between two workgroups) 202 | for (int i = 0; i < TILE_WIDTH / 2; i++) { 203 | int output_element = bidx * NUM_GROUPS * TILE_WIDTH / 2 + r_id * TILE_WIDTH + i; 204 | int which_half = wg_id % 2; 205 | if (output_element < HIDDEN_SIZE) { 206 | for (int j = 0; j < LENGTH; j++) { 207 | if ( which_half * BUFFER_SIZE / 2 + j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) { 208 | h_weights_reg[i][j] = weights[output_element * GRU_GATES * HIDDEN_SIZE + 209 | (GRU_GATES - 1) * HIDDEN_SIZE + 210 | which_half * BUFFER_SIZE / 2 + 211 | j * GROUP_THREADS + work_group.thread_rank()]; 212 | } else { 213 | h_weights_reg[i][j] = 0.f; 214 | } 215 | } 216 | } 217 | } 218 | 219 | // Calculate indexing for time independent partial sums for r and z gates 220 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 221 | int x = work_group.thread_rank() % TILE_WIDTH; 222 | int y = work_group.thread_rank() / TILE_WIDTH; 223 | 224 | if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + x) < HIDDEN_SIZE * (GRU_GATES - 1) && (bidy * TILE_HEIGHT + y < BATCH_SIZE)) { 225 | int output_element = ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + x) / (GRU_GATES - 1)) * GRU_GATES; 226 | int gate_index = (bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + x) % (GRU_GATES - 1); 227 | bias = biases[output_element * GRU_GATES + gate_index]; 228 | 229 | precomputed_offset = precomputed_inputs; 230 | precomputed_offset += bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES; 231 | precomputed_offset += y * HIDDEN_SIZE * GRU_GATES; 232 | precomputed_offset += output_element * GRU_GATES; 233 | precomputed_offset += gate_index; 234 | precompute = *precomputed_offset; 235 | precomputed_offset += BATCH_SIZE * HIDDEN_SIZE * GRU_GATES; 236 | } 237 | 238 | } 239 | 240 | // Calculate indexing for time independent partial sums for h gate 241 | if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) { 242 | int x = tid % OUTPUT_TILE_WIDTH; 243 | int y = tid / OUTPUT_TILE_WIDTH; 244 | if ((bidx * OUTPUT_TILE_WIDTH + x < HIDDEN_SIZE) && (bidy * TILE_HEIGHT + y < BATCH_SIZE)) { 245 | bias_h = biases[(bidx * OUTPUT_TILE_WIDTH + x) * GRU_GATES + (GRU_GATES - 1)]; 246 | 247 | precomputed_offset_h = precomputed_inputs; 248 | precomputed_offset_h += (GRU_GATES - 1); 249 | precomputed_offset_h += bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES; 250 | precomputed_offset_h += y * HIDDEN_SIZE * GRU_GATES; 251 | precomputed_offset_h += (bidx * OUTPUT_TILE_WIDTH + x) * GRU_GATES; 252 | precompute_h = *precomputed_offset_h; 253 | precomputed_offset_h += BATCH_SIZE * HIDDEN_SIZE * GRU_GATES; 254 | } 255 | } 256 | 257 | // Zero the dot product accumulators 258 | #pragma unroll 259 | for (int j = 0; j < TILE_HEIGHT; j++) { 260 | #pragma unroll 261 | for (int i = 0; i < TILE_WIDTH; i++) { 262 | outputs_reg[i][j] = 0.f; 263 | } 264 | } 265 | 266 | // Initialize hidden state according to memory / zero rest of buffer 267 | #pragma unroll 268 | for (int j = 0; j < TILE_HEIGHT; j++) { 269 | #pragma unroll 270 | for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) { 271 | if ( i + tid < HIDDEN_SIZE) { 272 | buffer_tile[j][i + tid] = 
hidden_initializer[(bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid]; 273 | } else if (i + tid < BUFFER_SIZE) { 274 | buffer_tile[j][i + tid] = 0.f; 275 | } 276 | } 277 | } 278 | 279 | // Recurrent loop 280 | for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) { 281 | 282 | /* r and z gates */ 283 | 284 | // Dot product 285 | #pragma unroll 286 | for (int k = 0; k < LENGTH; k++) { 287 | #pragma unroll 288 | for (int j = 0; j < TILE_HEIGHT; j++) { 289 | float val = buffer_tile[j][k * GROUP_THREADS + work_group.thread_rank()]; 290 | #pragma unroll 291 | for (int i = 0; i < TILE_WIDTH; i++) { 292 | outputs_reg[j][i] += val * weights_reg[i][k]; 293 | } 294 | } 295 | } 296 | 297 | // Reduction 298 | #pragma unroll 299 | for (int j = 0; j < TILE_HEIGHT; j++) { 300 | #pragma unroll 301 | for (int i = 0; i < TILE_WIDTH; i++) { 302 | #pragma unroll 303 | for (int k = 1; k < GROUP_THREADS; k *= 2) { 304 | outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k); 305 | } 306 | } 307 | } 308 | 309 | // Activations and broadcast of r gate 310 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 311 | int reg_x = work_group.thread_rank() % TILE_WIDTH; 312 | int reg_y = work_group.thread_rank() / TILE_WIDTH; 313 | 314 | float val = outputs_reg[reg_y][reg_x] + bias + precompute; 315 | val = sigmoidf(val); 316 | 317 | int gate_id = (wg_id * TILE_WIDTH + reg_x) % (GRU_GATES - 1); 318 | int output_id = (bidx * NUM_GROUPS * TILE_WIDTH + wg_id * TILE_WIDTH + reg_x) / (GRU_GATES - 1); 319 | 320 | //r gate 321 | if (gate_id == 0) { 322 | val = val * buffer_tile[reg_y][output_id]; 323 | if (output_id < HIDDEN_SIZE) { 324 | r[(bidy * TILE_HEIGHT + reg_y) * HIDDEN_SIZE + output_id] = val; 325 | } 326 | } else { 327 | int smem_id = (wg_id * TILE_WIDTH + reg_x) / (GRU_GATES - 1); 328 | z_gate[reg_y][smem_id] = (1 - val); 329 | z_h_res[reg_y][smem_id] = val * buffer_tile[reg_y][output_id]; 330 | } 331 | } 332 | 333 | // Synchronize between r/z and h stages - signal stage 334 | if (tid == 0) { 335 | syncIn[bidy * gridDim.x + bidx] = 2 * sequence_iteration + 1; 336 | } 337 | 338 | __threadfence(); 339 | 340 | // Zero dot product accumulators 341 | #pragma unroll 342 | for (int j = 0; j < TILE_HEIGHT; j++) { 343 | #pragma unroll 344 | for (int i = 0; i < TILE_WIDTH; i++) { 345 | outputs_reg[i][j] = 0.f; 346 | } 347 | } 348 | 349 | // Synchronize between r/z and h stages - spin stage 350 | if (bidx == 0) { 351 | if (tid < gridDim.x) { 352 | while ( syncIn[bidy * gridDim.x + tid] != 2 * sequence_iteration + 1) { 353 | } 354 | } 355 | 356 | __syncthreads(); 357 | 358 | if (tid == 0) { 359 | syncOut[bidy] = 2 * sequence_iteration + 1; 360 | } 361 | } else { 362 | if (tid == 0) { 363 | while (syncOut[bidy] != 2 * sequence_iteration + 1) { 364 | } 365 | } 366 | __syncthreads(); 367 | } 368 | 369 | /* h gate */ 370 | 371 | // Load r gate intermediate 372 | #pragma unroll 373 | for (int j = 0; j < TILE_HEIGHT; j++) { 374 | #pragma unroll 375 | for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) { 376 | if ( i + tid < HIDDEN_SIZE ) { 377 | buffer_tile[j][i + tid] = r[bidy * TILE_HEIGHT * HIDDEN_SIZE + j * HIDDEN_SIZE + i + tid]; 378 | } else if ( i + tid < BUFFER_SIZE) { 379 | buffer_tile[j][i + tid] = 0.f; 380 | } 381 | } 382 | } 383 | 384 | __syncthreads(); 385 | 386 | int which_half = wg_id % 2; 387 | 388 | // Dot product 389 | #pragma unroll 390 | for (int k = 0; k < LENGTH / 2; k++) { 391 | #pragma unroll 392 | for (int j = 0; j < TILE_HEIGHT; j++) { 393 | float val 
= buffer_tile[j][which_half * LENGTH * GROUP_THREADS / 2 + k * GROUP_THREADS + work_group.thread_rank()]; 394 | #pragma unroll 395 | for (int i = 0; i < TILE_WIDTH; i++) { 396 | outputs_reg[j][i] += val * h_weights_reg[i][k]; 397 | } 398 | } 399 | } 400 | 401 | // Reduction 402 | #pragma unroll 403 | for (int j = 0; j < TILE_HEIGHT; j++) { 404 | #pragma unroll 405 | for (int i = 0; i < TILE_WIDTH; i++) { 406 | #pragma unroll 407 | for (int k = 1; k < GROUP_THREADS; k *= 2) { 408 | outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k); 409 | } 410 | } 411 | } 412 | 413 | // Broadcast to shared memory 414 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 415 | int x = work_group.thread_rank() % TILE_WIDTH; 416 | int y = work_group.thread_rank() / TILE_WIDTH; 417 | 418 | h_gate[which_half][y][r_id * TILE_WIDTH + x] = outputs_reg[y][x]; 419 | } 420 | 421 | __syncthreads(); 422 | 423 | // Activation and elementwise operations 424 | if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) { 425 | int y = tid / OUTPUT_TILE_WIDTH; 426 | int smem_x = tid % OUTPUT_TILE_WIDTH; 427 | int global_x = bidx * OUTPUT_TILE_WIDTH + smem_x; 428 | if (global_x < HIDDEN_SIZE) { 429 | float val = tanh(h_gate[0][y][smem_x] + h_gate[1][y][smem_x] + precompute_h + bias_h); 430 | 431 | output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + global_x] = z_h_res[y][smem_x] + z_gate[y][smem_x] * val; 432 | } 433 | } 434 | 435 | // Escape if at end of sequence length 436 | if (sequence_iteration + 1 == length) break; 437 | 438 | // Synchronize between recurrent iterations - signal stage 439 | if (tid == 0) { 440 | syncIn[bidy * gridDim.x + bidx] = 2 * sequence_iteration + 2; 441 | } 442 | __threadfence(); 443 | 444 | // Fetch time independent partial sums for the next timestep 445 | if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) { 446 | precompute = *precomputed_offset; 447 | precomputed_offset += HIDDEN_SIZE * BATCH_SIZE * GRU_GATES; 448 | } 449 | 450 | if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) { 451 | precompute_h = *precomputed_offset_h; 452 | precomputed_offset_h += HIDDEN_SIZE * BATCH_SIZE * GRU_GATES; 453 | } 454 | 455 | // Synchronize between recurrent iterations - spin stage 456 | if (bidx == 0) { 457 | if (tid < gridDim.x) { 458 | while ( syncIn[bidy * gridDim.x + tid] != 2 * sequence_iteration + 2) { 459 | } 460 | } 461 | 462 | __syncthreads(); 463 | 464 | if (tid == 0) { 465 | syncOut[bidy] = 2 * sequence_iteration + 2; 466 | } 467 | } else { 468 | if (tid == 0) { 469 | while (syncOut[bidy] != 2 * sequence_iteration + 2) { 470 | } 471 | } 472 | __syncthreads(); 473 | } 474 | 475 | // Load output from t - 1 to buffer 476 | #pragma unroll 477 | for (int j = 0; j < TILE_HEIGHT; j++) { 478 | #pragma unroll 479 | for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) { 480 | if ( i + tid < HIDDEN_SIZE ) { 481 | buffer_tile[j][i + tid] = output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + bidy * TILE_HEIGHT * HIDDEN_SIZE + j * HIDDEN_SIZE + i + tid]; 482 | } else if ( i + tid < BUFFER_SIZE) { 483 | buffer_tile[j][i + tid] = 0.f; 484 | } 485 | } 486 | } 487 | } 488 | } 489 | 490 | template 491 | void process_input_weights(T * output, std::vector weights, uint32_t input_size, uint32_t hidden_size) { 492 | 493 | // Outside loop is the input size 494 | for (uint32_t j = 0; j < input_size; j++) { 495 | // Width of the input weight matrix 496 | for (uint32_t k = 0; k < hidden_size; k++) { 497 | // Colocate the weights for each element 498 | for 
(uint32_t i = 0; i < GRU_GATES; i++) { 499 | output[(j * hidden_size + k) * GRU_GATES + i] = weights.at(i)[j * hidden_size + k]; 500 | } 501 | } 502 | } 503 | } 504 | 505 | template 506 | void process_hidden_weights(T * output, std::vector weights, uint32_t hidden_size) { 507 | 508 | // For each output element 509 | for (uint32_t j = 0; j < hidden_size; j++) { 510 | // For each gate 511 | for (uint32_t k = 0; k < GRU_GATES; k++) { 512 | // For each element for that gate 513 | for (uint32_t i = 0; i < hidden_size; i++) { 514 | output[j * GRU_GATES * hidden_size + k * hidden_size + i] = weights.at(3 + k)[i * hidden_size + j]; 515 | } 516 | } 517 | } 518 | } 519 | 520 | template 521 | void process_biases(T * output, std::vector weights, uint32_t hidden_size) { 522 | 523 | // For each output element 524 | for (uint32_t k = 0; k < hidden_size; k++) { 525 | // Colocate the biases for each element 526 | for (uint32_t i = 0; i < GRU_GATES; i++) { 527 | output[k * GRU_GATES + i] = weights.at(i + 6)[k]; 528 | } 529 | } 530 | } 531 | 532 | template 533 | void GRULayerDouble::reset() { 534 | cudaFreeHost((void *) this->packed_input_weights); 535 | cudaFreeHost((void *) this->packed_hidden_weights); 536 | cudaFreeHost((void *) this->packed_biases); 537 | cudaFree((void *) this->packed_input_weights_gpu); 538 | cudaFree((void *) this->packed_hidden_weights_gpu); 539 | cudaFree((void *) this->packed_biases_gpu); 540 | } 541 | 542 | // Initialize and fill buffers for trained parameters 543 | template 544 | uint32_t GRULayerDouble::initialize() { 545 | 546 | uint32_t input_footprint = input_weight_footprint(); 547 | uint32_t hidden_footprint = hidden_weight_footprint(); 548 | uint32_t bias_footprint = bias_weight_footprint(); 549 | 550 | // Allocate weights 551 | cudaHostAlloc((void **) &(this->packed_input_weights), input_footprint, cudaHostAllocDefault); CUDA_ERR; 552 | cudaHostAlloc((void **) &(this->packed_hidden_weights), hidden_footprint, cudaHostAllocDefault); CUDA_ERR; 553 | cudaHostAlloc((void **) &(this->packed_biases), bias_footprint, cudaHostAllocDefault); CUDA_ERR; 554 | cudaMalloc((void **) &(this->packed_input_weights_gpu), input_footprint); CUDA_ERR; 555 | cudaMalloc((void **) &(this->packed_hidden_weights_gpu), hidden_footprint); CUDA_ERR; 556 | cudaMalloc((void **) &(this->packed_biases_gpu), bias_footprint); CUDA_ERR; 557 | 558 | // Reorganize weights 559 | process_input_weights(this->packed_input_weights, this->host_weights, this->input_size, this->hidden_size); 560 | process_hidden_weights(this->packed_hidden_weights, this->host_weights, this->hidden_size); 561 | process_biases(this->packed_biases, this->host_weights, this->hidden_size); 562 | 563 | // Transfer weights 564 | cudaMemcpy(this->packed_input_weights_gpu, this->packed_input_weights, input_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 565 | cudaMemcpy(this->packed_hidden_weights_gpu, this->packed_hidden_weights, hidden_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 566 | cudaMemcpy(this->packed_biases_gpu, this->packed_biases, bias_footprint, cudaMemcpyHostToDevice); CUDA_ERR; 567 | 568 | return 0; 569 | } 570 | 571 | // Frees allocated memory 572 | template 573 | void GRUModelDouble::reset() { 574 | 575 | for (auto& l: this->layers) { 576 | l.reset(); 577 | } 578 | 579 | cudaFreeHost((void *) this->host_output); 580 | cudaFree((void *) this->gpu_output); 581 | 582 | cudaFree((void *) this->gpu_r); 583 | cudaFree((void *) this->gpu_inputs); 584 | cudaFree((void *) this->gpu_precompute); 585 | cudaFree((void *) 
this->gpu_syncIn); 586 | cudaFree((void *) this->gpu_syncOut); 587 | } 588 | 589 | // Allocates model buffers and initializes most kernel parameters 590 | template 591 | uint32_t GRUModelDouble::initialize() { 592 | 593 | for (auto& l: this->layers) { 594 | uint32_t debug = l.initialize(); 595 | if (debug != 0) { 596 | std::cout << "FAILURE\n"; 597 | return debug; 598 | } 599 | } 600 | 601 | this->gpu_weights_input = this->layers[0].get_packed_input_weights_gpu(); 602 | this->gpu_weights_hidden = this->layers[0].get_packed_hidden_weights_gpu(); 603 | this->gpu_biases = this->layers[0].get_packed_biases_gpu(); 604 | this->mm_k = this->initial_input_size; 605 | this->mm_n = this->output_size * GRU_GATES; 606 | 607 | // Output allocation, assume sequence length less than 200 608 | cudaHostAlloc((void **) &(this->host_output), this->output_size * this->batch_size * 200 * sizeof(T), cudaHostAllocDefault); 609 | cudaMalloc((void **) &(this->gpu_output), this->output_size * this->batch_size * 200 * sizeof(T)); 610 | 611 | // Input allocations, assume sequence length less than 200 612 | cudaMalloc((void **) &(this->gpu_inputs), this->initial_input_size * this->batch_size * 200 * sizeof(T)); 613 | cudaMalloc((void **) &(this->gpu_r), this->output_size * this->batch_size * sizeof(T)); 614 | cudaMalloc((void **) &(this->gpu_precompute), this->output_size * this->batch_size * GRU_GATES * 200 * sizeof(T)); 615 | 616 | // Hidden state initializer allocation 617 | cudaMalloc((void **) &(this->gpu_hidden_initializer), this->output_size * this->batch_size * sizeof(T)); 618 | cudaMemset((void *)this->gpu_hidden_initializer, 0, this->output_size * this->batch_size * sizeof(T)); 619 | 620 | // Synchronization buffer initialization 621 | cudaMalloc((void **) &(this->gpu_syncIn), 80 * sizeof(int)); 622 | cudaMalloc((void **) &(this->gpu_syncOut), 80 * sizeof(int)); 623 | 624 | //cudaFuncSetAttribute(gru_rnn, cudaFuncAttributeMaxDynamicSharedMemorySize, MAX_SMEM); CUDA_ERR; 625 | cudaDeviceSetLimit(cudaLimitStackSize, 0); CUDA_ERR; 626 | 627 | this->paramsMM[0] = (void*) &(this->gpu_inputs); 628 | this->paramsMM[1] = (void*) &(this->gpu_weights_input); 629 | this->paramsMM[2] = (void*) &(this->gpu_precompute); 630 | this->paramsMM[4] = (void*) &(this->mm_k); 631 | this->paramsMM[5] = (void*) &(this->mm_n); 632 | 633 | this->paramsGRU[0] = (void*) &(this->gpu_precompute); 634 | this->paramsGRU[1] = (void*) &(this->gpu_hidden_initializer); 635 | this->paramsGRU[2] = (void*) &(this->gpu_weights_hidden); 636 | this->paramsGRU[3] = (void*) &(this->gpu_biases); 637 | this->paramsGRU[4] = (void*) &(this->gpu_r); 638 | this->paramsGRU[5] = (void*) &(this->gpu_output); 639 | this->paramsGRU[6] = (void*) &(this->gpu_syncIn); 640 | this->paramsGRU[7] = (void*) &(this->gpu_syncOut); 641 | 642 | return 0; 643 | } 644 | 645 | // Set tiling parameters (should be encapsulated elsewhere) 646 | template 647 | void GRUModelDouble::set_configuration(int x, int y, int g, int t) { 648 | this->tile_width = x; 649 | this->tile_height = y; 650 | this->num_groups = g; 651 | this->group_threads = t; 652 | } 653 | 654 | // Process input sequence (both time dependent and independent 655 | template 656 | float GRUModelDouble::run_input(T* input, uint32_t * length) { 657 | 658 | // Initialize remaining kernel parameters 659 | this->mm_m = this->batch_size * *length; 660 | this->paramsMM[3] = (void *) &(this->mm_m); 661 | this->paramsGRU[8] = (void *) length; 662 | 663 | // GEMM Kernel Dimensioning 664 | dim3 mm_grid = dim3((this->mm_n + 
MM_TILE_SIZE - 1) / MM_TILE_SIZE, (this->mm_m + MM_TILE_SIZE - 1) / MM_TILE_SIZE); 665 | dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE); 666 | size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float); 667 | 668 | // GRU Double Kernel Dimensioning 669 | int effective_w = (this->num_groups * this->tile_width) / (GRU_GATES - 1); 670 | dim3 gru_rnn_grid = dim3((this->output_size + effective_w - 1) / effective_w, (this->batch_size + this->tile_height - 1) / this->tile_height); 671 | dim3 gru_rnn_block = dim3(this->num_groups * this->group_threads); 672 | unsigned block_size = gru_rnn_block.x; 673 | unsigned grid_size = gru_rnn_grid.x * gru_rnn_grid.y; 674 | 675 | // Kernel instantiation (currently only configured for manual tuning) 676 | void * kernel = (void *)gru_rnn<1024, 4, 5, 8, 32, 5>; 677 | 678 | // Check occupancy before running to prevent program hangs 679 | int numBlocks = 0; 680 | cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, block_size, 0); 681 | if (grid_size > 80 * numBlocks) { 682 | printf("grid_size: %3d numBlocks: %3d block_size: %3d\n", grid_size, numBlocks * 80, block_size); 683 | return -std::numeric_limits::infinity(); 684 | } 685 | 686 | cudaEvent_t start, end; 687 | float elapsed; 688 | 689 | // Send inputs 690 | cudaMemcpy(this->gpu_inputs, input, this->initial_input_size * this->batch_size * *length * sizeof(T), cudaMemcpyHostToDevice); 691 | 692 | // Timing 693 | cudaEventCreate(&start); 694 | cudaEventCreate(&end); 695 | cudaEventRecord(start); 696 | 697 | // Kernel launches 698 | cudaLaunchKernel((void *)matmul, mm_grid, mm_block, this->paramsMM, mm_sm_requirement); 699 | cudaLaunchKernel(kernel, gru_rnn_grid, gru_rnn_block, this->paramsGRU); 700 | 701 | cudaEventRecord(end); 702 | cudaEventSynchronize(end); 703 | cudaEventElapsedTime(&elapsed, start, end); 704 | 705 | cudaMemcpy(this->host_output, this->gpu_output, this->output_size * this->batch_size * sizeof(T), cudaMemcpyDeviceToHost); 706 | 707 | #ifdef DEBUG 708 | // Value checking 709 | for (int i = 0; i < this->batch_size; i++) { 710 | printf("Sequence %2d\n", i); 711 | for (int j = 0; j < this->output_size; j++) { 712 | printf("%f ", this->host_output[i * this->output_size + j]); 713 | } 714 | printf("\n"); 715 | } 716 | printf("\n"); 717 | #endif 718 | 719 | // Check for runtime errors 720 | cudaError_t err; 721 | cudaDeviceSynchronize(); 722 | if ((err = cudaGetLastError()) != cudaSuccess) { 723 | printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__); 724 | return std::numeric_limits::infinity(); 725 | } 726 | 727 | return elapsed; 728 | } 729 | 730 | // Explicit template instantiations 731 | template void process_input_weights(float *, std::vector, uint32_t, uint32_t); 732 | template void process_hidden_weights(float *, std::vector, uint32_t); 733 | template void process_biases(float *, std::vector, uint32_t); 734 | template uint32_t GRULayerDouble::initialize(); 735 | template uint32_t GRUModelDouble::initialize(); 736 | template void GRULayerDouble::reset(); 737 | template void GRUModelDouble::reset(); 738 | template void GRUModelDouble::set_configuration(int, int, int, int); 739 | template float GRUModelDouble::run_input(float *, uint32_t *); 740 | --------------------------------------------------------------------------------
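For reference, the recurrence that the GRU kernels above tile and parallelize is, in its standard scalar form: r and z gates computed from h_{t-1}, a candidate state computed from r * h_{t-1}, and the blend h_t = (1 - z) * h_cand + z * h_{t-1}. The sketch below is not part of the repository; gru_step_ref and the pre_* / U_* / b_* names are illustrative, with pre_* standing in for the per-gate, time-independent terms produced by the input GEMM (the gpu_precompute buffer) and U_* for the hidden-to-hidden weight matrices.

#include <cmath>
#include <cstddef>
#include <vector>

static inline float sigmoid_ref(float x) { return 1.0f / (1.0f + std::exp(-x)); }

// One GRU timestep on the CPU. h_prev and the pre_* / b_* vectors hold hidden_size entries;
// U_r, U_z, U_h are hidden_size x hidden_size, row-major.
std::vector<float> gru_step_ref(const std::vector<float>& h_prev,
                                const std::vector<float>& pre_r,
                                const std::vector<float>& pre_z,
                                const std::vector<float>& pre_h,
                                const std::vector<float>& U_r,
                                const std::vector<float>& U_z,
                                const std::vector<float>& U_h,
                                const std::vector<float>& b_r,
                                const std::vector<float>& b_z,
                                const std::vector<float>& b_h,
                                size_t hidden_size) {
    std::vector<float> r(hidden_size), h_new(hidden_size);

    // r gate: sigmoid of (hidden dot product + bias + precomputed input term), then scaled by
    // h_{t-1}; the kernels stage r-related intermediates through the gpu_r buffer.
    for (size_t i = 0; i < hidden_size; i++) {
        float acc = pre_r[i] + b_r[i];
        for (size_t j = 0; j < hidden_size; j++) {
            acc += U_r[i * hidden_size + j] * h_prev[j];
        }
        r[i] = sigmoid_ref(acc) * h_prev[i];
    }

    // z gate and candidate state, then the blend (1 - z) * h_cand + z * h_{t-1}.
    for (size_t i = 0; i < hidden_size; i++) {
        float z_acc = pre_z[i] + b_z[i];
        float c_acc = pre_h[i] + b_h[i];
        for (size_t j = 0; j < hidden_size; j++) {
            z_acc += U_z[i * hidden_size + j] * h_prev[j];
            c_acc += U_h[i * hidden_size + j] * r[j];   // candidate consumes r * h_{t-1}
        }
        float z = sigmoid_ref(z_acc);
        h_new[i] = (1.0f - z) * std::tanh(c_acc) + z * h_prev[i];
    }
    return h_new;
}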