├── .gitignore
├── src
    ├── Makefile
    ├── single_layer_LSTM.cpp
    ├── LSTM.h
    ├── GRU_double.h
    ├── single_layer_GRU_double.cpp
    ├── single_layer_GRU_single.cpp
    ├── GRU_single.h
    ├── misc.h
    ├── RNNBase.h
    ├── LSTM.cu
    ├── GRU_single.cu
    └── GRU_double.cu
├── README.md
├── License.txt
└── performance_model
    └── heuristic.py


/.gitignore:
--------------------------------------------------------------------------------
1 | src/bin/*
2 | __pycache__
3 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | # Compiler parameters
 2 | CXX := nvcc
 3 | CPPFLAGS := -O3 -std=c++11
 4 | CUDAFLAGS := -arch=compute_70 -code=sm_70 -D_FORCE_INLINES --ptxas-options='-v -warn-lmem-usage -warn-spills' --nvlink-options='-v'
 5 | DEBUGFLAGS := -D DEBUG
 6 | 
 7 | ### Regular compilation rules
 8 | bin/LSTM.o: LSTM.cu LSTM.h RNNBase.h misc.h
 9 | 	$(CXX) -c $< -o $@ $(CPPFLAGS) $(CUDAFLAGS)
10 | 
11 | LSTM: single_layer_LSTM.cpp LSTM.h RNNBase.h misc.h bin/LSTM.o
12 | 	$(CXX) $< bin/LSTM.o -o bin/$@ $(CPPFLAGS) $(CUDAFLAGS)
13 | 
14 | bin/GRU_single.o: GRU_single.cu GRU_single.h RNNBase.h misc.h
15 | 	$(CXX) -c $< -o $@ $(CPPFLAGS) $(CUDAFLAGS)
16 | 
17 | GRU_single: single_layer_GRU_single.cpp GRU_single.h RNNBase.h misc.h bin/GRU_single.o
18 | 	$(CXX) $< bin/GRU_single.o -o bin/$@ $(CPPFLAGS) $(CUDAFLAGS)
19 | 
20 | bin/GRU_double.o: GRU_double.cu GRU_double.h RNNBase.h misc.h
21 | 	$(CXX) -c $< -o $@ $(CPPFLAGS) $(CUDAFLAGS)
22 | 
23 | GRU_double: single_layer_GRU_double.cpp GRU_double.h RNNBase.h misc.h bin/GRU_double.o
24 | 	$(CXX) $< bin/GRU_double.o -o bin/$@ $(CPPFLAGS) $(CUDAFLAGS)
25 | 
26 | clean:
27 | 	rm bin/*
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # GRNN
 2 | ## Framework Structure
 3 | ### Inference Workflow
 4 | In GRNN, models are constructed individually by layer (See caveats in Design Problems) and then incorporated into a Model wrapper that holds collections of layers. Once the model has been defined with its given parameters, tiling parameters are passed to the model and the model is initialized. Layer initialization reorganizes the weight columns to improve locality of gate outputs, transposes the hidden state matrix, and sends these matrices to device memory. Model initialization allocates input, output, and intermediate buffers (this also feeds into design problems) as well as providing the known kernel parameters at this point.
 5 | At this point, inputs can be fed to the network. The model assumes a maximum sequence length (needs additional robustness) but otherwise is not constrained by the sequence of the provided batch. The batch elements are assumed to have the same sequence length.
 6 | 
 7 | ## Kernels
 8 | The kernels for all cells/types follow the same broad three step process:
 9 | 1. Buffer initialization - Initialize arrays in the register file and shared memory for the hidden/cell state and trained parameters.
10 | 2. Data Loading - Load weights and biases to register file, initialize shared cell/hidden states, and calculate offsets into the precompute array.
11 | 3. Recurrent Computation - varies based on cell structure.
12 | 
13 | ## Performance Model
14 | The performance model takes in model parameters, permutes the parameters to build the configuration space, prunes based on configuration feasibility, and then ranks based on the four-part performance model.
15 | 
16 | 


--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2019 Colorado School of Mines.  All rights reserved.
 2 | 
 3 | 
 4 | Developed by: Connor Holmes
 5 |               Colorado School of Mines
 6 |               cs.mines.edu
 7 | 
 8 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 9 | this software and associated documentation files (the "Software"), to deal with
10 | the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
12 | of the Software, and to permit persons to whom the Software is furnished to
13 | do so, subject to the following conditions:
14 | * Redistributions of source code must retain the above copyright notice,
15 |   this list of conditions and the following disclaimers.
16 | * Redistributions in binary form must reproduce the above copyright notice,
17 |   this list of conditions and the following disclaimers in the documentation
18 |   and/or other materials provided with the distribution.
19 | * Neither the names of Connor Holmes, Colorado School of Mines,
20 |   nor the names of its contributors may be used to endorse or promote products
21 |   derived from this Software without specific prior written permission.
22 | 
23 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
26 | CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
29 | SOFTWARE.
30 | 


--------------------------------------------------------------------------------
/src/single_layer_LSTM.cpp:
--------------------------------------------------------------------------------
 1 | // Runtime files
 2 | #include "LSTM.h"
 3 | #include "misc.h"
 4 | 
 5 | // Other includes
 6 | #include <iostream>
 7 | #include <getopt.h>
 8 | #include <stdlib.h>
 9 | #include <vector>
10 | #include <cassert>
11 | #include <cstdio>
12 | #include <cuda_profiler_api.h>
13 | 
14 | 
15 | int main(int argc, char** argv) {
16 | 
17 |   uint32_t input = 256;
18 |   uint32_t hidden = 256;
19 |   uint32_t batch = 40;
20 |   uint32_t x_tile_size = 2;
21 |   uint32_t y_tile_size = 4;
22 |   uint32_t num_groups = 64;
23 |   uint32_t reduction_width = 8;
24 |   uint32_t input_length = 100;
25 | 
26 |   std::vector<float *> weights;
27 |   create_dummy_weights_lstm(weights, input, hidden);
28 |  
29 |   // Create layer
30 |   LSTMLayer<float> layer = LSTMLayer<float>(input, hidden, batch, weights);
31 |   
32 |   // Declare model based on layer
33 |   LSTMModel<float> model = LSTMModel<float>( {layer} );
34 | 
35 |   model.set_configuration(x_tile_size, y_tile_size, num_groups, reduction_width);
36 |   model.initialize();
37 |   
38 |   float * testInput;
39 |   cudaHostAlloc((void **) &testInput, sizeof(float) * batch * input * input_length, cudaHostAllocDefault); CUDA_ERR;
40 | 
41 |   for (uint32_t i = 0; i < batch * input * input_length; i++) {
42 |     testInput[i] = (i / input) % batch;
43 |   }
44 | 
45 | #ifdef DEBUG
46 |   float temp = model.run_input(testInput, &input_length);
47 | #else
48 |   float time = 0.0f;
49 |   for (int i = 0; i < 1000; i++) {
50 |     float temp = model.run_input(testInput, &input_length);
51 |   }
52 |   cudaProfilerStart();
53 |   for (int i = 0; i < 1000; i++) {
54 |     float run_time = model.run_input(testInput, &input_length);
55 |     time += run_time;
56 |   }
57 |   cudaProfilerStop();
58 |   std::cout << time / 1000 << " ms\n";
59 | #endif
60 | 
61 |   return 0;
62 | }
63 | 
64 | 


--------------------------------------------------------------------------------
/src/LSTM.h:
--------------------------------------------------------------------------------
 1 | #ifndef LSTMBASE_H
 2 | #define LSTMBASE_H
 3 | 
 4 | #include "RNNBase.h"
 5 | 
 6 | #include <iostream>
 7 | 
 8 | #define WEIGHTS_INPUT_F 0
 9 | #define WEIGHTS_INPUT_I 1
10 | #define WEIGHTS_INPUT_C 2
11 | #define WEIGHTS_INPUT_O 3
12 | #define WEIGHTS_HIDDEN_F 4
13 | #define WEIGHTS_HIDDEN_I 5
14 | #define WEIGHTS_HIDDEN_C 6
15 | #define WEIGHTS_HIDDEN_O 7
16 | #define BIAS_F 8
17 | #define BIAS_I 9
18 | #define BIAS_C 10
19 | #define BIAS_O 11
20 | 
21 | #define LSTM_GATES 4
22 | 
23 | 
24 | template<typename T>
25 | class LSTMLayer : public RNNLayerBase<T> {
26 |   
27 |   private:
28 | 
29 |   public:
30 |     LSTMLayer(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector<T*> l) :
31 |       RNNLayerBase<T>(i_s, h_s, b_s, l) {}
32 | 
33 |     uint32_t initialize();
34 |     void reset();
35 | 
36 |     // Total footprint of the input weights (makes initialize code cleaner)
37 |     uint32_t input_weight_footprint() {
38 |       return this->input_size * LSTM_GATES * this->hidden_size * sizeof(T);
39 |     }
40 |     
41 |     // Excludes intermediaries, used for data copying
42 |     uint32_t hidden_weight_footprint() {
43 |       return this->hidden_size * LSTM_GATES * this->hidden_size * sizeof(T);
44 |     }
45 | 
46 |     // This function may need to be modified in order to avoid bank conflicts
47 |     uint32_t bias_weight_footprint() {
48 |       return this->hidden_size * LSTM_GATES * sizeof(T);
49 |     }
50 | 
51 | };
52 | 
53 | template<typename T>
54 | class LSTMModel : public RNNBase<LSTMLayer, T> {
55 | 
56 |   private:
57 |     // Kernel launch parameters
58 |     void* paramsLSTM[8];
59 |     
60 |   public:
61 |     LSTMModel(std::initializer_list< LSTMLayer<T> > l) :
62 |       RNNBase<LSTMLayer, T>(l) {}
63 | 
64 |     void set_configuration(int x, int y, int g, int t);
65 | 
66 |     uint32_t initialize();
67 |     void reset();
68 | 
69 |     float run_input(T* input, uint32_t * length);
70 | };
71 | 
72 | #endif
73 | 


--------------------------------------------------------------------------------
/src/GRU_double.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRUBASE_H
 2 | #define GRUBASE_H
 3 | 
 4 | #include "RNNBase.h"
 5 | 
 6 | #include <iostream>
 7 | 
 8 | #define WEIGHTS_INPUT_R 0
 9 | #define WEIGHTS_INPUT_Z 1
10 | #define WEIGHTS_INPUT_H 2
11 | #define WEIGHTS_HIDDEN_R 3
12 | #define WEIGHTS_HIDDEN_Z 4
13 | #define WEIGHTS_HIDDEN_H 5
14 | #define BIAS_R 6
15 | #define BIAS_Z 7
16 | #define BIAS_H 8
17 | 
18 | #define GRU_GATES 3
19 | 
20 | 
21 | template<typename T>
22 | class GRULayerDouble : public RNNLayerBase<T> {
23 |   
24 |   private:
25 | 
26 |   public:
27 |     GRULayerDouble(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector<T*> l) :
28 |       RNNLayerBase<T>(i_s, h_s, b_s, l) {}
29 | 
30 |     uint32_t initialize();
31 |     void reset();
32 | 
33 |     // Total footprint of the input weights (makes initialize code cleaner)
34 |     uint32_t input_weight_footprint() {
35 |       return this->input_size * GRU_GATES * this->hidden_size * sizeof(T);
36 |     }
37 |     
38 |     // Excludes intermediaries, used for data copying
39 |     uint32_t hidden_weight_footprint() {
40 |       return this->hidden_size * GRU_GATES * this->hidden_size * sizeof(T);
41 |     }
42 | 
43 |     // This function may need to be modified in order to avoid bank conflicts
44 |     uint32_t bias_weight_footprint() {
45 |       return this->hidden_size * GRU_GATES * sizeof(T);
46 |     }
47 | };
48 | 
49 | template<typename T>
50 | class GRUModelDouble : public RNNBase<GRULayerDouble, T> {
51 | 
52 |   private:
53 |     // Buffer for r intermediates
54 |     T * gpu_r;
55 |     
56 |     // GRU Kernel parameter buffer
57 |     void * paramsGRU[9];
58 |     
59 |   public:
60 |     GRUModelDouble(std::initializer_list< GRULayerDouble<T> > l) :
61 |       RNNBase<GRULayerDouble, T>(l) {}
62 | 
63 |     void set_configuration(int x, int y, int g, int t);
64 | 
65 |     uint32_t initialize();
66 |     void reset();
67 | 
68 |     float run_input(T* input, uint32_t * length);
69 | };
70 | 
71 | #endif
72 | 


--------------------------------------------------------------------------------
/src/single_layer_GRU_double.cpp:
--------------------------------------------------------------------------------
 1 | // Runtime files
 2 | #include "GRU_double.h"
 3 | #include "misc.h"
 4 | 
 5 | // Other includes
 6 | #include <iostream>
 7 | #include <getopt.h>
 8 | #include <stdlib.h>
 9 | #include <vector>
10 | #include <cassert>
11 | #include <cstdio>
12 | #include <cuda_profiler_api.h>
13 | 
14 | 
15 | int main(int argc, char** argv) {
16 | 
17 |   uint32_t input = 1024;
18 |   uint32_t hidden = 1024;
19 |   uint32_t x_tile_size = 4;
20 |   uint32_t y_tile_size = 5;
21 |   uint32_t num_groups = 8;
22 |   uint32_t group_threads = 32;
23 |   uint32_t batch = 5;
24 |   uint32_t input_length = 100;
25 | 
26 |   std::vector<float *> weights;
27 |   create_dummy_weights_gru(weights, input, hidden);
28 |  
29 |   // Create layer
30 |   GRULayerDouble<float> layer = GRULayerDouble<float>(input, hidden, batch, weights);
31 |   
32 |   // Declare model based on layer
33 |   GRUModelDouble<float> model = GRUModelDouble<float>( {layer} );
34 |   
35 |   // Simple checks
36 |   assert(input == model.get_initial_input_size());
37 |   assert(batch == model.get_batch_size());
38 |   assert(hidden == model.get_output_size());
39 |   
40 |   model.set_configuration(x_tile_size, y_tile_size, num_groups, group_threads);
41 |   model.initialize();
42 |   
43 |   float * testInput;
44 |   cudaHostAlloc((void **) &testInput, sizeof(float) * batch * input * input_length, cudaHostAllocDefault); CUDA_ERR;
45 | 
46 |   for (uint32_t i = 0; i < batch * input * input_length; i++) {
47 |     testInput[i] = 1.;
48 |   }
49 | 
50 | #ifdef DEBUG
51 |   float temp = model.run_input(testInput, &input_length);
52 | #else
53 |   float time = 0.0f;
54 |   for (int i = 0; i < 1000; i++) {
55 |     float temp = model.run_input(testInput, &input_length);
56 |   }
57 |   cudaProfilerStart();
58 |   for (int i = 0; i < 1000; i++) {
59 |     float run_time = model.run_input(testInput, &input_length);
60 |     time += run_time;
61 |   }
62 |   cudaProfilerStop();
63 |   std::cout << time / 1000 << " ms\n";
64 | #endif
65 | 
66 |   return 0;
67 | }
68 | 
69 | 


--------------------------------------------------------------------------------
/src/single_layer_GRU_single.cpp:
--------------------------------------------------------------------------------
 1 | // Runtime files
 2 | #include "GRU_single.h"
 3 | #include "misc.h"
 4 | 
 5 | // Other includes
 6 | #include <iostream>
 7 | #include <getopt.h>
 8 | #include <stdlib.h>
 9 | #include <vector>
10 | #include <cassert>
11 | #include <cstdio>
12 | #include <cuda_profiler_api.h>
13 | 
14 | 
15 | int main(int argc, char** argv) {
16 | 
17 |   uint32_t input = 256;
18 |   uint32_t hidden = 256;
19 |   uint32_t x_tile_size = 3;
20 |   uint32_t y_tile_size = 1;
21 |   uint32_t num_groups = 32;
22 |   uint32_t group_threads = 8;
23 |   uint32_t batch = 10;
24 |   uint32_t input_length = 100;
25 | 
26 |   std::vector<float *> weights;
27 |   create_dummy_weights_gru(weights, input, hidden);
28 |  
29 |   // Create layer
30 |   GRULayerSingle<float> layer = GRULayerSingle<float>(input, hidden, batch, weights);
31 |   
32 |   // Declare model based on layer
33 |   GRUModelSingle<float> model = GRUModelSingle<float>( {layer} );
34 |   
35 |   // Simple checks
36 |   assert(input == model.get_initial_input_size());
37 |   assert(batch == model.get_batch_size());
38 |   assert(hidden == model.get_output_size());
39 |   
40 |   model.set_configuration(x_tile_size, y_tile_size, num_groups, group_threads);
41 |   model.initialize();
42 |   
43 |   float * testInput;
44 |   cudaHostAlloc((void **) &testInput, sizeof(float) * batch * input * input_length, cudaHostAllocDefault); CUDA_ERR;
45 | 
46 |   for (uint32_t i = 0; i < batch * input * input_length; i++) {
47 |     testInput[i] = (float)(i % input) / (float)input;
48 |   }
49 | 
50 | #ifdef DEBUG
51 |   float temp = model.run_input(testInput, &input_length);
52 | #else
53 |   float time = 0.0f;
54 |   for (int i = 0; i < 1000; i++) {
55 |     float temp = model.run_input(testInput, &input_length);
56 |   }
57 |   cudaProfilerStart();
58 |   for (int i = 0; i < 1000; i++) {
59 |     float run_time = model.run_input(testInput, &input_length);
60 |     time += run_time;
61 |   }
62 |   cudaProfilerStop();
63 |   std::cout << time / 1000 << " ms\n";
64 | #endif
65 | 
66 |   return 0;
67 | }
68 | 
69 | 


--------------------------------------------------------------------------------
/src/GRU_single.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRUBASE_H
 2 | #define GRUBASE_H
 3 | 
 4 | #include "RNNBase.h"
 5 | 
 6 | #include <iostream>
 7 | 
 8 | #define WEIGHTS_INPUT_R 0
 9 | #define WEIGHTS_INPUT_Z 1
10 | #define WEIGHTS_INPUT_H 2
11 | #define WEIGHTS_HIDDEN_R 3
12 | #define WEIGHTS_HIDDEN_Z 4
13 | #define WEIGHTS_HIDDEN_H 5
14 | #define BIAS_R 6
15 | #define BIAS_Z 7
16 | #define BIAS_H 8
17 | 
18 | #define GRU_GATES 3
19 | 
20 | 
21 | template<typename T>
22 | class GRULayerSingle : public RNNLayerBase<T> {
23 |   
24 |   private:
25 |     T * packed_hidden_weights_r_gpu;
26 |     T * packed_biases_r_gpu;
27 | 
28 |   public:
29 |     GRULayerSingle(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector<T*> l) :
30 |       RNNLayerBase<T>(i_s, h_s, b_s, l) {}
31 | 
32 |     uint32_t initialize();
33 |     void reset();
34 | 
35 |     // Total footprint of the input weights (makes initialize code cleaner)
36 |     uint32_t input_weight_footprint() {
37 |       return this->input_size * GRU_GATES * this->hidden_size * sizeof(T);
38 |     }
39 |     
40 |     // Excludes intermediaries, used for data copying
41 |     uint32_t hidden_weight_footprint() {
42 |       return this->hidden_size * (GRU_GATES - 1) * this->hidden_size * sizeof(T);
43 |     }
44 | 
45 |     // This function may need to be modified in order to avoid bank conflicts
46 |     uint32_t bias_weight_footprint() {
47 |       return this->hidden_size * (GRU_GATES - 1) * sizeof(T);
48 |     }
49 | 
50 |     uint32_t hidden_weight_r_footprint() {
51 |       return this->hidden_size * this->hidden_size * sizeof(T);
52 |     }
53 | 
54 |     uint32_t bias_weight_r_footprint() {
55 |       return this->hidden_size * sizeof(T);
56 |     }
57 | 
58 |     T * get_packed_hidden_weights_r_gpu() {
59 |       return this->packed_hidden_weights_r_gpu;
60 |     }
61 | 
62 |     T * get_packed_biases_r_gpu() {
63 |       return this->packed_biases_r_gpu;
64 |     }
65 | };
66 | 
67 | template<typename T>
68 | class GRUModelSingle : public RNNBase<GRULayerSingle, T> {
69 | 
70 |   private:
71 |     T * gpu_r;
72 |     T * gpu_weights_hidden_r;
73 |     T * gpu_biases_r;
74 | 
75 |     void * paramsGRU[11];
76 |     int num_partials;
77 |     
78 |   public:
79 |     GRUModelSingle(std::initializer_list< GRULayerSingle<T> > l) :
80 |       RNNBase<GRULayerSingle, T>(l) {}
81 |     
82 |     void set_configuration(int x, int y, int g, int t);
83 | 
84 |     uint32_t initialize();
85 |     void reset();
86 | 
87 |     float run_input(T* input, uint32_t * length);
88 | };
89 | 
90 | #endif
91 | 


--------------------------------------------------------------------------------
/src/misc.h:
--------------------------------------------------------------------------------
  1 | #ifndef MISC_H
  2 | #define MISC_H
  3 | 
  4 | #include <vector>
  5 | #include <iostream>
  6 | #include <stdio.h>
  7 | #include <cassert>
  8 | #include <fstream>
  9 | 
 10 | #define LINE_SIZE 1
 11 | 
 12 | #define CUDA_ERR {                                                                          \
 13 |   cudaError_t err;                                                                          \
 14 |   if ((err = cudaGetLastError()) != cudaSuccess) {                                         \
 15 |     printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__);  \
 16 |     exit(1);                                                                                \
 17 |   }                                                                                         \
 18 | }
 19 | 
 20 | #define MAX_SMEM 98304
 21 | 
 22 | template<typename T>
 23 | void create_dummy_weights_lstm(std::vector<T *> &weights, uint32_t input, uint32_t hidden) {
 24 |   // DUMMY INPUT WEIGHTS
 25 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 26 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 27 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 28 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 29 |   
 30 |   // DUMMY HIDDEN WEIGHTS
 31 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 32 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 33 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 34 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 35 | 
 36 |   // DUMMY BIASES
 37 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 38 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 39 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 40 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 41 | 
 42 |   uint32_t i, j;
 43 |   for (i = 0; i < 4; i++) {
 44 |     for (j = 0; j < input * hidden; j++) {
 45 |     weights.at(i)[j] = 1 / 1024.;
 46 |     }
 47 |   }
 48 | 
 49 |   for (i = 4; i < 8; i++) {
 50 |     for (j = 0; j < hidden * hidden; j++) {
 51 |     weights.at(i)[j] = 1 / 1024.;
 52 |     }
 53 |   }
 54 |   
 55 |   for (i = 8; i < 12; i++) {
 56 |     for (j = 0; j < hidden; j++) {
 57 |     weights.at(i)[j] = 0.5;
 58 |     }
 59 |   }
 60 | }
 61 | 
 62 | template<typename T>
 63 | void create_dummy_weights_gru(std::vector<T *> &weights, uint32_t input, uint32_t hidden) {
 64 |   // DUMMY INPUT WEIGHTS
 65 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 66 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 67 |   weights.push_back((T *)malloc(sizeof(T) * input * hidden));
 68 |   
 69 |   // DUMMY HIDDEN WEIGHTS
 70 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 71 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 72 |   weights.push_back((T *)malloc(sizeof(T) * hidden * hidden));
 73 | 
 74 |   // DUMMY BIASES
 75 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 76 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 77 |   weights.push_back((T *)malloc(sizeof(T) * hidden));
 78 | 
 79 |   uint32_t i, j;
 80 |   for (i = 0; i < 3; i++) {
 81 |     for (j = 0; j < input * hidden; j++) {
 82 |     weights.at(i)[j] = 1./256.;
 83 |     }
 84 |   }
 85 | 
 86 |   for (i = 3; i < 6; i++) {
 87 |     for (j = 0; j < hidden * hidden; j++) {
 88 |     weights.at(i)[j] = 1./256.;
 89 |     }
 90 |   }
 91 |   
 92 |   for (i = 7; i < 9; i++) {
 93 |     for (j = 0; j < hidden; j++) {
 94 |     weights.at(i)[j] = 0.5;
 95 |     }
 96 |   }
 97 | }
 98 | 
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/src/RNNBase.h:
--------------------------------------------------------------------------------
  1 | #ifndef RNNBASE_H
  2 | #define RNNBASE_H
  3 | 
  4 | #include "misc.h"
  5 | 
  6 | #include <vector>
  7 | #include <initializer_list>
  8 | #include <iostream>
  9 | #include <cuda_runtime_api.h>
 10 | 
 11 | template<typename T>
 12 | class RNNLayerBase {
 13 |   
 14 |   protected:
 15 |     //LAYER TOPOLOGY
 16 |     uint32_t hidden_size;
 17 |     uint32_t input_size;
 18 |     uint32_t batch_size;
 19 |     uint32_t block_width;
 20 | 
 21 |     // UNMODIFIED HOST WEIGHTS (SHOULD BE POINTERS ON HEAP)
 22 |     std::vector<T*> host_weights;
 23 | 
 24 |     // WEIGHTS PACKED INTO SUITABLE CONFIGURATION FOR SHARED MEM
 25 |     T * packed_input_weights;
 26 |     T * packed_input_weights_gpu;
 27 |     T * packed_hidden_weights;
 28 |     T * packed_hidden_weights_gpu;
 29 |     T * packed_biases;
 30 |     T * packed_biases_gpu;
 31 |     
 32 |   public:
 33 |     RNNLayerBase(uint32_t i_s, uint32_t h_s, uint32_t b_s, std::vector<T*> l) :
 34 |       input_size(i_s),
 35 |       hidden_size(h_s),
 36 |       batch_size(b_s),
 37 |       host_weights(l) {
 38 |     }
 39 | 
 40 |     // PACKS WEIGHTS FOR SHARED MEMORY TRANSFER
 41 |     virtual uint32_t initialize() =0;
 42 |     virtual void reset() =0;
 43 |     
 44 |     virtual uint32_t input_weight_footprint() =0;
 45 |     virtual uint32_t hidden_weight_footprint() =0;
 46 |     virtual uint32_t bias_weight_footprint() = 0;
 47 | 
 48 |     // SETTERS
 49 |     void set_block_width(uint32_t width) { block_width = width; }
 50 |     
 51 |     // GETTERS (CHANGING SIZE OF LAYER NOT SUPPORTED)
 52 |     uint32_t get_hidden_size() { return hidden_size; }
 53 |     uint32_t get_input_size() { return input_size; }
 54 |     uint32_t get_batch_size() { return batch_size; }
 55 |     T * get_packed_input_weights_gpu() { return packed_input_weights_gpu; }
 56 |     T * get_packed_hidden_weights_gpu() { return packed_hidden_weights_gpu; }
 57 |     T * get_packed_biases_gpu() { return packed_biases_gpu; }
 58 | 
 59 | };
 60 | 
 61 | template<template<typename> typename L, typename T>
 62 | class RNNBase {
 63 | 
 64 |   protected:
 65 |     // Vector of layers
 66 |     std::vector< L<T> > layers;
 67 | 
 68 |     // Topology
 69 |     uint32_t initial_input_size;
 70 |     uint32_t batch_size;
 71 |     uint32_t output_size;
 72 |     uint32_t tile_width;
 73 |     uint32_t tile_height;
 74 |     uint32_t num_groups;
 75 |     uint32_t group_threads;
 76 |     uint32_t mm_m;
 77 |     uint32_t mm_n;
 78 |     uint32_t mm_k;
 79 |     
 80 |     // Data
 81 |     T * gpu_inputs;
 82 |     T * gpu_hidden_initializer;
 83 |     T * gpu_weights_input;
 84 |     T * gpu_weights_hidden;
 85 |     T * gpu_biases;
 86 |     T * gpu_precompute;
 87 |     T * gpu_output;
 88 |     int * gpu_syncIn;
 89 |     int * gpu_syncOut;
 90 |     T * host_output;
 91 | 
 92 |     // Kernel Parameters
 93 |     void* paramsMM[6];
 94 | 
 95 |   public:
 96 |     RNNBase(std::initializer_list< L<T> > l) : layers(l) {
 97 |       this->initial_input_size = layers.front().get_input_size();
 98 |       this->batch_size = layers.front().get_batch_size();
 99 |       this->output_size = layers.back().get_hidden_size();
100 |     }
101 | 
102 |     virtual uint32_t initialize() =0;
103 |     virtual void reset() =0;
104 |     
105 |     // Transfers input to the GPU, runs kernel, fetches output
106 |     virtual float run_input(T * input, uint32_t * length) =0;
107 |   
108 |     // Configure tiling parameters
109 |     virtual void set_configuration(int x, int y, int g, int t) =0;
110 |     
111 |     // GETTERS
112 |     uint32_t get_initial_input_size() { return initial_input_size; }
113 |     uint32_t get_batch_size() { return batch_size; }
114 |     uint32_t get_output_size() { return output_size; }
115 |     
116 | };
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/performance_model/heuristic.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | from enum import Enum
  4 | from math import ceil
  5 | from math import floor
  6 | from math import sqrt
  7 | from math import log2
  8 | import argparse
  9 | import itertools
 10 | import numpy as np
 11 | 
 12 | # Hardware parameters
 13 | sms = 80
 14 | max_regs_thread = 224 # Leave 32 for indexing/pointers
 15 | max_regs_sm = 65536 
 16 | max_threads = 1024 # Per tb
 17 | fma_cost = 4
 18 | l2_bus = 64 # Bytes / cycle
 19 | l2_lat = 50 # Normalized against the FMA cost
 20 | l1_lat = 5 # Normalized against the FMA cost
 21 | warp_size = 32
 22 | 
 23 | class ModelType(Enum):
 24 |   LSTM = 0
 25 |   GRU = 1
 26 | 
 27 | model_gates = {ModelType.LSTM : 4, ModelType.GRU : 2}
 28 | model_scale = {ModelType.LSTM : 1, ModelType.GRU : 1.5} ## Weights stored per work group
 29 | string_model = {'LSTM' : ModelType.LSTM, 'GRU' : ModelType.GRU}
 30 | model_string = {ModelType.LSTM : 'LSTM', ModelType.GRU : 'GRU'}
 31 | 
 32 | class ModelConfig:
 33 |   
 34 |   def __init__(self, mt, hs, bs, tw, th, nwg, rw, sy=None):
 35 |     self.model_type = mt
 36 |     self.hidden_size = hs
 37 |     self.batch_size = bs
 38 |     self.tile_width = tw
 39 |     self.tile_height = th
 40 |     self.reduction_width = rw
 41 |     self.num_work_groups = nwg
 42 |     self.sub_tile_width = ceil(self.tile_width * model_gates[self.model_type] / self.num_work_groups)
 43 |     self.num_threads = self.reduction_width * self.num_work_groups
 44 |     self.num_SMs = ceil(self.hidden_size / self.tile_width) * ceil(self.batch_size / self.tile_height)
 45 |     if self.model_type is ModelType.GRU:
 46 |       self.sync = sy
 47 |       if self.sync is 1:
 48 |         self.weights_per_thread = self.sub_tile_width * self.hidden_size / self.reduction_width + self.tile_width * ceil(self.hidden_size / self.num_threads)
 49 |       else:
 50 |         self.weights_per_thread = self.sub_tile_width * self.hidden_size / self.reduction_width * 1.5
 51 |     else:
 52 |       self.weights_per_thread = self.sub_tile_width * self.hidden_size / self.reduction_width
 53 |     self.cost = self.fitness()
 54 | 
 55 |   def __str__(self):
 56 |     rep = "Model Config:\n"
 57 |     rep += "\tModel Info:\n"
 58 |     rep += "\t\tModel Type: " + str(self.model_type) +"\n"
 59 |     rep += "\t\tHidden Size: " + str(self.hidden_size) +"\n"
 60 |     rep += "\t\tBatch Size: " + str(self.batch_size) +"\n"
 61 |     rep += "\tConfiguration Parameters:\n"
 62 |     rep += "\t\tTile Width: " + str(self.tile_width) +"\n"
 63 |     rep += "\t\tTile Height: " + str(self.tile_height) +"\n"
 64 |     rep += "\t\tReduction Width: " + str(self.reduction_width) +"\n"
 65 |     rep += "\t\tNum Work Groups: " + str(self.num_work_groups) +"\n"
 66 |     rep += "\tOccupancy Metrics:\n"
 67 |     rep += "\t\tNumber of SMs: " + str(self.num_SMs) +"\n"
 68 |     rep += "\t\tWeights Per SM: " + str(self.tile_width * self.hidden_size * model_gates[self.model_type] * model_scale[self.model_type]) +"\n"
 69 |     rep += "\t\tSub Tile Width: " + str(self.sub_tile_width) +"\n"
 70 |     rep += "\t\tWeights Per Threads: " + str(self.weights_per_thread) +"\n"
 71 |     rep += "\tFitness: " + str(self.cost) + "\n"
 72 |     return rep
 73 | 
 74 |   def is_valid(self):
 75 |     if self.num_SMs > 80:
 76 |       return False
 77 |     elif self.weights_per_thread > max_regs_thread:
 78 |       return False
 79 |     elif (self.weights_per_thread + 32) * self.num_threads > max_regs_sm:
 80 |       return False
 81 |     elif self.sub_tile_width * self.tile_height > self.reduction_width:
 82 |       return False
 83 |     elif self.num_threads > max_threads:
 84 |       return False
 85 |     elif (model_gates[self.model_type] * self.tile_width % self.num_work_groups) is not 0:
 86 |       return False
 87 |     else:
 88 |       return True
 89 |   
 90 |   def fma_heuristic(self):
 91 |     sequential_length = self.hidden_size / self.reduction_width
 92 |     self.partition_occupancy = ceil(self.num_threads / 32 / 4)
 93 |     if self.partition_occupancy * self.sub_tile_width <= 8:
 94 |       return 1.6 ** log2(self.partition_occupancy) * 1.33 ** log2(self.sub_tile_width) * self.tile_height * sequential_length
 95 |     else:
 96 |       return 4.7 * (self.partition_occupancy * self.sub_tile_width / 8) * self.tile_height * sequential_length
 97 | 
 98 |   def lstm_fitness(self):
 99 |     sm_bandwidth = self.hidden_size * self.tile_height * 4
100 |     warp_occupancy = ceil(self.num_threads / 128)
101 | 
102 |     self.mem_cost = round(sm_bandwidth * (1 + floor(self.num_SMs / (sms / 2))) / (fma_cost * l2_bus), 2)
103 |     self.sync_cost = 0 
104 |     if warp_occupancy * self.sub_tile_width * self.tile_height < 12: ## Non-throughput limited 
105 |       if self.reduction_width <= 16:
106 |         self.reduction_cost = (log2(self.reduction_width) + 1) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height)
107 |       else:
108 |         self.reduction_cost = log2(self.reduction_width) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height)
109 |     else: ## Throughput limited
110 |       if self.reduction_width <= 16:
111 |         self.reduction_cost = (log2(self.reduction_width) + 1) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12)
112 |       else:
113 |         self.reduction_cost = log2(self.reduction_width) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12)
114 |     self.mul_cost = round(self.fma_heuristic(), 2)
115 |     
116 |     return self.mem_cost + self.sync_cost + self.reduction_cost + self.mul_cost
117 | 
118 |   def gru_fitness_two(self):
119 |     sm_bandwidth = self.hidden_size * self.tile_height * 4 * 2
120 |     warp_occupancy = ceil(self.num_threads / 128)
121 | 
122 |     self.mem_cost = round(sm_bandwidth * (1 + floor(self.num_SMs / (sms / 2))) / (fma_cost * l2_bus), 2)
123 |     self.sync_cost = l2_lat * 2 * 2 # ceil(self.num_SMs / 32) * 2
124 |     if warp_occupancy * self.sub_tile_width * self.tile_height < 12: ## Non-throughput limited 
125 |       if self.reduction_width <= 16:
126 |         self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height)
127 |       else:
128 |         self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height)
129 |     else: ## Throughput limited
130 |       if self.reduction_width <= 16:
131 |         self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12)
132 |       else:
133 |         self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12)
134 |     self.mul_cost = round(self.fma_heuristic() * 1.5, 2)
135 |     return self.mem_cost + self.sync_cost + self.reduction_cost + self.mul_cost
136 | 
137 |   def gru_fitness_one(self):
138 |     sm_bandwidth = self.hidden_size * self.tile_height * 4 + self.hidden_size * (self.hidden_size / self.tile_width) * self.tile_height * 4
139 |     warp_occupancy = ceil(self.num_threads / 128)
140 | 
141 |     self.mem_cost = round(sm_bandwidth * (1 + floor(self.num_SMs / (sms / 2))) / (fma_cost * l2_bus), 2)
142 |     self.sync_cost = 0
143 |     if warp_occupancy * self.sub_tile_width * self.tile_height < 12: ## Non-throughput limited 
144 |       if self.reduction_width <= 16:
145 |         self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height)
146 |       else:
147 |         self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** (warp_occupancy * self.sub_tile_width * self.tile_height)
148 |     else: ## Throughput limited
149 |       if self.reduction_width <= 16:
150 |         self.reduction_cost = 2 * (log2(self.reduction_width) + 1) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12)
151 |       else:
152 |         self.reduction_cost = 2 * log2(self.reduction_width) * 7 * 1.03 ** 8 * (warp_occupancy * self.sub_tile_width * self.tile_height / 12)
153 |     self.mul_cost = round(self.fma_heuristic() + self.tile_width + self.hidden_size / self.tile_width, 2)
154 |     return self.mem_cost + self.sync_cost + self.reduction_cost + self.mul_cost
155 |     
156 |   def fitness(self):
157 |     if self.model_type is ModelType.LSTM:
158 |       return self.lstm_fitness()
159 |     elif self.model_type is ModelType.GRU:
160 |       if self.sync is 2:
161 |         return self.gru_fitness_two()
162 |       elif self.sync is 1:
163 |         return self.gru_fitness_one()
164 | 
165 |   def to_csv(self):
166 |     rep = str(self.tile_width) + ","
167 |     rep += str(self.tile_height) + ","
168 |     rep += str(self.num_work_groups) + "," 
169 |     rep += str(self.reduction_width) + ","
170 |     if self.model_type is ModelType.GRU:
171 |       rep += str(self.sync) + ","
172 |     rep += str(self.mem_cost) + ","
173 |     rep += str(self.sync_cost) + ","
174 |     rep += str(self.reduction_cost) + ","
175 |     rep += str(self.mul_cost) + ","
176 |     rep += str(self.cost) + ","
177 |     rep += str(self.partition_occupancy * self.sub_tile_width < 16) + "\n"
178 |     return rep
179 | 
180 | def string_to_model(string):
181 |   if (string in string_model):
182 |     return string_model[string]
183 |   else:
184 |     msg = string + " is not a valid model type"
185 |     raise argparse.ArgumentTypeError(msg)
186 | 
187 | 
188 | def main(model, input_size, hidden_size, batch_size, k):
189 | 
190 |   tile_widths = range(1, 65)
191 |   tile_heights = range(1, batch_size + 1)
192 |   
193 |   tile_configurations = list(itertools.product(tile_widths, tile_heights))
194 |   reduction_widths = [2 ** i for i in range(6)]
195 | 
196 |   configs = list()
197 |   # Build dictionary of configurations
198 |   for x, y in tile_configurations:
199 |     if batch_size % y is 0:
200 |       num_gate_elements = x * model_gates[model]
201 | 
202 |       for i in range(1, num_gate_elements + 1):
203 |         if num_gate_elements % i is 0:
204 |           for r in reduction_widths:
205 |             if model is ModelType.LSTM:
206 |               configs.append(ModelConfig(model, hidden_size, batch_size, x, y, i, r))
207 |             else:
208 |               configs.append(ModelConfig(model, hidden_size, batch_size, x, y, i, r, sy=1))
209 |               configs.append(ModelConfig(model, hidden_size, batch_size, x, y, i, r, sy=2))
210 |   
211 |   # Prune
212 |   configs = [x for x in configs if x.is_valid()]
213 |   # Evaluate
214 |   configs.sort(key=lambda config: config.cost)
215 |   
216 |   # Save chosen configurations
217 |   with open("configs_" + str(hidden_size) + "_" + str(batch_size) + "_" + model_string[model] + ".csv", mode='w') as f:
218 |     if k is -1:
219 |       for entry in configs:
220 |         f.write(entry.to_csv())
221 |     else:
222 |       for entry in configs[:k]:
223 |         f.write(entry.to_csv())
224 | 
225 | if __name__ == '__main__':
226 |   parser = argparse.ArgumentParser(description='Use heuristic based analysis of \
227 |                                                 an RNN layer to determine a near optimal \
228 |                                                 configuration for instantiation')
229 |   parser.add_argument('-m', '--model_type', default='LSTM', type=string_to_model, required=False,
230 |                       help='The type of RNN layer for analysis')
231 |   parser.add_argument('-i', '--input_size', default=256, type=int, required=False,
232 |                       help='Length of input vector to layer')
233 |   parser.add_argument('-s', '--hidden_size', default=256, type=int, required=False,
234 |                       help='Length of hidden size/output of layer')
235 |   parser.add_argument('-b', '--batch_size', default=1, type=int, required=False,
236 |                       help='Size of batch to be computed simultaneously')
237 |   parser.add_argument('-k', '--top_k', default=-1, type=int, required=False,
238 |                       help='How many candidate configurations to return')
239 |   args = parser.parse_args()
240 |   main(args.model_type, args.input_size, args.hidden_size, args.batch_size, args.top_k)
241 | 


--------------------------------------------------------------------------------
/src/LSTM.cu:
--------------------------------------------------------------------------------
  1 | #include "LSTM.h"
  2 | #include "misc.h"
  3 | 
  4 | #include <cstring>
  5 | #include <iostream>
  6 | #include <cooperative_groups.h>
  7 | #include <math.h>
  8 | #include <cassert>
  9 | #include <limits>
 10 | 
 11 | using namespace cooperative_groups;
 12 | 
 13 | __device__ __forceinline__ float sigmoidf(float x) {
 14 |   return 1.0f / (1.0f + expf(-1.0f * x));
 15 | }
 16 | 
 17 | #define MM_BLOCK_SIZE 16
 18 | #define MM_REG_TILE 4
 19 | #define MM_TILE_SIZE 64
 20 | 
 21 | // This is a mostly optimized kernel for matrix multiplication
 22 | // The kernel uses a two tiered tiling mechanism that first tiles large
 23 | // tiles from global memory to shared memory. This shared memory tile is
 24 | // then used as the source to stream data into register arrays that perform
 25 | // a calculation on a 4x4 tile.
 26 | 
 27 | __global__ void matmul(float * A, float * B, float * C,
 28 |                        uint32_t M, uint32_t K, uint32_t N) {
 29 |   
 30 |   extern __shared__ float base[];
 31 |   float* bufferA = base;
 32 |   float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];
 33 | 
 34 |   float regA[MM_REG_TILE];
 35 |   float regB[MM_REG_TILE];
 36 |   float regC[MM_REG_TILE][MM_REG_TILE];
 37 |   
 38 |   uint32_t tidx = threadIdx.x;
 39 |   uint32_t tidy = threadIdx.y;
 40 |   uint32_t id = threadIdx.y * blockDim.x + threadIdx.x;
 41 |   uint32_t bidx = blockIdx.x;
 42 |   uint32_t bidy = blockIdx.y;
 43 | 
 44 |   // Number of rows that are traversed in a single fully coalesced load sequence
 45 |   constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
 46 |   constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;
 47 |   
 48 |   // Zero the intermediate output
 49 |   for (uint32_t y = 0; y < MM_REG_TILE; y++) {
 50 |     for (uint32_t x = 0; x < MM_REG_TILE; x++) {
 51 |       regC[y][x] = 0.0f;
 52 |     }
 53 |   }
 54 | 
 55 |   for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {
 56 |     
 57 |     // Load lhs tile from global memory to shared memory (fully coalesced)
 58 |     #pragma unroll
 59 |     for (uint32_t j = 0; j < LOAD_STEPS; j++) {
 60 |       uint32_t index = j * NUM_THREADS + id;
 61 |       if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) {
 62 |         bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
 63 |       } else {
 64 |         bufferA[index] = 0.0f;
 65 |       }
 66 |     }
 67 |     
 68 |     // Not necessary for correctness, but improves performance by avoiding thrashing shared memory
 69 |     __syncthreads();
 70 | 
 71 |     // Load rhs tile from global memory to shared memory (fully coalesced)
 72 |     #pragma unroll
 73 |     for (uint32_t j = 0; j < LOAD_STEPS; j++) {
 74 |       uint32_t index = j * NUM_THREADS + id;
 75 |       if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
 76 |         bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
 77 |       } else {
 78 |         bufferB[index] = 0.0f;
 79 |       }
 80 |     }
 81 | 
 82 |     // Ensures all data is written from global memory to shared memory before it is streamed
 83 |     // into register arrays.
 84 |     __syncthreads();
 85 |     
 86 |     
 87 |       
 88 |     // Loop through full tile
 89 |     for (uint32_t j  = 0; j < MM_TILE_SIZE; j++) {
 90 |       
 91 |       // Load vector from lhs and rhs
 92 |       #pragma unroll
 93 |       for (uint32_t l = 0; l < MM_REG_TILE; l++) {
 94 |         regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
 95 |         regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
 96 |       }
 97 |       
 98 |       #pragma unroll
 99 |       // Perform a narrow matmul
100 |       for (uint32_t y = 0; y < MM_REG_TILE; y++) {
101 |         for (uint32_t x = 0; x < MM_REG_TILE; x++) {
102 |           regC[y][x] += regA[y] * regB[x];
103 |         }
104 |       }
105 |     }
106 | 
107 |     __syncthreads();
108 |   }
109 |  
110 |   // Write register intermediates to shared memory (possibly unnecessary)
111 |   for (uint32_t y = 0; y < MM_REG_TILE; y++) {
112 |     for (uint32_t x = 0; x < MM_REG_TILE; x++) {
113 |       bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
114 |     }
115 |   }
116 | 
117 |   __syncthreads();
118 | 
119 |   
120 |   for (uint32_t j = 0; j < LOAD_STEPS; j++) {
121 |     uint32_t index = j * NUM_THREADS + id;
122 |     if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
123 |       C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE +  (index % MM_TILE_SIZE)] = bufferA[index];
124 |     }
125 |   }
126 | }
127 | 
128 | 
129 | // This kernel assumes the input multiplications were precomputed in a large matrix-matrix multiplication
130 | template<int HIDDEN_SIZE, int TILE_WIDTH, int TILE_HEIGHT, int NUM_GROUPS, int GROUP_THREADS, int BATCH_SIZE>
131 | __global__ void lstm_rnn( const float* precomputed_inputs,
132 |                           const float* hidden_initializer,
133 |                           const float* weights, 
134 |                           const float* biases, 
135 |                           float* output,
136 |                           volatile int* syncIn,
137 |                           volatile int* syncOut,
138 |                           uint32_t length) {
139 |   
140 |   // Indexing helpers
141 |   int tid = threadIdx.x;
142 |   int bidx = blockIdx.x;
143 |   int bidy = blockIdx.y;
144 |   int wg_id = tid / GROUP_THREADS;
145 |   // LENGTH - How many weights for each gate output does a single thread need to store
146 |   constexpr int LENGTH = (HIDDEN_SIZE + GROUP_THREADS - 1) / GROUP_THREADS;
147 |   // BUFFER_SIZE - Number of elements to reserve in shared memory for each output. Effectively 
148 |   // rounds up HIDDEN_SIZE to multiple of GROUP_THREADS
149 |   constexpr int BUFFER_SIZE = LENGTH * GROUP_THREADS;
150 |   // OUTPUT_TILE_WIDTH - How many full elements are produced by the threadblock. At scheduling time,
151 |   // must ensure that launched configuration produces full elements within a single threadblock
152 |   constexpr int OUTPUT_TILE_WIDTH = NUM_GROUPS * TILE_WIDTH / LSTM_GATES;
153 |   
154 |   // Static shared memory allocation
155 |   __shared__ float hidden_tile[TILE_HEIGHT][BUFFER_SIZE];
156 |   __shared__ float cell_state[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
157 |   __shared__ float forget_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
158 |   __shared__ float input_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
159 |   __shared__ float cand_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
160 |   __shared__ float out_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
161 |   
162 |   // Weights in the register file
163 |   float weights_reg[TILE_WIDTH][LENGTH];
164 |   float outputs_reg[TILE_HEIGHT][TILE_WIDTH];
165 |   float bias = 0.0f;
166 |   float precompute = 0.0f;
167 | 
168 |   // Cooperative group helpers
169 |   thread_block bl = this_thread_block();
170 |   thread_block_tile<GROUP_THREADS> work_group = tiled_partition<GROUP_THREADS>(bl);
171 | 
172 |   // Tile width is the number of gate outputs produce by a single warp
173 |   for (int i = 0; i < TILE_WIDTH; i++) {
174 |     // Global gate id for fetching weights. 
175 |     // bidx * TILE_WIDTH * NUM_GROUPS -> first gate index processed by the threadblock
176 |     // (tid / GROUP_THREADS) * TILE_WIDTH -> first gate index within processed by a warp within the threadblock
177 |     // i -> current gate index within the warp's assigned gates
178 |     int gate_id = bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + i;
179 |     // HIDDEN_SIZE * LSTM_GATES -> number of total gates that need to be computed
180 |     if (gate_id < HIDDEN_SIZE * LSTM_GATES) {
181 |       for (int j = 0; j < LENGTH; j++) {
182 |         // Better to fully populate and check weight bounds once at loading than during each computation.
183 |         if (j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) {
184 |           weights_reg[i][j] = weights[gate_id * HIDDEN_SIZE + j * GROUP_THREADS + work_group.thread_rank()];
185 |         } else {
186 |           weights_reg[i][j] = 0.0f;
187 |         }
188 |       }
189 |     }
190 |   }
191 | 
192 |   // Assigns correct bias value to specific output. Prunes to only ensure that values are fetched that are necessary for later
193 |   // for later computation
194 |   if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
195 |     if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH) < HIDDEN_SIZE * LSTM_GATES) {
196 |       bias = biases[bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH];
197 |     } else {
198 |       bias = 0;
199 |     }
200 |   }
201 | 
202 |   // Zero initialize the cell state
203 |   if (tid < TILE_HEIGHT * OUTPUT_TILE_WIDTH) {
204 |     cell_state[tid / OUTPUT_TILE_WIDTH][tid % OUTPUT_TILE_WIDTH] = 0.0f;
205 |   }
206 |   
207 |   // Initialize hidden state buffer according to input / zero out rest of buffer
208 |   for (int j = 0; j < TILE_HEIGHT; j++) {
209 |     for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) {
210 |       if (i + tid < HIDDEN_SIZE) {
211 |         hidden_tile[j][i + tid] = hidden_initializer[(bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid];
212 |       } else if (i + tid < BUFFER_SIZE) {
213 |         hidden_tile[j][i + tid] = 0.0f;
214 |       }
215 |     }
216 |   }
217 |   __syncthreads();
218 |   
219 |   // Zero dot product accumulators
220 |   #pragma unroll
221 |   for (int j = 0; j < TILE_HEIGHT; j++) {
222 |     #pragma unroll
223 |     for (int i = 0; i < TILE_WIDTH; i++) {
224 |       outputs_reg[j][i] = 0.0f;
225 |     }
226 |   }
227 |   
228 |   // Load first time independent values
229 |   if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH < HIDDEN_SIZE *  LSTM_GATES)
230 |        && work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
231 |     precompute = precomputed_inputs[bidy * TILE_HEIGHT * HIDDEN_SIZE * LSTM_GATES +
232 |                                     bidx * TILE_WIDTH * NUM_GROUPS +
233 |                                     wg_id * TILE_WIDTH +
234 |                                     work_group.thread_rank() % TILE_WIDTH + 
235 |                                     (work_group.thread_rank() / TILE_WIDTH) * HIDDEN_SIZE * LSTM_GATES];
236 | 
237 |   }
238 |   
239 |   // Loop for each iteration of the sequence length
240 |   for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) {
241 |     
242 |     // Dot products
243 |     #pragma unroll
244 |     for (int k = 0; k < LENGTH; k++) {
245 |       #pragma unroll
246 |       for (int j = 0; j < TILE_HEIGHT; j++) {
247 |         float val = hidden_tile[j][k * GROUP_THREADS + work_group.thread_rank()];
248 |         #pragma unroll
249 |         for (int i = 0; i < TILE_WIDTH; i++) {
250 |           outputs_reg[j][i] += weights_reg[i][k] * val;
251 |         }
252 |       }
253 |     }
254 |     
255 |     // Reductions
256 |     #pragma unroll
257 |     for (int k = 1; k < GROUP_THREADS; k *= 2) {
258 |       #pragma unroll
259 |       for (int j = 0; j < TILE_HEIGHT; j++) {
260 |         #pragma unroll
261 |         for (int i = 0; i < TILE_WIDTH; i++) {
262 |           outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k);
263 |         }
264 |       }
265 |     }
266 |     
267 |     // Remap work and compute activations
268 |     if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
269 |       int reg_y = work_group.thread_rank() / TILE_WIDTH;
270 |       int reg_x = work_group.thread_rank() % TILE_WIDTH;
271 |       float val = outputs_reg[reg_y][reg_x] + bias + precompute;
272 | 
273 |       int gate_id = (wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH) % LSTM_GATES;
274 | 
275 |       if (gate_id != 2) {
276 |         val = sigmoidf(val);
277 |       } else {
278 |         val = tanhf(val);
279 |       }
280 |       int out_id = (wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH) / LSTM_GATES;
281 |       if (gate_id == 0) {
282 |         forget_gate[reg_y][out_id] = val;
283 |       } else if (gate_id == 1) {
284 |         input_gate[reg_y][out_id] = val;
285 |       } else if (gate_id == 2) {
286 |         cand_gate[reg_y][out_id] = val;
287 |       } else {
288 |         out_gate[reg_y][out_id] = val;
289 |       }
290 |     }
291 | 
292 |     // Synchronization enforces all intermediates are calculated before the data is shared across threads
293 |     // for the elementwise operations.
294 |     __syncthreads();
295 |     
296 |     int x = tid  % OUTPUT_TILE_WIDTH; 
297 |     int y = tid  / OUTPUT_TILE_WIDTH;
298 |     
299 |     // Elementwise operations
300 |     if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT  &&
301 |         (bidx * OUTPUT_TILE_WIDTH + x) < HIDDEN_SIZE &&
302 |         (bidy * TILE_HEIGHT + y) < BATCH_SIZE) { 
303 |       // Calculates the new cell state
304 |       float cell_reg = cell_state[y][x] * forget_gate[y][x] + input_gate[y][x] * cand_gate[y][x];
305 |       // Calculates the new output
306 |       float out_reg = tanhf(cell_reg) * out_gate[y][x];
307 | 
308 |       // No synchronization necessary between the read and writes of cell state because it is guaranteed that only the
309 |       // same thread will read/write to the element.
310 |       cell_state[y][x] = cell_reg;
311 | 
312 |       // Broadcast output to global memory
313 |       output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + bidx * OUTPUT_TILE_WIDTH + x] = out_reg;
314 |     }
315 |     
316 |     // Escape recurrent loop when full sequence has been processed
317 |     if (sequence_iteration + 1 == length) break;
318 |     
319 | 
320 |     
321 |     // Synchronize between recurrent iterations - signal stage
322 |     if (tid == 0 ) {
323 |       syncIn[(bidy * gridDim.x + bidx)] = sequence_iteration + 1;
324 |     }
325 |     __threadfence();
326 |     
327 |     // Zero the dot product accumulators
328 |     #pragma unroll
329 |     for (int j = 0; j < TILE_HEIGHT; j++) {
330 |       #pragma unroll
331 |       for (int i = 0; i < TILE_WIDTH; i++) {
332 |         outputs_reg[j][i] = 0.0f;
333 |       }
334 |     }
335 |     
336 |     // Read precomputed value from memory (Since this is a read-only operation that does ot
337 |     // use a shared intermediate, this can go before the memory barrier without correctness issues
338 |     // Ideally, this will hide some latency, but needs profiling
339 |     if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank() % TILE_WIDTH < HIDDEN_SIZE *  LSTM_GATES)
340 |          && work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
341 |       precompute = precomputed_inputs[bidy * TILE_HEIGHT * HIDDEN_SIZE * LSTM_GATES +
342 |                                       bidx * TILE_WIDTH * NUM_GROUPS +
343 |                                       wg_id * TILE_WIDTH +
344 |                                       work_group.thread_rank() % TILE_WIDTH + 
345 |                                       (work_group.thread_rank() / TILE_WIDTH) * HIDDEN_SIZE * LSTM_GATES +
346 |                                       (sequence_iteration + 1) * BATCH_SIZE * HIDDEN_SIZE * LSTM_GATES];
347 | 
348 |     }
349 |     
350 |     // Synchronize between recurrent iterations - spin stage
351 |     if (bidx == 0) {
352 |       if (tid < gridDim.x) {
353 |         while (syncIn[(bidy * gridDim.x + tid)]  != sequence_iteration + 1) {
354 |         }
355 |       }
356 | 
357 |       __syncthreads();
358 | 
359 |       if (tid == 0) {
360 |         syncOut[bidy] = sequence_iteration + 1;
361 |       }
362 |     } else {
363 |       if (tid == 0) {
364 |         while (syncOut[bidy] != sequence_iteration + 1) {
365 |         }
366 |       }
367 |       __syncthreads();
368 |     }
369 |     
370 |     // Load the hidden state into the input buffer in shared memory (coalesced)
371 |     // Tile height * REDUCTION_WIDTH * SEQUENTIAL_LENGTH is equivalent to the tile height by hidden size
372 |     // Reduction_width * TILE_WIDTH * LSTM_GATES is the number of threads launched (allows for loop unrolling)
373 |     #pragma unroll
374 |     for (int i = 0; i < TILE_HEIGHT; i++) {
375 |       if (i + bidy * TILE_HEIGHT < BATCH_SIZE) {
376 |         #pragma unroll
377 |         for (int j = 0; j < HIDDEN_SIZE; j += NUM_GROUPS * GROUP_THREADS) {
378 |           if (j + tid < HIDDEN_SIZE) {
379 |             hidden_tile[i][j+tid] = output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + i) * HIDDEN_SIZE + j + tid];
380 |           } else if (j + tid < BUFFER_SIZE) {
381 |             hidden_tile[i][j+tid] = 0.0f;
382 |           }
383 |         }
384 |       }
385 |     }
386 | 
387 |     // Enforce loading of data to shared memory before computation
388 |     __syncthreads();
389 | 
390 |   }
391 | }
392 | 
393 | template<typename T>
394 | void process_input_weights(T * output, std::vector<T*> weights, uint32_t input_size, uint32_t hidden_size) { 
395 |   
396 |   // Outside loop is the input size
397 |   for (uint32_t j = 0; j < input_size; j++) {
398 |     // Width of the input weight matrix
399 |     for (uint32_t k = 0; k < hidden_size; k++) {
400 |       // Colocate the weights for each element
401 |       for (uint32_t i = 0; i < LSTM_GATES; i++) {
402 |         output[(j * hidden_size + k) * LSTM_GATES + i] = weights.at(i)[j * hidden_size + k];
403 |       }
404 |     }
405 |   }
406 | }
407 | 
408 | template<typename T>
409 | void process_hidden_weights(T * output, std::vector<T*> weights, uint32_t hidden_size) {
410 |   
411 |   // For each output element
412 |   for (uint32_t j = 0; j < hidden_size; j++) {
413 |     // For each gate
414 |     for (uint32_t k = 0; k < LSTM_GATES; k++) {
415 |       // For each element for that gate
416 |       for (uint32_t i = 0; i < hidden_size; i++) {
417 |         output[j * LSTM_GATES * hidden_size + k * hidden_size + i] = weights.at(4 + k)[i * hidden_size + j];
418 |       }
419 |     }
420 |   }
421 | }
422 | 
423 | template<typename T>
424 | void process_biases(T * output, std::vector<T*> weights, uint32_t hidden_size) {
425 | 
426 |   // For each output element
427 |   for (uint32_t k = 0; k < hidden_size; k++) {
428 |     // Colocate the biases for each element
429 |     for (uint32_t i = 0; i < LSTM_GATES; i++) {
430 |       output[k * LSTM_GATES + i] = weights.at(i + 8)[k];
431 |     }
432 |   }
433 | }
434 | 
435 | // Initialize all layer weights and send to GPU
436 | template<typename T>
437 | uint32_t LSTMLayer<T>::initialize() {
438 |   
439 |   uint32_t input_footprint = input_weight_footprint();
440 |   uint32_t hidden_footprint = hidden_weight_footprint();
441 |   uint32_t bias_footprint = bias_weight_footprint();
442 |   
443 |   // Weight buffer allocations
444 |   cudaHostAlloc((void **) &(this->packed_input_weights), input_footprint, cudaHostAllocDefault); CUDA_ERR;
445 |   cudaHostAlloc((void **) &(this->packed_hidden_weights), hidden_footprint, cudaHostAllocDefault); CUDA_ERR;
446 |   cudaHostAlloc((void **) &(this->packed_biases), bias_footprint, cudaHostAllocDefault); CUDA_ERR;
447 |   cudaMalloc((void **) &(this->packed_input_weights_gpu), input_footprint); CUDA_ERR;
448 |   cudaMalloc((void **) &(this->packed_hidden_weights_gpu), hidden_footprint); CUDA_ERR;
449 |   cudaMalloc((void **) &(this->packed_biases_gpu), bias_footprint); CUDA_ERR;
450 |   
451 |   // Reorganize weights (typically a transpose)
452 |   process_input_weights(this->packed_input_weights, this->host_weights, this->input_size, this->hidden_size);
453 |   process_hidden_weights(this->packed_hidden_weights, this->host_weights, this->hidden_size);
454 |   process_biases(this->packed_biases, this->host_weights, this->hidden_size);
455 |   
456 |   // Send weights to GPU
457 |   cudaMemcpy(this->packed_input_weights_gpu, this->packed_input_weights, input_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
458 |   cudaMemcpy(this->packed_hidden_weights_gpu, this->packed_hidden_weights, hidden_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
459 |   cudaMemcpy(this->packed_biases_gpu, this->packed_biases, bias_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
460 | 
461 |   return 0;
462 | 
463 | }
464 | 
465 | // Free all allocated buffers. Only needed for full sweep benchmarking.
466 | template <typename T>
467 | void LSTMLayer<T>::reset() {
468 |   cudaFreeHost((void *) this->packed_input_weights);
469 |   cudaFreeHost((void *) this->packed_hidden_weights);
470 |   cudaFreeHost((void *) this->packed_biases);
471 |   cudaFree((void *) this->packed_input_weights_gpu);
472 |   cudaFree((void *) this->packed_hidden_weights_gpu);
473 |   cudaFree((void *) this->packed_biases_gpu);
474 | }
475 | 
476 | // Allocate input/output buffers for the layer. Currently set up for only single layer models, but can be extended to multi-layer 
477 | // without dramatic refactoring
478 | template <typename T>
479 | uint32_t LSTMModel<T>::initialize() {
480 |   
481 |   for (auto& l: this->layers) {
482 |     uint32_t debug = l.initialize();
483 |     if (debug != 0) {
484 |       std::cout << "FAILURE\n";
485 |       return debug;
486 |     }
487 |   }
488 | 
489 |   this->gpu_weights_input = this->layers[0].get_packed_input_weights_gpu();
490 |   this->gpu_weights_hidden = this->layers[0].get_packed_hidden_weights_gpu();
491 |   this->gpu_biases = this->layers[0].get_packed_biases_gpu();
492 |   this->mm_k = this->initial_input_size;
493 |   this->mm_n = this->output_size * LSTM_GATES;
494 | 
495 |   // Output allocations, assumes sequence length less than 100
496 |   cudaHostAlloc((void **) &(this->host_output), this->output_size * this->batch_size * sizeof(T) * 100, cudaHostAllocDefault);
497 |   cudaMalloc((void **) &(this->gpu_output), this->output_size * this->batch_size * sizeof(T) * 100);
498 |   
499 |   // Input allocations, assumes sequence length less than 100
500 |   cudaMalloc((void **) &(this->gpu_inputs), this->initial_input_size * this->batch_size * 100 * sizeof(T));
501 |   cudaMalloc((void **) &(this->gpu_precompute), this->output_size * this->batch_size * LSTM_GATES * 100 * sizeof(T));
502 | 
503 |   // Initialize hidden state, for our purposes we use 0's
504 |   cudaMalloc((void **) &(this->gpu_hidden_initializer), this->output_size * this->batch_size * sizeof(T));
505 |   cudaMemset((void *)this->gpu_hidden_initializer, 0, this->output_size * this->batch_size * sizeof(T));
506 |   
507 |   // Synchronization buffers. Always allocated to full dimensionality so that they may be easily reused from run to run
508 |   cudaMalloc((void **) &(this->gpu_syncIn), 80 * sizeof(int) * LINE_SIZE);
509 |   cudaMalloc((void **) &(this->gpu_syncOut), 80 * sizeof(int) * LINE_SIZE);
510 |  
511 |   // GEMM Kernel parameters
512 |   this->paramsMM[0] = (void*) &(this->gpu_inputs);
513 |   this->paramsMM[1] = (void*) &(this->gpu_weights_input);
514 |   this->paramsMM[2] = (void*) &(this->gpu_precompute);
515 |   this->paramsMM[4] = (void*) &(this->mm_k);
516 |   this->paramsMM[5] = (void*) &(this->mm_n);
517 | 
518 |   // LSTM Kernel parameters
519 |   this->paramsLSTM[0] = (void*) &(this->gpu_precompute);
520 |   this->paramsLSTM[1] = (void*) &(this->gpu_hidden_initializer);
521 |   this->paramsLSTM[2] = (void*) &(this->gpu_weights_hidden);
522 |   this->paramsLSTM[3] = (void*) &(this->gpu_biases);
523 |   this->paramsLSTM[4] = (void*) &(this->gpu_output);
524 |   this->paramsLSTM[5] = (void*) &(this->gpu_syncIn);
525 |   this->paramsLSTM[6] = (void*) &(this->gpu_syncOut);
526 | 
527 |   return 0;
528 | }
529 | 
530 | // Frees model buffers
531 | template <typename T>
532 | void LSTMModel<T>::reset() {
533 | 
534 |   for (auto& l: this->layers) {
535 |     l.reset();
536 |   }
537 | 
538 |   cudaFreeHost((void *) this->host_output);
539 |   cudaFree((void *) this->gpu_output);
540 | 
541 |   cudaFree((void *) this->gpu_inputs);
542 |   cudaFree((void *) this->gpu_precompute);
543 | }
544 | 
545 | // Defines tiling configuration (should be encapsulated elsewhere in the future)
546 | template <typename T>
547 | void LSTMModel<T>::set_configuration(int x, int y, int g, int t) {
548 |   this->tile_width = x;
549 |   this->tile_height = y;
550 |   this->num_groups = g;
551 |   this->group_threads = t;
552 | }
553 | 
554 | // Processes input sequence (both independent and dependent)
555 | template <typename T>
556 | float LSTMModel<T>::run_input(T* input, uint32_t * length) {
557 |   
558 |   // Define remaining kernel parameters (primarily dependent on sequence length)
559 |   this->mm_m = this->batch_size * *length;
560 |   this->paramsMM[3] = (void *) &(this->mm_m);
561 |   this->paramsLSTM[7] = (void *) length;
562 |   
563 |   // GEMM Kernel dimensioning
564 |   dim3 mm_grid = dim3((this->mm_n + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (this->mm_m + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
565 |   dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);
566 |   size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);
567 |   
568 |   // LSTM Kernel dimensioning
569 |   int effective_w = (this->num_groups * this->tile_width) / LSTM_GATES;
570 |   dim3 lstm_rnn_grid = dim3((this->output_size + effective_w - 1) / effective_w, (this->batch_size + this->tile_height - 1) / this->tile_height);
571 |   dim3 lstm_rnn_block = dim3(this->num_groups * this->group_threads);  
572 |   unsigned block_size = lstm_rnn_block.x;
573 |   unsigned grid_size = lstm_rnn_grid.x * lstm_rnn_grid.y;
574 | 
575 |   // Kernel instantiation (currently configured for manual application of parameters)
576 |   void * kernel = (void*)lstm_rnn<256, 2, 4, 64, 8, 40>;
577 |   int numBlocks = 0;
578 |   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, block_size, 0);
579 |    
580 |   // Check occupancy prior to launch to prevent program hangs
581 |   if (numBlocks == 0 || grid_size > 80) {
582 |     printf("numBlocks: %2d grid_size: %3d, block_size: %3d\n", numBlocks, grid_size, block_size);
583 |     return -std::numeric_limits<float>::infinity();
584 |   }
585 | 
586 |   cudaEvent_t start, end;
587 |   float elapsed;
588 |   
589 |   cudaMemcpy(this->gpu_inputs, input, this->initial_input_size * this->batch_size * *length * sizeof(T), cudaMemcpyHostToDevice);
590 |   
591 |   // Timing info
592 |   cudaEventCreate(&start);
593 |   cudaEventCreate(&end);
594 |   cudaEventRecord(start);
595 |   
596 |   // Kernel launches
597 |   cudaLaunchKernel((void *)matmul, mm_grid, mm_block, this->paramsMM, mm_sm_requirement);
598 |   cudaLaunchKernel(kernel, lstm_rnn_grid, lstm_rnn_block, this->paramsLSTM);
599 |   
600 |   cudaEventRecord(end);
601 |   cudaEventSynchronize(end);
602 |   cudaEventElapsedTime(&elapsed, start, end);
603 |    
604 |   cudaMemcpy(this->host_output, this->gpu_output, this->output_size * this->batch_size * sizeof(T), cudaMemcpyDeviceToHost); 
605 |   
606 | #ifdef DEBUG
607 |   // Value checking
608 |   for (int i = 0; i < this->batch_size; i++) {
609 |     printf("Sequence %2d\n", i);
610 |     for (int j = 0; j < this->output_size; j++) {
611 |       printf("%f ", this->host_output[i * this->output_size + j]);
612 |     }
613 |     printf("\n");
614 |   }
615 |   printf("\n");
616 | #endif
617 |   
618 |   // Check for runtime errors
619 |   cudaError_t err;
620 |   cudaDeviceSynchronize();
621 |   if ((err = cudaGetLastError()) != cudaSuccess) {
622 |     printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__);
623 |     return std::numeric_limits<float>::infinity();
624 |   }
625 |   
626 |   return elapsed;
627 | }
628 | 
629 | // Explicit template instantiations
630 | template void process_input_weights<float>(float *, std::vector<float *>, uint32_t, uint32_t);
631 | template void process_hidden_weights<float>(float *, std::vector<float *>, uint32_t);
632 | template void process_biases<float>(float *, std::vector<float *>, uint32_t);
633 | template uint32_t LSTMLayer<float>::initialize();
634 | template uint32_t LSTMModel<float>::initialize();
635 | template void LSTMModel<float>::set_configuration(int, int, int, int);
636 | template float LSTMModel<float>::run_input(float *, uint32_t *);
637 | template void LSTMModel<float>::reset();
638 | template void LSTMLayer<float>::reset();
639 | 


--------------------------------------------------------------------------------
/src/GRU_single.cu:
--------------------------------------------------------------------------------
  1 | #include "GRU_single.h"
  2 | #include "misc.h"
  3 | 
  4 | #include <cstring>
  5 | #include <iostream>
  6 | #include <cooperative_groups.h>
  7 | #include <math.h>
  8 | #include <cassert>
  9 | #include <limits>
 10 | 
 11 | using namespace cooperative_groups;
 12 | 
 13 | __device__ __forceinline__ float sigmoidf(float x) {
 14 |   return 1.0f / (1.0f + expf(-1.0f * x));
 15 | }
 16 | 
 17 | #define MM_BLOCK_SIZE 16
 18 | #define MM_REG_TILE 4
 19 | #define MM_TILE_SIZE 64
 20 | 
 21 | // This is a mostly optimized kernel for matrix multiplication
 22 | // The kernel uses a two tiered tiling mechanism that first tiles large
 23 | // tiles from global memory to shared memory. This shared memory tile is
 24 | // then used as the source to stream data into register arrays that perform
 25 | // a calculation on a 8x8 tile.
 26 | 
 27 | __global__ void matmul(float * A, float * B, float * C,
 28 |                        uint32_t M, uint32_t K, uint32_t N) {
 29 |   
 30 |   extern __shared__ float base[];
 31 |   float* bufferA = base;
 32 |   float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];
 33 | 
 34 |   float regA[MM_REG_TILE];
 35 |   float regB[MM_REG_TILE];
 36 |   float regC[MM_REG_TILE][MM_REG_TILE];
 37 |   
 38 |   uint32_t tidx = threadIdx.x;
 39 |   uint32_t tidy = threadIdx.y;
 40 |   uint32_t id = threadIdx.y * blockDim.x + threadIdx.x;
 41 |   uint32_t bidx = blockIdx.x;
 42 |   uint32_t bidy = blockIdx.y;
 43 | 
 44 |   // Number of rows that are traversed in a single fully coalesced load sequence
 45 |   constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
 46 |   constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;
 47 |   
 48 |   // Zero the intermediate output
 49 |   for (uint32_t y = 0; y < MM_REG_TILE; y++) {
 50 |     for (uint32_t x = 0; x < MM_REG_TILE; x++) {
 51 |       regC[y][x] = 0.0f;
 52 |     }
 53 |   }
 54 | 
 55 |   for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {
 56 |     
 57 |     // Load lhs tile from global memory to shared memory (fully coalesced)
 58 |     #pragma unroll
 59 |     for (uint32_t j = 0; j < LOAD_STEPS; j++) {
 60 |       uint32_t index = j * NUM_THREADS + id;
 61 |       if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) {
 62 |         bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
 63 |       } else {
 64 |         bufferA[index] = 0.0f;
 65 |       }
 66 |     }
 67 |     
 68 |     // Not necessary for correctness, but improves performance by avoiding thrashing shared memory
 69 |     __syncthreads();
 70 | 
 71 |     // Load rhs tile from global memory to shared memory (fully coalesced)
 72 |     #pragma unroll
 73 |     for (uint32_t j = 0; j < LOAD_STEPS; j++) {
 74 |       uint32_t index = j * NUM_THREADS + id;
 75 |       if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
 76 |         bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
 77 |       } else {
 78 |         bufferB[index] = 0.0f;
 79 |       }
 80 |     }
 81 | 
 82 |     // Ensures all data is written from global memory to shared memory before it is streamed
 83 |     // into register arrays.
 84 |     __syncthreads();
 85 |       
 86 |     // Loop through full tile
 87 |     for (uint32_t j  = 0; j < MM_TILE_SIZE; j++) {
 88 |       
 89 |       // Load vector from lhs and rhs
 90 |       #pragma unroll
 91 |       for (uint32_t l = 0; l < MM_REG_TILE; l++) {
 92 |         regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
 93 |         regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
 94 |       }
 95 |       
 96 |       #pragma unroll
 97 |       // Perform a narrow matmul
 98 |       for (uint32_t y = 0; y < MM_REG_TILE; y++) {
 99 |         for (uint32_t x = 0; x < MM_REG_TILE; x++) {
100 |           regC[y][x] += regA[y] * regB[x];
101 |         }
102 |       }
103 |     }
104 | 
105 |     __syncthreads();
106 |   }
107 |  
108 |   // Write register intermediates to shared memory (possibly unnecessary)
109 |   for (uint32_t y = 0; y < MM_REG_TILE; y++) {
110 |     for (uint32_t x = 0; x < MM_REG_TILE; x++) {
111 |       bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
112 |     }
113 |   }
114 | 
115 |   __syncthreads();
116 |   
117 |   for (uint32_t j = 0; j < LOAD_STEPS; j++) {
118 |     uint32_t index = j * NUM_THREADS + id;
119 |     if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
120 |       C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE +  (index % MM_TILE_SIZE)] = bufferA[index];
121 |     }
122 |   }
123 | }
124 | 
125 | template<int HIDDEN_SIZE, int TILE_WIDTH, int TILE_HEIGHT, int NUM_GROUPS, int GROUP_THREADS, int BATCH_SIZE>
126 | __global__ void gru_rnn(const float* precomputed_inputs,
127 |                         const float* hidden_initializer,
128 |                         const float* weights_r,
129 |                         const float* weights_zh,
130 |                         const float* biases_r,
131 |                         const float* biases_zh,
132 |                         float* r_buf,
133 |                         float* output,
134 |                         volatile int* syncIn,
135 |                         volatile int* syncOut,
136 |                         uint32_t length) {
137 |   // Indexing Helpers
138 |   int tid = threadIdx.x;
139 |   int bidx = blockIdx.x;
140 |   int bidy = blockIdx.y;
141 |   // Number of weights stored per tile width
142 |   constexpr int LENGTH = (HIDDEN_SIZE + GROUP_THREADS - 1) / GROUP_THREADS;
143 |   // Number of elements to reserve in shared memory for each output. Effectively 
144 |   // rounds up HIDDEN_SIZE to multiple of GROUP_THREADS
145 |   constexpr int BUFFER_SIZE = LENGTH * GROUP_THREADS;
146 |   // Number of elements horizontally produced by a single threadblock
147 |   constexpr int OUTPUT_TILE_WIDTH = TILE_WIDTH * NUM_GROUPS / (GRU_GATES - 1);
148 |   // Number of threads in a launched block
149 |   constexpr int NUM_THREADS = NUM_GROUPS * GROUP_THREADS;
150 |   // Number of outputs per tile row a single thread must compute for the partial sums of r values
151 |   constexpr int ELEMS_PER_THREAD = (HIDDEN_SIZE + NUM_THREADS - 1) / NUM_THREADS;
152 |   // Number of partial sums produced by the kernel for each input in the batch for the r gate
153 |   constexpr int NUM_PARTIALS = (HIDDEN_SIZE + OUTPUT_TILE_WIDTH - 1) / OUTPUT_TILE_WIDTH;
154 |   
155 |   // Determines whether a group is the h gate or the z gate
156 |   int g_type = 2 * tid / (NUM_THREADS);
157 |   int wg_id = (tid % (NUM_THREADS / 2)) / GROUP_THREADS;
158 | 
159 |   // Shared memory workspaces
160 |   __shared__ float h_tile[TILE_HEIGHT][BUFFER_SIZE];
161 |   __shared__ float r_tile[TILE_HEIGHT][BUFFER_SIZE];
162 |   __shared__ float z_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
163 |   __shared__ float h_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
164 |   
165 |   // Tiled weights for z or h gates
166 |   float weights_reg[TILE_WIDTH][LENGTH];
167 |   // Weight for the r gate
168 |   float weights_reg_r[OUTPUT_TILE_WIDTH][ELEMS_PER_THREAD];
169 |   float outputs_reg[TILE_HEIGHT][TILE_WIDTH];
170 |   float bias = 0.f;
171 |   float bias_r[ELEMS_PER_THREAD];
172 |   float precompute = 0.f;
173 |   const float* precomputed_offset = precomputed_inputs;
174 |   const float* precomputed_offset_r = precomputed_inputs + bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES;
175 |   
176 |   // Work group declaration
177 |   thread_block bl = this_thread_block();
178 |   thread_block_tile<GROUP_THREADS> work_group = tiled_partition<GROUP_THREADS>(bl);
179 |   
180 |   // Stream appropriate weights for element_id and gate_id into the register file
181 |   for (int i = 0; i < TILE_WIDTH; i++) {
182 |     int group_id = bidx * OUTPUT_TILE_WIDTH + wg_id * TILE_WIDTH + i;
183 | 
184 |     if (group_id < HIDDEN_SIZE){
185 |       for (int j = 0; j < LENGTH; j++) {
186 |         if ( j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) {
187 |           weights_reg[i][j] = weights_zh[(group_id * (GRU_GATES - 1) + g_type) * HIDDEN_SIZE + j * GROUP_THREADS + work_group.thread_rank()];
188 |         } else {
189 |           weights_reg[i][j] = 0.f;
190 |         }
191 |       }
192 |     }
193 |   }
194 |   
195 |   // Load biases and define time independent offsets
196 |   if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
197 |     int group_id = (bidx * OUTPUT_TILE_WIDTH + wg_id * TILE_WIDTH + work_group.thread_rank()) % TILE_WIDTH;
198 |     int gate_id = (bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + work_group.thread_rank()) % TILE_WIDTH;
199 |     if (group_id < HIDDEN_SIZE) {
200 |       bias = biases_zh[group_id * (GRU_GATES - 1) + g_type];
201 | 
202 |       precomputed_offset += bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES;
203 |       precomputed_offset += group_id * GRU_GATES; 
204 |       precomputed_offset += g_type + 1;
205 |       precomputed_offset += (work_group.thread_rank() / TILE_WIDTH) * HIDDEN_SIZE;
206 |     } else {
207 |       bias = 0.f;
208 |     }
209 |   }
210 | 
211 |   // Stream weights for the r gate into the register file
212 |   for (int j = 0; j < OUTPUT_TILE_WIDTH; j++) {
213 |     for (int i = 0; i < ELEMS_PER_THREAD; i++) {
214 |       if ( i * NUM_THREADS + tid < HIDDEN_SIZE) {
215 |         weights_reg_r[j][i] = weights_r[bidx * OUTPUT_TILE_WIDTH * HIDDEN_SIZE + j * HIDDEN_SIZE + i * NUM_THREADS + tid];
216 |       } else {
217 |         weights_reg_r[j][i] = 0.f;
218 |       }
219 |     }
220 |   }
221 |   
222 |   // Stream biases for the r_gate into the register file
223 |   for (int i = 0; i < ELEMS_PER_THREAD; i++) {
224 |     if ( i * NUM_THREADS + tid < HIDDEN_SIZE) {
225 |       bias_r[i] = biases_r[i * NUM_THREADS + tid];
226 |     } else {
227 |       bias_r[i] = 0.f;
228 |     }
229 |   }
230 |   
231 |   // For the first iteration, load initial hidden state into the hidden tile.
232 |   // This doesn't need to be repeated because once the recurrent pattern is established
233 |   // the loop will populate the hidden_tile as the necessary outputs are produced.
234 |   if ( tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT ) {
235 |     int x = tid % OUTPUT_TILE_WIDTH;
236 |     int y = tid / OUTPUT_TILE_WIDTH;
237 |     h_tile[y][x] = hidden_initializer[(bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + bidx * OUTPUT_TILE_WIDTH + x];
238 |   }
239 | 
240 |   // Hidden state initialization
241 |   #pragma unroll
242 |   for (int j = 0; j < TILE_HEIGHT; j++) {
243 |     #pragma unroll
244 |     for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) {
245 |       if ( i + tid < HIDDEN_SIZE) {
246 |         h_tile[j][i + tid] = hidden_initializer[(bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid];
247 |       } else if (i + tid < BUFFER_SIZE) {
248 |         h_tile[j][i + tid] = 0.f;
249 |       }
250 |     }
251 |   }
252 | 
253 |   __syncthreads();
254 |   
255 |   // Main recurrent loop
256 |   for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) {
257 | 
258 |     // Produce partial dot products for the r gate
259 |     for (int k = 0; k < TILE_HEIGHT; k++) {
260 |       float r_dot_products[ELEMS_PER_THREAD];
261 |       
262 |       //  Zero initialize partials
263 |       for (int i = 0; i < ELEMS_PER_THREAD; i++) {
264 |         r_dot_products[i] = 0.f;
265 |       }
266 |       
267 |       // Process hidden_tile elements, getting maximum reuse
268 |       for (int j = 0; j < OUTPUT_TILE_WIDTH; j++) {
269 |         float rhs = h_tile[k][bidx * OUTPUT_TILE_WIDTH + j];
270 |         for (int i = 0; i < ELEMS_PER_THREAD; i++) {
271 |           r_dot_products[i] += weights_reg_r[j][i] * rhs;
272 |         }
273 |       }
274 |       
275 |       // Write to the global buffer
276 |       for (int i = 0; i < ELEMS_PER_THREAD; i++) {
277 |         if ( i * NUM_THREADS + tid < HIDDEN_SIZE) {
278 |           r_buf[bidy * TILE_HEIGHT * NUM_PARTIALS * HIDDEN_SIZE +
279 |                 k * NUM_PARTIALS * HIDDEN_SIZE + 
280 |                 bidx * HIDDEN_SIZE + 
281 |                 i * NUM_THREADS + tid] = r_dot_products[i];
282 |         }
283 |       }
284 |     }
285 |     
286 |     // Synchronize between recurrent iterations - signal stage
287 |     if (tid == 0) {
288 |       syncIn[bidy * gridDim.x + bidx] = sequence_iteration + 1;
289 |     }
290 |     
291 |     // Clear the output buffer
292 |     for (int j = 0; j < TILE_HEIGHT; j++) {
293 |       for (int i = 0; i < TILE_WIDTH; i++) {
294 |         outputs_reg[j][i] = 0.f;
295 |       }
296 |     }
297 |     
298 |     // Populate time independent r value
299 |     float precompute_r[TILE_HEIGHT][ELEMS_PER_THREAD];
300 |     for (int j = 0; j < TILE_HEIGHT; j++) {
301 |       for (int i = 0; i < ELEMS_PER_THREAD; i++) {
302 |         if ( i * NUM_THREADS + tid < HIDDEN_SIZE) {
303 |           precompute_r[j][i] = precomputed_offset_r[j * HIDDEN_SIZE * GRU_GATES + i * NUM_THREADS + tid];
304 |         }
305 |       }
306 |     }
307 |     precomputed_offset_r += HIDDEN_SIZE * BATCH_SIZE * GRU_GATES;
308 |     
309 |     // Populate the other time indepedent gate inputs
310 |     if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
311 |       precompute = *precomputed_offset;
312 |       precomputed_offset += BATCH_SIZE * HIDDEN_SIZE * GRU_GATES;
313 |     }
314 |     
315 |     // Synchronize between recurrent iterations - spin stage
316 |     __threadfence();
317 | 
318 |     if (bidx == 0) {
319 |       if (tid < gridDim.x) {
320 |         while ( syncIn[bidy * gridDim.x + tid] != sequence_iteration + 1) {
321 |         }
322 |       }
323 | 
324 |       __syncthreads();
325 | 
326 |       if (tid == 0) {
327 |         syncOut[bidy] = sequence_iteration + 1;
328 |       }
329 |     } else {
330 |       if (tid == 0) {
331 |         while (syncOut[bidy] != sequence_iteration + 1) {
332 |         }
333 |       }
334 |       __syncthreads();
335 |     }
336 |     
337 |     // Load r gate partial dot products
338 |     float r[TILE_HEIGHT][ELEMS_PER_THREAD][NUM_PARTIALS];
339 |     for (int k = 0; k < TILE_HEIGHT; k++) {
340 |       for (int i = 0; i < NUM_PARTIALS; i++) {
341 |         for (int j = 0; j < ELEMS_PER_THREAD; j++) {
342 |           if (j * NUM_THREADS + tid < HIDDEN_SIZE) {
343 |               r[k][j][i] = r_buf[bidy * TILE_HEIGHT * NUM_PARTIALS * HIDDEN_SIZE +
344 |                                  k * NUM_PARTIALS * HIDDEN_SIZE +
345 |                                  j * NUM_THREADS + tid +
346 |                                  i * HIDDEN_SIZE];
347 |           }
348 |         }
349 |       }
350 |     }
351 |     
352 |     // Load h_t-1 into shared memory
353 |     if (sequence_iteration != 0) {
354 |       for (int j = 0; j < TILE_HEIGHT; j++) {
355 |         for (int i = 0; i < HIDDEN_SIZE; i+= NUM_THREADS) {
356 |           if (i + tid < HIDDEN_SIZE) {
357 |             h_tile[j][i + tid] = output[(sequence_iteration - 1) * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid];
358 |           }
359 |         }
360 |       }
361 |     }
362 |     
363 |     __syncthreads(); 
364 |     
365 |     // Redundant calculate of r gate calculations (dot product, time independent, activation) and broadcast to shared memory 
366 |     for (int k = 0; k < TILE_HEIGHT; k++) {
367 |       for (int j = 0; j < ELEMS_PER_THREAD; j++) {
368 |         if (j * NUM_THREADS + tid < HIDDEN_SIZE) {
369 |           float r_val = 0.f;
370 |           for (int i = 0; i < NUM_PARTIALS; i++) {
371 |             r_val += r[k][j][i];
372 |           }
373 |           r_val += bias_r[j];
374 |           r_val += precompute_r[k][j];
375 |           r_val = sigmoidf(r_val);
376 |           r_val = r_val * h_tile[k][j * NUM_THREADS + tid];
377 |           r_tile[k][j * NUM_THREADS + tid] = r_val;
378 |         }
379 |       }
380 |     }
381 |     
382 |     __syncthreads();
383 |      
384 |     // R gate computation finished, so gates z and h_cand now perform tiled matrix multiplication
385 |     // Note separate codepaths because compiler would otherwise introduce divergence
386 |     if (g_type == 0) {
387 |       for (int k = 0; k < LENGTH; k++) {
388 |         for (int j = 0; j < TILE_HEIGHT; j++) {
389 |           float val = r_tile[j][k * GROUP_THREADS + work_group.thread_rank()];
390 |           for (int i = 0; i < TILE_WIDTH; i++) {
391 |             outputs_reg[j][i] += weights_reg[i][k] * val;
392 |           }
393 |         }
394 |       }
395 |     } else {
396 |       for (int k = 0; k < LENGTH; k++) {
397 |         for (int j = 0; j < TILE_HEIGHT; j++) {
398 |           float val = h_tile[j][k * GROUP_THREADS + work_group.thread_rank()];
399 |           for (int i = 0; i < TILE_WIDTH; i++) {
400 |             outputs_reg[j][i] += weights_reg[i][k] * val;
401 |           }
402 |         }
403 |       }
404 |     }
405 | 
406 |     // Reduction
407 |     for (int j = 0; j < TILE_HEIGHT; j++) {
408 |       for (int i = 0; i < TILE_WIDTH; i++) {
409 |         for (int k = 1; k < GROUP_THREADS; k *= 2) {
410 |           outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k);
411 |         }
412 |       }
413 |     }
414 |     
415 |     // Gate activations
416 |     if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
417 |       int x = work_group.thread_rank() % TILE_WIDTH;
418 |       int y = work_group.thread_rank() / TILE_WIDTH;
419 | 
420 |       float val = outputs_reg[y][x] + precompute + bias;
421 |       
422 |       if (g_type == 0) {
423 |         val = sigmoidf(val);
424 |         z_gate[y][wg_id * TILE_WIDTH + x] = val;
425 |       } else {
426 |         val = tanh(val);
427 |         h_gate[y][wg_id * TILE_WIDTH + x] = val;
428 |       }
429 |     }
430 | 
431 |     __syncthreads();
432 |     
433 |     // Broadcast outputs
434 |     if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) {
435 |       int x = tid % OUTPUT_TILE_WIDTH;
436 |       int y = tid / OUTPUT_TILE_WIDTH;
437 |       if (bidx * OUTPUT_TILE_WIDTH + x < HIDDEN_SIZE) {
438 |         float z_val = z_gate[y][x];
439 |         float h_val = h_gate[y][x];
440 |         float h_old_val = h_tile[y][bidx * OUTPUT_TILE_WIDTH + x];
441 | 
442 |         float out_val = (1 - z_val) * h_val + z_val * h_old_val;
443 |         h_tile[y][bidx * OUTPUT_TILE_WIDTH + x] = out_val;
444 |         output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + bidx * OUTPUT_TILE_WIDTH + x] = out_val;
445 |       }
446 |     }
447 |     
448 |     __syncthreads();
449 | 
450 |   }
451 | }
452 | 
453 | template<typename T>
454 | void process_input_weights(T * output, std::vector<T*> weights, uint32_t input_size, uint32_t hidden_size) { 
455 |   
456 |   // Outside loop is the input size
457 |   for (uint32_t j = 0; j < input_size; j++) {
458 |     // Width of the input weight matrix
459 |     for (uint32_t k = 0; k < hidden_size; k++) {
460 |       // Colocate the weights for each element
461 |       for (uint32_t i = 0; i < GRU_GATES; i++) {
462 |         output[j * hidden_size * GRU_GATES + k * GRU_GATES + i] = weights.at(i)[j * hidden_size + k];
463 |       }
464 |     }
465 |   }
466 | }
467 | 
468 | template<typename T>
469 | void process_hidden_weights(T * output, std::vector<T*> weights, uint32_t hidden_size) {
470 |   
471 |   // For each output element
472 |   for (uint32_t j = 0; j < hidden_size; j++) {
473 |     // For gates z and h
474 |     for (uint32_t k = 0; k < GRU_GATES - 1; k++) {
475 |       // For each element for that gate
476 |       for (uint32_t i = 0; i < hidden_size; i++) {
477 |         // Indices 4 and 5 correspond to the z and h weights
478 |         output[j * (GRU_GATES - 1) * hidden_size + k * hidden_size + i] = weights.at(4 + k)[i * hidden_size + j];
479 |       }
480 |     }
481 |   }
482 | }
483 | 
484 | template<typename T>
485 | void process_biases(T * output, std::vector<T*> weights, uint32_t hidden_size) {
486 |   int err = 0;
487 |   // For each output element
488 |   for (uint32_t k = 0; k < hidden_size; k++) {
489 |     // Colocate the biases for each element
490 |     for (uint32_t i = 0; i < GRU_GATES - 1; i++) {
491 |       output[k * (GRU_GATES - 1) + i] = weights.at(i + 7)[k];
492 |       if (weights.at(i + 7)[k] != 0.5) err++;
493 |     }
494 |   }
495 | }
496 | 
497 | // Free buffers (all tiling dimension dependent)
498 | template <typename T>
499 | void GRULayerSingle<T>::reset() {
500 |   cudaFreeHost((void *) this->packed_input_weights);
501 |   cudaFreeHost((void *) this->packed_hidden_weights);
502 |   cudaFreeHost((void *) this->packed_biases);
503 |   cudaFree((void *) this->packed_hidden_weights_r_gpu);
504 |   cudaFree((void *) this->packed_biases_r_gpu);
505 |   cudaFree((void *) this->packed_input_weights_gpu);
506 |   cudaFree((void *) this->packed_hidden_weights_gpu);
507 |   cudaFree((void *) this->packed_biases_gpu);
508 | }
509 | 
510 | // Initialize and fill trained parameter buffers
511 | template<typename T>
512 | uint32_t GRULayerSingle<T>::initialize() {
513 |   
514 |   uint32_t input_footprint = input_weight_footprint();
515 |   uint32_t hidden_footprint = hidden_weight_footprint();
516 |   uint32_t hidden_r_footprint = hidden_weight_r_footprint();
517 |   uint32_t bias_footprint = bias_weight_footprint();
518 |   uint32_t bias_r_footprint = bias_weight_r_footprint();
519 |   
520 |   // Allocate buffers
521 |   cudaHostAlloc((void **) &(this->packed_input_weights), input_footprint, cudaHostAllocDefault); CUDA_ERR;
522 |   cudaHostAlloc((void **) &(this->packed_hidden_weights), hidden_footprint, cudaHostAllocDefault); CUDA_ERR;
523 |   cudaHostAlloc((void **) &(this->packed_biases), bias_footprint, cudaHostAllocDefault); CUDA_ERR;
524 |   cudaMalloc((void **) &(this->packed_input_weights_gpu), input_footprint); CUDA_ERR;
525 |   cudaMalloc((void **) &(this->packed_hidden_weights_gpu), hidden_footprint); CUDA_ERR;
526 |   cudaMalloc((void **) &(this->packed_biases_gpu), bias_footprint); CUDA_ERR;
527 |   cudaMalloc((void **) &(this->packed_hidden_weights_r_gpu), hidden_r_footprint); CUDA_ERR;
528 |   cudaMalloc((void **) &(this->packed_biases_r_gpu), bias_r_footprint); CUDA_ERR;
529 |   
530 |   // Reorganize weights
531 |   process_input_weights(this->packed_input_weights, this->host_weights, this->input_size, this->hidden_size);
532 |   process_hidden_weights(this->packed_hidden_weights, this->host_weights, this->hidden_size);
533 |   process_biases(this->packed_biases, this->host_weights, this->hidden_size);
534 | 
535 |   // Send to GPU
536 |   cudaMemcpy(this->packed_input_weights_gpu, this->packed_input_weights, input_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
537 |   cudaMemcpy(this->packed_hidden_weights_gpu, this->packed_hidden_weights, hidden_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
538 |   cudaMemcpy(this->packed_biases_gpu, this->packed_biases, bias_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
539 |   cudaMemcpy(this->packed_hidden_weights_r_gpu, this->host_weights.at(WEIGHTS_HIDDEN_R), hidden_r_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
540 |   cudaMemcpy(this->packed_biases_r_gpu, this->host_weights.at(BIAS_R), bias_r_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
541 | 
542 |   return 0;
543 | }
544 | 
545 | // Reset model parameters
546 | template <typename T>
547 | void GRUModelSingle<T>::reset() {
548 | 
549 |   for (auto& l: this->layers) {
550 |     l.reset();
551 |   }
552 | 
553 |   cudaFreeHost((void *) this->host_output);
554 |   cudaFree((void *) this->gpu_output);
555 |   
556 |   cudaFree((void *) this->gpu_r);
557 |   cudaFree((void *) this->gpu_inputs);
558 |   cudaFree((void *) this->gpu_precompute);
559 |   cudaFree((void *) this->gpu_syncIn);
560 |   cudaFree((void *) this->gpu_syncOut);
561 | }
562 | 
563 | // Initialize model buffers
564 | template <typename T>
565 | uint32_t GRUModelSingle<T>::initialize() {
566 |   
567 |   for (auto& l: this->layers) {
568 |     uint32_t debug = l.initialize();
569 |     if (debug != 0) {
570 |       std::cout << "FAILURE\n";
571 |       return debug;
572 |     }
573 |   }
574 | 
575 |   this->gpu_weights_input = this->layers[0].get_packed_input_weights_gpu();
576 |   this->gpu_weights_hidden = this->layers[0].get_packed_hidden_weights_gpu();
577 |   this->gpu_biases = this->layers[0].get_packed_biases_gpu();
578 |   this->gpu_weights_hidden_r = this->layers[0].get_packed_hidden_weights_r_gpu();
579 |   this->gpu_biases_r = this->layers[0].get_packed_biases_r_gpu();
580 |   
581 |   this->mm_k = this->initial_input_size;
582 |   this->mm_n = this->output_size * GRU_GATES;
583 |   this->num_partials = (this->output_size + this->tile_width - 1) / this->tile_width;
584 | 
585 |   // Single sized output buffer (Will change for multi-layer networks, one output per iteration networks)
586 |   cudaHostAlloc((void **) &(this->host_output), this->output_size * this->batch_size * sizeof(T), cudaHostAllocDefault);
587 |   cudaMalloc((void **) &(this->gpu_output), this->output_size * this->batch_size * sizeof(T));
588 |   
589 |   // Assume batch size less than 200
590 |   cudaMalloc((void **) &(this->gpu_inputs), this->initial_input_size * this->batch_size * 200 * sizeof(T));
591 |   cudaMalloc((void **) &(this->gpu_r), this->output_size * this->batch_size * this->num_partials * sizeof(T));
592 |   cudaMalloc((void **) &(this->gpu_precompute), this->output_size * this->batch_size * GRU_GATES * 200 * sizeof(T));
593 | 
594 |   // Hidden state initializer allocation
595 |   cudaMalloc((void **) &(this->gpu_hidden_initializer), this->output_size * this->batch_size * sizeof(T));
596 |   cudaMemset((void *)this->gpu_hidden_initializer, 0, this->output_size * this->batch_size * sizeof(T));
597 | 
598 |   cudaMalloc((void **) &(this->gpu_syncIn), 80 * sizeof(int));
599 |   cudaMalloc((void **) &(this->gpu_syncOut), 80 * sizeof(int));
600 |  
601 |   this->paramsMM[0] = (void*) &(this->gpu_inputs);
602 |   this->paramsMM[1] = (void*) &(this->gpu_weights_input);
603 |   this->paramsMM[2] = (void*) &(this->gpu_precompute);
604 |   this->paramsMM[4] = (void*) &(this->mm_k);
605 |   this->paramsMM[5] = (void*) &(this->mm_n);
606 | 
607 |   this->paramsGRU[0] = (void*) &(this->gpu_precompute);
608 |   this->paramsGRU[1] = (void*) &(this->gpu_hidden_initializer);
609 |   this->paramsGRU[2] = (void*) &(this->gpu_weights_hidden_r);
610 |   this->paramsGRU[3] = (void*) &(this->gpu_weights_hidden);
611 |   this->paramsGRU[4] = (void*) &(this->gpu_biases_r);
612 |   this->paramsGRU[5] = (void*) &(this->gpu_biases);
613 |   this->paramsGRU[6] = (void*) &(this->gpu_r);
614 |   this->paramsGRU[7] = (void*) &(this->gpu_output);
615 |   this->paramsGRU[8] = (void*) &(this->gpu_syncIn);
616 |   this->paramsGRU[9] = (void*) &(this->gpu_syncOut);
617 | 
618 |   return 0;
619 | }
620 | 
621 | // Define tiling configuration (should be encapsulated elsewhere)
622 | template <typename T>
623 | void GRUModelSingle<T>::set_configuration(int x, int y, int g, int t) {
624 |   this->tile_width = x;
625 |   this->tile_height = y;
626 |   this->num_groups = g;
627 |   this->group_threads = t;
628 | }
629 | 
630 | // Process input sequence batch
631 | template <typename T>
632 | float GRUModelSingle<T>::run_input(T* input, uint32_t * length) {
633 |   
634 |   // Define remaining kernel parameters
635 |   this->mm_m = this->batch_size * *length;
636 |   this->paramsMM[3] = (void *) &(this->mm_m);
637 |   this->paramsGRU[10] = (void *) length;
638 |   
639 |   // GEMM Kernel dimensioning
640 |   dim3 mm_grid = dim3((this->mm_n + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (this->mm_m + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
641 |   dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);
642 |   size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);
643 |   
644 |   // GRU Kernel dimensioning
645 |   int effective_w = (this->tile_width * this->num_groups) / 2;
646 |   dim3 gru_rnn_grid = dim3((this->output_size + effective_w - 1) / effective_w, (this->batch_size + this->tile_height - 1) / this->tile_height);
647 |   // While there are three gates, we use just two work groups per output to satisfy the dependency
648 |   dim3 gru_rnn_block = dim3(this->num_groups * this->group_threads);
649 |   unsigned block_size = gru_rnn_block.x;
650 |   unsigned grid_size = gru_rnn_grid.x * gru_rnn_grid.y;
651 | 
652 |   // GRU Kernel instantiation (currently only configured for manual tuning)
653 |   void * kernel = (void *)gru_rnn<256, 3, 1, 32, 8, 10>;
654 |   
655 |   // Check occupancy to prevent hangs
656 |   int numBlocks = 0;
657 |   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, block_size, 0);
658 |   if (grid_size > 80 * numBlocks) {
659 |     printf("grid_size: %3d numBlocks: %3d\n", grid_size, numBlocks);
660 |     return -std::numeric_limits<float>::infinity();
661 |   }
662 | 
663 |   cudaEvent_t start, end;
664 |   float elapsed;
665 |   
666 |   // Send sequence
667 |   cudaMemcpy(this->gpu_inputs, input, this->initial_input_size * this->batch_size * *length * sizeof(T), cudaMemcpyHostToDevice);
668 |   
669 |   // Timing
670 |   cudaEventCreate(&start);
671 |   cudaEventCreate(&end);
672 |   cudaEventRecord(start);
673 |   
674 |   // Kernel launches
675 |   cudaLaunchKernel((void *)matmul, mm_grid, mm_block, this->paramsMM, mm_sm_requirement);
676 |   cudaLaunchKernel(kernel, gru_rnn_grid, gru_rnn_block, this->paramsGRU);
677 |   
678 |   cudaEventRecord(end);
679 |   cudaEventSynchronize(end);
680 |   cudaEventElapsedTime(&elapsed, start, end);
681 |   
682 |   cudaMemcpy(this->host_output, this->gpu_output, this->output_size * this->batch_size * sizeof(T), cudaMemcpyDeviceToHost);
683 | 
684 | #ifdef DEBUG
685 |   // Value checking
686 |   for (int i = 0; i < this->batch_size; i++) {
687 |     printf("Sequence %2d\n", i);
688 |     for (int j = 0; j < this->output_size; j++) {
689 |       printf("%f ", this->host_output[i * this->output_size + j]);
690 |     }
691 |     printf("\n");
692 |   }
693 |   printf("\n");
694 | #endif
695 |   
696 |   // Runtime error checking
697 |   cudaError_t err;
698 |   cudaDeviceSynchronize();
699 |   if ((err = cudaGetLastError()) != cudaSuccess) {
700 |     printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__);
701 |     return std::numeric_limits<float>::infinity();
702 |   }
703 |  
704 |   return elapsed;
705 | }
706 | 
707 | // Explicit template instantiations
708 | template void process_input_weights<float>(float *, std::vector<float *>, uint32_t, uint32_t);
709 | template void process_hidden_weights<float>(float *, std::vector<float *>, uint32_t);
710 | template void process_biases<float>(float *, std::vector<float *>, uint32_t);
711 | template uint32_t GRULayerSingle<float>::initialize();
712 | template uint32_t GRUModelSingle<float>::initialize();
713 | template void GRULayerSingle<float>::reset();
714 | template void GRUModelSingle<float>::reset();
715 | template void GRUModelSingle<float>::set_configuration(int, int, int, int);
716 | template float GRUModelSingle<float>::run_input(float *, uint32_t *);
717 | 


--------------------------------------------------------------------------------
/src/GRU_double.cu:
--------------------------------------------------------------------------------
  1 | #include "GRU_double.h"
  2 | #include "misc.h"
  3 | 
  4 | #include <cstring>
  5 | #include <iostream>
  6 | #include <cooperative_groups.h>
  7 | #include <math.h>
  8 | #include <cassert>
  9 | #include <limits>
 10 | 
 11 | using namespace cooperative_groups;
 12 | 
 13 | __device__ __forceinline__ float sigmoidf(float x) {
 14 |   return 1.0f / (1.0f + expf(-1.0f * x));
 15 | }
 16 | 
 17 | #define MM_BLOCK_SIZE 16
 18 | #define MM_REG_TILE 4
 19 | #define MM_TILE_SIZE 64
 20 | 
 21 | // This is a mostly optimized kernel for matrix multiplication
 22 | // The kernel uses a two tiered tiling mechanism that first tiles large
 23 | // tiles from global memory to shared memory. This shared memory tile is
 24 | // then used as the source to stream data into register arrays that perform
 25 | // a calculation on a 8x8 tile.
 26 | 
 27 | __global__ void matmul(float * A, float * B, float * C,
 28 |                        uint32_t M, uint32_t K, uint32_t N) {
 29 |   
 30 |   extern __shared__ float base[];
 31 |   float* bufferA = base;
 32 |   float* bufferB = &bufferA[MM_TILE_SIZE * MM_TILE_SIZE];
 33 | 
 34 |   float regA[MM_REG_TILE];
 35 |   float regB[MM_REG_TILE];
 36 |   float regC[MM_REG_TILE][MM_REG_TILE];
 37 |   
 38 |   uint32_t tidx = threadIdx.x;
 39 |   uint32_t tidy = threadIdx.y;
 40 |   uint32_t id = threadIdx.y * blockDim.x + threadIdx.x;
 41 |   uint32_t bidx = blockIdx.x;
 42 |   uint32_t bidy = blockIdx.y;
 43 | 
 44 |   // Number of rows that are traversed in a single fully coalesced load sequence
 45 |   constexpr uint32_t LOAD_STEPS = MM_TILE_SIZE * MM_TILE_SIZE / (MM_BLOCK_SIZE * MM_BLOCK_SIZE);
 46 |   constexpr uint32_t NUM_THREADS = MM_BLOCK_SIZE * MM_BLOCK_SIZE;
 47 |   
 48 |   // Zero the intermediate output
 49 |   for (uint32_t y = 0; y < MM_REG_TILE; y++) {
 50 |     for (uint32_t x = 0; x < MM_REG_TILE; x++) {
 51 |       regC[y][x] = 0.0f;
 52 |     }
 53 |   }
 54 | 
 55 |   for (uint32_t i = 0; i < K; i += MM_TILE_SIZE) {
 56 |     
 57 |     // Load lhs tile from global memory to shared memory (fully coalesced)
 58 |     #pragma unroll
 59 |     for (uint32_t j = 0; j < LOAD_STEPS; j++) {
 60 |       uint32_t index = j * NUM_THREADS + id;
 61 |       if (((bidy * MM_TILE_SIZE + index / MM_TILE_SIZE) < M) && ((i + index % MM_TILE_SIZE) < K)) {
 62 |         bufferA[index] = A[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * K + i + index % MM_TILE_SIZE];
 63 |       } else {
 64 |         bufferA[index] = 0.0f;
 65 |       }
 66 |     }
 67 |     
 68 |     // Not necessary for correctness, but improves performance by avoiding thrashing shared memory
 69 |     __syncthreads();
 70 | 
 71 |     // Load rhs tile from global memory to shared memory (fully coalesced)
 72 |     #pragma unroll
 73 |     for (uint32_t j = 0; j < LOAD_STEPS; j++) {
 74 |       uint32_t index = j * NUM_THREADS + id;
 75 |       if (((i + index / MM_TILE_SIZE) < K) && ((bidx * MM_TILE_SIZE + index % MM_TILE_SIZE) < N)) {
 76 |         bufferB[index] = B[ ((index / MM_TILE_SIZE) + i) * N + bidx * MM_TILE_SIZE + index % MM_TILE_SIZE];
 77 |       } else {
 78 |         bufferB[index] = 0.0f;
 79 |       }
 80 |     }
 81 | 
 82 |     // Ensures all data is written from global memory to shared memory before it is streamed
 83 |     // into register arrays.
 84 |     __syncthreads();
 85 |     
 86 |     // Loop through full tile
 87 |     for (uint32_t j  = 0; j < MM_TILE_SIZE; j++) {
 88 |       
 89 |       // Load vector from lhs and rhs
 90 |       #pragma unroll
 91 |       for (uint32_t l = 0; l < MM_REG_TILE; l++) {
 92 |         regA[l] = bufferA[(tidy * MM_REG_TILE + l) * MM_TILE_SIZE + j];
 93 |         regB[l] = bufferB[j * MM_TILE_SIZE + tidx * MM_REG_TILE + l];
 94 |       }
 95 |       
 96 |       #pragma unroll
 97 |       // Perform a narrow matmul
 98 |       for (uint32_t y = 0; y < MM_REG_TILE; y++) {
 99 |         for (uint32_t x = 0; x < MM_REG_TILE; x++) {
100 |           regC[y][x] += regA[y] * regB[x];
101 |         }
102 |       }
103 |     }
104 | 
105 |     __syncthreads();
106 |   }
107 |  
108 |   // Write register intermediates to shared memory (possibly unnecessary)
109 |   for (uint32_t y = 0; y < MM_REG_TILE; y++) {
110 |     for (uint32_t x = 0; x < MM_REG_TILE; x++) {
111 |       bufferA[(tidy * MM_REG_TILE + y) * MM_TILE_SIZE + tidx * MM_REG_TILE + x] = regC[y][x];
112 |     }
113 |   }
114 | 
115 |   __syncthreads();
116 |   
117 |   for (uint32_t j = 0; j < LOAD_STEPS; j++) {
118 |     uint32_t index = j * NUM_THREADS + id;
119 |     if (((bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) < M) && ((bidx * MM_TILE_SIZE + (index % MM_TILE_SIZE)) < N)) {
120 |       C[ (bidy * MM_TILE_SIZE + (index / MM_TILE_SIZE)) * N + bidx * MM_TILE_SIZE +  (index % MM_TILE_SIZE)] = bufferA[index];
121 |     }
122 |   }
123 | }
124 | 
125 | // This kernel assumes the input multiplications were precomputed in a large matrix-matrix multiplication
126 | template<int HIDDEN_SIZE, int TILE_WIDTH, int TILE_HEIGHT, int NUM_GROUPS, int GROUP_THREADS, int BATCH_SIZE>
127 | __global__ void gru_rnn(const float* precomputed_inputs,
128 |                         const float* hidden_initializer,
129 |                         const float* weights, 
130 |                         const float* biases, 
131 |                         float* r, 
132 |                         float* output, 
133 |                         volatile int* syncIn,
134 |                         volatile int* syncOut,
135 |                         uint32_t length) {
136 |   
137 |   // Indexing helpers
138 |   int tid = threadIdx.x;
139 |   int bidx = blockIdx.x;
140 |   int bidy = blockIdx.y;
141 |   int wg_id = tid / GROUP_THREADS;
142 |   int r_id = tid / (2 * GROUP_THREADS);
143 | 
144 |   // LENGTH - How many weights for each output does a single thread need to store
145 |   constexpr int LENGTH = (HIDDEN_SIZE + GROUP_THREADS - 1) / GROUP_THREADS;
146 |   // BUFFER_SIZE - Number of elements to reserve in shared memory for each outout. Effectively
147 |   // rounds up HIDDEN_SIZE to the next multiple of NUM_THREADS
148 |   constexpr int BUFFER_SIZE = LENGTH * GROUP_THREADS;
149 |   // OUTPUT_TILE_WIDTH - How many full elements are produced by the threadblock. At scheduling time,
150 |   // must ensure that the launched configuration produces full elements within a single threadblock
151 |   constexpr int OUTPUT_TILE_WIDTH = NUM_GROUPS * TILE_WIDTH / (GRU_GATES - 1);
152 |   
153 |   
154 |   // Static shared memory allocation
155 |   __shared__ float buffer_tile[TILE_HEIGHT][BUFFER_SIZE];
156 |   __shared__ float z_gate[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
157 |   __shared__ float z_h_res[TILE_HEIGHT][OUTPUT_TILE_WIDTH];
158 |   __shared__ float h_gate[2][TILE_HEIGHT][OUTPUT_TILE_WIDTH];
159 |   
160 |   // Weights in the register file
161 |   float weights_reg[TILE_WIDTH][LENGTH];
162 |   float h_weights_reg[TILE_WIDTH][LENGTH / 2];
163 |   float outputs_reg[TILE_HEIGHT][TILE_WIDTH];
164 |   float bias = 0.0f;
165 |   float bias_h =  0.0f;
166 |   float precompute = 0.0f;
167 |   float precompute_h = 0.0f;
168 |   const float * precomputed_offset;
169 |   const float * precomputed_offset_h;
170 | 
171 |   // Cooperative group helpers
172 |   thread_block bl = this_thread_block();
173 |   thread_block_tile<GROUP_THREADS> work_group = tiled_partition<GROUP_THREADS>(bl);
174 |   
175 |   // Load weights to register array for either z or h gate
176 |   for (int i = 0; i < TILE_WIDTH; i++) {
177 |     // Global gate id for fetching weights.
178 |     // bidx * TILE_WIDTH * NUM_GROUPS -> the first gate index processed by the threadblock
179 |     // wg_id * TILE_WIDTH -> the first gate index processed by a given warp within the threadblock
180 |     // i -> current gate within the warp's assigned gates
181 |     // These gate indexes will only refer to gates r and z, not the h gate
182 |     int gate_id = bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + i;
183 |     // The following lines transform the assigned r/z gate into its real index within the weight array.
184 |     // Since we don't assign one of the gates, we undo the indexing to get a valid output_element.
185 |     // We then determine which gate within the output the current assigned gate is.
186 |     int output_element = (gate_id / (GRU_GATES - 1)) * GRU_GATES;
187 |     int gate_index = gate_id % (GRU_GATES - 1);
188 |     // Prevent segfaults
189 |     if (output_element < HIDDEN_SIZE) {
190 |       // 0 initialize rounded values. Better to have a single check now then on each recurrent iteration.
191 |       for (int j = 0; j < LENGTH; j++) {
192 |         if ( j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) {
193 |           weights_reg[i][j] = weights[(output_element * GRU_GATES + gate_index) * HIDDEN_SIZE + j * GROUP_THREADS + work_group.thread_rank()];
194 |         } else {
195 |           weights_reg[i][j] = 0.f;
196 |         }
197 |       }
198 |     }
199 |   }
200 |   
201 |   // Load weights to register arrays for h gate (weight columns divided between two workgroups)
202 |   for (int i = 0; i < TILE_WIDTH / 2; i++) {
203 |     int output_element = bidx * NUM_GROUPS * TILE_WIDTH / 2 + r_id * TILE_WIDTH + i;
204 |     int which_half = wg_id % 2;
205 |     if (output_element < HIDDEN_SIZE) {
206 |       for (int j = 0; j < LENGTH; j++) {
207 |         if ( which_half * BUFFER_SIZE / 2 + j * GROUP_THREADS + work_group.thread_rank() < HIDDEN_SIZE) {
208 |           h_weights_reg[i][j] = weights[output_element * GRU_GATES * HIDDEN_SIZE + 
209 |                                         (GRU_GATES - 1) * HIDDEN_SIZE + 
210 |                                         which_half * BUFFER_SIZE / 2 + 
211 |                                         j * GROUP_THREADS + work_group.thread_rank()];
212 |         } else {
213 |           h_weights_reg[i][j] = 0.f;
214 |         }
215 |       }
216 |     }
217 |   }
218 |   
219 |   // Calculate indexing for time independent partial sums for r and z gates
220 |   if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
221 |     int x = work_group.thread_rank() % TILE_WIDTH;
222 |     int y = work_group.thread_rank() / TILE_WIDTH;
223 | 
224 |     if ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + x) < HIDDEN_SIZE * (GRU_GATES - 1) && (bidy * TILE_HEIGHT + y < BATCH_SIZE)) {
225 |       int output_element = ((bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + x) / (GRU_GATES - 1)) * GRU_GATES;
226 |       int gate_index = (bidx * TILE_WIDTH * NUM_GROUPS + wg_id * TILE_WIDTH + x) % (GRU_GATES - 1);
227 |       bias = biases[output_element * GRU_GATES + gate_index];
228 | 
229 |       precomputed_offset = precomputed_inputs;
230 |       precomputed_offset += bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES;
231 |       precomputed_offset += y * HIDDEN_SIZE * GRU_GATES;
232 |       precomputed_offset += output_element * GRU_GATES;
233 |       precomputed_offset += gate_index;
234 |       precompute = *precomputed_offset;
235 |       precomputed_offset += BATCH_SIZE * HIDDEN_SIZE * GRU_GATES;
236 |     }
237 | 
238 |   }
239 |   
240 |   // Calculate indexing for time independent partial sums for h gate
241 |   if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) {
242 |     int x = tid % OUTPUT_TILE_WIDTH;
243 |     int y = tid / OUTPUT_TILE_WIDTH; 
244 |     if ((bidx * OUTPUT_TILE_WIDTH + x < HIDDEN_SIZE) && (bidy * TILE_HEIGHT + y < BATCH_SIZE)) {
245 |       bias_h = biases[(bidx * OUTPUT_TILE_WIDTH + x) * GRU_GATES + (GRU_GATES - 1)];
246 | 
247 |       precomputed_offset_h = precomputed_inputs;
248 |       precomputed_offset_h += (GRU_GATES - 1);
249 |       precomputed_offset_h += bidy * TILE_HEIGHT * HIDDEN_SIZE * GRU_GATES;
250 |       precomputed_offset_h += y * HIDDEN_SIZE * GRU_GATES;
251 |       precomputed_offset_h += (bidx * OUTPUT_TILE_WIDTH + x) * GRU_GATES;
252 |       precompute_h = *precomputed_offset_h;
253 |       precomputed_offset_h += BATCH_SIZE * HIDDEN_SIZE * GRU_GATES;
254 |     }
255 |   }
256 |   
257 |   // Zero the dot product accumulators
258 |   #pragma unroll
259 |   for (int j = 0; j < TILE_HEIGHT; j++) {
260 |     #pragma unroll
261 |     for (int i = 0; i < TILE_WIDTH; i++) {
262 |       outputs_reg[i][j] = 0.f;
263 |     }
264 |   }
265 |   
266 |   // Initialize hidden state according to memory / zero rest of buffer
267 |   #pragma unroll
268 |   for (int j = 0; j < TILE_HEIGHT; j++) {
269 |     #pragma unroll
270 |     for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) {
271 |       if ( i + tid < HIDDEN_SIZE) {
272 |         buffer_tile[j][i + tid] = hidden_initializer[(bidy * TILE_HEIGHT + j) * HIDDEN_SIZE + i + tid];
273 |       } else if (i + tid < BUFFER_SIZE) {
274 |         buffer_tile[j][i + tid] = 0.f;
275 |       }
276 |     }
277 |   }
278 |   
279 |   // Recurrent loop
280 |   for (int sequence_iteration = 0; sequence_iteration < length; sequence_iteration++) {
281 |     
282 |     /* r and z gates */
283 | 
284 |     // Dot product
285 |     #pragma unroll
286 |     for (int k = 0; k < LENGTH; k++) {
287 |       #pragma unroll
288 |       for (int j = 0; j < TILE_HEIGHT; j++) {
289 |         float val = buffer_tile[j][k * GROUP_THREADS + work_group.thread_rank()];
290 |         #pragma unroll
291 |         for (int i = 0; i < TILE_WIDTH; i++) {
292 |           outputs_reg[j][i] += val * weights_reg[i][k];
293 |         }
294 |       }
295 |     }
296 |     
297 |     // Reduction
298 |     #pragma unroll
299 |     for (int j = 0; j < TILE_HEIGHT; j++) {
300 |       #pragma unroll
301 |       for (int i = 0; i < TILE_WIDTH; i++) {
302 |         #pragma unroll
303 |         for (int k = 1; k < GROUP_THREADS; k *= 2) {
304 |           outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k);
305 |         }
306 |       }
307 |     }
308 |     
309 |     // Activations and broadcast of r gate
310 |     if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
311 |       int reg_x = work_group.thread_rank() % TILE_WIDTH;
312 |       int reg_y = work_group.thread_rank() / TILE_WIDTH;
313 | 
314 |       float val = outputs_reg[reg_y][reg_x] + bias + precompute;
315 |       val = sigmoidf(val);
316 | 
317 |       int gate_id = (wg_id * TILE_WIDTH + reg_x) % (GRU_GATES - 1);
318 |       int output_id = (bidx * NUM_GROUPS * TILE_WIDTH + wg_id * TILE_WIDTH + reg_x) / (GRU_GATES - 1);
319 |       
320 |       //r gate
321 |       if (gate_id == 0) { 
322 |         val = val * buffer_tile[reg_y][output_id];
323 |         if (output_id < HIDDEN_SIZE) {
324 |           r[(bidy * TILE_HEIGHT + reg_y) * HIDDEN_SIZE + output_id] = val;
325 |         }
326 |       } else {
327 |         int smem_id = (wg_id * TILE_WIDTH + reg_x) / (GRU_GATES - 1);
328 |         z_gate[reg_y][smem_id] = (1 - val);
329 |         z_h_res[reg_y][smem_id] = val * buffer_tile[reg_y][output_id];
330 |       }
331 |     }
332 | 
333 |     // Synchronize between r/z and h stages - signal stage 
334 |     if (tid == 0) {
335 |       syncIn[bidy * gridDim.x + bidx] =  2 * sequence_iteration + 1;
336 |     }
337 | 
338 |     __threadfence();
339 |     
340 |     // Zero dot product accumulators
341 |     #pragma unroll
342 |     for (int j = 0; j < TILE_HEIGHT; j++) {
343 |       #pragma unroll
344 |       for (int i = 0; i < TILE_WIDTH; i++) {
345 |         outputs_reg[i][j] = 0.f;
346 |       }
347 |     }
348 |     
349 |     // Synchronize between r/z and h stages - spin stage
350 |     if (bidx == 0) {
351 |       if (tid < gridDim.x) {
352 |         while ( syncIn[bidy * gridDim.x + tid] != 2 * sequence_iteration + 1) {
353 |         }
354 |       }
355 | 
356 |       __syncthreads();
357 | 
358 |       if (tid == 0) {
359 |         syncOut[bidy] = 2 * sequence_iteration + 1;
360 |       }
361 |     } else {
362 |       if (tid == 0) {
363 |         while (syncOut[bidy] != 2 * sequence_iteration + 1) {
364 |         }
365 |       }
366 |       __syncthreads();
367 |     }
368 |     
369 |     /* h gate */
370 | 
371 |     // Load r gate intermediate
372 |     #pragma unroll
373 |     for (int j = 0; j < TILE_HEIGHT; j++) {
374 |       #pragma unroll
375 |       for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) {
376 |         if ( i + tid < HIDDEN_SIZE ) {
377 |           buffer_tile[j][i + tid] = r[bidy * TILE_HEIGHT * HIDDEN_SIZE + j * HIDDEN_SIZE + i + tid];
378 |         } else if ( i + tid < BUFFER_SIZE) {
379 |           buffer_tile[j][i + tid] = 0.f;
380 |         }
381 |       }
382 |     }
383 | 
384 |     __syncthreads();
385 |     
386 |     int which_half = wg_id % 2;
387 |     
388 |     // Dot product
389 |     #pragma unroll
390 |     for (int k = 0; k < LENGTH / 2; k++) {
391 |       #pragma unroll
392 |       for (int j = 0; j < TILE_HEIGHT; j++) {
393 |         float val = buffer_tile[j][which_half * LENGTH * GROUP_THREADS / 2 + k * GROUP_THREADS + work_group.thread_rank()];
394 |         #pragma unroll
395 |         for (int i = 0; i < TILE_WIDTH; i++) {
396 |           outputs_reg[j][i] += val * h_weights_reg[i][k];
397 |         }
398 |       }
399 |     }
400 |     
401 |     // Reduction
402 |     #pragma unroll
403 |     for (int j = 0; j < TILE_HEIGHT; j++) {
404 |       #pragma unroll
405 |       for (int i = 0; i < TILE_WIDTH; i++) {
406 |         #pragma unroll
407 |         for (int k = 1; k < GROUP_THREADS; k *= 2) {
408 |           outputs_reg[j][i] += work_group.shfl_xor(outputs_reg[j][i], k);
409 |         }
410 |       }
411 |     }
412 |     
413 |     // Broadcast to shared memory
414 |     if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
415 |       int x = work_group.thread_rank() % TILE_WIDTH;
416 |       int y = work_group.thread_rank() / TILE_WIDTH;
417 | 
418 |       h_gate[which_half][y][r_id * TILE_WIDTH + x] = outputs_reg[y][x];
419 |     }
420 | 
421 |     __syncthreads();
422 |     
423 |     // Activation and elementwise operations
424 |     if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) {
425 |       int y = tid / OUTPUT_TILE_WIDTH;
426 |       int smem_x = tid % OUTPUT_TILE_WIDTH;
427 |       int global_x = bidx * OUTPUT_TILE_WIDTH + smem_x;
428 |       if (global_x < HIDDEN_SIZE) {
429 |         float val = tanh(h_gate[0][y][smem_x] + h_gate[1][y][smem_x] + precompute_h + bias_h);
430 | 
431 |         output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + (bidy * TILE_HEIGHT + y) * HIDDEN_SIZE + global_x] = z_h_res[y][smem_x] + z_gate[y][smem_x] * val;
432 |       }
433 |     }
434 |     
435 |     // Escape if at end of sequence length
436 |     if (sequence_iteration + 1 == length) break;
437 |     
438 |     // Synchronize between recurrent iterations - signal stage 
439 |     if (tid == 0) {
440 |       syncIn[bidy * gridDim.x + bidx] =  2 * sequence_iteration + 2;
441 |     }
442 |     __threadfence();
443 | 
444 |     // Fetch time independent partial sums for the next timestep
445 |     if (work_group.thread_rank() < TILE_WIDTH * TILE_HEIGHT) {
446 |       precompute = *precomputed_offset;
447 |       precomputed_offset += HIDDEN_SIZE * BATCH_SIZE * GRU_GATES;
448 |     }
449 | 
450 |     if (tid < OUTPUT_TILE_WIDTH * TILE_HEIGHT) {
451 |       precompute_h = *precomputed_offset_h;
452 |       precomputed_offset_h += HIDDEN_SIZE * BATCH_SIZE * GRU_GATES;
453 |     }
454 |     
455 |     // Synchronize between recurrent iterations - spin stage
456 |     if (bidx == 0) {
457 |       if (tid < gridDim.x) {
458 |         while ( syncIn[bidy * gridDim.x + tid] != 2 * sequence_iteration + 2) {
459 |         }
460 |       }
461 | 
462 |       __syncthreads();
463 | 
464 |       if (tid == 0) {
465 |         syncOut[bidy] = 2 * sequence_iteration + 2;
466 |       }
467 |     } else {
468 |       if (tid == 0) {
469 |         while (syncOut[bidy] != 2 * sequence_iteration + 2) {
470 |         }
471 |       }
472 |       __syncthreads();
473 |     }
474 |     
475 |     // Load output from t - 1 to buffer
476 |     #pragma unroll
477 |     for (int j = 0; j < TILE_HEIGHT; j++) {
478 |       #pragma unroll
479 |       for (int i = 0; i < BUFFER_SIZE; i += NUM_GROUPS * GROUP_THREADS) {
480 |         if ( i + tid < HIDDEN_SIZE ) {
481 |           buffer_tile[j][i + tid] = output[sequence_iteration * HIDDEN_SIZE * BATCH_SIZE + bidy * TILE_HEIGHT * HIDDEN_SIZE + j * HIDDEN_SIZE + i + tid];
482 |         } else if ( i + tid < BUFFER_SIZE) {
483 |           buffer_tile[j][i + tid] = 0.f;
484 |         }
485 |       }
486 |     }
487 |   }
488 | }
489 | 
490 | template<typename T>
491 | void process_input_weights(T * output, std::vector<T*> weights, uint32_t input_size, uint32_t hidden_size) { 
492 |   
493 |   // Outside loop is the input size
494 |   for (uint32_t j = 0; j < input_size; j++) {
495 |     // Width of the input weight matrix
496 |     for (uint32_t k = 0; k < hidden_size; k++) {
497 |       // Colocate the weights for each element
498 |       for (uint32_t i = 0; i < GRU_GATES; i++) {
499 |         output[(j * hidden_size + k) * GRU_GATES + i] = weights.at(i)[j * hidden_size + k];
500 |       }
501 |     }
502 |   }
503 | }
504 | 
505 | template<typename T>
506 | void process_hidden_weights(T * output, std::vector<T*> weights, uint32_t hidden_size) {
507 |   
508 |   // For each output element
509 |   for (uint32_t j = 0; j < hidden_size; j++) {
510 |     // For each gate
511 |     for (uint32_t k = 0; k < GRU_GATES; k++) {
512 |       // For each element for that gate
513 |       for (uint32_t i = 0; i < hidden_size; i++) {
514 |         output[j * GRU_GATES * hidden_size + k * hidden_size + i] = weights.at(3 + k)[i * hidden_size + j];
515 |       }
516 |     }
517 |   }
518 | }
519 | 
520 | template<typename T>
521 | void process_biases(T * output, std::vector<T*> weights, uint32_t hidden_size) {
522 | 
523 |   // For each output element
524 |   for (uint32_t k = 0; k < hidden_size; k++) {
525 |     // Colocate the biases for each element
526 |     for (uint32_t i = 0; i < GRU_GATES; i++) {
527 |       output[k * GRU_GATES + i] = weights.at(i + 6)[k];
528 |     }
529 |   }
530 | }
531 | 
532 | template <typename T>
533 | void GRULayerDouble<T>::reset() {
534 |   cudaFreeHost((void *) this->packed_input_weights);
535 |   cudaFreeHost((void *) this->packed_hidden_weights);
536 |   cudaFreeHost((void *) this->packed_biases);
537 |   cudaFree((void *) this->packed_input_weights_gpu);
538 |   cudaFree((void *) this->packed_hidden_weights_gpu);
539 |   cudaFree((void *) this->packed_biases_gpu);
540 | }
541 | 
542 | // Initialize and fill buffers for trained parameters
543 | template<typename T>
544 | uint32_t GRULayerDouble<T>::initialize() {
545 |   
546 |   uint32_t input_footprint = input_weight_footprint();
547 |   uint32_t hidden_footprint = hidden_weight_footprint();
548 |   uint32_t bias_footprint = bias_weight_footprint();
549 |   
550 |   // Allocate weights
551 |   cudaHostAlloc((void **) &(this->packed_input_weights), input_footprint, cudaHostAllocDefault); CUDA_ERR;
552 |   cudaHostAlloc((void **) &(this->packed_hidden_weights), hidden_footprint, cudaHostAllocDefault); CUDA_ERR;
553 |   cudaHostAlloc((void **) &(this->packed_biases), bias_footprint, cudaHostAllocDefault); CUDA_ERR;
554 |   cudaMalloc((void **) &(this->packed_input_weights_gpu), input_footprint); CUDA_ERR;
555 |   cudaMalloc((void **) &(this->packed_hidden_weights_gpu), hidden_footprint); CUDA_ERR;
556 |   cudaMalloc((void **) &(this->packed_biases_gpu), bias_footprint); CUDA_ERR;
557 |   
558 |   // Reorganize weights
559 |   process_input_weights(this->packed_input_weights, this->host_weights, this->input_size, this->hidden_size);
560 |   process_hidden_weights(this->packed_hidden_weights, this->host_weights, this->hidden_size);
561 |   process_biases(this->packed_biases, this->host_weights, this->hidden_size);
562 |   
563 |   // Transfer weights
564 |   cudaMemcpy(this->packed_input_weights_gpu, this->packed_input_weights, input_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
565 |   cudaMemcpy(this->packed_hidden_weights_gpu, this->packed_hidden_weights, hidden_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
566 |   cudaMemcpy(this->packed_biases_gpu, this->packed_biases, bias_footprint, cudaMemcpyHostToDevice); CUDA_ERR;
567 | 
568 |   return 0;
569 | }
570 | 
571 | // Frees allocated memory
572 | template <typename T>
573 | void GRUModelDouble<T>::reset() {
574 | 
575 |   for (auto& l: this->layers) {
576 |     l.reset();
577 |   }
578 | 
579 |   cudaFreeHost((void *) this->host_output);
580 |   cudaFree((void *) this->gpu_output);
581 |   
582 |   cudaFree((void *) this->gpu_r);
583 |   cudaFree((void *) this->gpu_inputs);
584 |   cudaFree((void *) this->gpu_precompute);
585 |   cudaFree((void *) this->gpu_syncIn);
586 |   cudaFree((void *) this->gpu_syncOut);
587 | }
588 | 
589 | // Allocates model buffers and initializes most kernel parameters
590 | template <typename T>
591 | uint32_t GRUModelDouble<T>::initialize() {
592 |   
593 |   for (auto& l: this->layers) {
594 |     uint32_t debug = l.initialize();
595 |     if (debug != 0) {
596 |       std::cout << "FAILURE\n";
597 |       return debug;
598 |     }
599 |   }
600 | 
601 |   this->gpu_weights_input = this->layers[0].get_packed_input_weights_gpu();
602 |   this->gpu_weights_hidden = this->layers[0].get_packed_hidden_weights_gpu();
603 |   this->gpu_biases = this->layers[0].get_packed_biases_gpu();
604 |   this->mm_k = this->initial_input_size;
605 |   this->mm_n = this->output_size * GRU_GATES;
606 | 
607 |   // Output allocation, assume sequence length less than 200
608 |   cudaHostAlloc((void **) &(this->host_output), this->output_size * this->batch_size * 200 * sizeof(T), cudaHostAllocDefault);
609 |   cudaMalloc((void **) &(this->gpu_output), this->output_size * this->batch_size * 200 * sizeof(T));
610 |   
611 |   // Input allocations, assume sequence length less than 200
612 |   cudaMalloc((void **) &(this->gpu_inputs), this->initial_input_size * this->batch_size * 200 * sizeof(T));
613 |   cudaMalloc((void **) &(this->gpu_r), this->output_size * this->batch_size * sizeof(T));
614 |   cudaMalloc((void **) &(this->gpu_precompute), this->output_size * this->batch_size * GRU_GATES * 200 * sizeof(T));
615 | 
616 |   // Hidden state initializer allocation
617 |   cudaMalloc((void **) &(this->gpu_hidden_initializer), this->output_size * this->batch_size * sizeof(T));
618 |   cudaMemset((void *)this->gpu_hidden_initializer, 0, this->output_size * this->batch_size * sizeof(T));
619 | 
620 |   // Synchronization buffer initialization
621 |   cudaMalloc((void **) &(this->gpu_syncIn), 80 * sizeof(int));
622 |   cudaMalloc((void **) &(this->gpu_syncOut), 80 * sizeof(int));
623 |  
624 |   //cudaFuncSetAttribute(gru_rnn, cudaFuncAttributeMaxDynamicSharedMemorySize, MAX_SMEM); CUDA_ERR;
625 |   cudaDeviceSetLimit(cudaLimitStackSize, 0); CUDA_ERR;
626 | 
627 |   this->paramsMM[0] = (void*) &(this->gpu_inputs);
628 |   this->paramsMM[1] = (void*) &(this->gpu_weights_input);
629 |   this->paramsMM[2] = (void*) &(this->gpu_precompute);
630 |   this->paramsMM[4] = (void*) &(this->mm_k);
631 |   this->paramsMM[5] = (void*) &(this->mm_n);
632 | 
633 |   this->paramsGRU[0] = (void*) &(this->gpu_precompute);
634 |   this->paramsGRU[1] = (void*) &(this->gpu_hidden_initializer);
635 |   this->paramsGRU[2] = (void*) &(this->gpu_weights_hidden);
636 |   this->paramsGRU[3] = (void*) &(this->gpu_biases);
637 |   this->paramsGRU[4] = (void*) &(this->gpu_r);
638 |   this->paramsGRU[5] = (void*) &(this->gpu_output);
639 |   this->paramsGRU[6] = (void*) &(this->gpu_syncIn);
640 |   this->paramsGRU[7] = (void*) &(this->gpu_syncOut);
641 | 
642 |   return 0;
643 | }
644 | 
645 | // Set tiling parameters (should be encapsulated elsewhere)
646 | template <typename T>
647 | void GRUModelDouble<T>::set_configuration(int x, int y, int g, int t) {
648 |   this->tile_width = x;
649 |   this->tile_height = y;
650 |   this->num_groups = g;
651 |   this->group_threads = t;
652 | }
653 | 
654 | // Process input sequence (both time dependent and independent
655 | template <typename T>
656 | float GRUModelDouble<T>::run_input(T* input, uint32_t * length) {
657 |   
658 |   // Initialize remaining kernel parameters
659 |   this->mm_m = this->batch_size * *length;
660 |   this->paramsMM[3] = (void *) &(this->mm_m);
661 |   this->paramsGRU[8] = (void *) length;
662 |  
663 |   // GEMM Kernel Dimensioning
664 |   dim3 mm_grid = dim3((this->mm_n + MM_TILE_SIZE - 1) / MM_TILE_SIZE, (this->mm_m + MM_TILE_SIZE - 1) / MM_TILE_SIZE);
665 |   dim3 mm_block = dim3(MM_BLOCK_SIZE, MM_BLOCK_SIZE);
666 |   size_t mm_sm_requirement = MM_TILE_SIZE * MM_TILE_SIZE * 2 * sizeof(float);
667 |   
668 |   // GRU Double Kernel Dimensioning
669 |   int effective_w = (this->num_groups * this->tile_width) / (GRU_GATES - 1);
670 |   dim3 gru_rnn_grid = dim3((this->output_size + effective_w - 1) / effective_w, (this->batch_size + this->tile_height - 1) / this->tile_height);
671 |   dim3 gru_rnn_block = dim3(this->num_groups * this->group_threads);
672 |   unsigned block_size = gru_rnn_block.x;
673 |   unsigned grid_size = gru_rnn_grid.x * gru_rnn_grid.y;
674 |   
675 |   // Kernel instantiation (currently only configured for manual tuning)
676 |   void * kernel = (void *)gru_rnn<1024, 4, 5, 8, 32, 5>;
677 | 
678 |   // Check occupancy before running to prevent program hangs
679 |   int numBlocks = 0;
680 |   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, block_size, 0);
681 |   if (grid_size > 80 * numBlocks) {
682 |     printf("grid_size: %3d numBlocks: %3d block_size: %3d\n", grid_size, numBlocks * 80, block_size);
683 |     return -std::numeric_limits<float>::infinity();
684 |   }
685 | 
686 |   cudaEvent_t start, end;
687 |   float elapsed;
688 |   
689 |   // Send inputs
690 |   cudaMemcpy(this->gpu_inputs, input, this->initial_input_size * this->batch_size * *length * sizeof(T), cudaMemcpyHostToDevice);
691 |   
692 |   // Timing
693 |   cudaEventCreate(&start);
694 |   cudaEventCreate(&end);
695 |   cudaEventRecord(start);
696 |   
697 |   // Kernel launches
698 |   cudaLaunchKernel((void *)matmul, mm_grid, mm_block, this->paramsMM, mm_sm_requirement);
699 |   cudaLaunchKernel(kernel, gru_rnn_grid, gru_rnn_block, this->paramsGRU);
700 |   
701 |   cudaEventRecord(end);
702 |   cudaEventSynchronize(end);
703 |   cudaEventElapsedTime(&elapsed, start, end);
704 |   
705 |   cudaMemcpy(this->host_output, this->gpu_output, this->output_size * this->batch_size * sizeof(T), cudaMemcpyDeviceToHost);
706 | 
707 | #ifdef DEBUG
708 |   // Value checking
709 |   for (int i = 0; i < this->batch_size; i++) {
710 |     printf("Sequence %2d\n", i);
711 |     for (int j = 0; j < this->output_size; j++) {
712 |       printf("%f ", this->host_output[i * this->output_size + j]);
713 |     }
714 |     printf("\n");
715 |   }
716 |   printf("\n");
717 | #endif
718 |   
719 |   // Check for runtime errors
720 |   cudaError_t err;
721 |   cudaDeviceSynchronize();
722 |   if ((err = cudaGetLastError()) != cudaSuccess) {
723 |     printf("CUDA error: %d : %s : %s, line %d\n", err, cudaGetErrorString(err), __FILE__, __LINE__);
724 |     return std::numeric_limits<float>::infinity();
725 |   }
726 |  
727 |   return elapsed;
728 | }
729 | 
730 | // Explicit template instantiations
731 | template void process_input_weights<float>(float *, std::vector<float *>, uint32_t, uint32_t);
732 | template void process_hidden_weights<float>(float *, std::vector<float *>, uint32_t);
733 | template void process_biases<float>(float *, std::vector<float *>, uint32_t);
734 | template uint32_t GRULayerDouble<float>::initialize();
735 | template uint32_t GRUModelDouble<float>::initialize();
736 | template void GRULayerDouble<float>::reset();
737 | template void GRUModelDouble<float>::reset();
738 | template void GRUModelDouble<float>::set_configuration(int, int, int, int);
739 | template float GRUModelDouble<float>::run_input(float *, uint32_t *);
740 | 


--------------------------------------------------------------------------------