├── .gitignore ├── README.md ├── batching ├── Makefile ├── gemm.cu ├── k.sh ├── kernel.h ├── kernel_128.h ├── kernel_256.h ├── log ├── run.sh └── thres.sh ├── cke ├── Makefile ├── gemm.cu ├── log └── run.sh ├── data ├── Makefile ├── gen_data ├── gen_data.cpp └── input ├── default ├── Makefile ├── gemm.cu ├── log └── run.sh ├── google-net_cudnn ├── .gitignore ├── Makefile ├── activation.cpp ├── activation.h ├── batch-inception.cu ├── batch-inception.h ├── concat.cu ├── concat.h ├── conv.cpp ├── conv.h ├── dropout.cpp ├── dropout.h ├── gemm_kernel.h ├── im2col.h ├── inception.cpp ├── inception.h ├── loss.cpp ├── loss.h ├── lrn.cpp ├── lrn.h ├── main.cpp ├── pooling.cpp ├── pooling.h ├── softmax.cpp ├── softmax.h └── util.h ├── include └── util.h ├── magma ├── Makefile ├── gemm.cu ├── kernel.h ├── log └── run.sh └── tiling ├── Makefile ├── gemm.cu ├── kernel.h ├── kernel_128.h ├── kernel_256.h ├── log ├── run.sh └── thres.sh /.gitignore: -------------------------------------------------------------------------------- 1 | */gemm 2 | */gemm.o 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | General matrix multiplication (GEMM) plays a paramount role in a broad range of domains such as deep learning, scientific computing, and image processing. Many researchers have spent a great deal of effort on optimizing GEMM to exploit the enormous computing power of GPUs. The primary optimization method is to partition the matrix into many tiles and exploit the parallelism between and within tiles, which closely mirrors the thread hierarchy on GPUs. In practice, GPUs can fully unleash their computing power when the matrix size is large and there are a sufficient number of tiles and enough workload within each tile. However, in many real-world applications, especially in deep learning domains, the matrix size is small. 
Besides, in many other fields, such as astrophysics, metabolic networks, high-order FEM schemes, and deep learning, the matrix size is also not large enough to fully utilize the GPU hardware resources. To this end, batched GEMM has been proposed to process a group of small independent GEMMs together. However, prior work optimizes only from either the tiling or the batching perspective. 3 | 4 | In this paper, we propose a coordinated tiling and batching framework for accelerating GEMM on GPUs. Our solution exploits the synergistic interaction between the two optimization knobs. It is composed of two engines: a tiling engine and a batching engine. In the tiling engine, we first design a series of tiling strategies dedicated to the batched GEMM scenario. Then, we design an algorithm to select the tiling strategy for each GEMM. The tiling engine thus generates multiple tiles from the GEMMs. The batching engine is responsible for assigning the tiles to thread blocks. We design a series of batching algorithms to determine the assignment of tiles to thread blocks. Then, we propose a general programming style to describe the coordinated tiling and batching solution. Finally, experimental evaluation results show that our framework can achieve about 40% performance speedup over the state-of-the-art work. 
5 | -------------------------------------------------------------------------------- /batching/Makefile: -------------------------------------------------------------------------------- 1 | #GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 2 | GENCODE_FLAGS = -gencode arch=compute_70,code=compute_70 3 | 4 | gemm:gemm.cu kernel.h 5 | nvcc $< -o $@ --std=c++11 -O3 ${GENCODE_FLAGS} -Xptxas -v -res-usage 6 | clean: 7 | rm -rf gemm *.o 8 | -------------------------------------------------------------------------------- /batching/gemm.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../include/util.h" 6 | #include "kernel.h" 7 | 8 | #define N_RUNS 10 9 | 10 | int main (int argc, char** argv) { 11 | 12 | ErrChk(cudaSetDevice(0)); 13 | 14 | if(argc<2){ 15 | printf("Usage: input the batch size\n"); 16 | exit(EXIT_FAILURE); 17 | } 18 | 19 | int BATCH = atoi(argv[1]); 20 | //int TLP_thres = atoi(argv[2]); 21 | int TLP_thres = 65536*2; 22 | 23 | int *M; 24 | int *N; 25 | int *K; 26 | 27 | M = (int*) malloc(BATCH * sizeof(int)); 28 | N = (int*) malloc(BATCH * sizeof(int)); 29 | K = (int*) malloc(BATCH * sizeof(int)); 30 | 31 | std::fstream fs; 32 | fs.open("../data/input"); 33 | if (!fs.is_open()){ 34 | printf("Error opening input\n"); 35 | exit(EXIT_FAILURE); 36 | } 37 | 38 | //read matrix config 39 | for (int i=0; i>M[i]>>N[i]>>K[i]; 41 | } 42 | 43 | float **A; 44 | float **B; 45 | float **C; 46 | 47 | A = (float**) malloc(BATCH * sizeof(float*)); 48 | B = (float**) malloc(BATCH * sizeof(float*)); 49 | C = (float**) malloc(BATCH * sizeof(float*)); 50 | 51 | for (int i=0; i TLP_thres && M[j]>t_strategy[j] && K[j]<=32) 156 | 
b_strategy[j] = 2; 157 | } 158 | 159 | 160 | int *dev_Ba; 161 | ErrChk(cudaMalloc((void**)&dev_Ba, BATCH*sizeof(int))); 162 | ErrChk(cudaMemcpy(dev_Ba, b_strategy, BATCH*sizeof(int), cudaMemcpyHostToDevice)); 163 | 164 | 165 | /* 166 | //print the obtained batching strategy 167 | for (int j=0; j M[j]/b_strategy[j]/tile_size[t_strategy[j]][0])? (grid_size.x):(M[j]/b_strategy[j]/tile_size[t_strategy[j]][0]); 189 | grid_size.y = (grid_size.y > N[j]/tile_size[t_strategy[j]][1])? (grid_size.y):(N[j]/tile_size[t_strategy[j]][1]); 190 | } 191 | 192 | // printf("%d %d %d\n", grid_size.x, grid_size.y, grid_size.z); 193 | 194 | //warm-up 195 | gemm_256<<>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T, dev_Ba); 196 | KernelErrChk(); 197 | 198 | ErrChk(cudaEventCreate(&start)); 199 | ErrChk(cudaEventRecord(start,0)); 200 | 201 | for (int run = 0; run>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T, dev_Ba); 203 | KernelErrChk(); 204 | } 205 | 206 | ErrChk(cudaEventCreate(&stop)); 207 | ErrChk(cudaEventRecord(stop,0)); 208 | ErrChk(cudaEventSynchronize(stop)); 209 | ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop)); 210 | 211 | time = elapsedTime/N_RUNS; 212 | time /= 1.0e3; //convert time unit from millisecond to second 213 | gflops_per_sec = gflops / time; 214 | printf("%f\n", gflops_per_sec); 215 | 216 | for (int i=0; i> log 7 | done 8 | for ((thres=16; thres<=1024; thres=thres*2)) 9 | do 10 | ./gemm 32 $thres >> log 11 | done 12 | for ((thres=16; thres<=1024; thres=thres*2)) 13 | do 14 | ./gemm 64 $thres >> log 15 | done 16 | for ((thres=16; thres<=1024; thres=thres*2)) 17 | do 18 | ./gemm 128 $thres >> log 19 | done 20 | for ((thres=16; thres<=1024; thres=thres*2)) 21 | do 22 | ./gemm 256 $thres >> log 23 | done 24 | -------------------------------------------------------------------------------- /batching/kernel.h: -------------------------------------------------------------------------------- 1 | #include "kernel_128.h" 2 | #include "kernel_256.h" 3 | 4 | 
template 5 | __global__ void gemm(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[], int B_strategy[]){} 6 | 7 | 8 | /* 9 | template<> 10 | __global__ void gemm<128>(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]){ 11 | 12 | extern __shared__ float sh[]; 13 | 14 | int begin = Tile[blockIdx.x]; 15 | int end = Tile[blockIdx.x+1]; 16 | int t = T_strategy[blockIdx.z]; 17 | 18 | //main loop for all tiles assigned to this block 19 | #pragma unroll 20 | for (int b=begin; b 67 | __global__ void gemm_256(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[], int B_strategy[]){ 68 | 69 | extern __shared__ float sh[]; 70 | 71 | int i = blockIdx.z; 72 | int t = T_strategy[i]; 73 | int b = B_strategy[i]; 74 | int by; 75 | int bx; 76 | //main loop for all tiles assigned to this block 77 | 78 | for (int j=0; j /dev/null 11 | ./gemm 4 >> log 12 | ./gemm 8 >> log 13 | ./gemm 16 >> log 14 | ./gemm 32 >> log 15 | ./gemm 64 >> log 16 | ./gemm 128 >> log 17 | ./gemm 256 >> log 18 | done 19 | done 20 | -------------------------------------------------------------------------------- /batching/thres.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -f log 4 | for ((thres=65536; thres<=102400000; thres=thres*2)) 5 | do 6 | ./gemm 16 $thres >> log 7 | done 8 | for ((thres=65536; thres<=102400000; thres=thres*2)) 9 | do 10 | ./gemm 32 $thres >> log 11 | done 12 | for ((thres=65536; thres<=102400000; thres=thres*2)) 13 | do 14 | ./gemm 64 $thres >> log 15 | done 16 | for ((thres=65536; thres<=102400000; thres=thres*2)) 17 | do 18 | ./gemm 128 $thres >> log 19 | done 20 | for ((thres=65536; thres<=102400000; thres=thres*2)) 21 | do 22 | ./gemm 256 $thres >> log 23 | done 24 | -------------------------------------------------------------------------------- /cke/Makefile: 
-------------------------------------------------------------------------------- 1 | GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 2 | 3 | gemm:gemm.cu 4 | nvcc $< -o $@ --std=c++11 -O3 -lcublas ${GENCODE_FLAGS} 5 | clean: 6 | rm -rf gemm *.o 7 | -------------------------------------------------------------------------------- /cke/gemm.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../include/util.h" 6 | 7 | #define N_RUNS 10 8 | 9 | int main (int argc, char** argv) { 10 | 11 | ErrChk(cudaSetDevice(0)); 12 | 13 | if(argc<2){ 14 | printf("Usage: input the batch size\n"); 15 | exit(EXIT_FAILURE); 16 | } 17 | 18 | int BATCH = atoi(argv[1]); 19 | 20 | int *M; 21 | int *N; 22 | int *K; 23 | 24 | M = (int*) malloc(BATCH * sizeof(int)); 25 | N = (int*) malloc(BATCH * sizeof(int)); 26 | K = (int*) malloc(BATCH * sizeof(int)); 27 | 28 | std::fstream fs; 29 | fs.open("../data/input"); 30 | if (!fs.is_open()){ 31 | printf("Error opening input\n"); 32 | exit(EXIT_FAILURE); 33 | } 34 | 35 | //read matrix config 36 | for (int i=0; i>M[i]>>N[i]>>K[i]; 38 | } 39 | 40 | float **A; 41 | float **B; 42 | float **C; 43 | float alpha = 1.f; 44 | float beta = 0.f; 45 | 46 | A = (float**) malloc(BATCH * sizeof(float*)); 47 | B = (float**) malloc(BATCH * sizeof(float*)); 48 | C = (float**) malloc(BATCH * sizeof(float*)); 49 | 50 | for (int i=0; i /dev/null 11 | ./gemm 4 >> log 12 | ./gemm 8 >> log 13 | ./gemm 16 >> log 14 | ./gemm 32 >> log 15 | ./gemm 64 >> log 16 | ./gemm 128 >> log 17 | ./gemm 256 >> log 18 | done 19 | done 20 | 
-------------------------------------------------------------------------------- /data/Makefile: -------------------------------------------------------------------------------- 1 | gen_data:gen_data.cpp 2 | rm -f input 3 | touch input 4 | g++ $< -o $@ 5 | 6 | clean: 7 | rm -f input gen_data 8 | -------------------------------------------------------------------------------- /data/gen_data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lixiuhong/batched_gemm/03f1b28238b4c6da913aa561972f8f0202399571/data/gen_data -------------------------------------------------------------------------------- /data/gen_data.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #define random(x) (rand()%(x)) 7 | 8 | int main(int argc, char *argv[]){ 9 | 10 | if (argc<3){ 11 | printf("Usage: please input two integers\n"); 12 | printf("The first one represents the largest matrix size (M, N)\n"); 13 | printf("The second one represents the K\n"); 14 | exit(EXIT_FAILURE); 15 | } 16 | 17 | 18 | std::fstream fs; 19 | fs.open("../data/input"); 20 | if (!fs.is_open()){ 21 | printf("Error opening input\n"); 22 | exit(EXIT_FAILURE); 23 | } 24 | 25 | int e = atoi(argv[1]); 26 | int log_e = 0; 27 | 28 | while(e>=16){ 29 | e = e>>1; 30 | log_e++; 31 | } 32 | 33 | int K = atoi(argv[2]); 34 | //read matrix config 35 | for (int i=0; i<256; ++i){ 36 | int M = 16< 2 | #include 3 | #include 4 | #include 5 | #include "../include/util.h" 6 | 7 | #define N_RUNS 10 8 | 9 | int main (int argc, char** argv) { 10 | 11 | ErrChk(cudaSetDevice(0)); 12 | 13 | if(argc<2){ 14 | printf("Usage: input the batch size\n"); 15 | exit(EXIT_FAILURE); 16 | } 17 | 18 | int BATCH = atoi(argv[1]); 19 | 20 | int *M; 21 | int *N; 22 | int *K; 23 | 24 | M = (int*) malloc(BATCH * sizeof(int)); 25 | N = (int*) malloc(BATCH * sizeof(int)); 26 | K = (int*) malloc(BATCH * 
sizeof(int)); 27 | 28 | std::fstream fs; 29 | fs.open("../data/input"); 30 | if (!fs.is_open()){ 31 | printf("Error opening input\n"); 32 | exit(EXIT_FAILURE); 33 | } 34 | 35 | //read matrix config 36 | for (int i=0; i>M[i]>>N[i]>>K[i]; 38 | } 39 | 40 | float **A; 41 | float **B; 42 | float **C; 43 | float alpha = 1.f; 44 | float beta = 0.f; 45 | 46 | A = (float**) malloc(BATCH * sizeof(float*)); 47 | B = (float**) malloc(BATCH * sizeof(float*)); 48 | C = (float**) malloc(BATCH * sizeof(float*)); 49 | 50 | for (int i=0; i /dev/null 11 | ./gemm 4 >> log 12 | ./gemm 8 >> log 13 | ./gemm 16 >> log 14 | ./gemm 32 >> log 15 | ./gemm 64 >> log 16 | ./gemm 128 >> log 17 | ./gemm 256 >> log 18 | done 19 | done 20 | -------------------------------------------------------------------------------- /google-net_cudnn/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | google-net_cudnn 3 | tags 4 | .cproject 5 | .project 6 | .ptp-sync/ 7 | .settings/ 8 | Debug/ 9 | Release/ 10 | -------------------------------------------------------------------------------- /google-net_cudnn/Makefile: -------------------------------------------------------------------------------- 1 | USE_MULTI_STREAM ?= 0 2 | 3 | ifeq ($(USE_MULTI_STREAM), 1) 4 | COMMON_FLAGS += -DUSE_MULTI_STREAM=1 5 | endif 6 | 7 | GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 8 | 9 | google-net_cudnn:conv.o main.o activation.o pooling.o lrn.o concat.o dropout.o inception.o batch-inception.o loss.o softmax.o 10 | nvcc $^ -o $@ -lcudnn -lcublas ${GENCODE_FLAGS} $(COMMON_FLAGS) 11 | 12 | main.o:main.cpp util.h conv.h activation.h pooling.h lrn.h 13 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 14 | conv.o:conv.cpp 
util.h 15 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 16 | activation.o:activation.cpp util.h 17 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 18 | pooling.o:pooling.cpp util.h 19 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 20 | lrn.o:lrn.cpp util.h 21 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 22 | concat.o:concat.cu concat.h 23 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 24 | dropout.o:dropout.cpp dropout.h 25 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 26 | loss.o:loss.cpp loss.h 27 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 28 | softmax.o:softmax.cpp softmax.h 29 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 30 | inception.o:inception.cpp inception.h 31 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 32 | batch-inception.o:batch-inception.cu batch-inception.h 33 | nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS) 34 | 35 | clean: 36 | rm -f *.o 37 | -------------------------------------------------------------------------------- /google-net_cudnn/activation.cpp: -------------------------------------------------------------------------------- 1 | #include "cudnn.h" 2 | #include "util.h" 3 | #include 4 | 5 | void activation(cudnnHandle_t handle, int N, int C, int H, int W, float *input, float *output, cudaStream_t s){ 6 | 7 | float one = 1.0, zero = 0.0; 8 | 9 | ErrChk(cudnnSetStream(handle, s)); 10 | 11 | cudnnActivationDescriptor_t activationDesc; 12 | ErrChk(cudnnCreateActivationDescriptor(&activationDesc)); 13 | ErrChk(cudnnSetActivationDescriptor(activationDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.f)); 14 | 15 | cudnnTensorDescriptor_t xDesc; 16 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 17 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 18 | 19 | cudnnTensorDescriptor_t yDesc; 20 | ErrChk(cudnnCreateTensorDescriptor(&yDesc)); 21 | ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 22 | 23 
| ErrChk(cudnnActivationForward(handle, activationDesc, &one, xDesc, input, &zero, yDesc, output)); 24 | 25 | ErrChk(cudnnDestroyActivationDescriptor(activationDesc)); 26 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 27 | ErrChk(cudnnDestroyTensorDescriptor(yDesc)); 28 | 29 | } 30 | -------------------------------------------------------------------------------- /google-net_cudnn/activation.h: -------------------------------------------------------------------------------- 1 | #ifndef __ACTIVATION_H__ 2 | #define __ACTIVATION_H__ 3 | void activation(cudnnHandle_t handle, int N, int C, int H, int W, float *input, float *output, cudaStream_t s=0); 4 | #endif 5 | -------------------------------------------------------------------------------- /google-net_cudnn/batch-inception.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "cudnn.h" 4 | #include "util.h" 5 | #include 6 | #include "conv.h" 7 | #include "pooling.h" 8 | #include "activation.h" 9 | #include "dropout.h" 10 | #include "lrn.h" 11 | #include "concat.h" 12 | #include "im2col.h" 13 | #include "gemm_kernel.h" 14 | 15 | 16 | /* 17 | * do Inception 18 | * 19 | * This func will consume 6 filters and 4 features. 20 | * Use x[xIdx] as x[xIdx], which should be set before this func. 21 | * Use x[xIdx + 4] as output. 22 | * 23 | */ 24 | void batchGoogleNetInception(cudnnHandle_t handle, const int N, const int C, 25 | const int H, const int W, const int xIdx, const int filterIdx, 26 | const int K1, const int K2, const int K3, const int K4, const int K5, 27 | const int K6, int *reC, float **x, float** filter, float* buf, 28 | const int *algo_best) { 29 | /* 30 | * Use x[xIdx + 8] as output. 31 | * We can concat the result directly when N == 1. 
32 | */ 33 | float *output = x[xIdx + 4]; 34 | float *output1 = output; 35 | float *output2 = output1 + K1 * H * W; 36 | float *output3 = output2 + K3 * H * W; 37 | float *output4 = output3 + K5 * H * W; 38 | 39 | //pool 40 | pooling(handle, N, C, H, W, 3, 3, 1, 1, 1, 1, H, W, 41 | x[xIdx], x[xIdx + 3]); 42 | 43 | // the first four-batch conv 44 | int M_MAX = N * H * W; 45 | int N_MAX = std::max(K1, std::max(K2, std::max(K3, K4))); 46 | dim3 grid_size((M_MAX - 1) / 16 + 1, (N_MAX - 1) / 16 + 1, 4); 47 | dim3 block_size(64, 1, 1); 48 | gemm_4<<>>( 49 | N * H * W, K1, K2, K4, K6, C, H, W, x[xIdx], x[xIdx + 3], 50 | filter[filterIdx], filter[filterIdx + 1], filter[filterIdx + 3], 51 | filter[filterIdx + 5], output1, x[xIdx + 1], x[xIdx + 2], output4); 52 | KernelErrChk(); 53 | 54 | //relu 1*1 55 | activation(handle, N, K1, H, W, output1, output1); 56 | 57 | //relu 3*3 reduce 58 | activation(handle, N, K2, H, W, x[xIdx + 1], 59 | x[xIdx + 1]); 60 | 61 | //3*3 62 | int algo = algo_best[(filterIdx+2)*7]; 63 | conv(handle, N, C, H, W, K3, 3, 3, 1, 1, 1, 1, H, W, 64 | x[xIdx + 1], filter[filterIdx+2], buf, 65 | output2, algo); 66 | 67 | //relu 3*3 68 | activation(handle, N, K3, H, W, output2, output2); 69 | 70 | //relu 5*5 reduce 71 | activation(handle, N, K4, H, W, x[xIdx + 2], 72 | x[xIdx + 2]); 73 | 74 | //5*5 75 | algo = algo_best[(filterIdx+4)*7]; 76 | conv(handle, N, C, H, W, K5, 5, 5, 1, 1, 2, 2, H, W, 77 | x[xIdx + 2], filter[filterIdx+4], buf, 78 | output3, algo); 79 | 80 | //relu 5*5 81 | activation(handle, N, K5, H, W, output3, output3); 82 | 83 | //relu pool proj 84 | activation(handle, N, K6, H, W, output4, output4); 85 | 86 | //compute return shape 87 | *reC = K1 + K3 + K5 + K6; 88 | } 89 | -------------------------------------------------------------------------------- /google-net_cudnn/batch-inception.h: -------------------------------------------------------------------------------- 1 | /* 2 | * batch-inception.h 3 | * 4 | * Created on: Nov 5, 2018 5 | * 
Author: cambricon 6 | */ 7 | 8 | #ifndef BATCH_INCEPTION_H_ 9 | #define BATCH_INCEPTION_H_ 10 | 11 | /* 12 | * do Inception 13 | * 14 | * This func will consume 6 filters and 4 features. 15 | * Use x[xIdx] as x[xIdx], which should be set before this func. 16 | * Use x[xIdx + 4] as output. 17 | * 18 | */ 19 | void batchGoogleNetInception(cudnnHandle_t handle, const int N, const int C, 20 | const int H, const int W, const int xIdx, const int filterIdx, 21 | const int K1, const int K2, const int K3, const int K4, const int K5, 22 | const int K6, int *reC, float **x, float** filter, float* buf, 23 | const int *algo_best); 24 | 25 | #endif /* BATCH_INCEPTION_H_ */ 26 | -------------------------------------------------------------------------------- /google-net_cudnn/concat.cu: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | __global__ void cudaConcatKernel(size_t numIns, size_t innerStride, 4 | size_t outerStride, size_t* concatDims, const float **ins, float *out) { 5 | size_t batchSize = 0; 6 | for (size_t i = 0; i < numIns; ++i) { 7 | batchSize += concatDims[i]*innerStride; 8 | } 9 | 10 | size_t iOuter = blockIdx.x; 11 | float* outPtr = out + iOuter*batchSize; 12 | for (size_t j = 0; j < numIns; ++j) { 13 | for (size_t k = 0; k < concatDims[j]; ++k) { 14 | for (size_t l = 0; l < (innerStride - 1)/blockDim.x + 1; ++l) { 15 | size_t x = l*blockDim.x + threadIdx.x; 16 | if (x < innerStride) { 17 | outPtr[k*innerStride + x] = *(ins[j] + 18 | iOuter*concatDims[j]*innerStride + k*innerStride + x); 19 | } 20 | } 21 | } 22 | outPtr += concatDims[j]*innerStride; 23 | } 24 | } 25 | 26 | void launchCudaConcatKernel(size_t numIns, 27 | size_t innerStride, size_t outerStride, size_t* concatDims, 28 | const float **ins, float *out) { 29 | size_t gridsize = outerStride; 30 | size_t blocksize = 256; 31 | switch ((innerStride + 63)/64) { 32 | case 1: blocksize = 64; break; 33 | case 2: blocksize = 128; break; 34 | case 3: 
blocksize = 192; break; 35 | default: blocksize = 256; break; 36 | } 37 | cudaConcatKernel<<>>(numIns, 38 | innerStride, outerStride, concatDims, ins, out); 39 | KernelErrChk(); 40 | } 41 | 42 | size_t* concatDims = new size_t[4]; 43 | float** ins = new float*[4]; 44 | void concat(int N, int H, int W, int C1, int C2, int C3, int C4, 45 | float *input1, float *input2, float *input3, float *input4, 46 | float *buf, float *output) { 47 | concatDims[0] = static_cast(C1); 48 | concatDims[1] = static_cast(C2); 49 | concatDims[2] = static_cast(C3); 50 | concatDims[3] = static_cast(C4); 51 | ins[0] = input1; 52 | ins[1] = input2; 53 | ins[2] = input3; 54 | ins[3] = input4; 55 | size_t *devConcatDims = (size_t*)buf; 56 | const float **devIns = (const float **)(buf + 128);//bigger step 57 | ErrChk(cudaMemcpy(devIns, ins, 4*sizeof(float*), cudaMemcpyHostToDevice)); 58 | ErrChk(cudaMemcpy(devConcatDims, concatDims, 4*sizeof(size_t), cudaMemcpyHostToDevice)); 59 | 60 | launchCudaConcatKernel((size_t)4, size_t(H * W), size_t(N), devConcatDims, (const float **)devIns, output); 61 | } 62 | -------------------------------------------------------------------------------- /google-net_cudnn/concat.h: -------------------------------------------------------------------------------- 1 | #ifndef __CONCAT_H__ 2 | #define __CONCAT_H__ 3 | void concat(int N, int H, int W, int C1, int C2, int C3, int C4, 4 | float *input1, float *input2, float *input3, float *input4, 5 | float *buf, float *output); 6 | #endif 7 | -------------------------------------------------------------------------------- /google-net_cudnn/conv.cpp: -------------------------------------------------------------------------------- 1 | #include "cudnn.h" 2 | #include "util.h" 3 | #include 4 | 5 | void conv(cudnnHandle_t handle, int N, int C, int H, int W, int K, int R, int S, 6 | int U, int V, int pad_h, int pad_w, int P, int Q, 7 | float *input, float *filter, 8 | float *buf, float *output, 9 | int algo, 10 | cudaStream_t 
s){ 11 | 12 | float one = 1.0, zero = 0.0; 13 | size_t size; 14 | 15 | ErrChk(cudnnSetStream(handle, s)); 16 | 17 | cudnnTensorDescriptor_t xDesc; 18 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 19 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 20 | 21 | cudnnTensorDescriptor_t yDesc; 22 | ErrChk(cudnnCreateTensorDescriptor(&yDesc)); 23 | ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, K, P, Q)); 24 | 25 | cudnnFilterDescriptor_t filterDesc; 26 | ErrChk(cudnnCreateFilterDescriptor(&filterDesc)); 27 | ErrChk(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S)); 28 | 29 | cudnnConvolutionDescriptor_t convDesc; 30 | ErrChk(cudnnCreateConvolutionDescriptor(&convDesc)); 31 | ErrChk(cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, U, V, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); 32 | 33 | ErrChk(cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, filterDesc, convDesc, yDesc, (cudnnConvolutionFwdAlgo_t)algo, (size_t *)&(size))); 34 | 35 | ErrChk(cudnnConvolutionForward(handle, &one, xDesc, input, filterDesc, filter, convDesc, (cudnnConvolutionFwdAlgo_t)algo, buf, size, &zero, yDesc, output)); 36 | 37 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 38 | ErrChk(cudnnDestroyTensorDescriptor(yDesc)); 39 | ErrChk(cudnnDestroyFilterDescriptor(filterDesc)); 40 | ErrChk(cudnnDestroyConvolutionDescriptor(convDesc)); 41 | } 42 | -------------------------------------------------------------------------------- /google-net_cudnn/conv.h: -------------------------------------------------------------------------------- 1 | #ifndef __CONV_H__ 2 | #define __CONV_H__ 3 | void conv(cudnnHandle_t handle, int N, int C, int H, int W, int K, int R, int S, 4 | int U, int V, int pad_h, int pad_w, int P, int Q, 5 | float *input, float *filter, 6 | float *buf, float *output, 7 | int algo, 8 | cudaStream_t s=0); 9 | #endif 10 | 
-------------------------------------------------------------------------------- /google-net_cudnn/dropout.cpp: -------------------------------------------------------------------------------- 1 | #include "cudnn.h" 2 | #include "util.h" 3 | #include 4 | 5 | /* Runs a cuDNN dropout forward pass from `input` to `output`. Both tensors are described as N x C x H x W, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW. `dropout` is the dropout value handed to cudnnSetDropoutDescriptor. `buf` is scratch storage used twice: its start is given to cudnnSetDropoutDescriptor as the state buffer (stateSize, as reported by cudnnDropoutGetStatesSize), and `buf + stateSize` is given to cudnnDropoutForward as the reserve space (reserveSize, as reported by cudnnDropoutGetReserveSpaceSize). The seed is the fixed constant 462565. The dropout and tensor descriptors are created and destroyed on every call; every cuDNN call is wrapped in ErrChk. Presumably input, output and buf are device pointers - confirm against the caller. */ void dropout(cudnnHandle_t handle, float dropout, int N, int C, int H, int W, 6 | float *input, float *buf, float *output) { 7 | cudnnDropoutDescriptor_t dropoutDesc; 8 | ErrChk(cudnnCreateDropoutDescriptor(&dropoutDesc)); 9 | size_t stateSize; 10 | ErrChk(cudnnDropoutGetStatesSize(handle, &stateSize)); 11 | ErrChk(cudnnSetDropoutDescriptor(dropoutDesc, handle, dropout, buf, stateSize, 462565)); 12 | 13 | cudnnTensorDescriptor_t xDesc; 14 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 15 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 16 | 17 | cudnnTensorDescriptor_t yDesc; 18 | ErrChk(cudnnCreateTensorDescriptor(&yDesc)); 19 | ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 20 | 21 | size_t reserveSize; 22 | ErrChk(cudnnDropoutGetReserveSpaceSize(xDesc, &reserveSize)); 23 | /* NOTE(review): buf is a float*, so `buf + stateSize` below advances by stateSize float elements, not stateSize bytes. stateSize is presumably a byte count (confirm in the cuDNN docs); if so the reserve space starts 4*stateSize bytes into buf, and buf must be large enough for that offset plus reserveSize - verify the allocation at the call site. */ 24 | ErrChk(cudnnDropoutForward(handle, dropoutDesc, xDesc, input, yDesc, output, buf + stateSize, reserveSize)); 25 | 26 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 27 | ErrChk(cudnnDestroyTensorDescriptor(yDesc)); 28 | ErrChk(cudnnDestroyDropoutDescriptor(dropoutDesc)); 29 | } 30 | -------------------------------------------------------------------------------- /google-net_cudnn/dropout.h: -------------------------------------------------------------------------------- 1 | #ifndef __DROPOUT_H__ 2 | #define __DROPOUT_H__ 3 | void dropout(cudnnHandle_t handle, float dropout, int N, int C, int H, int W, 4 | float *input, float *buf, float *output); 5 | #endif 6 | -------------------------------------------------------------------------------- /google-net_cudnn/gemm_kernel.h: 
-------------------------------------------------------------------------------- 1 | /* 2 | * gemm_kernel.h 3 | * 4 | * Created on: Nov 5, 2018 5 | * Author: cambricon 6 | */ 7 | 8 | #ifndef GEMM_KERNEL_H_ 9 | #define GEMM_KERNEL_H_ 10 | 11 | 12 | //(N*P*Q)%16==0 && (P*Q)%4==0 13 | __device__ void gemm_64_16x16_1(int M, int N, int K, int P, int Q, float *A, float *B, float *C, float *sh){ 14 | 15 | float* sh_A = sh; 16 | float* sh_B = sh + 2*16*8; 17 | 18 | float4 reg_C; 19 | reg_C.x =0.f; 20 | reg_C.y =0.f; 21 | reg_C.z =0.f; 22 | reg_C.w =0.f; 23 | 24 | float reg_A[8]; 25 | float reg_B[2]; 26 | 27 | // Compute block's starting coordinate 28 | int block_base_x = blockIdx.y*16; 29 | int block_base_y = blockIdx.x*16; 30 | 31 | //load A from global memory to shared memory 32 | float2 *A_start = (float2*) (A + block_base_y + (threadIdx.x%8)*2 + (threadIdx.x/8)*M); 33 | *((float2*) (sh_A + 2*threadIdx.x)) = *(A_start); 34 | 35 | //load A from global memory to shared memory 36 | float2 *B_start = (float2*) (B + K*block_base_x + (threadIdx.x/16)*2 + (threadIdx.x%16)*K); 37 | *((float2*) (sh_B + 2*threadIdx.x)) = *(B_start); 38 | 39 | int double_buffer = 0; 40 | #pragma unroll 41 | for(int k=0; k 12 | __global__ void im2col_1101(float *out, float *in, int N, int C, int H, int W){ 13 | //C*N blocks, and each block is responsible for a H*W data block of transformed matrix 14 | int n = blockIdx.x/C; 15 | int c = blockIdx.x%C; 16 | 17 | float *in_start = in + n*C*H*W + c*H*W; 18 | float *out_start = out + c*N*H*W + n*H*W; 19 | 20 | for (int i=0; i 33 | __global__ void im2col_3311_version1(float *out, float *in, int N, int C, int H, int W, int R, int S, int P, int Q){ 34 | //C*N*(Q+S-1) blocks, and each block is assigned for a series of P*R data blocks along the diagonal 35 | int c = blockIdx.z; 36 | int n = blockIdx.y; 37 | 38 | int q = (blockIdx.x>=Q)? (Q-1):blockIdx.x; 39 | int s = (blockIdx.x>=Q)? (blockIdx.x-Q+1):0; 40 | 41 | int task = (q>1 && s==0)? 
3:2; 42 | 43 | extern __shared__ float line_buffer[]; 44 | 45 | float *result = out + c*N*Q*S*P*R + n*P*Q + s*(N*P*Q*R) + q*P; 46 | if ( ((q==0) && (s==0)) || ( (q==(Q-1)) && (s==(S-1)) ) ) { 47 | 48 | for(int j=0; j<(P*R)/BLOCK_SIZE; ++j){ 49 | 50 | int y = (j*BLOCK_SIZE+threadIdx.x)/P; 51 | int x = (j*BLOCK_SIZE+threadIdx.x)%P; 52 | 53 | int ind = y*P*Q*N + x; 54 | 55 | result[ind] = 0.f; 56 | } 57 | 58 | if (((P*R)%BLOCK_SIZE)!=0 && threadIdx.x<((P*R)%BLOCK_SIZE)){ 59 | int y = (((P*R)/BLOCK_SIZE)*BLOCK_SIZE + threadIdx.x)/P; 60 | int x = (((P*R)/BLOCK_SIZE)*BLOCK_SIZE + threadIdx.x)%P; 61 | 62 | int ind = y*P*Q*N + x; 63 | 64 | result[ind] = 0.f; 65 | } 66 | } 67 | else { 68 | float *data = in + n*C*H*W + c*H*W + (q+s-1)*W; 69 | line_buffer[0] = 0.f; 70 | 71 | for(int j=0; j<<>>(output, input, N, C, H, W); 116 | KernelErrChk(); 117 | } 118 | else if (R==3 && U==1 && pad_h==1){ 119 | dim3 grid; 120 | grid.x = Q+S-1; 121 | grid.y = N; 122 | grid.z = C; 123 | im2col_3311_version1<32><<>>(output, input, N, C, H, W, R, S, P, Q); 124 | KernelErrChk(); 125 | } 126 | else{ 127 | cudnnTensorDescriptor_t xDesc; 128 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 129 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 130 | 131 | cudnnFilterDescriptor_t filterDesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW 132 | ErrChk(cudnnCreateFilterDescriptor(&filterDesc)); 133 | ErrChk(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S)); 134 | 135 | cudnnConvolutionDescriptor_t convDesc; 136 | ErrChk(cudnnCreateConvolutionDescriptor(&convDesc)); 137 | ErrChk(cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, U, V, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); 138 | ErrChk(cudnnIm2Col(handle, xDesc, input, filterDesc, convDesc, output)); 139 | 140 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 141 | ErrChk(cudnnDestroyFilterDescriptor(filterDesc)); 142 | 
ErrChk(cudnnDestroyConvolutionDescriptor(convDesc)); 143 | } 144 | } 145 | 146 | 147 | #endif /* IM2COL_H_ */ 148 | -------------------------------------------------------------------------------- /google-net_cudnn/inception.cpp: -------------------------------------------------------------------------------- 1 | #include "cudnn.h" 2 | #include "util.h" 3 | #include 4 | #include "conv.h" 5 | #include "pooling.h" 6 | #include "activation.h" 7 | #include "dropout.h" 8 | #include "lrn.h" 9 | #include "concat.h" 10 | 11 | /* 12 | * Do Inception 13 | * 14 | * This func will consume 6 filters and 4 feature(x). 15 | * Use x[xIdx] as input, which should be set before this func. 16 | * Use x[xIdx + 4] as output. 17 | * 18 | */ 19 | void cudnnGoogleNetInception(cudnnHandle_t handle, const int N, const int C, 20 | const int H, const int W, const int xIdx, const int filterIdx, 21 | const int K1, const int K2, const int K3, const int K4, const int K5, 22 | const int K6, int *reC, float **x, float** filter, float* buf, 23 | cudaStream_t *s, const int *algo_best) { 24 | /* 25 | * Use x[xIdx + 4] as output. 26 | * We can concat the result directly when N == 1. 
27 | */ 28 | float *output = x[xIdx + 4]; 29 | float *output1 = output; 30 | float *output2 = output1 + K1 * H * W; 31 | float *output3 = output2 + K3 * H * W; 32 | float *output4 = output3 + K5 * H * W; 33 | 34 | //1*1 conv 35 | int algo = algo_best[filterIdx*7]; 36 | conv(handle, N, C, H, W, K1, 1, 1, 1, 1, 0, 0, H, W, 37 | x[xIdx], filter[filterIdx], buf, 38 | output1, algo, s[0]); 39 | 40 | //relu 1*1 41 | activation(handle, N, K1, H, W, output1, output1, s[0]); 42 | 43 | //3*3 reduce 44 | algo = algo_best[(filterIdx+1)*7]; 45 | conv(handle, N, C, H, W, K2, 1, 1, 1, 1, 0, 0, H, W, 46 | x[xIdx], filter[filterIdx+1], buf, 47 | x[xIdx + 1], algo, s[1]); 48 | 49 | //relu 3*3 reduce 50 | activation(handle, N, K2, H, W, x[xIdx + 1], 51 | x[xIdx + 1], s[1]); 52 | 53 | //3*3 54 | algo = algo_best[(filterIdx+2)*7]; 55 | conv(handle, N, C, H, W, K3, 3, 3, 1, 1, 1, 1, H, W, 56 | x[xIdx + 1], filter[filterIdx+2], buf, 57 | output2, algo, s[1]); 58 | 59 | //relu 3*3 60 | activation(handle, N, K3, H, W, output2, output2, s[1]); 61 | 62 | //5*5 reduce 63 | algo = algo_best[(filterIdx+3)*7]; 64 | conv(handle, N, C, H, W, K4, 1, 1, 1, 1, 0, 0, H, W, 65 | x[xIdx], filter[filterIdx+2], buf, 66 | x[xIdx + 2], algo, s[2]); 67 | 68 | //relu 5*5 reduce 69 | activation(handle, N, K4, H, W, x[xIdx + 2], 70 | x[xIdx + 2], s[2]); 71 | 72 | //5*5 73 | algo = algo_best[(filterIdx+4)*7]; 74 | conv(handle, N, C, H, W, K5, 5, 5, 1, 1, 2, 2, H, W, 75 | x[xIdx + 2], filter[filterIdx+4], buf, 76 | output3, algo, s[2]); 77 | 78 | //relu 5*5 79 | activation(handle, N, K5, H, W, output3, output3, s[2]); 80 | 81 | //pool 82 | pooling(handle, N, C, H, W, 3, 3, 1, 1, 1, 1, H, W, 83 | x[xIdx], x[xIdx + 3], s[3]); 84 | 85 | //pool proj 86 | algo = algo_best[(filterIdx+5)*7]; 87 | conv(handle, N, C, H, W, K6, 1, 1, 1, 1, 0, 0, H, W, 88 | x[xIdx + 3], filter[filterIdx+5], buf, output4, algo, s[3]); 89 | 90 | //relu pool proj 91 | activation(handle, N, K6, H, W, output4, output4, s[3]); 92 | 93 | 94 | 
ErrChk(cudaDeviceSynchronize()); 95 | 96 | *reC = K1 + K3 + K5 + K6; 97 | } 98 | -------------------------------------------------------------------------------- /google-net_cudnn/inception.h: -------------------------------------------------------------------------------- 1 | /* 2 | * inception.h 3 | * 4 | * Created on: Nov 5, 2018 5 | * Author: cambricon 6 | */ 7 | 8 | #ifndef INCEPTION_H_ 9 | #define INCEPTION_H_ 10 | 11 | /* 12 | * Do Inception 13 | * 14 | * This func will consume 6 filters and 8 features. 15 | * Use feature[featureIndex] as input, which should be set before this func. 16 | * Use feature[featureIndex + 8] as output. 17 | * 18 | */ 19 | void cudnnGoogleNetInception(cudnnHandle_t handle, const int N, const int C, 20 | const int H, const int W, const int xIdx, const int filterIdx, 21 | const int K1, const int K2, const int K3, const int K4, const int K5, 22 | const int K6, int *reC, float **x, float** filter, float* buf, 23 | cudaStream_t *s, const int *algo_best); 24 | 25 | #endif /* INCEPTION_H_ */ 26 | -------------------------------------------------------------------------------- /google-net_cudnn/loss.cpp: -------------------------------------------------------------------------------- 1 | #include "cudnn.h" 2 | #include "util.h" 3 | #include 4 | 5 | void loss(cublasHandle_t cublas_handle, int N, int C, int K, 6 | float *input, float *filter, float *output) { 7 | float alpha = 1.f, beta = 0.f; 8 | 9 | ErrChk(cublasGemmEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, N, K, C, 10 | (void*) &alpha, (void*) input, CUDA_R_32F, C, 11 | (void*) filter, CUDA_R_32F, C, 12 | (void*) &beta, (void*) output, CUDA_R_32F, N, CUDA_R_32F, 13 | CUBLAS_GEMM_DEFAULT)); 14 | } 15 | -------------------------------------------------------------------------------- /google-net_cudnn/loss.h: -------------------------------------------------------------------------------- 1 | /* 2 | * loss.h 3 | * 4 | * Created on: Nov 5, 2018 5 | * Author: cambricon 6 | */ 7 | 8 | #ifndef 
LOSS_H_ 9 | #define LOSS_H_ 10 | 11 | 12 | void loss(cublasHandle_t cublas_handle, int N, int C, int K, float *input, 13 | float *filter, float *output); 14 | 15 | 16 | #endif /* LOSS_H_ */ 17 | -------------------------------------------------------------------------------- /google-net_cudnn/lrn.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudnn.h" 3 | #include "util.h" 4 | 5 | void lrn(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, float lrnAlpha, float lrnBeta, float lrnK, float *input, float *output){ 6 | 7 | cudnnTensorDescriptor_t xDesc; 8 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 9 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 10 | 11 | cudnnTensorDescriptor_t yDesc; 12 | ErrChk(cudnnCreateTensorDescriptor(&yDesc)); 13 | ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 14 | 15 | cudnnLRNDescriptor_t normDesc; 16 | ErrChk(cudnnCreateLRNDescriptor(&normDesc)); 17 | ErrChk(cudnnSetLRNDescriptor(normDesc, R, lrnAlpha, lrnBeta, lrnK)); 18 | 19 | 20 | float one = 1.f, zero = 0.f; 21 | ErrChk(cudnnLRNCrossChannelForward(handle, normDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &one, xDesc, input, &zero, yDesc, output)); 22 | 23 | ErrChk(cudnnDestroyLRNDescriptor(normDesc)); 24 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 25 | ErrChk(cudnnDestroyTensorDescriptor(yDesc)); 26 | } 27 | -------------------------------------------------------------------------------- /google-net_cudnn/lrn.h: -------------------------------------------------------------------------------- 1 | #ifndef __LRN_H__ 2 | #define __LRN_H__ 3 | void lrn(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, float lrnAlpha, float lrnBeta, float lrnK, float *input, float *output); 4 | #endif 5 | -------------------------------------------------------------------------------- /google-net_cudnn/main.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "cudnn.h" 5 | #include "util.h" 6 | #include "conv.h" 7 | #include "activation.h" 8 | #include "pooling.h" 9 | #include "concat.h" 10 | #include "dropout.h" 11 | #include "lrn.h" 12 | #include "loss.h" 13 | #include "softmax.h" 14 | #include "inception.h" 15 | #include "batch-inception.h" 16 | 17 | 18 | void batchGoogleNetForward(cudnnHandle_t handle, cublasHandle_t cublas_handle, 19 | int N, float **x, float **filter, float* buf, const int *algo_best) { 20 | int C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q; 21 | 22 | // conv1/7x7_s2 23 | C = 3; 24 | H = W = 227; 25 | K = 64; 26 | R = S = 7; 27 | U = V = 2; 28 | pad_h = pad_w = 3; 29 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 30 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 31 | 32 | int algo = algo_best[0]; 33 | conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q, 34 | x[0], filter[0], buf, x[1], algo); 35 | 36 | // conv1/relu_7x7 37 | C = 64; 38 | H = W = 114; 39 | activation(handle, N, C, H, W, x[1], x[1]); 40 | 41 | // pool1/3x3_s2 42 | R = 3; 43 | S = 3; 44 | U = 2; 45 | V = 2; 46 | pad_h = 1; 47 | pad_w = 1; 48 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 49 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 50 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[1], x[2]); 51 | 52 | H = P; 53 | W = Q; 54 | 55 | // pool1/norm1 56 | R = 5; 57 | S = 5; 58 | float lrnAlpha = 0.0001f; 59 | float lrnBeta = 0.75f; 60 | float lrnK = 2.f; 61 | 62 | lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[2], x[3]); 63 | 64 | // conv2/3x3_reduce 65 | K = 64; 66 | R = 1; 67 | S = 1; 68 | U = 1; 69 | V = 1; 70 | pad_h = 0; 71 | pad_w = 0; 72 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 73 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 74 | 75 | algo = algo_best[7]; 76 | conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q, 77 | 
x[3], filter[1], buf, x[4], algo); 78 | C = K; 79 | H = P; 80 | W = Q; 81 | 82 | // conv2/relu_3x3_reduce 83 | activation(handle, N, C, H, W, x[4], x[4]); 84 | 85 | // conv2/3x3 86 | K = 192; 87 | R = 3; 88 | S = 3; 89 | U = 1; 90 | V = 1; 91 | pad_h = 1; 92 | pad_w = 1; 93 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 94 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 95 | 96 | algo = algo_best[14]; 97 | conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q, 98 | x[4], filter[2], buf, x[5], algo); 99 | C = K; 100 | H = P; 101 | W = Q; 102 | 103 | 104 | // conv2/relu_3x3 105 | activation(handle, N, C, H, W, x[5], x[5]); 106 | 107 | // conv2/norm2 108 | R = 5; 109 | S = 5; 110 | lrnAlpha = 0.0001f; 111 | lrnBeta = 0.75f; 112 | lrnK = 2.f; 113 | 114 | lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[5], x[6]); 115 | 116 | // pool2/3x3_s2 117 | R = 3; 118 | S = 3; 119 | U = 2; 120 | V = 2; 121 | pad_h = 0; 122 | pad_w = 0; 123 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 124 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 125 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[6], x[7]); 126 | 127 | // inception3a 128 | H = P; 129 | W = Q; 130 | batchGoogleNetInception(handle, N, C, H, W, 7, 3, 131 | 64, 96, 128, 16, 32, 32, // K1, K2, K3, K4, K5, K6 132 | &C, x, filter, buf, algo_best); 133 | 134 | // inception3b 135 | batchGoogleNetInception(handle, N, C, H, W, 11, 9, 136 | 128, 128, 192, 32, 96, 64, // K1, K2, K3, K4, K5, K6 137 | &C, x, filter, buf, algo_best); 138 | 139 | // pool3/3x3_s2 140 | R = 3; 141 | S = 3; 142 | U = 2; 143 | V = 2; 144 | pad_h = 1; 145 | pad_w = 1; 146 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 147 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 148 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[15], x[16]); 149 | 150 | // inception4a 151 | H = P; 152 | W = Q; 153 | batchGoogleNetInception(handle, N, C, H, W, 16, 15, 154 | 192, 96, 208, 16, 48, 64, // K1, K2, 
K3, K4, K5, K6 155 | &C, x, filter, buf, algo_best); 156 | 157 | // inception4b 158 | batchGoogleNetInception(handle, N, C, H, W, 20, 21, 159 | 160, 112, 224, 24, 64, 64, // K1, K2, K3, K4, K5, K6 160 | &C, x, filter, buf, algo_best); 161 | 162 | // inception4c 163 | batchGoogleNetInception(handle, N, C, H, W, 24, 27, 164 | 128, 128, 256, 24, 64, 64, // K1, K2, K3, K4, K5, K6 165 | &C, x, filter, buf, algo_best); 166 | 167 | // inception4d 168 | batchGoogleNetInception(handle, N, C, H, W, 28, 33, 169 | 112, 144, 288, 32, 64, 64, // K1, K2, K3, K4, K5, K6 170 | &C, x, filter, buf, algo_best); 171 | 172 | // inception4e 173 | batchGoogleNetInception(handle, N, C, H, W, 32, 39, 174 | 256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6 175 | &C, x, filter, buf, algo_best); 176 | 177 | // pool4/3x3_s2 178 | R = 3; 179 | S = 3; 180 | U = 2; 181 | V = 2; 182 | pad_h = 1; 183 | pad_w = 1; 184 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 185 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 186 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[36], x[37]); 187 | 188 | // inception5a 189 | H = P; 190 | W = Q; 191 | batchGoogleNetInception(handle, N, C, H, W, 37, 45, 192 | 256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6 193 | &C, x, filter, buf, algo_best); 194 | 195 | // inception5b 196 | batchGoogleNetInception(handle, N, C, H, W, 41, 51, 197 | 384, 192, 384, 48, 128, 128, // K1, K2, K3, K4, K5, K6 198 | &C, x, filter, buf, algo_best); 199 | 200 | // pool5/3x3_s2 201 | R = 7; 202 | S = 7; 203 | U = 1; 204 | V = 1; 205 | pad_h = 0; 206 | pad_w = 0; 207 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 208 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 209 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[45], x[46]); 210 | 211 | // loss3 212 | K = 1000; 213 | loss(cublas_handle, N, C, K, x[46], filter[57], x[47]); 214 | 215 | // softmax 216 | softmax(handle, N, C, x[47], x[48]); 217 | } 218 | 219 | 220 | void 
cudnnGoogleNetForward(cudnnHandle_t handle, cublasHandle_t cublas_handle, 221 | int N, float **x, float** filter, float* buf, const int *algo_best) { 222 | int C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q; 223 | 224 | // conv1/7x7_s2 225 | C = 3; 226 | H = W = 227; 227 | K = 64; 228 | R = S = 7; 229 | U = V = 2; 230 | pad_h = pad_w = 3; 231 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 232 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 233 | 234 | int algo = algo_best[0]; 235 | conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q, 236 | x[0], filter[0], buf, x[1], algo); 237 | 238 | // conv1/relu_7x7 239 | C = 64; 240 | H = W = 114; 241 | activation(handle, N, C, H, W, x[1], x[1]); 242 | 243 | // pool1/3x3_s2 244 | R = 3; 245 | S = 3; 246 | U = 2; 247 | V = 2; 248 | pad_h = 1; 249 | pad_w = 1; 250 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 251 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 252 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[1], x[2]); 253 | 254 | H = P; 255 | W = Q; 256 | 257 | // pool1/norm1 258 | R = 5; 259 | S = 5; 260 | float lrnAlpha = 0.0001f; 261 | float lrnBeta = 0.75f; 262 | float lrnK = 2.f; 263 | 264 | lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[2], x[3]); 265 | 266 | // conv2/3x3_reduce 267 | K = 64; 268 | R = 1; 269 | S = 1; 270 | U = 1; 271 | V = 1; 272 | pad_h = 0; 273 | pad_w = 0; 274 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 275 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 276 | 277 | algo = algo_best[7]; 278 | conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q, 279 | x[3], filter[1], buf, x[4], algo); 280 | C = K; 281 | H = P; 282 | W = Q; 283 | 284 | // conv2/relu_3x3_reduce 285 | activation(handle, N, C, H, W, x[4], x[4]); 286 | 287 | // conv2/3x3 288 | K = 192; 289 | R = 3; 290 | S = 3; 291 | U = 1; 292 | V = 1; 293 | pad_h = 1; 294 | pad_w = 1; 295 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 296 | Q = ceil((float)(W - S + 1 + 2 * 
pad_w)/(float)V); 297 | 298 | algo = algo_best[14]; 299 | conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q, 300 | x[4], filter[2], buf, x[5], algo); 301 | C = K; 302 | H = P; 303 | W = Q; 304 | 305 | 306 | // conv2/relu_3x3 307 | activation(handle, N, C, H, W, x[5], x[5]); 308 | 309 | // conv2/norm2 310 | R = 5; 311 | S = 5; 312 | lrnAlpha = 0.0001f; 313 | lrnBeta = 0.75f; 314 | lrnK = 2.f; 315 | 316 | lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[5], x[6]); 317 | 318 | // pool2/3x3_s2 319 | R = 3; 320 | S = 3; 321 | U = 2; 322 | V = 2; 323 | pad_h = 0; 324 | pad_w = 0; 325 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 326 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 327 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[6], x[7]); 328 | 329 | #ifdef USE_MULTI_STREAM 330 | cudaStream_t s[4]; 331 | ErrChk(cudaStreamCreate(&s[0])); 332 | ErrChk(cudaStreamCreate(&s[1])); 333 | ErrChk(cudaStreamCreate(&s[2])); 334 | ErrChk(cudaStreamCreate(&s[3])); 335 | #else 336 | cudaStream_t s[4] = {0, 0, 0, 0}; 337 | #endif 338 | 339 | // inception3a 340 | H = P; 341 | W = Q; 342 | cudnnGoogleNetInception(handle, N, C, H, W, 7, 3, 343 | 64, 96, 128, 16, 32, 32, // K1, K2, K3, K4, K5, K6 344 | &C, x, filter, buf, s, algo_best); 345 | 346 | // inception3b 347 | cudnnGoogleNetInception(handle, N, C, H, W, 11, 9, 348 | 128, 128, 192, 32, 96, 64, // K1, K2, K3, K4, K5, K6 349 | &C, x, filter, buf, s, algo_best); 350 | 351 | // pool3/3x3_s2 352 | R = 3; 353 | S = 3; 354 | U = 2; 355 | V = 2; 356 | pad_h = 1; 357 | pad_w = 1; 358 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 359 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 360 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[15], x[16]); 361 | 362 | // inception4a 363 | H = P; 364 | W = Q; 365 | cudnnGoogleNetInception(handle, N, C, H, W, 16, 15, 366 | 192, 96, 208, 16, 48, 64, // K1, K2, K3, K4, K5, K6 367 | &C, x, filter, buf, s, algo_best); 368 | 369 | // 
inception4b 370 | cudnnGoogleNetInception(handle, N, C, H, W, 20, 21, 371 | 160, 112, 224, 24, 64, 64, // K1, K2, K3, K4, K5, K6 372 | &C, x, filter, buf, s, algo_best); 373 | 374 | // inception4c 375 | cudnnGoogleNetInception(handle, N, C, H, W, 24, 27, 376 | 128, 128, 256, 24, 64, 64, // K1, K2, K3, K4, K5, K6 377 | &C, x, filter, buf, s, algo_best); 378 | 379 | // inception4d 380 | cudnnGoogleNetInception(handle, N, C, H, W, 28, 33, 381 | 112, 144, 288, 32, 64, 64, // K1, K2, K3, K4, K5, K6 382 | &C, x, filter, buf, s, algo_best); 383 | 384 | // inception4e 385 | cudnnGoogleNetInception(handle, N, C, H, W, 32, 39, 386 | 256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6 387 | &C, x, filter, buf, s, algo_best); 388 | 389 | // pool4/3x3_s2 390 | R = 3; 391 | S = 3; 392 | U = 2; 393 | V = 2; 394 | pad_h = 1; 395 | pad_w = 1; 396 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 397 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 398 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[36], x[37]); 399 | 400 | // inception5a 401 | H = P; 402 | W = Q; 403 | cudnnGoogleNetInception(handle, N, C, H, W, 37, 45, 404 | 256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6 405 | &C, x, filter, buf, s, algo_best); 406 | 407 | // inception5b 408 | cudnnGoogleNetInception(handle, N, C, H, W, 41, 51, 409 | 384, 192, 384, 48, 128, 128, // K1, K2, K3, K4, K5, K6 410 | &C, x, filter, buf, s, algo_best); 411 | 412 | // pool5/3x3_s2 413 | R = 7; 414 | S = 7; 415 | U = 1; 416 | V = 1; 417 | pad_h = 0; 418 | pad_w = 0; 419 | P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U); 420 | Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V); 421 | pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[45], x[46]); 422 | 423 | // loss3 424 | K = 1000; 425 | loss(cublas_handle, N, C, K, x[46], filter[57], x[47]); 426 | 427 | // softmax 428 | softmax(handle, N, C, x[47], x[48]); 429 | 430 | #ifdef USE_MULTI_STREAM 431 | ErrChk(cudaStreamDestroy(s[0])); 432 | 
ErrChk(cudaStreamDestroy(s[1])); 433 | ErrChk(cudaStreamDestroy(s[2])); 434 | ErrChk(cudaStreamDestroy(s[3])); 435 | #endif 436 | } 437 | 438 | const int algo_best[7*57] = { 439 | 0, 0, 0, 1, 1, 1, 1, 440 | 0, 0, 0, 0, 0, 1, 1, 441 | 6, 6, 6, 6, 7, 7, 5, 442 | 0, 0, 0, 0, 0, 1, 1, 443 | 0, 0, 1, 0, 0, 1, 1, 444 | 6, 6, 6, 6, 7, 7, 7, 445 | 0, 0, 0, 1, 0, 0, 1, 446 | 0, 0, 0, 0, 0, 5, 5, 447 | 0, 0, 0, 1, 0, 0, 1, 448 | 0, 0, 0, 0, 1, 1, 1, 449 | 0, 0, 0, 0, 1, 1, 1, 450 | 6, 6, 6, 6, 7, 7, 7, 451 | 0, 0, 0, 0, 0, 0, 1, 452 | 0, 0, 0, 5, 5, 5, 5, 453 | 0, 0, 0, 0, 0, 1, 1, 454 | 0, 0, 0, 1, 1, 1, 1, 455 | 0, 1, 1, 0, 0, 0, 1, 456 | 6, 6, 6, 6, 7, 7, 7, 457 | 0, 0, 0, 0, 0, 0, 0, 458 | 0, 0, 0, 7, 7, 7, 7, 459 | 1, 0, 1, 0, 0, 0, 0, 460 | 0, 0, 0, 0, 1, 1, 1, 461 | 0, 1, 1, 1, 0, 0, 1, 462 | 6, 6, 6, 6, 7, 7, 7, 463 | 0, 0, 0, 0, 0, 0, 0, 464 | 0, 7, 0, 7, 7, 7, 7, 465 | 1, 0, 1, 0, 0, 0, 0, 466 | 0, 0, 1, 1, 0, 0, 1, 467 | 0, 0, 1, 1, 0, 0, 1, 468 | 6, 6, 6, 6, 7, 7, 7, 469 | 0, 0, 0, 0, 0, 0, 0, 470 | 0, 0, 0, 7, 7, 7, 7, 471 | 1, 0, 1, 0, 0, 0, 1, 472 | 0, 1, 1, 1, 0, 0, 1, 473 | 0, 1, 1, 1, 1, 1, 1, 474 | 6, 6, 6, 6, 7, 7, 7, 475 | 0, 0, 0, 0, 0, 0, 0, 476 | 7, 7, 7, 7, 7, 7, 7, 477 | 1, 0, 1, 0, 0, 0, 1, 478 | 0, 0, 0, 0, 1, 0, 1, 479 | 0, 0, 0, 0, 1, 1, 1, 480 | 6, 6, 6, 6, 7, 7, 7, 481 | 0, 0, 0, 0, 0, 0, 0, 482 | 7, 7, 7, 7, 7, 7, 7, 483 | 0, 0, 1, 1, 0, 0, 0, 484 | 1, 1, 0, 1, 1, 0, 1, 485 | 0, 1, 0, 0, 0, 0, 1, 486 | 6, 6, 6, 6, 7, 7, 7, 487 | 0, 0, 0, 0, 0, 1, 0, 488 | 7, 7, 7, 7, 7, 7, 4, 489 | 0, 1, 0, 0, 1, 1, 0, 490 | 1, 1, 1, 1, 1, 0, 1, 491 | 1, 0, 1, 1, 1, 1, 0, 492 | 6, 6, 6, 7, 7, 7, 7, 493 | 1, 1, 1, 1, 1, 0, 1, 494 | 7, 7, 7, 7, 7, 7, 4, 495 | 0, 1, 0, 0, 1, 1, 0 496 | }; 497 | 498 | int main() { 499 | const int warmupIters = 2; 500 | const int TestIters = 10; 501 | 502 | int N = 1; // batch size 503 | const int filterNum = 58; 504 | const int xNum = 50; 505 | 506 | float **filter = new float*[filterNum]; // filter 507 | float **x = new 
float*[xNum]; // result 508 | 509 | const int MAX_TENSOR_SIZE=N * 200704 * 9; 510 | ErrChk(cudaMalloc(&x[0], (xNum + 10) *MAX_TENSOR_SIZE * sizeof(float))); 511 | for (int i = 1; i < xNum; ++i) { 512 | x[i] = x[i - 1] + MAX_TENSOR_SIZE; 513 | } 514 | float *buf = x[xNum - 1] + MAX_TENSOR_SIZE; 515 | 516 | const int MAX_FILTER_SIZE = 8000000; 517 | ErrChk(cudaMalloc(&filter[0], filterNum * MAX_FILTER_SIZE * sizeof(float))); 518 | for (int i = 1; i < filterNum; ++i) { 519 | filter[i] = filter[i - 1] + MAX_FILTER_SIZE; 520 | } 521 | 522 | const int RESULT_SIZE=1000; 523 | float *h_cudnn_result = (float*)malloc(2 * RESULT_SIZE * sizeof(float)); 524 | float *h_our_result = h_cudnn_result + RESULT_SIZE; 525 | 526 | // prepare data 527 | float *h_input = (float*) malloc(MAX_TENSOR_SIZE * sizeof(float)); 528 | for (int j = 0; j < MAX_TENSOR_SIZE; ++j) 529 | h_input[j] = j%10; 530 | float *h_filter = (float*) malloc( 531 | filterNum * MAX_FILTER_SIZE * sizeof(float)); 532 | for (int j = 0; j < filterNum*MAX_FILTER_SIZE; ++j) 533 | h_filter[j] = j%5; 534 | ErrChk(cudaMemcpy(x[0], h_input, MAX_TENSOR_SIZE * sizeof(float), 535 | cudaMemcpyHostToDevice)); 536 | ErrChk(cudaMemcpy(filter[0], h_filter, 537 | filterNum * MAX_FILTER_SIZE * sizeof(float), 538 | cudaMemcpyHostToDevice)); 539 | 540 | cudnnHandle_t handle; 541 | cublasHandle_t cublas_handle; 542 | ErrChk(cudnnCreate(&handle)); 543 | ErrChk(cublasCreate(&cublas_handle)); 544 | 545 | // warm up 546 | for (int i = 0; i < warmupIters; ++i) { 547 | cudnnGoogleNetForward(handle, cublas_handle, N, x, filter, buf, 548 | algo_best); 549 | } 550 | 551 | cudaEvent_t start, stop; 552 | float elapsedTime = 0; 553 | ErrChk(cudaEventCreate(&start)); 554 | ErrChk(cudaEventCreate(&stop)); 555 | ErrChk(cudaEventRecord(start,0)); 556 | 557 | for (int i = 0; i < TestIters; ++i) { 558 | cudnnGoogleNetForward(handle, cublas_handle, N, x, filter, buf, 559 | algo_best); 560 | } 561 | 562 | ErrChk(cudaEventRecord(stop, 0)); 563 | 
ErrChk(cudaEventSynchronize(stop)); 564 | ErrChk(cudaEventElapsedTime(&elapsedTime, start, stop)); 565 | 566 | printf("Time for cuDNN implementation is %0.6f\n", elapsedTime / TestIters); 567 | ErrChk(cudaMemcpy(h_cudnn_result, x[48], RESULT_SIZE * sizeof(float), 568 | cudaMemcpyDeviceToHost)); 569 | 570 | // warm up 571 | for (int i = 0; i < warmupIters; ++i) { 572 | batchGoogleNetForward(handle, cublas_handle, N, x, filter, buf, 573 | algo_best); 574 | } 575 | 576 | ErrChk(cudaEventRecord(start,0)); 577 | for (int i = 0; i < TestIters; ++i) { 578 | batchGoogleNetForward(handle, cublas_handle, N, x, filter, buf, 579 | algo_best); 580 | } 581 | 582 | ErrChk(cudaEventRecord(stop, 0)); 583 | ErrChk(cudaEventSynchronize(stop)); 584 | ErrChk(cudaEventElapsedTime(&elapsedTime, start, stop)); 585 | printf("Time for batched-Conv implementation is %0.6f\n", 586 | elapsedTime / TestIters); 587 | ErrChk(cudaMemcpy(h_our_result, x[48], RESULT_SIZE * sizeof(float), 588 | cudaMemcpyDeviceToHost)); 589 | 590 | // compare the result 591 | double ep = 0.0001; 592 | for (int i = 0; i < RESULT_SIZE; ++i) { 593 | if (std::abs(h_our_result[i] - (double)h_cudnn_result[i]) > ep) { 594 | printf("result error at %d: %f, %f\n", i, h_our_result[i], 595 | h_cudnn_result[i]); 596 | return -1; 597 | } 598 | } 599 | printf("result is correctly!\n"); 600 | 601 | ErrChk(cublasDestroy(cublas_handle)); 602 | ErrChk(cudnnDestroy(handle)); 603 | 604 | return 0; 605 | } 606 | -------------------------------------------------------------------------------- /google-net_cudnn/pooling.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "cudnn.h" 3 | #include "util.h" 4 | 5 | void pooling(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, int U, int V, int pad_h, int pad_w, int P, int Q, float *input, float *output, cudaStream_t s){ 6 | 7 | ErrChk(cudnnSetStream(handle, s)); 8 | 9 | cudnnPoolingDescriptor_t poolingDesc; 10 | 
ErrChk(cudnnCreatePoolingDescriptor(&poolingDesc)); 11 | ErrChk(cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, R, S, pad_h, pad_w, U, V)); 12 | 13 | cudnnTensorDescriptor_t xDesc; 14 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 15 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W)); 16 | 17 | cudnnTensorDescriptor_t yDesc; 18 | ErrChk(cudnnCreateTensorDescriptor(&yDesc)); 19 | ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, P, Q)); 20 | 21 | float one = 1.0, zero = 0.0; 22 | ErrChk(cudnnPoolingForward(handle, poolingDesc, &one, xDesc, input, &zero, yDesc, output)); 23 | 24 | ErrChk(cudnnDestroyPoolingDescriptor(poolingDesc)); 25 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 26 | ErrChk(cudnnDestroyTensorDescriptor(yDesc)); 27 | } 28 | -------------------------------------------------------------------------------- /google-net_cudnn/pooling.h: -------------------------------------------------------------------------------- 1 | #ifndef __POOLING_H__ 2 | #define __POOLING_H__ 3 | void pooling(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, int U, int V, int pad_h, int pad_w, int P, int Q, float *input, float *output, cudaStream_t s=0); 4 | #endif 5 | -------------------------------------------------------------------------------- /google-net_cudnn/softmax.cpp: -------------------------------------------------------------------------------- 1 | #include "cudnn.h" 2 | #include "util.h" 3 | #include 4 | 5 | void softmax(cudnnHandle_t handle, int N, int C, float *input, float *output){ 6 | 7 | float one = 1.0, zero = 0.0; 8 | size_t size; 9 | 10 | cudnnTensorDescriptor_t xDesc; 11 | ErrChk(cudnnCreateTensorDescriptor(&xDesc)); 12 | ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, 1, 1)); 13 | 14 | cudnnTensorDescriptor_t yDesc; 15 | ErrChk(cudnnCreateTensorDescriptor(&yDesc)); 16 | 
ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, 1, 1)); 17 | 18 | cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_FAST; 19 | cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE; 20 | 21 | ErrChk(cudnnSoftmaxForward(handle, algo, mode, &one, xDesc, input, &zero, yDesc, output)); 22 | 23 | ErrChk(cudnnDestroyTensorDescriptor(xDesc)); 24 | ErrChk(cudnnDestroyTensorDescriptor(yDesc)); 25 | } 26 | -------------------------------------------------------------------------------- /google-net_cudnn/softmax.h: -------------------------------------------------------------------------------- 1 | /* 2 | * softmax.h 3 | * 4 | * Created on: Nov 5, 2018 5 | * Author: cambricon 6 | */ 7 | 8 | #ifndef SOFTMAX_H_ 9 | #define SOFTMAX_H_ 10 | 11 | 12 | void softmax(cudnnHandle_t handle, int N, int C, float *input, float *output); 13 | 14 | 15 | #endif /* SOFTMAX_H_ */ 16 | -------------------------------------------------------------------------------- /google-net_cudnn/util.h: -------------------------------------------------------------------------------- 1 | #ifndef __UTIL_H__ 2 | #define __UTIL_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include "cudnn.h" 8 | 9 | 10 | static inline const char* cublasGetErrorString(cublasStatus_t error) 11 | { 12 | switch (error) 13 | { 14 | case CUBLAS_STATUS_SUCCESS: 15 | return "CUBLAS_STATUS_SUCCESS"; 16 | 17 | case CUBLAS_STATUS_NOT_INITIALIZED: 18 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 19 | 20 | case CUBLAS_STATUS_ALLOC_FAILED: 21 | return "CUBLAS_STATUS_ALLOC_FAILED"; 22 | 23 | case CUBLAS_STATUS_INVALID_VALUE: 24 | return "CUBLAS_STATUS_INVALID_VALUE"; 25 | 26 | case CUBLAS_STATUS_ARCH_MISMATCH: 27 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 28 | 29 | case CUBLAS_STATUS_MAPPING_ERROR: 30 | return "CUBLAS_STATUS_MAPPING_ERROR"; 31 | 32 | case CUBLAS_STATUS_EXECUTION_FAILED: 33 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 34 | 35 | case CUBLAS_STATUS_INTERNAL_ERROR: 36 | return 
// Map a cuBLAS status code to the name of its enumerator.
// Codes not listed in the switch fall through to the final return.
static inline const char* cublasGetErrorString(cublasStatus_t error)
{
    switch (error)
    {
        case CUBLAS_STATUS_SUCCESS:
            return "CUBLAS_STATUS_SUCCESS";

        case CUBLAS_STATUS_NOT_INITIALIZED:
            return "CUBLAS_STATUS_NOT_INITIALIZED";

        case CUBLAS_STATUS_ALLOC_FAILED:
            return "CUBLAS_STATUS_ALLOC_FAILED";

        case CUBLAS_STATUS_INVALID_VALUE:
            return "CUBLAS_STATUS_INVALID_VALUE";

        case CUBLAS_STATUS_ARCH_MISMATCH:
            return "CUBLAS_STATUS_ARCH_MISMATCH";

        case CUBLAS_STATUS_MAPPING_ERROR:
            return "CUBLAS_STATUS_MAPPING_ERROR";

        case CUBLAS_STATUS_EXECUTION_FAILED:
            return "CUBLAS_STATUS_EXECUTION_FAILED";

        case CUBLAS_STATUS_INTERNAL_ERROR:
            return "CUBLAS_STATUS_INTERNAL_ERROR";

        case CUBLAS_STATUS_NOT_SUPPORTED:
            return "CUBLAS_STATUS_NOT_SUPPORTED";

        case CUBLAS_STATUS_LICENSE_ERROR:
            return "CUBLAS_STATUS_LICENSE_ERROR";
    }
    return "";
}


// ErrChk(call): evaluate `call` and hand its status to the Assert overload
// matching the status type (CUDA runtime, cuDNN or cuBLAS). On failure the
// overload prints the call site and the error text, then exits the process.
// The macro expands to a brace block, so it is meant to be used as a
// stand-alone statement.
#define ErrChk(code) { Assert((code), __FILE__, __LINE__); }
// CUDA runtime status check.
static inline void Assert(cudaError_t code, const char *file, int line){
    if(code!=cudaSuccess) {
        printf("CUDA Runtime Error: %s:%d:'%s'\n", file, line, cudaGetErrorString(code));
        exit(EXIT_FAILURE);
    }
}
// cuDNN status check.
static inline void Assert(cudnnStatus_t code, const char *file, int line){
    if (code!=CUDNN_STATUS_SUCCESS){
        printf("cuDNN API Error: %s:%d:'%s'\n", file, line, cudnnGetErrorString(code));
        exit(EXIT_FAILURE);
    }
}
// cuBLAS status check.
static inline void Assert(cublasStatus_t code, const char *file, int line){
    if (code!=CUBLAS_STATUS_SUCCESS){
        printf("cuBLAS API Error: %s:%d:'%s'\n", file, line, cublasGetErrorString(code));
        exit(EXIT_FAILURE);
    }
}


// KernelErrChk(): place directly after a kernel launch. It first reads
// cudaGetLastError(), then calls cudaDeviceSynchronize(), and exits the
// process if either reports an error. Because it synchronizes the whole
// device, every launch checked this way is serialized with the host.
#define KernelErrChk(){\
    cudaError_t errSync = cudaGetLastError();\
    cudaError_t errAsync = cudaDeviceSynchronize();\
    if (errSync != cudaSuccess) {\
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));\
        exit(EXIT_FAILURE);\
    }\
    if (errAsync != cudaSuccess){\
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));\
        exit(EXIT_FAILURE);\
    }\
}
N[j]/16)?(grid_size.y):(N[j]/16); 99 | } 100 | 101 | //warm-up 102 | gemm<64, 16, 16><<>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C); 103 | KernelErrChk(); 104 | 105 | ErrChk(cudaEventCreate(&start)); 106 | ErrChk(cudaEventRecord(start,0)); 107 | 108 | for (int run = 0; run<<>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C); 110 | KernelErrChk(); 111 | } 112 | 113 | ErrChk(cudaEventCreate(&stop)); 114 | ErrChk(cudaEventRecord(stop,0)); 115 | ErrChk(cudaEventSynchronize(stop)); 116 | ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop)); 117 | 118 | time = elapsedTime/N_RUNS; 119 | time /= 1.0e3; //convert time unit from millisecond to second 120 | gflops_per_sec = gflops / time; 121 | printf("%f\n", gflops_per_sec); 122 | 123 | for (int i=0; i /dev/null 11 | ./gemm 4 >> log 12 | ./gemm 8 >> log 13 | ./gemm 16 >> log 14 | ./gemm 32 >> log 15 | ./gemm 64 >> log 16 | ./gemm 128 >> log 17 | ./gemm 256 >> log 18 | done 19 | done 20 | -------------------------------------------------------------------------------- /tiling/Makefile: -------------------------------------------------------------------------------- 1 | #GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 2 | GENCODE_FLAGS = -gencode arch=compute_70,code=compute_70 3 | 4 | gemm:gemm.cu kernel.h 5 | nvcc $< -o $@ --std=c++11 -O3 ${GENCODE_FLAGS} -Xptxas -v 6 | clean: 7 | rm -rf gemm *.o 8 | -------------------------------------------------------------------------------- /tiling/gemm.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "../include/util.h" 6 | #include "kernel.h" 7 | 8 | #define N_RUNS 10 9 | 10 | int main (int argc, char** argv) { 11 | 12 | 
ErrChk(cudaSetDevice(0)); 13 | 14 | if(argc<2){ 15 | printf("Usage: input the batch size\n"); 16 | exit(EXIT_FAILURE); 17 | } 18 | 19 | int BATCH = atoi(argv[1]); 20 | //int TLP_thres = atoi(argv[2]); 21 | int TLP_thres = 65536; 22 | 23 | int *M; 24 | int *N; 25 | int *K; 26 | 27 | M = (int*) malloc(BATCH * sizeof(int)); 28 | N = (int*) malloc(BATCH * sizeof(int)); 29 | K = (int*) malloc(BATCH * sizeof(int)); 30 | 31 | std::fstream fs; 32 | fs.open("../data/input"); 33 | if (!fs.is_open()){ 34 | printf("Error opening input\n"); 35 | exit(EXIT_FAILURE); 36 | } 37 | 38 | //read matrix config 39 | for (int i=0; i>M[i]>>N[i]>>K[i]; 41 | } 42 | 43 | float **A; 44 | float **B; 45 | float **C; 46 | 47 | A = (float**) malloc(BATCH * sizeof(float*)); 48 | B = (float**) malloc(BATCH * sizeof(float*)); 49 | C = (float**) malloc(BATCH * sizeof(float*)); 50 | 51 | for (int i=0; i M[j]/tile_size[t_strategy[j]][0])? (grid_size.x):(M[j]/tile_size[t_strategy[j]][0]); 157 | grid_size.y = (grid_size.y > N[j]/tile_size[t_strategy[j]][1])? 
(grid_size.y):(N[j]/tile_size[t_strategy[j]][1]); 158 | } 159 | 160 | // printf("%d %d %d\n", grid_size.x, grid_size.y, grid_size.z); 161 | 162 | //warm-up 163 | gemm<256><<>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T); 164 | KernelErrChk(); 165 | 166 | ErrChk(cudaEventCreate(&start)); 167 | ErrChk(cudaEventRecord(start,0)); 168 | 169 | for (int run = 0; run<<>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T); 171 | KernelErrChk(); 172 | } 173 | 174 | ErrChk(cudaEventCreate(&stop)); 175 | ErrChk(cudaEventRecord(stop,0)); 176 | ErrChk(cudaEventSynchronize(stop)); 177 | ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop)); 178 | 179 | time = elapsedTime/N_RUNS; 180 | time /= 1.0e3; //convert time unit from millisecond to second 181 | gflops_per_sec = gflops / time; 182 | printf("%f\n", gflops_per_sec); 183 | 184 | for (int i=0; i 5 | __global__ void gemm(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]); 6 | 7 | 8 | template<> 9 | __global__ void gemm<128>(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]){ 10 | 11 | int i = blockIdx.z; 12 | extern __shared__ float sh[]; 13 | int t = T_strategy[i]; 14 | 15 | switch(t){ 16 | case 0: 17 | if (blockIdx.x * 16 < M[i] && blockIdx.y * 16 < N[i]) 18 | gemm_128_16x16(M[i], N[i], K[i], A[i], B[i], C[i], sh); 19 | break; 20 | case 1: 21 | if (blockIdx.x * 32 < M[i] && blockIdx.y * 32 < N[i]) 22 | gemm_128_32x32(M[i], N[i], K[i], A[i], B[i], C[i], sh); 23 | break; 24 | case 2: 25 | if (blockIdx.x * 64 < M[i] && blockIdx.y * 64 < N[i]) 26 | gemm_128_64x64(M[i], N[i], K[i], A[i], B[i], C[i], sh); 27 | break; 28 | case 3: 29 | if (blockIdx.x * 128 < M[i] && blockIdx.y * 64 < N[i]) 30 | gemm_128_128x64(M[i], N[i], K[i], A[i], B[i], C[i], sh); 31 | break; 32 | case 4: 33 | if (blockIdx.x * 64 < M[i] && blockIdx.y * 128 < N[i]) 34 | gemm_128_64x128(M[i], N[i], K[i], A[i], B[i], C[i], sh); 35 | break; 36 | case 5: 37 | // if (blockIdx.x * 128 < M[i] && blockIdx.y * 
128 < N[i]) 38 | // gemm_128_128x128(M[i], N[i], K[i], A[i], B[i], C[i], sh); 39 | break; 40 | } 41 | 42 | return; 43 | } 44 | 45 | template<> 46 | __global__ void gemm<256>(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]){ 47 | 48 | int i = blockIdx.z; 49 | extern __shared__ float sh[]; 50 | int t = T_strategy[i]; 51 | 52 | switch(t){ 53 | case 0: 54 | if (blockIdx.x * 16 < M[i] && blockIdx.y * 16 < N[i]) 55 | gemm_256_16x16(M[i], N[i], K[i], A[i], B[i], C[i], sh); 56 | break; 57 | case 1: 58 | if (blockIdx.x * 32 < M[i] && blockIdx.y * 32 < N[i]) 59 | gemm_256_32x32(M[i], N[i], K[i], A[i], B[i], C[i], sh); 60 | break; 61 | case 2: 62 | if (blockIdx.x * 64 < M[i] && blockIdx.y * 64 < N[i]) 63 | gemm_256_64x64(M[i], N[i], K[i], A[i], B[i], C[i], sh); 64 | break; 65 | case 3: 66 | if (blockIdx.x * 128 < M[i] && blockIdx.y * 64 < N[i]) 67 | gemm_256_128x64(M[i], N[i], K[i], A[i], B[i], C[i], sh); 68 | break; 69 | case 4: 70 | if (blockIdx.x * 128 < M[i] && blockIdx.y * 64 < N[i]) 71 | gemm_256_128x64(M[i], N[i], K[i], A[i], B[i], C[i], sh); 72 | // if (blockIdx.x * 64 < M[i] && blockIdx.y * 128 < N[i]) 73 | // gemm_256_64x128(M[i], N[i], K[i], A[i], B[i], C[i], sh); 74 | break; 75 | case 5: 76 | if (blockIdx.x * 128 < M[i] && blockIdx.y * 128 < N[i]) 77 | gemm_256_128x128(M[i], N[i], K[i], A[i], B[i], C[i], sh); 78 | break; 79 | } 80 | 81 | return; 82 | } 83 | -------------------------------------------------------------------------------- /tiling/kernel_128.h: -------------------------------------------------------------------------------- 1 | __device__ void gemm_128_16x16(int M, int N, int K, float *A, float *B, float *C, float *sh){ 2 | 3 | float *sh_A = sh; 4 | float *sh_B = sh + 2*16*8; 5 | 6 | float2 reg_C; 7 | float2 reg_A; 8 | float reg_B; 9 | 10 | // Compute block's starting coordinate 11 | int block_base_x = blockIdx.y*16; 12 | int block_base_y = blockIdx.x*16; 13 | 14 | //Load C from global memory to register file 15 | 
float2 *C_start = (float2*) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*2 + (threadIdx.x/8)*M); 16 | 17 | reg_C = *C_start; 18 | 19 | //load A from global memory to shared memory 20 | float *A_start = A + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M; 21 | *(sh_A + threadIdx.x) = *(A_start); 22 | 23 | //load B from global memory to shared memory 24 | float *B_start = B + K*block_base_x + (threadIdx.x/16) + (threadIdx.x%16)*K; 25 | *(sh_B + threadIdx.x) = *(B_start); 26 | 27 | 28 | int double_buffer = 0; 29 | #pragma unroll 30 | for(int k=0; k /dev/null 11 | ./gemm 4 >> log 12 | ./gemm 8 >> log 13 | ./gemm 16 >> log 14 | ./gemm 32 >> log 15 | ./gemm 64 >> log 16 | ./gemm 128 >> log 17 | ./gemm 256 >> log 18 | done 19 | done 20 | -------------------------------------------------------------------------------- /tiling/thres.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for ((thres=1024; thres<=102400000; thres=thres*2)) 4 | do 5 | ./gemm 16 $thres >> log 6 | done 7 | for ((thres=1024; thres<=102400000; thres=thres*2)) 8 | do 9 | ./gemm 32 $thres >> log 10 | done 11 | for ((thres=1024; thres<=102400000; thres=thres*2)) 12 | do 13 | ./gemm 64 $thres >> log 14 | done 15 | for ((thres=1024; thres<=102400000; thres=thres*2)) 16 | do 17 | ./gemm 128 $thres >> log 18 | done 19 | for ((thres=1024; thres<=102400000; thres=thres*2)) 20 | do 21 | ./gemm 256 $thres >> log 22 | done 23 | --------------------------------------------------------------------------------