├── Makefile
├── README.md
├── conv_para.bak
├── convolution.cu
├── implicit_gemm_kernel.h
└── run.sh

/Makefile:
--------------------------------------------------------------------------------
GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70

convolution: convolution.o
	nvcc $^ -o $@ -lcudnn ${GENCODE_FLAGS}

convolution.o: convolution.cu implicit_gemm_kernel.h
	nvcc -c $< -o $@ -Xptxas -v -lineinfo --std=c++11 ${GENCODE_FLAGS}

clean:
	rm -f convolution convolution.o
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# implicit_gemm_convolution
--------------------------------------------------------------------------------
/conv_para.bak:
--------------------------------------------------------------------------------
1 64 56 56 64 1 1 0 1 56 56
1 192 28 28 64 1 1 0 1 28 28
1 192 28 28 96 1 1 0 1 28 28
1 192 28 28 16 1 1 0 1 28 28
1 192 28 28 32 1 1 0 1 28 28
1 256 28 28 128 1 1 0 1 28 28
1 256 28 28 128 1 1 0 1 28 28
1 256 28 28 32 1 1 0 1 28 28
1 256 28 28 64 1 1 0 1 28 28
1 480 14 14 192 1 1 0 1 14 14
1 480 14 14 96 1 1 0 1 14 14
1 480 14 14 16 1 1 0 1 14 14
1 480 14 14 64 1 1 0 1 14 14
1 512 14 14 160 1 1 0 1 14 14
1 512 14 14 112 1 1 0 1 14 14
1 512 14 14 24 1 1 0 1 14 14
1 512 14 14 64 1 1 0 1 14 14
1 512 14 14 128 1 1 0 1 14 14
1 512 14 14 128 1 1 0 1 14 14
1 512 14 14 24 1 1 0 1 14 14
1 512 14 14 64 1 1 0 1 14 14
1 512 14 14 112 1 1 0 1 14 14
1 512 14 14 144 1 1 0 1 14 14
1 512 14 14 64 1 1 0 1 14 14
1 528 14 14 256 1 1 0 1 14 14
1 528 14 14 160 1 1 0 1 14 14
1 528 14 14 32 1 1 0 1 14 14
1 528 14 14 128 1 1 0 1 14 14
1 832 7 7 256 1 1 0 1 7 7
1 832 7 7 160 1 1 0 1 7 7
1 832 7 7 32 1 1 0 1 7 7
1 832 7 7 128 1 1 0 1 7 7
1 832 7 7 384 1 1 0 1 7 7
1 832 7 7 192 1 1 0 1 7 7
1 832 7 7 48 1 1 0 1 7 7
1 832 7 7 128 1 1 0 1 7 7
1 192 56 56 128 1 1 0 1 56 56
1 256 56 56 256 1 1 0 1 56 56
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 512 1 1 0 1 28 28
1 1024 14 14 512 1 1 0 1 14 14
1 1024 14 14 512 1 1 0 1 14 14
1 96 55 55 16 1 1 0 1 55 55
1 16 55 55 64 1 1 0 1 55 55
1 128 55 55 16 1 1 0 1 55 55
1 16 55 55 64 1 1 0 1 55 55
1 128 55 55 32 1 1 0 1 55 55
1 32 55 55 128 1 1 0 1 55 55
1 256 27 27 32 1 1 0 1 27 27
1 32 27 27 128 1 1 0 1 27 27
1 256 27 27 48 1 1 0 1 27 27
1 48 27 27 192 1 1 0 1 27 27
1 384 27 27 48 1 1 0 1 27 27
1 48 27 27 192 1 1 0 1 27 27
1 384 27 27 64 1 1 0 1 27 27
1 64 27 27 256 1 1 0 1 27 27
1 512 13 13 64 1 1 0 1 13 13
1 64 13 13 256 1 1 0 1 13 13
1 512 13 13 1000 1 1 0 1 13 13
1 64 56 56 256 1 1 0 1 56 56
1 64 56 56 64 1 1 0 1 56 56
1 64 56 56 256 1 1 0 1 56 56
1 256 56 56 64 1 1 0 1 56 56
1 64 56 56 256 1 1 0 1 56 56
1 256 56 56 64 1 1 0 1 56 56
1 64 56 56 256 1 1 0 1 56 56
1 128 28 28 512 1 1 0 1 28 28
1 512 28 28 128 1 1 0 1 28 28
1 128 28 28 512 1 1 0 1 28 28
1 512 28 28 128 1 1 0 1 28 28
1 128 28 28 512 1 1 0 1 28 28
1 512 28 28 128 1 1 0 1 28 28
1 128 28 28 512 1 1 0 1 28 28
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 512 7 7 2048 1 1 0 1 7 7
1 2048 7 7 512 1 1 0 1 7 7
1 512 7 7 2048 1 1 0 1 7 7
1 2048 7 7 512 1 1 0 1 7 7
1 512 7 7 2048 1 1 0 1 7 7
--------------------------------------------------------------------------------
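Each row of conv_para.bak is one convolution configuration, holding the eleven positional arguments that convolution.cu expects (see the usage check in its commented-out argument-parsing block below): N C H W K R S pad_h U P Q. Every row has R = S = 1, pad_h = 0, and U = 1 with P = H and Q = W, i.e. 1x1, stride-1, unpadded convolutions, which is the only case the implicit-GEMM kernels support. The first row, annotated:

    1   64  56  56   64   1  1  0    1  56  56
    N   C   H   W    K    R  S  pad  U  P   Q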
/convolution.cu:
--------------------------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include "cudnn.h"
#include "implicit_gemm_kernel.h"

#define ErrChk(code) { Assert((code), __FILE__, __LINE__); }
inline void Assert(cudaError_t code, const char *file, int line){
	if(code!=cudaSuccess) {
		printf("CUDA Runtime Error: %s:%d:'%s'\n", file, line, cudaGetErrorString(code));
		exit(EXIT_FAILURE);
	}
}
inline void Assert(cudnnStatus_t code, const char *file, int line){
	if (code!=CUDNN_STATUS_SUCCESS){
		printf("cuDNN API Error: %s:%d:'%s'\n", file, line, cudnnGetErrorString(code));
		exit(EXIT_FAILURE);
	}
}

#define KernelErrChk(){\
		cudaError_t errSync = cudaGetLastError();\
		cudaError_t errAsync = cudaDeviceSynchronize();\
		if (errSync != cudaSuccess) {\
			printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));\
			exit(EXIT_FAILURE);\
		}\
		if (errAsync != cudaSuccess){\
			printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));\
			exit(EXIT_FAILURE);\
		}\
}


int main(int argc, char *argv[]){

	//convolution parameters
	int N = 32;   //batch size
	int C = 1024; //channels
	int H = 24;
	int W = 24;

	int K = 16;   //number of filters
	int R = 1;
	int S = 1;
	int U = 1;
	int V = 1;

	int pad_h = 0;
	int pad_w = 0;

	int dilation = 1;

	int P = (H + 2*pad_h - (((R-1)*dilation) + 1))/U + 1;
	int Q = (W + 2*pad_w - (((S-1)*dilation) + 1))/V + 1;

	/*
	if (argc != 12){
		printf("Usage: You need to type 11 arguments: N C H W K R S pad_h U P Q\n");
		exit(EXIT_FAILURE);
	}

	int N = atoi(argv[1]);
	int C = atoi(argv[2]);
	int H = atoi(argv[3]);
	int W = atoi(argv[4]);
	int K = atoi(argv[5]);
	int R = atoi(argv[6]);
	int S = atoi(argv[7]);
	int pad_h = atoi(argv[8]);
	int pad_w = atoi(argv[8]);
	int U = atoi(argv[9]);
	int V = atoi(argv[9]);
	int P = atoi(argv[10]);
	int Q = atoi(argv[11]);

	if (!(R==1 && pad_h==0 && U==1))
		return 1;
	*/

	//int dilation = 1;
	//prepare data
	float *h_input = (float*) malloc(N*C*H*W*sizeof(float));
	for (int j=0; j<N*C*H*W; j++)
		h_input[j] = (float)(rand()%10);

	float *h_filter = (float*) malloc(K*C*R*S*sizeof(float));
	for (int j=0; j<K*C*R*S; j++)
		h_filter[j] = (float)(rand()%10);

	float *h_result_our = (float*) malloc(N*K*P*Q*sizeof(float));
	float *h_result_cudnn = (float*) malloc(N*K*P*Q*sizeof(float));

	float *input, *filter;
	ErrChk(cudaMalloc((void**)&input, N*C*H*W*sizeof(float)));
	ErrChk(cudaMalloc((void**)&filter, K*C*R*S*sizeof(float)));
	ErrChk(cudaMemcpy(input, h_input, N*C*H*W*sizeof(float), cudaMemcpyHostToDevice));
	ErrChk(cudaMemcpy(filter, h_filter, K*C*R*S*sizeof(float), cudaMemcpyHostToDevice));

	//cuDNN reference result
	cudnnHandle_t handle;
	ErrChk(cudnnCreate(&handle));

	cudnnTensorDescriptor_t xDesc;
	ErrChk(cudnnCreateTensorDescriptor(&xDesc));
	ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));

	cudnnFilterDescriptor_t filterDesc;
	ErrChk(cudnnCreateFilterDescriptor(&filterDesc));
	ErrChk(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S));

	cudnnTensorDescriptor_t yDesc;
	ErrChk(cudnnCreateTensorDescriptor(&yDesc));
	ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, K, P, Q));

	cudnnConvolutionDescriptor_t convDesc;
	ErrChk(cudnnCreateConvolutionDescriptor(&convDesc));
	ErrChk(cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, U, V, dilation, dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

	cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

	size_t workspace_size = 0;
	ErrChk(cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, filterDesc, convDesc, yDesc, algo, &workspace_size));

	float *extra;
	ErrChk(cudaMalloc((void**)&extra, workspace_size));

	float *result_cudnn;
	ErrChk(cudaMalloc((void**)&result_cudnn, N*K*P*Q*sizeof(float)));

	float alpha = 1.f;
	float beta = 0.f;
	ErrChk(cudnnConvolutionForward(handle, &alpha, xDesc, input, filterDesc, filter, convDesc, algo, extra, workspace_size, &beta, yDesc, result_cudnn));

	ErrChk(cudaMemcpy(h_result_cudnn, result_cudnn, N*K*P*Q*sizeof(float), cudaMemcpyDeviceToHost));

	//C = A*B -> BT * AT

	float *result_our;
	ErrChk(cudaMalloc((void**)&result_our, N*K*P*Q*sizeof(float)));


	//gemm 1101
	dim3 block_size;
	block_size.x = 64;
	block_size.y = 1;
	block_size.z = 1;

	dim3 grid_size;
	grid_size.x = (K-1)/16 + 1;
	grid_size.y = (Q*P-1)/16 + 1;
	grid_size.z = N;

	if (H*W%2)
		implicit_gemm_1101_1<<<grid_size, block_size>>>(input, filter, result_our, N, C, H, W, K);
	else if (H*W%16)
		implicit_gemm_1101_2<<<grid_size, block_size>>>(input, filter, result_our, N, C, H, W, K);
	else
		implicit_gemm_1101_16<<<grid_size, block_size>>>(input, filter, result_our, N, C, H, W, K);

	KernelErrChk();

	ErrChk(cudaMemcpy(h_result_our, result_our, sizeof(float)*N*K*P*Q, cudaMemcpyDeviceToHost));

	//Result Test
	for (int j=0; j<N*K*P*Q; j++){
		if (fabs(h_result_our[j] - h_result_cudnn[j]) > 10e-2){
			printf("Rejected @ %d\n", j);
			exit(EXIT_FAILURE);
		}
	}
	printf("Passed\n");


	ErrChk(cudnnDestroy(handle));
	ErrChk(cudnnDestroyTensorDescriptor(xDesc));
	ErrChk(cudnnDestroyTensorDescriptor(yDesc));
	ErrChk(cudnnDestroyFilterDescriptor(filterDesc));
	ErrChk(cudnnDestroyConvolutionDescriptor(convDesc));

	ErrChk(cudaFree(input));
	ErrChk(cudaFree(filter));
	ErrChk(cudaFree(result_our));
	ErrChk(cudaFree(result_cudnn));
	ErrChk(cudaFree(extra));

	free(h_input);
	free(h_filter);
	free(h_result_our);
	free(h_result_cudnn);


	return 0;
}
--------------------------------------------------------------------------------
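For a 1x1, stride-1, unpadded convolution, the "C = A*B -> BT * AT" comment above is meant literally: per image, the K x (P*Q) output matrix is the K x C filter matrix times the C x (H*W) input matrix, and the kernels in implicit_gemm_kernel.h only tile and double-buffer that product. As a minimal reference for the index arithmetic, here is an unoptimized one-thread-per-output sketch (the name implicit_gemm_naive is hypothetical, not part of this repository):

    //naive reference: the output viewed per image as a K x (H*W) matrix,
    //computed as filter (K x C) times input (C x H*W)
    __global__ void implicit_gemm_naive(const float *input, const float *filter,
                                        float *output, int C, int H, int W, int K){
        int pq = blockIdx.x*blockDim.x + threadIdx.x; //spatial index h*W + w
        int k  = blockIdx.y;                          //output channel
        int n  = blockIdx.z;                          //image in the batch
        if (pq < H*W){
            float acc = 0.f;
            for (int c=0; c<C; c++)
                acc += filter[k*C + c] * input[(n*C + c)*H*W + pq];
            output[(n*K + k)*H*W + pq] = acc;
        }
    }

Launched as implicit_gemm_naive<<<dim3((H*W-1)/256 + 1, K, N), 256>>>(input, filter, result_our, C, H, W, K), it fills result_our in the same N-K-P-Q layout that the verification loop compares against.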
/implicit_gemm_kernel.h:
--------------------------------------------------------------------------------
//1x1, stride-1, unpadded convolution as an implicit GEMM.
//Each 64-thread block computes a 16 (output channels) x 16 (spatial positions)
//tile of the output for one image; the three variants differ in how they cope
//with the alignment of H*W.

//H*W is a multiple of 16: fully aligned float2 loads, no bounds checks.
__global__ void implicit_gemm_1101_16(float *input, float *filter, float *output, int N, int C, int H, int W, int K){

	float reg_C[4];
	float reg_A[8];
	float reg_B[2];

	//double-buffered 16x8 filter tile (sh_A) and 8x16 input tile (sh_B)
	__shared__ float sh_A[2*16*8];
	__shared__ float sh_B[2*16*8];

	reg_C[0] = 0.f;
	reg_C[1] = 0.f;
	reg_C[2] = 0.f;
	reg_C[3] = 0.f;

	//load A (a 16x8 tile of the filter) from global to shared memory
	float2 *A = (float2 *)(filter + blockIdx.x*16*C + (threadIdx.x/16)*2 + (threadIdx.x%16)*C);
	*((float2 *)(sh_A + 2*threadIdx.x)) = *A;

	//load B (an 8x16 tile of the input feature map) from global to shared memory
	float2 *B = (float2 *)(input + blockIdx.z*C*H*W + blockIdx.y*16 + (threadIdx.x%8)*2 + (threadIdx.x/8)*H*W);
	*((float2 *)(sh_B + 2*threadIdx.x)) = *B;

	int double_buffer = 0;

#pragma unroll
	for (int c=0; c<C; c+=8){

		__syncthreads();

		//prefetch the next 8-channel slice into the other half of each buffer
		if (c+8 < C){
			A += 4;      //8 floats along the C dimension of the filter
			B += 4*H*W;  //8 channels of the input feature map
			*((float2 *)(sh_A + (1-double_buffer)*128 + 2*threadIdx.x)) = *A;
			*((float2 *)(sh_B + (1-double_buffer)*128 + 2*threadIdx.x)) = *B;
		}

		//rank-8 update: this thread owns output channels 4*(threadIdx.x%4)..+3
		//at spatial position threadIdx.x/4 of the tile; channels come in pairs
		//because sh_A stores consecutive channel values interleaved per k
#pragma unroll
		for (int i=0; i<4; i++){
			reg_B[0] = sh_B[double_buffer*128 + 32*i + threadIdx.x/4];
			reg_B[1] = sh_B[double_buffer*128 + 32*i + 16 + threadIdx.x/4];

#pragma unroll
			for (int j=0; j<4; j++){
				reg_A[2*j]   = sh_A[double_buffer*128 + 32*i + 2*(4*(threadIdx.x%4) + j)];
				reg_A[2*j+1] = sh_A[double_buffer*128 + 32*i + 2*(4*(threadIdx.x%4) + j) + 1];
			}

#pragma unroll
			for (int j=0; j<4; j++)
				reg_C[j] += reg_A[2*j]*reg_B[0] + reg_A[2*j+1]*reg_B[1];
		}

		double_buffer = 1 - double_buffer;
	}

	//write back the four accumulated outputs of this thread
	int pq = blockIdx.y*16 + threadIdx.x/4;
#pragma unroll
	for (int j=0; j<4; j++)
		output[blockIdx.z*K*H*W + (blockIdx.x*16 + 4*(threadIdx.x%4) + j)*H*W + pq] = reg_C[j];
}

//H*W even but not a multiple of 16: float2 loads with a bounds check on the
//last spatial tile; each thread accumulates four consecutive spatial outputs
//of one output channel directly from global memory.
__global__ void implicit_gemm_1101_2(float *input, float *filter, float *output, int N, int C, int H, int W, int K){

	int k  = blockIdx.x*16 + threadIdx.x%16;
	int pq = blockIdx.y*16 + (threadIdx.x/16)*4;

	if (k >= K) return;

	float reg_C[4] = {0.f, 0.f, 0.f, 0.f};

	for (int c=0; c<C; c++){
		float a = filter[k*C + c];
		const float *src = input + blockIdx.z*C*H*W + c*H*W;
		if (pq < H*W){     //pq is even, so pq+1 < H*W holds as well
			float2 b = *((const float2 *)(src + pq));
			reg_C[0] += a*b.x;
			reg_C[1] += a*b.y;
		}
		if (pq+2 < H*W){
			float2 b = *((const float2 *)(src + pq + 2));
			reg_C[2] += a*b.x;
			reg_C[3] += a*b.y;
		}
	}

#pragma unroll
	for (int j=0; j<4; j++)
		if (pq+j < H*W)
			output[blockIdx.z*K*H*W + k*H*W + pq + j] = reg_C[j];
}

//H*W odd: scalar loads only, bounds check on every spatial index.
__global__ void implicit_gemm_1101_1(float *input, float *filter, float *output, int N, int C, int H, int W, int K){

	int k  = blockIdx.x*16 + threadIdx.x%16;
	int pq = blockIdx.y*16 + (threadIdx.x/16)*4;

	if (k >= K) return;

	float reg_C[4] = {0.f, 0.f, 0.f, 0.f};

	for (int c=0; c<C; c++){
		float a = filter[k*C + c];
		const float *src = input + blockIdx.z*C*H*W + c*H*W;
#pragma unroll
		for (int j=0; j<4; j++)
			if (pq+j < H*W)
				reg_C[j] += a * src[pq + j];
	}

#pragma unroll
	for (int j=0; j<4; j++)
		if (pq+j < H*W)
			output[blockIdx.z*K*H*W + k*H*W + pq + j] = reg_C[j];
}
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

make

for i in `seq 1 91`
do
	sed -n "${i}p" conv_para.bak > conv_para
	nvprof ./convolution `cat conv_para` > log 2>&1
	sed -n '/1101/p' log | awk '{printf("%s ", $4);}'
	sed -n '/cudnn/p' log | awk '{printf("%s", $4);}'
	echo
done
--------------------------------------------------------------------------------
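run.sh drives the full sweep: for each of the 91 rows of conv_para.bak it runs the binary under nvprof and prints one line per configuration, the implicit_gemm_1101_* kernel time followed by the cuDNN kernel time; the awk extraction assumes the average time sits in the fourth whitespace-separated field of the matching nvprof summary lines. Note that the arguments only take effect if the argument-parsing block in convolution.cu is uncommented; with that done, a single configuration can be profiled by hand:

    $ make
    $ nvprof ./convolution 1 64 56 56 64 1 1 0 1 56 56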