├── Makefile
├── README.md
├── conv_para.bak
├── convolution.cu
├── implicit_gemm_kernel.h
└── run.sh

/Makefile:
--------------------------------------------------------------------------------
GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70

convolution: convolution.o
	nvcc $^ -o $@ -lcudnn ${GENCODE_FLAGS}

convolution.o: convolution.cu implicit_gemm_kernel.h
	nvcc -c $< -o $@ -Xptxas -v -lineinfo --std=c++11 ${GENCODE_FLAGS}

clean:
	rm -f convolution convolution.o
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# implicit_gemm_convolution
--------------------------------------------------------------------------------
/conv_para.bak:
--------------------------------------------------------------------------------
1 64 56 56 64 1 1 0 1 56 56
1 192 28 28 64 1 1 0 1 28 28
1 192 28 28 96 1 1 0 1 28 28
1 192 28 28 16 1 1 0 1 28 28
1 192 28 28 32 1 1 0 1 28 28
1 256 28 28 128 1 1 0 1 28 28
1 256 28 28 128 1 1 0 1 28 28
1 256 28 28 32 1 1 0 1 28 28
1 256 28 28 64 1 1 0 1 28 28
1 480 14 14 192 1 1 0 1 14 14
1 480 14 14 96 1 1 0 1 14 14
1 480 14 14 16 1 1 0 1 14 14
1 480 14 14 64 1 1 0 1 14 14
1 512 14 14 160 1 1 0 1 14 14
1 512 14 14 112 1 1 0 1 14 14
1 512 14 14 24 1 1 0 1 14 14
1 512 14 14 64 1 1 0 1 14 14
1 512 14 14 128 1 1 0 1 14 14
1 512 14 14 128 1 1 0 1 14 14
1 512 14 14 24 1 1 0 1 14 14
1 512 14 14 64 1 1 0 1 14 14
1 512 14 14 112 1 1 0 1 14 14
1 512 14 14 144 1 1 0 1 14 14
1 512 14 14 64 1 1 0 1 14 14
1 528 14 14 256 1 1 0 1 14 14
1 528 14 14 160 1 1 0 1 14 14
1 528 14 14 32 1 1 0 1 14 14
1 528 14 14 128 1 1 0 1 14 14
1 832 7 7 256 1 1 0 1 7 7
1 832 7 7 160 1 1 0 1 7 7
1 832 7 7 32 1 1 0 1 7 7
1 832 7 7 128 1 1 0 1 7 7
1 832 7 7 384 1 1 0 1 7 7
1 832 7 7 192 1 1 0 1 7 7
1 832 7 7 48 1 1 0 1 7 7
1 832 7 7 128 1 1 0 1 7 7
1 192 56 56 128 1 1 0 1 56 56
1 256 56 56 256 1 1 0 1 56 56
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 256 1 1 0 1 28 28
1 512 28 28 512 1 1 0 1 28 28
1 1024 14 14 512 1 1 0 1 14 14
1 1024 14 14 512 1 1 0 1 14 14
1 96 55 55 16 1 1 0 1 55 55
1 16 55 55 64 1 1 0 1 55 55
1 128 55 55 16 1 1 0 1 55 55
1 16 55 55 64 1 1 0 1 55 55
1 128 55 55 32 1 1 0 1 55 55
1 32 55 55 128 1 1 0 1 55 55
1 256 27 27 32 1 1 0 1 27 27
1 32 27 27 128 1 1 0 1 27 27
1 256 27 27 48 1 1 0 1 27 27
1 48 27 27 192 1 1 0 1 27 27
1 384 27 27 48 1 1 0 1 27 27
1 48 27 27 192 1 1 0 1 27 27
1 384 27 27 64 1 1 0 1 27 27
1 64 27 27 256 1 1 0 1 27 27
1 512 13 13 64 1 1 0 1 13 13
1 64 13 13 256 1 1 0 1 13 13
1 512 13 13 1000 1 1 0 1 13 13
1 64 56 56 256 1 1 0 1 56 56
1 64 56 56 64 1 1 0 1 56 56
1 64 56 56 256 1 1 0 1 56 56
1 256 56 56 64 1 1 0 1 56 56
1 64 56 56 256 1 1 0 1 56 56
1 256 56 56 64 1 1 0 1 56 56
1 64 56 56 256 1 1 0 1 56 56
1 128 28 28 512 1 1 0 1 28 28
1 512 28 28 128 1 1 0 1 28 28
1 128 28 28 512 1 1 0 1 28 28
1 512 28 28 128 1 1 0 1 28 28
1 128 28 28 512 1 1 0 1 28 28
1 512 28 28 128 1 1 0 1 28 28
1 128 28 28 512 1 1 0 1 28 28
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 1024 14 14 256 1 1 0 1 14 14
1 256 14 14 1024 1 1 0 1 14 14
1 512 7 7 2048 1 1 0 1 7 7
1 2048 7 7 512 1 1 0 1 7 7
1 512 7 7 2048 1 1 0 1 7 7
1 2048 7 7 512 1 1 0 1 7 7
1 512 7 7 2048 1 1 0 1 7 7
--------------------------------------------------------------------------------
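Each row of conv_para.bak is one convolution configuration, holding the eleven positional arguments that convolution.cu expects (see the usage check in its commented-out argument-parsing block below): N C H W K R S pad_h U P Q. Every row has R = S = 1, pad_h = 0, and U = 1 with P = H and Q = W, i.e. 1x1, stride-1, unpadded convolutions, which is the only case the implicit-GEMM kernels support. The first row, annotated:

    1   64  56  56   64   1  1  0    1  56  56
    N   C   H   W    K    R  S  pad  U  P   Q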
/convolution.cu:
--------------------------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include "cudnn.h"
#include "implicit_gemm_kernel.h"

#define ErrChk(code) { Assert((code), __FILE__, __LINE__); }
inline void Assert(cudaError_t code, const char *file, int line){
	if(code!=cudaSuccess) {
		printf("CUDA Runtime Error: %s:%d:'%s'\n", file, line, cudaGetErrorString(code));
		exit(EXIT_FAILURE);
	}
}
inline void Assert(cudnnStatus_t code, const char *file, int line){
	if (code!=CUDNN_STATUS_SUCCESS){
		printf("cuDNN API Error: %s:%d:'%s'\n", file, line, cudnnGetErrorString(code));
		exit(EXIT_FAILURE);
	}
}

#define KernelErrChk(){\
		cudaError_t errSync = cudaGetLastError();\
		cudaError_t errAsync = cudaDeviceSynchronize();\
		if (errSync != cudaSuccess) {\
			printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));\
			exit(EXIT_FAILURE);\
		}\
		if (errAsync != cudaSuccess){\
			printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));\
			exit(EXIT_FAILURE);\
		}\
}


int main(int argc, char *argv[]){

	//convolution parameters
	int N = 32;   //batch size
	int C = 1024; //channels
	int H = 24;
	int W = 24;

	int K = 16;   //number of filters
	int R = 1;
	int S = 1;
	int U = 1;
	int V = 1;

	int pad_h = 0;
	int pad_w = 0;

	int dilation = 1;

	int P = (H + 2*pad_h - (((R-1)*dilation) + 1))/U + 1;
	int Q = (W + 2*pad_w - (((S-1)*dilation) + 1))/V + 1;

	/*
	if (argc != 12){
		printf("Usage: You need to type 11 arguments: N C H W K R S pad_h U P Q\n");
		exit(EXIT_FAILURE);
	}

	int N = atoi(argv[1]);
	int C = atoi(argv[2]);
	int H = atoi(argv[3]);
	int W = atoi(argv[4]);
	int K = atoi(argv[5]);
	int R = atoi(argv[6]);
	int S = atoi(argv[7]);
	int pad_h = atoi(argv[8]);
	int pad_w = atoi(argv[8]);
	int U = atoi(argv[9]);
	int V = atoi(argv[9]);
	int P = atoi(argv[10]);
	int Q = atoi(argv[11]);

	if (!(R==1 && pad_h==0 && U==1))
		return 1;
	*/

	//int dilation = 1;
	//prepare data
	float *h_input = (float*) malloc(N*C*H*W*sizeof(float));
	for (int j=0; j<N*C*H*W; j++)
		h_input[j] = (float)(rand()%10);

	float *h_filter = (float*) malloc(K*C*R*S*sizeof(float));
	for (int j=0; j<K*C*R*S; j++)
		h_filter[j] = (float)(rand()%10);

	float *h_result_our = (float*) malloc(N*K*P*Q*sizeof(float));
	float *h_result_cudnn = (float*) malloc(N*K*P*Q*sizeof(float));

	float *input, *filter;
	ErrChk(cudaMalloc((void**)&input, N*C*H*W*sizeof(float)));
	ErrChk(cudaMalloc((void**)&filter, K*C*R*S*sizeof(float)));
	ErrChk(cudaMemcpy(input, h_input, N*C*H*W*sizeof(float), cudaMemcpyHostToDevice));
	ErrChk(cudaMemcpy(filter, h_filter, K*C*R*S*sizeof(float), cudaMemcpyHostToDevice));

	//cuDNN reference result
	cudnnHandle_t handle;
	ErrChk(cudnnCreate(&handle));

	cudnnTensorDescriptor_t xDesc;
	ErrChk(cudnnCreateTensorDescriptor(&xDesc));
	ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));

	cudnnFilterDescriptor_t filterDesc;
	ErrChk(cudnnCreateFilterDescriptor(&filterDesc));
	ErrChk(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S));

	cudnnTensorDescriptor_t yDesc;
	ErrChk(cudnnCreateTensorDescriptor(&yDesc));
	ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, K, P, Q));

	cudnnConvolutionDescriptor_t convDesc;
	ErrChk(cudnnCreateConvolutionDescriptor(&convDesc));
	ErrChk(cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, U, V, dilation, dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));

	cudnnConvolutionFwdAlgo_t algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;

	size_t workspace_size = 0;
	ErrChk(cudnnGetConvolutionForwardWorkspaceSize(handle, xDesc, filterDesc, convDesc, yDesc, algo, &workspace_size));

	float *extra;
	ErrChk(cudaMalloc((void**)&extra, workspace_size));

	float *result_cudnn;
	ErrChk(cudaMalloc((void**)&result_cudnn, N*K*P*Q*sizeof(float)));

	float alpha = 1.f;
	float beta = 0.f;
	ErrChk(cudnnConvolutionForward(handle, &alpha, xDesc, input, filterDesc, filter, convDesc, algo, extra, workspace_size, &beta, yDesc, result_cudnn));

	ErrChk(cudaMemcpy(h_result_cudnn, result_cudnn, N*K*P*Q*sizeof(float), cudaMemcpyDeviceToHost));

	//C = A*B -> BT * AT

	float *result_our;
	ErrChk(cudaMalloc((void**)&result_our, N*K*P*Q*sizeof(float)));


	//gemm 1101
	dim3 block_size;
	block_size.x = 64;
	block_size.y = 1;
	block_size.z = 1;

	dim3 grid_size;
	grid_size.x = (K-1)/16 + 1;
	grid_size.y = (Q*P-1)/16 + 1;
	grid_size.z = N;

	if (H*W%2)
		implicit_gemm_1101_1<<<grid_size, block_size>>>(input, filter, result_our, N, C, H, W, K);
	else if (H*W%16)
		implicit_gemm_1101_2<<<grid_size, block_size>>>(input, filter, result_our, N, C, H, W, K);
	else
		implicit_gemm_1101_16<<<grid_size, block_size>>>(input, filter, result_our, N, C, H, W, K);

	KernelErrChk();

	ErrChk(cudaMemcpy(h_result_our, result_our, sizeof(float)*N*K*P*Q, cudaMemcpyDeviceToHost));

	//Result Test
	for (int j=0; j<N*K*P*Q; j++){
		if (fabs(h_result_our[j] - h_result_cudnn[j]) > 10e-2){
			printf("Rejected @ %d\n", j);
			exit(EXIT_FAILURE);
		}
	}
	printf("Passed\n");


	ErrChk(cudnnDestroy(handle));
	ErrChk(cudnnDestroyTensorDescriptor(xDesc));
	ErrChk(cudnnDestroyTensorDescriptor(yDesc));
	ErrChk(cudnnDestroyFilterDescriptor(filterDesc));
	ErrChk(cudnnDestroyConvolutionDescriptor(convDesc));

	ErrChk(cudaFree(input));
	ErrChk(cudaFree(filter));
	ErrChk(cudaFree(result_our));
	ErrChk(cudaFree(result_cudnn));
	ErrChk(cudaFree(extra));

	free(h_input);
	free(h_filter);
	free(h_result_our);
	free(h_result_cudnn);


	return 0;
}
--------------------------------------------------------------------------------
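For a 1x1, stride-1, unpadded convolution, the "C = A*B -> BT * AT" comment above is meant literally: per image, the K x (P*Q) output matrix is the K x C filter matrix times the C x (H*W) input matrix, and the kernels in implicit_gemm_kernel.h only tile and double-buffer that product. As a minimal reference for the index arithmetic, here is an unoptimized one-thread-per-output sketch (the name implicit_gemm_naive is hypothetical, not part of this repository):

    //naive reference: the output viewed per image as a K x (H*W) matrix,
    //computed as filter (K x C) times input (C x H*W)
    __global__ void implicit_gemm_naive(const float *input, const float *filter,
                                        float *output, int C, int H, int W, int K){
        int pq = blockIdx.x*blockDim.x + threadIdx.x; //spatial index h*W + w
        int k  = blockIdx.y;                          //output channel
        int n  = blockIdx.z;                          //image in the batch
        if (pq < H*W){
            float acc = 0.f;
            for (int c=0; c<C; c++)
                acc += filter[k*C + c] * input[(n*C + c)*H*W + pq];
            output[(n*K + k)*H*W + pq] = acc;
        }
    }

Launched as implicit_gemm_naive<<<dim3((H*W-1)/256 + 1, K, N), 256>>>(input, filter, result_our, C, H, W, K), it fills result_our in the same N-K-P-Q layout that the verification loop compares against.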
/implicit_gemm_kernel.h:
--------------------------------------------------------------------------------
//1x1, stride-1, unpadded convolution as an implicit GEMM.
//Each 64-thread block computes a 16 (output channels) x 16 (spatial positions)
//tile of the output for one image; the three variants differ in how they cope
//with the alignment of H*W.

//H*W is a multiple of 16: fully aligned float2 loads, no bounds checks.
__global__ void implicit_gemm_1101_16(float *input, float *filter, float *output, int N, int C, int H, int W, int K){

	float reg_C[4];
	float reg_A[8];
	float reg_B[2];

	//double-buffered 16x8 filter tile (sh_A) and 8x16 input tile (sh_B)
	__shared__ float sh_A[2*16*8];
	__shared__ float sh_B[2*16*8];

	reg_C[0] = 0.f;
	reg_C[1] = 0.f;
	reg_C[2] = 0.f;
	reg_C[3] = 0.f;

	//load A (a 16x8 tile of the filter) from global to shared memory
	float2 *A = (float2 *)(filter + blockIdx.x*16*C + (threadIdx.x/16)*2 + (threadIdx.x%16)*C);
	*((float2 *)(sh_A + 2*threadIdx.x)) = *A;

	//load B (an 8x16 tile of the input feature map) from global to shared memory
	float2 *B = (float2 *)(input + blockIdx.z*C*H*W + blockIdx.y*16 + (threadIdx.x%8)*2 + (threadIdx.x/8)*H*W);
	*((float2 *)(sh_B + 2*threadIdx.x)) = *B;

	int double_buffer = 0;

#pragma unroll
	for (int c=0; c<C; c+=8){

		__syncthreads();

		//prefetch the next 8-channel slice into the other half of each buffer
		if (c+8 < C){
			A += 4;      //8 floats along the C dimension of the filter
			B += 4*H*W;  //8 channels of the input feature map
			*((float2 *)(sh_A + (1-double_buffer)*128 + 2*threadIdx.x)) = *A;
			*((float2 *)(sh_B + (1-double_buffer)*128 + 2*threadIdx.x)) = *B;
		}

		//rank-8 update: this thread owns output channels 4*(threadIdx.x%4)..+3
		//at spatial position threadIdx.x/4 of the tile; channels come in pairs
		//because sh_A stores consecutive channel values interleaved per k
#pragma unroll
		for (int i=0; i<4; i++){
			reg_B[0] = sh_B[double_buffer*128 + 32*i + threadIdx.x/4];
			reg_B[1] = sh_B[double_buffer*128 + 32*i + 16 + threadIdx.x/4];

#pragma unroll
			for (int j=0; j<4; j++){
				reg_A[2*j]   = sh_A[double_buffer*128 + 32*i + 2*(4*(threadIdx.x%4) + j)];
				reg_A[2*j+1] = sh_A[double_buffer*128 + 32*i + 2*(4*(threadIdx.x%4) + j) + 1];
			}

#pragma unroll
			for (int j=0; j<4; j++)
				reg_C[j] += reg_A[2*j]*reg_B[0] + reg_A[2*j+1]*reg_B[1];
		}

		double_buffer = 1 - double_buffer;
	}

	//write back the four accumulated outputs of this thread
	int pq = blockIdx.y*16 + threadIdx.x/4;
#pragma unroll
	for (int j=0; j<4; j++)
		output[blockIdx.z*K*H*W + (blockIdx.x*16 + 4*(threadIdx.x%4) + j)*H*W + pq] = reg_C[j];
}

//H*W even but not a multiple of 16: float2 loads with a bounds check on the
//last spatial tile; each thread accumulates four consecutive spatial outputs
//of one output channel directly from global memory.
__global__ void implicit_gemm_1101_2(float *input, float *filter, float *output, int N, int C, int H, int W, int K){

	int k  = blockIdx.x*16 + threadIdx.x%16;
	int pq = blockIdx.y*16 + (threadIdx.x/16)*4;

	if (k >= K) return;

	float reg_C[4] = {0.f, 0.f, 0.f, 0.f};

	for (int c=0; c<C; c++){
		float a = filter[k*C + c];
		const float *src = input + blockIdx.z*C*H*W + c*H*W;
		if (pq < H*W){     //pq is even, so pq+1 < H*W holds as well
			float2 b = *((const float2 *)(src + pq));
			reg_C[0] += a*b.x;
			reg_C[1] += a*b.y;
		}
		if (pq+2 < H*W){
			float2 b = *((const float2 *)(src + pq + 2));
			reg_C[2] += a*b.x;
			reg_C[3] += a*b.y;
		}
	}

#pragma unroll
	for (int j=0; j<4; j++)
		if (pq+j < H*W)
			output[blockIdx.z*K*H*W + k*H*W + pq + j] = reg_C[j];
}

//H*W odd: scalar loads only, bounds check on every spatial index.
__global__ void implicit_gemm_1101_1(float *input, float *filter, float *output, int N, int C, int H, int W, int K){

	int k  = blockIdx.x*16 + threadIdx.x%16;
	int pq = blockIdx.y*16 + (threadIdx.x/16)*4;

	if (k >= K) return;

	float reg_C[4] = {0.f, 0.f, 0.f, 0.f};

	for (int c=0; c<C; c++){
		float a = filter[k*C + c];
		const float *src = input + blockIdx.z*C*H*W + c*H*W;
#pragma unroll
		for (int j=0; j<4; j++)
			if (pq+j < H*W)
				reg_C[j] += a * src[pq + j];
	}

#pragma unroll
	for (int j=0; j<4; j++)
		if (pq+j < H*W)
			output[blockIdx.z*K*H*W + k*H*W + pq + j] = reg_C[j];
}
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

make

for i in `seq 1 91`
do
	sed -n "${i}p" conv_para.bak > conv_para
	nvprof ./convolution `cat conv_para` > log 2>&1
	sed -n '/1101/p' log | awk '{printf("%s ", $4);}'
	sed -n '/cudnn/p' log | awk '{printf("%s", $4);}'
	echo
done
--------------------------------------------------------------------------------
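run.sh drives the full sweep: for each of the 91 rows of conv_para.bak it runs the binary under nvprof and prints one line per configuration, the implicit_gemm_1101_* kernel time followed by the cuDNN kernel time; the awk extraction assumes the average time sits in the fourth whitespace-separated field of the matching nvprof summary lines. Note that the arguments only take effect if the argument-parsing block in convolution.cu is uncommented; with that done, a single configuration can be profiled by hand:

    $ make
    $ nvprof ./convolution 1 64 56 56 64 1 1 0 1 56 56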