├── .gitignore
├── README.md
├── batching
    ├── Makefile
    ├── gemm.cu
    ├── k.sh
    ├── kernel.h
    ├── kernel_128.h
    ├── kernel_256.h
    ├── log
    ├── run.sh
    └── thres.sh
├── cke
    ├── Makefile
    ├── gemm.cu
    ├── log
    └── run.sh
├── data
    ├── Makefile
    ├── gen_data
    ├── gen_data.cpp
    └── input
├── default
    ├── Makefile
    ├── gemm.cu
    ├── log
    └── run.sh
├── google-net_cudnn
    ├── .gitignore
    ├── Makefile
    ├── activation.cpp
    ├── activation.h
    ├── batch-inception.cu
    ├── batch-inception.h
    ├── concat.cu
    ├── concat.h
    ├── conv.cpp
    ├── conv.h
    ├── dropout.cpp
    ├── dropout.h
    ├── gemm_kernel.h
    ├── im2col.h
    ├── inception.cpp
    ├── inception.h
    ├── loss.cpp
    ├── loss.h
    ├── lrn.cpp
    ├── lrn.h
    ├── main.cpp
    ├── pooling.cpp
    ├── pooling.h
    ├── softmax.cpp
    ├── softmax.h
    └── util.h
├── include
    └── util.h
├── magma
    ├── Makefile
    ├── gemm.cu
    ├── kernel.h
    ├── log
    └── run.sh
└── tiling
    ├── Makefile
    ├── gemm.cu
    ├── kernel.h
    ├── kernel_128.h
    ├── kernel_256.h
    ├── log
    ├── run.sh
    └── thres.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | */gemm
2 | */gemm.o
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | General matrix multiplication (GEMM) plays a paramount role in a broad range of domains such as deep learning, scientific computing, and image processing. Many researchers have spent a great deal of effort on optimizing GEMM to exploit the enormous computing power of GPUs. The primary optimization method is to partition the matrix into many tiles and exploit the parallelism between and within the tiles, which closely mirrors the thread hierarchy on GPUs. In practice, a GPU can fully unleash its computing power when the matrix size is large and there are a sufficient number of tiles and enough workload within each tile. However, in many real-world applications, especially in deep learning, the matrix size is small. Besides, in many other fields, such as astrophysics, metabolic networks, high-order FEM schemes and deep learning, the matrix size is also not large enough to fully utilize the GPU hardware resources. To this end, batched GEMM has been proposed to process a group of small independent GEMMs together. However, prior works optimize only from either the tiling or the batching perspective.
3 | 
4 | In this paper, we propose a coordinated tiling and batching framework for accelerating GEMM on GPUs. Our solution exploits the synergistic interaction between the two optimization knobs. It is composed of two engines: a tiling engine and a batching engine. In the tiling engine, we first design a series of tiling strategies dedicated to the batched GEMM scenario. Then, we design an algorithm to select the tiling strategy for each GEMM. The tiling engine thus generates multiple tiles from the GEMMs. The batching engine is responsible for assigning the tiles to thread blocks. We design a series of batching algorithms to determine the assignment of tiles to thread blocks. Then, we propose a general programming style to describe the coordinated tiling and batching solution. Finally, experimental evaluation results show that our framework can achieve about a 40% performance speedup over the state-of-the-art work.
5 | 


--------------------------------------------------------------------------------
/batching/Makefile:
--------------------------------------------------------------------------------
1 | #GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70
2 | GENCODE_FLAGS =  -gencode arch=compute_70,code=compute_70
3 | 
4 | gemm:gemm.cu kernel.h
5 | 	nvcc  $< -o $@ --std=c++11 -O3 ${GENCODE_FLAGS} -Xptxas -v -res-usage
6 | clean:
7 | 	rm -rf gemm *.o
8 | 


--------------------------------------------------------------------------------
/batching/gemm.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <fstream>
  4 | #include <cublas_v2.h>
  5 | #include "../include/util.h"
  6 | #include "kernel.h"
  7 | 
  8 | #define N_RUNS 10
  9 | 
 10 | /*
 11 |  * Benchmark driver for the batched GEMM kernel gemm_256.
 12 |  *
 13 |  * Usage: gemm <batch size>
 14 |  *
 15 |  * Reads one "M N K" triple per GEMM from ../data/input, allocates the device
 16 |  * matrices (their contents are left uninitialised), chooses a tile shape and
 17 |  * a batching factor per GEMM, launches gemm_256 once as warm-up and N_RUNS
 18 |  * times between two CUDA events, and prints the resulting GFLOP/s figure.
 19 |  * Exits with EXIT_FAILURE when the batch size is missing or not positive, or
 20 |  * when the input file cannot be opened or holds fewer than BATCH triples.
 21 |  */
 22 | int  main (int argc, char** argv) {
 23 | 
 24 | 	ErrChk(cudaSetDevice(0));
 25 | 
 26 | 	if(argc<2){
 27 | 		printf("Usage: input the batch size\n");
 28 | 		exit(EXIT_FAILURE);
 29 | 	}
 30 | 
 31 | 	int BATCH = atoi(argv[1]);
 32 | 	// A non-positive batch would give zero-sized allocations and an empty grid.
 33 | 	if (BATCH <= 0){
 34 | 		printf("Usage: input the batch size\n");
 35 | 		exit(EXIT_FAILURE);
 36 | 	}
 37 | 	//int TLP_thres = atoi(argv[2]);
 38 | 	int TLP_thres = 65536*2;
 39 | 
 40 | 	int *M;
 41 | 	int *N;
 42 | 	int *K;
 43 | 
 44 | 	M = (int*) malloc(BATCH * sizeof(int));
 45 | 	N = (int*) malloc(BATCH * sizeof(int));
 46 | 	K = (int*) malloc(BATCH * sizeof(int));
 47 | 
 48 | 	std::fstream fs;
 49 | 	fs.open("../data/input");
 50 | 	if (!fs.is_open()){
 51 | 		printf("Error opening input\n");
 52 | 		exit(EXIT_FAILURE);
 53 | 	}
 54 | 
 55 | 	//read matrix config; a short or malformed file must not leave sizes unset
 56 | 	for (int i=0; i<BATCH; ++i){
 57 | 		if (!(fs>>M[i]>>N[i]>>K[i])){
 58 | 			printf("Error reading input\n");
 59 | 			exit(EXIT_FAILURE);
 60 | 		}
 61 | 	}
 62 | 
 63 | 	float **A;
 64 | 	float **B;
 65 | 	float **C;
 66 | 
 67 | 	A = (float**) malloc(BATCH * sizeof(float*));
 68 | 	B = (float**) malloc(BATCH * sizeof(float*));
 69 | 	C = (float**) malloc(BATCH * sizeof(float*));
 70 | 
 71 | 	// Cast before multiplying so that large matrices do not overflow int.
 72 | 	for (int i=0; i<BATCH; ++i){
 73 | 		ErrChk(cudaMalloc((void**)&A[i], (size_t)M[i]*K[i]*sizeof(float)));
 74 | 		ErrChk(cudaMalloc((void**)&B[i], (size_t)K[i]*N[i]*sizeof(float)));
 75 | 		ErrChk(cudaMalloc((void**)&C[i], (size_t)M[i]*N[i]*sizeof(float)));
 76 | 	}
 77 | 
 78 | 	// Device-side copies of the per-GEMM matrix pointer arrays.
 79 | 	float **dev_A;
 80 | 	float **dev_B;
 81 | 	float **dev_C;
 82 | 
 83 | 	ErrChk(cudaMalloc((void**)&dev_A, BATCH*sizeof(float*)));
 84 | 	ErrChk(cudaMalloc((void**)&dev_B, BATCH*sizeof(float*)));
 85 | 	ErrChk(cudaMalloc((void**)&dev_C, BATCH*sizeof(float*)));
 86 | 
 87 | 	ErrChk(cudaMemcpy(dev_A, A, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 88 | 	ErrChk(cudaMemcpy(dev_B, B, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 89 | 	ErrChk(cudaMemcpy(dev_C, C, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 90 | 
 91 | 	int *dev_M, *dev_N, *dev_K;
 92 | 	ErrChk(cudaMalloc((void**)&dev_M, BATCH*sizeof(int)));
 93 | 	ErrChk(cudaMalloc((void**)&dev_N, BATCH*sizeof(int)));
 94 | 	ErrChk(cudaMalloc((void**)&dev_K, BATCH*sizeof(int)));
 95 | 
 96 | 	ErrChk(cudaMemcpy(dev_M, M, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 97 | 	ErrChk(cudaMemcpy(dev_N, N, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 98 | 	ErrChk(cudaMemcpy(dev_K, K, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 99 | 
100 | 	float elapsedTime = 0.f;
101 | 	double time=0.f;
102 | 	float gflops_per_sec = 0.f;
103 | 	double gflops = 0.f;
104 | 	for (int i=0; i<BATCH; ++i)
105 | 		gflops += ((2 * int64_t(M[i]) * int64_t(N[i]) * int64_t(K[i])) + (2 * int64_t(M[i]) * int64_t(N[i])) ) / 1.0e9;
106 | 	cudaEvent_t start, stop;
107 | 
108 | 	//Tiling Strategy
109 | 	int TLP = 0;
110 | 
111 | 	const int tile_size[6][2] = {
112 | 		16, 16,
113 | 		32, 32,
114 | 		64, 64,
115 | 		128, 64,
116 | 		64, 128,
117 | 		128, 128
118 | 	};
119 | 
120 | 	int *t_strategy;
121 | 	t_strategy = (int*) malloc(BATCH * sizeof(int));
122 | 
123 | 	// Pick the first tile shape (smallest first) whose thread count drops below the threshold.
124 | 	int t;
125 | 	for (t=0; t<6; ++t){
126 | 		TLP = 0;
127 | 		for (int j=0; j<BATCH; ++j)
128 | 			TLP += (M[j]/tile_size[t][0])*(N[j]/tile_size[t][1])*256;
129 | 
130 | 		if (TLP < TLP_thres)
131 | 			break;
132 | 	}
133 | 	// No shape got below the threshold: use the largest one.
134 | 	t = (t==6?5:t);
135 | 
136 | 	for (int j=0; j<BATCH; ++j){
137 | 
138 | 		t_strategy[j] = 0;
139 | 
140 | 		if (tile_size[t][0] <= M[j] && tile_size[t][1] <= N[j])
141 | 			t_strategy[j] = t;
142 | 		else{
143 | 			// The chosen tile does not fit: take the last smaller shape whose height equals M[j].
144 | 			for (int k=0; k<t; ++k){
145 | 				if (tile_size[k][0] == M[j] && tile_size[k][1] <= N[j]){
146 | 					t_strategy[j] = k;
147 | 				}
148 | 			}
149 | 		}
150 | 	}
151 | 
152 | 	int *dev_T;
153 | 	ErrChk(cudaMalloc((void**)&dev_T, BATCH*sizeof(int)));
154 | 	ErrChk(cudaMemcpy(dev_T, t_strategy, BATCH*sizeof(int), cudaMemcpyHostToDevice));
155 | 
156 | /*
157 | 	//print the obtained tiling strategy
158 | 	for (int j=0; j<BATCH; ++j)
159 | 		printf("%d ", t_strategy[j]);
160 | 	printf("\n");
161 | */
162 | 
163 | 	//Batching Strategy
164 | 	int *b_strategy;
165 | 	b_strategy = (int*) malloc(BATCH * sizeof(int));
166 | 
167 | 	for (int j=0; j<BATCH; ++j){
168 | 		b_strategy[j] = 1;
169 | 	}
170 | 
171 | 	for (int j=0; j<BATCH; ++j){
172 | 		const int tile_m = tile_size[t_strategy[j]][0];
173 | 		const int tile_n = tile_size[t_strategy[j]][1];
174 | 		TLP -= M[j]/2/tile_m*N[j]/tile_n;
175 | 		// A block with b=2 processes two consecutive tiles along M, so pairing is
176 | 		// only valid when the number of row tiles is even (and at least two);
177 | 		// otherwise a tile would be skipped or addressed past the end of the matrix.
178 | 		if (TLP > TLP_thres && M[j] >= 2*tile_m && M[j] % (2*tile_m) == 0 && K[j]<=32)
179 | 			b_strategy[j] = 2;
180 | 	}
181 | 
182 | 	int *dev_Ba;
183 | 	ErrChk(cudaMalloc((void**)&dev_Ba, BATCH*sizeof(int)));
184 | 	ErrChk(cudaMemcpy(dev_Ba, b_strategy, BATCH*sizeof(int), cudaMemcpyHostToDevice));
185 | 
186 | /*
187 | 	//print the obtained batching strategy
188 | 	for (int j=0; j<BATCH; ++j)
189 | 		printf("%d ", b_strategy[j]);
190 | 	printf("\n");
191 | */
192 | 
193 | 	//GEMM
194 | 	dim3 block_size;
195 | 	block_size.x = 256;
196 | 	block_size.y = 1;
197 | 	block_size.z = 1;
198 | 
199 | 	// The grid must cover the GEMM with the most tiles in each direction. Only the
200 | 	// M direction is divided by the batching factor, matching how gemm_256 uses it.
201 | 	int grid_x = 0;
202 | 	int grid_y = 0;
203 | 	for (int j=0; j<BATCH; ++j){
204 | 		const int tiles_x = M[j]/b_strategy[j]/tile_size[t_strategy[j]][0];
205 | 		const int tiles_y = N[j]/tile_size[t_strategy[j]][1];
206 | 		if (tiles_x > grid_x) grid_x = tiles_x;
207 | 		if (tiles_y > grid_y) grid_y = tiles_y;
208 | 	}
209 | 
210 | 	dim3 grid_size;
211 | 	grid_size.x = grid_x;
212 | 	grid_size.y = grid_y;
213 | 	grid_size.z = BATCH;
214 | 
215 | //	printf("%d %d %d\n", grid_size.x, grid_size.y, grid_size.z);
216 | 
217 | 	//warm-up
218 | 	gemm_256<<<grid_size, block_size, sizeof(float)*4*128*8>>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T, dev_Ba);
219 | 	KernelErrChk();
220 | 
221 | 	// Create both events up front so that only the launches sit between the two records.
222 | 	ErrChk(cudaEventCreate(&start));
223 | 	ErrChk(cudaEventCreate(&stop));
224 | 	ErrChk(cudaEventRecord(start,0));
225 | 
226 | 	for (int run = 0; run<N_RUNS; ++run){
227 | 		gemm_256<<<grid_size, block_size, sizeof(float)*4*128*8>>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T, dev_Ba);
228 | 		KernelErrChk();
229 | 	}
230 | 
231 | 	ErrChk(cudaEventRecord(stop,0));
232 | 	ErrChk(cudaEventSynchronize(stop));
233 | 	ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop));
234 | 
235 | 	time = elapsedTime/N_RUNS;
236 | 	time /= 1.0e3; //convert time unit from millisecond to second
237 | 	gflops_per_sec   = gflops / time;
238 | 	printf("%f\n", gflops_per_sec);
239 | 
240 | 	ErrChk(cudaEventDestroy(start));
241 | 	ErrChk(cudaEventDestroy(stop));
242 | 
243 | 	for (int i=0; i<BATCH; ++i){
244 | 		ErrChk(cudaFree(A[i]));
245 | 		ErrChk(cudaFree(B[i]));
246 | 		ErrChk(cudaFree(C[i]));
247 | 	}
248 | 
249 | 	free(M);
250 | 	free(N);
251 | 	free(K);
252 | 	free(A);
253 | 	free(B);
254 | 	free(C);
255 | 	free(t_strategy);
256 | 	free(b_strategy);
257 | 
258 | 	ErrChk(cudaFree(dev_M));
259 | 	ErrChk(cudaFree(dev_N));
260 | 	ErrChk(cudaFree(dev_K));
261 | 	ErrChk(cudaFree(dev_T));
262 | 	ErrChk(cudaFree(dev_Ba));
263 | 
264 | 	ErrChk(cudaFree(dev_A));
265 | 	ErrChk(cudaFree(dev_B));
266 | 	ErrChk(cudaFree(dev_C));
267 | 
268 | 	return 0;
269 | }
270 | 


--------------------------------------------------------------------------------
/batching/k.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # For every batch size, run ./gemm with a second argument that doubles from
 4 | # 16 up to 1024, appending each run's output to a freshly started ./log.
 5 | rm -f log
 6 | for batch in 16 32 64 128 256
 7 | do
 8 | 	thres=16
 9 | 	while [ "$thres" -le 1024 ]
10 | 	do
11 | 		./gemm "$batch" "$thres" >> log
12 | 		thres=$((thres * 2))
13 | 	done
14 | done
15 | 


--------------------------------------------------------------------------------
/batching/kernel.h:
--------------------------------------------------------------------------------
  1 | #include "kernel_128.h"
  2 | #include "kernel_256.h"
  3 | 
  4 | template<int kThreads> // Primary kernel template parameterised on kThreads; its body is empty, so launching it does no work.
  5 | __global__ void gemm(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[], int B_strategy[]){}
  6 | 
  7 | 
  8 | /*
  9 | template<>
 10 | __global__ void gemm<128>(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]){
 11 | 	
 12 | 	extern __shared__ float sh[];
 13 | 
 14 | 	int begin = Tile[blockIdx.x];
 15 | 	int end = Tile[blockIdx.x+1];
 16 | 	int t = T_strategy[blockIdx.z];
 17 | 
 18 | 	//main loop for all tiles assigned to this block
 19 | #pragma unroll
 20 | 	for (int b=begin; b<end; ++b){
 21 | 		
 22 | 		int ind = GEMM[b];
 23 | 		int m = M[ind];
 24 | 		int n = N[ind];
 25 | 		int k = K[ind];
 26 | 		
 27 | 		float *a = A[ind];
 28 | 		float *b = B[ind];
 29 | 		float *c = C[ind];
 30 | 	
 31 | 		int by = Y_Coord[ind];
 32 | 		int bx = X_Coord[ind];	
 33 | 
 34 | 		switch(t){
 35 | 			case 0:
 36 | 				if (blockIdx.x * 16 < M[i] && blockIdx.y * 16 < N[i])	
 37 | 					gemm_128_16x16(M[i], N[i], K[i], A[i], B[i], C[i], sh);
 38 | 				break;
 39 | 			case 1:
 40 | 				if (blockIdx.x * 32 < M[i] && blockIdx.y * 32 < N[i])	
 41 | 					gemm_128_32x32(M[i], N[i], K[i], A[i], B[i], C[i], sh);
 42 | 				break;
 43 | 			case 2:
 44 | 				if (blockIdx.x * 64 < M[i] && blockIdx.y * 64 < N[i])	
 45 | 					gemm_128_64x64(M[i], N[i], K[i], A[i], B[i], C[i], sh);
 46 | 				break;
 47 | 			case 3:
 48 | 				if (blockIdx.x * 128 < M[i] && blockIdx.y * 64 < N[i])	
 49 | 					gemm_128_128x64(M[i], N[i], K[i], A[i], B[i], C[i], sh);
 50 | 				break;
 51 | 			case 4:
 52 | 				if (blockIdx.x * 64 < M[i] && blockIdx.y * 128 < N[i])	
 53 | 					gemm_128_64x128(M[i], N[i], K[i], A[i], B[i], C[i], sh);
 54 | 				break;
 55 | 			case 5:
 56 | 	//			if (blockIdx.x * 128 < M[i] && blockIdx.y * 128 < N[i])	
 57 | 	//				gemm_128_128x128(M[i], N[i], K[i], A[i], B[i], C[i], sh);
 58 | 				break;
 59 | 		}
 60 | 	}
 61 | 
 62 | 	return;
 63 | }
 64 | */
 65 | 
 66 | /*
 67 |  * Batched GEMM kernel. gemm.cu launches it with 256 threads per block and
 68 |  * dynamic shared memory, which is forwarded to the tile routines as sh.
 69 |  *
 70 |  * blockIdx.z selects the GEMM i. T_strategy[i] selects the tile shape
 71 |  * (0: 16x16, 1: 32x32, 2: 64x64, 3: 128x64, 4: 64x128, 5: 128x128) and
 72 |  * B_strategy[i] is the number b of consecutive tiles along M that one block
 73 |  * processes: for j in [0, b) the block handles the tile at M offset
 74 |  * by = (blockIdx.x*b + j) * tile_m and N offset bx = blockIdx.y * tile_n.
 75 |  * The gemm_256_* tile routines are not defined in this file (presumably
 76 |  * kernel_256.h -- confirm).
 77 |  *
 78 |  * The launch grid is shared by all GEMMs of the batch, so every tile is
 79 |  * checked against the bounds of its own matrix before it is processed. The
 80 |  * check depends only on blockIdx and j, so all threads of a block agree on it.
 81 |  */
 82 | //template<>
 83 | __global__ void gemm_256(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[], int B_strategy[]){
 84 | 	
 85 | 	extern __shared__ float sh[];
 86 | 
 87 | 	int i = blockIdx.z;
 88 | 	int t = T_strategy[i];
 89 | 	int b = B_strategy[i];
 90 | 	int by;
 91 | 	int bx;
 92 | 	//main loop for all tiles assigned to this block
 93 | 
 94 | 	for (int j=0; j<b; ++j){
 95 | 		
 96 | 		switch(t){
 97 | 			case 0:
 98 | 				by = blockIdx.x * 16 * b + j*16;
 99 | 				bx = blockIdx.y * 16;
100 | 				if (by < M[i] && bx < N[i])
101 | 					gemm_256_16x16(M[i], N[i], K[i], A[i], B[i], C[i], by, bx, sh);
102 | 				break;
103 | 			case 1:
104 | 				// Same offset formula as every other case: the batching factor scales the block stride exactly once.
105 | 				by = blockIdx.x * 32 * b + j*32;
106 | 				bx = blockIdx.y * 32;
107 | 				if (by < M[i] && bx < N[i])
108 | 					gemm_256_32x32(M[i], N[i], K[i], A[i], B[i], C[i], by, bx, sh);
109 | 				break;
110 | 			case 2:
111 | 				by = blockIdx.x * 64 * b + j*64;
112 | 				bx = blockIdx.y * 64;
113 | 				if (by < M[i] && bx < N[i])
114 | 					gemm_256_64x64(M[i], N[i], K[i], A[i], B[i], C[i], by, bx, sh);
115 | 				break;
116 | 			case 3:
117 | 				by = blockIdx.x * 128 * b + j*128;
118 | 				bx = blockIdx.y * 64;
119 | 				if (by < M[i] && bx < N[i])
120 | 					gemm_256_128x64(M[i], N[i], K[i], A[i], B[i], C[i], by, bx, sh);
121 | 				break;
122 | 			case 4:
123 | 				by = blockIdx.x * 64 * b + j*64;
124 | 				bx = blockIdx.y * 128;
125 | 				if (by < M[i] && bx < N[i])
126 | 					gemm_256_64x128(M[i], N[i], K[i], A[i], B[i], C[i], by, bx, sh);
127 | 				break;
128 | 			case 5:
129 | 				by = blockIdx.x * 128 * b + j*128;
130 | 				bx = blockIdx.y * 128;
131 | 				if (by < M[i] && bx < N[i])
132 | 					gemm_256_128x128(M[i], N[i], K[i], A[i], B[i], C[i], by, bx, sh);
133 | 				break;
134 | 		}
135 | 	}
136 | 
137 | 	return;
138 | }
139 | 


--------------------------------------------------------------------------------
/batching/kernel_128.h:
--------------------------------------------------------------------------------
  1 | __device__ void gemm_128_16x16(int M, int N, int K, float *A, float *B, float *C, float *sh){
  2 | 	// Accumulates A*B into one 16x16 tile of C (C is read, updated with fma, and written back). The tile is chosen by blockIdx; N is not used. Presumably expects 128 threads per block and K a multiple of 8 -- confirm with the caller.
  3 | 	float *sh_A = sh;
  4 | 	float *sh_B = sh + 2*16*8;
  5 | 	// sh_B starts 2*16*8 floats after sh_A; inside each region double_buffer (0 or 128) selects which half is being read.
  6 | 	float2 reg_C;
  7 | 	float2 reg_A;
  8 | 	float reg_B;
  9 | 
 10 | 	// Compute block's starting coordinate
 11 | 	int block_base_x = blockIdx.y*16;
 12 | 	int block_base_y = blockIdx.x*16;
 13 | 
 14 | 	//Load C from global memory to register file (two consecutive floats per thread; the x offset is scaled by M)
 15 | 	float2 *C_start = (float2*) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*2 + (threadIdx.x/8)*M);
 16 | 
 17 | 	reg_C = *C_start;
 18 | 
 19 | 	//load A from global memory to shared memory (one float per thread, consecutive k slices are M floats apart)
 20 | 	float *A_start = A + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M;
 21 | 	*(sh_A + threadIdx.x) = *(A_start);
 22 | 
 23 | 	//load B from global memory to shared memory (one float per thread, consecutive x positions are K floats apart)
 24 | 	float *B_start = B + K*block_base_x + (threadIdx.x/16) + (threadIdx.x%16)*K;
 25 | 	*(sh_B + threadIdx.x) = *(B_start);
 26 | 
 27 | 	// Main loop: 8 k-steps per iteration, reading one shared buffer while the other one is refilled for the next iteration.
 28 | 	int double_buffer = 0;
 29 | #pragma unroll
 30 | 	for(int k=0; k<K; k+=8){
 31 | 		__syncthreads();
 32 | 		int A_offset = double_buffer + (threadIdx.x%8)*2;
 33 | 		int B_offset = double_buffer + (threadIdx.x/8);
 34 | 			
 35 | #pragma unroll
 36 | 		for (int i=0; i<8; i++)	{
 37 | 			
 38 | 			reg_A.x = sh_A[A_offset];
 39 | 			reg_A.y = sh_A[A_offset+1];
 40 | 
 41 | 			reg_B = sh_B[B_offset];
 42 | 
 43 | 			reg_C.x = fma(reg_A.x, reg_B, reg_C.x);
 44 | 			reg_C.y = fma(reg_A.y, reg_B, reg_C.y);
 45 | 			// Both shared tiles store one k slice per 16 floats.
 46 | 			A_offset += 16;
 47 | 			B_offset += 16;
 48 | 		}
 49 | 		// Switch to the other buffer and, unless this was the last slab, fetch the next 8 k-steps into it.
 50 | 		double_buffer ^= 128;
 51 | 
 52 | 		if (k+8 < K){
 53 | 			A_start += 8*M; 
 54 | 			*(sh_A + double_buffer + threadIdx.x) = *(A_start);
 55 | 			B_start += 8; 
 56 | 			*(sh_B + double_buffer + threadIdx.x) = *(B_start);
 57 | 		}
 58 | 	}
 59 | 	// Write the accumulated values back to C.
 60 |     *C_start = reg_C;
 61 | }
 62 | 
 63 | __device__ void gemm_128_32x32(int M, int N, int K, float *A, float *B, float *C, float *sh){
 64 | 	// Accumulates A*B into one 32x32 tile of C (C is read, updated with fma, and written back). The tile is chosen by blockIdx; N is not used. Presumably expects 128 threads per block and K a multiple of 8 -- confirm with the caller.
 65 | 	float *sh_A = sh;
 66 | 	float *sh_B = sh + 2*32*8;
 67 | 	// sh_B starts 2*32*8 floats after sh_A; inside each region double_buffer (0 or 256) selects which half is being read.
 68 | 	float4 reg_C[2];
 69 | 	float4 reg_A;
 70 | 	float  reg_B[2];
 71 | 
 72 | 	// Compute block's starting coordinate
 73 | 	int block_base_x = blockIdx.y*32;
 74 | 	int block_base_y = blockIdx.x*32;
 75 | 
 76 | 	//Load C from global memory to register file (two float4 per thread; the second lies 4*M float4, i.e. 16*M floats, further on)
 77 | 	float4 *C_start = (float4*) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*4 + (threadIdx.x/8)*M);
 78 | 
 79 | 	reg_C[0] = *C_start;
 80 | 	reg_C[1] = *(C_start + 4*M);
 81 | 
 82 | 	//load A from global memory to shared memory (one float2 per thread)
 83 | 	float2 *A_start = (float2*) (A + block_base_y + (threadIdx.x%16)*2 + (threadIdx.x/16)*M);
 84 | 	*((float2*)(sh_A + 2*threadIdx.x)) = *(A_start);
 85 | 
 86 | 	//load B from global memory to shared memory (one float2 per thread, i.e. two consecutive k values)
 87 | 	float2 *B_start = (float2*) (B + K*block_base_x + (threadIdx.x/32)*2 + (threadIdx.x%32)*K);
 88 | 	*((float2*)(sh_B + 2*threadIdx.x)) = *(B_start);
 89 | 
 90 | 	int double_buffer = 0;
 91 | #pragma unroll
 92 | 	for(int k=0; k<K; k+=8){
 93 | 		__syncthreads();
 94 | 		int A_offset = double_buffer + (threadIdx.x%8)*4;
 95 | 		int B_offset = double_buffer + (threadIdx.x/8)*2;
 96 | 			
 97 | #pragma unroll
 98 | 		for (int i=0; i<8; i++)	{
 99 | 			
100 | 			reg_A.x = sh_A[A_offset];
101 | 			reg_A.y = sh_A[A_offset+1];
102 | 			reg_A.z = sh_A[A_offset+2];
103 | 			reg_A.w = sh_A[A_offset+3];
104 | 
105 | 			reg_B[0] = sh_B[B_offset];
106 | 			reg_B[1] = sh_B[B_offset+32];
107 | 
108 | 			reg_C[0].x = fma(reg_A.x, reg_B[0], reg_C[0].x);
109 | 			reg_C[0].y = fma(reg_A.y, reg_B[0], reg_C[0].y);
110 | 			reg_C[0].z = fma(reg_A.z, reg_B[0], reg_C[0].z);
111 | 			reg_C[0].w = fma(reg_A.w, reg_B[0], reg_C[0].w);
112 | 			reg_C[1].x = fma(reg_A.x, reg_B[1], reg_C[1].x);
113 | 			reg_C[1].y = fma(reg_A.y, reg_B[1], reg_C[1].y);
114 | 			reg_C[1].z = fma(reg_A.z, reg_B[1], reg_C[1].z);
115 | 			reg_C[1].w = fma(reg_A.w, reg_B[1], reg_C[1].w);
116 | 			// A advances one 32-float slice; B alternates +1 (second float of the stored pair) and +63 (next group of pairs).
117 | 			A_offset += 32;
118 | 			B_offset += ((i%2)*62 + 1);
119 | 		}
120 | 		// Switch to the other buffer and, unless this was the last slab, fetch the next 8 k-steps into it.
121 | 		double_buffer ^= 256;
122 | 
123 | 		if (k+8 < K){
124 | 			A_start += 4*M; 
125 | 			*((float2*)(sh_A + double_buffer + 2*threadIdx.x)) = *(A_start);
126 | 			B_start += 4; 
127 | 			*((float2*)(sh_B + double_buffer + 2*threadIdx.x)) = *(B_start);
128 | 		}
129 | 	}
130 | 	// Write the accumulated values back to C.
131 |     *C_start = reg_C[0];
132 |     *(C_start + 4*M) = reg_C[1];
133 | }
134 | 
135 | __device__ void gemm_128_64x64(int M, int N, int K, float *A, float *B, float *C, float *sh){
136 | 	// Accumulates A*B into one 64x64 tile of C (C is read, updated with fma, and written back). The tile is chosen by blockIdx; N is not used. Presumably expects 128 threads per block, K a multiple of 8 and M a multiple of 4 -- confirm with the caller.
137 | 	float *sh_A = sh;
138 | 	float *sh_B = sh + 2*64*8;
139 | 	// sh_B starts 2*64*8 floats after sh_A; inside each region double_buffer (0 or 512) selects which half is being read.
140 | 	float4 reg_C[8];
141 | 	float4 reg_A;
142 | 	float  reg_B[8];
143 | 
144 | 	// Compute block's starting coordinate
145 | 	int block_base_x = blockIdx.y*64;
146 | 	int block_base_y = blockIdx.x*64;
147 | 
148 | 	//Load C from global memory to register file (C_start is a float4 pointer, so +M/4 steps M floats)
149 | 	float4 *C_start = (float4*) (C + block_base_x*M + block_base_y + (threadIdx.x%16)*4 + (threadIdx.x/16)*4*M);
150 | 
151 |     reg_C[0] = *C_start;
152 | 	reg_C[1] = *(C_start + M/4);
153 | 	reg_C[2] = *(C_start + M/2);
154 | 	reg_C[3] = *(C_start + 3*M/4);
155 | 	// Second group of four float4, 8*M float4 (32*M floats) further on.
156 | 	C_start += 8*M;
157 | 	reg_C[4] = *(C_start);
158 | 	reg_C[5] = *(C_start + M/4);
159 | 	reg_C[6] = *(C_start + M/2);
160 | 	reg_C[7] = *(C_start + 3*M/4);
161 | 
162 | 	//load A from global memory to shared memory (one float4 per thread)
163 | 	float4 *A_start = (float4*) (A + block_base_y + (threadIdx.x%16)*4 + (threadIdx.x/16)*M); 
164 | 	*((float4*) (sh_A + 4*threadIdx.x)) = *(A_start);
165 | 
166 | 	//load B from global memory to shared memory (one float4 per thread, i.e. four consecutive k values)
167 | 	float4 *B_start = (float4*) (B + K*block_base_x + (threadIdx.x/64)*4 + (threadIdx.x%64)*K); 
168 | 	*((float4*) (sh_B + 4*threadIdx.x)) = *(B_start);
169 | 		
170 | 	int double_buffer = 0;
171 | 
172 | #pragma unroll
173 | 	for(int k=0; k<K; k+=8){
174 | 
175 | 		__syncthreads();
176 | 		int A_offset = double_buffer + (threadIdx.x%16)*4;
177 | 		int B_offset = double_buffer + ((threadIdx.x/16)*16);
178 | 			
179 | #pragma unroll
180 | 		for (int i=0; i<8; ++i)	{
181 | 			
182 | 			reg_A = *((float4*)(sh_A + A_offset));
183 | 			reg_B[0] = sh_B[B_offset];
184 | 			reg_B[1] = sh_B[B_offset+4];
185 | 			reg_B[2] = sh_B[B_offset+8];
186 | 			reg_B[3] = sh_B[B_offset+12];
187 | 			reg_B[4] = sh_B[B_offset+128];
188 | 			reg_B[5] = sh_B[B_offset+132];
189 | 			reg_B[6] = sh_B[B_offset+136];
190 | 			reg_B[7] = sh_B[B_offset+140];
191 | 
192 | 			reg_C[0].x = fma(reg_A.x, reg_B[0], reg_C[0].x);
193 | 			reg_C[1].x = fma(reg_A.x, reg_B[1], reg_C[1].x);
194 | 			reg_C[2].x = fma(reg_A.x, reg_B[2], reg_C[2].x);
195 | 			reg_C[3].x = fma(reg_A.x, reg_B[3], reg_C[3].x);
196 | 			reg_C[4].x = fma(reg_A.x, reg_B[4], reg_C[4].x);
197 | 			reg_C[5].x = fma(reg_A.x, reg_B[5], reg_C[5].x);
198 | 			reg_C[6].x = fma(reg_A.x, reg_B[6], reg_C[6].x);
199 | 			reg_C[7].x = fma(reg_A.x, reg_B[7], reg_C[7].x);
200 | 
201 | 			reg_C[0].y = fma(reg_A.y, reg_B[0], reg_C[0].y);
202 | 			reg_C[1].y = fma(reg_A.y, reg_B[1], reg_C[1].y);
203 | 			reg_C[2].y = fma(reg_A.y, reg_B[2], reg_C[2].y);
204 | 			reg_C[3].y = fma(reg_A.y, reg_B[3], reg_C[3].y);
205 | 			reg_C[4].y = fma(reg_A.y, reg_B[4], reg_C[4].y);
206 | 			reg_C[5].y = fma(reg_A.y, reg_B[5], reg_C[5].y);
207 | 			reg_C[6].y = fma(reg_A.y, reg_B[6], reg_C[6].y);
208 | 			reg_C[7].y = fma(reg_A.y, reg_B[7], reg_C[7].y);
209 | 
210 | 			reg_C[0].z = fma(reg_A.z, reg_B[0], reg_C[0].z);
211 | 			reg_C[1].z = fma(reg_A.z, reg_B[1], reg_C[1].z);
212 | 			reg_C[2].z = fma(reg_A.z, reg_B[2], reg_C[2].z);
213 | 			reg_C[3].z = fma(reg_A.z, reg_B[3], reg_C[3].z);
214 | 			reg_C[4].z = fma(reg_A.z, reg_B[4], reg_C[4].z);
215 | 			reg_C[5].z = fma(reg_A.z, reg_B[5], reg_C[5].z);
216 | 			reg_C[6].z = fma(reg_A.z, reg_B[6], reg_C[6].z);
217 | 			reg_C[7].z = fma(reg_A.z, reg_B[7], reg_C[7].z);
218 | 
219 | 			reg_C[0].w = fma(reg_A.w, reg_B[0], reg_C[0].w);
220 | 			reg_C[1].w = fma(reg_A.w, reg_B[1], reg_C[1].w);
221 | 			reg_C[2].w = fma(reg_A.w, reg_B[2], reg_C[2].w);
222 | 			reg_C[3].w = fma(reg_A.w, reg_B[3], reg_C[3].w);
223 | 			reg_C[4].w = fma(reg_A.w, reg_B[4], reg_C[4].w);
224 | 			reg_C[5].w = fma(reg_A.w, reg_B[5], reg_C[5].w);
225 | 			reg_C[6].w = fma(reg_A.w, reg_B[6], reg_C[6].w);
226 | 			reg_C[7].w = fma(reg_A.w, reg_B[7], reg_C[7].w);
227 | 			// A advances one 64-float slice; B advances by 1, with an extra jump of 252 after the fourth step (into the second group of stored float4s).
228 | 			A_offset += 64;
229 | 			B_offset += ((i==3)*252 + 1);
230 | 		}
231 | 		// Switch to the other buffer and, unless this was the last slab, fetch the next 8 k-steps into it.
232 | 		double_buffer ^= 512;
233 | 
234 | 		if (k+8 < K){
235 | 			A_start += 2*M; 
236 | 			*((float4*) (sh_A + double_buffer + 4*threadIdx.x)) = *(A_start);
237 | 
238 | 			B_start += 2; 
239 | 			*((float4*) (sh_B + double_buffer + 4*threadIdx.x)) = *(B_start);
240 | 		}
241 | 				
242 | 	}
243 | 	C_start -= 8*M; // undo the earlier += 8*M so that the stores mirror the loads
244 |     *C_start = reg_C[0];
245 | 	*(C_start + M/4) = reg_C[1];
246 | 	*(C_start + M/2) = reg_C[2];
247 | 	*(C_start + 3*M/4) = reg_C[3];
248 | 
249 | 	C_start += 8*M;
250 | 	*(C_start) = reg_C[4];
251 | 	*(C_start + M/4) = reg_C[5];
252 | 	*(C_start + M/2) = reg_C[6];
253 | 	*(C_start + 3*M/4) = reg_C[7];
254 | 
255 | }
256 | __device__ void gemm_128_64x128(int M, int N, int K, float *A, float *B, float *C, float *sh){
257 | 	// Accumulates A*B into one tile of C spanning 64 in the blockIdx.x direction and 128 in the blockIdx.y direction (C is read, updated with fma, and written back). N is not used. Presumably expects 128 threads per block, K a multiple of 8 and M a multiple of 4 -- confirm with the caller.
258 | 	float *sh_A = sh;
259 | 	float *sh_B = sh + 2*64*8;
260 | 	// sh_B starts 2*64*8 floats after sh_A; the A region toggles between offsets 0 and 512, the B region between 0 and 1024.
261 | 	float4 reg_C[16];
262 | 	float4 reg_A[2];
263 | 	float reg_B[8];
264 | 
265 | 	// Compute block's starting coordinate
266 | 	int block_base_x = blockIdx.y*128;
267 | 	int block_base_y = blockIdx.x*64;
268 | 
269 | 	//Load C from global memory to register file (C_start is a float4 pointer, so +M/4 steps M floats and +8 steps 32 floats)
270 | 	float4 *C_start = (float4*) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*4 + (threadIdx.x/8)*4*M);
271 | 
272 | 	reg_C[0] = *C_start;
273 | 	reg_C[1] = *(C_start + M/4);
274 | 	reg_C[2] = *(C_start + M/2);
275 | 	reg_C[3] = *(C_start + 3*M/4);
276 | 
277 | 	C_start += 8;
278 | 	reg_C[4] = *C_start;
279 | 	reg_C[5] = *(C_start + M/4);
280 | 	reg_C[6] = *(C_start + M/2);
281 | 	reg_C[7] = *(C_start + 3*M/4);
282 | 	// Move 16*M float4 (64*M floats) on and back to the first 32-float half.
283 | 	C_start += (16*M - 8);
284 | 	reg_C[8] = *C_start;
285 | 	reg_C[9] = *(C_start + M/4);
286 | 	reg_C[10] = *(C_start + M/2);
287 | 	reg_C[11] = *(C_start + 3*M/4);
288 | 
289 | 	C_start += 8;
290 | 	reg_C[12] = *C_start;
291 | 	reg_C[13] = *(C_start + M/4);
292 | 	reg_C[14] = *(C_start + M/2);
293 | 	reg_C[15] = *(C_start + 3*M/4);
294 | 
295 | 	//load A from global memory to shared memory (one float4 per thread)
296 | 	float4 *A_start = (float4*) (A + block_base_y + (threadIdx.x%16)*4 + (threadIdx.x/16)*M); 
297 | 	*((float4*) (sh_A + 4*threadIdx.x)) = *(A_start);
298 | 
299 | 	//load B from global memory to shared memory (two float4 per thread: eight consecutive k values starting at B + K*block_base_x + threadIdx.x*K)
300 | 	float4 *B_start = (float4*) (B + K*block_base_x + threadIdx.x*K); 
301 | 	*((float4*) (sh_B + 4*threadIdx.x)) = *(B_start);
302 | 	*((float4*) (sh_B + 512 + 4*threadIdx.x)) = *(B_start + 1);
303 | 		
304 | 	int double_buffer_A = 0;
305 | 	int double_buffer_B = 0;
306 | #pragma unroll
307 | 	for(int k=0; k<K; k+=8){
308 | 
309 | 		__syncthreads();
310 | 		int A_offset = double_buffer_A + (threadIdx.x%8)*4;
311 | 		int B_offset = double_buffer_B + ((threadIdx.x/8)*16);
312 | 			
313 | #pragma unroll
314 | 		for (int i=0; i<8; ++i)	{
315 | 			
316 | 			reg_A[0] = *((float4*)(sh_A+A_offset));
317 | 			reg_A[1] = *((float4*)(sh_A+A_offset+32));
318 | 
319 | 			reg_B[0] = sh_B[B_offset];
320 | 			reg_B[1] = sh_B[B_offset+4];
321 | 			reg_B[2] = sh_B[B_offset+8];
322 | 			reg_B[3] = sh_B[B_offset+12];
323 | 			reg_B[4] = sh_B[B_offset+256];
324 | 			reg_B[5] = sh_B[B_offset+260];
325 | 			reg_B[6] = sh_B[B_offset+264];
326 | 			reg_B[7] = sh_B[B_offset+268];
327 | 
328 | 			reg_C[0].x = fma(reg_A[0].x, reg_B[0], reg_C[0].x);
329 | 			reg_C[1].x = fma(reg_A[0].x, reg_B[1], reg_C[1].x);
330 | 			reg_C[2].x = fma(reg_A[0].x, reg_B[2], reg_C[2].x);
331 | 			reg_C[3].x = fma(reg_A[0].x, reg_B[3], reg_C[3].x);
332 | 			reg_C[8].x = fma(reg_A[0].x, reg_B[4], reg_C[8].x);
333 | 			reg_C[9].x = fma(reg_A[0].x, reg_B[5], reg_C[9].x);
334 | 			reg_C[10].x = fma(reg_A[0].x, reg_B[6], reg_C[10].x);
335 | 			reg_C[11].x = fma(reg_A[0].x, reg_B[7], reg_C[11].x);
336 | 			reg_C[4].x = fma(reg_A[1].x, reg_B[0], reg_C[4].x);
337 | 			reg_C[5].x = fma(reg_A[1].x, reg_B[1], reg_C[5].x);
338 | 			reg_C[6].x = fma(reg_A[1].x, reg_B[2], reg_C[6].x);
339 | 			reg_C[7].x = fma(reg_A[1].x, reg_B[3], reg_C[7].x);
340 | 			reg_C[12].x = fma(reg_A[1].x, reg_B[4], reg_C[12].x);
341 | 			reg_C[13].x = fma(reg_A[1].x, reg_B[5], reg_C[13].x);
342 | 			reg_C[14].x = fma(reg_A[1].x, reg_B[6], reg_C[14].x);
343 | 			reg_C[15].x = fma(reg_A[1].x, reg_B[7], reg_C[15].x);
344 | 
345 | 			reg_C[0].y = fma(reg_A[0].y, reg_B[0], reg_C[0].y);
346 | 			reg_C[1].y = fma(reg_A[0].y, reg_B[1], reg_C[1].y);
347 | 			reg_C[2].y = fma(reg_A[0].y, reg_B[2], reg_C[2].y);
348 | 			reg_C[3].y = fma(reg_A[0].y, reg_B[3], reg_C[3].y);
349 | 			reg_C[8].y = fma(reg_A[0].y, reg_B[4], reg_C[8].y);
350 | 			reg_C[9].y = fma(reg_A[0].y, reg_B[5], reg_C[9].y);
351 | 			reg_C[10].y = fma(reg_A[0].y, reg_B[6], reg_C[10].y);
352 | 			reg_C[11].y = fma(reg_A[0].y, reg_B[7], reg_C[11].y);
353 | 			reg_C[4].y = fma(reg_A[1].y, reg_B[0], reg_C[4].y);
354 | 			reg_C[5].y = fma(reg_A[1].y, reg_B[1], reg_C[5].y);
355 | 			reg_C[6].y = fma(reg_A[1].y, reg_B[2], reg_C[6].y);
356 | 			reg_C[7].y = fma(reg_A[1].y, reg_B[3], reg_C[7].y);
357 | 			reg_C[12].y = fma(reg_A[1].y, reg_B[4], reg_C[12].y);
358 | 			reg_C[13].y = fma(reg_A[1].y, reg_B[5], reg_C[13].y);
359 | 			reg_C[14].y = fma(reg_A[1].y, reg_B[6], reg_C[14].y);
360 | 			reg_C[15].y = fma(reg_A[1].y, reg_B[7], reg_C[15].y);
361 | 
362 | 			reg_C[0].z = fma(reg_A[0].z, reg_B[0], reg_C[0].z);
363 | 			reg_C[1].z = fma(reg_A[0].z, reg_B[1], reg_C[1].z);
364 | 			reg_C[2].z = fma(reg_A[0].z, reg_B[2], reg_C[2].z);
365 | 			reg_C[3].z = fma(reg_A[0].z, reg_B[3], reg_C[3].z);
366 | 			reg_C[8].z = fma(reg_A[0].z, reg_B[4], reg_C[8].z);
367 | 			reg_C[9].z = fma(reg_A[0].z, reg_B[5], reg_C[9].z);
368 | 			reg_C[10].z = fma(reg_A[0].z, reg_B[6], reg_C[10].z);
369 | 			reg_C[11].z = fma(reg_A[0].z, reg_B[7], reg_C[11].z);
370 | 			reg_C[4].z = fma(reg_A[1].z, reg_B[0], reg_C[4].z);
371 | 			reg_C[5].z = fma(reg_A[1].z, reg_B[1], reg_C[5].z);
372 | 			reg_C[6].z = fma(reg_A[1].z, reg_B[2], reg_C[6].z);
373 | 			reg_C[7].z = fma(reg_A[1].z, reg_B[3], reg_C[7].z);
374 | 			reg_C[12].z = fma(reg_A[1].z, reg_B[4], reg_C[12].z);
375 | 			reg_C[13].z = fma(reg_A[1].z, reg_B[5], reg_C[13].z);
376 | 			reg_C[14].z = fma(reg_A[1].z, reg_B[6], reg_C[14].z);
377 | 			reg_C[15].z = fma(reg_A[1].z, reg_B[7], reg_C[15].z);
378 | 
379 | 			reg_C[0].w = fma(reg_A[0].w, reg_B[0], reg_C[0].w);
380 | 			reg_C[1].w = fma(reg_A[0].w, reg_B[1], reg_C[1].w);
381 | 			reg_C[2].w = fma(reg_A[0].w, reg_B[2], reg_C[2].w);
382 | 			reg_C[3].w = fma(reg_A[0].w, reg_B[3], reg_C[3].w);
383 | 			reg_C[8].w = fma(reg_A[0].w, reg_B[4], reg_C[8].w);
384 | 			reg_C[9].w = fma(reg_A[0].w, reg_B[5], reg_C[9].w);
385 | 			reg_C[10].w = fma(reg_A[0].w, reg_B[6], reg_C[10].w);
386 | 			reg_C[11].w = fma(reg_A[0].w, reg_B[7], reg_C[11].w);
387 | 			reg_C[4].w = fma(reg_A[1].w, reg_B[0], reg_C[4].w);
388 | 			reg_C[5].w = fma(reg_A[1].w, reg_B[1], reg_C[5].w);
389 | 			reg_C[6].w = fma(reg_A[1].w, reg_B[2], reg_C[6].w);
390 | 			reg_C[7].w = fma(reg_A[1].w, reg_B[3], reg_C[7].w);
391 | 			reg_C[12].w = fma(reg_A[1].w, reg_B[4], reg_C[12].w);
392 | 			reg_C[13].w = fma(reg_A[1].w, reg_B[5], reg_C[13].w);
393 | 			reg_C[14].w = fma(reg_A[1].w, reg_B[6], reg_C[14].w);
394 | 			reg_C[15].w = fma(reg_A[1].w, reg_B[7], reg_C[15].w);
395 | 			// A advances one 64-float slice; B advances by 1, with an extra jump of 508 after the fourth step (into the float4s stored at +512).
396 | 			A_offset += 64;
397 | 			if (i==3) B_offset += 508;
398 | 			B_offset += 1;
399 | 		}
400 | 		// Switch both regions to their other buffer and, unless this was the last slab, fetch the next 8 k-steps.
401 | 		double_buffer_A ^= 512;
402 | 		double_buffer_B ^= 1024;
403 | 
404 | 		if (k+8 < K){
405 | 			A_start += 2*M; 
406 | 			*((float4*) (sh_A + double_buffer_A + 4*threadIdx.x)) = *(A_start);
407 | 
408 | 			B_start += 2; 
409 | 			*((float4*) (sh_B + double_buffer_B + 4*threadIdx.x)) = *(B_start);
410 | 			*((float4*) (sh_B + double_buffer_B + 512 + 4*threadIdx.x)) = *(B_start + 1);
411 | 		}
412 | 				
413 | 	}
414 | 	C_start -= (16*M + 8); // net displacement of the three increments made while loading (8 + (16*M - 8) + 8)
415 |     *C_start = reg_C[0];
416 | 	*(C_start + M/4) = reg_C[1];
417 | 	*(C_start + M/2) = reg_C[2];
418 | 	*(C_start + 3*M/4) = reg_C[3];
419 | 
420 | 	C_start += 8;
421 | 	*(C_start) = reg_C[4];
422 | 	*(C_start + M/4) = reg_C[5];
423 | 	*(C_start + M/2) = reg_C[6];
424 | 	*(C_start + 3*M/4) = reg_C[7];
425 | 
426 | 	C_start += (16*M - 8);
427 | 	*(C_start) = reg_C[8];
428 | 	*(C_start + M/4) = reg_C[9];
429 | 	*(C_start + M/2) = reg_C[10];
430 | 	*(C_start + 3*M/4) = reg_C[11];
431 | 
432 | 	C_start += 8;
433 | 	*(C_start) = reg_C[12];
434 | 	*(C_start + M/4) = reg_C[13];
435 | 	*(C_start + M/2) = reg_C[14];
436 | 	*(C_start + 3*M/4) = reg_C[15];
437 | }
438 | 
// Accumulates the 128x64 tile of C += A*B selected by (blockIdx.x, blockIdx.y).
//
// Indexing used below: A[k*M + m], B[n*K + k], C[n*M + m]; N is never read.
// Every thread keeps four 4x4 quadrants of C in registers (rows r..r+3 and
// r+64..r+67, columns c..c+3 and c+32..c+35) and walks K in steps of 8.
// A and B panels are staged through `sh` in two alternating stages:
//   sh[0 .. 2*128*8)          A stages (128 rows x 8 k-values each)
//   sh[2*128*8 .. +2*64*8)    B stages (64 columns x 8 k-values each)
// NOTE(review): the 4*threadIdx.x staging stride and the function name suggest
// 128 threads per block, M divisible by 4 and K divisible by 8 - confirm
// against the launching kernel.
__device__ void gemm_128_128x64(int M, int N, int K, float *A, float *B, float *C, float *sh){

	float *tile_A = sh;
	float *tile_B = sh + 2*128*8;

	const unsigned int tid = threadIdx.x;

	float4 acc[16];
	float a_frag[8];
	float b_frag[8];

	// Origin of this block's tile inside C
	const int tile_col = blockIdx.y*64;
	const int tile_row = blockIdx.x*128;

	// Bring the thread's four quadrants of C into registers.
	// Quadrant q: (q%2) selects the +64-row half, (q/2) the +32-column half.
	float4 *C_base = (float4*) (C + tile_col*M + tile_row + (tid%16)*4 + (tid/16)*4*M);

#pragma unroll
	for (int q = 0; q < 4; ++q){
		float4 *C_quad = C_base + (q%2)*16 + (q/2)*8*M;
#pragma unroll
		for (int c = 0; c < 4; ++c)
			acc[4*q + c] = *(C_quad + c*M/4);   // M/4 float4 == one column of C
	}

	// Stage the first A panel: two float4 per thread (k = 0..3 and k = 4..7)
	float4 *A_src = (float4*) (A + tile_row + (tid%32)*4 + (tid/32)*M);
	float4 *A_dst = (float4*) tile_A + tid;
	A_dst[0]   = A_src[0];
	A_dst[128] = A_src[M];

	// Stage the first B panel: one float4 (4 consecutive k) per thread
	float4 *B_src = (float4*) (B + K*tile_col + (tid/64)*4 + (tid%64)*K);
	*((float4*) tile_B + tid) = *B_src;

	int stage_A = 0;
	int stage_B = 0;
#pragma unroll
	for (int k = 0; k < K; k += 8){

		// Makes the stage written in the previous step visible, and guarantees
		// nobody is still reading the stage that is overwritten further down.
		__syncthreads();
		int a_idx = stage_A + (tid%16)*4;
		int b_idx = stage_B + (tid/16)*16;

#pragma unroll
		for (int i = 0; i < 8; ++i){

			// Fragments: 2 row halves x 4 rows of A, 2 column halves x 4 columns of B
#pragma unroll
			for (int h = 0; h < 2; ++h){
#pragma unroll
				for (int e = 0; e < 4; ++e){
					a_frag[4*h + e] = tile_A[a_idx + 64*h + e];
					b_frag[4*h + e] = tile_B[b_idx + 128*h + 4*e];
				}
			}

			// Rank-1 update of all four quadrants
#pragma unroll
			for (int rh = 0; rh < 2; ++rh){
#pragma unroll
				for (int ch = 0; ch < 2; ++ch){
#pragma unroll
					for (int c = 0; c < 4; ++c){
						const int q = 4*(rh + 2*ch) + c;
						const float b = b_frag[4*ch + c];
						acc[q].x = fma(a_frag[4*rh + 0], b, acc[q].x);
						acc[q].y = fma(a_frag[4*rh + 1], b, acc[q].y);
						acc[q].z = fma(a_frag[4*rh + 2], b, acc[q].z);
						acc[q].w = fma(a_frag[4*rh + 3], b, acc[q].w);
					}
				}
			}

			a_idx += 128;
			// B is staged as two groups of 4 k-values, 256 floats apart
			b_idx += (i == 3) ? 253 : 1;
		}

		stage_A ^= 1024;
		stage_B ^= 512;

		// Prefetch the next K-slab into the other stage
		if (k + 8 < K){
			A_src += 2*M;
			float4 *A_next = (float4*) (tile_A + stage_A) + tid;
			A_next[0]   = A_src[0];
			A_next[128] = A_src[M];

			B_src += 2;
			*((float4*) (tile_B + stage_B) + tid) = *B_src;
		}
	}

	// Write the accumulated quadrants back to C
#pragma unroll
	for (int q = 0; q < 4; ++q){
		float4 *C_quad = C_base + (q%2)*16 + (q/2)*8*M;
#pragma unroll
		for (int c = 0; c < 4; ++c)
			*(C_quad + c*M/4) = acc[4*q + c];
	}
}
627 | 


--------------------------------------------------------------------------------
/batching/kernel_256.h:
--------------------------------------------------------------------------------
  1 | __device__ void gemm_256_16x16(int M, int N, int K, float *A, float *B, float *C, int block_base_y, int block_base_x, float *sh){
  2 | 
  3 | 	float *sh_A = sh;
  4 | 	float *sh_B = sh + 2*16*16;
  5 | 
  6 | 	float reg_C;
  7 | 	float reg_A;
  8 | 	float reg_B;
  9 | 
 10 | 	//Load C from global memory to register file
 11 | 	float *C_start = (C + block_base_x*M + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M);
 12 | 
 13 |     reg_C = *C_start;
 14 | 
 15 | 	//load A from global memory to shared memory
 16 | 	float *A_start = (A + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M); 
 17 | 	* (sh_A + threadIdx.x) = *(A_start);
 18 | 
 19 | 	//load B from global memory to shared memory
 20 | 	float *B_start = (B + K*block_base_x + (threadIdx.x/16) + (threadIdx.x%16)*K); 
 21 | 	* (sh_B + threadIdx.x) = *(B_start);
 22 | 		
 23 | 	int double_buffer = 0;
 24 | #pragma unroll
 25 | 	for(int k=0; k<K; k+=16){
 26 | 
 27 | 		__syncthreads();
 28 | 		int A_offset = double_buffer + (threadIdx.x%16);
 29 | 		int B_offset = double_buffer + (threadIdx.x/16);
 30 | 			
 31 | #pragma unroll
 32 | 		for (int i=0; i<16; ++i)	{
 33 | 			reg_A = sh_A[A_offset]; 
 34 | 			reg_B = sh_B[B_offset]; 
 35 | 			reg_C = fma(reg_A, reg_B, reg_C);
 36 | 
 37 | 			A_offset += 16;
 38 | 			B_offset += 16;
 39 | 		}
 40 | 
 41 | 		double_buffer ^= 256;
 42 | 
 43 | 		if (k+16 < K){
 44 | 			A_start += 16*M; 
 45 | 			* (sh_A + double_buffer + threadIdx.x) = *(A_start);
 46 | 
 47 | 			B_start += 16; 
 48 | 			* (sh_B + double_buffer + threadIdx.x) = *(B_start);
 49 | 		}
 50 | 				
 51 | 	}
 52 | 	*(C_start) = reg_C;
 53 | }
 54 | 
 55 | __device__ void gemm_256_32x32(int M, int N, int K, float *A, float *B, float *C, int block_base_y, int block_base_x, float *sh){
 56 | 
 57 | 	float *sh_A = sh;
 58 | 	float *sh_B = sh + 2*32*8;
 59 | 
 60 | 	float4 reg_C;
 61 | 	float4 reg_A;
 62 | 	float  reg_B;
 63 | 
 64 | 	//Load C from global memory to register file
 65 | 	float4 *C_start = (float4 *) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*4 + (threadIdx.x/8)*M);
 66 | 
 67 |     reg_C = *C_start;
 68 | 
 69 | 	//load B from global memory to shared memory
 70 | 	float *A_start = (A + block_base_y + (threadIdx.x%32) + (threadIdx.x/32)*M); 
 71 | 	* (sh_A + threadIdx.x) = *(A_start);
 72 | 
 73 | 	//load A from global memory to shared memory
 74 | 	float *B_start = (B + K*block_base_x + (threadIdx.x/32) + (threadIdx.x%32)*K); 
 75 | 	* (sh_B + threadIdx.x) = *(B_start);
 76 | 
 77 | 	int double_buffer = 0;
 78 | #pragma unroll
 79 | 	for(int k=0; k<K; k+=8){
 80 | 
 81 | 		__syncthreads();
 82 | 		int A_offset = double_buffer + (threadIdx.x%8)*4;
 83 | 		int B_offset = double_buffer + (threadIdx.x/8);
 84 | 			
 85 | #pragma unroll
 86 | 		for (int i=0; i<8; ++i)	{
 87 | 			reg_A = *((float4*) (sh_A + A_offset)); 
 88 | 			reg_B = sh_B[B_offset]; 
 89 | 
 90 | 			reg_C.x = fma(reg_A.x, reg_B, reg_C.x);
 91 | 			reg_C.y = fma(reg_A.y, reg_B, reg_C.y);
 92 | 			reg_C.z = fma(reg_A.z, reg_B, reg_C.z);
 93 | 			reg_C.w = fma(reg_A.w, reg_B, reg_C.w);
 94 | 
 95 | 			A_offset += 32;
 96 | 			B_offset += 32;
 97 | 		}
 98 | 
 99 | 		double_buffer ^= 256;
100 | 
101 | 		if (k+8 < K){
102 | 			A_start += 8*M; 
103 | 			* (sh_A + double_buffer + threadIdx.x) = *(A_start);
104 | 
105 | 			B_start += 8; 
106 | 			* (sh_B + double_buffer + threadIdx.x) = *(B_start);
107 | 		}
108 | 				
109 | 	}
110 | 	*(C_start) = reg_C;
111 | 
112 | }
113 | 
// Accumulates one 64x64 tile of C += A*B; each thread owns a 2x2 grid of
// float4 strips: rows r..r+3 and r+32..r+35 in columns c and c+32.
//
// Indexing used below: A[k*M + m], B[n*K + k], C[n*M + m]; N is never read.
// block_base_y / block_base_x are the tile's first row / first column in C.
// K is consumed 8 values at a time through two alternating stages in `sh`:
// A stages start at sh, B stages at sh + 2*64*8.
// NOTE(review): the indexing suggests 256 threads per block, M divisible by 4
// and K divisible by 8 - confirm against the caller.
__device__ void gemm_256_64x64(int M, int N, int K, float *A, float *B, float *C, int block_base_y, int block_base_x, float *sh){

	float *tile_A = sh;
	float *tile_B = sh + 2*64*8;

	const unsigned int tid = threadIdx.x;

	float4 acc[4];
	float4 a_frag[2];
	float  b_frag[2];

	// acc[rh + 2*ch]: rh picks the +32-row half, ch the +32-column half
	float4 *C_base = (float4*) (C + block_base_x*M + block_base_y + (tid%8)*4 + (tid/8)*M);
#pragma unroll
	for (int ch = 0; ch < 2; ++ch){
#pragma unroll
		for (int rh = 0; rh < 2; ++rh)
			acc[rh + 2*ch] = *(C_base + 8*rh + 8*M*ch);
	}

	// First A slab: two consecutive rows per thread at k = tid/32
	float2 *A_src = (float2*) (A + block_base_y + (tid%32)*2 + (tid/32)*M);
	*((float2*) tile_A + tid) = *A_src;

	// First B slab: two consecutive k-values per thread for column tid%64
	float2 *B_src = (float2*) (B + K*block_base_x + (tid/64)*2 + (tid%64)*K);
	*((float2*) tile_B + tid) = *B_src;

	int stage = 0;
#pragma unroll
	for (int k = 0; k < K; k += 8){

		// Publishes the stage filled last step and protects the one refilled below
		__syncthreads();
		int a_idx = stage + (tid%8)*4;
		int b_idx = stage + (tid/8)*2;

#pragma unroll
		for (int i = 0; i < 8; ++i){
#pragma unroll
			for (int h = 0; h < 2; ++h){
				a_frag[h] = *((float4*) (tile_A + a_idx + 32*h));
				b_frag[h] = tile_B[b_idx + 64*h];
			}

#pragma unroll
			for (int ch = 0; ch < 2; ++ch){
#pragma unroll
				for (int rh = 0; rh < 2; ++rh){
					const int q = rh + 2*ch;
					acc[q].x = fma(a_frag[rh].x, b_frag[ch], acc[q].x);
					acc[q].y = fma(a_frag[rh].y, b_frag[ch], acc[q].y);
					acc[q].z = fma(a_frag[rh].z, b_frag[ch], acc[q].z);
					acc[q].w = fma(a_frag[rh].w, b_frag[ch], acc[q].w);
				}
			}

			a_idx += 64;
			// B is staged as pairs of k-values, 128 floats between pairs
			b_idx += (i%2) ? 127 : 1;
		}

		stage ^= 512;

		// Fetch the next slab into the stage that is now idle
		if (k + 8 < K){
			A_src += 4*M;
			*((float2*) (tile_A + stage) + tid) = *A_src;

			B_src += 4;
			*((float2*) (tile_B + stage) + tid) = *B_src;
		}
	}

	// Write the four accumulated strips back to C
#pragma unroll
	for (int ch = 0; ch < 2; ++ch){
#pragma unroll
		for (int rh = 0; rh < 2; ++rh)
			*(C_base + 8*rh + 8*M*ch) = acc[rh + 2*ch];
	}
}
194 | 
// Accumulates one 128x64 tile of C += A*B; each thread owns two 4x4 patches of
// C: rows r..r+3 and r+64..r+67 across the same four consecutive columns.
//
// Indexing used below: A[k*M + m], B[n*K + k], C[n*M + m]; N is never read.
// block_base_y / block_base_x are the tile's first row / first column in C.
// K is consumed 8 values at a time through two alternating stages in `sh`:
// A stages (128x8 each) start at sh, B stages (64x8 each) at sh + 2*128*8.
// NOTE(review): the indexing suggests 256 threads per block, M divisible by 4
// and K divisible by 8 - confirm against the caller.
__device__ void gemm_256_128x64(int M, int N, int K, float *A, float *B, float *C, int block_base_y, int block_base_x, float *sh){

	float *tile_A = sh;
	float *tile_B = sh + 2*128*8;

	const unsigned int tid = threadIdx.x;

	float4 acc[8];
	float4 a_frag[2];
	float  b_frag[4];

	// acc[4*rh + c]: rh picks the +64-row half, c the column inside the patch
	float4 *C_base = (float4*) (C + block_base_x*M + block_base_y + (tid%16)*4 + (tid/16)*4*M);
#pragma unroll
	for (int rh = 0; rh < 2; ++rh){
#pragma unroll
		for (int c = 0; c < 4; ++c)
			acc[4*rh + c] = *(C_base + 16*rh + c*M/4);   // M/4 float4 == one column
	}

	// First A slab: one float4 (4 rows) per thread at k = tid/32
	float4 *A_src = (float4*) (A + block_base_y + (tid%32)*4 + (tid/32)*M);
	*((float4*) tile_A + tid) = *A_src;

	// First B slab: one float2 (2 consecutive k) per thread for column tid%64
	float2 *B_src = (float2*) (B + K*block_base_x + (tid/64)*2 + (tid%64)*K);
	*((float2*) tile_B + tid) = *B_src;

	int stage_A = 0;
	int stage_B = 0;
#pragma unroll
	for (int k = 0; k < K; k += 8){

		// Publishes the stage filled last step and protects the one refilled below
		__syncthreads();
		int a_idx = stage_A + (tid%16)*4;
		int b_idx = stage_B + (tid/16)*8;

#pragma unroll
		for (int i = 0; i < 8; ++i){

#pragma unroll
			for (int rh = 0; rh < 2; ++rh)
				a_frag[rh] = *((float4*) (tile_A + a_idx + 64*rh));

#pragma unroll
			for (int c = 0; c < 4; ++c)
				b_frag[c] = tile_B[b_idx + 2*c];

#pragma unroll
			for (int rh = 0; rh < 2; ++rh){
#pragma unroll
				for (int c = 0; c < 4; ++c){
					const int q = 4*rh + c;
					acc[q].x = fma(a_frag[rh].x, b_frag[c], acc[q].x);
					acc[q].y = fma(a_frag[rh].y, b_frag[c], acc[q].y);
					acc[q].z = fma(a_frag[rh].z, b_frag[c], acc[q].z);
					acc[q].w = fma(a_frag[rh].w, b_frag[c], acc[q].w);
				}
			}

			a_idx += 128;
			// B is staged as pairs of k-values, 128 floats between pairs
			b_idx += (i%2) ? 127 : 1;
		}

		stage_A ^= 1024;
		stage_B ^= 512;

		// Fetch the next slab into the stages that are now idle
		if (k + 8 < K){
			A_src += 2*M;
			*((float4*) (tile_A + stage_A) + tid) = *A_src;

			B_src += 4;
			*((float2*) (tile_B + stage_B) + tid) = *B_src;
		}
	}

	// Write both accumulated patches back to C
#pragma unroll
	for (int rh = 0; rh < 2; ++rh){
#pragma unroll
		for (int c = 0; c < 4; ++c)
			*(C_base + 16*rh + c*M/4) = acc[4*rh + c];
	}
}
311 | 
// Accumulates one 64x128 tile of C += A*B; each thread owns two 4x4 patches of
// C: rows r..r+3 and r+32..r+35 across the same four consecutive columns.
//
// Indexing used below: A[k*M + m], B[n*K + k], C[n*M + m]; N is never read.
// block_base_y / block_base_x are the tile's first row / first column in C.
// K is consumed 8 values at a time through two alternating stages in `sh`:
// A stages (64x8 each) start at sh, B stages (128x8 each) at sh + 2*64*8.
// NOTE(review): the indexing suggests 256 threads per block, M divisible by 4
// and K divisible by 8 - confirm against the caller.
__device__ void gemm_256_64x128(int M, int N, int K, float *A, float *B, float *C, int block_base_y, int block_base_x, float *sh){

	float *tile_A = sh;
	float *tile_B = sh + 2*64*8;

	const unsigned int tid = threadIdx.x;

	float4 acc[8];
	float4 a_frag[2];
	float  b_frag[4];

	// acc[4*rh + c]: rh picks the +32-row half, c the column inside the patch
	float4 *C_base = (float4*) (C + block_base_x*M + block_base_y + (tid%8)*4 + (tid/8)*4*M);
#pragma unroll
	for (int rh = 0; rh < 2; ++rh){
#pragma unroll
		for (int c = 0; c < 4; ++c)
			acc[4*rh + c] = *(C_base + 8*rh + c*M/4);   // M/4 float4 == one column
	}

	// First A slab: one float2 (2 rows) per thread at k = tid/32
	float2 *A_src = (float2*) (A + block_base_y + (tid%32)*2 + (tid/32)*M);
	*((float2*) tile_A + tid) = *A_src;

	// First B slab: one float4 (4 consecutive k) per thread for column tid%128
	float4 *B_src = (float4*) (B + K*block_base_x + (tid/128)*4 + (tid%128)*K);
	*((float4*) tile_B + tid) = *B_src;

	int stage_A = 0;
	int stage_B = 0;
#pragma unroll
	for (int k = 0; k < K; k += 8){

		// Publishes the stage filled last step and protects the one refilled below
		__syncthreads();
		int a_idx = stage_A + (tid%8)*4;
		int b_idx = stage_B + (tid/8)*16;

#pragma unroll
		for (int i = 0; i < 8; ++i){

#pragma unroll
			for (int rh = 0; rh < 2; ++rh)
				a_frag[rh] = *((float4*) (tile_A + a_idx + 32*rh));

#pragma unroll
			for (int c = 0; c < 4; ++c)
				b_frag[c] = tile_B[b_idx + 4*c];

#pragma unroll
			for (int rh = 0; rh < 2; ++rh){
#pragma unroll
				for (int c = 0; c < 4; ++c){
					const int q = 4*rh + c;
					acc[q].x = fma(a_frag[rh].x, b_frag[c], acc[q].x);
					acc[q].y = fma(a_frag[rh].y, b_frag[c], acc[q].y);
					acc[q].z = fma(a_frag[rh].z, b_frag[c], acc[q].z);
					acc[q].w = fma(a_frag[rh].w, b_frag[c], acc[q].w);
				}
			}

			a_idx += 64;
			// B is staged as two groups of 4 k-values, 512 floats apart
			b_idx += (i == 3) ? 509 : 1;
		}

		stage_A ^= 512;
		stage_B ^= 1024;

		// Fetch the next slab into the stages that are now idle
		if (k + 8 < K){
			A_src += 4*M;
			*((float2*) (tile_A + stage_A) + tid) = *A_src;

			B_src += 2;
			*((float4*) (tile_B + stage_B) + tid) = *B_src;
		}
	}

	// Write both accumulated patches back to C
#pragma unroll
	for (int rh = 0; rh < 2; ++rh){
#pragma unroll
		for (int c = 0; c < 4; ++c)
			*(C_base + 8*rh + c*M/4) = acc[4*rh + c];
	}
}
427 | 
// Accumulates one 128x128 tile of C += A*B; each thread owns four 4x4
// quadrants of C (rows r..r+3 and r+64..r+67, columns c..c+3 and c+64..c+67).
//
// Indexing used below: A[k*M + m], B[n*K + k], C[n*M + m]; N is never read.
// block_base_y / block_base_x are the tile's first row / first column in C.
// K is consumed 8 values at a time through two alternating stages in `sh`:
// A stages (128x8 each) start at sh, B stages (128x8 each) at sh + 2*128*8.
// NOTE(review): the indexing suggests 256 threads per block, M divisible by 4
// and K divisible by 8 - confirm against the caller.
__device__ void gemm_256_128x128(int M, int N, int K, float *A, float *B, float *C, int block_base_y, int block_base_x, float *sh){

	float *tile_A = sh;
	float *tile_B = sh + 2*128*8;

	const unsigned int tid = threadIdx.x;

	float4 acc[16];
	float a_frag[8];
	float b_frag[8];

	// Bring the thread's four quadrants of C into registers.
	// Quadrant q: (q%2) selects the +64-row half, (q/2) the +64-column half.
	float4 *C_base = (float4*) (C + block_base_x*M + block_base_y + (tid%16)*4 + (tid/16)*4*M);

#pragma unroll
	for (int q = 0; q < 4; ++q){
		float4 *C_quad = C_base + (q%2)*16 + (q/2)*16*M;
#pragma unroll
		for (int c = 0; c < 4; ++c)
			acc[4*q + c] = *(C_quad + c*M/4);   // M/4 float4 == one column of C
	}

	// First A slab: one float4 (4 rows) per thread at k = tid/32
	float4 *A_src = (float4*) (A + block_base_y + (tid%32)*4 + (tid/32)*M);
	*((float4*) tile_A + tid) = *A_src;

	// First B slab: one float4 (4 consecutive k) per thread for column tid%128
	float4 *B_src = (float4*) (B + K*block_base_x + (tid/128)*4 + (tid%128)*K);
	*((float4*) tile_B + tid) = *B_src;

	int stage = 0;
#pragma unroll
	for (int k = 0; k < K; k += 8){

		// Publishes the stage filled last step and protects the one refilled below
		__syncthreads();
		int a_idx = stage + (tid%16)*4;
		int b_idx = stage + (tid/16)*16;

#pragma unroll
		for (int i = 0; i < 8; ++i){

			// Fragments: 2 row halves x 4 rows of A, 2 column halves x 4 columns of B
#pragma unroll
			for (int h = 0; h < 2; ++h){
#pragma unroll
				for (int e = 0; e < 4; ++e){
					a_frag[4*h + e] = tile_A[a_idx + 64*h + e];
					b_frag[4*h + e] = tile_B[b_idx + 256*h + 4*e];
				}
			}

			// Rank-1 update of all four quadrants
#pragma unroll
			for (int rh = 0; rh < 2; ++rh){
#pragma unroll
				for (int ch = 0; ch < 2; ++ch){
#pragma unroll
					for (int c = 0; c < 4; ++c){
						const int q = 4*(rh + 2*ch) + c;
						const float b = b_frag[4*ch + c];
						acc[q].x = fma(a_frag[4*rh + 0], b, acc[q].x);
						acc[q].y = fma(a_frag[4*rh + 1], b, acc[q].y);
						acc[q].z = fma(a_frag[4*rh + 2], b, acc[q].z);
						acc[q].w = fma(a_frag[4*rh + 3], b, acc[q].w);
					}
				}
			}

			a_idx += 128;
			// B is staged as two groups of 4 k-values, 512 floats apart
			b_idx += (i == 3) ? 509 : 1;
		}

		stage ^= 1024;

		// Fetch the next slab into the stage that is now idle
		if (k + 8 < K){
			A_src += 2*M;
			*((float4*) (tile_A + stage) + tid) = *A_src;

			B_src += 2;
			*((float4*) (tile_B + stage) + tid) = *B_src;
		}
	}

	// Write the accumulated quadrants back to C
#pragma unroll
	for (int q = 0; q < 4; ++q){
		float4 *C_quad = C_base + (q%2)*16 + (q/2)*16*M;
#pragma unroll
		for (int c = 0; c < 4; ++c)
			*(C_quad + c*M/4) = acc[4*q + c];
	}
}
608 | 


--------------------------------------------------------------------------------
/batching/log:
--------------------------------------------------------------------------------
  1 | 272.899109
  2 | 521.666931
  3 | 745.457458
  4 | 802.174500
  5 | 994.791870
  6 | 1177.328491
  7 | 1456.093994
  8 | 260.635101
  9 | 538.017029
 10 | 741.823120
 11 | 808.601196
 12 | 998.584534
 13 | 1185.326416
 14 | 1457.271973
 15 | 260.925812
 16 | 541.546143
 17 | 753.961731
 18 | 804.925415
 19 | 987.974304
 20 | 1183.614136
 21 | 1442.802856
 22 | 258.808777
 23 | 525.541077
 24 | 744.948242
 25 | 799.442322
 26 | 1032.802368
 27 | 1186.782104
 28 | 1449.924561
 29 | 262.291168
 30 | 523.636353
 31 | 733.013611
 32 | 811.331238
 33 | 1000.734009
 34 | 1180.684814
 35 | 1450.600586
 36 | 262.634735
 37 | 524.269714
 38 | 734.412354
 39 | 810.744629
 40 | 994.260010
 41 | 1183.131104
 42 | 1435.780884
 43 | 265.667236
 44 | 533.300537
 45 | 746.222656
 46 | 810.289062
 47 | 992.879883
 48 | 1173.260376
 49 | 1452.601929
 50 | 206.287628
 51 | 472.550018
 52 | 798.846924
 53 | 1297.620850
 54 | 1976.665649
 55 | 2398.120605
 56 | 2862.332520
 57 | 207.356766
 58 | 485.191711
 59 | 789.202576
 60 | 1265.624634
 61 | 2039.980103
 62 | 2381.659668
 63 | 2867.011475
 64 | 207.560303
 65 | 468.006256
 66 | 803.986145
 67 | 1256.456177
 68 | 2020.936523
 69 | 2396.966553
 70 | 2874.022949
 71 | 204.407608
 72 | 476.515472
 73 | 770.363220
 74 | 1245.212646
 75 | 1993.832520
 76 | 2405.067871
 77 | 2860.746582
 78 | 198.860840
 79 | 470.128113
 80 | 799.849365
 81 | 1245.382202
 82 | 2008.601685
 83 | 2409.048828
 84 | 2864.491455
 85 | 200.993881
 86 | 468.511902
 87 | 804.392029
 88 | 1297.666870
 89 | 2031.286743
 90 | 2373.664307
 91 | 2875.425781
 92 | 202.367737
 93 | 470.684814
 94 | 805.052429
 95 | 1257.276367
 96 | 1979.463257
 97 | 2408.220459
 98 | 2879.840332
 99 | 749.696045
100 | 1435.801758
101 | 1910.040894
102 | 2935.379150
103 | 2578.799316
104 | 3938.639648
105 | 3363.301270
106 | 757.315674
107 | 1441.053345
108 | 1901.610107
109 | 2933.341553
110 | 2586.826660
111 | 3937.008789
112 | 3364.765869
113 | 770.082825
114 | 1450.222534
115 | 1914.605469
116 | 2892.895264
117 | 2587.861572
118 | 3953.872314
119 | 3375.071533
120 | 727.125732
121 | 1462.954102
122 | 1912.226074
123 | 2964.124756
124 | 2585.627441
125 | 3913.954834
126 | 3386.582275
127 | 718.375427
128 | 1450.870850
129 | 1924.335571
130 | 2922.790771
131 | 2584.489258
132 | 3942.126221
133 | 3375.453369
134 | 743.173279
135 | 1439.661621
136 | 1915.437744
137 | 2916.738037
138 | 2603.016602
139 | 3945.090332
140 | 3379.266602
141 | 727.528564
142 | 1469.658691
143 | 1892.255615
144 | 2949.887695
145 | 2580.454346
146 | 3908.257568
147 | 3370.738281
148 | 1056.622559
149 | 2032.130615
150 | 4240.775879
151 | 4189.942871
152 | 3954.488770
153 | 4683.043457
154 | 5092.191895
155 | 1142.857178
156 | 2242.138916
157 | 4674.011719
158 | 4629.005371
159 | 4381.795898
160 | 4692.783203
161 | 4659.666504
162 | 1281.298218
163 | 2215.841797
164 | 4544.753418
165 | 4555.655762
166 | 4302.574707
167 | 5112.984375
168 | 5081.896973
169 | 1261.958496
170 | 2241.415527
171 | 4681.474121
172 | 4623.420410
173 | 4396.541504
174 | 5182.580078
175 | 5140.809570
176 | 1155.907959
177 | 2226.853271
178 | 4662.607422
179 | 4630.553711
180 | 4387.822266
181 | 5183.445801
182 | 5137.879395
183 | 1194.397827
184 | 2226.601074
185 | 4668.045410
186 | 4623.687988
187 | 4377.164551
188 | 5175.273926
189 | 5142.314453
190 | 1211.406738
191 | 2228.513184
192 | 4720.754883
193 | 4623.091309
194 | 4367.977539
195 | 5167.831543
196 | 5138.166992
197 | 


--------------------------------------------------------------------------------
/batching/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Sweep M (128..1024) and K (16..1024); regenerate ../data/input for each pair and append one result line per batch size to ./log.
 3 | rm -f log
 4 | for ((M=128; M<=1024; M=M*2))
 5 | do
 6 | 	for ((K=16; K<=1024; K=K*2))
 7 | 	do
 8 | 		cd ../data
 9 | 		./gen_data $M $K	# gen_data reads (max M/N size, K); an extra middle argument would be taken as K
10 | 		cd - > /dev/null
11 | 		./gemm 4 >> log
12 | 		./gemm 8 >> log
13 | 		./gemm 16 >> log
14 | 		./gemm 32 >> log
15 | 		./gemm 64 >> log
16 | 		./gemm 128 >> log
17 | 		./gemm 256 >> log
18 | 	done
19 | done
20 | 


--------------------------------------------------------------------------------
/batching/thres.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # For every batch size, run ./gemm with its second argument doubling from
 3 | # 65536 while it stays <= 102400000, appending each result line to ./log.
 4 | 
 5 | rm -f log
 6 | for batch in 16 32 64 128 256
 7 | do
 8 | 	thres=65536
 9 | 	while ((thres <= 102400000))
10 | 	do
11 | 		./gemm $batch $thres >> log
12 | 		thres=$((thres * 2))
13 | 	done
14 | done
15 | 


--------------------------------------------------------------------------------
/cke/Makefile:
--------------------------------------------------------------------------------
1 | # Builds ./gemm from gemm.cu with nvcc, linking cuBLAS, for every architecture listed in GENCODE_FLAGS.
2 | GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70
3 | 
4 | .PHONY: clean
5 | gemm:gemm.cu
6 | 	nvcc  $< -o $@ --std=c++11 -O3 -lcublas ${GENCODE_FLAGS}
7 | clean:
8 | 	rm -rf gemm *.o
9 | 


--------------------------------------------------------------------------------
/cke/gemm.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <fstream>
  4 | #include <cublas_v2.h>
  5 | #include "../include/util.h"
  6 | 
  7 | #define N_RUNS 10
  8 | 
  9 | /*
 10 |  * Concurrent-kernel-execution (multi-stream) cuBLAS baseline.
 11 |  *
 12 |  * Usage: ./gemm BATCH
 13 |  *
 14 |  * Reads the first BATCH "M N K" triples from ../data/input, allocates device
 15 |  * buffers for each problem (their contents are never initialized), runs one
 16 |  * warm-up pass, then times N_RUNS passes in which GEMM i is issued on
 17 |  * stream[i]. Prints the average throughput in GFLOP/s to stdout.
 18 |  */
 19 | int  main (int argc, char** argv) {
 20 | 
 21 | 	ErrChk(cudaSetDevice(0));
 22 | 
 23 | 	if(argc<2){
 24 | 		printf("Usage: input the batch size\n");
 25 | 		exit(EXIT_FAILURE);
 26 | 	}
 27 | 
 28 | 	int BATCH = atoi(argv[1]);
 29 | 	if (BATCH <= 0){
 30 | 		// atoi returns 0 for non-numeric text; reject it before sizing any array
 31 | 		fprintf(stderr, "Error: batch size must be a positive integer\n");
 32 | 		exit(EXIT_FAILURE);
 33 | 	}
 34 | 
 35 | 	int *M = (int*) malloc(BATCH * sizeof(int));
 36 | 	int *N = (int*) malloc(BATCH * sizeof(int));
 37 | 	int *K = (int*) malloc(BATCH * sizeof(int));
 38 | 
 39 | 	std::fstream fs;
 40 | 	fs.open("../data/input");
 41 | 	if (!fs.is_open()){
 42 | 		printf("Error opening input\n");
 43 | 		exit(EXIT_FAILURE);
 44 | 	}
 45 | 
 46 | 	//read matrix config; fail loudly if fewer than BATCH valid triples exist
 47 | 	for (int i=0; i<BATCH; ++i){
 48 | 		if (!(fs>>M[i]>>N[i]>>K[i]) || M[i]<=0 || N[i]<=0 || K[i]<=0){
 49 | 			fprintf(stderr, "Error reading matrix config %d from input\n", i);
 50 | 			exit(EXIT_FAILURE);
 51 | 		}
 52 | 	}
 53 | 
 54 | 	float alpha = 1.f;
 55 | 	float beta = 0.f;
 56 | 
 57 | 	float **A = (float**) malloc(BATCH * sizeof(float*));
 58 | 	float **B = (float**) malloc(BATCH * sizeof(float*));
 59 | 	float **C = (float**) malloc(BATCH * sizeof(float*));
 60 | 
 61 | 	for (int i=0; i<BATCH; ++i){
 62 | 		// cast first so the element count is multiplied in size_t, not int
 63 | 		ErrChk(cudaMalloc((void**)&A[i], (size_t)M[i]*K[i]*sizeof(float)));
 64 | 		ErrChk(cudaMalloc((void**)&B[i], (size_t)K[i]*N[i]*sizeof(float)));
 65 | 		ErrChk(cudaMalloc((void**)&C[i], (size_t)M[i]*N[i]*sizeof(float)));
 66 | 	}
 67 | 
 68 | 	float elapsedTime = 0.f;
 69 | 	double time=0.f;
 70 | 	float gflops_per_sec = 0.f;
 71 | 	double gflops = 0.f;
 72 | 	// work per GEMM is counted as 2*M*N*K + 2*M*N floating-point operations
 73 | 	for (int i=0; i<BATCH; ++i)
 74 | 		gflops += ((2 * int64_t(M[i]) * int64_t(N[i]) * int64_t(K[i])) + (2 * int64_t(M[i]) * int64_t(N[i])) ) / 1.0e9;
 75 | 
 76 | 	cudaStream_t *stream = (cudaStream_t*) malloc(BATCH * sizeof(cudaStream_t));
 77 | 	for (int i=0; i<BATCH; ++i)
 78 | 		ErrChk(cudaStreamCreate(&stream[i]));
 79 | 
 80 | 	cublasHandle_t handle;
 81 | 	ErrChk(cublasCreate(&handle));
 82 | 
 83 | 	//warm-up
 84 | 	for (int i=0; i<BATCH; ++i){
 85 | 		ErrChk(cublasSetStream(handle, stream[i]));
 86 | 		ErrChk(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M[i], N[i], K[i], (const void*) &alpha, (void*) A[i], CUDA_R_32F, M[i], (void*) B[i], CUDA_R_32F, K[i], (const void*) &beta, (void*) C[i], CUDA_R_32F, M[i], CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
 87 | 	}
 88 | 	ErrChk(cudaDeviceSynchronize());
 89 | 
 90 | 	// create each timing event exactly once, right before the timed region
 91 | 	cudaEvent_t start, stop;
 92 | 	ErrChk(cudaEventCreate(&start));
 93 | 	ErrChk(cudaEventCreate(&stop));
 94 | 	ErrChk(cudaEventRecord(start,0));
 95 | 
 96 | 	for (int run=0; run<N_RUNS; ++run){
 97 | 		for (int i=0; i<BATCH; ++i){
 98 | 			ErrChk(cublasSetStream(handle, stream[i]));
 99 | 			ErrChk(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M[i], N[i], K[i], &alpha, A[i], M[i], B[i], K[i], &beta, C[i], M[i]));
100 | 		}
101 | 	}
102 | 	ErrChk(cudaEventRecord(stop,0));
103 | 	ErrChk(cudaEventSynchronize(stop));
104 | 	ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop));
105 | 
106 | 	time = elapsedTime/N_RUNS;
107 | 	time /= 1.0e3; //convert time unit from millisecond to second
108 | 	gflops_per_sec   = gflops / time;
109 | 	printf("%f\n", gflops_per_sec);
110 | 
111 | 	ErrChk(cudaEventDestroy(start));
112 | 	ErrChk(cudaEventDestroy(stop));
113 | 	ErrChk(cublasDestroy(handle));
114 | 
115 | 	for (int i=0; i<BATCH; ++i){
116 | 		ErrChk(cudaFree(A[i]));
117 | 		ErrChk(cudaFree(B[i]));
118 | 		ErrChk(cudaFree(C[i]));
119 | 		ErrChk(cudaStreamDestroy(stream[i]));
120 | 	}
121 | 
122 | 	free(M);
123 | 	free(N);
124 | 	free(K);
125 | 	free(A);
126 | 	free(B);
127 | 	free(C);
128 | 	free(stream);
129 | 
130 | 	return 0;
131 | }
132 | 
133 | 


--------------------------------------------------------------------------------
/cke/log:
--------------------------------------------------------------------------------
  1 | 237.001022
  2 | 297.334930
  3 | 266.122192
  4 | 183.902908
  5 | 172.333130
  6 | 170.902618
  7 | 153.324554
  8 | 214.605515
  9 | 294.982574
 10 | 275.153717
 11 | 201.488464
 12 | 173.527924
 13 | 167.349625
 14 | 153.474106
 15 | 243.751297
 16 | 290.899323
 17 | 242.661438
 18 | 191.979904
 19 | 166.862320
 20 | 174.679337
 21 | 156.950623
 22 | 239.182007
 23 | 280.294250
 24 | 272.870117
 25 | 168.244049
 26 | 171.173584
 27 | 173.379150
 28 | 153.670868
 29 | 232.833450
 30 | 286.004608
 31 | 255.601196
 32 | 189.233551
 33 | 172.576218
 34 | 168.697952
 35 | 154.620361
 36 | 242.781540
 37 | 303.848572
 38 | 263.815186
 39 | 192.617432
 40 | 167.573517
 41 | 168.090027
 42 | 151.192413
 43 | 227.807159
 44 | 305.481476
 45 | 249.566299
 46 | 196.406937
 47 | 168.220779
 48 | 168.989929
 49 | 151.646011
 50 | 185.457291
 51 | 311.271210
 52 | 409.206757
 53 | 651.038025
 54 | 977.274353
 55 | 1042.603271
 56 | 950.363586
 57 | 178.072891
 58 | 326.483246
 59 | 417.968414
 60 | 572.477966
 61 | 945.585938
 62 | 999.336182
 63 | 1110.411377
 64 | 188.216156
 65 | 331.106476
 66 | 268.985016
 67 | 659.176147
 68 | 982.177734
 69 | 1040.217896
 70 | 1073.224243
 71 | 181.857498
 72 | 329.778748
 73 | 399.874542
 74 | 657.837219
 75 | 953.362915
 76 | 1040.952271
 77 | 1085.787231
 78 | 185.666626
 79 | 312.740631
 80 | 441.522644
 81 | 670.528931
 82 | 995.664978
 83 | 1042.593628
 84 | 1084.969360
 85 | 168.332947
 86 | 326.706848
 87 | 423.270050
 88 | 636.200562
 89 | 1010.017029
 90 | 1051.623413
 91 | 1091.411743
 92 | 185.410812
 93 | 327.987335
 94 | 445.940186
 95 | 659.176147
 96 | 963.324951
 97 | 1058.476562
 98 | 1100.144409
 99 | 1166.841431
100 | 3152.952881
101 | 2500.876709
102 | 2966.455322
103 | 3096.337646
104 | 4336.383789
105 | 3625.318359
106 | 1119.710083
107 | 3065.512207
108 | 2661.700684
109 | 2941.878174
110 | 3036.380371
111 | 4496.298828
112 | 3722.793457
113 | 1129.338989
114 | 2995.913330
115 | 2729.451904
116 | 2910.830811
117 | 3017.532959
118 | 4377.558105
119 | 3665.760254
120 | 1098.519409
121 | 3118.724365
122 | 2657.250000
123 | 2782.776123
124 | 3084.502686
125 | 4126.134766
126 | 3729.956787
127 | 1145.718628
128 | 3051.267578
129 | 2680.839600
130 | 3018.155762
131 | 3132.459473
132 | 4469.885742
133 | 3619.740234
134 | 980.059692
135 | 3220.514893
136 | 2560.528564
137 | 2954.177734
138 | 3032.707275
139 | 4392.245605
140 | 3681.342773
141 | 1174.669067
142 | 3023.005615
143 | 2680.691162
144 | 3012.561279
145 | 2998.857422
146 | 4412.121094
147 | 3752.288330
148 | 3245.670410
149 | 8029.648926
150 | 6251.117676
151 | 3704.641113
152 | 3835.615723
153 | 3935.210938
154 | 3541.992432
155 | 3565.217285
156 | 9037.541992
157 | 6924.500000
158 | 3692.706055
159 | 3843.955811
160 | 4145.791016
161 | 3616.200195
162 | 3421.464355
163 | 9397.159180
164 | 6722.021484
165 | 3972.183838
166 | 4218.515137
167 | 4355.814941
168 | 3839.067139
169 | 3692.574219
170 | 8995.542969
171 | 7056.580078
172 | 4317.979004
173 | 4129.107910
174 | 4098.443848
175 | 3963.401855
176 | 3627.006836
177 | 9168.769531
178 | 7117.036621
179 | 4238.753906
180 | 4291.671387
181 | 4178.457031
182 | 4089.797852
183 | 3586.207031
184 | 9338.448242
185 | 6842.150391
186 | 4130.265625
187 | 3922.037598
188 | 4308.213867
189 | 3988.938965
190 | 3551.483643
191 | 9172.331055
192 | 6737.323242
193 | 4101.651855
194 | 4227.090820
195 | 4215.158203
196 | 3876.218506
197 | 


--------------------------------------------------------------------------------
/cke/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Sweep M (128..1024) and K (16..1024); regenerate ../data/input for each pair and append one result line per batch size to ./log.
 3 | rm -f log
 4 | for ((M=128; M<=1024; M=M*2))
 5 | do
 6 | 	for ((K=16; K<=1024; K=K*2))
 7 | 	do
 8 | 		cd ../data
 9 | 		./gen_data $M $K	# gen_data reads (max M/N size, K); an extra middle argument would be taken as K
10 | 		cd - > /dev/null
11 | 		./gemm 4 >> log
12 | 		./gemm 8 >> log
13 | 		./gemm 16 >> log
14 | 		./gemm 32 >> log
15 | 		./gemm 64 >> log
16 | 		./gemm 128 >> log
17 | 		./gemm 256 >> log
18 | 	done
19 | done
20 | 


--------------------------------------------------------------------------------
/data/Makefile:
--------------------------------------------------------------------------------
1 | # Builds the gen_data tool and resets ./input to an empty file.
2 | .PHONY: clean
3 | gen_data:gen_data.cpp
4 | 	rm -f input
5 | 	touch input
6 | 	g++ $< -o $@
7 | 
8 | clean:
9 | 	rm -f input gen_data


--------------------------------------------------------------------------------
/data/gen_data:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lixiuhong/batched_gemm/03f1b28238b4c6da913aa561972f8f0202399571/data/gen_data


--------------------------------------------------------------------------------
/data/gen_data.cpp:
--------------------------------------------------------------------------------
 1 | #include<cstdlib>
 2 | #include<cstdio>
 3 | #include<cmath>
 4 | #include<fstream>
 5 | 
 6 | #define random(x) (rand()%(x))
 7 | 
 8 | /*
 9 |  * Generate the batched-GEMM configuration file ../data/input.
10 |  *
11 |  * Usage: ./gen_data SIZE K
12 |  *   SIZE  upper bound for M and N (must be >= 16); each is drawn as 16 << r
13 |  *         so that 16 <= value <= SIZE
14 |  *   K     positive integer written unchanged as the third column
15 |  *
16 |  * Writes 256 lines of the form "M N K". rand() is never seeded, so the
17 |  * sequence of sizes is the same on every run.
18 |  */
19 | int main(int argc, char *argv[]){
20 | 
21 | 	if (argc<3){
22 | 		printf("Usage: please input two integers\n");
23 | 		printf("The first one represents the largest matrix size (M, N)\n");
24 | 		printf("The second one represents the K\n");
25 | 		exit(EXIT_FAILURE);	
26 | 	}
27 | 
28 | 	int e = atoi(argv[1]);
29 | 	int K = atoi(argv[2]);
30 | 	if (e < 16 || K <= 0){
31 | 		// e < 16 would leave log_e at 0 and make random(log_e) divide by zero
32 | 		fprintf(stderr, "Error: matrix size must be >= 16 and K must be positive\n");
33 | 		exit(EXIT_FAILURE);
34 | 	}
35 | 
36 | 	// ofstream creates the file if needed and truncates it, so no bytes of a
37 | 	// previous, longer configuration are left behind after the new lines
38 | 	std::ofstream fs("../data/input");
39 | 	if (!fs.is_open()){
40 | 		printf("Error opening input\n");
41 | 		exit(EXIT_FAILURE);
42 | 	}
43 | 
44 | 	// log_e = number of distinct power-of-two sizes in [16, SIZE]
45 | 	int log_e = 0;
46 | 	while(e>=16){
47 | 		e = e>>1;
48 | 		log_e++;
49 | 	}
50 | 
51 | 	//write matrix config
52 | 	for (int i=0; i<256; ++i){
53 | 		int M = 16<<random(log_e);
54 | 		int N = 16<<random(log_e);
55 | 		fs<<M<<' '<<N<<' '<<K<<'\n';
56 | 	}
57 | 
58 | 	return EXIT_SUCCESS;	
59 | }
60 | 


--------------------------------------------------------------------------------
/data/input:
--------------------------------------------------------------------------------
  1 | 32 256 1024
  2 | 64 512 1024
  3 | 32 128 1024
  4 | 128 64 1024
  5 | 32 128 1024
  6 | 64 512 1024
  7 | 1024 256 1024
  8 | 1024 16 1024
  9 | 128 32 1024
 10 | 64 64 1024
 11 | 128 16 1024
 12 | 512 64 1024
 13 | 64 512 1024
 14 | 512 1024 1024
 15 | 32 512 1024
 16 | 512 16 1024
 17 | 128 512 1024
 18 | 128 64 1024
 19 | 32 16 1024
 20 | 64 64 1024
 21 | 32 256 1024
 22 | 512 16 1024
 23 | 1024 128 1024
 24 | 1024 64 1024
 25 | 64 32 1024
 26 | 128 512 1024
 27 | 16 32 1024
 28 | 32 64 1024
 29 | 256 256 1024
 30 | 32 128 1024
 31 | 64 1024 1024
 32 | 256 512 1024
 33 | 128 512 1024
 34 | 512 256 1024
 35 | 128 16 1024
 36 | 512 128 1024
 37 | 64 128 1024
 38 | 128 64 1024
 39 | 256 16 1024
 40 | 64 16 1024
 41 | 16 128 1024
 42 | 512 16 1024
 43 | 256 256 1024
 44 | 16 64 1024
 45 | 32 32 1024
 46 | 512 256 1024
 47 | 512 16 1024
 48 | 16 32 1024
 49 | 256 1024 1024
 50 | 256 16 1024
 51 | 256 16 1024
 52 | 128 16 1024
 53 | 32 512 1024
 54 | 16 1024 1024
 55 | 128 16 1024
 56 | 1024 32 1024
 57 | 256 64 1024
 58 | 1024 1024 1024
 59 | 16 1024 1024
 60 | 1024 32 1024
 61 | 512 128 1024
 62 | 128 256 1024
 63 | 128 256 1024
 64 | 128 512 1024
 65 | 32 512 1024
 66 | 256 128 1024
 67 | 512 512 1024
 68 | 32 16 1024
 69 | 32 32 1024
 70 | 256 512 1024
 71 | 16 128 1024
 72 | 256 256 1024
 73 | 128 256 1024
 74 | 128 128 1024
 75 | 128 128 1024
 76 | 128 64 1024
 77 | 256 256 1024
 78 | 256 512 1024
 79 | 1024 512 1024
 80 | 256 16 1024
 81 | 256 32 1024
 82 | 64 16 1024
 83 | 1024 128 1024
 84 | 16 1024 1024
 85 | 512 64 1024
 86 | 256 128 1024
 87 | 128 32 1024
 88 | 16 16 1024
 89 | 128 32 1024
 90 | 128 16 1024
 91 | 64 256 1024
 92 | 16 1024 1024
 93 | 64 64 1024
 94 | 512 32 1024
 95 | 16 64 1024
 96 | 16 256 1024
 97 | 32 64 1024
 98 | 128 512 1024
 99 | 512 128 1024
100 | 256 32 1024
101 | 1024 1024 1024
102 | 256 64 1024
103 | 1024 64 1024
104 | 16 64 1024
105 | 256 64 1024
106 | 16 256 1024
107 | 1024 512 1024
108 | 256 1024 1024
109 | 16 16 1024
110 | 1024 32 1024
111 | 16 1024 1024
112 | 128 32 1024
113 | 1024 1024 1024
114 | 256 64 1024
115 | 32 16 1024
116 | 256 512 1024
117 | 256 1024 1024
118 | 512 32 1024
119 | 16 1024 1024
120 | 256 64 1024
121 | 32 64 1024
122 | 1024 512 1024
123 | 32 32 1024
124 | 512 1024 1024
125 | 32 256 1024
126 | 16 32 1024
127 | 32 64 1024
128 | 16 16 1024
129 | 32 128 1024
130 | 64 16 1024
131 | 128 256 1024
132 | 512 512 1024
133 | 64 64 1024
134 | 16 64 1024
135 | 32 64 1024
136 | 64 16 1024
137 | 256 32 1024
138 | 512 128 1024
139 | 128 32 1024
140 | 128 64 1024
141 | 512 32 1024
142 | 256 256 1024
143 | 128 64 1024
144 | 256 512 1024
145 | 512 512 1024
146 | 128 1024 1024
147 | 16 64 1024
148 | 512 64 1024
149 | 256 128 1024
150 | 64 128 1024
151 | 512 256 1024
152 | 128 64 1024
153 | 256 1024 1024
154 | 256 512 1024
155 | 32 16 1024
156 | 16 256 1024
157 | 32 64 1024
158 | 16 512 1024
159 | 512 64 1024
160 | 32 32 1024
161 | 16 256 1024
162 | 1024 1024 1024
163 | 256 64 1024
164 | 1024 1024 1024
165 | 512 64 1024
166 | 64 128 1024
167 | 256 128 1024
168 | 128 1024 1024
169 | 32 16 1024
170 | 256 16 1024
171 | 512 128 1024
172 | 64 16 1024
173 | 512 64 1024
174 | 128 32 1024
175 | 512 256 1024
176 | 128 128 1024
177 | 1024 64 1024
178 | 64 64 1024
179 | 256 64 1024
180 | 32 16 1024
181 | 64 64 1024
182 | 128 256 1024
183 | 128 256 1024
184 | 256 256 1024
185 | 512 1024 1024
186 | 256 32 1024
187 | 64 512 1024
188 | 32 32 1024
189 | 16 64 1024
190 | 64 512 1024
191 | 256 128 1024
192 | 64 256 1024
193 | 128 64 1024
194 | 1024 512 1024
195 | 64 512 1024
196 | 512 256 1024
197 | 16 1024 1024
198 | 16 64 1024
199 | 256 256 1024
200 | 1024 16 1024
201 | 32 64 1024
202 | 32 256 1024
203 | 16 32 1024
204 | 128 16 1024
205 | 128 128 1024
206 | 256 32 1024
207 | 16 256 1024
208 | 128 128 1024
209 | 256 16 1024
210 | 64 16 1024
211 | 512 512 1024
212 | 256 256 1024
213 | 512 256 1024
214 | 1024 64 1024
215 | 1024 128 1024
216 | 16 32 1024
217 | 512 32 1024
218 | 128 128 1024
219 | 16 1024 1024
220 | 64 256 1024
221 | 64 1024 1024
222 | 128 16 1024
223 | 32 1024 1024
224 | 64 512 1024
225 | 1024 64 1024
226 | 512 64 1024
227 | 16 32 1024
228 | 1024 512 1024
229 | 128 512 1024
230 | 512 128 1024
231 | 64 512 1024
232 | 64 512 1024
233 | 512 128 1024
234 | 64 512 1024
235 | 64 256 1024
236 | 16 64 1024
237 | 32 128 1024
238 | 128 64 1024
239 | 16 128 1024
240 | 512 1024 1024
241 | 512 256 1024
242 | 64 512 1024
243 | 128 1024 1024
244 | 64 1024 1024
245 | 512 16 1024
246 | 16 512 1024
247 | 256 64 1024
248 | 128 16 1024
249 | 512 128 1024
250 | 128 512 1024
251 | 16 256 1024
252 | 32 1024 1024
253 | 512 64 1024
254 | 32 1024 1024
255 | 512 16 1024
256 | 128 128 1024
257 | 


--------------------------------------------------------------------------------
/default/Makefile:
--------------------------------------------------------------------------------
1 | # Builds ./gemm from gemm.cu with nvcc, linking cuBLAS, for every architecture listed in GENCODE_FLAGS.
2 | GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70
3 | 
4 | .PHONY: clean
5 | gemm:gemm.cu
6 | 	nvcc  $< -o $@ --std=c++11 -O3 -lcublas ${GENCODE_FLAGS}
7 | clean:
8 | 	rm -rf gemm *.o
9 | 


--------------------------------------------------------------------------------
/default/gemm.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <fstream>
  4 | #include <cublas_v2.h>
  5 | #include "../include/util.h"
  6 | 
  7 | #define N_RUNS 10
  8 | 
  9 | /*
 10 |  * Single-stream cuBLAS baseline.
 11 |  *
 12 |  * Usage: ./gemm BATCH
 13 |  *
 14 |  * Reads the first BATCH "M N K" triples from ../data/input, allocates device
 15 |  * buffers for each problem (their contents are never initialized), runs one
 16 |  * warm-up pass, then times N_RUNS passes that issue the GEMMs one after the
 17 |  * other through a single cuBLAS handle. Prints the average throughput in
 18 |  * GFLOP/s to stdout.
 19 |  */
 20 | int  main (int argc, char** argv) {
 21 | 
 22 | 	ErrChk(cudaSetDevice(0));
 23 | 
 24 | 	if(argc<2){
 25 | 		printf("Usage: input the batch size\n");
 26 | 		exit(EXIT_FAILURE);
 27 | 	}
 28 | 
 29 | 	int BATCH = atoi(argv[1]);
 30 | 	if (BATCH <= 0){
 31 | 		// atoi returns 0 for non-numeric text; reject it before sizing any array
 32 | 		fprintf(stderr, "Error: batch size must be a positive integer\n");
 33 | 		exit(EXIT_FAILURE);
 34 | 	}
 35 | 
 36 | 	int *M = (int*) malloc(BATCH * sizeof(int));
 37 | 	int *N = (int*) malloc(BATCH * sizeof(int));
 38 | 	int *K = (int*) malloc(BATCH * sizeof(int));
 39 | 
 40 | 	std::fstream fs;
 41 | 	fs.open("../data/input");
 42 | 	if (!fs.is_open()){
 43 | 		printf("Error opening input\n");
 44 | 		exit(EXIT_FAILURE);
 45 | 	}
 46 | 
 47 | 	//read matrix config; fail loudly if fewer than BATCH valid triples exist
 48 | 	for (int i=0; i<BATCH; ++i){
 49 | 		if (!(fs>>M[i]>>N[i]>>K[i]) || M[i]<=0 || N[i]<=0 || K[i]<=0){
 50 | 			fprintf(stderr, "Error reading matrix config %d from input\n", i);
 51 | 			exit(EXIT_FAILURE);
 52 | 		}
 53 | 	}
 54 | 
 55 | 	float alpha = 1.f;
 56 | 	float beta = 0.f;
 57 | 
 58 | 	float **A = (float**) malloc(BATCH * sizeof(float*));
 59 | 	float **B = (float**) malloc(BATCH * sizeof(float*));
 60 | 	float **C = (float**) malloc(BATCH * sizeof(float*));
 61 | 
 62 | 	for (int i=0; i<BATCH; ++i){
 63 | 		// cast first so the element count is multiplied in size_t, not int
 64 | 		ErrChk(cudaMalloc((void**)&A[i], (size_t)M[i]*K[i]*sizeof(float)));
 65 | 		ErrChk(cudaMalloc((void**)&B[i], (size_t)K[i]*N[i]*sizeof(float)));
 66 | 		ErrChk(cudaMalloc((void**)&C[i], (size_t)M[i]*N[i]*sizeof(float)));
 67 | 	}
 68 | 
 69 | 	float elapsedTime = 0.f;
 70 | 	double time=0.f;
 71 | 	float gflops_per_sec = 0.f;
 72 | 	double gflops = 0.f;
 73 | 	// work per GEMM is counted as 2*M*N*K + 2*M*N floating-point operations
 74 | 	for (int i=0; i<BATCH; ++i)
 75 | 		gflops += ((2 * int64_t(M[i]) * int64_t(N[i]) * int64_t(K[i])) + (2 * int64_t(M[i]) * int64_t(N[i])) ) / 1.0e9;
 76 | 
 77 | 	cublasHandle_t handle;
 78 | 	ErrChk(cublasCreate(&handle));
 79 | 
 80 | 	//warm-up
 81 | 	for (int i=0; i<BATCH; ++i){
 82 | 		ErrChk(cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, M[i], N[i], K[i], (const void*) &alpha, (void*) A[i], CUDA_R_32F, M[i], (void*) B[i], CUDA_R_32F, K[i], (const void*) &beta, (void*) C[i], CUDA_R_32F, M[i], CUDA_R_32F, CUBLAS_GEMM_DEFAULT));
 83 | 	}
 84 | 	ErrChk(cudaDeviceSynchronize());
 85 | 
 86 | 	// create each timing event exactly once, right before the timed region
 87 | 	cudaEvent_t start, stop;
 88 | 	ErrChk(cudaEventCreate(&start));
 89 | 	ErrChk(cudaEventCreate(&stop));
 90 | 	ErrChk(cudaEventRecord(start,0));
 91 | 
 92 | 	for (int run=0; run<N_RUNS; ++run){
 93 | 		for (int i=0; i<BATCH; ++i){
 94 | 			ErrChk(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, M[i], N[i], K[i], &alpha, A[i], M[i], B[i], K[i], &beta, C[i], M[i]));
 95 | 		}
 96 | 	}
 97 | 	ErrChk(cudaEventRecord(stop,0));
 98 | 	ErrChk(cudaEventSynchronize(stop));
 99 | 	ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop));
100 | 
101 | 	time = elapsedTime/N_RUNS;
102 | 	time /= 1.0e3; //convert time unit from millisecond to second
103 | 	gflops_per_sec   = gflops / time;
104 | 	printf("%f\n", gflops_per_sec);
105 | 
106 | 	ErrChk(cudaEventDestroy(start));
107 | 	ErrChk(cudaEventDestroy(stop));
108 | 	ErrChk(cublasDestroy(handle));
109 | 
110 | 	for (int i=0; i<BATCH; ++i){
111 | 		ErrChk(cudaFree(A[i]));
112 | 		ErrChk(cudaFree(B[i]));
113 | 		ErrChk(cudaFree(C[i]));
114 | 	}
115 | 
116 | 	free(M);
117 | 	free(N);
118 | 	free(K);
119 | 	free(A);
120 | 	free(B);
121 | 	free(C);
122 | 
123 | 	return 0;
124 | }
125 | 


--------------------------------------------------------------------------------
/default/log:
--------------------------------------------------------------------------------
  1 | 135.986832
  2 | 168.514359
  3 | 157.629379
  4 | 118.473091
  5 | 106.938087
  6 | 101.784996
  7 | 91.732353
  8 | 136.184753
  9 | 168.883698
 10 | 157.519318
 11 | 118.538506
 12 | 106.545975
 13 | 101.697380
 14 | 92.048683
 15 | 134.025986
 16 | 166.291962
 17 | 157.595200
 18 | 118.503700
 19 | 107.002083
 20 | 102.194290
 21 | 92.048790
 22 | 133.630402
 23 | 168.924835
 24 | 157.724380
 25 | 118.356392
 26 | 106.599716
 27 | 102.181534
 28 | 91.445549
 29 | 135.789474
 30 | 169.081329
 31 | 157.435944
 32 | 118.633232
 33 | 106.973160
 34 | 102.140556
 35 | 91.705925
 36 | 135.750061
 37 | 168.637283
 38 | 157.610382
 39 | 118.542664
 40 | 107.080330
 41 | 101.522873
 42 | 92.001534
 43 | 135.881500
 44 | 168.817917
 45 | 157.538284
 46 | 118.519005
 47 | 106.800514
 48 | 101.782242
 49 | 91.866432
 50 | 107.308449
 51 | 175.642380
 52 | 236.787170
 53 | 306.154572
 54 | 444.227509
 55 | 468.110992
 56 | 481.133820
 57 | 107.362923
 58 | 175.577713
 59 | 236.541092
 60 | 307.968323
 61 | 445.127869
 62 | 468.367279
 63 | 479.208099
 64 | 106.329552
 65 | 175.513107
 66 | 235.849609
 67 | 308.012390
 68 | 443.776825
 69 | 469.753265
 70 | 478.808167
 71 | 107.643982
 72 | 175.590652
 73 | 235.514099
 74 | 306.570038
 75 | 445.008606
 76 | 469.587921
 77 | 479.987946
 78 | 107.401871
 79 | 175.210114
 80 | 236.207993
 81 | 306.873383
 82 | 443.214294
 83 | 469.538757
 84 | 480.587372
 85 | 107.472046
 86 | 175.107193
 87 | 235.444489
 88 | 306.976349
 89 | 444.868927
 90 | 469.195740
 91 | 478.990814
 92 | 107.698799
 93 | 175.610046
 94 | 236.721191
 95 | 306.018921
 96 | 443.347412
 97 | 467.985901
 98 | 479.402100
 99 | 596.877625
100 | 1465.130249
101 | 1163.731323
102 | 1260.245850
103 | 1318.765625
104 | 1796.642822
105 | 1561.584229
106 | 599.158264
107 | 1466.649536
108 | 1162.057129
109 | 1258.609253
110 | 1324.259888
111 | 1798.538818
112 | 1550.624878
113 | 639.921997
114 | 1567.293579
115 | 1235.536499
116 | 1262.799316
117 | 1321.170532
118 | 1803.465942
119 | 1562.154419
120 | 599.294983
121 | 1466.688721
122 | 1163.437866
123 | 1262.897583
124 | 1322.685913
125 | 1802.905396
126 | 1553.996216
127 | 627.988098
128 | 1526.942993
129 | 1208.819946
130 | 1311.340088
131 | 1383.222168
132 | 1869.215454
133 | 1631.342163
134 | 627.013611
135 | 1534.162964
136 | 1194.770142
137 | 1308.270508
138 | 1375.149414
139 | 1873.757690
140 | 1621.838257
141 | 658.368225
142 | 1619.049072
143 | 1263.015991
144 | 1375.454346
145 | 1438.913574
146 | 1952.764648
147 | 1698.803589
148 | 1408.015747
149 | 3553.094482
150 | 4422.851074
151 | 3961.561279
152 | 4293.050781
153 | 5182.097656
154 | 4991.094727
155 | 1419.558960
156 | 3580.840332
157 | 4420.239258
158 | 3965.055176
159 | 4297.230957
160 | 5167.182129
161 | 4993.140625
162 | 1429.133911
163 | 3585.243652
164 | 4406.617676
165 | 3953.507324
166 | 4294.491211
167 | 5180.987305
168 | 5000.881348
169 | 1424.955078
170 | 3577.856689
171 | 4428.946289
172 | 3961.289062
173 | 4296.803223
174 | 5171.602051
175 | 4990.318848
176 | 1425.193237
177 | 3580.677246
178 | 4425.065430
179 | 3619.900146
180 | 3930.547363
181 | 4727.136230
182 | 4941.793457
183 | 1427.340088
184 | 3584.263916
185 | 4432.185547
186 | 3961.757324
187 | 4300.142090
188 | 5168.886230
189 | 4994.977539
190 | 1427.041504
191 | 3577.693848
192 | 4419.195801
193 | 3960.533691
194 | 4301.006348
195 | 5166.458984
196 | 5004.093262
197 | 


--------------------------------------------------------------------------------
/default/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Sweep M (128..1024) and K (16..1024); regenerate ../data/input for each pair and append one result line per batch size to ./log.
 3 | rm -f log
 4 | for ((M=128; M<=1024; M=M*2))
 5 | do
 6 | 	for ((K=16; K<=1024; K=K*2))
 7 | 	do
 8 | 		cd ../data
 9 | 		./gen_data $M $K	# gen_data reads (max M/N size, K); an extra middle argument would be taken as K
10 | 		cd - > /dev/null
11 | 		./gemm 4 >> log
12 | 		./gemm 8 >> log
13 | 		./gemm 16 >> log
14 | 		./gemm 32 >> log
15 | 		./gemm 64 >> log
16 | 		./gemm 128 >> log
17 | 		./gemm 256 >> log
18 | 	done
19 | done
20 | 


--------------------------------------------------------------------------------
/google-net_cudnn/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | google-net_cudnn
 3 | tags
 4 | .cproject
 5 | .project
 6 | .ptp-sync/
 7 | .settings/
 8 | Debug/
 9 | Release/
10 | 


--------------------------------------------------------------------------------
/google-net_cudnn/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the google-net_cudnn binary. Pass USE_MULTI_STREAM=1 to compile every object with -DUSE_MULTI_STREAM=1.
 2 | USE_MULTI_STREAM ?= 0
 3 | 
 4 | ifeq ($(USE_MULTI_STREAM), 1)
 5 | 	COMMON_FLAGS += -DUSE_MULTI_STREAM=1
 6 | endif
 7 | 
 8 | GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70
 9 | 
10 | google-net_cudnn:conv.o main.o activation.o pooling.o lrn.o concat.o dropout.o inception.o batch-inception.o loss.o softmax.o
11 | 	nvcc $^ -o $@ -lcudnn -lcublas ${GENCODE_FLAGS} $(COMMON_FLAGS)
12 | 
13 | main.o:main.cpp util.h conv.h activation.h pooling.h lrn.h
14 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
15 | conv.o:conv.cpp util.h
16 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
17 | activation.o:activation.cpp util.h
18 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
19 | pooling.o:pooling.cpp util.h
20 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
21 | lrn.o:lrn.cpp util.h
22 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
23 | concat.o:concat.cu concat.h
24 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
25 | dropout.o:dropout.cpp dropout.h
26 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
27 | loss.o:loss.cpp loss.h
28 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
29 | softmax.o:softmax.cpp softmax.h
30 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
31 | inception.o:inception.cpp inception.h
32 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
33 | batch-inception.o:batch-inception.cu batch-inception.h
34 | 	nvcc -c $< -o $@ ${GENCODE_FLAGS} $(COMMON_FLAGS)
35 | 
36 | # clean is not a file; it also removes the linked binary, not only the objects
37 | .PHONY: clean
38 | clean:
39 | 	rm -f *.o google-net_cudnn
40 | 


--------------------------------------------------------------------------------
/google-net_cudnn/activation.cpp:
--------------------------------------------------------------------------------
 1 | #include "cudnn.h"
 2 | #include "util.h"
 3 | #include <cmath>
 4 | 
 5 | // ReLU forward pass through cuDNN, issued on stream s via the caller's handle.
 6 | // input and output are both described as N x C x H x W float tensors in NCHW
 7 | // layout. All descriptors are created and destroyed inside this call.
 8 | void activation(cudnnHandle_t handle, int N, int C, int H, int W, float *input, float *output, cudaStream_t s){
 9 | 
10 | 	// scaling factors handed to cudnnActivationForward
11 | 	float alpha = 1.0;
12 | 	float beta = 0.0;
13 | 
14 | 	cudnnActivationDescriptor_t reluDesc;
15 | 	cudnnTensorDescriptor_t srcDesc;
16 | 	cudnnTensorDescriptor_t dstDesc;
17 | 
18 | 	ErrChk(cudnnSetStream(handle, s));
19 | 
20 | 	// activation mode: ReLU, NaNs not propagated, coefficient 0
21 | 	ErrChk(cudnnCreateActivationDescriptor(&reluDesc));
22 | 	ErrChk(cudnnSetActivationDescriptor(reluDesc, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.f));
23 | 
24 | 	// source and destination share the same shape and layout
25 | 	ErrChk(cudnnCreateTensorDescriptor(&srcDesc));
26 | 	ErrChk(cudnnSetTensor4dDescriptor(srcDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
27 | 	ErrChk(cudnnCreateTensorDescriptor(&dstDesc));
28 | 	ErrChk(cudnnSetTensor4dDescriptor(dstDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
29 | 
30 | 	ErrChk(cudnnActivationForward(handle, reluDesc, &alpha, srcDesc, input, &beta, dstDesc, output));
31 | 
32 | 	ErrChk(cudnnDestroyActivationDescriptor(reluDesc));
33 | 	ErrChk(cudnnDestroyTensorDescriptor(srcDesc));
34 | 	ErrChk(cudnnDestroyTensorDescriptor(dstDesc));
35 | }
36 | 


--------------------------------------------------------------------------------
/google-net_cudnn/activation.h:
--------------------------------------------------------------------------------
1 | #ifndef __ACTIVATION_H__
2 | #define __ACTIVATION_H__
3 | #include "cudnn.h"  // cudnnHandle_t / cudaStream_t are used below; including it here removes the include-order dependency
4 | // ReLU forward over an N x C x H x W float tensor (implemented in activation.cpp); s defaults to stream 0.
5 | void activation(cudnnHandle_t handle, int N, int C, int H, int W, float *input, float *output, cudaStream_t s=0);
6 | #endif


--------------------------------------------------------------------------------
/google-net_cudnn/batch-inception.cu:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | 
 3 | #include "cudnn.h"
 4 | #include "util.h"
 5 | #include <cmath>
 6 | #include "conv.h"
 7 | #include "pooling.h"
 8 | #include "activation.h"
 9 | #include "dropout.h"
10 | #include "lrn.h"
11 | #include "concat.h"
12 | #include "im2col.h"
13 | #include "gemm_kernel.h"
14 | 
15 | 
16 | /*
17 |  * Batched GoogLeNet inception module.
18 |  *
19 |  * This func will consume 6 filters (filter[filterIdx .. filterIdx + 5]) and 4 features.
20 |  * Use x[xIdx] as input, which should be set before this func. x[xIdx + 1] and
21 |  * x[xIdx + 2] receive the 3x3 / 5x5 reduce results, x[xIdx + 3] the pooled input.
22 |  * Use x[xIdx + 4] as output; *reC is set to its channel count (K1 + K3 + K5 + K6).
23 |  */
24 | void batchGoogleNetInception(cudnnHandle_t handle, const int N, const int C,
25 |         const int H, const int W, const int xIdx, const int filterIdx,
26 |         const int K1, const int K2, const int K3, const int K4, const int K5,
27 |         const int K6, int *reC, float **x, float** filter, float* buf,
28 |         const int *algo_best) {
29 |     /*
30 |      * Use x[xIdx + 4] as output. The four branch results are stored back to back
31 |      * (K1, K3, K5, K6 channels), so they form the channel concat directly when N == 1.
32 |      */
33 |     float *output = x[xIdx + 4];
34 |     float *output1 = output;
35 |     float *output2 = output1 + K1 * H * W;
36 |     float *output3 = output2 + K3 * H * W;
37 |     float *output4 = output3 + K5 * H * W;
38 | 
39 |     //pool (its result feeds the pool-projection GEMM below)
40 |     pooling(handle, N, C, H, W, 3, 3, 1, 1, 1, 1, H, W,
41 |             x[xIdx], x[xIdx + 3]);
42 | 
43 |     // the first four-batch conv: 1x1, 3x3 reduce, 5x5 reduce and pool proj in one launch.
44 |     // Their output widths are K1, K2, K4 and K6, so grid.y must cover the widest of exactly
45 |     // those four (K3 belongs to the later 3x3 conv; K6 was previously left out of the max).
46 |     int M_MAX = N * H * W;
47 |     int N_MAX = std::max(std::max(K1, K2), std::max(K4, K6));
48 |     dim3 grid_size((M_MAX - 1) / 16 + 1, (N_MAX - 1) / 16 + 1, 4);
49 |     dim3 block_size(64, 1, 1);
50 |     // 512 floats of dynamic shared memory: the tile kernels use sh[0..255] for A and sh[256..511] for B.
51 |     gemm_4<<<grid_size, block_size, (1U << 9) * sizeof(float)>>>(
52 |             N * H * W, K1, K2, K4, K6, C, H, W, x[xIdx], x[xIdx + 3],
53 |             filter[filterIdx], filter[filterIdx + 1], filter[filterIdx + 3],
54 |             filter[filterIdx + 5], output1, x[xIdx + 1], x[xIdx + 2], output4);
55 |     KernelErrChk();
56 | 
57 |     //relu 1*1
58 |     activation(handle, N, K1, H, W, output1, output1);
59 | 
60 |     //relu 3*3 reduce
61 |     activation(handle, N, K2, H, W, x[xIdx + 1],
62 |             x[xIdx + 1]);
63 | 
64 |     //3*3
65 |     // NOTE(review): C is passed as the input channel count although x[xIdx + 1] was produced
66 |     // with K2 channels; this mirrors cudnnGoogleNetInception - confirm it is intended.
67 |     int algo = algo_best[(filterIdx+2)*7];
68 |     conv(handle, N, C, H, W, K3, 3, 3, 1, 1, 1, 1, H, W,
69 |             x[xIdx + 1], filter[filterIdx+2], buf,
70 |             output2, algo);
71 | 
72 |     //relu 3*3
73 |     activation(handle, N, K3, H, W, output2, output2);
74 | 
75 |     //relu 5*5 reduce
76 |     activation(handle, N, K4, H, W, x[xIdx + 2],
77 |             x[xIdx + 2]);
78 | 
79 |     //5*5 (same NOTE(review) as the 3*3 conv: C is passed for a K4-channel input)
80 |     algo = algo_best[(filterIdx+4)*7];
81 |     conv(handle, N, C, H, W, K5, 5, 5, 1, 1, 2, 2, H, W,
82 |             x[xIdx + 2], filter[filterIdx+4], buf,
83 |             output3, algo);
84 | 
85 |     //relu 5*5
86 |     activation(handle, N, K5, H, W, output3, output3);
87 | 
88 |     //relu pool proj
89 |     activation(handle, N, K6, H, W, output4, output4);
90 | 
91 |     //compute return shape
92 |     *reC = K1 + K3 + K5 + K6;
93 | }
89 | 


--------------------------------------------------------------------------------
/google-net_cudnn/batch-inception.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * batch-inception.h
 3 |  *
 4 |  *  Created on: Nov 5, 2018
 5 |  *      Author: cambricon
 6 |  */
 7 | 
 8 | #ifndef BATCH_INCEPTION_H_
 9 | #define BATCH_INCEPTION_H_
10 | #include "cudnn.h"  /* cudnnHandle_t is used below; keeps this header independent of include order */
11 | /*
12 |  * Batched GoogLeNet inception module.
13 |  *
14 |  * This func will consume 6 filters and 4 features.
15 |  * Use x[xIdx] as input, which should be set before this func.
16 |  * Use x[xIdx + 4] as output; *reC receives its channel count (K1 + K3 + K5 + K6).
17 |  * buf is the workspace handed to conv(); algo_best is read at (filter index) * 7.
18 |  */
19 | void batchGoogleNetInception(cudnnHandle_t handle, const int N, const int C,
20 |         const int H, const int W, const int xIdx, const int filterIdx,
21 |         const int K1, const int K2, const int K3, const int K4, const int K5,
22 |         const int K6, int *reC, float **x, float** filter, float* buf,
23 |         const int *algo_best);
24 | 
25 | #endif /* BATCH_INCEPTION_H_ */
26 | 


--------------------------------------------------------------------------------
/google-net_cudnn/concat.cu:
--------------------------------------------------------------------------------
 1 | #include "util.h"
 2 | 
 3 | // Concatenates numIns inputs: for outer index o = blockIdx.x, the slabs
 4 | // ins[j][o*concatDims[j]*innerStride ...] (concatDims[j]*innerStride floats each) are written
 5 | // one after another into out. outerStride is not read here; the launcher uses it as the grid size.
 6 | __global__ void cudaConcatKernel(size_t numIns, size_t innerStride,
 7 |         size_t outerStride, size_t* concatDims, const float **ins, float *out) {
 8 |     // Floats occupied in `out` by one outer index: the sum of all input slabs.
 9 |     size_t batchSize = 0;
10 |     for (size_t i = 0; i < numIns; ++i) {
11 |         batchSize += concatDims[i]*innerStride;
12 |     }
13 | 
14 |     size_t iOuter = blockIdx.x;
15 |     float* outPtr = out + iOuter*batchSize;
16 |     for (size_t j = 0; j < numIns; ++j) {
17 |         const size_t slab = concatDims[j]*innerStride;
18 |         const float* inPtr = ins[j] + iOuter*slab;
19 |         // Block-strided copy of the whole (contiguous) slab. Unlike the former
20 |         // (innerStride - 1)/blockDim.x + 1 trip count, this cannot underflow when innerStride == 0.
21 |         for (size_t e = threadIdx.x; e < slab; e += blockDim.x) {
22 |             outPtr[e] = inPtr[e];
23 |         }
24 |         outPtr += slab;
25 |     }
26 | }
25 | 
26 | // Launches cudaConcatKernel with one block per outer index (outerStride blocks).
27 | void launchCudaConcatKernel(size_t numIns,
28 |         size_t innerStride, size_t outerStride, size_t* concatDims,
29 |         const float **ins, float *out) {
30 |     // Threads per block: innerStride rounded up to a multiple of 64 when that gives
31 |     // 64, 128 or 192; every other case (including innerStride == 0) uses 256.
32 |     const size_t chunks = (innerStride + 63)/64;
33 |     const size_t threads = (chunks >= 1 && chunks <= 3) ? chunks*64 : 256;
34 |     const size_t blocks = outerStride;
35 | 
36 |     cudaConcatKernel<<<blocks, threads, 0>>>(numIns,
37 |         innerStride, outerStride, concatDims, ins, out);
38 |     KernelErrChk();
39 | }
41 | 
42 | size_t* concatDims = new size_t[4];  // host staging table: channel count of each input
43 | float** ins = new float*[4];         // host staging table: pointer of each input
44 | // Joins four tensors (C1..C4 channels, H*W elements per channel, N outer slices) into output.
45 | void concat(int N, int H, int W, int C1, int C2, int C3, int C4,
46 |         float *input1, float *input2, float *input3, float *input4,
47 |         float *buf, float *output) {
48 |     const int channels[4] = {C1, C2, C3, C4};
49 |     float* const sources[4] = {input1, input2, input3, input4};
50 |     for (int i = 0; i < 4; ++i) {
51 |         concatDims[i] = static_cast<size_t>(channels[i]);
52 |         ins[i] = sources[i];
53 |     }
54 | 
55 |     // Device copies of both tables are placed in buf: sizes at its start, pointers 128 floats further in.
56 |     size_t *devDims = (size_t*)buf;
57 |     const float **devPtrs = (const float **)(buf + 128);//bigger step
58 |     ErrChk(cudaMemcpy(devPtrs, ins, 4*sizeof(float*), cudaMemcpyHostToDevice));
59 |     ErrChk(cudaMemcpy(devDims, concatDims, 4*sizeof(size_t), cudaMemcpyHostToDevice));
60 | 
61 |     launchCudaConcatKernel((size_t)4, size_t(H * W), size_t(N), devDims, (const float **)devPtrs, output);
62 | }
62 | 


--------------------------------------------------------------------------------
/google-net_cudnn/concat.h:
--------------------------------------------------------------------------------
1 | #ifndef __CONCAT_H__
2 | #define __CONCAT_H__
3 | void concat(int N, int H, int W, int C1, int C2, int C3, int C4, // N outer slices, H*W elements per channel, C1..C4 channels per input
4 |         float *input1, float *input2, float *input3, float *input4, // the four tensors to join; the kernel dereferences them on the device
5 |         float *buf, float *output); // buf: device scratch that receives the size/pointer tables; output: concatenated result
6 | #endif
7 | 


--------------------------------------------------------------------------------
/google-net_cudnn/conv.cpp:
--------------------------------------------------------------------------------
 1 | #include "cudnn.h"
 2 | #include "util.h"
 3 | #include <cmath>
 4 | 
 5 | void conv(cudnnHandle_t handle, int N, int C, int H, int W, int K, int R, int S,
 6 |         int U, int V, int pad_h, int pad_w, int P, int Q,
 7 |         float *input, float *filter,
 8 |         float *buf, float *output,
 9 |         int algo,
10 |         cudaStream_t s){
11 | 	// Forward convolution through cuDNN: float data, NCHW layout, cross-correlation mode.
12 | 	//   input N x C x H x W, filter K x C x R x S, output N x K x P x Q
13 | 	//   pad_h/pad_w and U/V are given to cudnnSetConvolution2dDescriptor as padding and stride; dilation is 1
14 | 	//   buf is the cuDNN workspace (its capacity is not checked here); algo is cast to cudnnConvolutionFwdAlgo_t
15 | 	const cudnnConvolutionFwdAlgo_t fwdAlgo = (cudnnConvolutionFwdAlgo_t)algo;
16 | 	const float alpha = 1.0f, beta = 0.0f;
17 | 	size_t workspaceBytes;
18 | 	cudnnTensorDescriptor_t inDesc, outDesc;
19 | 	cudnnFilterDescriptor_t wDesc;
20 | 	cudnnConvolutionDescriptor_t opDesc;
21 | 
22 | 	ErrChk(cudnnSetStream(handle, s));
23 | 
24 | 	ErrChk(cudnnCreateTensorDescriptor(&inDesc));
25 | 	ErrChk(cudnnSetTensor4dDescriptor(inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
26 | 	ErrChk(cudnnCreateTensorDescriptor(&outDesc));
27 | 	ErrChk(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, K, P, Q));
28 | 	ErrChk(cudnnCreateFilterDescriptor(&wDesc));
29 | 	ErrChk(cudnnSetFilter4dDescriptor(wDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S));
30 | 	ErrChk(cudnnCreateConvolutionDescriptor(&opDesc));
31 | 	ErrChk(cudnnSetConvolution2dDescriptor(opDesc, pad_h, pad_w, U, V, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
32 | 
33 | 	// Query the workspace size for the chosen algorithm, then run it with buf as that workspace.
34 | 	ErrChk(cudnnGetConvolutionForwardWorkspaceSize(handle, inDesc, wDesc, opDesc, outDesc, fwdAlgo, &workspaceBytes));
35 | 	ErrChk(cudnnConvolutionForward(handle, &alpha, inDesc, input, wDesc, filter, opDesc, fwdAlgo, buf, workspaceBytes, &beta, outDesc, output));
36 | 
37 | 	ErrChk(cudnnDestroyConvolutionDescriptor(opDesc));
38 | 	ErrChk(cudnnDestroyFilterDescriptor(wDesc));
39 | 	ErrChk(cudnnDestroyTensorDescriptor(outDesc));
40 | 	ErrChk(cudnnDestroyTensorDescriptor(inDesc));
41 | }
42 | 


--------------------------------------------------------------------------------
/google-net_cudnn/conv.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CONV_H__
 2 | #define __CONV_H__
 3 | #include "cudnn.h"  // cudnnHandle_t / cudaStream_t are used below; removes the include-order dependency
 4 | // cuDNN forward convolution (see conv.cpp): input N x C x H x W, filter K x C x R x S, output N x K x P x Q.
 5 | void conv(cudnnHandle_t handle, int N, int C, int H, int W, int K, int R, int S,
 6 |         int U, int V, int pad_h, int pad_w, int P, int Q,
 7 |         float *input, float *filter,
 8 |         float *buf, float *output,
 9 |         int algo,
10 |         cudaStream_t s=0);
11 | #endif


--------------------------------------------------------------------------------
/google-net_cudnn/dropout.cpp:
--------------------------------------------------------------------------------
 1 | #include "cudnn.h"
 2 | #include "util.h"
 3 | #include <cmath>
 4 | 
 5 | void dropout(cudnnHandle_t handle, float dropout, int N, int C, int H, int W,
 6 |         float *input, float *buf, float *output) {
 7 | 	// cuDNN dropout forward on an N x C x H x W float tensor (NCHW) with drop probability `dropout`.
 8 | 	// buf supplies both scratch areas: the RNG states at buf, the reserve space at buf + stateBytes.
 9 | 	// NOTE(review): buf is a float*, so that offset is stateBytes floats (4x the bytes) - confirm buf is sized for it.
10 | 	size_t stateBytes, reserveBytes;
11 | 	cudnnDropoutDescriptor_t dropDesc;
12 | 	cudnnTensorDescriptor_t inDesc, outDesc;
13 | 
14 | 	// The descriptor is set up on every call, always with the fixed seed 462565.
15 | 	ErrChk(cudnnCreateDropoutDescriptor(&dropDesc));
16 | 	ErrChk(cudnnDropoutGetStatesSize(handle, &stateBytes));
17 | 	ErrChk(cudnnSetDropoutDescriptor(dropDesc, handle, dropout, buf, stateBytes, 462565));
18 | 	ErrChk(cudnnCreateTensorDescriptor(&inDesc));
19 | 	ErrChk(cudnnSetTensor4dDescriptor(inDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
20 | 	ErrChk(cudnnCreateTensorDescriptor(&outDesc));
21 | 	ErrChk(cudnnSetTensor4dDescriptor(outDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
22 | 
23 | 	ErrChk(cudnnDropoutGetReserveSpaceSize(inDesc, &reserveBytes));
24 | 	ErrChk(cudnnDropoutForward(handle, dropDesc, inDesc, input, outDesc, output, buf + stateBytes, reserveBytes));
25 | 
26 | 	ErrChk(cudnnDestroyDropoutDescriptor(dropDesc));
27 | 	ErrChk(cudnnDestroyTensorDescriptor(outDesc));
28 | 	ErrChk(cudnnDestroyTensorDescriptor(inDesc));
29 | }
30 | 


--------------------------------------------------------------------------------
/google-net_cudnn/dropout.h:
--------------------------------------------------------------------------------
1 | #ifndef __DROPOUT_H__
2 | #define __DROPOUT_H__
3 | #include "cudnn.h"  // cudnnHandle_t is used below; removes the include-order dependency
4 | // cuDNN dropout forward (see dropout.cpp); buf provides the RNG-state and reserve-space scratch.
5 | void dropout(cudnnHandle_t handle, float dropout, int N, int C, int H, int W,
6 |         float *input, float *buf, float *output);
7 | #endif


--------------------------------------------------------------------------------
/google-net_cudnn/gemm_kernel.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * gemm_kernel.h
  3 |  *
  4 |  *  Created on: Nov 5, 2018
  5 |  *      Author: cambricon
  6 |  */
  7 | 
  8 | #ifndef GEMM_KERNEL_H_
  9 | #define GEMM_KERNEL_H_
 10 | // gemm_64_16x16_1: one thread block computes a 16x16 tile of C = A*B and scatters it into an
 11 | // [image][N channels][P*Q] buffer. The indexing reads A as A[k*M + m] and B as B[n*K + k]; sh must hold 512 floats.
 12 | // Author's precondition: (N*P*Q)%16==0 && (P*Q)%4==0, where N*P*Q is the GEMM M (images * P * Q).
 13 | __device__ void gemm_64_16x16_1(int M, int N, int K, int P, int Q, float *A, float *B, float *C, float *sh){
 14 | 	// sh_A and sh_B each span 256 floats: two 128-float (16x8) tiles used as a double buffer.
 15 | 	float* sh_A = sh;
 16 |     float* sh_B = sh + 2*16*8;
 17 | 	// Accumulators: 4 consecutive rows (m) of a single column (n) per thread.
 18 |     float4 reg_C;
 19 | 	reg_C.x =0.f;
 20 | 	reg_C.y =0.f;
 21 | 	reg_C.z =0.f;
 22 | 	reg_C.w =0.f;
 23 | 
 24 |     float reg_A[8];
 25 |     float reg_B[2];
 26 | 
 27 |     // Tile origin: blockIdx.y selects the 16 columns (n), blockIdx.x the 16 rows (m).
 28 |     int block_base_x = blockIdx.y*16;
 29 |     int block_base_y = blockIdx.x*16;
 30 | 
 31 |     // First A tile, global -> shared: thread t copies rows (t%8)*2 and +1 of k-slice t/8, i.e. sh_A[k*16 + m].
 32 |     float2 *A_start = (float2*) (A + block_base_y + (threadIdx.x%8)*2 + (threadIdx.x/8)*M);
 33 |     *((float2*) (sh_A + 2*threadIdx.x)) = *(A_start);
 34 |     // NOTE(review): the float2 accesses presumably need M, K even and A, B 8-byte aligned - confirm.
 35 |     // First B tile, global -> shared: thread t copies k = (t/16)*2 and +1 of column t%16.
 36 |     float2 *B_start = (float2*) (B + K*block_base_x + (threadIdx.x/16)*2 + (threadIdx.x%16)*K);
 37 |     *((float2*) (sh_B + 2*threadIdx.x)) = *(B_start);
 38 | 
 39 |     int double_buffer = 0;
 40 | #pragma unroll
 41 |     for(int k=0; k<K; k+=8){
 42 |         __syncthreads();
 43 |         int A_offset = double_buffer + (threadIdx.x%4)*4;
 44 |         int B_offset = double_buffer + ((threadIdx.x/4)*2);
 45 |         // 4 steps per tile; each step consumes two k-slices (both shared offsets advance by 32 floats).
 46 | #pragma unroll
 47 |         for (int i=0; i<8; i+=2)    {
 48 | 
 49 |             reg_A[0] = sh_A[A_offset];
 50 |             reg_A[1] = sh_A[A_offset+1];
 51 |             reg_A[2] = sh_A[A_offset+2];
 52 |             reg_A[3] = sh_A[A_offset+3];
 53 |             reg_A[4] = sh_A[A_offset+16];
 54 |             reg_A[5] = sh_A[A_offset+17];
 55 |             reg_A[6] = sh_A[A_offset+18];
 56 |             reg_A[7] = sh_A[A_offset+19];
 57 | 
 58 |             reg_B[0] = sh_B[B_offset];
 59 |             reg_B[1] = sh_B[B_offset+1];
 60 | 
 61 |             reg_C.x = fma(reg_A[0], reg_B[0], reg_C.x);
 62 |             reg_C.y = fma(reg_A[1], reg_B[0], reg_C.y);
 63 |             reg_C.z = fma(reg_A[2], reg_B[0], reg_C.z);
 64 |             reg_C.w = fma(reg_A[3], reg_B[0], reg_C.w);
 65 |             reg_C.x = fma(reg_A[4], reg_B[1], reg_C.x);
 66 |             reg_C.y = fma(reg_A[5], reg_B[1], reg_C.y);
 67 |             reg_C.z = fma(reg_A[6], reg_B[1], reg_C.z);
 68 |             reg_C.w = fma(reg_A[7], reg_B[1], reg_C.w);
 69 | 
 70 |             A_offset += 32;
 71 |             B_offset += 32;
 72 |         }
 73 |         // Switch to the other half of each buffer (offset 0 <-> 128).
 74 |         double_buffer ^= 128;
 75 |         // Prefetch the next 8 k-slices into that half; the barrier at the loop top publishes them.
 76 |         if (k+8 < K){
 77 |             A_start += 4*M;
 78 |             *((float2*) (sh_A + double_buffer + 2*threadIdx.x)) = *(A_start);
 79 |             B_start += 4;
 80 |             *((float2*) (sh_B + double_buffer + 2*threadIdx.x)) = *(B_start);
 81 |         }
 82 |     }
 83 |     // Store: GEMM row r maps to image r/(P*Q), pixel r%(P*Q); the column is channel blockIdx.y*16 + threadIdx.x/4.
 84 | 	int ind = blockIdx.x*16 + (threadIdx.x%4)*4;
 85 |     int C_offset = ind/(P*Q)*(P*Q*N) + ind%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
 86 |     C[C_offset] = reg_C.x;
 87 |     C[C_offset+1] = reg_C.y;
 88 |     C[C_offset+2] = reg_C.z;
 89 |     C[C_offset+3] = reg_C.w;
 90 | }
 91 | // gemm_64_16x16_2: same 16x16 tile product as gemm_64_16x16_1 (A read as A[k*M + m], B as B[n*K + k], 512 floats at sh),
 92 | // for (N*P*Q)%16==0 && (P*Q)%4!=0: each of a thread's 4 results gets its own output offset instead of a contiguous store.
 93 | __device__ void gemm_64_16x16_2(int M, int N, int K, int P, int Q, float *A, float *B, float *C, float *sh){
 94 | 	// sh_A and sh_B each span 256 floats: two 128-float (16x8) tiles used as a double buffer.
 95 | 	float* sh_A = sh;
 96 |     float* sh_B = sh + 2*16*8;
 97 | 	// Accumulators: 4 consecutive rows (m) of a single column (n) per thread.
 98 |     float4 reg_C;
 99 | 	reg_C.x =0.f;
100 | 	reg_C.y =0.f;
101 | 	reg_C.z =0.f;
102 | 	reg_C.w =0.f;
103 | 
104 |     float reg_A[8];
105 |     float reg_B[2];
106 | 
107 |     // Tile origin: blockIdx.y selects the 16 columns (n), blockIdx.x the 16 rows (m).
108 |     int block_base_x = blockIdx.y*16;
109 |     int block_base_y = blockIdx.x*16;
110 | 
111 |     // First A tile, global -> shared: thread t copies rows (t%8)*2 and +1 of k-slice t/8, i.e. sh_A[k*16 + m].
112 |     float2 *A_start = (float2*) (A + block_base_y + (threadIdx.x%8)*2 + (threadIdx.x/8)*M);
113 |     *((float2*) (sh_A + 2*threadIdx.x)) = *(A_start);
114 |     // NOTE(review): the float2 accesses presumably need M, K even and A, B 8-byte aligned - confirm.
115 |     // First B tile, global -> shared: thread t copies k = (t/16)*2 and +1 of column t%16.
116 |     float2 *B_start = (float2*) (B + K*block_base_x + (threadIdx.x/16)*2 + (threadIdx.x%16)*K);
117 |     *((float2*) (sh_B + 2*threadIdx.x)) = *(B_start);
118 | 
119 |     int double_buffer = 0;
120 | #pragma unroll
121 |     for(int k=0; k<K; k+=8){
122 |         __syncthreads();
123 |         int A_offset = double_buffer + (threadIdx.x%4)*4;
124 |         int B_offset = double_buffer + ((threadIdx.x/4)*2);
125 |         // 4 steps per tile; each step consumes two k-slices (both shared offsets advance by 32 floats).
126 | #pragma unroll
127 |         for (int i=0; i<8; i+=2)    {
128 | 
129 |             reg_A[0] = sh_A[A_offset];
130 |             reg_A[1] = sh_A[A_offset+1];
131 |             reg_A[2] = sh_A[A_offset+2];
132 |             reg_A[3] = sh_A[A_offset+3];
133 |             reg_A[4] = sh_A[A_offset+16];
134 |             reg_A[5] = sh_A[A_offset+17];
135 |             reg_A[6] = sh_A[A_offset+18];
136 |             reg_A[7] = sh_A[A_offset+19];
137 | 
138 |             reg_B[0] = sh_B[B_offset];
139 |             reg_B[1] = sh_B[B_offset+1];
140 | 
141 |             reg_C.x = fma(reg_A[0], reg_B[0], reg_C.x);
142 |             reg_C.y = fma(reg_A[1], reg_B[0], reg_C.y);
143 |             reg_C.z = fma(reg_A[2], reg_B[0], reg_C.z);
144 |             reg_C.w = fma(reg_A[3], reg_B[0], reg_C.w);
145 |             reg_C.x = fma(reg_A[4], reg_B[1], reg_C.x);
146 |             reg_C.y = fma(reg_A[5], reg_B[1], reg_C.y);
147 |             reg_C.z = fma(reg_A[6], reg_B[1], reg_C.z);
148 |             reg_C.w = fma(reg_A[7], reg_B[1], reg_C.w);
149 | 
150 |             A_offset += 32;
151 |             B_offset += 32;
152 |         }
153 |         // Switch to the other half of each buffer (offset 0 <-> 128).
154 |         double_buffer ^= 128;
155 |         // Prefetch the next 8 k-slices into that half; the barrier at the loop top publishes them.
156 |         if (k+8 < K){
157 |             A_start += 4*M;
158 |             *((float2*) (sh_A + double_buffer + 2*threadIdx.x)) = *(A_start);
159 |             B_start += 4;
160 |             *((float2*) (sh_B + double_buffer + 2*threadIdx.x)) = *(B_start);
161 |         }
162 |     }
163 |     // Store: each row ind..ind+3 is mapped separately (image r/(P*Q), pixel r%(P*Q)) because consecutive rows may fall in different images.
164 | 	int ind = blockIdx.x*16 + (threadIdx.x%4)*4;
165 |     int C_offset = ind/(P*Q)*(P*Q*N) + ind%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
166 |     C[C_offset] = reg_C.x;
167 |     C_offset = (ind+1)/(P*Q)*(P*Q*N) + (ind+1)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
168 |     C[C_offset] = reg_C.y;
169 |     C_offset = (ind+2)/(P*Q)*(P*Q*N) + (ind+2)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
170 |     C[C_offset] = reg_C.z;
171 |     C_offset = (ind+3)/(P*Q)*(P*Q*N) + (ind+3)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
172 |     C[C_offset] = reg_C.w;
173 | }
174 | // gemm_64_16x16_3: 16x16 tile of C = A*B for the (N*P*Q%16!=0) case, i.e. M not a multiple of 16.
175 | // A is read as A[k*M + m], B as B[n*K + k]; loads are wrapped with % so they stay inside A/B, stores are masked to rows < M.
176 | __device__ void gemm_64_16x16_3(int M, int N, int K, int P, int Q, float *A, float *B, float *C, float *sh){
177 |    // sh must hold 512 floats: two 128-float tiles of A at sh, two of B at sh + 256.
178 |    float* sh_A = sh;
179 |    float* sh_B = sh + 2*16*8;
180 | 
181 |    float reg_C[4];
182 |    reg_C[0] = 0.f;
183 |    reg_C[1] = 0.f;
184 |    reg_C[2] = 0.f;
185 |    reg_C[3] = 0.f;
186 | 
187 |    float reg_A[8]={0.f};
188 |    float reg_B[2]={0.f};
189 | 
190 |    // Tile origin: blockIdx.y selects the 16 columns (n), blockIdx.x the 16 rows (m).
191 |    int block_base_x = blockIdx.y*16;
192 |    int block_base_y = blockIdx.x*16;
193 | 
194 | 
195 |    // First A tile, global -> shared: scalar loads, index wrapped into [0, M*K).
196 |    int A_offset = block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M;
197 |    sh_A[threadIdx.x] = A[A_offset%(M*K)];
198 |    sh_A[threadIdx.x+64] = A[(A_offset+4*M)%(M*K)];
199 | 
200 |    // First B tile, global -> shared: scalar loads, index wrapped into [0, K*N).
201 |    int B_offset =  K*block_base_x + (threadIdx.x/16)*2 + (threadIdx.x%16)*K;
202 |    sh_B[threadIdx.x*2] = B[B_offset%(K*N)];
203 |    sh_B[threadIdx.x*2+1] = B[(B_offset+1)%(K*N)];
204 | 
205 |    int double_buffer = 0;
206 | #pragma unroll
207 |    for(int k=0; k<K; k+=8){
208 |        __syncthreads();
209 |        int shA_offset = double_buffer + (threadIdx.x%4)*4;
210 |        int shB_offset = double_buffer + ((threadIdx.x/4)*2);
211 | #pragma unroll
212 |        for (int i=0; i<8; i+=2)    {
213 | 
214 |            reg_A[0] = sh_A[shA_offset];
215 |            reg_A[1] = sh_A[shA_offset+1];
216 |            reg_A[2] = sh_A[shA_offset+2];
217 |            reg_A[3] = sh_A[shA_offset+3];
218 |            reg_A[4] = sh_A[shA_offset+16];
219 |            reg_A[5] = sh_A[shA_offset+17];
220 |            reg_A[6] = sh_A[shA_offset+18];
221 |            reg_A[7] = sh_A[shA_offset+19];
222 | 
223 |            reg_B[0] = sh_B[shB_offset];
224 |            reg_B[1] = sh_B[shB_offset+1];
225 | 
226 |            reg_C[0] = fma(reg_A[0], reg_B[0], reg_C[0]);
227 |            reg_C[1] = fma(reg_A[1], reg_B[0], reg_C[1]);
228 |            reg_C[2] = fma(reg_A[2], reg_B[0], reg_C[2]);
229 |            reg_C[3] = fma(reg_A[3], reg_B[0], reg_C[3]);
230 |            reg_C[0] = fma(reg_A[4], reg_B[1], reg_C[0]);
231 |            reg_C[1] = fma(reg_A[5], reg_B[1], reg_C[1]);
232 |            reg_C[2] = fma(reg_A[6], reg_B[1], reg_C[2]);
233 |            reg_C[3] = fma(reg_A[7], reg_B[1], reg_C[3]);
234 | 
235 |            shA_offset += 32;
236 |            shB_offset += 32;
237 |        }
238 |        // Switch halves exactly once per k-tile. There is only one barrier per iteration, so the
239 |        // prefetch below must go to the half nobody is reading; toggling twice kept it on the live tile.
240 |        double_buffer ^= 128;
241 | 
242 |        if (k+8 < K){
243 |            A_offset += 8*M;
244 |            sh_A[double_buffer+threadIdx.x] = A[A_offset%(M*K)];
245 |            sh_A[double_buffer+threadIdx.x+64] = A[(A_offset+4*M)%(M*K)];
246 |            B_offset += 8;
247 |            sh_B[double_buffer+threadIdx.x*2] = B[B_offset%(K*N)];
248 |            sh_B[double_buffer+threadIdx.x*2+1] = B[(B_offset+1)%(K*N)];
249 |        }
250 |    }
251 | 
252 |    // Store: GEMM row r maps to image r/(P*Q), pixel r%(P*Q); the column is channel blockIdx.y*16 + threadIdx.x/4.
253 |    int ind = blockIdx.x*16 + (threadIdx.x%4)*4;
254 |    int C_offset = ind/(P*Q)*(P*Q*N) + ind%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
255 | 
256 |    if (blockIdx.x<M/16){
257 |        // Full tile: all four rows exist.
258 |        C[C_offset] = reg_C[0];
259 |        C_offset = (ind+1)/(P*Q)*(P*Q*N) + (ind+1)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
260 |        C[C_offset] = reg_C[1];
261 |        C_offset = (ind+2)/(P*Q)*(P*Q*N) + (ind+2)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
262 |        C[C_offset] = reg_C[2];
263 |        C_offset = (ind+3)/(P*Q)*(P*Q*N) + (ind+3)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
264 |        C[C_offset] = reg_C[3];
265 |    }
266 |    else{
267 |        // Ragged last tile: only the first M%16 rows of the tile are written.
268 |        int ruler = (threadIdx.x%4)*4;
269 |        int rag = M%16;
270 |        if ((ruler)<rag){
271 |            C[C_offset] = reg_C[0];
272 |        }
273 |        if ((ruler+1)<rag){
274 |            C_offset = (ind+1)/(P*Q)*(P*Q*N) + (ind+1)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
275 |            C[C_offset] = reg_C[1];
276 |        }
277 |        if ((ruler+2)<rag){
278 |            C_offset = (ind+2)/(P*Q)*(P*Q*N) + (ind+2)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
279 |            C[C_offset] = reg_C[2];
280 |        }
281 |        if ((ruler+3)<rag){
282 |            C_offset = (ind+3)/(P*Q)*(P*Q*N) + (ind+3)%(P*Q) + (threadIdx.x/4)*(P*Q) + blockIdx.y*16*(P*Q);
283 |            C[C_offset] = reg_C[3];
284 |        }
285 |    }
286 | }
284 | // gemm_2: two independent GEMMs in one launch. blockIdx.z == 0 selects problem 1 (M1, N1, K1, A1, B1, C1),
285 | // any other z selects problem 2. Each block handles one 16x16 tile; the tile kernels expect 64 threads
286 | // per block and 512 floats of dynamic shared memory. P, Q: spatial size used when scattering results into C.
287 | __global__ void gemm_2(int M1, int M2, int N1, int N2, int K1, int K2, int P, int Q, float *A1, float *A2, float *B1, float *B2, float *C1, float *C2){
288 |     extern __shared__ float sh[];
289 |     const bool first = (blockIdx.z == 0);
290 |     int M = first ? M1 : M2;
291 |     int N = first ? N1 : N2;
292 |     int K = first ? K1 : K2;
293 |     float *A = first ? A1 : A2;
294 |     float *B = first ? B1 : B2;
295 |     float *C = first ? C1 : C2;
296 | 
297 |     // The grid is sized for the larger problem, so drop tiles that start outside this one. The former
298 |     // test also let through one tile past the edge whenever M or N was not a multiple of 16, and that
299 |     // tile stored beyond row M / channel N. blockIdx is uniform per block, so no barrier is left waiting.
300 |     if (blockIdx.x*16 >= M || blockIdx.y*16 >= N) return;
301 | 
302 |     if (M%16 != 0){
303 |         //(N*P*Q%16!=0): wrapped loads, masked stores
304 |         gemm_64_16x16_3(M, N, K, P, Q, A, B, C, sh);
305 |     }
306 |     else if ((P*Q)%4 == 0){
307 |         //(N*P*Q)%16==0 && (P*Q)%4==0: a thread's 4 rows never straddle two images (was tested as P%2==0)
308 |         gemm_64_16x16_1(M, N, K, P, Q, A, B, C, sh);
309 |     }
310 |     else{
311 |         //(N*P*Q)%16==0 && (P*Q)%4!=0: per-row output offsets
312 |         gemm_64_16x16_2(M, N, K, P, Q, A, B, C, sh);
313 |     }
314 | }
315 | // gemm_4: four GEMMs sharing M and K in one launch; blockIdx.z picks the problem. z = 0..2 multiply A1 by
316 | // B1..B3 into C1..C3 (widths N1..N3), z = 3 multiplies A2 by B4 into C4 (width N4). Each block handles one
317 | // 16x16 tile; the tile kernels expect 64 threads per block and 512 floats of dynamic shared memory.
318 | __global__ void gemm_4(int M, int N1, int N2, int N3, int N4, int K, int P, int Q, float *A1, float *A2, float *B1, float *B2, float *B3, float *B4, float *C1, float *C2, float *C3, float *C4){
319 |     extern __shared__ float sh[];
320 |     int N;
321 |     float *A, *B, *C;
322 |     // Select this block's problem.
323 |     switch(blockIdx.z){
324 |     case 0:
325 |         N = N1;
326 |         A = A1;
327 |         B = B1;
328 |         C = C1;
329 |         break;
330 |     case 1:
331 |         N = N2;
332 |         A = A1;
333 |         B = B2;
334 |         C = C2;
335 |         break;
336 |     case 2:
337 |         N = N3;
338 |         A = A1;
339 |         B = B3;
340 |         C = C3;
341 |         break;
342 |     case 3:
343 |         N = N4;
344 |         A = A2;
345 |         B = B4;
346 |         C = C4;
347 |         break;
348 |     default:
349 |         return;  // gridDim.z > 4: nothing to do (N, A, B, C would otherwise be read uninitialised)
350 |     }
351 |     // The grid is sized for the widest problem, so drop tiles that start outside this one. The former
352 |     // test also let through one tile past the edge whenever M or N was not a multiple of 16, and that
353 |     // tile stored beyond row M / channel N. blockIdx is uniform per block, so no barrier is left waiting.
354 |     if (blockIdx.x*16 >= M || blockIdx.y*16 >= N) return;
355 |     if (M%16 != 0){
356 |         //(N*P*Q%16!=0): wrapped loads, masked stores
357 |         gemm_64_16x16_3(M, N, K, P, Q, A, B, C, sh);
358 |     }
359 |     else if ((P*Q)%4 == 0){
360 |         //(N*P*Q)%16==0 && (P*Q)%4==0: a thread's 4 rows never straddle two images (was tested as P%2==0)
361 |         gemm_64_16x16_1(M, N, K, P, Q, A, B, C, sh);
362 |     }
363 |     else{
364 |         //(N*P*Q)%16==0 && (P*Q)%4!=0: per-row output offsets
365 |         gemm_64_16x16_2(M, N, K, P, Q, A, B, C, sh);
366 |     }
367 | }
368 | 
369 | 
370 | 
371 | #endif /* GEMM_KERNEL_H_ */
372 | 


--------------------------------------------------------------------------------
/google-net_cudnn/im2col.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * im2col.h
  3 |  *
  4 |  *  Created on: Nov 5, 2018
  5 |  *      Author: cambricon
  6 |  */
  7 | 
  8 | #ifndef IM2COL_H_
  9 | #define IM2COL_H_
 10 | 
 11 | template<int BLOCK_SIZE>
 12 | __global__ void im2col_1101(float *out, float *in, int N, int C, int H, int W){
 13 | 	// Re-lays an NCHW tensor out as [C][N][H*W]. One block per (n, c) pair (blockIdx.x = n*C + c)
 14 | 	// copies that H*W plane; BLOCK_SIZE is the copy stride, so blockDim.x is expected to equal it.
 15 | 	const int plane = H*W;
 16 | 	const int n = blockIdx.x/C;
 17 | 	const int c = blockIdx.x%C;
 18 | 	const int fullChunks = plane/BLOCK_SIZE;
 19 | 	const int tail = plane%BLOCK_SIZE;
 20 | 
 21 | 	float *src = in + n*C*plane + c*plane;
 22 | 	float *dst = out + c*N*plane + n*plane;
 23 | 
 24 | 	// Whole BLOCK_SIZE-wide chunks first ...
 25 | 	for (int chunk=0; chunk<fullChunks; ++chunk){
 26 | 		const int e = chunk*BLOCK_SIZE + threadIdx.x;
 27 | 		dst[e] = src[e];
 28 | 	}
 29 | 	// ... then the remainder, handled by the first `tail` threads.
 30 | 	if (tail!=0 && threadIdx.x<tail){
 31 | 		const int e = fullChunks*BLOCK_SIZE + threadIdx.x;
 32 | 		dst[e] = src[e];
 33 | 	}
 34 | }
 31 | 
 32 | template<int BLOCK_SIZE>
 33 | __global__ void im2col_3311_version1(float *out, float *in, int N, int C, int H, int W, int R, int S, int P, int Q){
 34 | 	// Grid: blockIdx.x in [0, Q+S-1) is decoded into (q, s) below, blockIdx.y = n, blockIdx.z = c. Per the author, each block is assigned a series of P*R data blocks along the diagonal.
 35 | 	int c = blockIdx.z;
 36 | 	int n = blockIdx.y;
 37 | 	// blockIdx.x < Q gives (q, s) = (blockIdx.x, 0); beyond that q stays at Q-1 and s counts upward.
 38 | 	int q = (blockIdx.x>=Q)? (Q-1):blockIdx.x;
 39 | 	int s = (blockIdx.x>=Q)? (blockIdx.x-Q+1):0;
 40 | 	// Number of copies of the staged row that the loop at the bottom writes.
 41 | 	int task = (q>1 && s==0)? 3:2;
 42 | 	// Dynamic shared memory; W+2 floats are used: one input row with a zero on either side.
 43 | 	extern __shared__ float line_buffer[];
 44 | 	// First destination element of this block.
 45 | 	float *result = out + c*N*Q*S*P*R + n*P*Q + s*(N*P*Q*R) + q*P;
 46 | 	if ( ((q==0) && (s==0)) || ( (q==(Q-1)) && (s==(S-1)) ) ) {
 47 | 		// The two corner blocks write zeros only: P*R elements, BLOCK_SIZE at a time, then the remainder.
 48 | 		for(int j=0; j<(P*R)/BLOCK_SIZE; ++j){
 49 | 
 50 | 			int y = (j*BLOCK_SIZE+threadIdx.x)/P;
 51 | 			int x = (j*BLOCK_SIZE+threadIdx.x)%P;
 52 | 
 53 | 			int ind = y*P*Q*N + x;
 54 | 
 55 | 			result[ind] = 0.f;
 56 | 		}
 57 | 
 58 | 		if (((P*R)%BLOCK_SIZE)!=0 && threadIdx.x<((P*R)%BLOCK_SIZE)){
 59 | 			int y = (((P*R)/BLOCK_SIZE)*BLOCK_SIZE + threadIdx.x)/P;
 60 | 			int x = (((P*R)/BLOCK_SIZE)*BLOCK_SIZE + threadIdx.x)%P;
 61 | 
 62 | 			int ind = y*P*Q*N + x;
 63 | 
 64 | 			result[ind] = 0.f;
 65 | 		}
 66 | 	}
 67 | 	else {
 68 | 		float *data = in + n*C*H*W + c*H*W + (q+s-1)*W;
 69 | 		line_buffer[0] = 0.f;
 70 | 		// Stage input row (q+s-1) of plane (n, c) into line_buffer[1..W]. BLOCK_SIZE is the copy stride, so blockDim.x is expected to equal it.
 71 | 		for(int j=0; j<W/BLOCK_SIZE; ++j)
 72 | 			line_buffer[1+threadIdx.x+j*BLOCK_SIZE] = data[threadIdx.x+j*BLOCK_SIZE];
 73 | 
 74 | 		if ((W%BLOCK_SIZE)!=0 && threadIdx.x<(W%BLOCK_SIZE))
 75 | 			line_buffer[1+threadIdx.x+(W/BLOCK_SIZE)*BLOCK_SIZE] = data[threadIdx.x+(W/BLOCK_SIZE)*BLOCK_SIZE];
 76 | 
 77 | 		line_buffer[W+1] = 0.f;
 78 | 		__syncthreads();
 79 | 		// Emit the row `task` times: element (y, x) of each copy takes line_buffer[y+x], then result moves by N*P*Q*R - Q.
 80 | 		for (int i=0; i<task; ++i){
 81 | 			for(int j=0; j<(P*R)/BLOCK_SIZE; ++j){
 82 | 
 83 | 				int y = (j*BLOCK_SIZE+threadIdx.x)/P;
 84 | 				int x = (j*BLOCK_SIZE+threadIdx.x)%P;
 85 | 
 86 | 				int ind = y*P*Q*N + x;
 87 | 
 88 | 				result[ind] = line_buffer[y+x];
 89 | 			}
 90 | 
 91 | 			if (((P*R)%BLOCK_SIZE)!=0 && threadIdx.x<((P*R)%BLOCK_SIZE)){
 92 | 				int y = (((P*R)/BLOCK_SIZE)*BLOCK_SIZE + threadIdx.x)/P;
 93 | 				int x = (((P*R)/BLOCK_SIZE)*BLOCK_SIZE + threadIdx.x)%P;
 94 | 
 95 | 				int ind = y*P*Q*N + x;
 96 | 
 97 | 				result[ind] = line_buffer[y+x];
 98 | 			}
 99 | 			result += (N*P*Q*R - Q);
100 | 		}
101 | 	}
102 | }
103 | // im2col: picks an im2col path from the filter geometry. 1x1 / stride 1 / no padding with N == 1 needs no
104 | // transform, so output is left untouched. Other 1x1 cases use im2col_1101, 3x3 / stride 1 / pad 1 uses
105 | // im2col_3311_version1, and everything else falls back to cudnnIm2Col. Only R, U and pad_h are tested.
106 | static void im2col(cudnnHandle_t handle, int N, int C, int H, int W, int K, int R, int S, int U, int V, int pad_h, int pad_w, float *input, float *output){
107 | 
108 | 	int P = H;
109 | 	int Q = W;
110 | 
111 | 	if (N==1 && R==1 && U==1 && pad_h==0){
112 | 		//There is no need to conduct im2col
113 | 	}
114 | 	else if (R==1 && U==1 && pad_h==0){
115 | 		im2col_1101<128><<<N*C, 128>>>(output, input, N, C, H, W);
116 | 		KernelErrChk();
117 | 	}
118 | 	else if (R==3 && U==1 && pad_h==1){
119 | 		// The kernel strides by its BLOCK_SIZE template argument, so the launch must use that many threads.
120 | 		// It used to launch 64 threads with BLOCK_SIZE 32; threads 32..63 then indexed past the W+2 shared
121 | 		// row and past the P*R patch. One constant now feeds both places.
122 | 		const int threads = 32;
123 | 		dim3 grid(Q+S-1, N, C);
124 | 		im2col_3311_version1<threads><<<grid, threads, (W+2)*sizeof(float)>>>(output, input, N, C, H, W, R, S, P, Q);
125 | 		KernelErrChk();
126 | 	}
127 | 	else{
128 | 		cudnnTensorDescriptor_t xDesc;
129 | 		ErrChk(cudnnCreateTensorDescriptor(&xDesc));
130 | 		ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
131 | 
132 | 		cudnnFilterDescriptor_t filterDesc; // CUDNN_TENSOR_NHWC, CUDNN_TENSOR_NCHW
133 | 		ErrChk(cudnnCreateFilterDescriptor(&filterDesc));
134 | 		ErrChk(cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, K, C, R, S));
135 | 
136 | 		cudnnConvolutionDescriptor_t convDesc;
137 | 		ErrChk(cudnnCreateConvolutionDescriptor(&convDesc));
138 | 		ErrChk(cudnnSetConvolution2dDescriptor(convDesc, pad_h, pad_w, U, V, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT));
139 | 		ErrChk(cudnnIm2Col(handle, xDesc, input, filterDesc, convDesc, output));
140 | 
141 | 		ErrChk(cudnnDestroyTensorDescriptor(xDesc));
142 | 		ErrChk(cudnnDestroyFilterDescriptor(filterDesc));
143 | 		ErrChk(cudnnDestroyConvolutionDescriptor(convDesc));
144 | 	}
145 | }
145 | 
146 | 
147 | #endif /* IM2COL_H_ */
148 | 


--------------------------------------------------------------------------------
/google-net_cudnn/inception.cpp:
--------------------------------------------------------------------------------
 1 | #include "cudnn.h"
 2 | #include "util.h"
 3 | #include <cmath>
 4 | #include "conv.h"
 5 | #include "pooling.h"
 6 | #include "activation.h"
 7 | #include "dropout.h"
 8 | #include "lrn.h"
 9 | #include "concat.h"
10 | 
11 | /*
12 |  * Do Inception
13 |  *
14 |  * This func will consume 6 filters and 4 feature(x).
15 |  * Use x[xIdx] as input, which should be set before this func.
16 |  * Use x[xIdx + 4] as output.
17 |  *
18 |  */
19 | void cudnnGoogleNetInception(cudnnHandle_t handle, const int N, const int C,
20 |         const int H, const int W, const int xIdx, const int filterIdx,
21 |         const int K1, const int K2, const int K3, const int K4, const int K5,
22 |         const int K6, int *reC, float **x, float** filter, float* buf,
23 |         cudaStream_t *s, const int *algo_best) {
24 |     /*
25 |      * Use x[xIdx + 4] as output.
26 |      * We can concat the result directly when N == 1.
27 |      */
28 |     float *output = x[xIdx + 4];
29 |     float *output1 = output;
30 |     float *output2 = output1 + K1 * H * W;
31 |     float *output3 = output2 + K3 * H * W;
32 |     float *output4 = output3 + K5 * H * W;
33 | 
34 |     //1*1 conv
35 |     int algo = algo_best[filterIdx*7];
36 |     conv(handle, N, C, H, W, K1, 1, 1, 1, 1, 0, 0, H, W,
37 |             x[xIdx], filter[filterIdx], buf,
38 |             output1, algo, s[0]);
39 | 
40 |     //relu 1*1
41 |     activation(handle, N, K1, H, W, output1, output1, s[0]);
42 | 
43 |     //3*3 reduce
44 |     algo = algo_best[(filterIdx+1)*7];
45 |     conv(handle, N, C, H, W, K2, 1, 1, 1, 1, 0, 0, H, W,
46 |             x[xIdx], filter[filterIdx+1], buf,
47 |             x[xIdx + 1], algo, s[1]);
48 | 
49 |     //relu 3*3 reduce
50 |     activation(handle, N, K2, H, W, x[xIdx + 1],
51 |             x[xIdx + 1], s[1]);
52 | 
53 |     //3*3
54 |     algo = algo_best[(filterIdx+2)*7];
55 |     conv(handle, N, C, H, W, K3, 3, 3, 1, 1, 1, 1, H, W,
56 |             x[xIdx + 1], filter[filterIdx+2], buf,
57 |             output2, algo, s[1]);
58 | 
59 |     //relu 3*3
60 |     activation(handle, N, K3, H, W, output2, output2, s[1]);
61 | 
62 |     //5*5 reduce
63 |     algo = algo_best[(filterIdx+3)*7];
64 |     conv(handle, N, C, H, W, K4, 1, 1, 1, 1, 0, 0, H, W,
65 |             x[xIdx], filter[filterIdx+2], buf,
66 |             x[xIdx + 2], algo, s[2]);
67 | 
68 |     //relu 5*5 reduce
69 |     activation(handle, N, K4, H, W, x[xIdx + 2],
70 |             x[xIdx + 2], s[2]);
71 | 
72 |     //5*5
73 |     algo = algo_best[(filterIdx+4)*7];
74 |     conv(handle, N, C, H, W, K5, 5, 5, 1, 1, 2, 2, H, W,
75 |             x[xIdx + 2], filter[filterIdx+4], buf,
76 |             output3, algo, s[2]);
77 | 
78 |     //relu 5*5
79 |     activation(handle, N, K5, H, W, output3, output3, s[2]);
80 | 
81 |     //pool
82 |     pooling(handle, N, C, H, W, 3, 3, 1, 1, 1, 1, H, W,
83 |             x[xIdx], x[xIdx + 3], s[3]);
84 | 
85 |     //pool proj
86 |     algo = algo_best[(filterIdx+5)*7];
87 |     conv(handle, N, C, H, W, K6, 1, 1, 1, 1, 0, 0, H, W,
88 |             x[xIdx + 3], filter[filterIdx+5], buf, output4, algo, s[3]);
89 | 
90 |     //relu pool proj
91 |     activation(handle, N, K6, H, W, output4, output4, s[3]);
92 | 
93 | 
94 |     ErrChk(cudaDeviceSynchronize());
95 | 
96 |     *reC = K1 + K3 + K5 + K6;
97 | }
98 | 


--------------------------------------------------------------------------------
/google-net_cudnn/inception.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * inception.h
 3 |  *
 4 |  *  Created on: Nov 5, 2018
 5 |  *      Author: cambricon
 6 |  */
 7 | 
 8 | #ifndef INCEPTION_H_
 9 | #define INCEPTION_H_
10 | 
/*
 * Do Inception
 *
 * This func will consume 6 filters (filter[filterIdx .. filterIdx + 5]) and
 * 4 features.
 * Use x[xIdx] as input, which should be set before this func.
 * x[xIdx + 1], x[xIdx + 2] and x[xIdx + 3] are used for intermediate results.
 * Use x[xIdx + 4] as output.
 *
 * N, C, H, W      : shape of the input feature.
 * K1 .. K6        : output channels of the 1x1, 3x3-reduce, 3x3,
 *                   5x5-reduce, 5x5 and pool-projection convolutions.
 * reC             : receives the output channel count, K1 + K3 + K5 + K6.
 * buf             : scratch buffer handed to every convolution.
 * s               : array of 4 streams, one per branch.
 * algo_best       : algorithm table; entry (filter index * 7) is read for
 *                   each convolution.
 */
void cudnnGoogleNetInception(cudnnHandle_t handle, const int N, const int C,
        const int H, const int W, const int xIdx, const int filterIdx,
        const int K1, const int K2, const int K3, const int K4, const int K5,
        const int K6, int *reC, float **x, float** filter, float* buf,
        cudaStream_t *s, const int *algo_best);
24 | 
25 | #endif /* INCEPTION_H_ */
26 | 


--------------------------------------------------------------------------------
/google-net_cudnn/loss.cpp:
--------------------------------------------------------------------------------
 1 | #include "cudnn.h"
 2 | #include "util.h"
 3 | #include <cmath>
 4 | 
 5 | void loss(cublasHandle_t cublas_handle, int N, int C, int K,
 6 |         float *input, float *filter, float *output) {
 7 |     float alpha = 1.f, beta = 0.f;
 8 | 
 9 |     ErrChk(cublasGemmEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, N, K, C,
10 |                 (void*) &alpha, (void*) input, CUDA_R_32F, C,
11 |                 (void*) filter, CUDA_R_32F, C,
12 |                 (void*) &beta, (void*) output, CUDA_R_32F, N, CUDA_R_32F,
13 |                 CUBLAS_GEMM_DEFAULT));
14 | }
15 | 


--------------------------------------------------------------------------------
/google-net_cudnn/loss.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * loss.h
 3 |  *
 4 |  *  Created on: Nov 5, 2018
 5 |  *      Author: cambricon
 6 |  */
 7 | 
 8 | #ifndef LOSS_H_
 9 | #define LOSS_H_
10 | 
11 | 
/*
 * Classifier layer: multiplies `input` (N samples of C values) with `filter`
 * (K rows of C weights) through cublasGemmEx and overwrites `output` with
 * the N x K result. See loss.cpp for the exact storage layout.
 */
void loss(cublasHandle_t cublas_handle, int N, int C, int K, float *input,
        float *filter, float *output);
14 | 
15 | 
16 | #endif /* LOSS_H_ */
17 | 


--------------------------------------------------------------------------------
/google-net_cudnn/lrn.cpp:
--------------------------------------------------------------------------------
 1 | #include <cmath>
 2 | #include "cudnn.h"
 3 | #include "util.h"
 4 | 
 5 | void lrn(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, float lrnAlpha, float lrnBeta, float lrnK, float *input, float *output){
 6 | 
 7 | 	cudnnTensorDescriptor_t xDesc;
 8 | 	ErrChk(cudnnCreateTensorDescriptor(&xDesc));
 9 | 	ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
10 | 
11 | 	cudnnTensorDescriptor_t yDesc;
12 | 	ErrChk(cudnnCreateTensorDescriptor(&yDesc));
13 | 	ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
14 | 
15 | 	cudnnLRNDescriptor_t normDesc;
16 | 	ErrChk(cudnnCreateLRNDescriptor(&normDesc));
17 | 	ErrChk(cudnnSetLRNDescriptor(normDesc, R, lrnAlpha, lrnBeta, lrnK));
18 | 
19 | 
20 | 	float one = 1.f, zero = 0.f;
21 | 	ErrChk(cudnnLRNCrossChannelForward(handle, normDesc, CUDNN_LRN_CROSS_CHANNEL_DIM1, &one, xDesc, input, &zero, yDesc, output));
22 | 
23 | 	ErrChk(cudnnDestroyLRNDescriptor(normDesc));
24 | 	ErrChk(cudnnDestroyTensorDescriptor(xDesc));
25 | 	ErrChk(cudnnDestroyTensorDescriptor(yDesc));
26 | }
27 | 


--------------------------------------------------------------------------------
/google-net_cudnn/lrn.h:
--------------------------------------------------------------------------------
1 | #ifndef __LRN_H__
2 | #define __LRN_H__
/*
 * Cross-channel LRN forward pass via cuDNN on an N x C x H x W float tensor.
 * R is the normalization window; lrnAlpha, lrnBeta and lrnK are passed to
 * cudnnSetLRNDescriptor. S is not used by the implementation in lrn.cpp.
 */
void lrn(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, float lrnAlpha, float lrnBeta, float lrnK, float *input, float *output);
4 | #endif
5 | 


--------------------------------------------------------------------------------
/google-net_cudnn/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <cmath>
  4 | #include "cudnn.h"
  5 | #include "util.h"
  6 | #include "conv.h"
  7 | #include "activation.h"
  8 | #include "pooling.h"
  9 | #include "concat.h"
 10 | #include "dropout.h"
 11 | #include "lrn.h"
 12 | #include "loss.h"
 13 | #include "softmax.h"
 14 | #include "inception.h"
 15 | #include "batch-inception.h"
 16 | 
 17 | 
 18 | void batchGoogleNetForward(cudnnHandle_t handle, cublasHandle_t cublas_handle,
 19 |         int N, float **x, float **filter, float* buf, const int *algo_best) {
 20 |     int C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q;
 21 | 
 22 |     // conv1/7x7_s2
 23 |     C = 3;
 24 |     H = W = 227;
 25 |     K = 64;
 26 |     R = S = 7;
 27 |     U = V = 2;
 28 |     pad_h = pad_w = 3;
 29 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
 30 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
 31 | 
 32 |     int algo = algo_best[0];
 33 |     conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q,
 34 |             x[0], filter[0], buf, x[1], algo);
 35 | 
 36 |     // conv1/relu_7x7
 37 |     C = 64;
 38 |     H = W = 114;
 39 |     activation(handle, N, C, H, W, x[1], x[1]);
 40 | 
 41 |     // pool1/3x3_s2
 42 |     R = 3;
 43 |     S = 3;
 44 |     U = 2;
 45 |     V = 2;
 46 |     pad_h = 1;
 47 |     pad_w = 1;
 48 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
 49 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
 50 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[1], x[2]);
 51 | 
 52 |     H = P;
 53 |     W = Q;
 54 | 
 55 |     // pool1/norm1
 56 |     R = 5;
 57 |     S = 5;
 58 |     float lrnAlpha = 0.0001f;
 59 |     float lrnBeta = 0.75f;
 60 |     float lrnK = 2.f;
 61 | 
 62 |     lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[2], x[3]);
 63 | 
 64 |     // conv2/3x3_reduce
 65 |     K = 64;
 66 |     R = 1;
 67 |     S = 1;
 68 |     U = 1;
 69 |     V = 1;
 70 |     pad_h = 0;
 71 |     pad_w = 0;
 72 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
 73 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
 74 | 
 75 |     algo = algo_best[7];
 76 |     conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q,
 77 |             x[3], filter[1], buf, x[4], algo);
 78 |     C = K;
 79 |     H = P;
 80 |     W = Q;
 81 | 
 82 |     // conv2/relu_3x3_reduce
 83 |     activation(handle, N, C, H, W, x[4], x[4]);
 84 | 
 85 |     // conv2/3x3
 86 |     K = 192;
 87 |     R = 3;
 88 |     S = 3;
 89 |     U = 1;
 90 |     V = 1;
 91 |     pad_h = 1;
 92 |     pad_w = 1;
 93 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
 94 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
 95 | 
 96 |     algo = algo_best[14];
 97 |     conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q,
 98 |             x[4], filter[2], buf, x[5], algo);
 99 |     C = K;
100 |     H = P;
101 |     W = Q;
102 | 
103 | 
104 |     // conv2/relu_3x3
105 |     activation(handle, N, C, H, W, x[5], x[5]);
106 | 
107 |     // conv2/norm2
108 |     R = 5;
109 |     S = 5;
110 |     lrnAlpha = 0.0001f;
111 |     lrnBeta = 0.75f;
112 |     lrnK = 2.f;
113 | 
114 |     lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[5], x[6]);
115 | 
116 |     // pool2/3x3_s2
117 |     R = 3;
118 |     S = 3;
119 |     U = 2;
120 |     V = 2;
121 |     pad_h = 0;
122 |     pad_w = 0;
123 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
124 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
125 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[6], x[7]);
126 | 
127 |     // inception3a
128 |     H = P;
129 |     W = Q;
130 |     batchGoogleNetInception(handle, N, C, H, W, 7, 3,
131 |             64, 96, 128, 16, 32, 32, // K1, K2, K3, K4, K5, K6
132 |             &C, x, filter, buf, algo_best);
133 | 
134 |     // inception3b
135 |     batchGoogleNetInception(handle, N, C, H, W, 11, 9,
136 |             128, 128, 192, 32, 96, 64, // K1, K2, K3, K4, K5, K6
137 |             &C, x, filter, buf, algo_best);
138 | 
139 |     // pool3/3x3_s2
140 |     R = 3;
141 |     S = 3;
142 |     U = 2;
143 |     V = 2;
144 |     pad_h = 1;
145 |     pad_w = 1;
146 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
147 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
148 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[15], x[16]);
149 | 
150 |     // inception4a
151 |     H = P;
152 |     W = Q;
153 |     batchGoogleNetInception(handle, N, C, H, W, 16, 15,
154 |             192, 96, 208, 16, 48, 64, // K1, K2, K3, K4, K5, K6
155 |             &C, x, filter, buf, algo_best);
156 | 
157 |     // inception4b
158 |     batchGoogleNetInception(handle, N, C, H, W, 20, 21,
159 |             160, 112, 224, 24, 64, 64, // K1, K2, K3, K4, K5, K6
160 |             &C, x, filter, buf, algo_best);
161 | 
162 |     // inception4c
163 |     batchGoogleNetInception(handle, N, C, H, W, 24, 27,
164 |             128, 128, 256, 24, 64, 64, // K1, K2, K3, K4, K5, K6
165 |             &C, x, filter, buf, algo_best);
166 | 
167 |     // inception4d
168 |     batchGoogleNetInception(handle, N, C, H, W, 28, 33,
169 |             112, 144, 288, 32, 64, 64, // K1, K2, K3, K4, K5, K6
170 |             &C, x, filter, buf, algo_best);
171 | 
172 |     // inception4e
173 |     batchGoogleNetInception(handle, N, C, H, W, 32, 39,
174 |             256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6
175 |             &C, x, filter, buf, algo_best);
176 | 
177 |     // pool4/3x3_s2
178 |     R = 3;
179 |     S = 3;
180 |     U = 2;
181 |     V = 2;
182 |     pad_h = 1;
183 |     pad_w = 1;
184 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
185 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
186 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[36], x[37]);
187 | 
188 |     // inception5a
189 |     H = P;
190 |     W = Q;
191 |     batchGoogleNetInception(handle, N, C, H, W, 37, 45,
192 |             256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6
193 |             &C, x, filter, buf, algo_best);
194 | 
195 |     // inception5b
196 |     batchGoogleNetInception(handle, N, C, H, W, 41, 51,
197 |             384, 192, 384, 48, 128, 128, // K1, K2, K3, K4, K5, K6
198 |             &C, x, filter, buf, algo_best);
199 | 
200 |     // pool5/3x3_s2
201 |     R = 7;
202 |     S = 7;
203 |     U = 1;
204 |     V = 1;
205 |     pad_h = 0;
206 |     pad_w = 0;
207 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
208 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
209 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[45], x[46]);
210 | 
211 |     // loss3
212 |     K = 1000;
213 |     loss(cublas_handle, N, C, K, x[46], filter[57], x[47]);
214 | 
215 |     // softmax
216 |     softmax(handle, N, C, x[47], x[48]);
217 | }
218 | 
219 | 
220 | void cudnnGoogleNetForward(cudnnHandle_t handle, cublasHandle_t cublas_handle,
221 |         int N, float **x, float** filter, float* buf, const int *algo_best) {
222 |     int C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q;
223 | 
224 |     // conv1/7x7_s2
225 |     C = 3;
226 |     H = W = 227;
227 |     K = 64;
228 |     R = S = 7;
229 |     U = V = 2;
230 |     pad_h = pad_w = 3;
231 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
232 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
233 | 
234 |     int algo = algo_best[0];
235 |     conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q,
236 |             x[0], filter[0], buf, x[1], algo);
237 | 
238 |     // conv1/relu_7x7
239 |     C = 64;
240 |     H = W = 114;
241 |     activation(handle, N, C, H, W, x[1], x[1]);
242 | 
243 |     // pool1/3x3_s2
244 |     R = 3;
245 |     S = 3;
246 |     U = 2;
247 |     V = 2;
248 |     pad_h = 1;
249 |     pad_w = 1;
250 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
251 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
252 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[1], x[2]);
253 | 
254 |     H = P;
255 |     W = Q;
256 | 
257 |     // pool1/norm1
258 |     R = 5;
259 |     S = 5;
260 |     float lrnAlpha = 0.0001f;
261 |     float lrnBeta = 0.75f;
262 |     float lrnK = 2.f;
263 | 
264 |     lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[2], x[3]);
265 | 
266 |     // conv2/3x3_reduce
267 |     K = 64;
268 |     R = 1;
269 |     S = 1;
270 |     U = 1;
271 |     V = 1;
272 |     pad_h = 0;
273 |     pad_w = 0;
274 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
275 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
276 | 
277 |     algo = algo_best[7];
278 |     conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q,
279 |             x[3], filter[1], buf, x[4], algo);
280 |     C = K;
281 |     H = P;
282 |     W = Q;
283 | 
284 |     // conv2/relu_3x3_reduce
285 |     activation(handle, N, C, H, W, x[4], x[4]);
286 | 
287 |     // conv2/3x3
288 |     K = 192;
289 |     R = 3;
290 |     S = 3;
291 |     U = 1;
292 |     V = 1;
293 |     pad_h = 1;
294 |     pad_w = 1;
295 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
296 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
297 | 
298 |     algo = algo_best[14];
299 |     conv(handle, N, C, H, W, K, R, S, U, V, pad_h, pad_w, P, Q,
300 |             x[4], filter[2], buf, x[5], algo);
301 |     C = K;
302 |     H = P;
303 |     W = Q;
304 | 
305 | 
306 |     // conv2/relu_3x3
307 |     activation(handle, N, C, H, W, x[5], x[5]);
308 | 
309 |     // conv2/norm2
310 |     R = 5;
311 |     S = 5;
312 |     lrnAlpha = 0.0001f;
313 |     lrnBeta = 0.75f;
314 |     lrnK = 2.f;
315 | 
316 |     lrn(handle, N, C, H, W, R, S, lrnAlpha, lrnBeta, lrnK, x[5], x[6]);
317 | 
318 |     // pool2/3x3_s2
319 |     R = 3;
320 |     S = 3;
321 |     U = 2;
322 |     V = 2;
323 |     pad_h = 0;
324 |     pad_w = 0;
325 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
326 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
327 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[6], x[7]);
328 | 
329 | #ifdef USE_MULTI_STREAM
330 |     cudaStream_t s[4];
331 |     ErrChk(cudaStreamCreate(&s[0]));
332 |     ErrChk(cudaStreamCreate(&s[1]));
333 |     ErrChk(cudaStreamCreate(&s[2]));
334 |     ErrChk(cudaStreamCreate(&s[3]));
335 | #else
336 |     cudaStream_t s[4] = {0, 0, 0, 0};
337 | #endif
338 | 
339 |     // inception3a
340 |     H = P;
341 |     W = Q;
342 |     cudnnGoogleNetInception(handle, N, C, H, W, 7, 3,
343 |             64, 96, 128, 16, 32, 32, // K1, K2, K3, K4, K5, K6
344 |             &C, x, filter, buf, s, algo_best);
345 | 
346 |     // inception3b
347 |     cudnnGoogleNetInception(handle, N, C, H, W, 11, 9,
348 |             128, 128, 192, 32, 96, 64, // K1, K2, K3, K4, K5, K6
349 |             &C, x, filter, buf, s, algo_best);
350 | 
351 |     // pool3/3x3_s2
352 |     R = 3;
353 |     S = 3;
354 |     U = 2;
355 |     V = 2;
356 |     pad_h = 1;
357 |     pad_w = 1;
358 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
359 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
360 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[15], x[16]);
361 | 
362 |     // inception4a
363 |     H = P;
364 |     W = Q;
365 |     cudnnGoogleNetInception(handle, N, C, H, W, 16, 15,
366 |             192, 96, 208, 16, 48, 64, // K1, K2, K3, K4, K5, K6
367 |             &C, x, filter, buf, s, algo_best);
368 | 
369 |     // inception4b
370 |     cudnnGoogleNetInception(handle, N, C, H, W, 20, 21,
371 |             160, 112, 224, 24, 64, 64, // K1, K2, K3, K4, K5, K6
372 |             &C, x, filter, buf, s, algo_best);
373 | 
374 |     // inception4c
375 |     cudnnGoogleNetInception(handle, N, C, H, W, 24, 27,
376 |             128, 128, 256, 24, 64, 64, // K1, K2, K3, K4, K5, K6
377 |             &C, x, filter, buf, s, algo_best);
378 | 
379 |     // inception4d
380 |     cudnnGoogleNetInception(handle, N, C, H, W, 28, 33,
381 |             112, 144, 288, 32, 64, 64, // K1, K2, K3, K4, K5, K6
382 |             &C, x, filter, buf, s, algo_best);
383 | 
384 |     // inception4e
385 |     cudnnGoogleNetInception(handle, N, C, H, W, 32, 39,
386 |             256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6
387 |             &C, x, filter, buf, s, algo_best);
388 | 
389 |     // pool4/3x3_s2
390 |     R = 3;
391 |     S = 3;
392 |     U = 2;
393 |     V = 2;
394 |     pad_h = 1;
395 |     pad_w = 1;
396 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
397 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
398 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[36], x[37]);
399 | 
400 |     // inception5a
401 |     H = P;
402 |     W = Q;
403 |     cudnnGoogleNetInception(handle, N, C, H, W, 37, 45,
404 |             256, 160, 320, 32, 128, 128, // K1, K2, K3, K4, K5, K6
405 |             &C, x, filter, buf, s, algo_best);
406 | 
407 |     // inception5b
408 |     cudnnGoogleNetInception(handle, N, C, H, W, 41, 51,
409 |             384, 192, 384, 48, 128, 128, // K1, K2, K3, K4, K5, K6
410 |             &C, x, filter, buf, s, algo_best);
411 | 
412 |     // pool5/3x3_s2
413 |     R = 7;
414 |     S = 7;
415 |     U = 1;
416 |     V = 1;
417 |     pad_h = 0;
418 |     pad_w = 0;
419 |     P = ceil((float)(H - R + 1 + 2 * pad_h)/(float)U);
420 |     Q = ceil((float)(W - S + 1 + 2 * pad_w)/(float)V);
421 |     pooling(handle, N, C, H, W, R, S, U, V, pad_h, pad_w, P, Q, x[45], x[46]);
422 | 
423 |     // loss3
424 |     K = 1000;
425 |     loss(cublas_handle, N, C, K, x[46], filter[57], x[47]);
426 | 
427 |     // softmax
428 |     softmax(handle, N, C, x[47], x[48]);
429 | 
430 | #ifdef USE_MULTI_STREAM
431 |     ErrChk(cudaStreamDestroy(s[0]));
432 |     ErrChk(cudaStreamDestroy(s[1]));
433 |     ErrChk(cudaStreamDestroy(s[2]));
434 |     ErrChk(cudaStreamDestroy(s[3]));
435 | #endif
436 | }
437 | 
/*
 * Convolution algorithm selection table: 57 rows of 7 entries, one row per
 * convolution filter (row i belongs to filter[i]).
 *
 * The call sites in this file and in inception.cpp read only the first entry
 * of a row, i.e. algo_best[i * 7], and pass it to conv() as `algo`.
 * NOTE(review): the meaning of the remaining six columns and the encoding of
 * the values are not visible here -- presumably cuDNN forward-algorithm ids
 * for several configurations; confirm against conv.h / batch-inception.cu.
 */
const int algo_best[7*57] = {
        0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1,
        6, 6, 6, 6, 7, 7, 5,
        0, 0, 0, 0, 0, 1, 1,
        0, 0, 1, 0, 0, 1, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 0, 5, 5,
        0, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 1, 1, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 5, 5, 5, 5,
        0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 1, 1, 1, 1,
        0, 1, 1, 0, 0, 0, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 7, 7, 7, 7,
        1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 1,
        0, 1, 1, 1, 0, 0, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 0, 0,
        0, 7, 0, 7, 7, 7, 7,
        1, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1,
        0, 0, 1, 1, 0, 0, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 7, 7, 7, 7,
        1, 0, 1, 0, 0, 0, 1,
        0, 1, 1, 1, 0, 0, 1,
        0, 1, 1, 1, 1, 1, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 0, 0,
        7, 7, 7, 7, 7, 7, 7,
        1, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 0, 0,
        7, 7, 7, 7, 7, 7, 7,
        0, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 1, 1, 0, 1,
        0, 1, 0, 0, 0, 0, 1,
        6, 6, 6, 6, 7, 7, 7,
        0, 0, 0, 0, 0, 1, 0,
        7, 7, 7, 7, 7, 7, 4,
        0, 1, 0, 0, 1, 1, 0,
        1, 1, 1, 1, 1, 0, 1,
        1, 0, 1, 1, 1, 1, 0,
        6, 6, 6, 7, 7, 7, 7,
        1, 1, 1, 1, 1, 0, 1,
        7, 7, 7, 7, 7, 7, 4,
        0, 1, 0, 0, 1, 1, 0
};
497 | 
498 | int main() {
499 |     const int warmupIters = 2;
500 |     const int TestIters = 10;
501 | 
502 |     int N = 1; // batch size
503 |     const int filterNum = 58;
504 |     const int xNum = 50;
505 | 
506 |     float **filter = new float*[filterNum]; // filter
507 |     float **x = new float*[xNum]; // result
508 | 
509 |     const int MAX_TENSOR_SIZE=N * 200704 * 9;
510 |     ErrChk(cudaMalloc(&x[0], (xNum + 10) *MAX_TENSOR_SIZE * sizeof(float)));
511 |     for (int i = 1; i < xNum; ++i) {
512 |         x[i] = x[i - 1] + MAX_TENSOR_SIZE;
513 |     }
514 |     float *buf = x[xNum - 1] + MAX_TENSOR_SIZE;
515 | 
516 |     const int MAX_FILTER_SIZE = 8000000;
517 |     ErrChk(cudaMalloc(&filter[0], filterNum * MAX_FILTER_SIZE * sizeof(float)));
518 |     for (int i = 1; i < filterNum; ++i) {
519 |         filter[i] = filter[i - 1] + MAX_FILTER_SIZE;
520 |     }
521 | 
522 |     const int RESULT_SIZE=1000;
523 |     float *h_cudnn_result = (float*)malloc(2 * RESULT_SIZE * sizeof(float));
524 |     float *h_our_result = h_cudnn_result + RESULT_SIZE;
525 | 
526 |     // prepare data
527 |     float *h_input = (float*) malloc(MAX_TENSOR_SIZE * sizeof(float));
528 |     for (int j = 0; j < MAX_TENSOR_SIZE; ++j)
529 |         h_input[j] = j%10;
530 |     float *h_filter = (float*) malloc(
531 |             filterNum * MAX_FILTER_SIZE * sizeof(float));
532 |     for (int j = 0; j < filterNum*MAX_FILTER_SIZE; ++j)
533 |         h_filter[j] = j%5;
534 |     ErrChk(cudaMemcpy(x[0], h_input, MAX_TENSOR_SIZE * sizeof(float),
535 |                 cudaMemcpyHostToDevice));
536 |     ErrChk(cudaMemcpy(filter[0], h_filter,
537 |                 filterNum * MAX_FILTER_SIZE * sizeof(float),
538 |                 cudaMemcpyHostToDevice));
539 | 
540 |     cudnnHandle_t handle;
541 |     cublasHandle_t cublas_handle;
542 |     ErrChk(cudnnCreate(&handle));
543 |     ErrChk(cublasCreate(&cublas_handle));
544 | 
545 |     // warm up
546 |     for (int i = 0; i < warmupIters; ++i) {
547 |         cudnnGoogleNetForward(handle, cublas_handle, N, x, filter, buf,
548 |                 algo_best);
549 |     }
550 | 
551 |     cudaEvent_t start, stop;
552 |     float elapsedTime = 0;
553 |     ErrChk(cudaEventCreate(&start));
554 |     ErrChk(cudaEventCreate(&stop));
555 |     ErrChk(cudaEventRecord(start,0));
556 | 
557 |     for (int i = 0; i < TestIters; ++i) {
558 |         cudnnGoogleNetForward(handle, cublas_handle, N, x, filter, buf,
559 |                 algo_best);
560 |     }
561 | 
562 |     ErrChk(cudaEventRecord(stop, 0));
563 |     ErrChk(cudaEventSynchronize(stop));
564 |     ErrChk(cudaEventElapsedTime(&elapsedTime, start, stop));
565 | 
566 |     printf("Time for cuDNN implementation is %0.6f\n", elapsedTime / TestIters);
567 |     ErrChk(cudaMemcpy(h_cudnn_result, x[48], RESULT_SIZE * sizeof(float),
568 |                 cudaMemcpyDeviceToHost));
569 | 
570 |     // warm up
571 |     for (int i = 0; i < warmupIters; ++i) {
572 |         batchGoogleNetForward(handle, cublas_handle, N, x, filter, buf,
573 |                 algo_best);
574 |     }
575 | 
576 |     ErrChk(cudaEventRecord(start,0));
577 |     for (int i = 0; i < TestIters; ++i) {
578 |         batchGoogleNetForward(handle, cublas_handle, N, x, filter, buf,
579 |                 algo_best);
580 |     }
581 | 
582 |     ErrChk(cudaEventRecord(stop, 0));
583 |     ErrChk(cudaEventSynchronize(stop));
584 |     ErrChk(cudaEventElapsedTime(&elapsedTime, start, stop));
585 |     printf("Time for batched-Conv implementation is %0.6f\n",
586 |             elapsedTime / TestIters);
587 |     ErrChk(cudaMemcpy(h_our_result, x[48], RESULT_SIZE * sizeof(float),
588 |                 cudaMemcpyDeviceToHost));
589 | 
590 |     // compare the result
591 |     double ep = 0.0001;
592 |     for (int i = 0; i < RESULT_SIZE; ++i) {
593 |        if (std::abs(h_our_result[i] - (double)h_cudnn_result[i]) > ep) {
594 |            printf("result error at %d: %f, %f\n", i, h_our_result[i],
595 |                                                   h_cudnn_result[i]);
596 |            return -1;
597 |        }
598 |     }
599 |     printf("result is correctly!\n");
600 | 
601 |     ErrChk(cublasDestroy(cublas_handle));
602 |     ErrChk(cudnnDestroy(handle));
603 | 
604 |     return 0;
605 | }
606 | 


--------------------------------------------------------------------------------
/google-net_cudnn/pooling.cpp:
--------------------------------------------------------------------------------
 1 | #include <cmath>
 2 | #include "cudnn.h"
 3 | #include "util.h"
 4 | 
 5 | void pooling(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, int U, int V, int pad_h, int pad_w, int P, int Q, float *input, float *output, cudaStream_t s){
 6 | 
 7 | 	ErrChk(cudnnSetStream(handle, s));
 8 | 
 9 | 	cudnnPoolingDescriptor_t poolingDesc;
10 | 	ErrChk(cudnnCreatePoolingDescriptor(&poolingDesc));
11 | 	ErrChk(cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, R, S, pad_h, pad_w, U, V));
12 | 	
13 | 	cudnnTensorDescriptor_t xDesc;
14 | 	ErrChk(cudnnCreateTensorDescriptor(&xDesc));
15 | 	ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, H, W));
16 | 
17 | 	cudnnTensorDescriptor_t yDesc;
18 | 	ErrChk(cudnnCreateTensorDescriptor(&yDesc));
19 | 	ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, P, Q));
20 | 	
21 | 	float one = 1.0, zero = 0.0;
22 | 	ErrChk(cudnnPoolingForward(handle, poolingDesc, &one, xDesc, input, &zero, yDesc, output));	
23 | 	
24 | 	ErrChk(cudnnDestroyPoolingDescriptor(poolingDesc));
25 | 	ErrChk(cudnnDestroyTensorDescriptor(xDesc));
26 | 	ErrChk(cudnnDestroyTensorDescriptor(yDesc));
27 | }
28 | 


--------------------------------------------------------------------------------
/google-net_cudnn/pooling.h:
--------------------------------------------------------------------------------
1 | #ifndef __POOLING_H__
2 | #define __POOLING_H__
/*
 * Max pooling forward pass via cuDNN.
 * input is N x C x H x W, output is N x C x P x Q (both float, NCHW).
 * R x S: window, U x V: stride, pad_h / pad_w: padding.
 * s: stream the cuDNN handle is bound to for this call (default stream 0).
 */
void pooling(cudnnHandle_t handle, int N, int C, int H, int W, int R, int S, int U, int V, int pad_h, int pad_w, int P, int Q, float *input, float *output, cudaStream_t s=0);
4 | #endif
5 | 


--------------------------------------------------------------------------------
/google-net_cudnn/softmax.cpp:
--------------------------------------------------------------------------------
 1 | #include "cudnn.h"
 2 | #include "util.h"
 3 | #include <cmath>
 4 | 
 5 | void softmax(cudnnHandle_t handle, int N, int C, float *input, float *output){
 6 | 
 7 | 	float one = 1.0, zero = 0.0;
 8 | 	size_t size;
 9 | 
10 | 	cudnnTensorDescriptor_t xDesc;
11 | 	ErrChk(cudnnCreateTensorDescriptor(&xDesc));
12 | 	ErrChk(cudnnSetTensor4dDescriptor(xDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, 1, 1));
13 | 
14 | 	cudnnTensorDescriptor_t yDesc;
15 | 	ErrChk(cudnnCreateTensorDescriptor(&yDesc));
16 | 	ErrChk(cudnnSetTensor4dDescriptor(yDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, N, C, 1, 1));
17 | 
18 | 	cudnnSoftmaxAlgorithm_t algo = CUDNN_SOFTMAX_FAST;
19 | 	cudnnSoftmaxMode_t mode = CUDNN_SOFTMAX_MODE_INSTANCE;
20 | 
21 | 	ErrChk(cudnnSoftmaxForward(handle, algo, mode, &one, xDesc, input, &zero, yDesc, output));
22 | 
23 | 	ErrChk(cudnnDestroyTensorDescriptor(xDesc));
24 | 	ErrChk(cudnnDestroyTensorDescriptor(yDesc));
25 | }
26 | 


--------------------------------------------------------------------------------
/google-net_cudnn/softmax.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * softmax.h
 3 |  *
 4 |  *  Created on: Nov 5, 2018
 5 |  *      Author: cambricon
 6 |  */
 7 | 
 8 | #ifndef SOFTMAX_H_
 9 | #define SOFTMAX_H_
10 | 
11 | 
/*
 * Per-sample softmax forward pass via cuDNN.
 * input and output are treated as N x C x 1 x 1 float NCHW tensors;
 * output is overwritten.
 */
void softmax(cudnnHandle_t handle, int N, int C, float *input, float *output);
13 | 
14 | 
15 | #endif /* SOFTMAX_H_ */
16 | 


--------------------------------------------------------------------------------
/google-net_cudnn/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef __UTIL_H__
 2 | #define __UTIL_H__
 3 | 
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | #include <cublas_v2.h>
 7 | #include "cudnn.h"
 8 | 
 9 | 
10 | static inline const char* cublasGetErrorString(cublasStatus_t error)
11 | {
12 |     switch (error)
13 |     {
14 |         case CUBLAS_STATUS_SUCCESS:
15 |             return "CUBLAS_STATUS_SUCCESS";
16 | 
17 |         case CUBLAS_STATUS_NOT_INITIALIZED:
18 |             return "CUBLAS_STATUS_NOT_INITIALIZED";
19 | 
20 |         case CUBLAS_STATUS_ALLOC_FAILED:
21 |             return "CUBLAS_STATUS_ALLOC_FAILED";
22 | 
23 |         case CUBLAS_STATUS_INVALID_VALUE:
24 |             return "CUBLAS_STATUS_INVALID_VALUE";
25 | 
26 |         case CUBLAS_STATUS_ARCH_MISMATCH:
27 |             return "CUBLAS_STATUS_ARCH_MISMATCH";
28 | 
29 |         case CUBLAS_STATUS_MAPPING_ERROR:
30 |             return "CUBLAS_STATUS_MAPPING_ERROR";
31 | 
32 |         case CUBLAS_STATUS_EXECUTION_FAILED:
33 |             return "CUBLAS_STATUS_EXECUTION_FAILED";
34 | 
35 |         case CUBLAS_STATUS_INTERNAL_ERROR:
36 |             return "CUBLAS_STATUS_INTERNAL_ERROR";
37 | 
38 |         case CUBLAS_STATUS_NOT_SUPPORTED:
39 |             return "CUBLAS_STATUS_NOT_SUPPORTED";
40 | 
41 |         case CUBLAS_STATUS_LICENSE_ERROR:
42 |             return "CUBLAS_STATUS_LICENSE_ERROR";
43 |     }
44 |     return "<unknown>";
45 | }
46 | 
47 | 
48 | #define ErrChk(code) { Assert((code), __FILE__, __LINE__); }
49 | static inline void Assert(cudaError_t  code, const char *file, int line){
50 | 	if(code!=cudaSuccess) {
51 | 		printf("CUDA Runtime Error: %s:%d:'%s'\n", file, line,cudaGetErrorString(code));
52 | 		exit(EXIT_FAILURE);
53 | 	}
54 | }
55 | static inline void Assert(cudnnStatus_t code, const char *file, int line){
56 |     if (code!=CUDNN_STATUS_SUCCESS){
57 | 		printf("cuDNN API Error: %s:%d:'%s'\n", file, line, cudnnGetErrorString(code));
58 |         exit(EXIT_FAILURE);
59 |     }
60 | }
61 | static inline void Assert(cublasStatus_t code, const char *file, int line){
62 |     if (code!=CUBLAS_STATUS_SUCCESS){
63 | 		printf("cuBLAS API Error: %s:%d:'%s'\n", file, line, cublasGetErrorString(code));
64 |         exit(EXIT_FAILURE);
65 |     }
66 | }
67 | 
68 | 
// Checks the most recent kernel launch. cudaGetLastError() picks up an error
// recorded at launch time; cudaDeviceSynchronize() then waits for the device
// and returns any error raised while the kernel ran. Either failure is
// reported on stderr with the call site's file and line, and the process exits.
// (Comments stay outside the macro body: a // comment on a continued line
// would swallow the lines that follow it.)
#define KernelErrChk(){\
		cudaError_t errSync  = cudaGetLastError();\
		cudaError_t errAsync = cudaDeviceSynchronize();\
		if (errSync != cudaSuccess) {\
			fprintf(stderr, "Sync kernel error: %s:%d:'%s'\n", __FILE__, __LINE__, cudaGetErrorString(errSync));\
			exit(EXIT_FAILURE);\
		}\
		if (errAsync != cudaSuccess){\
			fprintf(stderr, "Async kernel error: %s:%d:'%s'\n", __FILE__, __LINE__, cudaGetErrorString(errAsync));\
			exit(EXIT_FAILURE);\
		}\
}
81 | #endif
82 | 


--------------------------------------------------------------------------------
/include/util.h:
--------------------------------------------------------------------------------
 1 | #ifndef __UTIL_H__
 2 | #define __UTIL_H__
 3 | 
 4 | #include <cstdio>
 5 | #include <cstdlib>
 6 | #include <cublas_v2.h>
 7 | #include "cudnn.h"
 8 | 
 9 | 
10 | static inline const char* cublasGetErrorString(cublasStatus_t error)
11 | {
12 |     switch (error)
13 |     {
14 |         case CUBLAS_STATUS_SUCCESS:
15 |             return "CUBLAS_STATUS_SUCCESS";
16 | 
17 |         case CUBLAS_STATUS_NOT_INITIALIZED:
18 |             return "CUBLAS_STATUS_NOT_INITIALIZED";
19 | 
20 |         case CUBLAS_STATUS_ALLOC_FAILED:
21 |             return "CUBLAS_STATUS_ALLOC_FAILED";
22 | 
23 |         case CUBLAS_STATUS_INVALID_VALUE:
24 |             return "CUBLAS_STATUS_INVALID_VALUE";
25 | 
26 |         case CUBLAS_STATUS_ARCH_MISMATCH:
27 |             return "CUBLAS_STATUS_ARCH_MISMATCH";
28 | 
29 |         case CUBLAS_STATUS_MAPPING_ERROR:
30 |             return "CUBLAS_STATUS_MAPPING_ERROR";
31 | 
32 |         case CUBLAS_STATUS_EXECUTION_FAILED:
33 |             return "CUBLAS_STATUS_EXECUTION_FAILED";
34 | 
35 |         case CUBLAS_STATUS_INTERNAL_ERROR:
36 |             return "CUBLAS_STATUS_INTERNAL_ERROR";
37 | 
38 |         case CUBLAS_STATUS_NOT_SUPPORTED:
39 |             return "CUBLAS_STATUS_NOT_SUPPORTED";
40 | 
41 |         case CUBLAS_STATUS_LICENSE_ERROR:
42 |             return "CUBLAS_STATUS_LICENSE_ERROR";
43 |     }
44 |     return "<unknown>";
45 | }
46 | 
47 | 
48 | #define ErrChk(code) { Assert((code), __FILE__, __LINE__); }
49 | static inline void Assert(cudaError_t  code, const char *file, int line){
50 | 	if(code!=cudaSuccess) {
51 | 		printf("CUDA Runtime Error: %s:%d:'%s'\n", file, line, cudaGetErrorString(code));
52 | 		exit(EXIT_FAILURE);
53 | 	}
54 | }
55 | static inline void Assert(cudnnStatus_t code, const char *file, int line){
56 |     if (code!=CUDNN_STATUS_SUCCESS){
57 | 		printf("cuDNN API Error: %s:%d:'%s'\n", file, line, cudnnGetErrorString(code));
58 |         exit(EXIT_FAILURE);
59 |     }
60 | }
61 | static inline void Assert(cublasStatus_t code, const char *file, int line){
62 |     if (code!=CUBLAS_STATUS_SUCCESS){
63 | 		printf("cuBLAS API Error: %s:%d:'%s'\n", file, line, cublasGetErrorString(code));
64 |         exit(EXIT_FAILURE);
65 |     }
66 | }
67 | 
68 | 
// Checks the most recent kernel launch. cudaGetLastError() picks up an error
// recorded at launch time; cudaDeviceSynchronize() then waits for the device
// and returns any error raised while the kernel ran. Either failure is
// reported on stderr with the call site's file and line, and the process exits.
// (Comments stay outside the macro body: a // comment on a continued line
// would swallow the lines that follow it.)
#define KernelErrChk(){\
		cudaError_t errSync  = cudaGetLastError();\
		cudaError_t errAsync = cudaDeviceSynchronize();\
		if (errSync != cudaSuccess) {\
			fprintf(stderr, "Sync kernel error: %s:%d:'%s'\n", __FILE__, __LINE__, cudaGetErrorString(errSync));\
			exit(EXIT_FAILURE);\
		}\
		if (errAsync != cudaSuccess){\
			fprintf(stderr, "Async kernel error: %s:%d:'%s'\n", __FILE__, __LINE__, cudaGetErrorString(errAsync));\
			exit(EXIT_FAILURE);\
		}\
}
81 | #endif
82 | 


--------------------------------------------------------------------------------
/magma/Makefile:
--------------------------------------------------------------------------------
#GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70
GENCODE_FLAGS = -gencode arch=compute_70,code=sm_70

# clean is a command, not a file: run it even if a file named "clean" exists.
.PHONY: clean

# gemm.cu also includes ../include/util.h, so rebuild when that header changes.
# $< is still gemm.cu (the first prerequisite).
gemm:gemm.cu kernel.h ../include/util.h
	nvcc  $< -o $@ --std=c++11 -O3 ${GENCODE_FLAGS} -Xptxas -v
clean:
	rm -rf gemm *.o
8 | 


--------------------------------------------------------------------------------
/magma/gemm.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <fstream>
  4 | #include <cublas_v2.h>
  5 | #include "../include/util.h"
  6 | #include "kernel.h"
  7 | 
  8 | #define N_RUNS 10
  9 | 
 10 | 
 11 | int  main (int argc, char** argv) {
 12 | 
 13 | 	ErrChk(cudaSetDevice(0));
 14 | 
 15 | 	if(argc<2){
 16 | 		printf("Usage: input the batch size\n");
 17 | 		exit(EXIT_FAILURE);
 18 | 	}
 19 | 
 20 | 	int BATCH = atoi(argv[1]);
 21 | 	
 22 | 	int *M;
 23 | 	int *N;
 24 | 	int *K;
 25 | 
 26 | 	M = (int*) malloc(BATCH * sizeof(int));
 27 | 	N = (int*) malloc(BATCH * sizeof(int));
 28 | 	K = (int*) malloc(BATCH * sizeof(int));
 29 | 
 30 | 	std::fstream fs;
 31 | 	fs.open("../data/input");
 32 | 	if (!fs.is_open()){
 33 | 		printf("Error opening input\n");
 34 | 		exit(EXIT_FAILURE);
 35 | 	}
 36 | 	
 37 | 	//read matrix config	
 38 | 	for (int i=0; i<BATCH; ++i){
 39 | 		fs>>M[i]>>N[i]>>K[i];
 40 | 	}
 41 | 
 42 |     float **A;
 43 | 	float **B;
 44 | 	float **C;
 45 | 
 46 | 	A = (float**) malloc(BATCH * sizeof(float*));
 47 | 	B = (float**) malloc(BATCH * sizeof(float*));
 48 | 	C = (float**) malloc(BATCH * sizeof(float*));
 49 | 
 50 | 	for (int i=0; i<BATCH; ++i){
 51 | 		ErrChk(cudaMalloc((void**)&A[i], M[i]*K[i]*sizeof(float)));
 52 | 		ErrChk(cudaMalloc((void**)&B[i], K[i]*N[i]*sizeof(float)));
 53 | 		ErrChk(cudaMalloc((void**)&C[i], M[i]*N[i]*sizeof(float)));
 54 | 	}
 55 | 
 56 | 	float **dev_A;
 57 | 	float **dev_B;
 58 | 	float **dev_C;
 59 | 
 60 |     ErrChk(cudaMalloc((void**)&dev_A, BATCH*sizeof(float*)));
 61 |     ErrChk(cudaMalloc((void**)&dev_B, BATCH*sizeof(float*)));
 62 |     ErrChk(cudaMalloc((void**)&dev_C, BATCH*sizeof(float*)));
 63 | 
 64 | 	ErrChk(cudaMemcpy(dev_A, A, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 65 | 	ErrChk(cudaMemcpy(dev_B, B, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 66 | 	ErrChk(cudaMemcpy(dev_C, C, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 67 | 
 68 | 
 69 | 	int *dev_M, *dev_N, *dev_K;
 70 | 	ErrChk(cudaMalloc((void**)&dev_M, BATCH*sizeof(int)));
 71 | 	ErrChk(cudaMalloc((void**)&dev_N, BATCH*sizeof(int)));
 72 | 	ErrChk(cudaMalloc((void**)&dev_K, BATCH*sizeof(int)));
 73 | 
 74 | 	ErrChk(cudaMemcpy(dev_M, M, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 75 | 	ErrChk(cudaMemcpy(dev_N, N, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 76 | 	ErrChk(cudaMemcpy(dev_K, K, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 77 | 
 78 | 	float elapsedTime = 0.f;
 79 |     double time=0.f;
 80 | 	float gflops_per_sec = 0.f;
 81 | 	double gflops = 0.f;
 82 | 	for (int i=0; i<BATCH; ++i)
 83 | 		gflops += ((2 * int64_t(M[i]) * int64_t(N[i]) * int64_t(K[i])) + (2 * int64_t(M[i]) * int64_t(N[i])) ) / 1.0e9;
 84 | 	cudaEvent_t start, stop;
 85 | 
 86 |     dim3 block_size;
 87 |     block_size.x = 64;
 88 |     block_size.y = 1;
 89 | 	block_size.z = 1;
 90 | 
 91 |     dim3 grid_size;
 92 |     grid_size.x = M[0] / 16;
 93 |     grid_size.y = N[0] / 16;
 94 | 	grid_size.z = BATCH;
 95 | 
 96 | 	for (int j=1; j<BATCH; ++j){
 97 | 		grid_size.x = (grid_size.x > M[j]/16)?(grid_size.x):(M[j]/16);
 98 | 		grid_size.y = (grid_size.y > N[j]/16)?(grid_size.y):(N[j]/16);
 99 | 	}
100 | 
101 | 	//warm-up
102 | 	gemm<64, 16, 16><<<grid_size, block_size, sizeof(float)*2*16*16>>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C);
103 | 	KernelErrChk();
104 | 
105 | 	ErrChk(cudaEventCreate(&start));
106 | 	ErrChk(cudaEventRecord(start,0));
107 | 
108 | 	for (int run = 0; run<N_RUNS; ++run){
109 | 		gemm<64, 16, 16><<<grid_size, block_size, sizeof(float)*2*16*16>>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C);
110 | 		KernelErrChk();
111 | 	}
112 | 
113 | 	ErrChk(cudaEventCreate(&stop));
114 | 	ErrChk(cudaEventRecord(stop,0));
115 | 	ErrChk(cudaEventSynchronize(stop));
116 | 	ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop));
117 | 
118 | 	time = elapsedTime/N_RUNS;
119 | 	time /= 1.0e3; //convert time unit from millisecond to second
120 | 	gflops_per_sec   = gflops / time;
121 | 	printf("%f\n", gflops_per_sec);
122 | 
123 | 	for (int i=0; i<BATCH; ++i){
124 | 		ErrChk(cudaFree(A[i]));		
125 | 		ErrChk(cudaFree(B[i]));		
126 | 		ErrChk(cudaFree(C[i]));		
127 | 	}
128 | 
129 | 	free(M);
130 | 	free(N);
131 | 	free(K);
132 | 	free(A);
133 | 	free(B);
134 | 	free(C);
135 | 	ErrChk(cudaFree(dev_M));		
136 | 	ErrChk(cudaFree(dev_N));		
137 | 	ErrChk(cudaFree(dev_K));		
138 | 	ErrChk(cudaFree(dev_A));		
139 | 	ErrChk(cudaFree(dev_B));		
140 | 	ErrChk(cudaFree(dev_C));		
141 | 
142 | 	return 0;
143 | 	
144 | }
145 | 


--------------------------------------------------------------------------------
/magma/log:
--------------------------------------------------------------------------------
  1 | 220.402008
  2 | 461.720367
  3 | 794.038818
  4 | 880.251160
  5 | 1303.655029
  6 | 1492.205688
  7 | 1731.399902
  8 | 216.325531
  9 | 461.966431
 10 | 786.593018
 11 | 971.223999
 12 | 1301.101196
 13 | 1505.377930
 14 | 1718.057251
 15 | 216.692902
 16 | 469.471985
 17 | 796.552246
 18 | 964.725647
 19 | 1330.041504
 20 | 1492.264771
 21 | 1756.731079
 22 | 225.755188
 23 | 469.981049
 24 | 791.828613
 25 | 921.310730
 26 | 1295.030640
 27 | 1486.907227
 28 | 1715.602417
 29 | 208.670837
 30 | 472.736206
 31 | 786.025940
 32 | 951.179016
 33 | 1300.737183
 34 | 1499.388916
 35 | 1732.232666
 36 | 217.397781
 37 | 466.566223
 38 | 793.942505
 39 | 973.190674
 40 | 1342.039429
 41 | 1468.827271
 42 | 1731.838135
 43 | 219.849609
 44 | 466.566223
 45 | 786.025940
 46 | 939.229431
 47 | 1300.373291
 48 | 1512.627930
 49 | 1752.547485
 50 | 166.178726
 51 | 456.189728
 52 | 923.375549
 53 | 1387.977417
 54 | 1608.023926
 55 | 1982.725342
 56 | 2255.758301
 57 | 171.611404
 58 | 448.595856
 59 | 920.707764
 60 | 1409.032471
 61 | 1621.045532
 62 | 2011.930176
 63 | 2257.054443
 64 | 167.211121
 65 | 428.525757
 66 | 913.843262
 67 | 1369.538208
 68 | 1585.075928
 69 | 1991.144409
 70 | 2257.908691
 71 | 167.078995
 72 | 446.496307
 73 | 916.669067
 74 | 1386.872803
 75 | 1636.846924
 76 | 2013.810059
 77 | 2258.740967
 78 | 166.984772
 79 | 447.711670
 80 | 908.373108
 81 | 1393.526733
 82 | 1633.074585
 83 | 2004.696655
 84 | 2251.833984
 85 | 166.834213
 86 | 437.206268
 87 | 927.473816
 88 | 1384.197632
 89 | 1635.084473
 90 | 1985.553101
 91 | 2258.885254
 92 | 168.755127
 93 | 462.335968
 94 | 914.761353
 95 | 1384.092896
 96 | 1621.812378
 97 | 2008.882812
 98 | 2262.778320
 99 | 604.745300
100 | 1674.186768
101 | 1695.227173
102 | 1726.686646
103 | 2105.956299
104 | 2320.672852
105 | 2372.013672
106 | 610.331116
107 | 1681.639038
108 | 1673.569336
109 | 1653.496948
110 | 2110.467773
111 | 2325.627686
112 | 2382.451660
113 | 594.076355
114 | 1675.916260
115 | 1680.707520
116 | 1681.702148
117 | 2104.109375
118 | 2332.856689
119 | 2376.438965
120 | 608.669250
121 | 1664.838379
122 | 1675.217896
123 | 1663.155151
124 | 2118.130615
125 | 2318.256348
126 | 2378.222900
127 | 612.715881
128 | 1672.663696
129 | 1697.275269
130 | 1685.001587
131 | 2088.912354
132 | 2327.686768
133 | 2376.948975
134 | 608.070312
135 | 1685.029663
136 | 1645.892334
137 | 1663.875610
138 | 2112.215820
139 | 2328.505371
140 | 2371.458496
141 | 612.037720
142 | 1683.332520
143 | 1689.671875
144 | 1669.845947
145 | 2087.446777
146 | 2322.573975
147 | 2376.622803
148 | 971.658081
149 | 1878.041870
150 | 2041.627075
151 | 2151.278564
152 | 2340.648682
153 | 2472.239746
154 | 2718.577393
155 | 1090.327881
156 | 2084.469727
157 | 2216.857178
158 | 2354.239746
159 | 2589.738037
160 | 2438.343018
161 | 2691.305420
162 | 1079.186279
163 | 2067.136475
164 | 2244.705078
165 | 2360.559570
166 | 2579.396484
167 | 2703.840088
168 | 2717.526123
169 | 1065.567139
170 | 2101.460205
171 | 2239.585693
172 | 2363.165039
173 | 2591.283691
174 | 2700.285400
175 | 2718.118652
176 | 1062.612915
177 | 2054.405029
178 | 2291.228516
179 | 2349.743652
180 | 2592.430908
181 | 2700.399414
182 | 2715.643799
183 | 1063.474121
184 | 2071.815186
185 | 2206.372314
186 | 2358.249512
187 | 2588.390869
188 | 2699.243408
189 | 2718.219727
190 | 1077.481934
191 | 2072.215332
192 | 2307.405029
193 | 2363.966797
194 | 2590.306641
195 | 2697.028320
196 | 2717.016846
197 | 


--------------------------------------------------------------------------------
/magma/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm -f log
 4 | for ((M=128; M<=1024; M=M*2))
 5 | do
 6 | 	for ((K=16; K<=1024; K=K*2))
 7 | 	do
 8 | 		cd ../data
 9 | 		./gen_data $M $M $K
10 | 		cd - > /dev/null
11 | 		./gemm 4 >> log
12 | 		./gemm 8 >> log
13 | 		./gemm 16 >> log
14 | 		./gemm 32 >> log
15 | 		./gemm 64 >> log
16 | 		./gemm 128 >> log
17 | 		./gemm 256 >> log
18 | 	done
19 | done
20 | 


--------------------------------------------------------------------------------
/tiling/Makefile:
--------------------------------------------------------------------------------
#GENCODE_FLAGS = -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70
GENCODE_FLAGS =  -gencode arch=compute_70,code=compute_70

# clean is a command, not a file: run it even if a file named "clean" exists.
.PHONY: clean

# kernel.h includes kernel_128.h and kernel_256.h, and gemm.cu includes
# ../include/util.h; list them so edits to any of them trigger a rebuild.
# $< is still gemm.cu (the first prerequisite).
gemm:gemm.cu kernel.h kernel_128.h kernel_256.h ../include/util.h
	nvcc  $< -o $@ --std=c++11 -O3 ${GENCODE_FLAGS} -Xptxas -v
clean:
	rm -rf gemm *.o
8 | 


--------------------------------------------------------------------------------
/tiling/gemm.cu:
--------------------------------------------------------------------------------
  1 | #include <cstdlib>
  2 | #include <cstdio>
  3 | #include <fstream>
  4 | #include <cublas_v2.h>
  5 | #include "../include/util.h"
  6 | #include "kernel.h"
  7 | 
  8 | #define N_RUNS 10
  9 | 
 10 | int  main (int argc, char** argv) {
 11 | 
 12 | 	ErrChk(cudaSetDevice(0));
 13 | 
 14 | 	if(argc<2){
 15 | 		printf("Usage: input the batch size\n");
 16 | 		exit(EXIT_FAILURE);
 17 | 	}
 18 | 
 19 | 	int BATCH = atoi(argv[1]);
 20 | 	//int TLP_thres = atoi(argv[2]);
 21 | 	int TLP_thres = 65536;
 22 | 	
 23 | 	int *M;
 24 | 	int *N;
 25 | 	int *K;
 26 | 
 27 | 	M = (int*) malloc(BATCH * sizeof(int));
 28 | 	N = (int*) malloc(BATCH * sizeof(int));
 29 | 	K = (int*) malloc(BATCH * sizeof(int));
 30 | 
 31 | 	std::fstream fs;
 32 | 	fs.open("../data/input");
 33 | 	if (!fs.is_open()){
 34 | 		printf("Error opening input\n");
 35 | 		exit(EXIT_FAILURE);
 36 | 	}
 37 | 	
 38 | 	//read matrix config	
 39 | 	for (int i=0; i<BATCH; ++i){
 40 | 		fs>>M[i]>>N[i]>>K[i];
 41 | 	}
 42 | 
 43 |     float **A;
 44 | 	float **B;
 45 | 	float **C;
 46 | 
 47 | 	A = (float**) malloc(BATCH * sizeof(float*));
 48 | 	B = (float**) malloc(BATCH * sizeof(float*));
 49 | 	C = (float**) malloc(BATCH * sizeof(float*));
 50 | 
 51 | 	for (int i=0; i<BATCH; ++i){
 52 | 		ErrChk(cudaMalloc((void**)&A[i], M[i]*K[i]*sizeof(float)));
 53 | 		ErrChk(cudaMalloc((void**)&B[i], K[i]*N[i]*sizeof(float)));
 54 | 		ErrChk(cudaMalloc((void**)&C[i], M[i]*N[i]*sizeof(float)));
 55 | 	}
 56 | 
 57 | 	float **dev_A;
 58 | 	float **dev_B;
 59 | 	float **dev_C;
 60 | 
 61 |     ErrChk(cudaMalloc((void**)&dev_A, BATCH*sizeof(float*)));
 62 |     ErrChk(cudaMalloc((void**)&dev_B, BATCH*sizeof(float*)));
 63 |     ErrChk(cudaMalloc((void**)&dev_C, BATCH*sizeof(float*)));
 64 | 
 65 | 	ErrChk(cudaMemcpy(dev_A, A, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 66 | 	ErrChk(cudaMemcpy(dev_B, B, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 67 | 	ErrChk(cudaMemcpy(dev_C, C, BATCH*sizeof(float*), cudaMemcpyHostToDevice));
 68 | 
 69 | 
 70 | 	int *dev_M, *dev_N, *dev_K;
 71 | 	ErrChk(cudaMalloc((void**)&dev_M, BATCH*sizeof(int)));
 72 | 	ErrChk(cudaMalloc((void**)&dev_N, BATCH*sizeof(int)));
 73 | 	ErrChk(cudaMalloc((void**)&dev_K, BATCH*sizeof(int)));
 74 | 
 75 | 	ErrChk(cudaMemcpy(dev_M, M, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 76 | 	ErrChk(cudaMemcpy(dev_N, N, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 77 | 	ErrChk(cudaMemcpy(dev_K, K, BATCH*sizeof(int), cudaMemcpyHostToDevice));
 78 | 
 79 | 	
 80 | 	float elapsedTime = 0.f;
 81 |     double time=0.f;
 82 | 	float gflops_per_sec = 0.f;
 83 | 	double gflops = 0.f;
 84 | 	for (int i=0; i<BATCH; ++i)
 85 | 		gflops += ((2 * int64_t(M[i]) * int64_t(N[i]) * int64_t(K[i])) + (2 * int64_t(M[i]) * int64_t(N[i])) ) / 1.0e9;
 86 | 	cudaEvent_t start, stop;
 87 | 
 88 | 	//compute grid size and block size
 89 | 
 90 | 	//int kThreads = 256;
 91 | 	int TLP = 0;
 92 | 
 93 | 	const int tile_size[6][2] = {
 94 | 		16, 16,
 95 | 		32, 32,
 96 | 		64, 64,
 97 | 		128, 64,
 98 | 		64, 128,
 99 | 		128, 128
100 | 	};
101 | 	
102 | 	int *t_strategy;
103 | 	t_strategy = (int*) malloc(BATCH * sizeof(int));
104 | 
105 | 	int t;	
106 | 	for (t=0; t<6; ++t){
107 | 		TLP = 0;
108 | 		for (int j=0; j<BATCH; ++j)
109 | 			TLP += (M[j]/tile_size[t][0])*(N[j]/tile_size[t][1])*256;
110 | 		
111 | 		if (TLP < TLP_thres)
112 | 			break;
113 | 	}
114 | 
115 | 	for (int j=0; j<BATCH; ++j){
116 | 	
117 | 		t_strategy[j] = 0;
118 | 		t = (t==6?5:t);
119 | 
120 | 		if (tile_size[t][0] <= M[j] && tile_size[t][1] <= N[j])
121 | 			t_strategy[j] = t;
122 | 		else{
123 | 			for (int k=0; k<t; ++k){
124 | 				if (tile_size[k][0] == M[j] && tile_size[k][1] <= N[j]){
125 | 					t_strategy[j] = k;
126 | 				}
127 | 			}
128 | 		}
129 | 	}
130 | 
131 | /*	
132 | 	//print the obtained tiling strategy
133 | 	for (int j=0; j<BATCH; ++j)
134 | 		printf("%d ", t_strategy[j]);
135 | 	printf("\n");
136 | */
137 | 
138 | 	
139 | 
140 | 	int *dev_T;
141 | 	ErrChk(cudaMalloc((void**)&dev_T, BATCH*sizeof(int)));
142 | 	ErrChk(cudaMemcpy(dev_T, t_strategy, BATCH*sizeof(int), cudaMemcpyHostToDevice));
143 | 
144 | 
145 |     dim3 block_size;
146 |     block_size.x = 256;
147 |     block_size.y = 1;
148 | 	block_size.z = 1;
149 | 
150 |     dim3 grid_size;
151 | 	
152 |     grid_size.x = M[0] / tile_size[t_strategy[0]][0];
153 |     grid_size.y = N[0] / tile_size[t_strategy[0]][1];
154 | 	grid_size.z = BATCH;
155 | 	for (int j=1; j<BATCH; ++j){
156 | 		grid_size.x = (grid_size.x > M[j]/tile_size[t_strategy[j]][0])? (grid_size.x):(M[j]/tile_size[t_strategy[j]][0]);
157 | 		grid_size.y = (grid_size.y > N[j]/tile_size[t_strategy[j]][1])? (grid_size.y):(N[j]/tile_size[t_strategy[j]][1]);
158 | 	}
159 | 
160 | //	printf("%d %d %d\n", grid_size.x, grid_size.y, grid_size.z);
161 | 
162 | 	//warm-up
163 | 	gemm<256><<<grid_size, block_size, sizeof(float)*4*128*8>>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T);
164 | 	KernelErrChk();
165 | 
166 | 	ErrChk(cudaEventCreate(&start));
167 | 	ErrChk(cudaEventRecord(start,0));
168 | 
169 | 	for (int run = 0; run<N_RUNS; ++run){
170 | 		gemm<256><<<grid_size, block_size, sizeof(float)*4*128*8>>>(dev_M, dev_N, dev_K, dev_A, dev_B, dev_C, dev_T);
171 | 		KernelErrChk();
172 | 	}
173 | 
174 | 	ErrChk(cudaEventCreate(&stop));
175 | 	ErrChk(cudaEventRecord(stop,0));
176 | 	ErrChk(cudaEventSynchronize(stop));
177 | 	ErrChk(cudaEventElapsedTime(&elapsedTime, start,stop));
178 | 
179 | 	time = elapsedTime/N_RUNS;
180 | 	time /= 1.0e3; //convert time unit from millisecond to second
181 | 	gflops_per_sec   = gflops / time;
182 | 	printf("%f\n", gflops_per_sec);
183 | 
184 | 	for (int i=0; i<BATCH; ++i){
185 | 		ErrChk(cudaFree(A[i]));		
186 | 		ErrChk(cudaFree(B[i]));		
187 | 		ErrChk(cudaFree(C[i]));		
188 | 	}
189 | 
190 | 	free(M);
191 | 	free(N);
192 | 	free(K);
193 | 	free(A);
194 | 	free(B);
195 | 	free(C);
196 | 	free(t_strategy);
197 | 
198 | 	ErrChk(cudaFree(dev_M));		
199 | 	ErrChk(cudaFree(dev_N));		
200 | 	ErrChk(cudaFree(dev_K));		
201 | 	ErrChk(cudaFree(dev_T));		
202 | 
203 | 	ErrChk(cudaFree(dev_A));		
204 | 	ErrChk(cudaFree(dev_B));		
205 | 	ErrChk(cudaFree(dev_C));		
206 | 
207 | 	return 0;
208 | }
209 | 


--------------------------------------------------------------------------------
/tiling/kernel.h:
--------------------------------------------------------------------------------
 1 | #include "kernel_128.h"
 2 | #include "kernel_256.h"
 3 | 
 4 | template<int kThreads>
 5 | __global__ void gemm(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]);
 6 | 
 7 | 
 8 | template<>
 9 | __global__ void gemm<128>(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]){
10 | 	
11 | 	int i = blockIdx.z;
12 | 	extern __shared__ float sh[];
13 | 	int t = T_strategy[i];
14 | 
15 | 	switch(t){
16 | 		case 0:
17 | 			if (blockIdx.x * 16 < M[i] && blockIdx.y * 16 < N[i])	
18 | 				gemm_128_16x16(M[i], N[i], K[i], A[i], B[i], C[i], sh);
19 | 			break;
20 | 		case 1:
21 | 			if (blockIdx.x * 32 < M[i] && blockIdx.y * 32 < N[i])	
22 | 				gemm_128_32x32(M[i], N[i], K[i], A[i], B[i], C[i], sh);
23 | 			break;
24 | 		case 2:
25 | 			if (blockIdx.x * 64 < M[i] && blockIdx.y * 64 < N[i])	
26 | 				gemm_128_64x64(M[i], N[i], K[i], A[i], B[i], C[i], sh);
27 | 			break;
28 | 		case 3:
29 | 			if (blockIdx.x * 128 < M[i] && blockIdx.y * 64 < N[i])	
30 | 				gemm_128_128x64(M[i], N[i], K[i], A[i], B[i], C[i], sh);
31 | 			break;
32 | 		case 4:
33 | 			if (blockIdx.x * 64 < M[i] && blockIdx.y * 128 < N[i])	
34 | 				gemm_128_64x128(M[i], N[i], K[i], A[i], B[i], C[i], sh);
35 | 			break;
36 | 		case 5:
37 | //			if (blockIdx.x * 128 < M[i] && blockIdx.y * 128 < N[i])	
38 | //				gemm_128_128x128(M[i], N[i], K[i], A[i], B[i], C[i], sh);
39 | 			break;
40 | 	}
41 | 
42 | 	return;
43 | }
44 | 
// 256-thread specialisation of the batched GEMM entry point. blockIdx.z
// selects the problem, T_strategy[] its tile shape. Strategy -> tile routine:
//   0: 16x16   1: 32x32   2: 64x64   3: 128x64   4: 128x64   5: 128x128
// Any other strategy value launches no work.
// NOTE(review): strategy 4 runs the same 128x64 routine and guard as
// strategy 3; a 64x128 call exists only as commented-out code. Confirm that
// the host sizes the grid for 128x64 tiles when it assigns strategy 4.
// A block only calls its tile routine when its tile origin lies inside the
// problem, because the grid is shared by all problems in the batch.
template<>
__global__ void gemm<256>(int M[], int N[], int K[], float *A[], float *B[], float *C[], int T_strategy[]){

	extern __shared__ float sh[];

	const int batch = blockIdx.z;
	const int strategy = T_strategy[batch];

	// Per-problem parameters.
	const int m = M[batch];
	const int n = N[batch];
	const int k = K[batch];
	float *a = A[batch];
	float *b = B[batch];
	float *c = C[batch];

	if (strategy == 0){
		if (blockIdx.x * 16 < m && blockIdx.y * 16 < n)
			gemm_256_16x16(m, n, k, a, b, c, sh);
	}
	else if (strategy == 1){
		if (blockIdx.x * 32 < m && blockIdx.y * 32 < n)
			gemm_256_32x32(m, n, k, a, b, c, sh);
	}
	else if (strategy == 2){
		if (blockIdx.x * 64 < m && blockIdx.y * 64 < n)
			gemm_256_64x64(m, n, k, a, b, c, sh);
	}
	else if (strategy == 3 || strategy == 4){
		if (blockIdx.x * 128 < m && blockIdx.y * 64 < n)
			gemm_256_128x64(m, n, k, a, b, c, sh);
	}
	else if (strategy == 5){
		if (blockIdx.x * 128 < m && blockIdx.y * 128 < n)
			gemm_256_128x128(m, n, k, a, b, c, sh);
	}
}
83 | 


--------------------------------------------------------------------------------
/tiling/kernel_128.h:
--------------------------------------------------------------------------------
  1 | __device__ void gemm_128_16x16(int M, int N, int K, float *A, float *B, float *C, float *sh){
  2 | 
  3 | 	float *sh_A = sh;
  4 | 	float *sh_B = sh + 2*16*8;
  5 | 
  6 | 	float2 reg_C;
  7 | 	float2 reg_A;
  8 | 	float reg_B;
  9 | 
 10 | 	// Compute block's starting coordinate
 11 | 	int block_base_x = blockIdx.y*16;
 12 | 	int block_base_y = blockIdx.x*16;
 13 | 
 14 | 	//Load C from global memory to register file
 15 | 	float2 *C_start = (float2*) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*2 + (threadIdx.x/8)*M);
 16 | 
 17 | 	reg_C = *C_start;
 18 | 
 19 | 	//load A from global memory to shared memory
 20 | 	float *A_start = A + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M;
 21 | 	*(sh_A + threadIdx.x) = *(A_start);
 22 | 
 23 | 	//load A from global memory to shared memory
 24 | 	float *B_start = B + K*block_base_x + (threadIdx.x/16) + (threadIdx.x%16)*K;
 25 | 	*(sh_B + threadIdx.x) = *(B_start);
 26 | 
 27 | 
 28 | 	int double_buffer = 0;
 29 | #pragma unroll
 30 | 	for(int k=0; k<K; k+=8){
 31 | 		__syncthreads();
 32 | 		int A_offset = double_buffer + (threadIdx.x%8)*2;
 33 | 		int B_offset = double_buffer + (threadIdx.x/8);
 34 | 			
 35 | #pragma unroll
 36 | 		for (int i=0; i<8; i++)	{
 37 | 			
 38 | 			reg_A.x = sh_A[A_offset];
 39 | 			reg_A.y = sh_A[A_offset+1];
 40 | 
 41 | 			reg_B = sh_B[B_offset];
 42 | 
 43 | 			reg_C.x = fma(reg_A.x, reg_B, reg_C.x);
 44 | 			reg_C.y = fma(reg_A.y, reg_B, reg_C.y);
 45 | 
 46 | 			A_offset += 16;
 47 | 			B_offset += 16;
 48 | 		}
 49 | 
 50 | 		double_buffer ^= 128;
 51 | 
 52 | 		if (k+8 < K){
 53 | 			A_start += 8*M; 
 54 | 			*(sh_A + double_buffer + threadIdx.x) = *(A_start);
 55 | 			B_start += 8; 
 56 | 			*(sh_B + double_buffer + threadIdx.x) = *(B_start);
 57 | 		}
 58 | 	}
 59 | 	
 60 |     *C_start = reg_C;
 61 | }
 62 | 
 63 | __device__ void gemm_128_32x32(int M, int N, int K, float *A, float *B, float *C, float *sh){
 64 | 
 65 | 	float *sh_A = sh;
 66 | 	float *sh_B = sh + 2*32*8;
 67 | 
 68 | 	float4 reg_C[2];
 69 | 	float4 reg_A;
 70 | 	float  reg_B[2];
 71 | 
 72 | 	// Compute block's starting coordinate
 73 | 	int block_base_x = blockIdx.y*32;
 74 | 	int block_base_y = blockIdx.x*32;
 75 | 
 76 | 	//Load C from global memory to register file
 77 | 	float4 *C_start = (float4*) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*4 + (threadIdx.x/8)*M);
 78 | 
 79 | 	reg_C[0] = *C_start;
 80 | 	reg_C[1] = *(C_start + 4*M);
 81 | 
 82 | 	//load A from global memory to shared memory
 83 | 	float2 *A_start = (float2*) (A + block_base_y + (threadIdx.x%16)*2 + (threadIdx.x/16)*M);
 84 | 	*((float2*)(sh_A + 2*threadIdx.x)) = *(A_start);
 85 | 
 86 | 	//load B from global memory to shared memory
 87 | 	float2 *B_start = (float2*) (B + K*block_base_x + (threadIdx.x/32)*2 + (threadIdx.x%32)*K);
 88 | 	*((float2*)(sh_B + 2*threadIdx.x)) = *(B_start);
 89 | 
 90 | 	int double_buffer = 0;
 91 | #pragma unroll
 92 | 	for(int k=0; k<K; k+=8){
 93 | 		__syncthreads();
 94 | 		int A_offset = double_buffer + (threadIdx.x%8)*4;
 95 | 		int B_offset = double_buffer + (threadIdx.x/8)*2;
 96 | 			
 97 | #pragma unroll
 98 | 		for (int i=0; i<8; i++)	{
 99 | 			
100 | 			reg_A.x = sh_A[A_offset];
101 | 			reg_A.y = sh_A[A_offset+1];
102 | 			reg_A.z = sh_A[A_offset+2];
103 | 			reg_A.w = sh_A[A_offset+3];
104 | 
105 | 			reg_B[0] = sh_B[B_offset];
106 | 			reg_B[1] = sh_B[B_offset+32];
107 | 
108 | 			reg_C[0].x = fma(reg_A.x, reg_B[0], reg_C[0].x);
109 | 			reg_C[0].y = fma(reg_A.y, reg_B[0], reg_C[0].y);
110 | 			reg_C[0].z = fma(reg_A.z, reg_B[0], reg_C[0].z);
111 | 			reg_C[0].w = fma(reg_A.w, reg_B[0], reg_C[0].w);
112 | 			reg_C[1].x = fma(reg_A.x, reg_B[1], reg_C[1].x);
113 | 			reg_C[1].y = fma(reg_A.y, reg_B[1], reg_C[1].y);
114 | 			reg_C[1].z = fma(reg_A.z, reg_B[1], reg_C[1].z);
115 | 			reg_C[1].w = fma(reg_A.w, reg_B[1], reg_C[1].w);
116 | 
117 | 			A_offset += 32;
118 | 			B_offset += ((i%2)*62 + 1);
119 | 		}
120 | 
121 | 		double_buffer ^= 256;
122 | 
123 | 		if (k+8 < K){
124 | 			A_start += 4*M; 
125 | 			*((float2*)(sh_A + double_buffer + 2*threadIdx.x)) = *(A_start);
126 | 			B_start += 4; 
127 | 			*((float2*)(sh_B + double_buffer + 2*threadIdx.x)) = *(B_start);
128 | 		}
129 | 	}
130 | 	
131 |     *C_start = reg_C[0];
132 |     *(C_start + 4*M) = reg_C[1];
133 | }
134 | 
135 | __device__ void gemm_128_64x64(int M, int N, int K, float *A, float *B, float *C, float *sh){
136 | 
137 | 	float *sh_A = sh;
138 | 	float *sh_B = sh + 2*64*8;
139 | 
140 | 	float4 reg_C[8];
141 | 	float4 reg_A;
142 | 	float  reg_B[8];
143 | 
144 | 	// Compute block's starting coordinate
145 | 	int block_base_x = blockIdx.y*64;
146 | 	int block_base_y = blockIdx.x*64;
147 | 
148 | 	//Load C from global memory to register file
149 | 	float4 *C_start = (float4*) (C + block_base_x*M + block_base_y + (threadIdx.x%16)*4 + (threadIdx.x/16)*4*M);
150 | 
151 |     reg_C[0] = *C_start;
152 | 	reg_C[1] = *(C_start + M/4);
153 | 	reg_C[2] = *(C_start + M/2);
154 | 	reg_C[3] = *(C_start + 3*M/4);
155 | 
156 | 	C_start += 8*M;
157 | 	reg_C[4] = *(C_start);
158 | 	reg_C[5] = *(C_start + M/4);
159 | 	reg_C[6] = *(C_start + M/2);
160 | 	reg_C[7] = *(C_start + 3*M/4);
161 | 
162 | 	//load A from global memory to shared memory
163 | 	float4 *A_start = (float4*) (A + block_base_y + (threadIdx.x%16)*4 + (threadIdx.x/16)*M); 
164 | 	*((float4*) (sh_A + 4*threadIdx.x)) = *(A_start);
165 | 
166 | 	//load A from global memory to shared memory
167 | 	float4 *B_start = (float4*) (B + K*block_base_x + (threadIdx.x/64)*4 + (threadIdx.x%64)*K); 
168 | 	*((float4*) (sh_B + 4*threadIdx.x)) = *(B_start);
169 | 		
170 | 	int double_buffer = 0;
171 | 
172 | #pragma unroll
173 | 	for(int k=0; k<K; k+=8){
174 | 
175 | 		__syncthreads();
176 | 		int A_offset = double_buffer + (threadIdx.x%16)*4;
177 | 		int B_offset = double_buffer + ((threadIdx.x/16)*16);
178 | 			
179 | #pragma unroll
180 | 		for (int i=0; i<8; ++i)	{
181 | 			
182 | 			reg_A = *((float4*)(sh_A + A_offset));
183 | 			reg_B[0] = sh_B[B_offset];
184 | 			reg_B[1] = sh_B[B_offset+4];
185 | 			reg_B[2] = sh_B[B_offset+8];
186 | 			reg_B[3] = sh_B[B_offset+12];
187 | 			reg_B[4] = sh_B[B_offset+128];
188 | 			reg_B[5] = sh_B[B_offset+132];
189 | 			reg_B[6] = sh_B[B_offset+136];
190 | 			reg_B[7] = sh_B[B_offset+140];
191 | 
192 | 			reg_C[0].x = fma(reg_A.x, reg_B[0], reg_C[0].x);
193 | 			reg_C[1].x = fma(reg_A.x, reg_B[1], reg_C[1].x);
194 | 			reg_C[2].x = fma(reg_A.x, reg_B[2], reg_C[2].x);
195 | 			reg_C[3].x = fma(reg_A.x, reg_B[3], reg_C[3].x);
196 | 			reg_C[4].x = fma(reg_A.x, reg_B[4], reg_C[4].x);
197 | 			reg_C[5].x = fma(reg_A.x, reg_B[5], reg_C[5].x);
198 | 			reg_C[6].x = fma(reg_A.x, reg_B[6], reg_C[6].x);
199 | 			reg_C[7].x = fma(reg_A.x, reg_B[7], reg_C[7].x);
200 | 
201 | 			reg_C[0].y = fma(reg_A.y, reg_B[0], reg_C[0].y);
202 | 			reg_C[1].y = fma(reg_A.y, reg_B[1], reg_C[1].y);
203 | 			reg_C[2].y = fma(reg_A.y, reg_B[2], reg_C[2].y);
204 | 			reg_C[3].y = fma(reg_A.y, reg_B[3], reg_C[3].y);
205 | 			reg_C[4].y = fma(reg_A.y, reg_B[4], reg_C[4].y);
206 | 			reg_C[5].y = fma(reg_A.y, reg_B[5], reg_C[5].y);
207 | 			reg_C[6].y = fma(reg_A.y, reg_B[6], reg_C[6].y);
208 | 			reg_C[7].y = fma(reg_A.y, reg_B[7], reg_C[7].y);
209 | 
210 | 			reg_C[0].z = fma(reg_A.z, reg_B[0], reg_C[0].z);
211 | 			reg_C[1].z = fma(reg_A.z, reg_B[1], reg_C[1].z);
212 | 			reg_C[2].z = fma(reg_A.z, reg_B[2], reg_C[2].z);
213 | 			reg_C[3].z = fma(reg_A.z, reg_B[3], reg_C[3].z);
214 | 			reg_C[4].z = fma(reg_A.z, reg_B[4], reg_C[4].z);
215 | 			reg_C[5].z = fma(reg_A.z, reg_B[5], reg_C[5].z);
216 | 			reg_C[6].z = fma(reg_A.z, reg_B[6], reg_C[6].z);
217 | 			reg_C[7].z = fma(reg_A.z, reg_B[7], reg_C[7].z);
218 | 
219 | 			reg_C[0].w = fma(reg_A.w, reg_B[0], reg_C[0].w);
220 | 			reg_C[1].w = fma(reg_A.w, reg_B[1], reg_C[1].w);
221 | 			reg_C[2].w = fma(reg_A.w, reg_B[2], reg_C[2].w);
222 | 			reg_C[3].w = fma(reg_A.w, reg_B[3], reg_C[3].w);
223 | 			reg_C[4].w = fma(reg_A.w, reg_B[4], reg_C[4].w);
224 | 			reg_C[5].w = fma(reg_A.w, reg_B[5], reg_C[5].w);
225 | 			reg_C[6].w = fma(reg_A.w, reg_B[6], reg_C[6].w);
226 | 			reg_C[7].w = fma(reg_A.w, reg_B[7], reg_C[7].w);
227 | 
228 | 			A_offset += 64;
229 | 			B_offset += ((i==3)*252 + 1);
230 | 		}
231 | 
232 | 		double_buffer ^= 512;
233 | 
234 | 		if (k+8 < K){
235 | 			A_start += 2*M; 
236 | 			*((float4*) (sh_A + double_buffer + 4*threadIdx.x)) = *(A_start);
237 | 
238 | 			B_start += 2; 
239 | 			*((float4*) (sh_B + double_buffer + 4*threadIdx.x)) = *(B_start);
240 | 		}
241 | 				
242 | 	}
243 | 	C_start -= 8*M;
244 |     *C_start = reg_C[0];
245 | 	*(C_start + M/4) = reg_C[1];
246 | 	*(C_start + M/2) = reg_C[2];
247 | 	*(C_start + 3*M/4) = reg_C[3];
248 | 
249 | 	C_start += 8*M;
250 | 	*(C_start) = reg_C[4];
251 | 	*(C_start + M/4) = reg_C[5];
252 | 	*(C_start + M/2) = reg_C[6];
253 | 	*(C_start + 3*M/4) = reg_C[7];
254 | 
255 | }
// Accumulates one 64x128 tile of C += A*B for the block selected by blockIdx.
//
// Addressing used by the code:
//   C element (row, col) is read/written at C[col*M + row];
//   A element (row, k)   is read at           A[k*M + row];
//   B element (k, col)   is read at           B[col*K + k].
// N is not referenced. There are no bounds checks.
// NOTE(review): the indexing assumes blockDim.x == 128, 16-byte aligned
// pointers, M a multiple of 4 and K a multiple of 8 - confirm with the launcher.
// With 128 threads `sh` must provide 2*512 floats for A panels followed by
// 2*1024 floats for B panels (3072 floats in total).
__device__ void gemm_128_64x128(int M, int N, int K, float *A, float *B, float *C, float *sh){

	const unsigned int tid = threadIdx.x;

	// Two A panels (64 rows x 8 k-steps) come first, the B panels follow.
	float *sh_A = sh;
	float *sh_B = sh + 2*64*8;

	float4 acc[16];     // acc[8*h + 4*r + j]: column half h, row half r, column j
	float4 a_frag[2];   // four consecutive rows for each row half
	float  b_frag[8];   // four columns for each column half

	// Origin of this block's tile inside C
	const int tile_col = blockIdx.y*128;
	const int tile_row = blockIdx.x*64;

	// Read the current C values into the accumulators.
	// In float4 units: +8 is 32 rows further, +16*M is 64 columns further,
	// and (j*M)/4 steps j columns (0, M/4, M/2, 3*M/4).
	float4 *C_tile = reinterpret_cast<float4*>(C + tile_col*M + tile_row + (tid%8)*4 + (tid/8)*4*M);
#pragma unroll
	for (int h = 0; h < 2; ++h) {
#pragma unroll
		for (int r = 0; r < 2; ++r) {
#pragma unroll
			for (int j = 0; j < 4; ++j) {
				acc[8*h + 4*r + j] = *(C_tile + 16*M*h + 8*r + (j*M)/4);
			}
		}
	}

	// Stage the first A panel: each thread copies four consecutive rows of one k column.
	float4 *A_src = reinterpret_cast<float4*>(A + tile_row + (tid%16)*4 + (tid/16)*M);
	*reinterpret_cast<float4*>(sh_A + 4*tid) = *A_src;

	// Stage the first B panel: each thread copies k = 0..3 and k = 4..7 of one column.
	float4 *B_src = reinterpret_cast<float4*>(B + K*tile_col + tid*K);
	*reinterpret_cast<float4*>(sh_B + 4*tid) = B_src[0];
	*reinterpret_cast<float4*>(sh_B + 512 + 4*tid) = B_src[1];

	int buf_A = 0;   // float offset of the A panel being consumed
	int buf_B = 0;   // float offset of the B panel being consumed
#pragma unroll
	for (int k = 0; k < K; k += 8) {

		// The panel written before this point must be visible to every thread.
		__syncthreads();
		int a_idx = buf_A + (tid%8)*4;
		int b_idx = buf_B + (tid/8)*16;

#pragma unroll
		for (int i = 0; i < 8; ++i) {

			a_frag[0] = *reinterpret_cast<float4*>(sh_A + a_idx);
			a_frag[1] = *reinterpret_cast<float4*>(sh_A + a_idx + 32);

#pragma unroll
			for (int j = 0; j < 4; ++j) {
				b_frag[j]     = sh_B[b_idx + 4*j];
				b_frag[4 + j] = sh_B[b_idx + 256 + 4*j];
			}

			// Rank-1 update of the 8x8 per-thread sub-tile.
#pragma unroll
			for (int h = 0; h < 2; ++h) {
#pragma unroll
				for (int r = 0; r < 2; ++r) {
#pragma unroll
					for (int j = 0; j < 4; ++j) {
						const float b = b_frag[4*h + j];
						float4 &c = acc[8*h + 4*r + j];
						c.x = fma(a_frag[r].x, b, c.x);
						c.y = fma(a_frag[r].y, b, c.y);
						c.z = fma(a_frag[r].z, b, c.z);
						c.w = fma(a_frag[r].w, b, c.w);
					}
				}
			}

			a_idx += 64;
			// After k-step 3 jump to the second half of the B panel (k = 4..7).
			b_idx += (i == 3) ? 509 : 1;
		}

		// Switch to the other panel of each double buffer.
		buf_A ^= 512;
		buf_B ^= 1024;

		// Fetch the next 8 k-steps unless this was the last slab.
		if (k + 8 < K) {
			A_src += 2*M;
			*reinterpret_cast<float4*>(sh_A + buf_A + 4*tid) = *A_src;

			B_src += 2;
			*reinterpret_cast<float4*>(sh_B + buf_B + 4*tid) = B_src[0];
			*reinterpret_cast<float4*>(sh_B + buf_B + 512 + 4*tid) = B_src[1];
		}
	}

	// Write the accumulators back to the locations they were read from.
#pragma unroll
	for (int h = 0; h < 2; ++h) {
#pragma unroll
		for (int r = 0; r < 2; ++r) {
#pragma unroll
			for (int j = 0; j < 4; ++j) {
				*(C_tile + 16*M*h + 8*r + (j*M)/4) = acc[8*h + 4*r + j];
			}
		}
	}
}
438 | 
// Accumulates one 128x64 tile of C += A*B for the block selected by blockIdx.
//
// Addressing used by the code:
//   C element (row, col) is read/written at C[col*M + row];
//   A element (row, k)   is read at           A[k*M + row];
//   B element (k, col)   is read at           B[col*K + k].
// N is not referenced. There are no bounds checks.
// NOTE(review): the indexing assumes blockDim.x == 128, 16-byte aligned
// pointers, M a multiple of 4 and K a multiple of 8 - confirm with the launcher.
// With 128 threads `sh` must provide 2*1024 floats for A panels followed by
// 2*512 floats for B panels (3072 floats in total).
__device__ void gemm_128_128x64(int M, int N, int K, float *A, float *B, float *C, float *sh){

	const unsigned int tid = threadIdx.x;

	// Two A panels (128 rows x 8 k-steps) come first, the B panels follow.
	float *sh_A = sh;
	float *sh_B = sh + 2*128*8;

	float4 acc[16];     // acc[8*h + 4*r + j]: column half h, row half r, column j
	float  a_frag[8];   // a_frag[4*r + c]: row c of row half r
	float  b_frag[8];   // b_frag[4*h + j]: column j of column half h

	// Origin of this block's tile inside C
	const int tile_col = blockIdx.y*64;
	const int tile_row = blockIdx.x*128;

	// Read the current C values into the accumulators.
	// In float4 units: +16 is 64 rows further, +8*M is 32 columns further,
	// and (j*M)/4 steps j columns (0, M/4, M/2, 3*M/4).
	float4 *C_tile = reinterpret_cast<float4*>(C + tile_col*M + tile_row + (tid%16)*4 + (tid/16)*4*M);
#pragma unroll
	for (int h = 0; h < 2; ++h) {
#pragma unroll
		for (int r = 0; r < 2; ++r) {
#pragma unroll
			for (int j = 0; j < 4; ++j) {
				acc[8*h + 4*r + j] = *(C_tile + 8*M*h + 16*r + (j*M)/4);
			}
		}
	}

	// Stage the first A panel: four rows of k column tid/32, and the same rows
	// four k columns later (A_src + M is 4*M floats ahead).
	float4 *A_src = reinterpret_cast<float4*>(A + tile_row + (tid%32)*4 + (tid/32)*M);
	*reinterpret_cast<float4*>(sh_A + 4*tid) = *A_src;
	*reinterpret_cast<float4*>(sh_A + 512 + 4*tid) = *(A_src + M);

	// Stage the first B panel: four consecutive k values of column tid%64.
	float4 *B_src = reinterpret_cast<float4*>(B + K*tile_col + (tid/64)*4 + (tid%64)*K);
	*reinterpret_cast<float4*>(sh_B + 4*tid) = *B_src;

	int buf_A = 0;   // float offset of the A panel being consumed
	int buf_B = 0;   // float offset of the B panel being consumed
#pragma unroll
	for (int k = 0; k < K; k += 8) {

		// The panel written before this point must be visible to every thread.
		__syncthreads();
		int a_idx = buf_A + (tid%16)*4;
		int b_idx = buf_B + (tid/16)*16;

#pragma unroll
		for (int i = 0; i < 8; ++i) {

#pragma unroll
			for (int r = 0; r < 2; ++r) {
#pragma unroll
				for (int c = 0; c < 4; ++c) {
					a_frag[4*r + c] = sh_A[a_idx + 64*r + c];
				}
			}

#pragma unroll
			for (int j = 0; j < 4; ++j) {
				b_frag[j]     = sh_B[b_idx + 4*j];
				b_frag[4 + j] = sh_B[b_idx + 128 + 4*j];
			}

			// Rank-1 update of the 8x8 per-thread sub-tile.
#pragma unroll
			for (int h = 0; h < 2; ++h) {
#pragma unroll
				for (int r = 0; r < 2; ++r) {
#pragma unroll
					for (int j = 0; j < 4; ++j) {
						const float b = b_frag[4*h + j];
						float4 &c = acc[8*h + 4*r + j];
						c.x = fma(a_frag[4*r],     b, c.x);
						c.y = fma(a_frag[4*r + 1], b, c.y);
						c.z = fma(a_frag[4*r + 2], b, c.z);
						c.w = fma(a_frag[4*r + 3], b, c.w);
					}
				}
			}

			a_idx += 128;
			// After k-step 3 jump to the second half of the B panel (k = 4..7).
			b_idx += (i == 3) ? 253 : 1;
		}

		// Switch to the other panel of each double buffer.
		buf_A ^= 1024;
		buf_B ^= 512;

		// Fetch the next 8 k-steps unless this was the last slab.
		if (k + 8 < K) {
			A_src += 2*M;
			*reinterpret_cast<float4*>(sh_A + buf_A + 4*tid) = *A_src;
			*reinterpret_cast<float4*>(sh_A + buf_A + 512 + 4*tid) = *(A_src + M);

			B_src += 2;
			*reinterpret_cast<float4*>(sh_B + buf_B + 4*tid) = *B_src;
		}
	}

	// Write the accumulators back to the locations they were read from.
#pragma unroll
	for (int h = 0; h < 2; ++h) {
#pragma unroll
		for (int r = 0; r < 2; ++r) {
#pragma unroll
			for (int j = 0; j < 4; ++j) {
				*(C_tile + 8*M*h + 16*r + (j*M)/4) = acc[8*h + 4*r + j];
			}
		}
	}
}
627 | 


--------------------------------------------------------------------------------
/tiling/kernel_256.h:
--------------------------------------------------------------------------------
  1 | __device__ void gemm_256_16x16(int M, int N, int K, float *A, float *B, float *C, float *sh){
  2 | 
  3 | 	float *sh_A = sh;
  4 | 	float *sh_B = sh + 2*16*16;
  5 | 
  6 | 	float reg_C;
  7 | 	float reg_A;
  8 | 	float reg_B;
  9 | 
 10 | 	// Compute block's starting coordinate
 11 | 	int block_base_x = blockIdx.y*16;
 12 | 	int block_base_y = blockIdx.x*16;
 13 | 
 14 | 	//Load C from global memory to register file
 15 | 	float *C_start = (C + block_base_x*M + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M);
 16 | 
 17 |     reg_C = *C_start;
 18 | 
 19 | 	//load A from global memory to shared memory
 20 | 	float *A_start = (A + block_base_y + (threadIdx.x%16) + (threadIdx.x/16)*M); 
 21 | 	* (sh_A + threadIdx.x) = *(A_start);
 22 | 
 23 | 	//load B from global memory to shared memory
 24 | 	float *B_start = (B + K*block_base_x + (threadIdx.x/16) + (threadIdx.x%16)*K); 
 25 | 	* (sh_B + threadIdx.x) = *(B_start);
 26 | 		
 27 | 	int double_buffer = 0;
 28 | #pragma unroll
 29 | 	for(int k=0; k<K; k+=16){
 30 | 
 31 | 		__syncthreads();
 32 | 		int A_offset = double_buffer + (threadIdx.x%16);
 33 | 		int B_offset = double_buffer + (threadIdx.x/16);
 34 | 			
 35 | #pragma unroll
 36 | 		for (int i=0; i<16; ++i)	{
 37 | 			reg_A = sh_A[A_offset]; 
 38 | 			reg_B = sh_B[B_offset]; 
 39 | 			reg_C = fma(reg_A, reg_B, reg_C);
 40 | 
 41 | 			A_offset += 16;
 42 | 			B_offset += 16;
 43 | 		}
 44 | 
 45 | 		double_buffer ^= 256;
 46 | 
 47 | 		if (k+16 < K){
 48 | 			A_start += 16*M; 
 49 | 			* (sh_A + double_buffer + threadIdx.x) = *(A_start);
 50 | 
 51 | 			B_start += 16; 
 52 | 			* (sh_B + double_buffer + threadIdx.x) = *(B_start);
 53 | 		}
 54 | 				
 55 | 	}
 56 | 	*(C_start) = reg_C;
 57 | }
 58 | 
 59 | __device__ void gemm_256_32x32(int M, int N, int K, float *A, float *B, float *C, float *sh){
 60 | 
 61 | 	float *sh_A = sh;
 62 | 	float *sh_B = sh + 2*32*8;
 63 | 
 64 | 	float4 reg_C;
 65 | 	float4 reg_A;
 66 | 	float  reg_B;
 67 | 
 68 | 	// Compute block's starting coordinate
 69 | 	int block_base_x = blockIdx.y*32;
 70 | 	int block_base_y = blockIdx.x*32;
 71 | 
 72 | 	//Load C from global memory to register file
 73 | 	float4 *C_start = (float4 *) (C + block_base_x*M + block_base_y + (threadIdx.x%8)*4 + (threadIdx.x/8)*M);
 74 | 
 75 |     reg_C = *C_start;
 76 | 
 77 | 	//load B from global memory to shared memory
 78 | 	float *A_start = (A + block_base_y + (threadIdx.x%32) + (threadIdx.x/32)*M); 
 79 | 	* (sh_A + threadIdx.x) = *(A_start);
 80 | 
 81 | 	//load A from global memory to shared memory
 82 | 	float *B_start = (B + K*block_base_x + (threadIdx.x/32) + (threadIdx.x%32)*K); 
 83 | 	* (sh_B + threadIdx.x) = *(B_start);
 84 | 
 85 | 	int double_buffer = 0;
 86 | #pragma unroll
 87 | 	for(int k=0; k<K; k+=8){
 88 | 
 89 | 		__syncthreads();
 90 | 		int A_offset = double_buffer + (threadIdx.x%8)*4;
 91 | 		int B_offset = double_buffer + (threadIdx.x/8);
 92 | 			
 93 | #pragma unroll
 94 | 		for (int i=0; i<8; ++i)	{
 95 | 			reg_A = *((float4*) (sh_A + A_offset)); 
 96 | 			reg_B = sh_B[B_offset]; 
 97 | 
 98 | 			reg_C.x = fma(reg_A.x, reg_B, reg_C.x);
 99 | 			reg_C.y = fma(reg_A.y, reg_B, reg_C.y);
100 | 			reg_C.z = fma(reg_A.z, reg_B, reg_C.z);
101 | 			reg_C.w = fma(reg_A.w, reg_B, reg_C.w);
102 | 
103 | 			A_offset += 32;
104 | 			B_offset += 32;
105 | 		}
106 | 
107 | 		double_buffer ^= 256;
108 | 
109 | 		if (k+8 < K){
110 | 			A_start += 8*M; 
111 | 			* (sh_A + double_buffer + threadIdx.x) = *(A_start);
112 | 
113 | 			B_start += 8; 
114 | 			* (sh_B + double_buffer + threadIdx.x) = *(B_start);
115 | 		}
116 | 				
117 | 	}
118 | 	*(C_start) = reg_C;
119 | 
120 | }
121 | 
// Accumulates one 64x64 tile of C += A*B; each thread owns four float4
// accumulators (two row halves x two column halves).
//
// Addressing used by the code:
//   C element (row, col) is read/written at C[col*M + row];
//   A element (row, k)   is read at           A[k*M + row];
//   B element (k, col)   is read at           B[col*K + k].
// N is not referenced. There are no bounds checks.
// NOTE(review): the indexing assumes blockDim.x == 256, suitably aligned
// float2/float4 accesses and K a multiple of 8 - confirm with the launcher.
// With 256 threads `sh` must provide 2*512 floats for A panels followed by
// 2*512 floats for B panels (2048 floats in total).
__device__ void gemm_256_64x64(int M, int N, int K, float *A, float *B, float *C, float *sh){

	const unsigned int tid = threadIdx.x;

	// Two A panels (64 rows x 8 k-steps) come first, the B panels follow.
	float *sh_A = sh;
	float *sh_B = sh + 2*64*8;

	float4 acc[4];      // acc[2*c + r]: column half c, row half r
	float4 a_frag[2];   // four consecutive rows for each row half
	float  b_frag[2];   // one column for each column half

	// Origin of this block's tile inside C
	const int tile_col = blockIdx.y*64;
	const int tile_row = blockIdx.x*64;

	// Read the current C values. In float4 units: +8 is 32 rows further and
	// +8*M is 32 columns further.
	float4 *C_tile = reinterpret_cast<float4*>(C + tile_col*M + tile_row + (tid%8)*4 + (tid/8)*M);
#pragma unroll
	for (int c = 0; c < 2; ++c) {
#pragma unroll
		for (int r = 0; r < 2; ++r) {
			acc[2*c + r] = *(C_tile + 8*r + 8*M*c);
		}
	}

	// Stage the first A panel: two consecutive rows of k column tid/32.
	float2 *A_src = reinterpret_cast<float2*>(A + tile_row + (tid%32)*2 + (tid/32)*M);
	*reinterpret_cast<float2*>(sh_A + 2*tid) = *A_src;

	// Stage the first B panel: two consecutive k values of column tid%64.
	float2 *B_src = reinterpret_cast<float2*>(B + K*tile_col + (tid/64)*2 + (tid%64)*K);
	*reinterpret_cast<float2*>(sh_B + 2*tid) = *B_src;

	int buf = 0;   // float offset of the panel being consumed (0 or 512)
#pragma unroll
	for (int k = 0; k < K; k += 8) {

		// The panel written before this point must be visible to every thread.
		__syncthreads();
		int a_idx = buf + (tid%8)*4;
		int b_idx = buf + (tid/8)*2;

#pragma unroll
		for (int i = 0; i < 8; ++i) {
			a_frag[0] = *reinterpret_cast<float4*>(sh_A + a_idx);
			a_frag[1] = *reinterpret_cast<float4*>(sh_A + a_idx + 32);
			b_frag[0] = sh_B[b_idx];
			b_frag[1] = sh_B[b_idx + 64];

			// Rank-1 update of the 8x2 per-thread sub-tile.
#pragma unroll
			for (int c = 0; c < 2; ++c) {
#pragma unroll
				for (int r = 0; r < 2; ++r) {
					float4 &t = acc[2*c + r];
					t.x = fma(a_frag[r].x, b_frag[c], t.x);
					t.y = fma(a_frag[r].y, b_frag[c], t.y);
					t.z = fma(a_frag[r].z, b_frag[c], t.z);
					t.w = fma(a_frag[r].w, b_frag[c], t.w);
				}
			}

			a_idx += 64;
			// B keeps k values in pairs: after the odd one jump to the next pair block.
			b_idx += (i%2) ? 127 : 1;
		}

		buf ^= 512;

		// Fetch the next 8 k-steps unless this was the last slab.
		if (k + 8 < K) {
			A_src += 4*M;
			*reinterpret_cast<float2*>(sh_A + buf + 2*tid) = *A_src;

			B_src += 4;
			*reinterpret_cast<float2*>(sh_B + buf + 2*tid) = *B_src;
		}
	}

	// Write the accumulators back to the locations they were read from.
#pragma unroll
	for (int c = 0; c < 2; ++c) {
#pragma unroll
		for (int r = 0; r < 2; ++r) {
			*(C_tile + 8*r + 8*M*c) = acc[2*c + r];
		}
	}
}
206 | 
// Accumulates one 128x64 tile of C += A*B; each thread owns eight float4
// accumulators (two row halves x four columns).
//
// Addressing used by the code:
//   C element (row, col) is read/written at C[col*M + row];
//   A element (row, k)   is read at           A[k*M + row];
//   B element (k, col)   is read at           B[col*K + k].
// N is not referenced. There are no bounds checks.
// NOTE(review): the indexing assumes blockDim.x == 256, suitably aligned
// float2/float4 accesses, M a multiple of 4 and K a multiple of 8 - confirm
// with the launcher. With 256 threads `sh` must provide 2*1024 floats for
// A panels followed by 2*512 floats for B panels (3072 floats in total).
__device__ void gemm_256_128x64(int M, int N, int K, float *A, float *B, float *C, float *sh){

	const unsigned int tid = threadIdx.x;

	// Two A panels (128 rows x 8 k-steps) come first, the B panels follow.
	float *sh_A = sh;
	float *sh_B = sh + 2*128*8;

	float4 acc[8];      // acc[4*r + j]: row half r, column j
	float4 a_frag[2];   // four consecutive rows for each row half
	float  b_frag[4];   // four consecutive columns

	// Origin of this block's tile inside C
	const int tile_col = blockIdx.y*64;
	const int tile_row = blockIdx.x*128;

	// Read the current C values. In float4 units: +16 is 64 rows further and
	// (j*M)/4 steps j columns (0, M/4, M/2, 3*M/4).
	float4 *C_tile = reinterpret_cast<float4*>(C + tile_col*M + tile_row + (tid%16)*4 + (tid/16)*4*M);
#pragma unroll
	for (int r = 0; r < 2; ++r) {
#pragma unroll
		for (int j = 0; j < 4; ++j) {
			acc[4*r + j] = *(C_tile + 16*r + (j*M)/4);
		}
	}

	// Stage the first A panel: four consecutive rows of k column tid/32.
	float4 *A_src = reinterpret_cast<float4*>(A + tile_row + (tid%32)*4 + (tid/32)*M);
	*reinterpret_cast<float4*>(sh_A + 4*tid) = *A_src;

	// Stage the first B panel: two consecutive k values of column tid%64.
	float2 *B_src = reinterpret_cast<float2*>(B + K*tile_col + (tid/64)*2 + (tid%64)*K);
	*reinterpret_cast<float2*>(sh_B + 2*tid) = *B_src;

	int buf_A = 0;   // float offset of the A panel being consumed
	int buf_B = 0;   // float offset of the B panel being consumed
#pragma unroll
	for (int k = 0; k < K; k += 8) {

		// The panel written before this point must be visible to every thread.
		__syncthreads();
		int a_idx = buf_A + (tid%16)*4;
		int b_idx = buf_B + (tid/16)*8;

#pragma unroll
		for (int i = 0; i < 8; ++i) {

			a_frag[0] = *reinterpret_cast<float4*>(sh_A + a_idx);
			a_frag[1] = *reinterpret_cast<float4*>(sh_A + a_idx + 64);

#pragma unroll
			for (int j = 0; j < 4; ++j) {
				b_frag[j] = sh_B[b_idx + 2*j];
			}

			// Rank-1 update of the 8x4 per-thread sub-tile.
#pragma unroll
			for (int r = 0; r < 2; ++r) {
#pragma unroll
				for (int j = 0; j < 4; ++j) {
					float4 &t = acc[4*r + j];
					t.x = fma(a_frag[r].x, b_frag[j], t.x);
					t.y = fma(a_frag[r].y, b_frag[j], t.y);
					t.z = fma(a_frag[r].z, b_frag[j], t.z);
					t.w = fma(a_frag[r].w, b_frag[j], t.w);
				}
			}

			a_idx += 128;
			// B keeps k values in pairs: after the odd one jump to the next pair block.
			b_idx += (i%2) ? 127 : 1;
		}

		// Switch to the other panel of each double buffer.
		buf_A ^= 1024;
		buf_B ^= 512;

		// Fetch the next 8 k-steps unless this was the last slab.
		if (k + 8 < K) {
			A_src += 2*M;
			*reinterpret_cast<float4*>(sh_A + buf_A + 4*tid) = *A_src;

			B_src += 4;
			*reinterpret_cast<float2*>(sh_B + buf_B + 2*tid) = *B_src;
		}
	}

	// Write the accumulators back to the locations they were read from.
#pragma unroll
	for (int r = 0; r < 2; ++r) {
#pragma unroll
		for (int j = 0; j < 4; ++j) {
			*(C_tile + 16*r + (j*M)/4) = acc[4*r + j];
		}
	}
}
327 | 
// Accumulates one 64x128 tile of C += A*B; each thread owns eight float4
// accumulators (two row halves x four columns).
//
// Addressing used by the code:
//   C element (row, col) is read/written at C[col*M + row];
//   A element (row, k)   is read at           A[k*M + row];
//   B element (k, col)   is read at           B[col*K + k].
// N is not referenced. There are no bounds checks.
// NOTE(review): the indexing assumes blockDim.x == 256, suitably aligned
// float2/float4 accesses, M a multiple of 4 and K a multiple of 8 - confirm
// with the launcher. With 256 threads `sh` must provide 2*512 floats for
// A panels followed by 2*1024 floats for B panels (3072 floats in total).
__device__ void gemm_256_64x128(int M, int N, int K, float *A, float *B, float *C, float *sh){

	const unsigned int tid = threadIdx.x;

	// Two A panels (64 rows x 8 k-steps) come first, the B panels follow.
	float *sh_A = sh;
	float *sh_B = sh + 2*64*8;

	float4 acc[8];      // acc[4*r + j]: row half r, column j
	float4 a_frag[2];   // four consecutive rows for each row half
	float  b_frag[4];   // four consecutive columns

	// Origin of this block's tile inside C
	const int tile_col = blockIdx.y*128;
	const int tile_row = blockIdx.x*64;

	// Read the current C values. In float4 units: +8 is 32 rows further and
	// (j*M)/4 steps j columns (0, M/4, M/2, 3*M/4).
	float4 *C_tile = reinterpret_cast<float4*>(C + tile_col*M + tile_row + (tid%8)*4 + (tid/8)*4*M);
#pragma unroll
	for (int r = 0; r < 2; ++r) {
#pragma unroll
		for (int j = 0; j < 4; ++j) {
			acc[4*r + j] = *(C_tile + 8*r + (j*M)/4);
		}
	}

	// Stage the first A panel: two consecutive rows of k column tid/32.
	float2 *A_src = reinterpret_cast<float2*>(A + tile_row + (tid%32)*2 + (tid/32)*M);
	*reinterpret_cast<float2*>(sh_A + 2*tid) = *A_src;

	// Stage the first B panel: four consecutive k values of column tid%128.
	float4 *B_src = reinterpret_cast<float4*>(B + K*tile_col + (tid/128)*4 + (tid%128)*K);
	*reinterpret_cast<float4*>(sh_B + 4*tid) = *B_src;

	int buf_A = 0;   // float offset of the A panel being consumed
	int buf_B = 0;   // float offset of the B panel being consumed
#pragma unroll
	for (int k = 0; k < K; k += 8) {

		// The panel written before this point must be visible to every thread.
		__syncthreads();
		int a_idx = buf_A + (tid%8)*4;
		int b_idx = buf_B + (tid/8)*16;

#pragma unroll
		for (int i = 0; i < 8; ++i) {

			a_frag[0] = *reinterpret_cast<float4*>(sh_A + a_idx);
			a_frag[1] = *reinterpret_cast<float4*>(sh_A + a_idx + 32);

#pragma unroll
			for (int j = 0; j < 4; ++j) {
				b_frag[j] = sh_B[b_idx + 4*j];
			}

			// Rank-1 update of the 8x4 per-thread sub-tile.
#pragma unroll
			for (int r = 0; r < 2; ++r) {
#pragma unroll
				for (int j = 0; j < 4; ++j) {
					float4 &t = acc[4*r + j];
					t.x = fma(a_frag[r].x, b_frag[j], t.x);
					t.y = fma(a_frag[r].y, b_frag[j], t.y);
					t.z = fma(a_frag[r].z, b_frag[j], t.z);
					t.w = fma(a_frag[r].w, b_frag[j], t.w);
				}
			}

			a_idx += 64;
			// After k-step 3 jump to the second half of the B panel (k = 4..7).
			b_idx += (i == 3) ? 509 : 1;
		}

		// Switch to the other panel of each double buffer.
		buf_A ^= 512;
		buf_B ^= 1024;

		// Fetch the next 8 k-steps unless this was the last slab.
		if (k + 8 < K) {
			A_src += 4*M;
			*reinterpret_cast<float2*>(sh_A + buf_A + 2*tid) = *A_src;

			B_src += 2;
			*reinterpret_cast<float4*>(sh_B + buf_B + 4*tid) = *B_src;
		}
	}

	// Write the accumulators back to the locations they were read from.
#pragma unroll
	for (int r = 0; r < 2; ++r) {
#pragma unroll
		for (int j = 0; j < 4; ++j) {
			*(C_tile + 8*r + (j*M)/4) = acc[4*r + j];
		}
	}
}
447 | 
448 | __device__ void gemm_256_128x128(int M, int N, int K, float *A, float *B, float *C, float *sh){
449 | 	// Adds A*B into one 128x128 tile of C per thread block; indexing reads A[row + k*M], B[k + col*K], C[row + col*M]. N is never read here.
450 |     float *sh_A = sh; // A tiles: two buffers of 1024 floats (128 rows x 8 k values), selected by double_buffer
451 | 	float *sh_B = sh + 2*128*8; // B tiles start right after both A buffers (offset 2048 floats)
452 | 	// sh therefore needs 4096 floats: two 1024-float buffers for A followed by two for B.
453 | 	float4 reg_C[16]; // per-thread 8x8 accumulators: 4 groups of 4 float4, each float4 = 4 consecutive rows of one column
454 | 	float reg_A[8]; // A values of this thread's 8 rows for the current k
455 | 	float reg_B[8]; // B values of this thread's 8 columns for the current k
456 | 	// NOTE(review): the 4*threadIdx.x stores into 1024-float buffers suggest 256 threads per block, and the float4 casts assume M, K multiples of 4 (K stepped by 8) with 16-byte-aligned pointers - confirm at the call site.
457 | 	// Compute block's starting coordinate
458 | 	int block_base_x = blockIdx.y*128; // first column of this block's C tile (scaled by M for C, by K for B below)
459 | 	int block_base_y = blockIdx.x*128; // first row of this block's C tile
460 | 
461 | 	//Load C from global memory to register file
462 | 	float4 *C_start = (float4*) (C + block_base_x*M + block_base_y + (threadIdx.x%16)*4 + (threadIdx.x/16)*4*M); // thread starts at row (tid%16)*4, column (tid/16)*4 of the tile
463 | 	// Group 0: rows r..r+3, columns c..c+3.
464 |     reg_C[0] = *C_start;
465 | 	reg_C[1] = *(C_start + M/4); // +M/4 float4 = +M floats = next column
466 | 	reg_C[2] = *(C_start + M/2);
467 | 	reg_C[3] = *(C_start + 3*M/4);
468 | 	// Group 1: same columns, rows shifted by 64.
469 | 	C_start += 16; // 16 float4 = 64 floats = 64 rows further
470 | 	reg_C[4] = *(C_start);
471 | 	reg_C[5] = *(C_start + M/4);
472 | 	reg_C[6] = *(C_start + M/2);
473 | 	reg_C[7] = *(C_start + 3*M/4);
474 | 	// Group 2: original rows, columns shifted by 64.
475 | 	C_start += (16*M - 16); // 16*M float4 = 64 columns further, minus the 64-row shift
476 | 	reg_C[8] = *(C_start);
477 | 	reg_C[9] = *(C_start + M/4);
478 | 	reg_C[10] = *(C_start + M/2);
479 | 	reg_C[11] = *(C_start + 3*M/4);
480 | 	// Group 3: rows and columns both shifted by 64.
481 | 	C_start += 16;
482 | 	reg_C[12] = *(C_start);
483 | 	reg_C[13] = *(C_start + M/4);
484 | 	reg_C[14] = *(C_start + M/2);
485 | 	reg_C[15] = *(C_start + 3*M/4);
486 | 
487 | 	//load A from global memory to shared memory
488 | 	float4 *A_start = (float4*) (A + block_base_y + (threadIdx.x%32)*4 + (threadIdx.x/32)*M); // 4 rows at (tid%32)*4 of k-column tid/32
489 | 	*((float4*) (sh_A + 4*threadIdx.x)) = *(A_start);
490 | 
491 | 	//load B from global memory to shared memory
492 | 	float4 *B_start = (float4*) (B + K*block_base_x + (threadIdx.x/128)*4 + (threadIdx.x%128)*K); // 4 consecutive k values of column tid%128, starting at k = (tid/128)*4
493 | 	*((float4*) (sh_B + 4*threadIdx.x)) = *(B_start);
494 | 		
495 | 	int double_buffer = 0; // float offset (0 or 1024) of the buffer currently read, applied to both sh_A and sh_B
496 | #pragma unroll
497 | 	for(int k=0; k<K; k+=8){ // each pass consumes 8 values of k
498 | 
499 | 		__syncthreads(); // stores issued before this point (initial load or previous pass's prefetch) are visible, and all reads of the previous pass are finished
500 | 		int A_offset = double_buffer + (threadIdx.x%16)*4; // this thread's first row within the A tile
501 | 		int B_offset = double_buffer + ((threadIdx.x/16)*16); // this thread's first column within the B tile (4 floats stored per column)
502 | 			
503 | #pragma unroll
504 | 		for (int i=0; i<8; ++i)	{ // i = k position inside the current 8-wide slice
505 | 			
506 | 			reg_A[0] = sh_A[A_offset];
507 | 			reg_A[1] = sh_A[A_offset+1];
508 | 			reg_A[2] = sh_A[A_offset+2];
509 | 			reg_A[3] = sh_A[A_offset+3];
510 | 			reg_A[4] = sh_A[A_offset+64]; // +64: the thread's second group of 4 rows
511 | 			reg_A[5] = sh_A[A_offset+65];
512 | 			reg_A[6] = sh_A[A_offset+66];
513 | 			reg_A[7] = sh_A[A_offset+67];
514 | 
515 | 			reg_B[0] = sh_B[B_offset];
516 | 			reg_B[1] = sh_B[B_offset+4]; // stride 4: next column, same k
517 | 			reg_B[2] = sh_B[B_offset+8];
518 | 			reg_B[3] = sh_B[B_offset+12];
519 | 			reg_B[4] = sh_B[B_offset+256]; // +256 floats = 64 columns further
520 | 			reg_B[5] = sh_B[B_offset+260];
521 | 			reg_B[6] = sh_B[B_offset+264];
522 | 			reg_B[7] = sh_B[B_offset+268];
523 | 			// .x lanes: first row of each 4-row group (reg_A[0] / reg_A[4]) against all 8 columns.
524 | 			reg_C[0].x = fma(reg_A[0], reg_B[0], reg_C[0].x);
525 | 			reg_C[1].x = fma(reg_A[0], reg_B[1], reg_C[1].x);
526 | 			reg_C[2].x = fma(reg_A[0], reg_B[2], reg_C[2].x);
527 | 			reg_C[3].x = fma(reg_A[0], reg_B[3], reg_C[3].x);
528 | 			reg_C[8].x = fma(reg_A[0], reg_B[4], reg_C[8].x);
529 | 			reg_C[9].x = fma(reg_A[0], reg_B[5], reg_C[9].x);
530 | 			reg_C[10].x = fma(reg_A[0], reg_B[6], reg_C[10].x);
531 | 			reg_C[11].x = fma(reg_A[0], reg_B[7], reg_C[11].x);
532 | 			reg_C[4].x = fma(reg_A[4], reg_B[0], reg_C[4].x);
533 | 			reg_C[5].x = fma(reg_A[4], reg_B[1], reg_C[5].x);
534 | 			reg_C[6].x = fma(reg_A[4], reg_B[2], reg_C[6].x);
535 | 			reg_C[7].x = fma(reg_A[4], reg_B[3], reg_C[7].x);
536 | 			reg_C[12].x = fma(reg_A[4], reg_B[4], reg_C[12].x);
537 | 			reg_C[13].x = fma(reg_A[4], reg_B[5], reg_C[13].x);
538 | 			reg_C[14].x = fma(reg_A[4], reg_B[6], reg_C[14].x);
539 | 			reg_C[15].x = fma(reg_A[4], reg_B[7], reg_C[15].x);
540 | 			// .y lanes: second row of each group (reg_A[1] / reg_A[5]).
541 | 			reg_C[0].y = fma(reg_A[1], reg_B[0], reg_C[0].y);
542 | 			reg_C[1].y = fma(reg_A[1], reg_B[1], reg_C[1].y);
543 | 			reg_C[2].y = fma(reg_A[1], reg_B[2], reg_C[2].y);
544 | 			reg_C[3].y = fma(reg_A[1], reg_B[3], reg_C[3].y);
545 | 			reg_C[8].y = fma(reg_A[1], reg_B[4], reg_C[8].y);
546 | 			reg_C[9].y = fma(reg_A[1], reg_B[5], reg_C[9].y);
547 | 			reg_C[10].y = fma(reg_A[1], reg_B[6], reg_C[10].y);
548 | 			reg_C[11].y = fma(reg_A[1], reg_B[7], reg_C[11].y);
549 | 			reg_C[4].y = fma(reg_A[5], reg_B[0], reg_C[4].y);
550 | 			reg_C[5].y = fma(reg_A[5], reg_B[1], reg_C[5].y);
551 | 			reg_C[6].y = fma(reg_A[5], reg_B[2], reg_C[6].y);
552 | 			reg_C[7].y = fma(reg_A[5], reg_B[3], reg_C[7].y);
553 | 			reg_C[12].y = fma(reg_A[5], reg_B[4], reg_C[12].y);
554 | 			reg_C[13].y = fma(reg_A[5], reg_B[5], reg_C[13].y);
555 | 			reg_C[14].y = fma(reg_A[5], reg_B[6], reg_C[14].y);
556 | 			reg_C[15].y = fma(reg_A[5], reg_B[7], reg_C[15].y);
557 | 			// .z lanes: third row of each group (reg_A[2] / reg_A[6]).
558 | 			reg_C[0].z = fma(reg_A[2], reg_B[0], reg_C[0].z);
559 | 			reg_C[1].z = fma(reg_A[2], reg_B[1], reg_C[1].z);
560 | 			reg_C[2].z = fma(reg_A[2], reg_B[2], reg_C[2].z);
561 | 			reg_C[3].z = fma(reg_A[2], reg_B[3], reg_C[3].z);
562 | 			reg_C[8].z = fma(reg_A[2], reg_B[4], reg_C[8].z);
563 | 			reg_C[9].z = fma(reg_A[2], reg_B[5], reg_C[9].z);
564 | 			reg_C[10].z = fma(reg_A[2], reg_B[6], reg_C[10].z);
565 | 			reg_C[11].z = fma(reg_A[2], reg_B[7], reg_C[11].z);
566 | 			reg_C[4].z = fma(reg_A[6], reg_B[0], reg_C[4].z);
567 | 			reg_C[5].z = fma(reg_A[6], reg_B[1], reg_C[5].z);
568 | 			reg_C[6].z = fma(reg_A[6], reg_B[2], reg_C[6].z);
569 | 			reg_C[7].z = fma(reg_A[6], reg_B[3], reg_C[7].z);
570 | 			reg_C[12].z = fma(reg_A[6], reg_B[4], reg_C[12].z);
571 | 			reg_C[13].z = fma(reg_A[6], reg_B[5], reg_C[13].z);
572 | 			reg_C[14].z = fma(reg_A[6], reg_B[6], reg_C[14].z);
573 | 			reg_C[15].z = fma(reg_A[6], reg_B[7], reg_C[15].z);
574 | 			// .w lanes: fourth row of each group (reg_A[3] / reg_A[7]).
575 | 			reg_C[0].w = fma(reg_A[3], reg_B[0], reg_C[0].w);
576 | 			reg_C[1].w = fma(reg_A[3], reg_B[1], reg_C[1].w);
577 | 			reg_C[2].w = fma(reg_A[3], reg_B[2], reg_C[2].w);
578 | 			reg_C[3].w = fma(reg_A[3], reg_B[3], reg_C[3].w);
579 | 			reg_C[8].w = fma(reg_A[3], reg_B[4], reg_C[8].w);
580 | 			reg_C[9].w = fma(reg_A[3], reg_B[5], reg_C[9].w);
581 | 			reg_C[10].w = fma(reg_A[3], reg_B[6], reg_C[10].w);
582 | 			reg_C[11].w = fma(reg_A[3], reg_B[7], reg_C[11].w);
583 | 			reg_C[4].w = fma(reg_A[7], reg_B[0], reg_C[4].w);
584 | 			reg_C[5].w = fma(reg_A[7], reg_B[1], reg_C[5].w);
585 | 			reg_C[6].w = fma(reg_A[7], reg_B[2], reg_C[6].w);
586 | 			reg_C[7].w = fma(reg_A[7], reg_B[3], reg_C[7].w);
587 | 			reg_C[12].w = fma(reg_A[7], reg_B[4], reg_C[12].w);
588 | 			reg_C[13].w = fma(reg_A[7], reg_B[5], reg_C[13].w);
589 | 			reg_C[14].w = fma(reg_A[7], reg_B[6], reg_C[14].w);
590 | 			reg_C[15].w = fma(reg_A[7], reg_B[7], reg_C[15].w);
591 | 			// Advance both tile offsets to the next k.
592 | 			A_offset += 128; // A tile holds 128 rows per k
593 | 			if (i==3) B_offset += 508; // k 4..7 live in the second 512 floats of the B tile (stored by threads 128..255): 3 + 508 + 1 = 512
594 | 			B_offset += 1; // next k within the same 4-float column chunk
595 | 		}
596 | 
597 | 		double_buffer ^= 1024; // flip to the other buffer
598 | 
599 | 		if (k+8 < K){ // another slice remains: stage it into the buffer not read during this pass
600 | 			A_start += 2*M; // 2*M float4 = 8*M floats = 8 k-columns of A
601 | 			*((float4*) (sh_A + double_buffer + 4*threadIdx.x)) = *(A_start);
602 | 
603 | 			B_start += 2; // 2 float4 = 8 k values along the column
604 | 			*((float4*) (sh_B + double_buffer + 4*threadIdx.x)) = *(B_start);
605 | 		}
606 | 				
607 | 	}
608 | 	C_start -= (16*M + 16); // undo the three shifts applied while loading: back to group 0
609 |     *C_start = reg_C[0];
610 | 	*(C_start + M/4) = reg_C[1];
611 | 	*(C_start + M/2) = reg_C[2];
612 | 	*(C_start + 3*M/4) = reg_C[3];
613 | 	// Groups 1-3 are written back with the same pointer walk used for the loads.
614 | 	C_start += 16;
615 | 	*(C_start) = reg_C[4];
616 | 	*(C_start + M/4) = reg_C[5];
617 | 	*(C_start + M/2) = reg_C[6];
618 | 	*(C_start + 3*M/4) = reg_C[7];
619 | 
620 | 	C_start += (16*M - 16);
621 | 	*(C_start) = reg_C[8];
622 | 	*(C_start + M/4) = reg_C[9];
623 | 	*(C_start + M/2) = reg_C[10];
624 | 	*(C_start + 3*M/4) = reg_C[11];
625 | 
626 | 	C_start += 16;
627 | 	*(C_start) = reg_C[12];
628 | 	*(C_start + M/4) = reg_C[13];
629 | 	*(C_start + M/2) = reg_C[14];
630 | 	*(C_start + 3*M/4) = reg_C[15];
631 | }
632 | 


--------------------------------------------------------------------------------
/tiling/log:
--------------------------------------------------------------------------------
  1 | 280.143738
  2 | 534.039734
  3 | 727.471680
  4 | 845.944153
  5 | 1084.099365
  6 | 1422.803467
  7 | 1579.448486
  8 | 269.648407
  9 | 555.514282
 10 | 721.614685
 11 | 885.036804
 12 | 1066.932861
 13 | 1422.266479
 14 | 1570.676758
 15 | 263.423431
 16 | 534.368958
 17 | 721.535034
 18 | 859.121582
 19 | 1072.408813
 20 | 1412.882324
 21 | 1572.264404
 22 | 270.115479
 23 | 523.952820
 24 | 707.109070
 25 | 866.125549
 26 | 1084.162476
 27 | 1402.890015
 28 | 1569.128418
 29 | 264.715210
 30 | 555.603271
 31 | 716.399902
 32 | 865.086121
 33 | 1092.121216
 34 | 1386.733398
 35 | 1560.968872
 36 | 273.271027
 37 | 535.855347
 38 | 710.410461
 39 | 864.641541
 40 | 1087.332031
 41 | 1376.754028
 42 | 1569.776123
 43 | 269.544830
 44 | 536.684692
 45 | 722.491150
 46 | 846.511780
 47 | 1093.919922
 48 | 1381.852295
 49 | 1566.757690
 50 | 209.469360
 51 | 505.555939
 52 | 783.671509
 53 | 1273.065063
 54 | 2116.548828
 55 | 2890.106689
 56 | 2724.704590
 57 | 208.907715
 58 | 507.492310
 59 | 788.811890
 60 | 1287.259888
 61 | 2130.036133
 62 | 2842.028809
 63 | 2724.333252
 64 | 214.476959
 65 | 505.823975
 66 | 781.892944
 67 | 1289.800537
 68 | 2129.311035
 69 | 2874.502930
 70 | 2749.896729
 71 | 207.793365
 72 | 522.444702
 73 | 779.692871
 74 | 1294.087769
 75 | 2109.910889
 76 | 2875.129883
 77 | 2724.478516
 78 | 204.803528
 79 | 509.552429
 80 | 779.883789
 81 | 1295.828979
 82 | 2207.722168
 83 | 2881.449951
 84 | 2721.509766
 85 | 207.997757
 86 | 516.280945
 87 | 766.518372
 88 | 1287.169434
 89 | 2164.342773
 90 | 2833.191650
 91 | 2740.417969
 92 | 205.514359
 93 | 504.913757
 94 | 781.557312
 95 | 1294.957764
 96 | 2116.169678
 97 | 2908.369385
 98 | 2723.961670
 99 | 787.810425
100 | 1460.745728
101 | 2118.552246
102 | 2927.529053
103 | 3094.362549
104 | 3665.558350
105 | 3655.540527
106 | 751.842041
107 | 1458.042847
108 | 2116.193604
109 | 2919.397949
110 | 3097.433838
111 | 3712.432129
112 | 3618.252441
113 | 766.375305
114 | 1465.831055
115 | 2110.526611
116 | 2973.134521
117 | 3131.316406
118 | 3714.875000
119 | 3659.826172
120 | 770.930420
121 | 1433.450928
122 | 2115.593262
123 | 2966.372314
124 | 3092.711182
125 | 3684.868896
126 | 3634.195068
127 | 790.537231
128 | 1440.940430
129 | 2105.437988
130 | 3010.972412
131 | 3119.060791
132 | 3741.483887
133 | 3634.895996
134 | 772.404053
135 | 1473.978760
136 | 2128.414062
137 | 2940.936523
138 | 3079.076172
139 | 3732.108887
140 | 3642.632812
141 | 805.940552
142 | 1466.844727
143 | 2119.803223
144 | 2953.475342
145 | 3113.554932
146 | 3701.825928
147 | 3626.904297
148 | 1150.256226
149 | 3227.508301
150 | 4411.689453
151 | 5057.976074
152 | 4011.392334
153 | 5133.115234
154 | 5121.712891
155 | 1062.977173
156 | 3227.243408
157 | 4353.406738
158 | 5031.728027
159 | 4017.516357
160 | 5135.115234
161 | 5114.385254
162 | 1136.725708
163 | 3527.935303
164 | 4830.340332
165 | 5480.717773
166 | 4479.567871
167 | 5137.231445
168 | 5111.422363
169 | 1135.892944
170 | 3529.411621
171 | 4826.163086
172 | 5502.206543
173 | 4426.696289
174 | 5611.167480
175 | 5588.766113
176 | 1237.780762
177 | 3567.964600
178 | 4916.781250
179 | 5593.488770
180 | 4441.811035
181 | 5668.506348
182 | 5653.515137
183 | 1153.406616
184 | 3570.661865
185 | 4879.754395
186 | 5573.226074
187 | 4363.485352
188 | 5675.595703
189 | 5656.397461
190 | 1236.972900
191 | 3566.132324
192 | 4887.588379
193 | 5580.053711
194 | 4388.849609
195 | 5663.231934
196 | 5657.087891
197 | 


--------------------------------------------------------------------------------
/tiling/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm -f log
 4 | for ((M=128; M<=1024; M=M*2))
 5 | do
 6 | 	for ((K=16; K<=1024; K=K*2))
 7 | 	do
 8 | 		cd ../data
 9 | 		./gen_data $M $M $K
10 | 		cd - > /dev/null
11 | 		./gemm 4 >> log
12 | 		./gemm 8 >> log
13 | 		./gemm 16 >> log
14 | 		./gemm 32 >> log
15 | 		./gemm 64 >> log
16 | 		./gemm 128 >> log
17 | 		./gemm 256 >> log
18 | 	done
19 | done
20 | 


--------------------------------------------------------------------------------
/tiling/thres.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | for ((thres=1024; thres<=102400000; thres=thres*2))
 4 | do
 5 | 	./gemm 16 $thres >> log
 6 | done
 7 | for ((thres=1024; thres<=102400000; thres=thres*2))
 8 | do
 9 | 	./gemm 32 $thres >> log
10 | done
11 | for ((thres=1024; thres<=102400000; thres=thres*2))
12 | do
13 | 	./gemm 64 $thres >> log
14 | done
15 | for ((thres=1024; thres<=102400000; thres=thres*2))
16 | do
17 | 	./gemm 128 $thres >> log
18 | done
19 | for ((thres=1024; thres<=102400000; thres=thres*2))
20 | do
21 | 	./gemm 256 $thres >> log
22 | done
23 | 


--------------------------------------------------------------------------------