├── README.md
├── Reduction
    ├── cudastart.h
    ├── readme.md
    ├── reduction
    └── reduction.cu
├── Reduction2
    ├── cudastart.h
    ├── readme.md
    ├── reduction2
    └── reduction2.cu
├── Reduction3
    ├── cudastart.h
    ├── readme.md
    ├── reduction3
    └── reduction3.cu
└── Sum_Matrix
    ├── cudastart.h
    ├── readme.md
    └── sum_martix.cu


/README.md:
--------------------------------------------------------------------------------
1 | # CUDA_study
2 | 
3 | Here are the codes of https://zhuanlan.zhihu.com/c_1188568938097819648
4 | 


--------------------------------------------------------------------------------
/Reduction/cudastart.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUDASTART_H
 2 | #define CUDASTART_H
 3 | #define CHECK(call)\
 4 | {\
 5 |   const cudaError_t error=call;\
 6 |   if(error!=cudaSuccess)\
 7 |   {\
 8 |       printf("ERROR: %s:%d,",__FILE__,__LINE__);\
 9 |       printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
10 |       exit(1);\
11 |   }\
12 | }
13 | 
14 | 
15 | #include <time.h>
16 | #ifdef _WIN32
17 | #	include <windows.h>
18 | #else
19 | #	include <sys/time.h>
20 | #endif
21 | 
22 | double cpuSecond()
23 | {
24 |   struct timeval tp;
25 |   gettimeofday(&tp,NULL);
26 |   return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
27 | 
28 | }
29 | 
30 | void initialData(float* ip,int size)
31 | {
32 |   time_t t;
33 |   srand((unsigned )time(&t));
34 |   for(int i=0;i<size;i++)
35 |   {
36 |     ip[i]=(float)(rand()&0xffff)/1000.0f;
37 |   }
38 | }
39 | 
40 | void initialData_int(int* ip, int size)
41 | {
42 | 	time_t t;
43 | 	srand((unsigned)time(&t));
44 | 	for (int i = 0; i<size; i++)
45 | 	{
46 | 		ip[i] = int(rand()&0xff);
47 | 	}
48 | }
49 | 
50 | void initDevice(int devNum)
51 | {
52 |   int dev = devNum;
53 |   cudaDeviceProp deviceProp;
54 |   CHECK(cudaGetDeviceProperties(&deviceProp,dev));
55 |   printf("Using device %d: %s\n",dev,deviceProp.name);
56 |   CHECK(cudaSetDevice(dev));
57 | 
58 | }
59 | 
60 | void checkResult(float * hostRef,float * gpuRef,const int N)
61 | {
62 |   double epsilon=1.0E-8;
63 |   for(int i=0;i<N;i++)
64 |   {
65 |     if(abs(hostRef[i]-gpuRef[i])>epsilon)
66 |     {
67 |       printf("Results don\'t match!\n");
68 |       printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
69 |       return;
70 |     }
71 |   }
72 |   printf("Check result success!\n");
73 | }
74 | 
75 | #endif
76 | 
77 | 


--------------------------------------------------------------------------------
/Reduction/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | https://zhuanlan.zhihu.com/p/98190609
3 | 


--------------------------------------------------------------------------------
/Reduction/reduction:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZihaoZhao/CUDA_study/1fc8a7bc770593e4e23ce2f08b6546ad6c111e71/Reduction/reduction


--------------------------------------------------------------------------------
/Reduction/reduction.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <stdio.h>
  3 | #include "cudastart.h"
  4 | int recursiveReduce(int *data, int const size)
  5 | {
  6 | 	// terminate check
  7 | 	if (size == 1) return data[0];
  8 | 	// renew the stride
  9 | 	int const stride = size / 2;
 10 | 	if (size % 2 == 1)
 11 | 	{
 12 | 		for (int i = 0; i < stride; i++)
 13 | 		{
 14 | 			data[i] += data[i + stride];
 15 | 		}
 16 | 		data[0] += data[size - 1];
 17 | 	}
 18 | 	else
 19 | 	{
 20 | 		for (int i = 0; i < stride; i++)
 21 | 		{
 22 | 			data[i] += data[i + stride];
 23 | 		}
 24 | 	}
 25 | 	// call
 26 | 	return recursiveReduce(data, stride);
 27 | }
 28 | 
 29 | 
 30 | 
 31 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 
 32 | {
 33 | 	//set thread ID
 34 | 	unsigned int tid = threadIdx.x;
 35 | 	//boundary check
 36 | 	if (tid >= n) return;
 37 | 	//convert global data pointer to the 
 38 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 39 | 	//in-place reduction in global memory
 40 | 	for (int stride = 1; stride < blockDim.x; stride *= 2)
 41 | 	{
 42 | 		if ((tid % (2 * stride)) == 0)
 43 | 		{
 44 | 			idata[tid] += idata[tid + stride];
 45 | 		}
 46 | 		//synchronize within block
 47 | 		__syncthreads();
 48 | 	}
 49 | 	//write result for this block to global mem
 50 | 	if (tid == 0)
 51 | 		g_odata[blockIdx.x] = idata[0];
 52 | 
 53 | }
 54 | 
 55 | int main(int argc,char** argv)
 56 | {
 57 | 	initDevice(0);
 58 | 	
 59 | 	//initialization
 60 | 
 61 | 	int size = 1 << 24;
 62 | 	printf("	with array size %d  ", size);
 63 | 
 64 | 	//execution configuration
 65 | 	int blocksize = 1024;
 66 | 	if (argc > 1)
 67 | 	{
 68 | 		blocksize = atoi(argv[1]);   //从命令行输入设置block大小
 69 | 	}
 70 | 	dim3 block(blocksize, 1);
 71 | 	dim3 grid((size - 1) / block.x + 1, 1);
 72 | 	printf("grid %d block %d \n", grid.x, block.x);
 73 | 
 74 | 	//allocate host memory
 75 | 	size_t bytes = size * sizeof(int);
 76 | 	int *idata_host = (int*)malloc(bytes);
 77 | 	int *odata_host = (int*)malloc(grid.x * sizeof(int));
 78 | 	int * tmp = (int*)malloc(bytes);
 79 | 
 80 | 	//initialize the array
 81 | 	initialData_int(idata_host, size);
 82 | 
 83 | 	memcpy(tmp, idata_host, bytes);
 84 | 	double timeStart, timeElaps;
 85 | 	int gpu_sum = 0;
 86 | 
 87 | 	// device memory
 88 | 	int * idata_dev = NULL;
 89 | 	int * odata_dev = NULL;
 90 | 	CHECK(cudaMalloc((void**)&idata_dev, bytes));
 91 | 	CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int)));
 92 | 
 93 | 	//cpu reduction 对照组
 94 | 	int cpu_sum = 0;
 95 | 	timeStart = cpuSecond();
 96 | 	//cpu_sum = recursiveReduce(tmp, size);
 97 | 	for (int i = 0; i < size; i++)
 98 | 		cpu_sum += tmp[i];
 99 | 	timeElaps = 1000*(cpuSecond() - timeStart);
100 | 
101 | 	printf("cpu sum:%d \n", cpu_sum);
102 | 	printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum);
103 | 
104 | 
105 | 	//kernel reduceNeighbored
106 | 
107 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
108 | 	CHECK(cudaDeviceSynchronize());
109 | 	timeStart = cpuSecond();
110 | 	reduceNeighbored <<<grid, block >>>(idata_dev, odata_dev, size);
111 | 	cudaDeviceSynchronize();
112 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
113 | 	gpu_sum = 0;
114 | 	for (int i = 0; i < grid.x; i++)
115 | 		gpu_sum += odata_host[i];	
116 |     timeElaps = 1000*(cpuSecond() - timeStart);
117 | 
118 | 	printf("gpu sum:%d \n", gpu_sum);
119 | 	printf("gpu reduceNeighbored elapsed %lf ms     <<<grid %d block %d>>>\n",
120 | 		timeElaps, grid.x, block.x);
121 |     
122 | 	// free host memory
123 | 
124 | 	free(idata_host);
125 | 	free(odata_host);
126 | 	CHECK(cudaFree(idata_dev));
127 | 	CHECK(cudaFree(odata_dev));
128 | 
129 | 	//reset device
130 | 	cudaDeviceReset();
131 | 
132 | 	//check the results
133 | 	if (gpu_sum == cpu_sum)
134 | 	{
135 | 		printf("Test success!\n");
136 | 	}
137 | 	return EXIT_SUCCESS;
138 | }
139 | 


--------------------------------------------------------------------------------
/Reduction2/cudastart.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUDASTART_H
 2 | #define CUDASTART_H
 3 | #define CHECK(call)\
 4 | {\
 5 |   const cudaError_t error=call;\
 6 |   if(error!=cudaSuccess)\
 7 |   {\
 8 |       printf("ERROR: %s:%d,",__FILE__,__LINE__);\
 9 |       printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
10 |       exit(1);\
11 |   }\
12 | }
13 | 
14 | 
15 | #include <time.h>
16 | #ifdef _WIN32
17 | #	include <windows.h>
18 | #else
19 | #	include <sys/time.h>
20 | #endif
21 | 
22 | double cpuSecond()
23 | {
24 |   struct timeval tp;
25 |   gettimeofday(&tp,NULL);
26 |   return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
27 | 
28 | }
29 | 
30 | void initialData(float* ip,int size)
31 | {
32 |   time_t t;
33 |   srand((unsigned )time(&t));
34 |   for(int i=0;i<size;i++)
35 |   {
36 |     ip[i]=(float)(rand()&0xffff)/1000.0f;
37 |   }
38 | }
39 | 
40 | void initialData_int(int* ip, int size)
41 | {
42 | 	time_t t;
43 | 	srand((unsigned)time(&t));
44 | 	for (int i = 0; i<size; i++)
45 | 	{
46 | 		ip[i] = int(rand()&0xff);
47 | 	}
48 | }
49 | 
50 | void initDevice(int devNum)
51 | {
52 |   int dev = devNum;
53 |   cudaDeviceProp deviceProp;
54 |   CHECK(cudaGetDeviceProperties(&deviceProp,dev));
55 |   printf("Using device %d: %s\n",dev,deviceProp.name);
56 |   CHECK(cudaSetDevice(dev));
57 | 
58 | }
59 | 
60 | void checkResult(float * hostRef,float * gpuRef,const int N)
61 | {
62 |   double epsilon=1.0E-8;
63 |   for(int i=0;i<N;i++)
64 |   {
65 |     if(abs(hostRef[i]-gpuRef[i])>epsilon)
66 |     {
67 |       printf("Results don\'t match!\n");
68 |       printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
69 |       return;
70 |     }
71 |   }
72 |   printf("Check result success!\n");
73 | }
74 | 
75 | #endif
76 | 
77 | 


--------------------------------------------------------------------------------
/Reduction2/readme.md:
--------------------------------------------------------------------------------
1 | https://zhuanlan.zhihu.com/p/98416987
2 | 


--------------------------------------------------------------------------------
/Reduction2/reduction2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZihaoZhao/CUDA_study/1fc8a7bc770593e4e23ce2f08b6546ad6c111e71/Reduction2/reduction2


--------------------------------------------------------------------------------
/Reduction2/reduction2.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <stdio.h>
  3 | #include "cudastart.h"
  4 | int recursiveReduce(int *data, int const size)
  5 | {
  6 | 	// terminate check
  7 | 	if (size == 1) return data[0];
  8 | 	// renew the stride
  9 | 	int const stride = size / 2;
 10 | 	if (size % 2 == 1)
 11 | 	{
 12 | 		for (int i = 0; i < stride; i++)
 13 | 		{
 14 | 			data[i] += data[i + stride];
 15 | 		}
 16 | 		data[0] += data[size - 1];
 17 | 	}
 18 | 	else
 19 | 	{
 20 | 		for (int i = 0; i < stride; i++)
 21 | 		{
 22 | 			data[i] += data[i + stride];
 23 | 		}
 24 | 	}
 25 | 	// call
 26 | 	return recursiveReduce(data, stride);
 27 | }
 28 | 
 29 | 
 30 | 
 31 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 
 32 | {
 33 | 	//set thread ID
 34 | 	unsigned int tid = threadIdx.x;
 35 | 	//boundary check
 36 | 	if (tid >= n) return;
 37 | 	//convert global data pointer to the 
 38 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 39 | 	//in-place reduction in global memory
 40 | 	for (int stride = 1; stride < blockDim.x; stride *= 2)
 41 | 	{
 42 | 		if ((tid % (2 * stride)) == 0)
 43 | 		{
 44 | 			idata[tid] += idata[tid + stride];
 45 | 		}
 46 | 		//synchronize within block
 47 | 		__syncthreads();
 48 | 	}
 49 | 	//write result for this block to global mem
 50 | 	if (tid == 0)
 51 | 		g_odata[blockIdx.x] = idata[0];
 52 | 
 53 | }
 54 | 
 55 | __global__ void reduceNeighboredLess(int * g_idata,int *g_odata,unsigned int n)
 56 | {
 57 | 	unsigned int tid = threadIdx.x;
 58 | 	unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
 59 | 	// convert global data pointer to the local point of this block
 60 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 61 | 	if (idx > n)
 62 | 		return;
 63 | 	//in-place reduction in global memory
 64 | 	for (int stride = 1; stride < blockDim.x; stride *= 2)
 65 | 	{
 66 | 		//convert tid into local array index
 67 | 		int index = 2 * stride *tid;
 68 | 		if (index < blockDim.x)
 69 | 		{
 70 | 			idata[index] += idata[index + stride];
 71 | 		}
 72 | 		__syncthreads();
 73 | 	}
 74 | 	//write result for this block to global men
 75 | 	if (tid == 0)
 76 | 		g_odata[blockIdx.x] = idata[0];
 77 | }
 78 | 
 79 | __global__ void reduceInterleaved(int * g_idata, int *g_odata, unsigned int n)
 80 | {
 81 | 	unsigned int tid = threadIdx.x;
 82 | 	unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
 83 | 	// convert global data pointer to the local point of this block
 84 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 85 | 	if (idx >= n)
 86 | 		return;
 87 | 	//in-place reduction in global memory
 88 | 	for (int stride = blockDim.x/2; stride >0; stride >>=1)
 89 | 	{
 90 | 		
 91 | 		if (tid <stride)
 92 | 		{
 93 | 			idata[tid] += idata[tid + stride];
 94 | 		}
 95 | 		__syncthreads();
 96 | 	}
 97 | 	//write result for this block to global men
 98 | 	if (tid == 0)
 99 | 		g_odata[blockIdx.x] = idata[0];
100 | }
101 | 
102 | int main(int argc,char** argv)
103 | {
104 | 	initDevice(0);
105 | 	
106 | 	//initialization
107 | 
108 | 	int size = 1 << 24;
109 | 	printf("	with array size %d  ", size);
110 | 
111 | 	//execution configuration
112 | 	int blocksize = 1024;
113 | 	if (argc > 1)
114 | 	{
115 | 		blocksize = atoi(argv[1]);   //从命令行输入设置block大小
116 | 	}
117 | 	dim3 block(blocksize, 1);
118 | 	dim3 grid((size - 1) / block.x + 1, 1);
119 | 	printf("grid %d block %d \n", grid.x, block.x);
120 | 
121 | 	//allocate host memory
122 | 	size_t bytes = size * sizeof(int);
123 | 	int *idata_host = (int*)malloc(bytes);
124 | 	int *odata_host = (int*)malloc(grid.x * sizeof(int));
125 | 	int * tmp = (int*)malloc(bytes);
126 | 
127 | 	//initialize the array
128 | 	initialData_int(idata_host, size);
129 | 
130 | 	memcpy(tmp, idata_host, bytes);
131 | 	double timeStart, timeElaps;
132 | 	int gpu_sum = 0;
133 | 
134 | 	// device memory
135 | 	int * idata_dev = NULL;
136 | 	int * odata_dev = NULL;
137 | 	CHECK(cudaMalloc((void**)&idata_dev, bytes));
138 | 	CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int)));
139 | 
140 | 	//cpu reduction 对照组
141 | 	int cpu_sum = 0;
142 | 	timeStart = cpuSecond();
143 | 	//cpu_sum = recursiveReduce(tmp, size);
144 | 	for (int i = 0; i < size; i++)
145 | 		cpu_sum += tmp[i];
146 | 	timeElaps = 1000*(cpuSecond() - timeStart);
147 | 
148 | 	printf("cpu sum:%d \n", cpu_sum);
149 | 	printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum);
150 | 
151 | 
152 | 	//kernel 1 reduceNeighbored
153 | 
154 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
155 | 	CHECK(cudaDeviceSynchronize());
156 | 	timeStart = cpuSecond();
157 | 	reduceNeighbored <<<grid, block >>>(idata_dev, odata_dev, size);
158 | 	cudaDeviceSynchronize();
159 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
160 | 	gpu_sum = 0;
161 | 	for (int i = 0; i < grid.x; i++)
162 | 		gpu_sum += odata_host[i];	
163 |     timeElaps = 1000*(cpuSecond() - timeStart);
164 | 
165 | 	printf("gpu sum:%d \n", gpu_sum);
166 | 	printf("gpu reduceNeighbored elapsed %lf ms     <<<grid %d block %d>>>\n",
167 | 		timeElaps, grid.x, block.x);
168 |     
169 |     //kernel 2 reduceNeighboredless
170 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
171 | 	CHECK(cudaDeviceSynchronize());
172 | 	timeStart = cpuSecond();
173 | 	reduceNeighboredLess <<<grid, block >>>(idata_dev, odata_dev, size);
174 | 	cudaDeviceSynchronize();
175 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
176 | 	gpu_sum = 0;
177 | 	for (int i = 0; i < grid.x; i++)
178 | 		gpu_sum += odata_host[i];	
179 |     timeElaps = 1000*(cpuSecond() - timeStart);
180 | 
181 | 	printf("gpu sum:%d \n", gpu_sum);
182 | 	printf("gpu reduceNeighboredless elapsed %lf ms     <<<grid %d block %d>>>\n",
183 | 		timeElaps, grid.x, block.x);
184 | 
185 |     //kernel 3 reduceInterleaved
186 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
187 | 	CHECK(cudaDeviceSynchronize());
188 | 	timeStart = cpuSecond();
189 | 	reduceInterleaved <<<grid, block >>>(idata_dev, odata_dev, size);
190 | 	cudaDeviceSynchronize();
191 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
192 | 	gpu_sum = 0;
193 | 	for (int i = 0; i < grid.x; i++)
194 | 		gpu_sum += odata_host[i];	
195 |     timeElaps = 1000*(cpuSecond() - timeStart);
196 | 
197 | 	printf("gpu sum:%d \n", gpu_sum);
198 | 	printf("gpu reduceInterleaved elapsed %lf ms     <<<grid %d block %d>>>\n",
199 | 		timeElaps, grid.x, block.x);
200 |     
201 | 	// free host memory
202 | 
203 | 	free(idata_host);
204 | 	free(odata_host);
205 | 	CHECK(cudaFree(idata_dev));
206 | 	CHECK(cudaFree(odata_dev));
207 | 
208 | 	//reset device
209 | 	cudaDeviceReset();
210 | 
211 | 	//check the results
212 | 	if (gpu_sum == cpu_sum)
213 | 	{
214 | 		printf("Test success!\n");
215 | 	}
216 | 	return EXIT_SUCCESS;
217 | }
218 | 


--------------------------------------------------------------------------------
/Reduction3/cudastart.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUDASTART_H
 2 | #define CUDASTART_H
 3 | #define CHECK(call)\
 4 | {\
 5 |   const cudaError_t error=call;\
 6 |   if(error!=cudaSuccess)\
 7 |   {\
 8 |       printf("ERROR: %s:%d,",__FILE__,__LINE__);\
 9 |       printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
10 |       exit(1);\
11 |   }\
12 | }
13 | 
14 | 
15 | #include <time.h>
16 | #ifdef _WIN32
17 | #	include <windows.h>
18 | #else
19 | #	include <sys/time.h>
20 | #endif
21 | 
22 | double cpuSecond()
23 | {
24 |   struct timeval tp;
25 |   gettimeofday(&tp,NULL);
26 |   return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
27 | 
28 | }
29 | 
30 | void initialData(float* ip,int size)
31 | {
32 |   time_t t;
33 |   srand((unsigned )time(&t));
34 |   for(int i=0;i<size;i++)
35 |   {
36 |     ip[i]=(float)(rand()&0xffff)/1000.0f;
37 |   }
38 | }
39 | 
40 | void initialData_int(int* ip, int size)
41 | {
42 | 	time_t t;
43 | 	srand((unsigned)time(&t));
44 | 	for (int i = 0; i<size; i++)
45 | 	{
46 | 		ip[i] = int(rand()&0xff);
47 | 	}
48 | }
49 | 
50 | void initDevice(int devNum)
51 | {
52 |   int dev = devNum;
53 |   cudaDeviceProp deviceProp;
54 |   CHECK(cudaGetDeviceProperties(&deviceProp,dev));
55 |   printf("Using device %d: %s\n",dev,deviceProp.name);
56 |   CHECK(cudaSetDevice(dev));
57 | 
58 | }
59 | 
60 | void checkResult(float * hostRef,float * gpuRef,const int N)
61 | {
62 |   double epsilon=1.0E-8;
63 |   for(int i=0;i<N;i++)
64 |   {
65 |     if(abs(hostRef[i]-gpuRef[i])>epsilon)
66 |     {
67 |       printf("Results don\'t match!\n");
68 |       printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
69 |       return;
70 |     }
71 |   }
72 |   printf("Check result success!\n");
73 | }
74 | 
75 | #endif
76 | 
77 | 


--------------------------------------------------------------------------------
/Reduction3/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/Reduction3/reduction3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZihaoZhao/CUDA_study/1fc8a7bc770593e4e23ce2f08b6546ad6c111e71/Reduction3/reduction3


--------------------------------------------------------------------------------
/Reduction3/reduction3.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <stdio.h>
  3 | #include "cudastart.h"
  4 | int recursiveReduce(int *data, int const size)
  5 | {
  6 | 	// terminate check
  7 | 	if (size == 1) return data[0];
  8 | 	// renew the stride
  9 | 	int const stride = size / 2;
 10 | 	if (size % 2 == 1)
 11 | 	{
 12 | 		for (int i = 0; i < stride; i++)
 13 | 		{
 14 | 			data[i] += data[i + stride];
 15 | 		}
 16 | 		data[0] += data[size - 1];
 17 | 	}
 18 | 	else
 19 | 	{
 20 | 		for (int i = 0; i < stride; i++)
 21 | 		{
 22 | 			data[i] += data[i + stride];
 23 | 		}
 24 | 	}
 25 | 	// call
 26 | 	return recursiveReduce(data, stride);
 27 | }
 28 | 
 29 | 
 30 | 
 31 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 
 32 | {
 33 | 	//set thread ID
 34 | 	unsigned int tid = threadIdx.x;
 35 | 	//boundary check
 36 | 	if (tid >= n) return;
 37 | 	//convert global data pointer to the 
 38 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 39 | 	//in-place reduction in global memory
 40 | 	for (int stride = 1; stride < blockDim.x; stride *= 2)
 41 | 	{
 42 | 		if ((tid % (2 * stride)) == 0)
 43 | 		{
 44 | 			idata[tid] += idata[tid + stride];
 45 | 		}
 46 | 		//synchronize within block
 47 | 		__syncthreads();
 48 | 	}
 49 | 	//write result for this block to global mem
 50 | 	if (tid == 0)
 51 | 		g_odata[blockIdx.x] = idata[0];
 52 | 
 53 | }
 54 | 
 55 | __global__ void reduceNeighboredLess(int * g_idata,int *g_odata,unsigned int n)
 56 | {
 57 | 	unsigned int tid = threadIdx.x;
 58 | 	unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
 59 | 	// convert global data pointer to the local point of this block
 60 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 61 | 	if (idx > n)
 62 | 		return;
 63 | 	//in-place reduction in global memory
 64 | 	for (int stride = 1; stride < blockDim.x; stride *= 2)
 65 | 	{
 66 | 		//convert tid into local array index
 67 | 		int index = 2 * stride *tid;
 68 | 		if (index < blockDim.x)
 69 | 		{
 70 | 			idata[index] += idata[index + stride];
 71 | 		}
 72 | 		__syncthreads();
 73 | 	}
 74 | 	//write result for this block to global men
 75 | 	if (tid == 0)
 76 | 		g_odata[blockIdx.x] = idata[0];
 77 | }
 78 | 
 79 | __global__ void reduceInterleaved(int * g_idata, int *g_odata, unsigned int n)
 80 | {
 81 | 	unsigned int tid = threadIdx.x;
 82 | 	unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;
 83 | 	// convert global data pointer to the local point of this block
 84 | 	int *idata = g_idata + blockIdx.x*blockDim.x;
 85 | 	if (idx >= n)
 86 | 		return;
 87 | 	//in-place reduction in global memory
 88 | 	for (int stride = blockDim.x/2; stride >0; stride >>=1)
 89 | 	{
 90 | 		
 91 | 		if (tid <stride)
 92 | 		{
 93 | 			idata[tid] += idata[tid + stride];
 94 | 		}
 95 | 		__syncthreads();
 96 | 	}
 97 | 	//write result for this block to global men
 98 | 	if (tid == 0)
 99 | 		g_odata[blockIdx.x] = idata[0];
100 | }
101 | 
102 | 
103 | __global__ void reduceUnroll2(int * g_idata,int * g_odata,unsigned int n)
104 | {
105 | 	//set thread ID
106 | 	unsigned int tid = threadIdx.x;
107 | 	unsigned int idx = blockDim.x*blockIdx.x*2+threadIdx.x;
108 | 	//boundary check
109 | 	if (tid >= n) return;
110 | 	//convert global data pointer to the
111 | 	int *idata = g_idata + blockIdx.x*blockDim.x*2;
112 | 	if(idx+blockDim.x<n)
113 | 	{
114 | 		g_idata[idx]+=g_idata[idx+blockDim.x];
115 | 
116 | 	}
117 | 	__syncthreads();
118 | 	//in-place reduction in global memory
119 | 	for (int stride = blockDim.x/2; stride>0 ; stride >>=1)
120 | 	{
121 | 		if (tid <stride)
122 | 		{
123 | 			idata[tid] += idata[tid + stride];
124 | 		}
125 | 		//synchronize within block
126 | 		__syncthreads();
127 | 	}
128 | 	//write result for this block to global mem
129 | 	if (tid == 0)
130 | 		g_odata[blockIdx.x] = idata[0];
131 | 
132 | }
133 | 
134 | __global__ void reduceUnroll4(int * g_idata,int * g_odata,unsigned int n)
135 | {
136 | 	//set thread ID
137 | 	unsigned int tid = threadIdx.x;
138 | 	unsigned int idx = blockDim.x*blockIdx.x*4+threadIdx.x;
139 | 	//boundary check
140 | 	if (tid >= n) return;
141 | 	//convert global data pointer to the
142 | 	int *idata = g_idata + blockIdx.x*blockDim.x*4;
143 | 	if(idx+blockDim.x<n)
144 | 	{
145 | 		g_idata[idx]+=g_idata[idx+blockDim.x];
146 | 		g_idata[idx]+=g_idata[idx+blockDim.x*2];
147 | 		g_idata[idx]+=g_idata[idx+blockDim.x*3];
148 | 	}
149 | 	__syncthreads();
150 | 	//in-place reduction in global memory
151 | 	for (int stride = blockDim.x/2; stride>0 ; stride >>=1)
152 | 	{
153 | 		if (tid <stride)
154 | 		{
155 | 			idata[tid] += idata[tid + stride];
156 | 		}
157 | 		//synchronize within block
158 | 		__syncthreads();
159 | 	}
160 | 	//write result for this block to global mem
161 | 	if (tid == 0)
162 | 		g_odata[blockIdx.x] = idata[0];
163 | 
164 | }
165 | 
166 | __global__ void reduceUnroll8(int * g_idata,int * g_odata,unsigned int n)
167 | {
168 | 	//set thread ID
169 | 	unsigned int tid = threadIdx.x;
170 | 	unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x;
171 | 	//boundary check
172 | 	if (tid >= n) return;
173 | 	//convert global data pointer to the
174 | 	int *idata = g_idata + blockIdx.x*blockDim.x*8;
175 | 	if(idx+blockDim.x<n)
176 | 	{
177 | 		g_idata[idx]+=g_idata[idx+blockDim.x];
178 | 		g_idata[idx]+=g_idata[idx+blockDim.x*2];
179 | 		g_idata[idx]+=g_idata[idx+blockDim.x*3];
180 | 		g_idata[idx]+=g_idata[idx+blockDim.x*4];
181 | 		g_idata[idx]+=g_idata[idx+blockDim.x*5];
182 | 		g_idata[idx]+=g_idata[idx+blockDim.x*6];
183 | 		g_idata[idx]+=g_idata[idx+blockDim.x*7];
184 | 
185 | 	}
186 | 	__syncthreads();
187 | 	//in-place reduction in global memory
188 | 	for (int stride = blockDim.x/2; stride>0 ; stride >>=1)
189 | 	{
190 | 		if (tid <stride)
191 | 		{
192 | 			idata[tid] += idata[tid + stride];
193 | 		}
194 | 		//synchronize within block
195 | 		__syncthreads();
196 | 	}
197 | 	//write result for this block to global mem
198 | 	if (tid == 0)
199 | 		g_odata[blockIdx.x] = idata[0];
200 | 
201 | }
202 | 
203 | 
204 | __global__ void reduceUnrollWarp8(int * g_idata,int * g_odata,unsigned int n)
205 | {
206 | 	//set thread ID
207 | 	unsigned int tid = threadIdx.x;
208 | 	unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x;
209 | 	//boundary check
210 | 	if (tid >= n) return;
211 | 	//convert global data pointer to the
212 | 	int *idata = g_idata + blockIdx.x*blockDim.x*8;
213 | 	//unrolling 8;
214 | 	if(idx+7 * blockDim.x<n)
215 | 	{
216 | 		int a1=g_idata[idx];
217 | 		int a2=g_idata[idx+blockDim.x];
218 | 		int a3=g_idata[idx+2*blockDim.x];
219 | 		int a4=g_idata[idx+3*blockDim.x];
220 | 		int a5=g_idata[idx+4*blockDim.x];
221 | 		int a6=g_idata[idx+5*blockDim.x];
222 | 		int a7=g_idata[idx+6*blockDim.x];
223 | 		int a8=g_idata[idx+7*blockDim.x];
224 | 		g_idata[idx]=a1+a2+a3+a4+a5+a6+a7+a8;
225 | 
226 | 	}
227 | 	__syncthreads();
228 | 	//in-place reduction in global memory
229 | 	for (int stride = blockDim.x/2; stride>32; stride >>=1)
230 | 	{
231 | 		if (tid <stride)
232 | 		{
233 | 			idata[tid] += idata[tid + stride];
234 | 		}
235 | 		//synchronize within block
236 | 		__syncthreads();
237 | 	}
238 | 	//write result for this block to global mem
239 | 	if(tid<32)
240 | 	{
241 | 		volatile int *vmem = idata;
242 | 		vmem[tid]+=vmem[tid+32];
243 | 		vmem[tid]+=vmem[tid+16];
244 | 		vmem[tid]+=vmem[tid+8];
245 | 		vmem[tid]+=vmem[tid+4];
246 | 		vmem[tid]+=vmem[tid+2];
247 | 		vmem[tid]+=vmem[tid+1];
248 | 
249 | 	}
250 | 
251 | 	if (tid == 0)
252 | 		g_odata[blockIdx.x] = idata[0];
253 | 
254 | }
255 | 
256 | __global__ void reduceCompleteUnrollWarp8(int * g_idata,int * g_odata,unsigned int n)
257 | {
258 | 	//set thread ID
259 | 	unsigned int tid = threadIdx.x;
260 | 	unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x;
261 | 	//boundary check
262 | 	if (tid >= n) return;
263 | 	//convert global data pointer to the
264 | 	int *idata = g_idata + blockIdx.x*blockDim.x*8;
265 | 	if(idx+7 * blockDim.x<n)
266 | 	{
267 | 		int a1=g_idata[idx];
268 | 		int a2=g_idata[idx+blockDim.x];
269 | 		int a3=g_idata[idx+2*blockDim.x];
270 | 		int a4=g_idata[idx+3*blockDim.x];
271 | 		int a5=g_idata[idx+4*blockDim.x];
272 | 		int a6=g_idata[idx+5*blockDim.x];
273 | 		int a7=g_idata[idx+6*blockDim.x];
274 | 		int a8=g_idata[idx+7*blockDim.x];
275 | 		g_idata[idx]=a1+a2+a3+a4+a5+a6+a7+a8;
276 | 
277 | 	}
278 | 	__syncthreads();
279 | 	//in-place reduction in global memory
280 | 	if(blockDim.x>=1024 && tid <512)
281 | 		idata[tid]+=idata[tid+512];
282 | 	__syncthreads();
283 | 	if(blockDim.x>=512 && tid <256)
284 | 		idata[tid]+=idata[tid+256];
285 | 	__syncthreads();
286 | 	if(blockDim.x>=256 && tid <128)
287 | 		idata[tid]+=idata[tid+128];
288 | 	__syncthreads();
289 | 	if(blockDim.x>=128 && tid <64)
290 | 		idata[tid]+=idata[tid+64];
291 | 	__syncthreads();
292 | 	//write result for this block to global mem
293 | 	if(tid<32)
294 | 	{
295 | 		volatile int *vmem = idata;
296 | 		vmem[tid]+=vmem[tid+32];
297 | 		vmem[tid]+=vmem[tid+16];
298 | 		vmem[tid]+=vmem[tid+8];
299 | 		vmem[tid]+=vmem[tid+4];
300 | 		vmem[tid]+=vmem[tid+2];
301 | 		vmem[tid]+=vmem[tid+1];
302 | 
303 | 	}
304 | 
305 | 	if (tid == 0)
306 | 		g_odata[blockIdx.x] = idata[0];
307 | 
308 | }
309 | 
310 | int main(int argc,char** argv)
311 | {
312 | 	initDevice(0);
313 | 	
314 | 	//initialization
315 | 
316 | 	int size = 1 << 24;
317 | 	printf("	with array size %d  ", size);
318 | 
319 | 	//execution configuration
320 | 	int blocksize = 1024;
321 | 	if (argc > 1)
322 | 	{
323 | 		blocksize = atoi(argv[1]);   //从命令行输入设置block大小
324 | 	}
325 | 	dim3 block(blocksize, 1);
326 | 	dim3 grid((size - 1) / block.x + 1, 1);
327 | 	printf("grid %d block %d \n", grid.x, block.x);
328 | 
329 | 	//allocate host memory
330 | 	size_t bytes = size * sizeof(int);
331 | 	int *idata_host = (int*)malloc(bytes);
332 | 	int *odata_host = (int*)malloc(grid.x * sizeof(int));
333 | 	int * tmp = (int*)malloc(bytes);
334 | 
335 | 	//initialize the array
336 | 	initialData_int(idata_host, size);
337 | 
338 | 	memcpy(tmp, idata_host, bytes);
339 | 	double timeStart, timeElaps;
340 | 	int gpu_sum = 0;
341 | 
342 | 	// device memory
343 | 	int * idata_dev = NULL;
344 | 	int * odata_dev = NULL;
345 | 	CHECK(cudaMalloc((void**)&idata_dev, bytes));
346 | 	CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int)));
347 | 
348 | 	//cpu reduction 对照组
349 | 	int cpu_sum = 0;
350 | 	timeStart = cpuSecond();
351 | 	//cpu_sum = recursiveReduce(tmp, size);
352 | 	for (int i = 0; i < size; i++)
353 | 		cpu_sum += tmp[i];
354 | 	timeElaps = 1000*(cpuSecond() - timeStart);
355 | 
356 | 	printf("cpu sum:%d \n", cpu_sum);
357 | 	printf("cpu reduction elapsed %lf ms cpu_sum: %d\n", timeElaps, cpu_sum);
358 | 
359 |     
360 | 	//kernel 1 reduceNeighbored
361 | 
362 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
363 | 	CHECK(cudaDeviceSynchronize());
364 | 	timeStart = cpuSecond();
365 | 	reduceNeighbored <<<grid, block >>>(idata_dev, odata_dev, size);
366 | 	cudaDeviceSynchronize();
367 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
368 | 	gpu_sum = 0;
369 | 	for (int i = 0; i < grid.x; i++)
370 | 		gpu_sum += odata_host[i];	
371 |     timeElaps = 1000*(cpuSecond() - timeStart);
372 | 
373 | 	printf("gpu sum:%d \n", gpu_sum);
374 | 	printf("gpu reduceNeighbored elapsed %lf ms     <<<grid %d block %d>>>\n",
375 | 		timeElaps, grid.x, block.x);
376 |     
377 |     //kernel 2 reduceNeighboredless
378 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
379 | 	CHECK(cudaDeviceSynchronize());
380 | 	timeStart = cpuSecond();
381 | 	reduceNeighboredLess <<<grid, block >>>(idata_dev, odata_dev, size);
382 | 	cudaDeviceSynchronize();
383 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
384 | 	gpu_sum = 0;
385 | 	for (int i = 0; i < grid.x; i++)
386 | 		gpu_sum += odata_host[i];	
387 |     timeElaps = 1000*(cpuSecond() - timeStart);
388 | 
389 | 	printf("gpu sum:%d \n", gpu_sum);
390 | 	printf("gpu reduceNeighboredless elapsed %lf ms     <<<grid %d block %d>>>\n",
391 | 		timeElaps, grid.x, block.x);
392 | 
393 |     //kernel 3 reduceInterleaved
394 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
395 | 	CHECK(cudaDeviceSynchronize());
396 | 	timeStart = cpuSecond();
397 | 	reduceInterleaved <<<grid, block >>>(idata_dev, odata_dev, size);
398 | 	cudaDeviceSynchronize();
399 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
400 | 	gpu_sum = 0;
401 | 	for (int i = 0; i < grid.x; i++)
402 | 		gpu_sum += odata_host[i];	
403 |     timeElaps = 1000*(cpuSecond() - timeStart);
404 | 
405 | 	printf("gpu sum:%d \n", gpu_sum);
406 | 	printf("gpu reduceInterleaved elapsed %lf ms     <<<grid %d block %d>>>\n",
407 | 		timeElaps, grid.x, block.x);
408 |     
409 |     //kernel 4 reduceUnroll2
410 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
411 | 	CHECK(cudaDeviceSynchronize());
412 | 	timeStart = cpuSecond();
413 | 	reduceUnroll2 <<<grid.x/2, block >>>(idata_dev, odata_dev, size);
414 | 	cudaDeviceSynchronize();
415 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
416 | 	gpu_sum = 0;
417 | 	for (int i = 0; i < grid.x/2; i++)
418 | 		gpu_sum += odata_host[i];	
419 |     timeElaps = 1000*(cpuSecond() - timeStart);
420 | 
421 | 	printf("gpu sum:%d \n", gpu_sum);
422 | 	printf("gpu reduceUnroll2 elapsed %lf ms     <<<grid %d block %d>>>\n",
423 | 		timeElaps, grid.x/2, block.x);
424 |     
425 |     
426 |     //kernel 5 reduceUnroll4
427 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
428 | 	CHECK(cudaDeviceSynchronize());
429 | 	timeStart = cpuSecond();
430 | 	reduceUnroll4 <<<grid.x/4, block >>>(idata_dev, odata_dev, size);
431 | 	cudaDeviceSynchronize();
432 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
433 | 	gpu_sum = 0;
434 | 	for (int i = 0; i < grid.x/4; i++)
435 | 		gpu_sum += odata_host[i];	
436 |     timeElaps = 1000*(cpuSecond() - timeStart);
437 | 
438 | 	printf("gpu sum:%d \n", gpu_sum);
439 | 	printf("gpu reduceUnroll4 elapsed %lf ms     <<<grid %d block %d>>>\n",
440 | 		timeElaps, grid.x/4, block.x);
441 |     
442 |     
443 |     //kernel 6 reduceUnroll8
444 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
445 | 	CHECK(cudaDeviceSynchronize());
446 | 	timeStart = cpuSecond();
447 | 	reduceUnroll8 <<<grid.x/8, block >>>(idata_dev, odata_dev, size);
448 | 	cudaDeviceSynchronize();
449 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
450 | 	gpu_sum = 0;
451 | 	for (int i = 0; i < grid.x/8; i++)
452 | 		gpu_sum += odata_host[i];	
453 |     timeElaps = 1000*(cpuSecond() - timeStart);
454 | 
455 | 	printf("gpu sum:%d \n", gpu_sum);
456 | 	printf("gpu reduceUnroll8 elapsed %lf ms     <<<grid %d block %d>>>\n",
457 | 		timeElaps, grid.x/8, block.x);
458 |     
459 |     
460 |     //kernel 7 reduceUnrollWarp8
461 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
462 | 	CHECK(cudaDeviceSynchronize());
463 | 	timeStart = cpuSecond();
464 | 	reduceUnrollWarp8 <<<grid.x/8, block >>>(idata_dev, odata_dev, size);
465 | 	cudaDeviceSynchronize();
466 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
467 | 	gpu_sum = 0;
468 | 	for (int i = 0; i < grid.x/8; i++)
469 | 		gpu_sum += odata_host[i];	
470 |     timeElaps = 1000*(cpuSecond() - timeStart);
471 | 
472 | 	printf("gpu sum:%d \n", gpu_sum);
473 | 	printf("gpu reduceUnrollWarp8 elapsed %lf ms     <<<grid %d block %d>>>\n",
474 | 		timeElaps, grid.x/8, block.x);
475 |     
476 |     
477 |     //kernel 8 reduceCompleteUnrollWarp8
478 | 	CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
479 | 	CHECK(cudaDeviceSynchronize());
480 | 	timeStart = cpuSecond();
481 | 	reduceCompleteUnrollWarp8 <<<grid.x/8, block >>>(idata_dev, odata_dev, size);
482 | 	cudaDeviceSynchronize();
483 | 	cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
484 | 	gpu_sum = 0;
485 | 	for (int i = 0; i < grid.x/8; i++)
486 | 		gpu_sum += odata_host[i];	
487 |     timeElaps = 1000*(cpuSecond() - timeStart);
488 | 
489 | 	printf("gpu sum:%d \n", gpu_sum);
490 | 	printf("gpu reduceCompleteUnrollWarp8 elapsed %lf ms     <<<grid %d block %d>>>\n",
491 | 		timeElaps, grid.x/8, block.x);    
492 | 
493 | 
494 | 
495 | 	// free host memory
496 | 
497 | 	free(idata_host);
498 | 	free(odata_host);
499 | 	CHECK(cudaFree(idata_dev));
500 | 	CHECK(cudaFree(odata_dev));
501 | 
502 | 	//reset device
503 | 	cudaDeviceReset();
504 | 
505 | 	//check the results
506 | 	if (gpu_sum == cpu_sum)
507 | 	{
508 | 		printf("Test success!\n");
509 | 	}
510 | 	return EXIT_SUCCESS;
511 | }
512 | 


--------------------------------------------------------------------------------
/Sum_Matrix/cudastart.h:
--------------------------------------------------------------------------------
 1 | #ifndef CUDASTART_H
 2 | #define CUDASTART_H
 3 | #define CHECK(call)\
 4 | {\
 5 |   const cudaError_t error=call;\
 6 |   if(error!=cudaSuccess)\
 7 |   {\
 8 |       printf("ERROR: %s:%d,",__FILE__,__LINE__);\
 9 |       printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
10 |       exit(1);\
11 |   }\
12 | }
13 | 
14 | 
15 | #include <time.h>
16 | #ifdef _WIN32
17 | #	include <windows.h>
18 | #else
19 | #	include <sys/time.h>
20 | #endif
21 | 
22 | double cpuSecond()
23 | {
24 |   struct timeval tp;
25 |   gettimeofday(&tp,NULL);
26 |   return((double)tp.tv_sec+(double)tp.tv_usec*1e-6);
27 | 
28 | }
29 | 
30 | void initialData(float* ip,int size)
31 | {
32 |   time_t t;
33 |   srand((unsigned )time(&t));
34 |   for(int i=0;i<size;i++)
35 |   {
36 |     ip[i]=(float)(rand()&0xffff)/1000.0f;
37 |   }
38 | }
39 | 
40 | void initDevice(int devNum)
41 | {
42 |   int dev = devNum;
43 |   cudaDeviceProp deviceProp;
44 |   CHECK(cudaGetDeviceProperties(&deviceProp,dev));
45 |   printf("Using device %d: %s\n",dev,deviceProp.name);
46 |   CHECK(cudaSetDevice(dev));
47 | 
48 | }
49 | void checkResult(float * hostRef,float * gpuRef,const int N)
50 | {
51 |   double epsilon=1.0E-8;
52 |   for(int i=0;i<N;i++)
53 |   {
54 |     if(abs(hostRef[i]-gpuRef[i])>epsilon)
55 |     {
56 |       printf("Results don\'t match!\n");
57 |       printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i);
58 |       return;
59 |     }
60 |   }
61 |   printf("Check result success!\n");
62 | }
63 | 
64 | #endif
65 | 
66 | 


--------------------------------------------------------------------------------
/Sum_Matrix/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | https://zhuanlan.zhihu.com/p/97192227
3 | 


--------------------------------------------------------------------------------
/Sum_Matrix/sum_martix.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <stdio.h>
  3 | #include "cudastart.h"
  4 | 
  5 | 
  6 | //CPU对照组，用于对比加速比
  7 | void sumMatrix2DonCPU(float * MatA,float * MatB,float * MatC,int nx,int ny)
  8 | {
  9 |     float* a = MatA;
 10 |     float* b = MatB;
 11 |     float* c = MatC;
 12 |     for(int j=0; j<ny; j++)
 13 |     {
 14 |         for(int i=0; i<nx; i++)
 15 |         {
 16 |           c[i] = a[i]+b[i];
 17 |         }
 18 |         c += nx;
 19 |         b += nx;
 20 |         a += nx;
 21 |     }
 22 | }
 23 | 
 24 | //核函数，每一个线程计算矩阵中的一个元素。
 25 | __global__ void sumMatrix(float * MatA,float * MatB,float * MatC,int nx,int ny)
 26 | {
 27 |     int ix=threadIdx.x+blockDim.x*blockIdx.x;
 28 |     int iy=threadIdx.y+blockDim.y*blockIdx.y;
 29 |     int idx=ix+iy*ny;
 30 |     if (ix<nx && iy<ny)
 31 |     {
 32 |         MatC[idx] = MatA[idx]+MatB[idx];
 33 |     }
 34 | }
 35 | 
 36 | //主函数
 37 | int main(int argc,char** argv)
 38 | {
 39 |     //设备初始化
 40 |     printf("strating...\n");
 41 |     initDevice(0);
 42 | 
 43 |     //输入二维矩阵，4096*4096，单精度浮点型。
 44 |     int nx = 1<<12;
 45 |     int ny = 1<<12;
 46 |     int nBytes = nx*ny*sizeof(float);
 47 | 
 48 |     //Malloc，开辟主机内存
 49 |     float* A_host = (float*)malloc(nBytes);
 50 |     float* B_host = (float*)malloc(nBytes);
 51 |     float* C_host = (float*)malloc(nBytes);
 52 |     float* C_from_gpu = (float*)malloc(nBytes);
 53 |     initialData(A_host, nx*ny);
 54 |     initialData(B_host, nx*ny);
 55 | 
 56 |     //cudaMalloc，开辟设备内存
 57 |     float* A_dev = NULL;
 58 |     float* B_dev = NULL;
 59 |     float* C_dev = NULL;
 60 |     CHECK(cudaMalloc((void**)&A_dev, nBytes));
 61 |     CHECK(cudaMalloc((void**)&B_dev, nBytes));
 62 |     CHECK(cudaMalloc((void**)&C_dev, nBytes));
 63 | 
 64 |     //输入数据从主机内存拷贝到设备内存
 65 |     CHECK(cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice));
 66 |     CHECK(cudaMemcpy(B_dev, B_host, nBytes, cudaMemcpyHostToDevice));
 67 | 
 68 |     //二维线程块，32×32
 69 |     dim3 block(32, 32);
 70 |     //二维线程网格，128×128
 71 |     dim3 grid((nx-1)/block.x+1, (ny-1)/block.y+1);
 72 | 
 73 |     //测试GPU执行时间
 74 |     double gpuStart = cpuSecond();
 75 |     //将核函数放在线程网格中执行
 76 |     sumMatrix<<<grid,block>>>(A_dev, B_dev, C_dev, nx, ny);
 77 |     CHECK(cudaDeviceSynchronize());
 78 |     double gpuTime = cpuSecond() - gpuStart;
 79 |     printf("GPU Execution Time: %f sec\n", gpuTime);
 80 | 
 81 |     //在CPU上完成相同的任务
 82 |     cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost);
 83 |     double cpuStart=cpuSecond();
 84 |     sumMatrix2DonCPU(A_host, B_host, C_host, nx, ny);
 85 |     double cpuTime = cpuSecond() - cpuStart;
 86 |     printf("CPU Execution Time: %f sec\n", cpuTime);
 87 | 
 88 |     //检查GPU与CPU计算结果是否相同
 89 |     CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
 90 |     checkResult(C_host, C_from_gpu, nx*ny);
 91 | 
 92 |     cudaFree(A_dev);
 93 |     cudaFree(B_dev);
 94 |     cudaFree(C_dev);
 95 |     free(A_host);
 96 |     free(B_host);
 97 |     free(C_host);
 98 |     free(C_from_gpu);
 99 |     cudaDeviceReset();
100 |     return 0;
101 | }
102 | 


--------------------------------------------------------------------------------