├── .gitignore
├── 0_hello_world
│   ├── CMakeLists.txt
│   └── hello_world.cu
├── 10_reduceInteger
│   ├── CMakeLists.txt
│   └── reduceInteger.cu
├── 11_simple_sum_matrix2D
│   ├── .sudo_as_admin_successful
│   ├── CMakeLists.txt
│   └── simple_sum_matrix.cu
├── 12_reduce_unrolling
│   ├── CMakeLists.txt
│   └── reduceUnrolling.cu
├── 13_nested_hello_world
│   ├── Makefile
│   └── nested_Hello_World.cu
├── 14_global_variable
│   ├── CMakeLists.txt
│   └── global_variable.cu
├── 15_pine_memory
│   ├── CMakeLists.txt
│   └── pine_memory.cu
├── 16_zero_copy_memory
│   ├── CMakeLists.txt
│   └── zero_copy_memory.cu
├── 17_UVA
│   ├── CMakeLists.txt
│   └── UVA.cu
├── 18_sum_array_offset
│   ├── CMakeLists.txt
│   └── sum_array_offset.cu
├── 19_AoS
│   ├── AoS.cu
│   └── CMakeLists.txt
├── 1_check_dimension
│   ├── CMakeLists.txt
│   └── check_dimension.cu
├── 20_SoA
│   ├── CMakeLists.txt
│   └── SoA.cu
├── 21_sum_array_offset_unrolling
│   ├── CMakeLists.txt
│   └── sum_array_offset_unrolling.cu
├── 22_transform_matrix2D
│   ├── CMakeLists.txt
│   └── transform_matrix2D.cu
├── 23_sum_array_uniform_memory
│   ├── CMakeLists.txt
│   └── sum_arrays_uniform_memory.cu
├── 24_shared_memory_read_data
│   ├── CMakeLists.txt
│   └── shared_memory_read_data.cu
├── 25_reduce_integer_shared_memory
│   ├── CMakeLists.txt
│   └── reduce_integer_shared_memory.cu
├── 26_transform_shared_memory
│   ├── CMakeLists.txt
│   └── transform_shared_memory.cu
├── 27_stencil_1d_constant_read_only
│   ├── CMakeLists.txt
│   └── stencil_1d_constant_read_only.cu
├── 28_shfl_test
│   ├── CMakeLists.txt
│   └── shfl_test.cu
├── 29_reduce_shfl
│   ├── CMakeLists.txt
│   └── reduce_shfl.cu
├── 2_grid_block
│   ├── CMakeLists.txt
│   └── grid_block.cu
├── 30_stream
│   ├── CMakeLists.txt
│   └── stream.cu
├── 31_stream_omp
│   └── stream_omp.cu
├── 32_stream_resource
│   ├── CMakeLists.txt
│   └── stream_resource.cu
├── 33_stream_block
│   ├── CMakeLists.txt
│   └── stream_block.cu
├── 34_stream_dependence
│   ├── CMakeLists.txt
│   └── stream_dependence.cu
├── 35_multi_add_depth
│   ├── CMakeLists.txt
│   └── multi_add_depth.cu
├── 36_multi_add_breadth
│   ├── CMakeLists.txt
│   └── multi_add_breadth.cu
├── 37_asyncAPI
│   ├── CMakeLists.txt
│   └── asyncAPI.cu
├── 38_stream_call_back
│   ├── CMakeLists.txt
│   └── stream_call_back.cu
├── 3_sum_arrays
│   ├── CMakeLists.txt
│   └── sum_arrays.cu
├── 4_sum_arrays_timer
│   ├── CMakeLists.txt
│   └── sum_arrays_timer.cu
├── 5_thread_index
│   ├── CMakeLists.txt
│   └── thread_index.cu
├── 6_sum_matrix
│   ├── CMakeLists.txt
│   └── sum_matrix.cu
├── 7_device_information
│   ├── CMakeLists.txt
│   └── device_information.cu
├── 8_divergence
│   ├── CMakeLists.txt
│   └── divergence.cu
├── 9_sum_matrix2D
│   ├── CMakeLists.txt
│   └── sum_matrix2D.cu
├── CMakeLists.txt
├── IMG_9066.JPG
├── README.md
└── include
    └── freshman.h

/.gitignore:
--------------------------------------------------------------------------------
1 | /build/
2 | /cmake-build-debug/
3 | .DS_Store
4 | .vscode
--------------------------------------------------------------------------------
/0_hello_world/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(hello_world hello_world.cu)
2 | 
--------------------------------------------------------------------------------
/0_hello_world/hello_world.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | __global__ void hello_world(void)
3 | {
4 |   printf("GPU: Hello world!\n");
5 | }
6 | int main(int argc,char **argv)
7 | {
8 |   printf("CPU: Hello world!\n");
9 |   hello_world<<<1,10>>>();
10 |   cudaDeviceReset(); // without this line the program may exit before the GPU's printf output is flushed
11 |   return 0;
12 | }
13 | 
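Editor's note: the file above relies on cudaDeviceReset() to flush the device-side printf buffer before the process exits. As a minimal self-contained sketch (not a file from this repository), the same effect can be obtained with cudaDeviceSynchronize() plus an explicit error check, the same synchronization call the later examples use around their timed kernels:

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void hello_world(void)
{
    // each of the 10 threads in the single block prints one line
    printf("GPU: Hello world from thread %d!\n", threadIdx.x);
}

int main(void)
{
    printf("CPU: Hello world!\n");
    hello_world<<<1, 10>>>();
    // cudaDeviceSynchronize() waits for the kernel and flushes its printf
    // output; checking its return value also surfaces launch errors.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 0;
}
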
-------------------------------------------------------------------------------- /10_reduceInteger/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(reduceInteger reduceInteger.cu) 2 | -------------------------------------------------------------------------------- /10_reduceInteger/reduceInteger.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 29 | { 30 | //set thread ID 31 | unsigned int tid = threadIdx.x; 32 | //boundary check 33 | if (tid >= n) return; 34 | //convert global data pointer to the 35 | int *idata = g_idata + blockIdx.x*blockDim.x; 36 | //in-place reduction in global memory 37 | for (int stride = 1; stride < blockDim.x; stride *= 2) 38 | { 39 | if ((tid % (2 * stride)) == 0) 40 | { 41 | idata[tid] += idata[tid + stride]; 42 | } 43 | //synchronize within block 44 | __syncthreads(); 45 | } 46 | //write result for this block to global mem 47 | if (tid == 0) 48 | g_odata[blockIdx.x] = idata[0]; 49 | 50 | } 51 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 52 | { 53 | //set thread ID 54 | unsigned int tid = threadIdx.x; 55 | //boundary check 56 | if (tid >= n) return; 57 | //convert global data pointer to the 58 | int *idata = g_idata + blockIdx.x*blockDim.x; 59 | //in-place reduction in global memory 60 | for (int stride = 1; stride < blockDim.x; stride *= 2) 61 | { 62 | if ((tid % (2 * stride)) == 0) 63 | { 64 | idata[tid] += idata[tid + stride]; 65 | } 66 | //synchronize within block 67 | __syncthreads(); 68 | } 69 | //write result for this block to global mem 70 | if (tid == 0) 71 | g_odata[blockIdx.x] = idata[0]; 72 | 73 | } 74 | 75 | __global__ void reduceNeighboredLess(int * g_idata,int *g_odata,unsigned int n) 76 | { 77 | unsigned int tid = threadIdx.x; 78 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 79 | // convert global data pointer to the local point of this block 80 | int *idata = g_idata + blockIdx.x*blockDim.x; 81 | if (idx > n) 82 | return; 83 | //in-place reduction in global memory 84 | for (int stride = 1; stride < blockDim.x; stride *= 2) 85 | { 86 | //convert tid into local array index 87 | int index = 2 * stride *tid; 88 | if (index < blockDim.x) 89 | { 90 | idata[index] += idata[index + stride]; 91 | } 92 | __syncthreads(); 93 | } 94 | //write result for this block to global men 95 | if (tid == 0) 96 | g_odata[blockIdx.x] = idata[0]; 97 | } 98 | 99 | __global__ void reduceInterleaved(int * g_idata, int *g_odata, unsigned int n) 100 | { 101 | unsigned int tid = threadIdx.x; 102 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 103 | // convert global data pointer to the local point of this block 104 | int *idata = g_idata + blockIdx.x*blockDim.x; 105 | if (idx >= n) 106 | return; 107 | //in-place reduction in global memory 108 | for (int stride = blockDim.x/2; stride >0; stride >>=1) 109 | { 110 | 
111 | if (tid 1) 134 | { 135 | blocksize = atoi(argv[1]); 136 | } 137 | dim3 block(blocksize, 1); 138 | dim3 grid((size - 1) / block.x + 1, 1); 139 | printf("grid %d block %d \n", grid.x, block.x); 140 | 141 | //allocate host memory 142 | size_t bytes = size * sizeof(int); 143 | int *idata_host = (int*)malloc(bytes); 144 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 145 | int * tmp = (int*)malloc(bytes); 146 | 147 | //initialize the array 148 | initialData_int(idata_host, size); 149 | 150 | memcpy(tmp, idata_host, bytes); 151 | double iStart, iElaps; 152 | int gpu_sum = 0; 153 | 154 | // device memory 155 | int * idata_dev = NULL; 156 | int * odata_dev = NULL; 157 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 158 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 159 | 160 | //cpu reduction 161 | int cpu_sum = 0; 162 | iStart = cpuSecond(); 163 | //cpu_sum = recursiveReduce(tmp, size); 164 | for (int i = 0; i < size; i++) 165 | cpu_sum += tmp[i]; 166 | printf("cpu sum:%d \n", cpu_sum); 167 | iElaps = cpuSecond() - iStart; 168 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 169 | 170 | 171 | //kernel 1:reduceNeighbored 172 | 173 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 174 | CHECK(cudaDeviceSynchronize()); 175 | iStart = cpuSecond(); 176 | warmup <<>>(idata_dev, odata_dev, size); 177 | cudaDeviceSynchronize(); 178 | iElaps = cpuSecond() - iStart; 179 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 180 | gpu_sum = 0; 181 | for (int i = 0; i < grid.x; i++) 182 | gpu_sum += odata_host[i]; 183 | printf("gpu warmup elapsed %lf ms gpu_sum: %d<<>>\n", 184 | iElaps, gpu_sum, grid.x, block.x); 185 | 186 | //kernel 1:reduceNeighbored 187 | 188 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 189 | CHECK(cudaDeviceSynchronize()); 190 | iStart = cpuSecond(); 191 | reduceNeighbored << > >(idata_dev, odata_dev, size); 192 | cudaDeviceSynchronize(); 193 | iElaps = cpuSecond() - iStart; 194 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 195 | gpu_sum = 0; 196 | for (int i = 0; i < grid.x; i++) 197 | gpu_sum += odata_host[i]; 198 | printf("gpu reduceNeighbored elapsed %lf ms gpu_sum: %d<<>>\n", 199 | iElaps, gpu_sum, grid.x, block.x); 200 | 201 | //kernel 2:reduceNeighboredLess 202 | 203 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 204 | CHECK(cudaDeviceSynchronize()); 205 | iStart = cpuSecond(); 206 | reduceNeighboredLess <<>>(idata_dev, odata_dev, size); 207 | cudaDeviceSynchronize(); 208 | iElaps = cpuSecond() - iStart; 209 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 210 | gpu_sum = 0; 211 | for (int i = 0; i < grid.x; i++) 212 | gpu_sum += odata_host[i]; 213 | printf("gpu reduceNeighboredLess elapsed %lf ms gpu_sum: %d<<>>\n", 214 | iElaps, gpu_sum, grid.x, block.x); 215 | 216 | //kernel 3:reduceInterleaved 217 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 218 | CHECK(cudaDeviceSynchronize()); 219 | iStart = cpuSecond(); 220 | reduceInterleaved << > >(idata_dev, odata_dev, size); 221 | cudaDeviceSynchronize(); 222 | iElaps = cpuSecond() - iStart; 223 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 224 | gpu_sum = 0; 225 | for (int i = 0; i < grid.x; i++) 226 | gpu_sum += odata_host[i]; 227 | printf("gpu reduceInterleaved elapsed %lf ms gpu_sum: %d<<>>\n", 228 | iElaps, gpu_sum, grid.x, block.x); 229 
| // free host memory 230 | 231 | free(idata_host); 232 | free(odata_host); 233 | CHECK(cudaFree(idata_dev)); 234 | CHECK(cudaFree(odata_dev)); 235 | 236 | //reset device 237 | cudaDeviceReset(); 238 | 239 | //check the results 240 | if (gpu_sum == cpu_sum) 241 | { 242 | printf("Test success!\n"); 243 | } 244 | return EXIT_SUCCESS; 245 | 246 | } 247 | -------------------------------------------------------------------------------- /11_simple_sum_matrix2D/.sudo_as_admin_successful: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tony-Tan/CUDA_Freshman/979938216fbbd8bc81ccbc525c4dd1f8c0c9fcbb/11_simple_sum_matrix2D/.sudo_as_admin_successful -------------------------------------------------------------------------------- /11_simple_sum_matrix2D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(simple_sum_matrix simple_sum_matrix.cu) 2 | -------------------------------------------------------------------------------- /11_simple_sum_matrix2D/simple_sum_matrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 5 | { 6 | float * a=MatA; 7 | float * b=MatB; 8 | float * c=MatC; 9 | for(int j=0;j2?atoi(argv[1]):32; 61 | int dimy=argc>2?atoi(argv[2]):32; 62 | 63 | double iStart,iElaps; 64 | // cpu compute 65 | iStart=cpuSecond(); 66 | sumMatrix2D_CPU(A_host,B_host,C_host,nx,ny); 67 | iElaps=cpuSecond()-iStart; 68 | printf("CPU Execution Time elapsed %f sec\n",iElaps); 69 | //warm up 70 | // 2d block and 2d grid 71 | dim3 block_0(32,32); 72 | dim3 grid_0((nx-1)/block_0.x+1,(ny-1)/block_0.y+1); 73 | iStart=cpuSecond(); 74 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx,ny); 75 | CHECK(cudaDeviceSynchronize()); 76 | printf("Warm Up \n"); 77 | 78 | // 2d block and 2d grid 79 | dim3 block(dimx,dimy); 80 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 81 | iStart=cpuSecond(); 82 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx,ny); 83 | CHECK(cudaDeviceSynchronize()); 84 | iElaps=cpuSecond()-iStart; 85 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 86 | grid.x,grid.y,block.x,block.y,iElaps); 87 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 88 | 89 | checkResult(C_host,C_from_gpu,nxy); 90 | 91 | cudaFree(A_dev); 92 | cudaFree(B_dev); 93 | cudaFree(C_dev); 94 | free(A_host); 95 | free(B_host); 96 | free(C_host); 97 | free(C_from_gpu); 98 | cudaDeviceReset(); 99 | return 0; 100 | } 101 | -------------------------------------------------------------------------------- /12_reduce_unrolling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(reduceUnrolling reduceUnrolling.cu) 2 | -------------------------------------------------------------------------------- /12_reduce_unrolling/reduceUnrolling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | 
data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 29 | { 30 | //set thread ID 31 | unsigned int tid = threadIdx.x; 32 | //boundary check 33 | if (tid >= n) return; 34 | //convert global data pointer to the 35 | int *idata = g_idata + blockIdx.x*blockDim.x; 36 | //in-place reduction in global memory 37 | for (int stride = 1; stride < blockDim.x; stride *= 2) 38 | { 39 | if ((tid % (2 * stride)) == 0) 40 | { 41 | idata[tid] += idata[tid + stride]; 42 | } 43 | //synchronize within block 44 | __syncthreads(); 45 | } 46 | //write result for this block to global mem 47 | if (tid == 0) 48 | g_odata[blockIdx.x] = idata[0]; 49 | 50 | } 51 | __global__ void reduceUnroll2(int * g_idata,int * g_odata,unsigned int n) 52 | { 53 | //set thread ID 54 | unsigned int tid = threadIdx.x; 55 | unsigned int idx = blockDim.x*blockIdx.x*2+threadIdx.x; 56 | //boundary check 57 | if (tid >= n) return; 58 | //convert global data pointer to the 59 | int *idata = g_idata + blockIdx.x*blockDim.x*2; 60 | if(idx+blockDim.x0 ; stride >>=1) 68 | { 69 | if (tid = n) return; 91 | //convert global data pointer to the 92 | int *idata = g_idata + blockIdx.x*blockDim.x*4; 93 | if(idx+blockDim.x0 ; stride >>=1) 102 | { 103 | if (tid = n) return; 125 | //convert global data pointer to the 126 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 127 | if(idx+blockDim.x0 ; stride >>=1) 141 | { 142 | if (tid = n) return; 164 | //convert global data pointer to the 165 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 166 | //unrolling 8; 167 | if(idx+7 * blockDim.x32; stride >>=1) 183 | { 184 | if (tid = n) return; 218 | //convert global data pointer to the 219 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 220 | if(idx+7 * blockDim.x=1024 && tid <512) 236 | idata[tid]+=idata[tid+512]; 237 | __syncthreads(); 238 | if(blockDim.x>=512 && tid <256) 239 | idata[tid]+=idata[tid+256]; 240 | __syncthreads(); 241 | if(blockDim.x>=256 && tid <128) 242 | idata[tid]+=idata[tid+128]; 243 | __syncthreads(); 244 | if(blockDim.x>=128 && tid <64) 245 | idata[tid]+=idata[tid+64]; 246 | __syncthreads(); 247 | //write result for this block to global mem 248 | if(tid<32) 249 | { 250 | volatile int *vmem = idata; 251 | vmem[tid]+=vmem[tid+32]; 252 | vmem[tid]+=vmem[tid+16]; 253 | vmem[tid]+=vmem[tid+8]; 254 | vmem[tid]+=vmem[tid+4]; 255 | vmem[tid]+=vmem[tid+2]; 256 | vmem[tid]+=vmem[tid+1]; 257 | 258 | } 259 | 260 | if (tid == 0) 261 | g_odata[blockIdx.x] = idata[0]; 262 | 263 | } 264 | // 265 | // 266 | // 267 | template 268 | __global__ void reduceCompleteUnroll(int * g_idata,int * g_odata,unsigned int n) 269 | { 270 | //set thread ID 271 | unsigned int tid = threadIdx.x; 272 | unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x; 273 | //boundary check 274 | if (tid >= n) return; 275 | //convert global data pointer to the 276 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 277 | if(idx+7 * blockDim.x=1024 && tid <512) 293 | idata[tid]+=idata[tid+512]; 294 | __syncthreads(); 295 | if(iBlockSize>=512 && tid <256) 296 | idata[tid]+=idata[tid+256]; 297 | __syncthreads(); 298 | if(iBlockSize>=256 && tid <128) 299 | idata[tid]+=idata[tid+128]; 300 | __syncthreads(); 301 | if(iBlockSize>=128 && tid <64) 302 | idata[tid]+=idata[tid+64]; 303 | __syncthreads(); 304 | //write result for this block to global mem 305 | if(tid<32) 306 | { 307 | volatile int *vmem = idata; 308 | vmem[tid]+=vmem[tid+32]; 309 | vmem[tid]+=vmem[tid+16]; 
310 | vmem[tid]+=vmem[tid+8]; 311 | vmem[tid]+=vmem[tid+4]; 312 | vmem[tid]+=vmem[tid+2]; 313 | vmem[tid]+=vmem[tid+1]; 314 | 315 | } 316 | 317 | if (tid == 0) 318 | g_odata[blockIdx.x] = idata[0]; 319 | 320 | } 321 | 322 | int main(int argc,char** argv) 323 | { 324 | initDevice(0); 325 | 326 | bool bResult = false; 327 | //initialization 328 | 329 | int size = 1 << 24; 330 | printf(" with array size %d ", size); 331 | 332 | //execution configuration 333 | int blocksize = 1024; 334 | if (argc > 1) 335 | { 336 | blocksize = atoi(argv[1]); 337 | } 338 | dim3 block(blocksize, 1); 339 | dim3 grid((size - 1) / block.x + 1, 1); 340 | printf("grid %d block %d \n", grid.x, block.x); 341 | 342 | //allocate host memory 343 | size_t bytes = size * sizeof(int); 344 | int *idata_host = (int*)malloc(bytes); 345 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 346 | int * tmp = (int*)malloc(bytes); 347 | 348 | //initialize the array 349 | initialData_int(idata_host, size); 350 | 351 | memcpy(tmp, idata_host, bytes); 352 | double iStart, iElaps; 353 | int gpu_sum = 0; 354 | 355 | // device memory 356 | int * idata_dev = NULL; 357 | int * odata_dev = NULL; 358 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 359 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 360 | 361 | //cpu reduction 362 | int cpu_sum = 0; 363 | iStart = cpuSecond(); 364 | //cpu_sum = recursiveReduce(tmp, size); 365 | for (int i = 0; i < size; i++) 366 | cpu_sum += tmp[i]; 367 | printf("cpu sum:%d \n", cpu_sum); 368 | iElaps = cpuSecond() - iStart; 369 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 370 | 371 | 372 | //kernel 1:warmup 373 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 374 | CHECK(cudaDeviceSynchronize()); 375 | iStart = cpuSecond(); 376 | warmup <<>>(idata_dev, odata_dev, size); 377 | cudaDeviceSynchronize(); 378 | iElaps = cpuSecond() - iStart; 379 | printf("gpu warmup elapsed %lf ms \n",iElaps); 380 | 381 | 382 | //kernel 1:reduceUnrolling2 383 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 384 | CHECK(cudaDeviceSynchronize()); 385 | iStart = cpuSecond(); 386 | reduceUnroll2 <<>>(idata_dev, odata_dev, size); 387 | cudaDeviceSynchronize(); 388 | iElaps = cpuSecond() - iStart; 389 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 390 | gpu_sum = 0; 391 | for (int i = 0; i < grid.x/2; i++) 392 | gpu_sum += odata_host[i]; 393 | printf("reduceUnrolling2 elapsed %lf ms gpu_sum: %d<<>>\n", 394 | iElaps, gpu_sum, grid.x/2, block.x); 395 | 396 | //kernel 1.1:reduceUnrolling4 397 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 398 | CHECK(cudaDeviceSynchronize()); 399 | iStart = cpuSecond(); 400 | reduceUnroll4 <<>>(idata_dev, odata_dev, size); 401 | cudaDeviceSynchronize(); 402 | iElaps = cpuSecond() - iStart; 403 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 404 | gpu_sum = 0; 405 | for (int i = 0; i < grid.x/4; i++) 406 | gpu_sum += odata_host[i]; 407 | printf("reduceUnrolling4 elapsed %lf ms gpu_sum: %d<<>>\n", 408 | iElaps, gpu_sum, grid.x/4, block.x); 409 | //kernel 1.2:reduceUnrolling8 410 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 411 | CHECK(cudaDeviceSynchronize()); 412 | iStart = cpuSecond(); 413 | reduceUnroll8 <<>>(idata_dev, odata_dev, size); 414 | cudaDeviceSynchronize(); 415 | iElaps = cpuSecond() - iStart; 416 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), 
cudaMemcpyDeviceToHost); 417 | gpu_sum = 0; 418 | for (int i = 0; i < grid.x/8; i++) 419 | gpu_sum += odata_host[i]; 420 | printf("reduceUnrolling8 elapsed %lf ms gpu_sum: %d<<>>\n", 421 | iElaps, gpu_sum, grid.x/8, block.x); 422 | 423 | //kernel 2:reduceUnrollingWarp8 424 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 425 | CHECK(cudaDeviceSynchronize()); 426 | iStart = cpuSecond(); 427 | reduceUnrollWarp8<<>>(idata_dev, odata_dev, size); 428 | cudaDeviceSynchronize(); 429 | iElaps = cpuSecond() - iStart; 430 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 431 | gpu_sum = 0; 432 | for (int i = 0; i < grid.x/8; i++) 433 | gpu_sum += odata_host[i]; 434 | printf("reduceUnrollingWarp8 elapsed %lf ms gpu_sum: %d<<>>\n", 435 | iElaps, gpu_sum, grid.x/8, block.x); 436 | 437 | 438 | //kernel 3:reduceCompleteUnrollWarp8 439 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 440 | CHECK(cudaDeviceSynchronize()); 441 | iStart = cpuSecond(); 442 | reduceCompleteUnrollWarp8 <<>>(idata_dev, odata_dev, size); 443 | cudaDeviceSynchronize(); 444 | iElaps = cpuSecond() - iStart; 445 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 446 | gpu_sum = 0; 447 | for (int i = 0; i < grid.x/8; i++) 448 | gpu_sum += odata_host[i]; 449 | printf("reduceCompleteUnrollWarp8 elapsed %lf ms gpu_sum: %d<<>>\n", 450 | iElaps, gpu_sum, grid.x/8, block.x); 451 | 452 | 453 | //kernel 4:reduceCompleteUnroll 454 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 455 | CHECK(cudaDeviceSynchronize()); 456 | iStart = cpuSecond(); 457 | switch(blocksize) 458 | { 459 | case 1024: 460 | reduceCompleteUnroll <1024><< > >(idata_dev, odata_dev, size); 461 | break; 462 | case 512: 463 | reduceCompleteUnroll <512><< > >(idata_dev, odata_dev, size); 464 | break; 465 | case 256: 466 | reduceCompleteUnroll <256><< > >(idata_dev, odata_dev, size); 467 | break; 468 | case 128: 469 | reduceCompleteUnroll <128><< > >(idata_dev, odata_dev, size); 470 | break; 471 | } 472 | cudaDeviceSynchronize(); 473 | iElaps = cpuSecond() - iStart; 474 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 475 | gpu_sum = 0; 476 | for (int i = 0; i < grid.x/8; i++) 477 | gpu_sum += odata_host[i]; 478 | printf("reduceCompleteUnroll elapsed %lf ms gpu_sum: %d<<>>\n", 479 | iElaps, gpu_sum, grid.x/8, block.x); 480 | // free host memory 481 | 482 | free(idata_host); 483 | free(odata_host); 484 | CHECK(cudaFree(idata_dev)); 485 | CHECK(cudaFree(odata_dev)); 486 | 487 | //reset device 488 | cudaDeviceReset(); 489 | 490 | //check the results 491 | if (gpu_sum == cpu_sum) 492 | { 493 | printf("Test success!\n"); 494 | } 495 | return EXIT_SUCCESS; 496 | 497 | } 498 | -------------------------------------------------------------------------------- /13_nested_hello_world/Makefile: -------------------------------------------------------------------------------- 1 | nested_Hello_World: 2 | nvcc -arch=sm_35 nested_Hello_World.cu -o nested_Hello_World -lcudadevrt --relocatable-device-code true 3 | clean: 4 | rm nested_Hello_World 5 | -------------------------------------------------------------------------------- /13_nested_hello_world/nested_Hello_World.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | __global__ void nesthelloworld(int iSize,int iDepth) 4 | { 5 | unsigned int tid=threadIdx.x; 6 | printf("depth : %d blockIdx: %d,threadIdx: 
%d\n",iDepth,blockIdx.x,threadIdx.x); 7 | if (iSize==1) 8 | return; 9 | int nthread=(iSize>>1); 10 | if (tid==0 && nthread>0) 11 | { 12 | nesthelloworld<<<1,nthread>>>(nthread,++iDepth); 13 | printf("-----------> nested execution depth: %d\n",iDepth); 14 | } 15 | 16 | } 17 | 18 | int main(int argc,char* argv[]) 19 | { 20 | int size=64; 21 | int block_x=2; 22 | dim3 block(block_x,1); 23 | dim3 grid((size-1)/block.x+1,1); 24 | nesthelloworld<<>>(size,0); 25 | cudaGetLastError(); 26 | cudaDeviceReset(); 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /14_global_variable/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(global_variable global_variable.cu) 2 | -------------------------------------------------------------------------------- /14_global_variable/global_variable.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | __device__ float devData; 4 | __global__ void checkGlobalVariable() 5 | { 6 | printf("Device: The value of the global variable is %f\n",devData); 7 | devData+=2.0; 8 | } 9 | int main() 10 | { 11 | float value=3.14f; 12 | cudaMemcpyToSymbol(devData,&value,sizeof(float)); 13 | printf("Host: copy %f to the global variable\n",value); 14 | checkGlobalVariable<<<1,1>>>(); 15 | cudaMemcpyFromSymbol(&value,devData,sizeof(float)); 16 | printf("Host: the value changed by the kernel to %f \n",value); 17 | cudaDeviceReset(); 18 | return EXIT_SUCCESS; 19 | } 20 | -------------------------------------------------------------------------------- /15_pine_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(pine_memory pine_memory.cu) 2 | -------------------------------------------------------------------------------- /15_pine_memory/pine_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i>>(a_d,b_d,res_d); 51 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 52 | 53 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 54 | sumArrays(a_h,b_h,res_h,nElem); 55 | 56 | checkResult(res_h,res_from_gpu_h,nElem); 57 | cudaFreeHost(a_d); 58 | cudaFreeHost(b_d); 59 | cudaFreeHost(res_d); 60 | 61 | free(a_h); 62 | free(b_h); 63 | free(res_h); 64 | free(res_from_gpu_h); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /16_zero_copy_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(zero_copy_memory zero_copy_memory.cu) 2 | -------------------------------------------------------------------------------- /16_zero_copy_memory/zero_copy_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i=2) 27 | power=atoi(argv[1]); 28 | int nElem=1<>>(a_dev,b_dev,res_d); 53 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 54 | iElaps = cpuSecond() - iStart; 55 | //=============================================================// 56 | printf("zero copy memory elapsed %lf ms \n", iElaps); 57 | 
printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 58 | //-----------------------normal memory--------------------------- 59 | float *a_h_n=(float*)malloc(nByte); 60 | float *b_h_n=(float*)malloc(nByte); 61 | float *res_h_n=(float*)malloc(nByte); 62 | float *res_from_gpu_h_n=(float*)malloc(nByte); 63 | memset(res_h_n,0,nByte); 64 | memset(res_from_gpu_h_n,0,nByte); 65 | 66 | float *a_d_n,*b_d_n,*res_d_n; 67 | CHECK(cudaMalloc((float**)&a_d_n,nByte)); 68 | CHECK(cudaMalloc((float**)&b_d_n,nByte)); 69 | CHECK(cudaMalloc((float**)&res_d_n,nByte)); 70 | 71 | initialData(a_h_n,nElem); 72 | initialData(b_h_n,nElem); 73 | //=============================================================// 74 | iStart = cpuSecond(); 75 | CHECK(cudaMemcpy(a_d_n,a_h_n,nByte,cudaMemcpyHostToDevice)); 76 | CHECK(cudaMemcpy(b_d_n,b_h_n,nByte,cudaMemcpyHostToDevice)); 77 | sumArraysGPU<<>>(a_d_n,b_d_n,res_d_n); 78 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 79 | iElaps = cpuSecond() - iStart; 80 | //=============================================================// 81 | printf("device memory elapsed %lf ms \n", iElaps); 82 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 83 | //-------------------------------------------------------------------- 84 | 85 | sumArrays(a_host,b_host,res_h,nElem); 86 | checkResult(res_h,res_from_gpu_h,nElem); 87 | 88 | cudaFreeHost(a_host); 89 | cudaFreeHost(b_host); 90 | cudaFree(res_d); 91 | free(res_h); 92 | free(res_from_gpu_h); 93 | 94 | cudaFree(a_d_n); 95 | cudaFree(b_d_n); 96 | cudaFree(res_d_n); 97 | 98 | free(a_h_n); 99 | free(b_h_n); 100 | free(res_h_n); 101 | free(res_from_gpu_h_n); 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /17_UVA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(UVA UVA.cu) 2 | -------------------------------------------------------------------------------- /17_UVA/UVA.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i>>(a_host,b_host,res_d); 46 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 47 | 48 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 49 | sumArrays(a_host,b_host,res_h,nElem); 50 | 51 | checkResult(res_h,res_from_gpu_h,nElem); 52 | cudaFreeHost(a_host); 53 | cudaFreeHost(b_host); 54 | cudaFree(res_d); 55 | 56 | free(res_h); 57 | free(res_from_gpu_h); 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /18_sum_array_offset/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_array_offset sum_array_offset.cu) 2 | -------------------------------------------------------------------------------- /18_sum_array_offset/sum_array_offset.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,int offset,const int size) 7 | { 8 | 9 | for(int i=0,k=offset;k=2) 31 | offset=atoi(argv[1]); 32 | printf("Vector size:%d\n",nElem); 33 | int nByte=sizeof(float)*nElem; 34 | float *a_h=(float*)malloc(nByte); 35 | float *b_h=(float*)malloc(nByte); 36 | float *res_h=(float*)malloc(nByte); 37 | float 
*res_from_gpu_h=(float*)malloc(nByte); 38 | memset(res_h,0,nByte); 39 | memset(res_from_gpu_h,0,nByte); 40 | 41 | float *a_d,*b_d,*res_d; 42 | CHECK(cudaMalloc((float**)&a_d,nByte)); 43 | CHECK(cudaMalloc((float**)&b_d,nByte)); 44 | CHECK(cudaMalloc((float**)&res_d,nByte)); 45 | CHECK(cudaMemset(res_d,0,nByte)); 46 | initialData(a_h,nElem); 47 | initialData(b_h,nElem); 48 | 49 | CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice)); 50 | CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice)); 51 | 52 | dim3 block(1024); 53 | dim3 grid(nElem/block.x); 54 | double iStart,iElaps; 55 | iStart=cpuSecond(); 56 | sumArraysGPU<<>>(a_d,b_d,res_d,offset,nElem); 57 | cudaDeviceSynchronize(); 58 | iElaps=cpuSecond()-iStart; 59 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 60 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec --offset:%d \n",grid.x,block.x,iElaps,offset); 61 | 62 | 63 | sumArrays(a_h,b_h,res_h,offset,nElem); 64 | 65 | checkResult(res_h,res_from_gpu_h,nElem); 66 | cudaFree(a_d); 67 | cudaFree(b_d); 68 | cudaFree(res_d); 69 | 70 | free(a_h); 71 | free(b_h); 72 | free(res_h); 73 | free(res_from_gpu_h); 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /19_AoS/AoS.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | struct naiveStruct{ 6 | float a; 7 | float b; 8 | }; 9 | void sumArrays(float * a,float * b,float * res,const int size) 10 | { 11 | 12 | for(int i=0;i=2) 43 | offset=atoi(argv[1]); 44 | printf("Vector size:%d\n",nElem); 45 | int nByte=sizeof(float)*nElem; 46 | int nByte_struct=sizeof(struct naiveStruct)*nElem; 47 | float *a_h=(float*)malloc(nByte); 48 | float *b_h=(float*)malloc(nByte); 49 | float *res_h=(float*)malloc(nByte_struct); 50 | struct naiveStruct *res_from_gpu_h=(struct naiveStruct*)malloc(nByte_struct); 51 | memset(res_h,0,nByte); 52 | memset(res_from_gpu_h,0,nByte); 53 | 54 | float *a_d,*b_d; 55 | struct naiveStruct* res_d; 56 | CHECK(cudaMalloc((float**)&a_d,nByte)); 57 | CHECK(cudaMalloc((float**)&b_d,nByte)); 58 | CHECK(cudaMalloc((struct naiveStruct**)&res_d,nByte_struct)); 59 | CHECK(cudaMemset(res_d,0,nByte_struct)); 60 | initialData(a_h,nElem); 61 | initialData(b_h,nElem); 62 | 63 | CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice)); 64 | CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice)); 65 | 66 | dim3 block(1024); 67 | dim3 grid(nElem/block.x); 68 | double iStart,iElaps; 69 | iStart=cpuSecond(); 70 | sumArraysGPU<<>>(a_d,b_d,res_d,nElem); 71 | cudaDeviceSynchronize(); 72 | iElaps=cpuSecond()-iStart; 73 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte_struct,cudaMemcpyDeviceToHost)); 74 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 75 | 76 | 77 | sumArrays(a_h,b_h,res_h,nElem); 78 | 79 | checkResult_struct(res_h,res_from_gpu_h,nElem); 80 | cudaFree(a_d); 81 | cudaFree(b_d); 82 | cudaFree(res_d); 83 | 84 | free(a_h); 85 | free(b_h); 86 | free(res_h); 87 | free(res_from_gpu_h); 88 | 89 | return 0; 90 | } 91 | -------------------------------------------------------------------------------- /19_AoS/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(AoS AoS.cu) 2 | -------------------------------------------------------------------------------- /1_check_dimension/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_executable(check_dimension check_dimension.cu) 2 | -------------------------------------------------------------------------------- /1_check_dimension/check_dimension.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | __global__ void checkIndex(void) 4 | { 5 | printf("threadIdx:(%d,%d,%d) blockIdx:(%d,%d,%d) blockDim:(%d,%d,%d)\ 6 | gridDim(%d,%d,%d)\n",threadIdx.x,threadIdx.y,threadIdx.z, 7 | blockIdx.x,blockIdx.y,blockIdx.z,blockDim.x,blockDim.y,blockDim.z, 8 | gridDim.x,gridDim.y,gridDim.z); 9 | } 10 | int main(int argc,char **argv) 11 | { 12 | int nElem=6; 13 | dim3 block(3); 14 | dim3 grid((nElem+block.x-1)/block.x); 15 | printf("grid.x %d grid.y %d grid.z %d\n",grid.x,grid.y,grid.z); 16 | printf("block.x %d block.y %d block.z %d\n",block.x,block.y,block.z); 17 | checkIndex<<>>(); 18 | cudaDeviceReset(); 19 | return 0; 20 | } 21 | -------------------------------------------------------------------------------- /20_SoA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(SoA SoA.cu) 2 | -------------------------------------------------------------------------------- /20_SoA/SoA.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define SIZE (1<<18) 5 | struct naiveStruct{ 6 | float a[SIZE]; 7 | float b[SIZE]; 8 | }; 9 | void sumArrays(float * a,float * b,float * res,const int size) 10 | { 11 | 12 | for(int i=0;ia)[i]=a[i]+b[i]; 24 | } 25 | void checkResult_struct(float* res_h,struct naiveStruct*res_from_gpu_h,int nElem) 26 | { 27 | for(int i=0;ia)[i]) 29 | { 30 | printf("check fail!\n"); 31 | exit(0); 32 | } 33 | printf("result check success!\n"); 34 | } 35 | int main(int argc,char **argv) 36 | { 37 | int dev = 0; 38 | cudaSetDevice(dev); 39 | 40 | int nElem=SIZE; 41 | printf("Vector size:%d\n",nElem); 42 | int nByte=sizeof(float)*nElem; 43 | int nByte_struct=sizeof(struct naiveStruct); 44 | float *a_h=(float*)malloc(nByte); 45 | float *b_h=(float*)malloc(nByte); 46 | float *res_h=(float*)malloc(nByte_struct); 47 | struct naiveStruct *res_from_gpu_h=(struct naiveStruct*)malloc(nByte_struct); 48 | memset(res_h,0,nByte); 49 | memset(res_from_gpu_h,0,nByte); 50 | 51 | float *a_d,*b_d; 52 | struct naiveStruct* res_d; 53 | CHECK(cudaMalloc((float**)&a_d,nByte)); 54 | CHECK(cudaMalloc((float**)&b_d,nByte)); 55 | CHECK(cudaMalloc((struct naiveStruct**)&res_d,nByte_struct)); 56 | CHECK(cudaMemset(res_d,0,nByte_struct)); 57 | initialData(a_h,nElem); 58 | initialData(b_h,nElem); 59 | 60 | CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice)); 61 | CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice)); 62 | 63 | dim3 block(1024); 64 | dim3 grid(nElem/block.x); 65 | double iStart,iElaps; 66 | iStart=cpuSecond(); 67 | sumArraysGPU<<>>(a_d,b_d,res_d,nElem); 68 | cudaDeviceSynchronize(); 69 | iElaps=cpuSecond()-iStart; 70 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte_struct,cudaMemcpyDeviceToHost)); 71 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 72 | 73 | 74 | sumArrays(a_h,b_h,res_h,nElem); 75 | 76 | checkResult_struct(res_h,res_from_gpu_h,nElem); 77 | cudaFree(a_d); 78 | cudaFree(b_d); 79 | cudaFree(res_d); 80 | 81 | free(a_h); 82 | free(b_h); 83 | free(res_h); 84 | free(res_from_gpu_h); 85 | 86 | return 0; 87 | } 88 | 
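Editor's note: 19_AoS and 20_SoA above contrast array-of-structures and structure-of-arrays layouts. The compact sketch below (struct and kernel names are mine, not taken from the repository) shows why the SoA layout coalesces better when a kernel only touches one field:

#include <stdio.h>
#include <cuda_runtime.h>
#define N (1 << 20)

// Array of structures: thread i touches p[i].x, so consecutive threads are
// 8 bytes apart and every 128-byte transaction carries 50% unused data (the y fields).
struct PointAoS { float x; float y; };

// Structure of arrays: consecutive threads touch consecutive floats in x,
// so a warp's loads and stores are fully coalesced.
struct PointSoA { float *x; float *y; };

__global__ void scaleAoS(PointAoS *p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i].x *= 2.0f;
}

__global__ void scaleSoA(PointSoA p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p.x[i] *= 2.0f;
}

int main(void)
{
    PointAoS *aos;
    PointSoA soa;
    cudaMalloc((void **)&aos,   N * sizeof(PointAoS));
    cudaMalloc((void **)&soa.x, N * sizeof(float));
    cudaMalloc((void **)&soa.y, N * sizeof(float));
    cudaMemset(aos,   0, N * sizeof(PointAoS));
    cudaMemset(soa.x, 0, N * sizeof(float));

    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    scaleAoS<<<grid, block>>>(aos, N);  // strided accesses
    scaleSoA<<<grid, block>>>(soa, N);  // coalesced accesses
    cudaDeviceSynchronize();

    cudaFree(aos);
    cudaFree(soa.x);
    cudaFree(soa.y);
    return 0;
}
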
-------------------------------------------------------------------------------- /21_sum_array_offset_unrolling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_array_offset_unrolling sum_array_offset_unrolling.cu) 2 | -------------------------------------------------------------------------------- /21_sum_array_offset_unrolling/sum_array_offset_unrolling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,int offset,const int size) 7 | { 8 | 9 | for(int i=0,k=offset;k>>(a_d,b_d,res_d,offset,nElem); 69 | cudaDeviceSynchronize(); 70 | iElaps=cpuSecond()-iStart; 71 | 72 | printf("warmup Time elapsed %f sec\n",iElaps); 73 | iStart=cpuSecond(); 74 | sumArraysGPU<<>>(a_d,b_d,res_d,offset,nElem); 75 | cudaDeviceSynchronize(); 76 | iElaps=cpuSecond()-iStart; 77 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 78 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec --offset:%d \n",grid.x,block.x,iElaps,offset); 79 | 80 | 81 | sumArrays(a_h,b_h,res_h,offset,nElem); 82 | 83 | checkResult(res_h,res_from_gpu_h,nElem-4*block_x); 84 | cudaFree(a_d); 85 | cudaFree(b_d); 86 | cudaFree(res_d); 87 | 88 | free(a_h); 89 | free(b_h); 90 | free(res_h); 91 | free(res_from_gpu_h); 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /22_transform_matrix2D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(transform_matrix2D transform_matrix2D.cu) 2 | -------------------------------------------------------------------------------- /22_transform_matrix2D/transform_matrix2D.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | //cpu transform 5 | void transformMatrix2D_CPU(float * MatA,float * MatB,int nx,int ny) 6 | { 7 | for(int j=0;j=4) 128 | { 129 | transform_kernel=atoi(argv[1]); 130 | dimx=atoi(argv[2]); 131 | dimy=atoi(argv[3]); 132 | } 133 | 134 | //Malloc 135 | float* A_host=(float*)malloc(nBytes); 136 | float* B_host=(float*)malloc(nBytes); 137 | initialData(A_host,nxy); 138 | 139 | //cudaMalloc 140 | float *A_dev=NULL; 141 | float *B_dev=NULL; 142 | CHECK(cudaMalloc((void**)&A_dev,nBytes)); 143 | CHECK(cudaMalloc((void**)&B_dev,nBytes)); 144 | 145 | CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice)); 146 | CHECK(cudaMemset(B_dev,0,nBytes)); 147 | 148 | 149 | 150 | // cpu compute 151 | double iStart=cpuSecond(); 152 | transformMatrix2D_CPU(A_host,B_host,nx,ny); 153 | double iElaps=cpuSecond()-iStart; 154 | printf("CPU Execution Time elapsed %f sec\n",iElaps); 155 | 156 | // 2d block and 2d grid 157 | dim3 block(dimx,dimy); 158 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 159 | dim3 block_1(dimx,dimy); 160 | dim3 grid_1((nx-1)/(block_1.x*4)+1,(ny-1)/block_1.y+1); 161 | iStart=cpuSecond(); 162 | switch(transform_kernel) 163 | { 164 | case 0: 165 | copyRow<<>>(A_dev,B_dev,nx,ny); 166 | break; 167 | case 1: 168 | copyCol<<>>(A_dev,B_dev,nx,ny); 169 | break; 170 | case 2: 171 | transformNaiveRow<<>>(A_dev,B_dev,nx,ny); 172 | break; 173 | case 3: 174 | transformNaiveCol<<>>(A_dev,B_dev,nx,ny); 175 | break; 176 | case 4: 177 | transformNaiveColUnroll<<>>(A_dev,B_dev,nx,ny); 178 | break; 179 | case 5: 180 | 181 | 
transformNaiveColUnroll<<>>(A_dev,B_dev,nx,ny); 182 | break; 183 | case 6: 184 | transformNaiveRowDiagonal<<>>(A_dev,B_dev,nx,ny); 185 | break; 186 | case 7: 187 | transformNaiveColDiagonal<<>>(A_dev,B_dev,nx,ny); 188 | break; 189 | default: 190 | break; 191 | } 192 | CHECK(cudaDeviceSynchronize()); 193 | iElaps=cpuSecond()-iStart; 194 | printf(" Time elapsed %f sec\n",iElaps); 195 | CHECK(cudaMemcpy(B_host,B_dev,nBytes,cudaMemcpyDeviceToHost)); 196 | checkResult(B_host,B_host,nxy); 197 | 198 | cudaFree(A_dev); 199 | cudaFree(B_dev); 200 | free(A_host); 201 | free(B_host); 202 | cudaDeviceReset(); 203 | return 0; 204 | } 205 | -------------------------------------------------------------------------------- /23_sum_array_uniform_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_arrays_uniform_memory sum_arrays_uniform_memory.cu) 2 | -------------------------------------------------------------------------------- /23_sum_array_uniform_memory/sum_arrays_uniform_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(a_d,b_d,res_d,nElem); 52 | cudaDeviceSynchronize(); 53 | iElaps=cpuSecond()-iStart; 54 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 55 | 56 | //CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 57 | sumArrays(b_d,b_d,res_h,nElem); 58 | 59 | checkResult(res_h,res_d,nElem); 60 | cudaFree(a_d); 61 | cudaFree(b_d); 62 | cudaFree(res_d); 63 | 64 | free(res_h); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /24_shared_memory_read_data/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(shared_memory_read_data shared_memory_read_data.cu) 2 | -------------------------------------------------------------------------------- /24_shared_memory_read_data/shared_memory_read_data.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define BDIMX 32 5 | #define BDIMY 32 6 | 7 | #define BDIMX_RECT 32 8 | #define BDIMY_RECT 16 9 | #define IPAD 1 10 | __global__ void warmup(int * out) 11 | { 12 | __shared__ int tile[BDIMY][BDIMX]; 13 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 14 | 15 | tile[threadIdx.y][threadIdx.x]=idx; 16 | __syncthreads(); 17 | out[idx]=tile[threadIdx.y][threadIdx.x]; 18 | } 19 | __global__ void setRowReadRow(int * out) 20 | { 21 | __shared__ int tile[BDIMY][BDIMX]; 22 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 23 | 24 | tile[threadIdx.y][threadIdx.x]=idx; 25 | __syncthreads(); 26 | out[idx]=tile[threadIdx.y][threadIdx.x]; 27 | } 28 | __global__ void setColReadCol(int * out) 29 | { 30 | __shared__ int tile[BDIMY][BDIMX]; 31 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 32 | 33 | tile[threadIdx.x][threadIdx.y]=idx; 34 | __syncthreads(); 35 | out[idx]=tile[threadIdx.x][threadIdx.y]; 36 | } 37 | __global__ void setColReadRow(int * out) 38 | { 39 | __shared__ int tile[BDIMY][BDIMX]; 40 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 41 | 42 | tile[threadIdx.x][threadIdx.y]=idx; 43 | __syncthreads(); 44 | out[idx]=tile[threadIdx.y][threadIdx.x]; 45 | } 46 | __global__ void setRowReadCol(int * out) 47 
| { 48 | __shared__ int tile[BDIMY][BDIMX]; 49 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 50 | 51 | tile[threadIdx.y][threadIdx.x]=idx; 52 | __syncthreads(); 53 | out[idx]=tile[threadIdx.x][threadIdx.y]; 54 | } 55 | __global__ void setRowReadColDyn(int * out) 56 | { 57 | extern __shared__ int tile[]; 58 | unsigned int row_idx=threadIdx.y*blockDim.x+threadIdx.x; 59 | unsigned int col_idx=threadIdx.x*blockDim.y+threadIdx.y; 60 | tile[row_idx]=row_idx; 61 | __syncthreads(); 62 | out[row_idx]=tile[col_idx]; 63 | } 64 | __global__ void setRowReadColIpad(int * out) 65 | { 66 | __shared__ int tile[BDIMY][BDIMX+IPAD]; 67 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 68 | 69 | tile[threadIdx.y][threadIdx.x]=idx; 70 | __syncthreads(); 71 | out[idx]=tile[threadIdx.x][threadIdx.y]; 72 | } 73 | __global__ void setRowReadColDynIpad(int * out) 74 | { 75 | extern __shared__ int tile[]; 76 | unsigned int row_idx=threadIdx.y*(blockDim.x+1)+threadIdx.x; 77 | unsigned int col_idx=threadIdx.x*(blockDim.x+1)+threadIdx.y; 78 | tile[row_idx]=row_idx; 79 | __syncthreads(); 80 | out[row_idx]=tile[col_idx]; 81 | } 82 | //--------------------rectagle--------------------- 83 | __global__ void setRowReadColRect(int * out) 84 | { 85 | __shared__ int tile[BDIMY_RECT][BDIMX_RECT]; 86 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 87 | unsigned int icol=idx%blockDim.y; 88 | unsigned int irow=idx/blockDim.y; 89 | tile[threadIdx.y][threadIdx.x]=idx; 90 | __syncthreads(); 91 | out[idx]=tile[icol][irow]; 92 | } 93 | __global__ void setRowReadColRectDyn(int * out) 94 | { 95 | extern __shared__ int tile[]; 96 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 97 | unsigned int icol=idx%blockDim.y; 98 | unsigned int irow=idx/blockDim.y; 99 | unsigned int col_idx=icol*blockDim.x+irow; 100 | tile[idx]=idx; 101 | __syncthreads(); 102 | out[idx]=tile[col_idx]; 103 | } 104 | __global__ void setRowReadColRectPad(int * out) 105 | { 106 | __shared__ int tile[BDIMY_RECT][BDIMX_RECT+IPAD*2]; 107 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 108 | unsigned int icol=idx%blockDim.y; 109 | unsigned int irow=idx/blockDim.y; 110 | tile[threadIdx.y][threadIdx.x]=idx; 111 | __syncthreads(); 112 | out[idx]=tile[icol][irow]; 113 | } 114 | __global__ void setRowReadColRectDynPad(int * out) 115 | { 116 | extern __shared__ int tile[]; 117 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 118 | unsigned int icol=idx%blockDim.y; 119 | unsigned int irow=idx/blockDim.y; 120 | unsigned int row_idx=threadIdx.y*(IPAD+blockDim.x)+threadIdx.x; 121 | unsigned int col_idx=icol*(IPAD+blockDim.x)+irow; 122 | tile[row_idx]=idx; 123 | __syncthreads(); 124 | out[idx]=tile[col_idx]; 125 | } 126 | int main(int argc,char **argv) 127 | { 128 | // set up device 129 | initDevice(0); 130 | int kernel=0; 131 | if(argc>=2) 132 | kernel=atoi(argv[1]); 133 | int nElem=BDIMX*BDIMY; 134 | printf("Vector size:%d\n",nElem); 135 | int nByte=sizeof(int)*nElem; 136 | int * out; 137 | CHECK(cudaMalloc((int**)&out,nByte)); 138 | cudaSharedMemConfig MemConfig; 139 | CHECK(cudaDeviceGetSharedMemConfig(&MemConfig)); 140 | printf("--------------------------------------------\n"); 141 | switch (MemConfig) { 142 | 143 | case cudaSharedMemBankSizeFourByte: 144 | printf("the device is cudaSharedMemBankSizeFourByte: 4-Byte\n"); 145 | break; 146 | case cudaSharedMemBankSizeEightByte: 147 | printf("the device is cudaSharedMemBankSizeEightByte: 8-Byte\n"); 148 | break; 149 | 150 | } 151 | printf("--------------------------------------------\n"); 152 | 
dim3 block(BDIMY,BDIMX); 153 | dim3 grid(1,1); 154 | dim3 block_rect(BDIMX_RECT,BDIMY_RECT); 155 | dim3 grid_rect(1,1); 156 | warmup<<>>(out); 157 | printf("warmup!\n"); 158 | double iStart,iElaps; 159 | iStart=cpuSecond(); 160 | switch(kernel) 161 | { 162 | case 0: 163 | { 164 | setRowReadRow<<>>(out); 165 | cudaDeviceSynchronize(); 166 | iElaps=cpuSecond()-iStart; 167 | printf("setRowReadRow "); 168 | printf("Execution Time elapsed %f sec\n",iElaps); 169 | //break; 170 | //case 1: 171 | iStart=cpuSecond(); 172 | setColReadCol<<>>(out); 173 | cudaDeviceSynchronize(); 174 | iElaps=cpuSecond()-iStart; 175 | printf("setColReadCol "); 176 | printf("Execution Time elapsed %f sec\n",iElaps); 177 | break; 178 | } 179 | case 2: 180 | { 181 | setColReadRow<<>>(out); 182 | cudaDeviceSynchronize(); 183 | iElaps=cpuSecond()-iStart; 184 | printf("setColReadRow "); 185 | printf("Execution Time elapsed %f sec\n",iElaps); 186 | break; 187 | } 188 | case 3: 189 | { 190 | setRowReadCol<<>>(out); 191 | cudaDeviceSynchronize(); 192 | iElaps=cpuSecond()-iStart; 193 | printf("setRowReadCol "); 194 | printf("Execution Time elapsed %f sec\n",iElaps); 195 | break; 196 | } 197 | case 4: 198 | { 199 | setRowReadColDyn<<>>(out); 200 | cudaDeviceSynchronize(); 201 | iElaps=cpuSecond()-iStart; 202 | printf("setRowReadColDyn "); 203 | printf("Execution Time elapsed %f sec\n",iElaps); 204 | break; 205 | } 206 | case 5: 207 | { 208 | setRowReadColIpad<<>>(out); 209 | cudaDeviceSynchronize(); 210 | iElaps=cpuSecond()-iStart; 211 | printf("setRowReadColIpad "); 212 | printf("Execution Time elapsed %f sec\n",iElaps); 213 | break; 214 | } 215 | case 6: 216 | { 217 | setRowReadColDynIpad<<>>(out); 218 | cudaDeviceSynchronize(); 219 | iElaps=cpuSecond()-iStart; 220 | printf("setRowReadColDynIpad "); 221 | printf("Execution Time elapsed %f sec\n",iElaps); 222 | break; 223 | } 224 | case 7: 225 | { 226 | setRowReadColRect<<>>(out); 227 | cudaDeviceSynchronize(); 228 | iElaps=cpuSecond()-iStart; 229 | printf("setRowReadColRect "); 230 | printf("Execution Time elapsed %f sec\n",iElaps); 231 | break; 232 | } 233 | case 8: 234 | { 235 | setRowReadColRectDyn<<>>(out); 236 | cudaDeviceSynchronize(); 237 | iElaps=cpuSecond()-iStart; 238 | printf("setRowReadColRectDyn "); 239 | printf("Execution Time elapsed %f sec\n",iElaps); 240 | break; 241 | } 242 | case 9: 243 | { 244 | setRowReadColRectPad<<>>(out); 245 | cudaDeviceSynchronize(); 246 | iElaps=cpuSecond()-iStart; 247 | printf("setRowReadColRectPad "); 248 | printf("Execution Time elapsed %f sec\n",iElaps); 249 | break; 250 | } 251 | case 10: 252 | { 253 | setRowReadColRectDynPad<<>>(out); 254 | cudaDeviceSynchronize(); 255 | iElaps=cpuSecond()-iStart; 256 | printf("setRowReadColRectDynPad "); 257 | printf("Execution Time elapsed %f sec\n",iElaps); 258 | break; 259 | } 260 | case 11: 261 | { 262 | setRowReadRow<<>>(out); 263 | cudaDeviceSynchronize(); 264 | 265 | setColReadCol<<>>(out); 266 | cudaDeviceSynchronize(); 267 | 268 | setColReadRow<<>>(out); 269 | cudaDeviceSynchronize(); 270 | 271 | setRowReadCol<<>>(out); 272 | cudaDeviceSynchronize(); 273 | 274 | setRowReadColDyn<<>>(out); 275 | cudaDeviceSynchronize(); 276 | 277 | setRowReadColIpad<<>>(out); 278 | cudaDeviceSynchronize(); 279 | 280 | setRowReadColDynIpad<<>>(out); 281 | cudaDeviceSynchronize(); 282 | break; 283 | } 284 | case 12: 285 | { 286 | setRowReadColRect<<>>(out); 287 | setRowReadColRectDyn<<>>(out); 288 | setRowReadColRectPad<<>>(out); 289 | setRowReadColRectDynPad<<>>(out); 290 | break; 291 | } 292 | 293 | } 
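/* Editor's note, not part of the original source: the *Ipad / *Pad kernels
 * selectable above avoid shared-memory bank conflicts by padding each tile
 * row, e.g.  __shared__ int tile[BDIMY][BDIMX + IPAD];
 * With BDIMX = 32 and IPAD = 1, the elements of one tile column fall into 32
 * different banks, so the column-wise reads in the setRowReadCol-style
 * kernels no longer serialize into 32 separate shared-memory transactions. */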
294 | 295 | cudaFree(out); 296 | return 0; 297 | } 298 | -------------------------------------------------------------------------------- /25_reduce_integer_shared_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(reduce_integer_shared_memory reduce_integer_shared_memory.cu) 2 | -------------------------------------------------------------------------------- /25_reduce_integer_shared_memory/reduce_integer_shared_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define DIM 1024 5 | 6 | int recursiveReduce(int *data, int const size) 7 | { 8 | // terminate check 9 | if (size == 1) return data[0]; 10 | // renew the stride 11 | int const stride = size / 2; 12 | if (size % 2 == 1) 13 | { 14 | for (int i = 0; i < stride; i++) 15 | { 16 | data[i] += data[i + stride]; 17 | } 18 | data[0] += data[size - 1]; 19 | } 20 | else 21 | { 22 | for (int i = 0; i < stride; i++) 23 | { 24 | data[i] += data[i + stride]; 25 | } 26 | } 27 | // call 28 | return recursiveReduce(data, stride); 29 | } 30 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 31 | { 32 | 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | 56 | __global__ void reduceGmem(int * g_idata,int * g_odata,unsigned int n) 57 | { 58 | //set thread ID 59 | unsigned int tid = threadIdx.x; 60 | unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 61 | //boundary check 62 | if (tid >= n) return; 63 | //convert global data pointer to the 64 | int *idata = g_idata + blockIdx.x*blockDim.x; 65 | 66 | //in-place reduction in global memory 67 | if(blockDim.x>=1024 && tid <512) 68 | idata[tid]+=idata[tid+512]; 69 | __syncthreads(); 70 | if(blockDim.x>=512 && tid <256) 71 | idata[tid]+=idata[tid+256]; 72 | __syncthreads(); 73 | if(blockDim.x>=256 && tid <128) 74 | idata[tid]+=idata[tid+128]; 75 | __syncthreads(); 76 | if(blockDim.x>=128 && tid <64) 77 | idata[tid]+=idata[tid+64]; 78 | __syncthreads(); 79 | //write result for this block to global mem 80 | if(tid<32) 81 | { 82 | volatile int *vmem = idata; 83 | vmem[tid]+=vmem[tid+32]; 84 | vmem[tid]+=vmem[tid+16]; 85 | vmem[tid]+=vmem[tid+8]; 86 | vmem[tid]+=vmem[tid+4]; 87 | vmem[tid]+=vmem[tid+2]; 88 | vmem[tid]+=vmem[tid+1]; 89 | 90 | } 91 | 92 | if (tid == 0) 93 | g_odata[blockIdx.x] = idata[0]; 94 | 95 | } 96 | 97 | 98 | __global__ void reduceSmem(int * g_idata,int * g_odata,unsigned int n) 99 | { 100 | //set thread ID 101 | __shared__ int smem[DIM]; 102 | unsigned int tid = threadIdx.x; 103 | //unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 104 | //boundary check 105 | if (tid >= n) return; 106 | //convert global data pointer to the 107 | int *idata = g_idata + blockIdx.x*blockDim.x; 108 | 109 | smem[tid]=idata[tid]; 110 | __syncthreads(); 111 | //in-place reduction in global memory 112 | if(blockDim.x>=1024 && tid <512) 113 | smem[tid]+=smem[tid+512]; 114 | __syncthreads(); 115 | 
if(blockDim.x>=512 && tid <256) 116 | smem[tid]+=smem[tid+256]; 117 | __syncthreads(); 118 | if(blockDim.x>=256 && tid <128) 119 | smem[tid]+=smem[tid+128]; 120 | __syncthreads(); 121 | if(blockDim.x>=128 && tid <64) 122 | smem[tid]+=smem[tid+64]; 123 | __syncthreads(); 124 | //write result for this block to global mem 125 | if(tid<32) 126 | { 127 | volatile int *vsmem = smem; 128 | vsmem[tid]+=vsmem[tid+32]; 129 | vsmem[tid]+=vsmem[tid+16]; 130 | vsmem[tid]+=vsmem[tid+8]; 131 | vsmem[tid]+=vsmem[tid+4]; 132 | vsmem[tid]+=vsmem[tid+2]; 133 | vsmem[tid]+=vsmem[tid+1]; 134 | 135 | } 136 | 137 | if (tid == 0) 138 | g_odata[blockIdx.x] = smem[0]; 139 | 140 | } 141 | 142 | __global__ void reduceUnroll4Smem(int * g_idata,int * g_odata,unsigned int n) 143 | { 144 | //set thread ID 145 | __shared__ int smem[DIM]; 146 | unsigned int tid = threadIdx.x; 147 | unsigned int idx = blockDim.x*blockIdx.x*4+threadIdx.x; 148 | //boundary check 149 | if (tid >= n) return; 150 | //convert global data pointer to the 151 | int tempSum=0; 152 | if(idx+3 * blockDim.x<=n) 153 | { 154 | int a1=g_idata[idx]; 155 | int a2=g_idata[idx+blockDim.x]; 156 | int a3=g_idata[idx+2*blockDim.x]; 157 | int a4=g_idata[idx+3*blockDim.x]; 158 | tempSum=a1+a2+a3+a4; 159 | 160 | } 161 | smem[tid]=tempSum; 162 | __syncthreads(); 163 | //in-place reduction in global memory 164 | if(blockDim.x>=1024 && tid <512) 165 | smem[tid]+=smem[tid+512]; 166 | __syncthreads(); 167 | if(blockDim.x>=512 && tid <256) 168 | smem[tid]+=smem[tid+256]; 169 | __syncthreads(); 170 | if(blockDim.x>=256 && tid <128) 171 | smem[tid]+=smem[tid+128]; 172 | __syncthreads(); 173 | if(blockDim.x>=128 && tid <64) 174 | smem[tid]+=smem[tid+64]; 175 | __syncthreads(); 176 | //write result for this block to global mem 177 | if(tid<32) 178 | { 179 | volatile int *vsmem = smem; 180 | vsmem[tid]+=vsmem[tid+32]; 181 | vsmem[tid]+=vsmem[tid+16]; 182 | vsmem[tid]+=vsmem[tid+8]; 183 | vsmem[tid]+=vsmem[tid+4]; 184 | vsmem[tid]+=vsmem[tid+2]; 185 | vsmem[tid]+=vsmem[tid+1]; 186 | 187 | } 188 | 189 | if (tid == 0) 190 | g_odata[blockIdx.x] = smem[0]; 191 | 192 | } 193 | 194 | int main(int argc,char** argv) 195 | { 196 | initDevice(0); 197 | 198 | bool bResult = false; 199 | //initialization 200 | 201 | int size = 1 << 24; 202 | printf(" with array size %d \n", size); 203 | 204 | //execution configuration 205 | int blocksize = 1024; 206 | if (argc > 1) 207 | { 208 | blocksize = atoi(argv[1]); 209 | } 210 | dim3 block(blocksize, 1); 211 | dim3 grid((size - 1) / block.x + 1, 1); 212 | printf("grid %d block %d \n", grid.x, block.x); 213 | 214 | //allocate host memory 215 | size_t bytes = size * sizeof(int); 216 | int *idata_host = (int*)malloc(bytes); 217 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 218 | int * tmp = (int*)malloc(bytes); 219 | 220 | //initialize the array 221 | initialData_int(idata_host, size); 222 | 223 | memcpy(tmp, idata_host, bytes); 224 | double iStart, iElaps; 225 | int gpu_sum = 0; 226 | 227 | // device memory 228 | int * idata_dev = NULL; 229 | int * odata_dev = NULL; 230 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 231 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 232 | 233 | //cpu reduction 234 | int cpu_sum = 0; 235 | iStart = cpuSecond(); 236 | //cpu_sum = recursiveReduce(tmp, size); 237 | for (int i = 0; i < size; i++) 238 | cpu_sum += tmp[i]; 239 | iElaps = cpuSecond() - iStart; 240 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 241 | 242 | 243 | //kernel 1:warmup 244 | 
244 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
245 | CHECK(cudaDeviceSynchronize());
246 | iStart = cpuSecond();
247 | warmup <<<grid, block>>>(idata_dev, odata_dev, size);
248 | cudaDeviceSynchronize();
249 | iElaps = cpuSecond() - iStart;
250 | printf("gpu warmup elapsed %lf sec\n",iElaps);
251 |
252 |
253 |
254 | //reduceGmem
255 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
256 | CHECK(cudaDeviceSynchronize());
257 | iStart = cpuSecond();
258 | reduceGmem <<<grid, block>>>(idata_dev, odata_dev, size);
259 | cudaDeviceSynchronize();
260 | iElaps = cpuSecond() - iStart;
261 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
262 | gpu_sum = 0;
263 | for (int i = 0; i < grid.x; i++)
264 | gpu_sum += odata_host[i];
265 | printf("reduceGmem elapsed %lf sec gpu_sum: %d\n",iElaps, gpu_sum);
266 |
267 | //reduceSmem
268 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
269 | CHECK(cudaDeviceSynchronize());
270 | iStart = cpuSecond();
271 | reduceSmem <<<grid, block>>>(idata_dev, odata_dev, size);
272 | cudaDeviceSynchronize();
273 | iElaps = cpuSecond() - iStart;
274 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
275 | gpu_sum = 0;
276 | for (int i = 0; i < grid.x; i++)
277 | gpu_sum += odata_host[i];
278 | printf("reduceSmem elapsed %lf sec gpu_sum: %d\n",iElaps, gpu_sum);
279 |
280 |
281 |
282 | //reduceUnroll4Smem
283 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
284 | CHECK(cudaDeviceSynchronize());
285 | iStart = cpuSecond();
286 | reduceUnroll4Smem <<<grid.x / 4, block>>>(idata_dev, odata_dev, size);
287 | cudaDeviceSynchronize();
288 | iElaps = cpuSecond() - iStart;
289 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
290 | gpu_sum = 0;
291 | for (int i = 0; i < grid.x/4; i++)
292 | gpu_sum += odata_host[i];
293 | printf("reduceUnroll4Smem elapsed %lf sec gpu_sum: %d\n",iElaps, gpu_sum);
294 |
295 |
296 | free(idata_host);
297 | free(odata_host);
298 | CHECK(cudaFree(idata_dev));
299 | CHECK(cudaFree(odata_dev));
300 | //reset device
301 | cudaDeviceReset();
302 | return EXIT_SUCCESS;
303 |
304 | }
305 |
--------------------------------------------------------------------------------
/26_transform_shared_memory/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(transform_shared_memory transform_shared_memory.cu)
2 |
--------------------------------------------------------------------------------
/26_transform_shared_memory/transform_shared_memory.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | #include "freshman.h"
4 | #define BDIMX 8
5 | #define BDIMY 8
6 | #define IPAD 2
7 | //cpu transform
8 | void transformMatrix2D_CPU(float * in,float * out,int nx,int ny)
9 | {
10 | for(int j=0;j=4)
165 | {
166 | transform_kernel=atoi(argv[1]);
167 | dimx=atoi(argv[2]);
168 | dimy=atoi(argv[3]);
169 | }
170 |
171 | //Malloc
172 | float* A_host=(float*)malloc(nBytes);
173 | float* B_host_cpu=(float*)malloc(nBytes);
174 | float* B_host=(float*)malloc(nBytes);
175 | initialData(A_host,nxy);
176 |
177 | //cudaMalloc
178 | float *A_dev=NULL;
179 | float *B_dev=NULL;
180 | CHECK(cudaMalloc((void**)&A_dev,nBytes));
181 | CHECK(cudaMalloc((void**)&B_dev,nBytes));
182 |
183 | CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
184 | CHECK(cudaMemset(B_dev,0,nBytes));
185 |
186 |
187 |
188 | // cpu compute
189 | double
iStart=cpuSecond(); 190 | transformMatrix2D_CPU(A_host,B_host_cpu,nx,ny); 191 | double iElaps=cpuSecond()-iStart; 192 | printf("CPU Execution Time elapsed %f sec\n",iElaps); 193 | 194 | // 2d block and 2d grid 195 | dim3 block(dimx,dimy); 196 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 197 | dim3 block_1(dimx,dimy); 198 | dim3 grid_1((nx-1)/(block_1.x*2)+1,(ny-1)/block_1.y+1); 199 | //warmup 200 | warmup<<>>(A_dev,B_dev,nx,ny); 201 | CHECK(cudaDeviceSynchronize()); 202 | iStart=cpuSecond(); 203 | switch(transform_kernel) 204 | { 205 | case 0: 206 | copyRow<<>>(A_dev,B_dev,nx,ny); 207 | printf("copyRow "); 208 | break; 209 | case 1: 210 | transformNaiveRow<<>>(A_dev,B_dev,nx,ny); 211 | printf("transformNaiveRow "); 212 | break; 213 | case 2: 214 | transformSmem<<>>(A_dev,B_dev,nx,ny); 215 | printf("transformSmem "); 216 | break; 217 | case 3: 218 | transformSmemPad<<>>(A_dev,B_dev,nx,ny); 219 | printf("transformSmemPad "); 220 | break; 221 | case 4: 222 | transformSmemUnrollPad<<>>(A_dev,B_dev,nx,ny); 223 | printf("transformSmemUnrollPad "); 224 | break; 225 | default: 226 | break; 227 | } 228 | CHECK(cudaDeviceSynchronize()); 229 | iElaps=cpuSecond()-iStart; 230 | printf(" Time elapsed %f sec\n",iElaps); 231 | CHECK(cudaMemcpy(B_host,B_dev,nBytes,cudaMemcpyDeviceToHost)); 232 | checkResult(B_host,B_host_cpu,nxy); 233 | 234 | cudaFree(A_dev); 235 | cudaFree(B_dev); 236 | free(A_host); 237 | free(B_host); 238 | free(B_host_cpu); 239 | cudaDeviceReset(); 240 | return 0; 241 | } 242 | -------------------------------------------------------------------------------- /27_stencil_1d_constant_read_only/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(stencil_1d_constant_read_only stencil_1d_constant_read_only.cu) 2 | -------------------------------------------------------------------------------- /27_stencil_1d_constant_read_only/stencil_1d_constant_read_only.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define TEMPLATE_SIZE 9 5 | #define TEMP_RADIO_SIZE (TEMPLATE_SIZE/2) 6 | #define BDIM 32 7 | 8 | __constant__ float coef[TEMP_RADIO_SIZE];//if in midle of the program will be error 9 | void convolution(float *in,float *out,float* template_,const unsigned int array_size) 10 | { 11 | for(int i=TEMP_RADIO_SIZE;iTEMP_RADIO_SIZE) 34 | smem[sidx-TEMP_RADIO_SIZE]=in[idx-TEMP_RADIO_SIZE]; 35 | if(idx=gridDim.x*blockDim.x-TEMP_RADIO_SIZE) 42 | return; 43 | float temp=.0f; 44 | #pragma unroll 45 | for(int i=1;i<=TEMP_RADIO_SIZE;i++) 46 | { 47 | temp+=coef[i-1]*(smem[sidx+i]-smem[sidx-i]); 48 | } 49 | out[idx]=temp; 50 | //printf("%d:GPU :%lf,\n",idx,temp); 51 | } 52 | //read only 53 | __global__ void stencil_1d_readonly(float * in,float * out,const float* __restrict__ dcoef) 54 | { 55 | __shared__ float smem[BDIM+2*TEMP_RADIO_SIZE]; 56 | int idx=threadIdx.x+blockDim.x*blockIdx.x; 57 | int sidx=threadIdx.x+TEMP_RADIO_SIZE; 58 | smem[sidx]=in[idx]; 59 | 60 | if (threadIdx.xTEMP_RADIO_SIZE) 64 | smem[sidx-TEMP_RADIO_SIZE]=in[idx-TEMP_RADIO_SIZE]; 65 | if(idx=gridDim.x*blockDim.x-TEMP_RADIO_SIZE) 72 | return; 73 | float temp=.0f; 74 | #pragma unroll 75 | for(int i=1;i<=TEMP_RADIO_SIZE;i++) 76 | { 77 | temp+=dcoef[i-1]*(smem[sidx+i]-smem[sidx-i]); 78 | } 79 | out[idx]=temp; 80 | //printf("%d:GPU :%lf,\n",idx,temp); 81 | } 82 | 83 | int main(int argc,char** argv) 84 | { 85 | printf("strating...\n"); 86 | initDevice(0); 87 | int dimx=BDIM; 88 | 
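// Two kernels compute the same 1-D stencil below: stencil_1d reads its
// coefficients from the __constant__ array coef, which is served by the
// constant cache and works best when every thread of a warp reads the same
// element, while stencil_1d_readonly receives them through a const
// __restrict__ pointer so the compiler may route those loads through the
// read-only data cache. Both kernels stage their input tile in shared memory
// with a halo of TEMP_RADIO_SIZE cells on each side.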
unsigned int nxy=1<<16; 89 | int nBytes=nxy*sizeof(float); 90 | 91 | 92 | //Malloc 93 | float* in_host=(float*)malloc(nBytes); 94 | float* out_gpu=(float*)malloc(nBytes); 95 | float* out_cpu=(float*)malloc(nBytes); 96 | memset(out_cpu,0,nBytes); 97 | initialData(in_host,nxy); 98 | 99 | //cudaMalloc 100 | float *in_dev=NULL; 101 | float *out_dev=NULL; 102 | 103 | initialData(in_host,nxy); 104 | float templ_[]={-1.0,-2.0,2.0,1.0}; 105 | CHECK(cudaMemcpyToSymbol(coef,templ_,TEMP_RADIO_SIZE*sizeof(float))); 106 | 107 | CHECK(cudaMalloc((void**)&in_dev,nBytes)); 108 | CHECK(cudaMalloc((void**)&out_dev,nBytes)); 109 | CHECK(cudaMemcpy(in_dev,in_host,nBytes,cudaMemcpyHostToDevice)); 110 | CHECK(cudaMemset(out_dev,0,nBytes)); 111 | 112 | 113 | 114 | // cpu compute 115 | double iStart=cpuSecond(); 116 | convolution(in_host,out_cpu,templ_,nxy); 117 | double iElaps=cpuSecond()-iStart; 118 | //printf("CPU Execution Time elapsed %f sec\n",iElaps); 119 | 120 | // stencil 1d 121 | dim3 block(dimx); 122 | dim3 grid((nxy-1)/block.x+1); 123 | stencil_1d<<>>(in_dev,out_dev); 124 | CHECK(cudaDeviceSynchronize()); 125 | iElaps=cpuSecond()-iStart; 126 | printf("stencil_1d Time elapsed %f sec\n",iElaps); 127 | CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost)); 128 | checkResult(out_cpu,out_gpu,nxy); 129 | CHECK(cudaMemset(out_dev,0,nBytes)); 130 | // stencil 1d read only 131 | float * dcoef_ro; 132 | CHECK(cudaMalloc((void**)&dcoef_ro,TEMP_RADIO_SIZE * sizeof(float))); 133 | CHECK(cudaMemcpy(dcoef_ro,templ_,TEMP_RADIO_SIZE * sizeof(float),cudaMemcpyHostToDevice)); 134 | stencil_1d_readonly<<>>(in_dev,out_dev,dcoef_ro); 135 | CHECK(cudaDeviceSynchronize()); 136 | iElaps=cpuSecond()-iStart; 137 | printf("stencil_1d_readonly Time elapsed %f sec\n",iElaps); 138 | CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost)); 139 | checkResult(out_cpu,out_gpu,nxy); 140 | 141 | cudaFree(dcoef_ro); 142 | cudaFree(in_dev); 143 | cudaFree(out_dev); 144 | free(out_gpu); 145 | free(out_cpu); 146 | free(in_host); 147 | cudaDeviceReset(); 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /28_shfl_test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(shfl_test shfl_test.cu) -------------------------------------------------------------------------------- /28_shfl_test/shfl_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define BDIM 16 5 | #define SEGM 4 6 | __global__ void test_shfl_broadcast(int *in,int*out,int const srcLans) 7 | { 8 | int value=in[threadIdx.x]; 9 | value=__shfl(value,srcLans,BDIM); 10 | out[threadIdx.x]=value; 11 | 12 | } 13 | 14 | __global__ void test_shfl_up(int *in,int*out,int const delta) 15 | { 16 | int value=in[threadIdx.x]; 17 | value=__shfl_up(value,delta,BDIM); 18 | out[threadIdx.x]=value; 19 | 20 | } 21 | 22 | __global__ void test_shfl_down(int *in,int*out,int const delta) 23 | { 24 | int value=in[threadIdx.x]; 25 | value=__shfl_down(value,delta,BDIM); 26 | out[threadIdx.x]=value; 27 | 28 | } 29 | 30 | __global__ void test_shfl_wrap(int *in,int*out,int const offset) 31 | { 32 | int value=in[threadIdx.x]; 33 | value=__shfl(value,threadIdx.x+offset,BDIM); 34 | out[threadIdx.x]=value; 35 | 36 | } 37 | 38 | __global__ void test_shfl_xor(int *in,int*out,int const mask) 39 | { 40 | int value=in[threadIdx.x]; 41 | value=__shfl_xor(value,mask,BDIM); 42 | 
out[threadIdx.x]=value; 43 | 44 | } 45 | 46 | __global__ void test_shfl_xor_array(int *in,int*out,int const mask) 47 | { 48 | int idx=threadIdx.x*SEGM; 49 | int value[SEGM]; 50 | for(int i=0;i=2) 101 | kernel_num=atoi(argv[1]); 102 | 103 | //Malloc 104 | //int * in_host=(int*)malloc(nBytes); 105 | int in_host[]={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; 106 | int * out_gpu=(int*)malloc(nBytes); 107 | //initialData_int(in_host,data_size); 108 | 109 | //cudaMalloc 110 | int * in_dev=NULL; 111 | int * out_dev=NULL; 112 | 113 | CHECK(cudaMalloc((void**)&in_dev,nBytes)); 114 | CHECK(cudaMalloc((void**)&out_dev,nBytes)); 115 | CHECK(cudaMemcpy(in_dev,in_host,nBytes,cudaMemcpyHostToDevice)); 116 | CHECK(cudaMemset(out_dev,0,nBytes)); 117 | 118 | 119 | // test _shfl broadcast 120 | dim3 block(dimx); 121 | dim3 grid((data_size-1)/block.x+1); 122 | switch(kernel_num) 123 | { 124 | case 0: 125 | test_shfl_broadcast<<>>(in_dev,out_dev,2); 126 | printf("test_shfl_broadcast\n"); 127 | break; 128 | case 1: 129 | test_shfl_up<<>>(in_dev,out_dev,2); 130 | printf("test_shfl_up\n"); 131 | break; 132 | case 2: 133 | test_shfl_down<<>>(in_dev,out_dev,2); 134 | printf("test_shfl_down\n"); 135 | break; 136 | case 3: 137 | test_shfl_wrap<<>>(in_dev,out_dev,2); 138 | printf("test_shfl_wrap\n"); 139 | break; 140 | case 4: 141 | test_shfl_xor<<>>(in_dev,out_dev,1); 142 | printf("test_shfl_xor\n"); 143 | break; 144 | case 5: 145 | test_shfl_xor_array<<<1,block.x/SEGM>>>(in_dev,out_dev,1); 146 | printf("test_shfl_xor_array\n"); 147 | break; 148 | case 6: 149 | test_shfl_swap<<<1,block.x/SEGM>>>(in_dev,out_dev,1,0,3); 150 | printf("test_shfl_swap\n"); 151 | break; 152 | default: 153 | break; 154 | } 155 | CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost)); 156 | //show result 157 | printf("input:\t"); 158 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define DIM 1024 5 | 6 | int recursiveReduce(int *data, int const size) 7 | { 8 | // terminate check 9 | if (size == 1) return data[0]; 10 | // renew the stride 11 | int const stride = size / 2; 12 | if (size % 2 == 1) 13 | { 14 | for (int i = 0; i < stride; i++) 15 | { 16 | data[i] += data[i + stride]; 17 | } 18 | data[0] += data[size - 1]; 19 | } 20 | else 21 | { 22 | for (int i = 0; i < stride; i++) 23 | { 24 | data[i] += data[i + stride]; 25 | } 26 | } 27 | // call 28 | return recursiveReduce(data, stride); 29 | } 30 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 31 | { 32 | 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | 56 | __global__ void reduceGmem(int * g_idata,int * g_odata,unsigned int n) 57 | { 58 | //set thread ID 59 | unsigned int tid = threadIdx.x; 60 | unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 61 | //boundary check 62 | if (tid >= n) return; 63 | //convert global data pointer to the 64 | int *idata = g_idata + blockIdx.x*blockDim.x; 65 | 66 | __syncthreads(); 67 | //in-place reduction in global memory 68 | if(blockDim.x>=1024 && tid <512) 69 | 
idata[tid]+=idata[tid+512]; 70 | __syncthreads(); 71 | if(blockDim.x>=512 && tid <256) 72 | idata[tid]+=idata[tid+256]; 73 | __syncthreads(); 74 | if(blockDim.x>=256 && tid <128) 75 | idata[tid]+=idata[tid+128]; 76 | __syncthreads(); 77 | if(blockDim.x>=128 && tid <64) 78 | idata[tid]+=idata[tid+64]; 79 | __syncthreads(); 80 | //write result for this block to global mem 81 | if(tid<32) 82 | { 83 | volatile int *vmem = idata; 84 | vmem[tid]+=vmem[tid+32]; 85 | vmem[tid]+=vmem[tid+16]; 86 | vmem[tid]+=vmem[tid+8]; 87 | vmem[tid]+=vmem[tid+4]; 88 | vmem[tid]+=vmem[tid+2]; 89 | vmem[tid]+=vmem[tid+1]; 90 | 91 | } 92 | 93 | if (tid == 0) 94 | g_odata[blockIdx.x] = idata[0]; 95 | 96 | } 97 | 98 | 99 | __global__ void reduceSmem(int * g_idata,int * g_odata,unsigned int n) 100 | { 101 | //set thread ID 102 | __shared__ int smem[DIM]; 103 | unsigned int tid = threadIdx.x; 104 | //unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 105 | //boundary check 106 | if (tid >= n) return; 107 | //convert global data pointer to the 108 | int *idata = g_idata + blockIdx.x*blockDim.x; 109 | 110 | smem[tid]=idata[tid]; 111 | __syncthreads(); 112 | //in-place reduction in global memory 113 | if(blockDim.x>=1024 && tid <512) 114 | smem[tid]+=smem[tid+512]; 115 | __syncthreads(); 116 | if(blockDim.x>=512 && tid <256) 117 | smem[tid]+=smem[tid+256]; 118 | __syncthreads(); 119 | if(blockDim.x>=256 && tid <128) 120 | smem[tid]+=smem[tid+128]; 121 | __syncthreads(); 122 | if(blockDim.x>=128 && tid <64) 123 | smem[tid]+=smem[tid+64]; 124 | __syncthreads(); 125 | //write result for this block to global mem 126 | if(tid<32) 127 | { 128 | volatile int *vsmem = smem; 129 | vsmem[tid]+=vsmem[tid+32]; 130 | vsmem[tid]+=vsmem[tid+16]; 131 | vsmem[tid]+=vsmem[tid+8]; 132 | vsmem[tid]+=vsmem[tid+4]; 133 | vsmem[tid]+=vsmem[tid+2]; 134 | vsmem[tid]+=vsmem[tid+1]; 135 | 136 | } 137 | 138 | if (tid == 0) 139 | g_odata[blockIdx.x] = smem[0]; 140 | 141 | } 142 | __inline__ __device__ int warpReduce(int localSum) 143 | { 144 | localSum += __shfl_xor(localSum, 16); 145 | localSum += __shfl_xor(localSum, 8); 146 | localSum += __shfl_xor(localSum, 4); 147 | localSum += __shfl_xor(localSum, 2); 148 | localSum += __shfl_xor(localSum, 1); 149 | 150 | return localSum; 151 | } 152 | __global__ void reduceShfl(int * g_idata,int * g_odata,unsigned int n) 153 | { 154 | //set thread ID 155 | __shared__ int smem[DIM]; 156 | unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 157 | //convert global data pointer to the 158 | 159 | int mySum=g_idata[idx]; 160 | int laneIdx=threadIdx.x%warpSize; 161 | int warpIdx=threadIdx.x/warpSize; 162 | 163 | mySum=warpReduce(mySum); 164 | 165 | if(laneIdx==0) 166 | smem[warpIdx]=mySum; 167 | __syncthreads(); 168 | mySum=(threadIdx.x 1) 189 | { 190 | blocksize = atoi(argv[1]); 191 | } 192 | dim3 block(blocksize, 1); 193 | dim3 grid((size - 1) / block.x + 1, 1); 194 | printf("grid %d block %d \n", grid.x, block.x); 195 | 196 | //allocate host memory 197 | size_t bytes = size * sizeof(int); 198 | int *idata_host = (int*)malloc(bytes); 199 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 200 | int * tmp = (int*)malloc(bytes); 201 | 202 | //initialize the array 203 | initialData_int(idata_host, size); 204 | 205 | memcpy(tmp, idata_host, bytes); 206 | double iStart, iElaps; 207 | int gpu_sum = 0; 208 | 209 | // device memory 210 | int * idata_dev = NULL; 211 | int * odata_dev = NULL; 212 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 213 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 214 | 
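// reduceShfl above does the first reduction stage entirely inside each warp:
// warpReduce() uses the pre-CUDA-9 __shfl_xor intrinsic as a butterfly
// exchange over offsets 16, 8, 4, 2 and 1, lane 0 of every warp then writes
// its partial sum into shared memory, and after a single __syncthreads() the
// per-warp partials are combined into the block's result. On CUDA 9 and newer
// the same pattern is written with __shfl_xor_sync and an explicit lane mask.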
215 | //cpu reduction 216 | int cpu_sum = 0; 217 | iStart = cpuSecond(); 218 | //cpu_sum = recursiveReduce(tmp, size); 219 | for (int i = 0; i < size; i++) 220 | cpu_sum += tmp[i]; 221 | iElaps = cpuSecond() - iStart; 222 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 223 | 224 | 225 | //kernel 1:warmup 226 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 227 | CHECK(cudaDeviceSynchronize()); 228 | iStart = cpuSecond(); 229 | warmup <<>>(idata_dev, odata_dev, size); 230 | cudaDeviceSynchronize(); 231 | iElaps = cpuSecond() - iStart; 232 | printf("gpu warmup elapsed %lf ms\n",iElaps); 233 | 234 | 235 | 236 | //reduceGmem 237 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 238 | CHECK(cudaDeviceSynchronize()); 239 | iStart = cpuSecond(); 240 | reduceGmem <<>>(idata_dev, odata_dev, size); 241 | cudaDeviceSynchronize(); 242 | iElaps = cpuSecond() - iStart; 243 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 244 | gpu_sum = 0; 245 | for (int i = 0; i < grid.x; i++) 246 | gpu_sum += odata_host[i]; 247 | printf("reduceGmem elapsed %lf ms gpu_sum: %d\n",iElaps, gpu_sum); 248 | 249 | //reduceSmem 250 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 251 | CHECK(cudaDeviceSynchronize()); 252 | iStart = cpuSecond(); 253 | reduceSmem <<>>(idata_dev, odata_dev, size); 254 | cudaDeviceSynchronize(); 255 | iElaps = cpuSecond() - iStart; 256 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 257 | gpu_sum = 0; 258 | for (int i = 0; i < grid.x; i++) 259 | gpu_sum += odata_host[i]; 260 | printf("reduceSmem elapsed %lf ms gpu_sum: %d\n",iElaps, gpu_sum); 261 | 262 | 263 | 264 | //reduceShfl 265 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 266 | CHECK(cudaDeviceSynchronize()); 267 | iStart = cpuSecond(); 268 | reduceShfl<<>>(idata_dev, odata_dev, size); 269 | cudaDeviceSynchronize(); 270 | iElaps = cpuSecond() - iStart; 271 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 272 | gpu_sum = 0; 273 | for (int i = 0; i < grid.x; i++) 274 | gpu_sum += odata_host[i]; 275 | printf("reduceShfl elapsed %lf ms gpu_sum: %d\n",iElaps, gpu_sum); 276 | 277 | 278 | free(idata_host); 279 | free(odata_host); 280 | CHECK(cudaFree(idata_dev)); 281 | CHECK(cudaFree(odata_dev)); 282 | //reset device 283 | cudaDeviceReset(); 284 | return EXIT_SUCCESS; 285 | 286 | } 287 | -------------------------------------------------------------------------------- /2_grid_block/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(grid_block grid_block.cu) 2 | -------------------------------------------------------------------------------- /2_grid_block/grid_block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int main(int argc,char ** argv) 4 | { 5 | int nElem=1024; 6 | dim3 block(1024); 7 | dim3 grid((nElem-1)/block.x+1); 8 | printf("grid.x %d block.x %d\n",grid.x,block.x); 9 | 10 | block.x=512; 11 | grid.x=(nElem-1)/block.x+1; 12 | printf("grid.x %d block.x %d\n",grid.x,block.x); 13 | 14 | block.x=256; 15 | grid.x=(nElem-1)/block.x+1; 16 | printf("grid.x %d block.x %d\n",grid.x,block.x); 17 | 18 | block.x=128; 19 | grid.x=(nElem-1)/block.x+1; 20 | printf("grid.x %d block.x %d\n",grid.x,block.x); 21 | 22 | cudaDeviceReset(); 23 | return 0; 24 | } 25 | 
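The stream examples that follow (30_stream through 38_stream_call_back) all build on the same skeleton: create several non-default streams, launch independent work into them so the GPU is free to overlap it, and time the whole batch with a pair of CUDA events. The sketch below is a minimal, self-contained illustration of that skeleton rather than code taken from those files; busy_kernel, d_out and the stream count of 4 are placeholder names and values chosen for the example.

#include <stdio.h>
#include <cuda_runtime.h>

// Placeholder workload: loops long enough for any overlap to show up on a profiler timeline.
__global__ void busy_kernel(double *out)
{
    double sum = 0.0;
    for (int i = 0; i < 100000; i++)
        sum += sin((double)i);
    out[blockIdx.x] = sum;   // write the result so the loop is not optimized away
}

int main()
{
    const int n_stream = 4;                              // placeholder stream count
    cudaStream_t stream[n_stream];
    for (int i = 0; i < n_stream; i++)
        cudaStreamCreate(&stream[i]);

    double *d_out = NULL;
    cudaMalloc((void **)&d_out, n_stream * sizeof(double));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    for (int i = 0; i < n_stream; i++)
        busy_kernel<<<1, 1, 0, stream[i]>>>(d_out + i);  // one small kernel per stream, so they can overlap
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float elapsed_ms = 0.0f;
    cudaEventElapsedTime(&elapsed_ms, start, stop);
    printf("elapsed time: %f ms\n", elapsed_ms);

    for (int i = 0; i < n_stream; i++)
        cudaStreamDestroy(stream[i]);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_out);
    cudaDeviceReset();
    return 0;
}

Whether the launches really overlap depends on the device (Hyper-Q capable GPUs run them concurrently), which is exactly what the following examples are written to demonstrate on a profiler timeline.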
-------------------------------------------------------------------------------- /30_stream/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(stream stream.cu) 2 | -------------------------------------------------------------------------------- /30_stream/stream.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | #define N 300000 6 | __global__ void kernel_1() 7 | { 8 | double sum=0.0; 9 | for(int i=0;i>>(); 50 | kernel_2<<>>(); 51 | kernel_3<<>>(); 52 | kernel_4<<>>(); 53 | } 54 | cudaEventRecord(stop,0); 55 | CHECK(cudaEventSynchronize(stop)); 56 | float elapsed_time; 57 | cudaEventElapsedTime(&elapsed_time,start,stop); 58 | printf("elapsed time:%f ms\n",elapsed_time); 59 | 60 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #include 5 | #define N 300000 6 | __global__ void kernel_1() 7 | { 8 | double sum=0.0; 9 | for(int i=0;i>>(); 49 | kernel_2<<>>(); 50 | kernel_3<<>>(); 51 | kernel_4<<>>(); 52 | } 53 | cudaEventRecord(stop,0); 54 | CHECK(cudaEventSynchronize(stop)); 55 | float elapsed_time; 56 | cudaEventElapsedTime(&elapsed_time,start,stop); 57 | printf("elapsed time:%f ms\n",elapsed_time); 58 | 59 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N 100 5 | __global__ void kernel_1() 6 | { 7 | double sum=0.0; 8 | for(int i=0;i>>(); 47 | kernel_2<<>>(); 48 | kernel_3<<>>(); 49 | kernel_4<<>>(); 50 | } 51 | cudaEventRecord(stop); 52 | CHECK(cudaEventSynchronize(stop)); 53 | float elapsed_time; 54 | cudaEventElapsedTime(&elapsed_time,start,stop); 55 | printf("elapsed time:%f ms\n",elapsed_time); 56 | 57 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N 300000 5 | __global__ void kernel_1() 6 | { 7 | double sum=0.0; 8 | for(int i=0;i>>(); 46 | kernel_2<<>>(); 47 | kernel_3<<>>(); 48 | kernel_4<<>>(); 49 | } 50 | cudaEventRecord(stop); 51 | CHECK(cudaEventSynchronize(stop)); 52 | float elapsed_time; 53 | cudaEventElapsedTime(&elapsed_time,start,stop); 54 | 55 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N 300000 5 | __global__ void kernel_1() 6 | { 7 | double sum=0.0; 8 | for(int i=0;i>>(); 51 | kernel_2<<>>(); 52 | kernel_3<<>>(); 53 | kernel_4<<>>(); 54 | cudaEventRecord(event[i],stream[i]); 55 | cudaStreamWaitEvent(stream[n_stream-1],event[i],0); 56 | } 57 | cudaEventRecord(stop); 58 | CHECK(cudaEventSynchronize(stop)); 59 | float elapsed_time; 60 | cudaEventElapsedTime(&elapsed_time,start,stop); 61 | 62 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 4 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 76 | CHECK(cudaMemcpyAsync(&res_from_gpu_h[ioffset],&res_d[ioffset],nByte/N_SEGMENT,cudaMemcpyDeviceToHost,stream[i])); 77 | } 78 | //timer 79 | CHECK(cudaEventRecord(stop, 0)); 80 | CHECK(cudaEventSynchronize(stop)); 81 | iElaps=cpuSecond()-iStart; 82 | printf("Asynchronous Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 83 | checkResult(res_h,res_from_gpu_h,nElem); 84 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 4 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 80 | } 81 | for(int i=0;i>> Time elapsed %f 
sec\n",grid.x,block.x,iElaps); 91 | checkResult(res_h,res_from_gpu_h,nElem); 92 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 1 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 76 | CHECK(cudaMemcpyAsync(&res_from_gpu_h[ioffset],&res_d[ioffset],nByte/N_SEGMENT,cudaMemcpyDeviceToHost,stream[i])); 77 | } 78 | //timer 79 | CHECK(cudaEventRecord(stop, 0)); 80 | int counter=0; 81 | while (cudaEventQuery(stop)==cudaErrorNotReady) 82 | { 83 | counter++; 84 | } 85 | printf("cpu counter:%d\n",counter); 86 | iElaps=cpuSecond()-iStart; 87 | printf("Asynchronous Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 88 | checkResult(res_h,res_from_gpu_h,nElem); 89 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 16 6 | void CUDART_CB my_callback(cudaStream_t stream,cudaError_t status,void * data) 7 | { 8 | printf("call back from stream:%d\n",*((int *)data)); 9 | } 10 | void sumArrays(float * a,float * b,float * res,const int size) 11 | { 12 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 79 | CHECK(cudaMemcpyAsync(&res_from_gpu_h[ioffset],&res_d[ioffset],nByte/N_SEGMENT,cudaMemcpyDeviceToHost,stream[i])); 80 | CHECK(cudaStreamAddCallback(stream[i],my_callback,(void *)(stream+i),0)); 81 | } 82 | //timer 83 | CHECK(cudaEventRecord(stop, 0)); 84 | int counter=0; 85 | while (cudaEventQuery(stop)==cudaErrorNotReady) 86 | { 87 | counter++; 88 | } 89 | printf("cpu counter:%d\n",counter); 90 | iElaps=cpuSecond()-iStart; 91 | printf("Asynchronous Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 92 | checkResult(res_h,res_from_gpu_h,nElem); 93 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i>>(a_d,b_d,res_d); 51 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 52 | 53 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 54 | sumArrays(a_h,b_h,res_h,nElem); 55 | 56 | checkResult(res_h,res_from_gpu_h,nElem); 57 | cudaFree(a_d); 58 | cudaFree(b_d); 59 | cudaFree(res_d); 60 | 61 | free(a_h); 62 | free(b_h); 63 | free(res_h); 64 | free(res_from_gpu_h); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /4_sum_arrays_timer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_arrays_timer sum_arrays_timer.cu) 2 | -------------------------------------------------------------------------------- /4_sum_arrays_timer/sum_arrays_timer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(a_d,b_d,res_d,nElem); 56 | 57 | 58 | 59 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 60 | iElaps=cpuSecond()-iStart; 61 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 62 | sumArrays(a_h,b_h,res_h,nElem); 63 | 64 | checkResult(res_h,res_from_gpu_h,nElem); 65 | cudaFree(a_d); 66 | cudaFree(b_d); 67 | cudaFree(res_d); 68 | 69 | free(a_h); 70 | free(b_h); 71 | free(res_h); 72 | free(res_from_gpu_h); 73 | 74 | return 0; 75 | } 76 | 
-------------------------------------------------------------------------------- /5_thread_index/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(thread_index thread_index.cu) 2 | -------------------------------------------------------------------------------- /5_thread_index/thread_index.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | __global__ void printThreadIndex(float *A,const int nx,const int ny) 6 | { 7 | int ix=threadIdx.x+blockIdx.x*blockDim.x; 8 | int iy=threadIdx.y+blockIdx.y*blockDim.y; 9 | unsigned int idx=iy*nx+ix; 10 | printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)" 11 | "global index %2d ival %f\n",threadIdx.x,threadIdx.y, 12 | blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]); 13 | } 14 | int main(int argc,char** argv) 15 | { 16 | initDevice(0); 17 | int nx=8,ny=6; 18 | int nxy=nx*ny; 19 | int nBytes=nxy*sizeof(float); 20 | 21 | //Malloc 22 | float* A_host=(float*)malloc(nBytes); 23 | initialData(A_host,nxy); 24 | printMatrix(A_host,nx,ny); 25 | 26 | //cudaMalloc 27 | float *A_dev=NULL; 28 | CHECK(cudaMalloc((void**)&A_dev,nBytes)); 29 | 30 | cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice); 31 | 32 | dim3 block(4,2); 33 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 34 | 35 | printThreadIndex<<>>(A_dev,nx,ny); 36 | 37 | CHECK(cudaDeviceSynchronize()); 38 | cudaFree(A_dev); 39 | free(A_host); 40 | 41 | cudaDeviceReset(); 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /6_sum_matrix/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_matrix sum_matrix.cu) 2 | -------------------------------------------------------------------------------- /6_sum_matrix/sum_matrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 5 | { 6 | float * a=MatA; 7 | float * b=MatB; 8 | float * c=MatC; 9 | for(int j=0;j>>(A_dev,B_dev,C_dev,nx,ny); 75 | CHECK(cudaDeviceSynchronize()); 76 | iElaps=cpuSecond()-iStart; 77 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 78 | grid_0.x,grid_0.y,block_0.x,block_0.y,iElaps); 79 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 80 | checkResult(C_host,C_from_gpu,nxy); 81 | // 1d block and 1d grid 82 | dimx=32; 83 | dim3 block_1(dimx); 84 | dim3 grid_1((nxy-1)/block_1.x+1); 85 | iStart=cpuSecond(); 86 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx*ny ,1); 87 | CHECK(cudaDeviceSynchronize()); 88 | iElaps=cpuSecond()-iStart; 89 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 90 | grid_1.x,grid_1.y,block_1.x,block_1.y,iElaps); 91 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 92 | checkResult(C_host,C_from_gpu,nxy); 93 | // 2d block and 1d grid 94 | dimx=32; 95 | dim3 block_2(dimx); 96 | dim3 grid_2((nx-1)/block_2.x+1,ny); 97 | iStart=cpuSecond(); 98 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx,ny); 99 | CHECK(cudaDeviceSynchronize()); 100 | iElaps=cpuSecond()-iStart; 101 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 102 | grid_2.x,grid_2.y,block_2.x,block_2.y,iElaps); 103 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 104 | 
checkResult(C_host,C_from_gpu,nxy); 105 | 106 | 107 | cudaFree(A_dev); 108 | cudaFree(B_dev); 109 | cudaFree(C_dev); 110 | free(A_host); 111 | free(B_host); 112 | free(C_host); 113 | free(C_from_gpu); 114 | cudaDeviceReset(); 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /7_device_information/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(device_information device_information.cu) 2 | -------------------------------------------------------------------------------- /7_device_information/device_information.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc,char** argv) 5 | { 6 | printf("%s Starting ...\n",argv[0]); 7 | int deviceCount = 0; 8 | cudaError_t error_id = cudaGetDeviceCount(&deviceCount); 9 | if(error_id!=cudaSuccess) 10 | { 11 | printf("cudaGetDeviceCount returned %d\n ->%s\n", 12 | (int)error_id,cudaGetErrorString(error_id)); 13 | printf("Result = FAIL\n"); 14 | exit(EXIT_FAILURE); 15 | } 16 | if(deviceCount==0) 17 | { 18 | printf("There are no available device(s) that support CUDA\n"); 19 | } 20 | else 21 | { 22 | printf("Detected %d CUDA Capable device(s)\n",deviceCount); 23 | } 24 | int dev=0,driverVersion=0,runtimeVersion=0; 25 | cudaSetDevice(dev); 26 | cudaDeviceProp deviceProp; 27 | cudaGetDeviceProperties(&deviceProp,dev); 28 | printf("Device %d:\"%s\"\n",dev,deviceProp.name); 29 | cudaDriverGetVersion(&driverVersion); 30 | cudaRuntimeGetVersion(&runtimeVersion); 31 | printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", 32 | driverVersion/1000,(driverVersion%100)/10, 33 | runtimeVersion/1000,(runtimeVersion%100)/10); 34 | printf(" CUDA Capability Major/Minor version number: %d.%d\n", 35 | deviceProp.major,deviceProp.minor); 36 | printf(" Total amount of global memory: %.2f GBytes (%llu bytes)\n", 37 | (float)deviceProp.totalGlobalMem/pow(1024.0,3),deviceProp.totalGlobalMem); 38 | printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", 39 | deviceProp.clockRate*1e-3f,deviceProp.clockRate*1e-6f); 40 | printf(" Memory Bus width: %d-bits\n", 41 | deviceProp.memoryBusWidth); 42 | if (deviceProp.l2CacheSize) 43 | { 44 | printf(" L2 Cache Size: %d bytes\n", 45 | deviceProp.l2CacheSize); 46 | } 47 | printf(" Max Texture Dimension Size (x,y,z) 1D=(%d),2D=(%d,%d),3D=(%d,%d,%d)\n", 48 | deviceProp.maxTexture1D,deviceProp.maxTexture2D[0],deviceProp.maxTexture2D[1] 49 | ,deviceProp.maxTexture3D[0],deviceProp.maxTexture3D[1],deviceProp.maxTexture3D[2]); 50 | printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d,2D=(%d,%d) x %d\n", 51 | deviceProp.maxTexture1DLayered[0],deviceProp.maxTexture1DLayered[1], 52 | deviceProp.maxTexture2DLayered[0],deviceProp.maxTexture2DLayered[1], 53 | deviceProp.maxTexture2DLayered[2]); 54 | printf(" Total amount of constant memory %lu bytes\n", 55 | deviceProp.totalConstMem); 56 | printf(" Total amount of shared memory per block: %lu bytes\n", 57 | deviceProp.sharedMemPerBlock); 58 | printf(" Total number of registers available per block:%d\n", 59 | deviceProp.regsPerBlock); 60 | printf(" Wrap size: %d\n",deviceProp.warpSize); 61 | printf(" Maximun number of thread per multiprocesser: %d\n", 62 | deviceProp.maxThreadsPerMultiProcessor); 63 | printf(" Maximun number of thread per block: %d\n", 64 | deviceProp.maxThreadsPerBlock); 65 | printf(" Maximun size of each dimension of a block: %d x %d x %d\n", 66 | 
deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); 67 | printf(" Maximun size of each dimension of a grid: %d x %d x %d\n", 68 | deviceProp.maxGridSize[0], 69 | deviceProp.maxGridSize[1], 70 | deviceProp.maxGridSize[2]); 71 | printf(" Maximu memory pitch %lu bytes\n",deviceProp.memPitch); 72 | printf("----------------------------------------------------------\n"); 73 | printf("Number of multiprocessors: %d\n", deviceProp.multiProcessorCount); 74 | printf("Total amount of constant memory: %4.2f KB\n", 75 | deviceProp.totalConstMem/1024.0); 76 | printf("Total amount of shared memory per block: %4.2f KB\n", 77 | deviceProp.sharedMemPerBlock/1024.0); 78 | printf("Total number of registers available per block: %d\n", 79 | deviceProp.regsPerBlock); 80 | printf("Warp size %d\n", deviceProp.warpSize); 81 | printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); 82 | printf("Maximum number of threads per multiprocessor: %d\n", 83 | deviceProp.maxThreadsPerMultiProcessor); 84 | printf("Maximum number of warps per multiprocessor: %d\n", 85 | deviceProp.maxThreadsPerMultiProcessor/32); 86 | return EXIT_SUCCESS; 87 | 88 | } 89 | -------------------------------------------------------------------------------- /8_divergence/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(divergence divergence.cu) 2 | -------------------------------------------------------------------------------- /8_divergence/divergence.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "freshman.h" 5 | __global__ void warmup(float *c) 6 | { 7 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 8 | float a = 0.0; 9 | float b = 0.0; 10 | 11 | if ((tid/warpSize) % 2 == 0) 12 | { 13 | a = 100.0f; 14 | 15 | } 16 | else 17 | { 18 | b = 200.0f; 19 | } 20 | //printf("%d %d %f \n",tid,warpSize,a+b); 21 | c[tid] = a + b; 22 | } 23 | __global__ void mathKernel1(float *c) 24 | { 25 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 26 | 27 | float a = 0.0; 28 | float b = 0.0; 29 | if (tid % 2 == 0) 30 | { 31 | a = 100.0f; 32 | } 33 | else 34 | { 35 | b = 200.0f; 36 | } 37 | c[tid] = a + b; 38 | } 39 | 40 | __global__ void mathKernel2(float *c) 41 | { 42 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 43 | float a = 0.0; 44 | float b = 0.0; 45 | if ((tid/warpSize) % 2 == 0) 46 | { 47 | a = 100.0f; 48 | } 49 | else 50 | { 51 | b = 200.0f; 52 | } 53 | c[tid] = a + b; 54 | } 55 | __global__ void mathKernel3(float *c) 56 | { 57 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 58 | float a = 0.0; 59 | float b = 0.0; 60 | bool ipred = (tid % 2 == 0); 61 | if (ipred) 62 | { 63 | a = 100.0f; 64 | } 65 | else 66 | { 67 | b = 200.0f; 68 | } 69 | c[tid] = a + b; 70 | } 71 | 72 | int main(int argc, char **argv) 73 | { 74 | int dev = 0; 75 | cudaDeviceProp deviceProp; 76 | cudaGetDeviceProperties(&deviceProp, dev); 77 | printf("%s using Device %d: %s\n", argv[0], dev, deviceProp.name); 78 | 79 | //set up data size 80 | int size = 64; 81 | int blocksize = 64; 82 | if (argc > 1) blocksize = atoi(argv[1]); 83 | if (argc > 2) size = atoi(argv[2]); 84 | printf("Data size %d ", size); 85 | 86 | //set up execution configuration 87 | dim3 block(blocksize,1); 88 | dim3 grid((size - 1) / block.x + 1,1); 89 | printf("Execution Configure (block %d grid %d)\n", block.x, grid.x); 90 | 91 | //allocate gpu memory 92 | float * C_dev; 93 | size_t nBytes = size * 
sizeof(float); 94 | float * C_host=(float*)malloc(nBytes); 95 | cudaMalloc((float**)&C_dev, nBytes); 96 | 97 | //run a warmup kernel to remove overhead 98 | double iStart, iElaps; 99 | cudaDeviceSynchronize(); 100 | iStart = cpuSecond(); 101 | warmup<<>> (C_dev); 102 | cudaDeviceSynchronize(); 103 | iElaps = cpuSecond() - iStart; 104 | 105 | printf("warmup <<<%d,%d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 106 | 107 | //run kernel 1 108 | iStart = cpuSecond(); 109 | mathKernel1 <<< grid,block >>> (C_dev); 110 | cudaDeviceSynchronize(); 111 | iElaps = cpuSecond() - iStart; 112 | printf("mathKernel1<<<%4d,%4d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 113 | cudaMemcpy(C_host,C_dev,nBytes,cudaMemcpyDeviceToHost); 114 | //for(int i=0;i>> (C_dev); 121 | cudaDeviceSynchronize(); 122 | iElaps = cpuSecond() - iStart; 123 | printf("mathKernel2<<<%4d,%4d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 124 | 125 | //run kernel 3 126 | iStart = cpuSecond(); 127 | mathKernel3 << > > (C_dev); 128 | cudaDeviceSynchronize(); 129 | iElaps = cpuSecond() - iStart; 130 | printf("mathKernel3<<<%4d,%4d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 131 | 132 | cudaFree(C_dev); 133 | free(C_host); 134 | cudaDeviceReset(); 135 | return EXIT_SUCCESS; 136 | } 137 | -------------------------------------------------------------------------------- /9_sum_matrix2D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_matrix2D sum_matrix2D.cu) 2 | -------------------------------------------------------------------------------- /9_sum_matrix2D/sum_matrix2D.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 5 | { 6 | float * a=MatA; 7 | float * b=MatB; 8 | float * c=MatC; 9 | for(int j=0;j>>(A_dev,B_dev,C_dev,nx,ny); 75 | CHECK(cudaDeviceSynchronize()); 76 | iElaps=cpuSecond()-iStart; 77 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 78 | grid_0.x,grid_0.y,block_0.x,block_0.y,iElaps); 79 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 80 | checkResult(C_host,C_from_gpu,nxy); 81 | 82 | cudaFree(A_dev); 83 | cudaFree(B_dev); 84 | cudaFree(C_dev); 85 | free(A_host); 86 | free(B_host); 87 | free(C_host); 88 | free(C_from_gpu); 89 | cudaDeviceReset(); 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9 FATAL_ERROR) 2 | Project(CUDA_Freshman CXX C CUDA) 3 | set(CMAKE_CUDA_FLAGS "-arch=compute_35 -g -G -O3") 4 | include_directories(./include) 5 | add_subdirectory(0_hello_world) 6 | add_subdirectory(1_check_dimension) 7 | add_subdirectory(2_grid_block) 8 | add_subdirectory(3_sum_arrays) 9 | add_subdirectory(4_sum_arrays_timer) 10 | add_subdirectory(5_thread_index) 11 | add_subdirectory(6_sum_matrix) 12 | add_subdirectory(7_device_information) 13 | add_subdirectory(8_divergence) 14 | add_subdirectory(9_sum_matrix2D) 15 | add_subdirectory(10_reduceInteger) 16 | add_subdirectory(11_simple_sum_matrix2D) 17 | add_subdirectory(12_reduce_unrolling) 18 | add_subdirectory(14_global_variable) 19 | add_subdirectory(15_pine_memory) 20 | add_subdirectory(16_zero_copy_memory) 21 | add_subdirectory(17_UVA) 22 | add_subdirectory(18_sum_array_offset) 23 | 
add_subdirectory(19_AoS) 24 | add_subdirectory(20_SoA) 25 | add_subdirectory(21_sum_array_offset_unrolling) 26 | add_subdirectory(22_transform_matrix2D) 27 | add_subdirectory(23_sum_array_uniform_memory) 28 | add_subdirectory(24_shared_memory_read_data) 29 | add_subdirectory(25_reduce_integer_shared_memory) 30 | add_subdirectory(26_transform_shared_memory) 31 | add_subdirectory(27_stencil_1d_constant_read_only) 32 | add_subdirectory(28_shfl_test) 33 | add_subdirectory(29_reduce_shfl) 34 | add_subdirectory(30_stream) 35 | add_subdirectory(32_stream_resource) 36 | add_subdirectory(33_stream_block) 37 | add_subdirectory(34_stream_dependence) 38 | add_subdirectory(35_multi_add_depth) 39 | add_subdirectory(36_multi_add_breadth) 40 | add_subdirectory(37_asyncAPI) 41 | add_subdirectory(38_stream_call_back) 42 | -------------------------------------------------------------------------------- /IMG_9066.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tony-Tan/CUDA_Freshman/979938216fbbd8bc81ccbc525c4dd1f8c0c9fcbb/IMG_9066.JPG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 联系我 3 | ![](./IMG_9066.JPG) 4 | ## 博客 5 | 具体内容可以访问博客: 6 | - [0.0 腾讯云CUDA环境搭建](http://www.face2ai.com/CUDA-F-0-0-Tencent-GPU-Cloud/) 7 | - [1.0 并行计算与计算机架构](http://www.face2ai.com/CUDA-F-1-0-并行计算与计算机架构/) 8 | - [1.1 异构计算与CUDA](http://www.face2ai.com/CUDA-F-1-1-异构计算-CUDA/) 9 | - [2.0 CUDA编程模型概述(一)](http://www.face2ai.com/CUDA-F-2-0-CUDA编程模型概述1/) 10 | - [2.1 CUDA编程模型概述(二)](http://www.face2ai.com/CUDA-F-2-1-CUDA编程模型概述2/) 11 | - [2.2 给核函数计时](http://www.face2ai.com/CUDA-F-2-2-核函数计时/) 12 | - [2.3 组织并行线程](http://www.face2ai.com/CUDA-F-2-3-组织并行线程/) 13 | - [2.4 设备信息查询](http://www.face2ai.com/CUDA-F-2-4-设备信息/) 14 | - [3.1 CUDA执行模型概述](http://www.face2ai.com/CUDA-F-3-1-CUDA执行模型概述/) 15 | - [3.2 理解线程束执行的本质(Part I)](http://www.face2ai.com/CUDA-F-3-2-理解线程束执行的本质-P1/) 16 | - [3.2 理解线程束执行的本质(Part II)](http://www.face2ai.com/CUDA-F-3-2-理解线程束执行的本质-P2/) 17 | - [3.3 并行性表现](http://www.face2ai.com/CUDA-F-3-3-并行性表现/) 18 | - [3.4 避免分支分化](http://www.face2ai.com/CUDA-F-3-4-避免分支分化/) 19 | - [3.5 循环展开](http://www.face2ai.com/CUDA-F-3-5-展开循环/) 20 | - [3.6 动态并行](http://www.face2ai.com/CUDA-F-3-6-动态并行/) 21 | - [4.0 全局内存](http://www.face2ai.com/CUDA-F-4-0-全局内存/) 22 | - [4.1 内存模型概述](http://www.face2ai.com/CUDA-F-4-1-内存模型概述/) 23 | - [4.2 内存管理](http://www.face2ai.com/CUDA-F-4-2-内存管理/) 24 | - [4.3 内存访问模式](http://www.face2ai.com/CUDA-F-4-3-内存访问模式/) 25 | - [4.4 核函数可达到的带宽](http://www.face2ai.com/CUDA-F-4-4-核函数可达到的带宽/) 26 | - [4.5 使用统一内存的向量加法](http://www.face2ai.com/CUDA-F-4-5-使用统一内存的向量加法/) 27 | - [5.0 共享内存和常量内存](http://www.face2ai.com/CUDA-F-5-0-共享内存和常量内存/) 28 | - [5.1 CUDA共享内存概述](http://www.face2ai.com/CUDA-F-5-1-CUDA共享内存概述/) 29 | - [5.2 共享内存的数据布局](http://www.face2ai.com/CUDA-F-5-2-共享内存的数据布局/) 30 | - [5.3 减少全局内存访问](http://www.face2ai.com/CUDA-F-5-3-减少全局内存访问/) 31 | - [5.4 合并的全局内存访问](http://www.face2ai.com/CUDA-F-5-4-合并的全局内存访问/) 32 | - [5.5 常量内存](http://www.face2ai.com/CUDA-F-5-5-常量内存/) 33 | - [5.6 线程束洗牌指令](http://www.face2ai.com/CUDA-F-5-6-线程束洗牌指令/) 34 | - [6.0 流和并发](http://www.face2ai.com/CUDA-F-6-0-流和并发/) 35 | - [6.1 流和事件概述](http://www.face2ai.com/CUDA-F-6-1-流和事件概述/) 36 | - [6.2 并发内核执行](http://www.face2ai.com/CUDA-F-6-2-并发内核执行/) 37 | - [6.3 重叠内核执行和数据传输](http://www.face2ai.com/CUDA-F-6-3-重叠内核执行和数据传输/) 38 | - [6.4 
重叠GPU和CPU的执行](http://www.face2ai.com/CUDA-F-6-4-重叠GPU和CPU的执行/) 39 | - [6.5 流回调](http://www.face2ai.com/CUDA-F-6-5-流回调/) 40 | 41 | 42 | ## CUDA Freshman 43 | 1. This project is a set of CUDA programs 44 | 2. Some of them are from the book "Professional CUDA C Programming" 45 | 3. The others are coded by myself 46 | 4. You can get more details from the website [www.face2ai.com](http://www.face2ai.com) 47 | -------------------------------------------------------------------------------- /include/freshman.h: -------------------------------------------------------------------------------- 1 | #ifndef FRESHMAN_H 2 | #define FRESHMAN_H 3 | #define CHECK(call)\ 4 | {\ 5 | const cudaError_t error=call;\ 6 | if(error!=cudaSuccess)\ 7 | {\ 8 | printf("ERROR: %s:%d,",__FILE__,__LINE__);\ 9 | printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\ 10 | exit(1);\ 11 | }\ 12 | } 13 | 14 | 15 | #include 16 | #ifdef _WIN32 17 | # include 18 | #else 19 | # include 20 | #endif 21 | #ifdef _WIN32 22 | int gettimeofday(struct timeval *tp, void *tzp) 23 | { 24 | time_t clock; 25 | struct tm tm; 26 | SYSTEMTIME wtm; 27 | GetLocalTime(&wtm); 28 | tm.tm_year = wtm.wYear - 1900; 29 | tm.tm_mon = wtm.wMonth - 1; 30 | tm.tm_mday = wtm.wDay; 31 | tm.tm_hour = wtm.wHour; 32 | tm.tm_min = wtm.wMinute; 33 | tm.tm_sec = wtm.wSecond; 34 | tm. tm_isdst = -1; 35 | clock = mktime(&tm); 36 | tp->tv_sec = clock; 37 | tp->tv_usec = wtm.wMilliseconds * 1000; 38 | return (0); 39 | } 40 | #endif 41 | double cpuSecond() 42 | { 43 | struct timeval tp; 44 | gettimeofday(&tp,NULL); 45 | return((double)tp.tv_sec+(double)tp.tv_usec*1e-6); 46 | 47 | } 48 | void initialData(float* ip,int size) 49 | { 50 | time_t t; 51 | srand((unsigned )time(&t)); 52 | for(int i=0;i:\n",ny,nx); 70 | for(int i=0;iepsilon) 96 | { 97 | printf("Results don\'t match!\n"); 98 | printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i); 99 | return; 100 | } 101 | } 102 | printf("Check result success!\n"); 103 | } 104 | #endif//FRESHMAN_H 105 | --------------------------------------------------------------------------------
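freshman.h above is included by nearly every example in this repository; the sketch below shows how its helpers CHECK, cpuSecond and initialData are typically combined, assuming the header is on the include path that the top-level CMakeLists.txt sets up with include_directories(./include). The scale kernel and the sizes are placeholders for the example.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "freshman.h"

// Placeholder kernel: double every element of the array.
__global__ void scale(float *data, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        data[idx] *= 2.0f;
}

int main()
{
    int n = 1 << 20;
    size_t nBytes = n * sizeof(float);

    float *h_data = (float *)malloc(nBytes);
    initialData(h_data, n);                       // random initialization, as in the examples above

    float *d_data = NULL;
    CHECK(cudaMalloc((void **)&d_data, nBytes));  // CHECK prints file/line and exits on any CUDA error
    CHECK(cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice));

    double iStart = cpuSecond();                  // wall-clock seconds
    scale<<<(n - 1) / 256 + 1, 256>>>(d_data, n);
    CHECK(cudaDeviceSynchronize());
    printf("kernel elapsed %f sec\n", cpuSecond() - iStart);

    CHECK(cudaMemcpy(h_data, d_data, nBytes, cudaMemcpyDeviceToHost));

    CHECK(cudaFree(d_data));
    free(h_data);
    cudaDeviceReset();
    return 0;
}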