├── .gitignore
├── 0_hello_world
│   ├── CMakeLists.txt
│   └── hello_world.cu
├── 10_reduceInteger
│   ├── CMakeLists.txt
│   └── reduceInteger.cu
├── 11_simple_sum_matrix2D
│   ├── .sudo_as_admin_successful
│   ├── CMakeLists.txt
│   └── simple_sum_matrix.cu
├── 12_reduce_unrolling
│   ├── CMakeLists.txt
│   └── reduceUnrolling.cu
├── 13_nested_hello_world
│   ├── Makefile
│   └── nested_Hello_World.cu
├── 14_global_variable
│   ├── CMakeLists.txt
│   └── global_variable.cu
├── 15_pine_memory
│   ├── CMakeLists.txt
│   └── pine_memory.cu
├── 16_zero_copy_memory
│   ├── CMakeLists.txt
│   └── zero_copy_memory.cu
├── 17_UVA
│   ├── CMakeLists.txt
│   └── UVA.cu
├── 18_sum_array_offset
│   ├── CMakeLists.txt
│   └── sum_array_offset.cu
├── 19_AoS
│   ├── AoS.cu
│   └── CMakeLists.txt
├── 1_check_dimension
│   ├── CMakeLists.txt
│   └── check_dimension.cu
├── 20_SoA
│   ├── CMakeLists.txt
│   └── SoA.cu
├── 21_sum_array_offset_unrolling
│   ├── CMakeLists.txt
│   └── sum_array_offset_unrolling.cu
├── 22_transform_matrix2D
│   ├── CMakeLists.txt
│   └── transform_matrix2D.cu
├── 23_sum_array_uniform_memory
│   ├── CMakeLists.txt
│   └── sum_arrays_uniform_memory.cu
├── 24_shared_memory_read_data
│   ├── CMakeLists.txt
│   └── shared_memory_read_data.cu
├── 25_reduce_integer_shared_memory
│   ├── CMakeLists.txt
│   └── reduce_integer_shared_memory.cu
├── 26_transform_shared_memory
│   ├── CMakeLists.txt
│   └── transform_shared_memory.cu
├── 27_stencil_1d_constant_read_only
│   ├── CMakeLists.txt
│   └── stencil_1d_constant_read_only.cu
├── 28_shfl_test
│   ├── CMakeLists.txt
│   └── shfl_test.cu
├── 29_reduce_shfl
│   ├── CMakeLists.txt
│   └── reduce_shfl.cu
├── 2_grid_block
│   ├── CMakeLists.txt
│   └── grid_block.cu
├── 30_stream
│   ├── CMakeLists.txt
│   └── stream.cu
├── 31_stream_omp
│   └── stream_omp.cu
├── 32_stream_resource
│   ├── CMakeLists.txt
│   └── stream_resource.cu
├── 33_stream_block
│   ├── CMakeLists.txt
│   └── stream_block.cu
├── 34_stream_dependence
│   ├── CMakeLists.txt
│   └── stream_dependence.cu
├── 35_multi_add_depth
│   ├── CMakeLists.txt
│   └── multi_add_depth.cu
├── 36_multi_add_breadth
│   ├── CMakeLists.txt
│   └── multi_add_breadth.cu
├── 37_asyncAPI
│   ├── CMakeLists.txt
│   └── asyncAPI.cu
├── 38_stream_call_back
│   ├── CMakeLists.txt
│   └── stream_call_back.cu
├── 3_sum_arrays
│   ├── CMakeLists.txt
│   └── sum_arrays.cu
├── 4_sum_arrays_timer
│   ├── CMakeLists.txt
│   └── sum_arrays_timer.cu
├── 5_thread_index
│   ├── CMakeLists.txt
│   └── thread_index.cu
├── 6_sum_matrix
│   ├── CMakeLists.txt
│   └── sum_matrix.cu
├── 7_device_information
│   ├── CMakeLists.txt
│   └── device_information.cu
├── 8_divergence
│   ├── CMakeLists.txt
│   └── divergence.cu
├── 9_sum_matrix2D
│   ├── CMakeLists.txt
│   └── sum_matrix2D.cu
├── CMakeLists.txt
├── IMG_9066.JPG
├── README.md
└── include
    └── freshman.h

/.gitignore:
--------------------------------------------------------------------------------
1 | /build/
2 | /cmake-build-debug/
3 | .DS_Store
4 | .vscode
--------------------------------------------------------------------------------
/0_hello_world/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(hello_world hello_world.cu)
2 | 
--------------------------------------------------------------------------------
/0_hello_world/hello_world.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | __global__ void hello_world(void)
3 | {
4 |   printf("GPU: Hello world!\n");
5 | }
6 | int main(int argc,char **argv)
7 | {
8 |   printf("CPU: Hello world!\n");
9 |   hello_world<<<1,10>>>();
10 |   cudaDeviceReset(); // without this line the program may exit before the GPU's printf output is flushed
11 |   return 0;
12 | }
13 | 
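Editor's note: the file above relies on cudaDeviceReset() to flush the device-side printf buffer before the process exits. As a minimal self-contained sketch (not a file from this repository), the same effect can be obtained with cudaDeviceSynchronize() plus an explicit error check, the same synchronization call the later examples use around their timed kernels:

#include <stdio.h>
#include <cuda_runtime.h>

__global__ void hello_world(void)
{
    // each of the 10 threads in the single block prints one line
    printf("GPU: Hello world from thread %d!\n", threadIdx.x);
}

int main(void)
{
    printf("CPU: Hello world!\n");
    hello_world<<<1, 10>>>();
    // cudaDeviceSynchronize() waits for the kernel and flushes its printf
    // output; checking its return value also surfaces launch errors.
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    return 0;
}
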
-------------------------------------------------------------------------------- /10_reduceInteger/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(reduceInteger reduceInteger.cu) 2 | -------------------------------------------------------------------------------- /10_reduceInteger/reduceInteger.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 29 | { 30 | //set thread ID 31 | unsigned int tid = threadIdx.x; 32 | //boundary check 33 | if (tid >= n) return; 34 | //convert global data pointer to the 35 | int *idata = g_idata + blockIdx.x*blockDim.x; 36 | //in-place reduction in global memory 37 | for (int stride = 1; stride < blockDim.x; stride *= 2) 38 | { 39 | if ((tid % (2 * stride)) == 0) 40 | { 41 | idata[tid] += idata[tid + stride]; 42 | } 43 | //synchronize within block 44 | __syncthreads(); 45 | } 46 | //write result for this block to global mem 47 | if (tid == 0) 48 | g_odata[blockIdx.x] = idata[0]; 49 | 50 | } 51 | __global__ void reduceNeighbored(int * g_idata,int * g_odata,unsigned int n) 52 | { 53 | //set thread ID 54 | unsigned int tid = threadIdx.x; 55 | //boundary check 56 | if (tid >= n) return; 57 | //convert global data pointer to the 58 | int *idata = g_idata + blockIdx.x*blockDim.x; 59 | //in-place reduction in global memory 60 | for (int stride = 1; stride < blockDim.x; stride *= 2) 61 | { 62 | if ((tid % (2 * stride)) == 0) 63 | { 64 | idata[tid] += idata[tid + stride]; 65 | } 66 | //synchronize within block 67 | __syncthreads(); 68 | } 69 | //write result for this block to global mem 70 | if (tid == 0) 71 | g_odata[blockIdx.x] = idata[0]; 72 | 73 | } 74 | 75 | __global__ void reduceNeighboredLess(int * g_idata,int *g_odata,unsigned int n) 76 | { 77 | unsigned int tid = threadIdx.x; 78 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 79 | // convert global data pointer to the local point of this block 80 | int *idata = g_idata + blockIdx.x*blockDim.x; 81 | if (idx > n) 82 | return; 83 | //in-place reduction in global memory 84 | for (int stride = 1; stride < blockDim.x; stride *= 2) 85 | { 86 | //convert tid into local array index 87 | int index = 2 * stride *tid; 88 | if (index < blockDim.x) 89 | { 90 | idata[index] += idata[index + stride]; 91 | } 92 | __syncthreads(); 93 | } 94 | //write result for this block to global men 95 | if (tid == 0) 96 | g_odata[blockIdx.x] = idata[0]; 97 | } 98 | 99 | __global__ void reduceInterleaved(int * g_idata, int *g_odata, unsigned int n) 100 | { 101 | unsigned int tid = threadIdx.x; 102 | unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 103 | // convert global data pointer to the local point of this block 104 | int *idata = g_idata + blockIdx.x*blockDim.x; 105 | if (idx >= n) 106 | return; 107 | //in-place reduction in global memory 108 | for (int stride = blockDim.x/2; stride >0; stride >>=1) 109 | { 110 | 
111 | if (tid 1) 134 | { 135 | blocksize = atoi(argv[1]); 136 | } 137 | dim3 block(blocksize, 1); 138 | dim3 grid((size - 1) / block.x + 1, 1); 139 | printf("grid %d block %d \n", grid.x, block.x); 140 | 141 | //allocate host memory 142 | size_t bytes = size * sizeof(int); 143 | int *idata_host = (int*)malloc(bytes); 144 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 145 | int * tmp = (int*)malloc(bytes); 146 | 147 | //initialize the array 148 | initialData_int(idata_host, size); 149 | 150 | memcpy(tmp, idata_host, bytes); 151 | double iStart, iElaps; 152 | int gpu_sum = 0; 153 | 154 | // device memory 155 | int * idata_dev = NULL; 156 | int * odata_dev = NULL; 157 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 158 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 159 | 160 | //cpu reduction 161 | int cpu_sum = 0; 162 | iStart = cpuSecond(); 163 | //cpu_sum = recursiveReduce(tmp, size); 164 | for (int i = 0; i < size; i++) 165 | cpu_sum += tmp[i]; 166 | printf("cpu sum:%d \n", cpu_sum); 167 | iElaps = cpuSecond() - iStart; 168 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 169 | 170 | 171 | //kernel 1:reduceNeighbored 172 | 173 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 174 | CHECK(cudaDeviceSynchronize()); 175 | iStart = cpuSecond(); 176 | warmup <<>>(idata_dev, odata_dev, size); 177 | cudaDeviceSynchronize(); 178 | iElaps = cpuSecond() - iStart; 179 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 180 | gpu_sum = 0; 181 | for (int i = 0; i < grid.x; i++) 182 | gpu_sum += odata_host[i]; 183 | printf("gpu warmup elapsed %lf ms gpu_sum: %d<<>>\n", 184 | iElaps, gpu_sum, grid.x, block.x); 185 | 186 | //kernel 1:reduceNeighbored 187 | 188 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 189 | CHECK(cudaDeviceSynchronize()); 190 | iStart = cpuSecond(); 191 | reduceNeighbored << > >(idata_dev, odata_dev, size); 192 | cudaDeviceSynchronize(); 193 | iElaps = cpuSecond() - iStart; 194 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 195 | gpu_sum = 0; 196 | for (int i = 0; i < grid.x; i++) 197 | gpu_sum += odata_host[i]; 198 | printf("gpu reduceNeighbored elapsed %lf ms gpu_sum: %d<<>>\n", 199 | iElaps, gpu_sum, grid.x, block.x); 200 | 201 | //kernel 2:reduceNeighboredLess 202 | 203 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 204 | CHECK(cudaDeviceSynchronize()); 205 | iStart = cpuSecond(); 206 | reduceNeighboredLess <<>>(idata_dev, odata_dev, size); 207 | cudaDeviceSynchronize(); 208 | iElaps = cpuSecond() - iStart; 209 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 210 | gpu_sum = 0; 211 | for (int i = 0; i < grid.x; i++) 212 | gpu_sum += odata_host[i]; 213 | printf("gpu reduceNeighboredLess elapsed %lf ms gpu_sum: %d<<>>\n", 214 | iElaps, gpu_sum, grid.x, block.x); 215 | 216 | //kernel 3:reduceInterleaved 217 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 218 | CHECK(cudaDeviceSynchronize()); 219 | iStart = cpuSecond(); 220 | reduceInterleaved << > >(idata_dev, odata_dev, size); 221 | cudaDeviceSynchronize(); 222 | iElaps = cpuSecond() - iStart; 223 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 224 | gpu_sum = 0; 225 | for (int i = 0; i < grid.x; i++) 226 | gpu_sum += odata_host[i]; 227 | printf("gpu reduceInterleaved elapsed %lf ms gpu_sum: %d<<>>\n", 228 | iElaps, gpu_sum, grid.x, block.x); 229 
| // free host memory 230 | 231 | free(idata_host); 232 | free(odata_host); 233 | CHECK(cudaFree(idata_dev)); 234 | CHECK(cudaFree(odata_dev)); 235 | 236 | //reset device 237 | cudaDeviceReset(); 238 | 239 | //check the results 240 | if (gpu_sum == cpu_sum) 241 | { 242 | printf("Test success!\n"); 243 | } 244 | return EXIT_SUCCESS; 245 | 246 | } 247 | -------------------------------------------------------------------------------- /11_simple_sum_matrix2D/.sudo_as_admin_successful: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tony-Tan/CUDA_Freshman/979938216fbbd8bc81ccbc525c4dd1f8c0c9fcbb/11_simple_sum_matrix2D/.sudo_as_admin_successful -------------------------------------------------------------------------------- /11_simple_sum_matrix2D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(simple_sum_matrix simple_sum_matrix.cu) 2 | -------------------------------------------------------------------------------- /11_simple_sum_matrix2D/simple_sum_matrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 5 | { 6 | float * a=MatA; 7 | float * b=MatB; 8 | float * c=MatC; 9 | for(int j=0;j2?atoi(argv[1]):32; 61 | int dimy=argc>2?atoi(argv[2]):32; 62 | 63 | double iStart,iElaps; 64 | // cpu compute 65 | iStart=cpuSecond(); 66 | sumMatrix2D_CPU(A_host,B_host,C_host,nx,ny); 67 | iElaps=cpuSecond()-iStart; 68 | printf("CPU Execution Time elapsed %f sec\n",iElaps); 69 | //warm up 70 | // 2d block and 2d grid 71 | dim3 block_0(32,32); 72 | dim3 grid_0((nx-1)/block_0.x+1,(ny-1)/block_0.y+1); 73 | iStart=cpuSecond(); 74 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx,ny); 75 | CHECK(cudaDeviceSynchronize()); 76 | printf("Warm Up \n"); 77 | 78 | // 2d block and 2d grid 79 | dim3 block(dimx,dimy); 80 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 81 | iStart=cpuSecond(); 82 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx,ny); 83 | CHECK(cudaDeviceSynchronize()); 84 | iElaps=cpuSecond()-iStart; 85 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 86 | grid.x,grid.y,block.x,block.y,iElaps); 87 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 88 | 89 | checkResult(C_host,C_from_gpu,nxy); 90 | 91 | cudaFree(A_dev); 92 | cudaFree(B_dev); 93 | cudaFree(C_dev); 94 | free(A_host); 95 | free(B_host); 96 | free(C_host); 97 | free(C_from_gpu); 98 | cudaDeviceReset(); 99 | return 0; 100 | } 101 | -------------------------------------------------------------------------------- /12_reduce_unrolling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(reduceUnrolling reduceUnrolling.cu) 2 | -------------------------------------------------------------------------------- /12_reduce_unrolling/reduceUnrolling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | int recursiveReduce(int *data, int const size) 5 | { 6 | // terminate check 7 | if (size == 1) return data[0]; 8 | // renew the stride 9 | int const stride = size / 2; 10 | if (size % 2 == 1) 11 | { 12 | for (int i = 0; i < stride; i++) 13 | { 14 | data[i] += data[i + stride]; 15 | } 16 | data[0] += data[size - 1]; 17 | } 18 | else 19 | { 20 | for (int i = 0; i < stride; i++) 21 | { 22 | 
data[i] += data[i + stride]; 23 | } 24 | } 25 | // call 26 | return recursiveReduce(data, stride); 27 | } 28 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 29 | { 30 | //set thread ID 31 | unsigned int tid = threadIdx.x; 32 | //boundary check 33 | if (tid >= n) return; 34 | //convert global data pointer to the 35 | int *idata = g_idata + blockIdx.x*blockDim.x; 36 | //in-place reduction in global memory 37 | for (int stride = 1; stride < blockDim.x; stride *= 2) 38 | { 39 | if ((tid % (2 * stride)) == 0) 40 | { 41 | idata[tid] += idata[tid + stride]; 42 | } 43 | //synchronize within block 44 | __syncthreads(); 45 | } 46 | //write result for this block to global mem 47 | if (tid == 0) 48 | g_odata[blockIdx.x] = idata[0]; 49 | 50 | } 51 | __global__ void reduceUnroll2(int * g_idata,int * g_odata,unsigned int n) 52 | { 53 | //set thread ID 54 | unsigned int tid = threadIdx.x; 55 | unsigned int idx = blockDim.x*blockIdx.x*2+threadIdx.x; 56 | //boundary check 57 | if (tid >= n) return; 58 | //convert global data pointer to the 59 | int *idata = g_idata + blockIdx.x*blockDim.x*2; 60 | if(idx+blockDim.x0 ; stride >>=1) 68 | { 69 | if (tid = n) return; 91 | //convert global data pointer to the 92 | int *idata = g_idata + blockIdx.x*blockDim.x*4; 93 | if(idx+blockDim.x0 ; stride >>=1) 102 | { 103 | if (tid = n) return; 125 | //convert global data pointer to the 126 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 127 | if(idx+blockDim.x0 ; stride >>=1) 141 | { 142 | if (tid = n) return; 164 | //convert global data pointer to the 165 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 166 | //unrolling 8; 167 | if(idx+7 * blockDim.x32; stride >>=1) 183 | { 184 | if (tid = n) return; 218 | //convert global data pointer to the 219 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 220 | if(idx+7 * blockDim.x=1024 && tid <512) 236 | idata[tid]+=idata[tid+512]; 237 | __syncthreads(); 238 | if(blockDim.x>=512 && tid <256) 239 | idata[tid]+=idata[tid+256]; 240 | __syncthreads(); 241 | if(blockDim.x>=256 && tid <128) 242 | idata[tid]+=idata[tid+128]; 243 | __syncthreads(); 244 | if(blockDim.x>=128 && tid <64) 245 | idata[tid]+=idata[tid+64]; 246 | __syncthreads(); 247 | //write result for this block to global mem 248 | if(tid<32) 249 | { 250 | volatile int *vmem = idata; 251 | vmem[tid]+=vmem[tid+32]; 252 | vmem[tid]+=vmem[tid+16]; 253 | vmem[tid]+=vmem[tid+8]; 254 | vmem[tid]+=vmem[tid+4]; 255 | vmem[tid]+=vmem[tid+2]; 256 | vmem[tid]+=vmem[tid+1]; 257 | 258 | } 259 | 260 | if (tid == 0) 261 | g_odata[blockIdx.x] = idata[0]; 262 | 263 | } 264 | // 265 | // 266 | // 267 | template 268 | __global__ void reduceCompleteUnroll(int * g_idata,int * g_odata,unsigned int n) 269 | { 270 | //set thread ID 271 | unsigned int tid = threadIdx.x; 272 | unsigned int idx = blockDim.x*blockIdx.x*8+threadIdx.x; 273 | //boundary check 274 | if (tid >= n) return; 275 | //convert global data pointer to the 276 | int *idata = g_idata + blockIdx.x*blockDim.x*8; 277 | if(idx+7 * blockDim.x=1024 && tid <512) 293 | idata[tid]+=idata[tid+512]; 294 | __syncthreads(); 295 | if(iBlockSize>=512 && tid <256) 296 | idata[tid]+=idata[tid+256]; 297 | __syncthreads(); 298 | if(iBlockSize>=256 && tid <128) 299 | idata[tid]+=idata[tid+128]; 300 | __syncthreads(); 301 | if(iBlockSize>=128 && tid <64) 302 | idata[tid]+=idata[tid+64]; 303 | __syncthreads(); 304 | //write result for this block to global mem 305 | if(tid<32) 306 | { 307 | volatile int *vmem = idata; 308 | vmem[tid]+=vmem[tid+32]; 309 | vmem[tid]+=vmem[tid+16]; 
310 | vmem[tid]+=vmem[tid+8]; 311 | vmem[tid]+=vmem[tid+4]; 312 | vmem[tid]+=vmem[tid+2]; 313 | vmem[tid]+=vmem[tid+1]; 314 | 315 | } 316 | 317 | if (tid == 0) 318 | g_odata[blockIdx.x] = idata[0]; 319 | 320 | } 321 | 322 | int main(int argc,char** argv) 323 | { 324 | initDevice(0); 325 | 326 | bool bResult = false; 327 | //initialization 328 | 329 | int size = 1 << 24; 330 | printf(" with array size %d ", size); 331 | 332 | //execution configuration 333 | int blocksize = 1024; 334 | if (argc > 1) 335 | { 336 | blocksize = atoi(argv[1]); 337 | } 338 | dim3 block(blocksize, 1); 339 | dim3 grid((size - 1) / block.x + 1, 1); 340 | printf("grid %d block %d \n", grid.x, block.x); 341 | 342 | //allocate host memory 343 | size_t bytes = size * sizeof(int); 344 | int *idata_host = (int*)malloc(bytes); 345 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 346 | int * tmp = (int*)malloc(bytes); 347 | 348 | //initialize the array 349 | initialData_int(idata_host, size); 350 | 351 | memcpy(tmp, idata_host, bytes); 352 | double iStart, iElaps; 353 | int gpu_sum = 0; 354 | 355 | // device memory 356 | int * idata_dev = NULL; 357 | int * odata_dev = NULL; 358 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 359 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 360 | 361 | //cpu reduction 362 | int cpu_sum = 0; 363 | iStart = cpuSecond(); 364 | //cpu_sum = recursiveReduce(tmp, size); 365 | for (int i = 0; i < size; i++) 366 | cpu_sum += tmp[i]; 367 | printf("cpu sum:%d \n", cpu_sum); 368 | iElaps = cpuSecond() - iStart; 369 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 370 | 371 | 372 | //kernel 1:warmup 373 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 374 | CHECK(cudaDeviceSynchronize()); 375 | iStart = cpuSecond(); 376 | warmup <<>>(idata_dev, odata_dev, size); 377 | cudaDeviceSynchronize(); 378 | iElaps = cpuSecond() - iStart; 379 | printf("gpu warmup elapsed %lf ms \n",iElaps); 380 | 381 | 382 | //kernel 1:reduceUnrolling2 383 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 384 | CHECK(cudaDeviceSynchronize()); 385 | iStart = cpuSecond(); 386 | reduceUnroll2 <<>>(idata_dev, odata_dev, size); 387 | cudaDeviceSynchronize(); 388 | iElaps = cpuSecond() - iStart; 389 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 390 | gpu_sum = 0; 391 | for (int i = 0; i < grid.x/2; i++) 392 | gpu_sum += odata_host[i]; 393 | printf("reduceUnrolling2 elapsed %lf ms gpu_sum: %d<<>>\n", 394 | iElaps, gpu_sum, grid.x/2, block.x); 395 | 396 | //kernel 1.1:reduceUnrolling4 397 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 398 | CHECK(cudaDeviceSynchronize()); 399 | iStart = cpuSecond(); 400 | reduceUnroll4 <<>>(idata_dev, odata_dev, size); 401 | cudaDeviceSynchronize(); 402 | iElaps = cpuSecond() - iStart; 403 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 404 | gpu_sum = 0; 405 | for (int i = 0; i < grid.x/4; i++) 406 | gpu_sum += odata_host[i]; 407 | printf("reduceUnrolling4 elapsed %lf ms gpu_sum: %d<<>>\n", 408 | iElaps, gpu_sum, grid.x/4, block.x); 409 | //kernel 1.2:reduceUnrolling8 410 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 411 | CHECK(cudaDeviceSynchronize()); 412 | iStart = cpuSecond(); 413 | reduceUnroll8 <<>>(idata_dev, odata_dev, size); 414 | cudaDeviceSynchronize(); 415 | iElaps = cpuSecond() - iStart; 416 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), 
cudaMemcpyDeviceToHost); 417 | gpu_sum = 0; 418 | for (int i = 0; i < grid.x/8; i++) 419 | gpu_sum += odata_host[i]; 420 | printf("reduceUnrolling8 elapsed %lf ms gpu_sum: %d<<>>\n", 421 | iElaps, gpu_sum, grid.x/8, block.x); 422 | 423 | //kernel 2:reduceUnrollingWarp8 424 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 425 | CHECK(cudaDeviceSynchronize()); 426 | iStart = cpuSecond(); 427 | reduceUnrollWarp8<<>>(idata_dev, odata_dev, size); 428 | cudaDeviceSynchronize(); 429 | iElaps = cpuSecond() - iStart; 430 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 431 | gpu_sum = 0; 432 | for (int i = 0; i < grid.x/8; i++) 433 | gpu_sum += odata_host[i]; 434 | printf("reduceUnrollingWarp8 elapsed %lf ms gpu_sum: %d<<>>\n", 435 | iElaps, gpu_sum, grid.x/8, block.x); 436 | 437 | 438 | //kernel 3:reduceCompleteUnrollWarp8 439 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 440 | CHECK(cudaDeviceSynchronize()); 441 | iStart = cpuSecond(); 442 | reduceCompleteUnrollWarp8 <<>>(idata_dev, odata_dev, size); 443 | cudaDeviceSynchronize(); 444 | iElaps = cpuSecond() - iStart; 445 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 446 | gpu_sum = 0; 447 | for (int i = 0; i < grid.x/8; i++) 448 | gpu_sum += odata_host[i]; 449 | printf("reduceCompleteUnrollWarp8 elapsed %lf ms gpu_sum: %d<<>>\n", 450 | iElaps, gpu_sum, grid.x/8, block.x); 451 | 452 | 453 | //kernel 4:reduceCompleteUnroll 454 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 455 | CHECK(cudaDeviceSynchronize()); 456 | iStart = cpuSecond(); 457 | switch(blocksize) 458 | { 459 | case 1024: 460 | reduceCompleteUnroll <1024><< > >(idata_dev, odata_dev, size); 461 | break; 462 | case 512: 463 | reduceCompleteUnroll <512><< > >(idata_dev, odata_dev, size); 464 | break; 465 | case 256: 466 | reduceCompleteUnroll <256><< > >(idata_dev, odata_dev, size); 467 | break; 468 | case 128: 469 | reduceCompleteUnroll <128><< > >(idata_dev, odata_dev, size); 470 | break; 471 | } 472 | cudaDeviceSynchronize(); 473 | iElaps = cpuSecond() - iStart; 474 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 475 | gpu_sum = 0; 476 | for (int i = 0; i < grid.x/8; i++) 477 | gpu_sum += odata_host[i]; 478 | printf("reduceCompleteUnroll elapsed %lf ms gpu_sum: %d<<>>\n", 479 | iElaps, gpu_sum, grid.x/8, block.x); 480 | // free host memory 481 | 482 | free(idata_host); 483 | free(odata_host); 484 | CHECK(cudaFree(idata_dev)); 485 | CHECK(cudaFree(odata_dev)); 486 | 487 | //reset device 488 | cudaDeviceReset(); 489 | 490 | //check the results 491 | if (gpu_sum == cpu_sum) 492 | { 493 | printf("Test success!\n"); 494 | } 495 | return EXIT_SUCCESS; 496 | 497 | } 498 | -------------------------------------------------------------------------------- /13_nested_hello_world/Makefile: -------------------------------------------------------------------------------- 1 | nested_Hello_World: 2 | nvcc -arch=sm_35 nested_Hello_World.cu -o nested_Hello_World -lcudadevrt --relocatable-device-code true 3 | clean: 4 | rm nested_Hello_World 5 | -------------------------------------------------------------------------------- /13_nested_hello_world/nested_Hello_World.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | __global__ void nesthelloworld(int iSize,int iDepth) 4 | { 5 | unsigned int tid=threadIdx.x; 6 | printf("depth : %d blockIdx: %d,threadIdx: 
%d\n",iDepth,blockIdx.x,threadIdx.x); 7 | if (iSize==1) 8 | return; 9 | int nthread=(iSize>>1); 10 | if (tid==0 && nthread>0) 11 | { 12 | nesthelloworld<<<1,nthread>>>(nthread,++iDepth); 13 | printf("-----------> nested execution depth: %d\n",iDepth); 14 | } 15 | 16 | } 17 | 18 | int main(int argc,char* argv[]) 19 | { 20 | int size=64; 21 | int block_x=2; 22 | dim3 block(block_x,1); 23 | dim3 grid((size-1)/block.x+1,1); 24 | nesthelloworld<<>>(size,0); 25 | cudaGetLastError(); 26 | cudaDeviceReset(); 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /14_global_variable/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(global_variable global_variable.cu) 2 | -------------------------------------------------------------------------------- /14_global_variable/global_variable.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | __device__ float devData; 4 | __global__ void checkGlobalVariable() 5 | { 6 | printf("Device: The value of the global variable is %f\n",devData); 7 | devData+=2.0; 8 | } 9 | int main() 10 | { 11 | float value=3.14f; 12 | cudaMemcpyToSymbol(devData,&value,sizeof(float)); 13 | printf("Host: copy %f to the global variable\n",value); 14 | checkGlobalVariable<<<1,1>>>(); 15 | cudaMemcpyFromSymbol(&value,devData,sizeof(float)); 16 | printf("Host: the value changed by the kernel to %f \n",value); 17 | cudaDeviceReset(); 18 | return EXIT_SUCCESS; 19 | } 20 | -------------------------------------------------------------------------------- /15_pine_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(pine_memory pine_memory.cu) 2 | -------------------------------------------------------------------------------- /15_pine_memory/pine_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i>>(a_d,b_d,res_d); 51 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 52 | 53 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 54 | sumArrays(a_h,b_h,res_h,nElem); 55 | 56 | checkResult(res_h,res_from_gpu_h,nElem); 57 | cudaFreeHost(a_d); 58 | cudaFreeHost(b_d); 59 | cudaFreeHost(res_d); 60 | 61 | free(a_h); 62 | free(b_h); 63 | free(res_h); 64 | free(res_from_gpu_h); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /16_zero_copy_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(zero_copy_memory zero_copy_memory.cu) 2 | -------------------------------------------------------------------------------- /16_zero_copy_memory/zero_copy_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i=2) 27 | power=atoi(argv[1]); 28 | int nElem=1<>>(a_dev,b_dev,res_d); 53 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 54 | iElaps = cpuSecond() - iStart; 55 | //=============================================================// 56 | printf("zero copy memory elapsed %lf ms \n", iElaps); 57 | 
printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 58 | //-----------------------normal memory--------------------------- 59 | float *a_h_n=(float*)malloc(nByte); 60 | float *b_h_n=(float*)malloc(nByte); 61 | float *res_h_n=(float*)malloc(nByte); 62 | float *res_from_gpu_h_n=(float*)malloc(nByte); 63 | memset(res_h_n,0,nByte); 64 | memset(res_from_gpu_h_n,0,nByte); 65 | 66 | float *a_d_n,*b_d_n,*res_d_n; 67 | CHECK(cudaMalloc((float**)&a_d_n,nByte)); 68 | CHECK(cudaMalloc((float**)&b_d_n,nByte)); 69 | CHECK(cudaMalloc((float**)&res_d_n,nByte)); 70 | 71 | initialData(a_h_n,nElem); 72 | initialData(b_h_n,nElem); 73 | //=============================================================// 74 | iStart = cpuSecond(); 75 | CHECK(cudaMemcpy(a_d_n,a_h_n,nByte,cudaMemcpyHostToDevice)); 76 | CHECK(cudaMemcpy(b_d_n,b_h_n,nByte,cudaMemcpyHostToDevice)); 77 | sumArraysGPU<<>>(a_d_n,b_d_n,res_d_n); 78 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 79 | iElaps = cpuSecond() - iStart; 80 | //=============================================================// 81 | printf("device memory elapsed %lf ms \n", iElaps); 82 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 83 | //-------------------------------------------------------------------- 84 | 85 | sumArrays(a_host,b_host,res_h,nElem); 86 | checkResult(res_h,res_from_gpu_h,nElem); 87 | 88 | cudaFreeHost(a_host); 89 | cudaFreeHost(b_host); 90 | cudaFree(res_d); 91 | free(res_h); 92 | free(res_from_gpu_h); 93 | 94 | cudaFree(a_d_n); 95 | cudaFree(b_d_n); 96 | cudaFree(res_d_n); 97 | 98 | free(a_h_n); 99 | free(b_h_n); 100 | free(res_h_n); 101 | free(res_from_gpu_h_n); 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /17_UVA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(UVA UVA.cu) 2 | -------------------------------------------------------------------------------- /17_UVA/UVA.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i>>(a_host,b_host,res_d); 46 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 47 | 48 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 49 | sumArrays(a_host,b_host,res_h,nElem); 50 | 51 | checkResult(res_h,res_from_gpu_h,nElem); 52 | cudaFreeHost(a_host); 53 | cudaFreeHost(b_host); 54 | cudaFree(res_d); 55 | 56 | free(res_h); 57 | free(res_from_gpu_h); 58 | 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /18_sum_array_offset/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_array_offset sum_array_offset.cu) 2 | -------------------------------------------------------------------------------- /18_sum_array_offset/sum_array_offset.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,int offset,const int size) 7 | { 8 | 9 | for(int i=0,k=offset;k=2) 31 | offset=atoi(argv[1]); 32 | printf("Vector size:%d\n",nElem); 33 | int nByte=sizeof(float)*nElem; 34 | float *a_h=(float*)malloc(nByte); 35 | float *b_h=(float*)malloc(nByte); 36 | float *res_h=(float*)malloc(nByte); 37 | float 
*res_from_gpu_h=(float*)malloc(nByte); 38 | memset(res_h,0,nByte); 39 | memset(res_from_gpu_h,0,nByte); 40 | 41 | float *a_d,*b_d,*res_d; 42 | CHECK(cudaMalloc((float**)&a_d,nByte)); 43 | CHECK(cudaMalloc((float**)&b_d,nByte)); 44 | CHECK(cudaMalloc((float**)&res_d,nByte)); 45 | CHECK(cudaMemset(res_d,0,nByte)); 46 | initialData(a_h,nElem); 47 | initialData(b_h,nElem); 48 | 49 | CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice)); 50 | CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice)); 51 | 52 | dim3 block(1024); 53 | dim3 grid(nElem/block.x); 54 | double iStart,iElaps; 55 | iStart=cpuSecond(); 56 | sumArraysGPU<<>>(a_d,b_d,res_d,offset,nElem); 57 | cudaDeviceSynchronize(); 58 | iElaps=cpuSecond()-iStart; 59 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 60 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec --offset:%d \n",grid.x,block.x,iElaps,offset); 61 | 62 | 63 | sumArrays(a_h,b_h,res_h,offset,nElem); 64 | 65 | checkResult(res_h,res_from_gpu_h,nElem); 66 | cudaFree(a_d); 67 | cudaFree(b_d); 68 | cudaFree(res_d); 69 | 70 | free(a_h); 71 | free(b_h); 72 | free(res_h); 73 | free(res_from_gpu_h); 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /19_AoS/AoS.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | struct naiveStruct{ 6 | float a; 7 | float b; 8 | }; 9 | void sumArrays(float * a,float * b,float * res,const int size) 10 | { 11 | 12 | for(int i=0;i=2) 43 | offset=atoi(argv[1]); 44 | printf("Vector size:%d\n",nElem); 45 | int nByte=sizeof(float)*nElem; 46 | int nByte_struct=sizeof(struct naiveStruct)*nElem; 47 | float *a_h=(float*)malloc(nByte); 48 | float *b_h=(float*)malloc(nByte); 49 | float *res_h=(float*)malloc(nByte_struct); 50 | struct naiveStruct *res_from_gpu_h=(struct naiveStruct*)malloc(nByte_struct); 51 | memset(res_h,0,nByte); 52 | memset(res_from_gpu_h,0,nByte); 53 | 54 | float *a_d,*b_d; 55 | struct naiveStruct* res_d; 56 | CHECK(cudaMalloc((float**)&a_d,nByte)); 57 | CHECK(cudaMalloc((float**)&b_d,nByte)); 58 | CHECK(cudaMalloc((struct naiveStruct**)&res_d,nByte_struct)); 59 | CHECK(cudaMemset(res_d,0,nByte_struct)); 60 | initialData(a_h,nElem); 61 | initialData(b_h,nElem); 62 | 63 | CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice)); 64 | CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice)); 65 | 66 | dim3 block(1024); 67 | dim3 grid(nElem/block.x); 68 | double iStart,iElaps; 69 | iStart=cpuSecond(); 70 | sumArraysGPU<<>>(a_d,b_d,res_d,nElem); 71 | cudaDeviceSynchronize(); 72 | iElaps=cpuSecond()-iStart; 73 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte_struct,cudaMemcpyDeviceToHost)); 74 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 75 | 76 | 77 | sumArrays(a_h,b_h,res_h,nElem); 78 | 79 | checkResult_struct(res_h,res_from_gpu_h,nElem); 80 | cudaFree(a_d); 81 | cudaFree(b_d); 82 | cudaFree(res_d); 83 | 84 | free(a_h); 85 | free(b_h); 86 | free(res_h); 87 | free(res_from_gpu_h); 88 | 89 | return 0; 90 | } 91 | -------------------------------------------------------------------------------- /19_AoS/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(AoS AoS.cu) 2 | -------------------------------------------------------------------------------- /1_check_dimension/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | add_executable(check_dimension check_dimension.cu) 2 | -------------------------------------------------------------------------------- /1_check_dimension/check_dimension.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | __global__ void checkIndex(void) 4 | { 5 | printf("threadIdx:(%d,%d,%d) blockIdx:(%d,%d,%d) blockDim:(%d,%d,%d)\ 6 | gridDim(%d,%d,%d)\n",threadIdx.x,threadIdx.y,threadIdx.z, 7 | blockIdx.x,blockIdx.y,blockIdx.z,blockDim.x,blockDim.y,blockDim.z, 8 | gridDim.x,gridDim.y,gridDim.z); 9 | } 10 | int main(int argc,char **argv) 11 | { 12 | int nElem=6; 13 | dim3 block(3); 14 | dim3 grid((nElem+block.x-1)/block.x); 15 | printf("grid.x %d grid.y %d grid.z %d\n",grid.x,grid.y,grid.z); 16 | printf("block.x %d block.y %d block.z %d\n",block.x,block.y,block.z); 17 | checkIndex<<>>(); 18 | cudaDeviceReset(); 19 | return 0; 20 | } 21 | -------------------------------------------------------------------------------- /20_SoA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(SoA SoA.cu) 2 | -------------------------------------------------------------------------------- /20_SoA/SoA.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define SIZE (1<<18) 5 | struct naiveStruct{ 6 | float a[SIZE]; 7 | float b[SIZE]; 8 | }; 9 | void sumArrays(float * a,float * b,float * res,const int size) 10 | { 11 | 12 | for(int i=0;ia)[i]=a[i]+b[i]; 24 | } 25 | void checkResult_struct(float* res_h,struct naiveStruct*res_from_gpu_h,int nElem) 26 | { 27 | for(int i=0;ia)[i]) 29 | { 30 | printf("check fail!\n"); 31 | exit(0); 32 | } 33 | printf("result check success!\n"); 34 | } 35 | int main(int argc,char **argv) 36 | { 37 | int dev = 0; 38 | cudaSetDevice(dev); 39 | 40 | int nElem=SIZE; 41 | printf("Vector size:%d\n",nElem); 42 | int nByte=sizeof(float)*nElem; 43 | int nByte_struct=sizeof(struct naiveStruct); 44 | float *a_h=(float*)malloc(nByte); 45 | float *b_h=(float*)malloc(nByte); 46 | float *res_h=(float*)malloc(nByte_struct); 47 | struct naiveStruct *res_from_gpu_h=(struct naiveStruct*)malloc(nByte_struct); 48 | memset(res_h,0,nByte); 49 | memset(res_from_gpu_h,0,nByte); 50 | 51 | float *a_d,*b_d; 52 | struct naiveStruct* res_d; 53 | CHECK(cudaMalloc((float**)&a_d,nByte)); 54 | CHECK(cudaMalloc((float**)&b_d,nByte)); 55 | CHECK(cudaMalloc((struct naiveStruct**)&res_d,nByte_struct)); 56 | CHECK(cudaMemset(res_d,0,nByte_struct)); 57 | initialData(a_h,nElem); 58 | initialData(b_h,nElem); 59 | 60 | CHECK(cudaMemcpy(a_d,a_h,nByte,cudaMemcpyHostToDevice)); 61 | CHECK(cudaMemcpy(b_d,b_h,nByte,cudaMemcpyHostToDevice)); 62 | 63 | dim3 block(1024); 64 | dim3 grid(nElem/block.x); 65 | double iStart,iElaps; 66 | iStart=cpuSecond(); 67 | sumArraysGPU<<>>(a_d,b_d,res_d,nElem); 68 | cudaDeviceSynchronize(); 69 | iElaps=cpuSecond()-iStart; 70 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte_struct,cudaMemcpyDeviceToHost)); 71 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 72 | 73 | 74 | sumArrays(a_h,b_h,res_h,nElem); 75 | 76 | checkResult_struct(res_h,res_from_gpu_h,nElem); 77 | cudaFree(a_d); 78 | cudaFree(b_d); 79 | cudaFree(res_d); 80 | 81 | free(a_h); 82 | free(b_h); 83 | free(res_h); 84 | free(res_from_gpu_h); 85 | 86 | return 0; 87 | } 88 | 
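Editor's note: 19_AoS and 20_SoA above contrast array-of-structures and structure-of-arrays layouts. The compact sketch below (struct and kernel names are mine, not taken from the repository) shows why the SoA layout coalesces better when a kernel only touches one field:

#include <stdio.h>
#include <cuda_runtime.h>
#define N (1 << 20)

// Array of structures: thread i touches p[i].x, so consecutive threads are
// 8 bytes apart and every 128-byte transaction carries 50% unused data (the y fields).
struct PointAoS { float x; float y; };

// Structure of arrays: consecutive threads touch consecutive floats in x,
// so a warp's loads and stores are fully coalesced.
struct PointSoA { float *x; float *y; };

__global__ void scaleAoS(PointAoS *p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p[i].x *= 2.0f;
}

__global__ void scaleSoA(PointSoA p, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) p.x[i] *= 2.0f;
}

int main(void)
{
    PointAoS *aos;
    PointSoA soa;
    cudaMalloc((void **)&aos,   N * sizeof(PointAoS));
    cudaMalloc((void **)&soa.x, N * sizeof(float));
    cudaMalloc((void **)&soa.y, N * sizeof(float));
    cudaMemset(aos,   0, N * sizeof(PointAoS));
    cudaMemset(soa.x, 0, N * sizeof(float));

    dim3 block(256);
    dim3 grid((N + block.x - 1) / block.x);
    scaleAoS<<<grid, block>>>(aos, N);  // strided accesses
    scaleSoA<<<grid, block>>>(soa, N);  // coalesced accesses
    cudaDeviceSynchronize();

    cudaFree(aos);
    cudaFree(soa.x);
    cudaFree(soa.y);
    return 0;
}
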
-------------------------------------------------------------------------------- /21_sum_array_offset_unrolling/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_array_offset_unrolling sum_array_offset_unrolling.cu) 2 | -------------------------------------------------------------------------------- /21_sum_array_offset_unrolling/sum_array_offset_unrolling.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,int offset,const int size) 7 | { 8 | 9 | for(int i=0,k=offset;k>>(a_d,b_d,res_d,offset,nElem); 69 | cudaDeviceSynchronize(); 70 | iElaps=cpuSecond()-iStart; 71 | 72 | printf("warmup Time elapsed %f sec\n",iElaps); 73 | iStart=cpuSecond(); 74 | sumArraysGPU<<>>(a_d,b_d,res_d,offset,nElem); 75 | cudaDeviceSynchronize(); 76 | iElaps=cpuSecond()-iStart; 77 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 78 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec --offset:%d \n",grid.x,block.x,iElaps,offset); 79 | 80 | 81 | sumArrays(a_h,b_h,res_h,offset,nElem); 82 | 83 | checkResult(res_h,res_from_gpu_h,nElem-4*block_x); 84 | cudaFree(a_d); 85 | cudaFree(b_d); 86 | cudaFree(res_d); 87 | 88 | free(a_h); 89 | free(b_h); 90 | free(res_h); 91 | free(res_from_gpu_h); 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /22_transform_matrix2D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(transform_matrix2D transform_matrix2D.cu) 2 | -------------------------------------------------------------------------------- /22_transform_matrix2D/transform_matrix2D.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | //cpu transform 5 | void transformMatrix2D_CPU(float * MatA,float * MatB,int nx,int ny) 6 | { 7 | for(int j=0;j=4) 128 | { 129 | transform_kernel=atoi(argv[1]); 130 | dimx=atoi(argv[2]); 131 | dimy=atoi(argv[3]); 132 | } 133 | 134 | //Malloc 135 | float* A_host=(float*)malloc(nBytes); 136 | float* B_host=(float*)malloc(nBytes); 137 | initialData(A_host,nxy); 138 | 139 | //cudaMalloc 140 | float *A_dev=NULL; 141 | float *B_dev=NULL; 142 | CHECK(cudaMalloc((void**)&A_dev,nBytes)); 143 | CHECK(cudaMalloc((void**)&B_dev,nBytes)); 144 | 145 | CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice)); 146 | CHECK(cudaMemset(B_dev,0,nBytes)); 147 | 148 | 149 | 150 | // cpu compute 151 | double iStart=cpuSecond(); 152 | transformMatrix2D_CPU(A_host,B_host,nx,ny); 153 | double iElaps=cpuSecond()-iStart; 154 | printf("CPU Execution Time elapsed %f sec\n",iElaps); 155 | 156 | // 2d block and 2d grid 157 | dim3 block(dimx,dimy); 158 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 159 | dim3 block_1(dimx,dimy); 160 | dim3 grid_1((nx-1)/(block_1.x*4)+1,(ny-1)/block_1.y+1); 161 | iStart=cpuSecond(); 162 | switch(transform_kernel) 163 | { 164 | case 0: 165 | copyRow<<>>(A_dev,B_dev,nx,ny); 166 | break; 167 | case 1: 168 | copyCol<<>>(A_dev,B_dev,nx,ny); 169 | break; 170 | case 2: 171 | transformNaiveRow<<>>(A_dev,B_dev,nx,ny); 172 | break; 173 | case 3: 174 | transformNaiveCol<<>>(A_dev,B_dev,nx,ny); 175 | break; 176 | case 4: 177 | transformNaiveColUnroll<<>>(A_dev,B_dev,nx,ny); 178 | break; 179 | case 5: 180 | 181 | 
transformNaiveColUnroll<<>>(A_dev,B_dev,nx,ny); 182 | break; 183 | case 6: 184 | transformNaiveRowDiagonal<<>>(A_dev,B_dev,nx,ny); 185 | break; 186 | case 7: 187 | transformNaiveColDiagonal<<>>(A_dev,B_dev,nx,ny); 188 | break; 189 | default: 190 | break; 191 | } 192 | CHECK(cudaDeviceSynchronize()); 193 | iElaps=cpuSecond()-iStart; 194 | printf(" Time elapsed %f sec\n",iElaps); 195 | CHECK(cudaMemcpy(B_host,B_dev,nBytes,cudaMemcpyDeviceToHost)); 196 | checkResult(B_host,B_host,nxy); 197 | 198 | cudaFree(A_dev); 199 | cudaFree(B_dev); 200 | free(A_host); 201 | free(B_host); 202 | cudaDeviceReset(); 203 | return 0; 204 | } 205 | -------------------------------------------------------------------------------- /23_sum_array_uniform_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_arrays_uniform_memory sum_arrays_uniform_memory.cu) 2 | -------------------------------------------------------------------------------- /23_sum_array_uniform_memory/sum_arrays_uniform_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(a_d,b_d,res_d,nElem); 52 | cudaDeviceSynchronize(); 53 | iElaps=cpuSecond()-iStart; 54 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 55 | 56 | //CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 57 | sumArrays(b_d,b_d,res_h,nElem); 58 | 59 | checkResult(res_h,res_d,nElem); 60 | cudaFree(a_d); 61 | cudaFree(b_d); 62 | cudaFree(res_d); 63 | 64 | free(res_h); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /24_shared_memory_read_data/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(shared_memory_read_data shared_memory_read_data.cu) 2 | -------------------------------------------------------------------------------- /24_shared_memory_read_data/shared_memory_read_data.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define BDIMX 32 5 | #define BDIMY 32 6 | 7 | #define BDIMX_RECT 32 8 | #define BDIMY_RECT 16 9 | #define IPAD 1 10 | __global__ void warmup(int * out) 11 | { 12 | __shared__ int tile[BDIMY][BDIMX]; 13 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 14 | 15 | tile[threadIdx.y][threadIdx.x]=idx; 16 | __syncthreads(); 17 | out[idx]=tile[threadIdx.y][threadIdx.x]; 18 | } 19 | __global__ void setRowReadRow(int * out) 20 | { 21 | __shared__ int tile[BDIMY][BDIMX]; 22 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 23 | 24 | tile[threadIdx.y][threadIdx.x]=idx; 25 | __syncthreads(); 26 | out[idx]=tile[threadIdx.y][threadIdx.x]; 27 | } 28 | __global__ void setColReadCol(int * out) 29 | { 30 | __shared__ int tile[BDIMY][BDIMX]; 31 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 32 | 33 | tile[threadIdx.x][threadIdx.y]=idx; 34 | __syncthreads(); 35 | out[idx]=tile[threadIdx.x][threadIdx.y]; 36 | } 37 | __global__ void setColReadRow(int * out) 38 | { 39 | __shared__ int tile[BDIMY][BDIMX]; 40 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 41 | 42 | tile[threadIdx.x][threadIdx.y]=idx; 43 | __syncthreads(); 44 | out[idx]=tile[threadIdx.y][threadIdx.x]; 45 | } 46 | __global__ void setRowReadCol(int * out) 47 
| { 48 | __shared__ int tile[BDIMY][BDIMX]; 49 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 50 | 51 | tile[threadIdx.y][threadIdx.x]=idx; 52 | __syncthreads(); 53 | out[idx]=tile[threadIdx.x][threadIdx.y]; 54 | } 55 | __global__ void setRowReadColDyn(int * out) 56 | { 57 | extern __shared__ int tile[]; 58 | unsigned int row_idx=threadIdx.y*blockDim.x+threadIdx.x; 59 | unsigned int col_idx=threadIdx.x*blockDim.y+threadIdx.y; 60 | tile[row_idx]=row_idx; 61 | __syncthreads(); 62 | out[row_idx]=tile[col_idx]; 63 | } 64 | __global__ void setRowReadColIpad(int * out) 65 | { 66 | __shared__ int tile[BDIMY][BDIMX+IPAD]; 67 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 68 | 69 | tile[threadIdx.y][threadIdx.x]=idx; 70 | __syncthreads(); 71 | out[idx]=tile[threadIdx.x][threadIdx.y]; 72 | } 73 | __global__ void setRowReadColDynIpad(int * out) 74 | { 75 | extern __shared__ int tile[]; 76 | unsigned int row_idx=threadIdx.y*(blockDim.x+1)+threadIdx.x; 77 | unsigned int col_idx=threadIdx.x*(blockDim.x+1)+threadIdx.y; 78 | tile[row_idx]=row_idx; 79 | __syncthreads(); 80 | out[row_idx]=tile[col_idx]; 81 | } 82 | //--------------------rectagle--------------------- 83 | __global__ void setRowReadColRect(int * out) 84 | { 85 | __shared__ int tile[BDIMY_RECT][BDIMX_RECT]; 86 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 87 | unsigned int icol=idx%blockDim.y; 88 | unsigned int irow=idx/blockDim.y; 89 | tile[threadIdx.y][threadIdx.x]=idx; 90 | __syncthreads(); 91 | out[idx]=tile[icol][irow]; 92 | } 93 | __global__ void setRowReadColRectDyn(int * out) 94 | { 95 | extern __shared__ int tile[]; 96 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 97 | unsigned int icol=idx%blockDim.y; 98 | unsigned int irow=idx/blockDim.y; 99 | unsigned int col_idx=icol*blockDim.x+irow; 100 | tile[idx]=idx; 101 | __syncthreads(); 102 | out[idx]=tile[col_idx]; 103 | } 104 | __global__ void setRowReadColRectPad(int * out) 105 | { 106 | __shared__ int tile[BDIMY_RECT][BDIMX_RECT+IPAD*2]; 107 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 108 | unsigned int icol=idx%blockDim.y; 109 | unsigned int irow=idx/blockDim.y; 110 | tile[threadIdx.y][threadIdx.x]=idx; 111 | __syncthreads(); 112 | out[idx]=tile[icol][irow]; 113 | } 114 | __global__ void setRowReadColRectDynPad(int * out) 115 | { 116 | extern __shared__ int tile[]; 117 | unsigned int idx=threadIdx.y*blockDim.x+threadIdx.x; 118 | unsigned int icol=idx%blockDim.y; 119 | unsigned int irow=idx/blockDim.y; 120 | unsigned int row_idx=threadIdx.y*(IPAD+blockDim.x)+threadIdx.x; 121 | unsigned int col_idx=icol*(IPAD+blockDim.x)+irow; 122 | tile[row_idx]=idx; 123 | __syncthreads(); 124 | out[idx]=tile[col_idx]; 125 | } 126 | int main(int argc,char **argv) 127 | { 128 | // set up device 129 | initDevice(0); 130 | int kernel=0; 131 | if(argc>=2) 132 | kernel=atoi(argv[1]); 133 | int nElem=BDIMX*BDIMY; 134 | printf("Vector size:%d\n",nElem); 135 | int nByte=sizeof(int)*nElem; 136 | int * out; 137 | CHECK(cudaMalloc((int**)&out,nByte)); 138 | cudaSharedMemConfig MemConfig; 139 | CHECK(cudaDeviceGetSharedMemConfig(&MemConfig)); 140 | printf("--------------------------------------------\n"); 141 | switch (MemConfig) { 142 | 143 | case cudaSharedMemBankSizeFourByte: 144 | printf("the device is cudaSharedMemBankSizeFourByte: 4-Byte\n"); 145 | break; 146 | case cudaSharedMemBankSizeEightByte: 147 | printf("the device is cudaSharedMemBankSizeEightByte: 8-Byte\n"); 148 | break; 149 | 150 | } 151 | printf("--------------------------------------------\n"); 152 | 
dim3 block(BDIMY,BDIMX); 153 | dim3 grid(1,1); 154 | dim3 block_rect(BDIMX_RECT,BDIMY_RECT); 155 | dim3 grid_rect(1,1); 156 | warmup<<>>(out); 157 | printf("warmup!\n"); 158 | double iStart,iElaps; 159 | iStart=cpuSecond(); 160 | switch(kernel) 161 | { 162 | case 0: 163 | { 164 | setRowReadRow<<>>(out); 165 | cudaDeviceSynchronize(); 166 | iElaps=cpuSecond()-iStart; 167 | printf("setRowReadRow "); 168 | printf("Execution Time elapsed %f sec\n",iElaps); 169 | //break; 170 | //case 1: 171 | iStart=cpuSecond(); 172 | setColReadCol<<>>(out); 173 | cudaDeviceSynchronize(); 174 | iElaps=cpuSecond()-iStart; 175 | printf("setColReadCol "); 176 | printf("Execution Time elapsed %f sec\n",iElaps); 177 | break; 178 | } 179 | case 2: 180 | { 181 | setColReadRow<<>>(out); 182 | cudaDeviceSynchronize(); 183 | iElaps=cpuSecond()-iStart; 184 | printf("setColReadRow "); 185 | printf("Execution Time elapsed %f sec\n",iElaps); 186 | break; 187 | } 188 | case 3: 189 | { 190 | setRowReadCol<<>>(out); 191 | cudaDeviceSynchronize(); 192 | iElaps=cpuSecond()-iStart; 193 | printf("setRowReadCol "); 194 | printf("Execution Time elapsed %f sec\n",iElaps); 195 | break; 196 | } 197 | case 4: 198 | { 199 | setRowReadColDyn<<>>(out); 200 | cudaDeviceSynchronize(); 201 | iElaps=cpuSecond()-iStart; 202 | printf("setRowReadColDyn "); 203 | printf("Execution Time elapsed %f sec\n",iElaps); 204 | break; 205 | } 206 | case 5: 207 | { 208 | setRowReadColIpad<<>>(out); 209 | cudaDeviceSynchronize(); 210 | iElaps=cpuSecond()-iStart; 211 | printf("setRowReadColIpad "); 212 | printf("Execution Time elapsed %f sec\n",iElaps); 213 | break; 214 | } 215 | case 6: 216 | { 217 | setRowReadColDynIpad<<>>(out); 218 | cudaDeviceSynchronize(); 219 | iElaps=cpuSecond()-iStart; 220 | printf("setRowReadColDynIpad "); 221 | printf("Execution Time elapsed %f sec\n",iElaps); 222 | break; 223 | } 224 | case 7: 225 | { 226 | setRowReadColRect<<>>(out); 227 | cudaDeviceSynchronize(); 228 | iElaps=cpuSecond()-iStart; 229 | printf("setRowReadColRect "); 230 | printf("Execution Time elapsed %f sec\n",iElaps); 231 | break; 232 | } 233 | case 8: 234 | { 235 | setRowReadColRectDyn<<>>(out); 236 | cudaDeviceSynchronize(); 237 | iElaps=cpuSecond()-iStart; 238 | printf("setRowReadColRectDyn "); 239 | printf("Execution Time elapsed %f sec\n",iElaps); 240 | break; 241 | } 242 | case 9: 243 | { 244 | setRowReadColRectPad<<>>(out); 245 | cudaDeviceSynchronize(); 246 | iElaps=cpuSecond()-iStart; 247 | printf("setRowReadColRectPad "); 248 | printf("Execution Time elapsed %f sec\n",iElaps); 249 | break; 250 | } 251 | case 10: 252 | { 253 | setRowReadColRectDynPad<<>>(out); 254 | cudaDeviceSynchronize(); 255 | iElaps=cpuSecond()-iStart; 256 | printf("setRowReadColRectDynPad "); 257 | printf("Execution Time elapsed %f sec\n",iElaps); 258 | break; 259 | } 260 | case 11: 261 | { 262 | setRowReadRow<<>>(out); 263 | cudaDeviceSynchronize(); 264 | 265 | setColReadCol<<>>(out); 266 | cudaDeviceSynchronize(); 267 | 268 | setColReadRow<<>>(out); 269 | cudaDeviceSynchronize(); 270 | 271 | setRowReadCol<<>>(out); 272 | cudaDeviceSynchronize(); 273 | 274 | setRowReadColDyn<<>>(out); 275 | cudaDeviceSynchronize(); 276 | 277 | setRowReadColIpad<<>>(out); 278 | cudaDeviceSynchronize(); 279 | 280 | setRowReadColDynIpad<<>>(out); 281 | cudaDeviceSynchronize(); 282 | break; 283 | } 284 | case 12: 285 | { 286 | setRowReadColRect<<>>(out); 287 | setRowReadColRectDyn<<>>(out); 288 | setRowReadColRectPad<<>>(out); 289 | setRowReadColRectDynPad<<>>(out); 290 | break; 291 | } 292 | 293 | } 
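/* Editor's note, not part of the original source: the *Ipad / *Pad kernels
 * selectable above avoid shared-memory bank conflicts by padding each tile
 * row, e.g.  __shared__ int tile[BDIMY][BDIMX + IPAD];
 * With BDIMX = 32 and IPAD = 1, the elements of one tile column fall into 32
 * different banks, so the column-wise reads in the setRowReadCol-style
 * kernels no longer serialize into 32 separate shared-memory transactions. */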
294 | 295 | cudaFree(out); 296 | return 0; 297 | } 298 | -------------------------------------------------------------------------------- /25_reduce_integer_shared_memory/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(reduce_integer_shared_memory reduce_integer_shared_memory.cu) 2 | -------------------------------------------------------------------------------- /25_reduce_integer_shared_memory/reduce_integer_shared_memory.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define DIM 1024 5 | 6 | int recursiveReduce(int *data, int const size) 7 | { 8 | // terminate check 9 | if (size == 1) return data[0]; 10 | // renew the stride 11 | int const stride = size / 2; 12 | if (size % 2 == 1) 13 | { 14 | for (int i = 0; i < stride; i++) 15 | { 16 | data[i] += data[i + stride]; 17 | } 18 | data[0] += data[size - 1]; 19 | } 20 | else 21 | { 22 | for (int i = 0; i < stride; i++) 23 | { 24 | data[i] += data[i + stride]; 25 | } 26 | } 27 | // call 28 | return recursiveReduce(data, stride); 29 | } 30 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 31 | { 32 | 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | 56 | __global__ void reduceGmem(int * g_idata,int * g_odata,unsigned int n) 57 | { 58 | //set thread ID 59 | unsigned int tid = threadIdx.x; 60 | unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 61 | //boundary check 62 | if (tid >= n) return; 63 | //convert global data pointer to the 64 | int *idata = g_idata + blockIdx.x*blockDim.x; 65 | 66 | //in-place reduction in global memory 67 | if(blockDim.x>=1024 && tid <512) 68 | idata[tid]+=idata[tid+512]; 69 | __syncthreads(); 70 | if(blockDim.x>=512 && tid <256) 71 | idata[tid]+=idata[tid+256]; 72 | __syncthreads(); 73 | if(blockDim.x>=256 && tid <128) 74 | idata[tid]+=idata[tid+128]; 75 | __syncthreads(); 76 | if(blockDim.x>=128 && tid <64) 77 | idata[tid]+=idata[tid+64]; 78 | __syncthreads(); 79 | //write result for this block to global mem 80 | if(tid<32) 81 | { 82 | volatile int *vmem = idata; 83 | vmem[tid]+=vmem[tid+32]; 84 | vmem[tid]+=vmem[tid+16]; 85 | vmem[tid]+=vmem[tid+8]; 86 | vmem[tid]+=vmem[tid+4]; 87 | vmem[tid]+=vmem[tid+2]; 88 | vmem[tid]+=vmem[tid+1]; 89 | 90 | } 91 | 92 | if (tid == 0) 93 | g_odata[blockIdx.x] = idata[0]; 94 | 95 | } 96 | 97 | 98 | __global__ void reduceSmem(int * g_idata,int * g_odata,unsigned int n) 99 | { 100 | //set thread ID 101 | __shared__ int smem[DIM]; 102 | unsigned int tid = threadIdx.x; 103 | //unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 104 | //boundary check 105 | if (tid >= n) return; 106 | //convert global data pointer to the 107 | int *idata = g_idata + blockIdx.x*blockDim.x; 108 | 109 | smem[tid]=idata[tid]; 110 | __syncthreads(); 111 | //in-place reduction in global memory 112 | if(blockDim.x>=1024 && tid <512) 113 | smem[tid]+=smem[tid+512]; 114 | __syncthreads(); 115 | 
if(blockDim.x>=512 && tid <256) 116 | smem[tid]+=smem[tid+256]; 117 | __syncthreads(); 118 | if(blockDim.x>=256 && tid <128) 119 | smem[tid]+=smem[tid+128]; 120 | __syncthreads(); 121 | if(blockDim.x>=128 && tid <64) 122 | smem[tid]+=smem[tid+64]; 123 | __syncthreads(); 124 | //write result for this block to global mem 125 | if(tid<32) 126 | { 127 | volatile int *vsmem = smem; 128 | vsmem[tid]+=vsmem[tid+32]; 129 | vsmem[tid]+=vsmem[tid+16]; 130 | vsmem[tid]+=vsmem[tid+8]; 131 | vsmem[tid]+=vsmem[tid+4]; 132 | vsmem[tid]+=vsmem[tid+2]; 133 | vsmem[tid]+=vsmem[tid+1]; 134 | 135 | } 136 | 137 | if (tid == 0) 138 | g_odata[blockIdx.x] = smem[0]; 139 | 140 | } 141 | 142 | __global__ void reduceUnroll4Smem(int * g_idata,int * g_odata,unsigned int n) 143 | { 144 | //set thread ID 145 | __shared__ int smem[DIM]; 146 | unsigned int tid = threadIdx.x; 147 | unsigned int idx = blockDim.x*blockIdx.x*4+threadIdx.x; 148 | //boundary check 149 | if (tid >= n) return; 150 | //convert global data pointer to the 151 | int tempSum=0; 152 | if(idx+3 * blockDim.x<=n) 153 | { 154 | int a1=g_idata[idx]; 155 | int a2=g_idata[idx+blockDim.x]; 156 | int a3=g_idata[idx+2*blockDim.x]; 157 | int a4=g_idata[idx+3*blockDim.x]; 158 | tempSum=a1+a2+a3+a4; 159 | 160 | } 161 | smem[tid]=tempSum; 162 | __syncthreads(); 163 | //in-place reduction in global memory 164 | if(blockDim.x>=1024 && tid <512) 165 | smem[tid]+=smem[tid+512]; 166 | __syncthreads(); 167 | if(blockDim.x>=512 && tid <256) 168 | smem[tid]+=smem[tid+256]; 169 | __syncthreads(); 170 | if(blockDim.x>=256 && tid <128) 171 | smem[tid]+=smem[tid+128]; 172 | __syncthreads(); 173 | if(blockDim.x>=128 && tid <64) 174 | smem[tid]+=smem[tid+64]; 175 | __syncthreads(); 176 | //write result for this block to global mem 177 | if(tid<32) 178 | { 179 | volatile int *vsmem = smem; 180 | vsmem[tid]+=vsmem[tid+32]; 181 | vsmem[tid]+=vsmem[tid+16]; 182 | vsmem[tid]+=vsmem[tid+8]; 183 | vsmem[tid]+=vsmem[tid+4]; 184 | vsmem[tid]+=vsmem[tid+2]; 185 | vsmem[tid]+=vsmem[tid+1]; 186 | 187 | } 188 | 189 | if (tid == 0) 190 | g_odata[blockIdx.x] = smem[0]; 191 | 192 | } 193 | 194 | int main(int argc,char** argv) 195 | { 196 | initDevice(0); 197 | 198 | bool bResult = false; 199 | //initialization 200 | 201 | int size = 1 << 24; 202 | printf(" with array size %d \n", size); 203 | 204 | //execution configuration 205 | int blocksize = 1024; 206 | if (argc > 1) 207 | { 208 | blocksize = atoi(argv[1]); 209 | } 210 | dim3 block(blocksize, 1); 211 | dim3 grid((size - 1) / block.x + 1, 1); 212 | printf("grid %d block %d \n", grid.x, block.x); 213 | 214 | //allocate host memory 215 | size_t bytes = size * sizeof(int); 216 | int *idata_host = (int*)malloc(bytes); 217 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 218 | int * tmp = (int*)malloc(bytes); 219 | 220 | //initialize the array 221 | initialData_int(idata_host, size); 222 | 223 | memcpy(tmp, idata_host, bytes); 224 | double iStart, iElaps; 225 | int gpu_sum = 0; 226 | 227 | // device memory 228 | int * idata_dev = NULL; 229 | int * odata_dev = NULL; 230 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 231 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 232 | 233 | //cpu reduction 234 | int cpu_sum = 0; 235 | iStart = cpuSecond(); 236 | //cpu_sum = recursiveReduce(tmp, size); 237 | for (int i = 0; i < size; i++) 238 | cpu_sum += tmp[i]; 239 | iElaps = cpuSecond() - iStart; 240 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 241 | 242 | 243 | //kernel 1:warmup 244 | 
244 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
245 | CHECK(cudaDeviceSynchronize());
246 | iStart = cpuSecond();
247 | warmup <<<grid, block>>>(idata_dev, odata_dev, size);
248 | cudaDeviceSynchronize();
249 | iElaps = cpuSecond() - iStart;
250 | printf("gpu warmup elapsed %lf sec\n",iElaps);
251 |
252 |
253 |
254 | //reduceGmem
255 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
256 | CHECK(cudaDeviceSynchronize());
257 | iStart = cpuSecond();
258 | reduceGmem <<<grid, block>>>(idata_dev, odata_dev, size);
259 | cudaDeviceSynchronize();
260 | iElaps = cpuSecond() - iStart;
261 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
262 | gpu_sum = 0;
263 | for (int i = 0; i < grid.x; i++)
264 | gpu_sum += odata_host[i];
265 | printf("reduceGmem elapsed %lf sec gpu_sum: %d\n",iElaps, gpu_sum);
266 |
267 | //reduceSmem
268 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
269 | CHECK(cudaDeviceSynchronize());
270 | iStart = cpuSecond();
271 | reduceSmem <<<grid, block>>>(idata_dev, odata_dev, size);
272 | cudaDeviceSynchronize();
273 | iElaps = cpuSecond() - iStart;
274 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
275 | gpu_sum = 0;
276 | for (int i = 0; i < grid.x; i++)
277 | gpu_sum += odata_host[i];
278 | printf("reduceSmem elapsed %lf sec gpu_sum: %d\n",iElaps, gpu_sum);
279 |
280 |
281 |
282 | //reduceUnroll4Smem
283 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice));
284 | CHECK(cudaDeviceSynchronize());
285 | iStart = cpuSecond();
286 | reduceUnroll4Smem <<<grid.x / 4, block>>>(idata_dev, odata_dev, size);
287 | cudaDeviceSynchronize();
288 | iElaps = cpuSecond() - iStart;
289 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost);
290 | gpu_sum = 0;
291 | for (int i = 0; i < grid.x/4; i++)
292 | gpu_sum += odata_host[i];
293 | printf("reduceUnroll4Smem elapsed %lf sec gpu_sum: %d\n",iElaps, gpu_sum);
294 |
295 |
296 | free(idata_host);
297 | free(odata_host);
298 | CHECK(cudaFree(idata_dev));
299 | CHECK(cudaFree(odata_dev));
300 | //reset device
301 | cudaDeviceReset();
302 | return EXIT_SUCCESS;
303 |
304 | }
305 |
--------------------------------------------------------------------------------
/26_transform_shared_memory/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(transform_shared_memory transform_shared_memory.cu)
2 |
--------------------------------------------------------------------------------
/26_transform_shared_memory/transform_shared_memory.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | #include "freshman.h"
4 | #define BDIMX 8
5 | #define BDIMY 8
6 | #define IPAD 2
7 | //cpu transform
8 | void transformMatrix2D_CPU(float * in,float * out,int nx,int ny)
9 | {
10 | for(int j=0;j=4)
165 | {
166 | transform_kernel=atoi(argv[1]);
167 | dimx=atoi(argv[2]);
168 | dimy=atoi(argv[3]);
169 | }
170 |
171 | //Malloc
172 | float* A_host=(float*)malloc(nBytes);
173 | float* B_host_cpu=(float*)malloc(nBytes);
174 | float* B_host=(float*)malloc(nBytes);
175 | initialData(A_host,nxy);
176 |
177 | //cudaMalloc
178 | float *A_dev=NULL;
179 | float *B_dev=NULL;
180 | CHECK(cudaMalloc((void**)&A_dev,nBytes));
181 | CHECK(cudaMalloc((void**)&B_dev,nBytes));
182 |
183 | CHECK(cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice));
184 | CHECK(cudaMemset(B_dev,0,nBytes));
185 |
186 |
187 |
188 | // cpu compute
189 | double
iStart=cpuSecond(); 190 | transformMatrix2D_CPU(A_host,B_host_cpu,nx,ny); 191 | double iElaps=cpuSecond()-iStart; 192 | printf("CPU Execution Time elapsed %f sec\n",iElaps); 193 | 194 | // 2d block and 2d grid 195 | dim3 block(dimx,dimy); 196 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 197 | dim3 block_1(dimx,dimy); 198 | dim3 grid_1((nx-1)/(block_1.x*2)+1,(ny-1)/block_1.y+1); 199 | //warmup 200 | warmup<<>>(A_dev,B_dev,nx,ny); 201 | CHECK(cudaDeviceSynchronize()); 202 | iStart=cpuSecond(); 203 | switch(transform_kernel) 204 | { 205 | case 0: 206 | copyRow<<>>(A_dev,B_dev,nx,ny); 207 | printf("copyRow "); 208 | break; 209 | case 1: 210 | transformNaiveRow<<>>(A_dev,B_dev,nx,ny); 211 | printf("transformNaiveRow "); 212 | break; 213 | case 2: 214 | transformSmem<<>>(A_dev,B_dev,nx,ny); 215 | printf("transformSmem "); 216 | break; 217 | case 3: 218 | transformSmemPad<<>>(A_dev,B_dev,nx,ny); 219 | printf("transformSmemPad "); 220 | break; 221 | case 4: 222 | transformSmemUnrollPad<<>>(A_dev,B_dev,nx,ny); 223 | printf("transformSmemUnrollPad "); 224 | break; 225 | default: 226 | break; 227 | } 228 | CHECK(cudaDeviceSynchronize()); 229 | iElaps=cpuSecond()-iStart; 230 | printf(" Time elapsed %f sec\n",iElaps); 231 | CHECK(cudaMemcpy(B_host,B_dev,nBytes,cudaMemcpyDeviceToHost)); 232 | checkResult(B_host,B_host_cpu,nxy); 233 | 234 | cudaFree(A_dev); 235 | cudaFree(B_dev); 236 | free(A_host); 237 | free(B_host); 238 | free(B_host_cpu); 239 | cudaDeviceReset(); 240 | return 0; 241 | } 242 | -------------------------------------------------------------------------------- /27_stencil_1d_constant_read_only/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(stencil_1d_constant_read_only stencil_1d_constant_read_only.cu) 2 | -------------------------------------------------------------------------------- /27_stencil_1d_constant_read_only/stencil_1d_constant_read_only.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define TEMPLATE_SIZE 9 5 | #define TEMP_RADIO_SIZE (TEMPLATE_SIZE/2) 6 | #define BDIM 32 7 | 8 | __constant__ float coef[TEMP_RADIO_SIZE];//if in midle of the program will be error 9 | void convolution(float *in,float *out,float* template_,const unsigned int array_size) 10 | { 11 | for(int i=TEMP_RADIO_SIZE;iTEMP_RADIO_SIZE) 34 | smem[sidx-TEMP_RADIO_SIZE]=in[idx-TEMP_RADIO_SIZE]; 35 | if(idx=gridDim.x*blockDim.x-TEMP_RADIO_SIZE) 42 | return; 43 | float temp=.0f; 44 | #pragma unroll 45 | for(int i=1;i<=TEMP_RADIO_SIZE;i++) 46 | { 47 | temp+=coef[i-1]*(smem[sidx+i]-smem[sidx-i]); 48 | } 49 | out[idx]=temp; 50 | //printf("%d:GPU :%lf,\n",idx,temp); 51 | } 52 | //read only 53 | __global__ void stencil_1d_readonly(float * in,float * out,const float* __restrict__ dcoef) 54 | { 55 | __shared__ float smem[BDIM+2*TEMP_RADIO_SIZE]; 56 | int idx=threadIdx.x+blockDim.x*blockIdx.x; 57 | int sidx=threadIdx.x+TEMP_RADIO_SIZE; 58 | smem[sidx]=in[idx]; 59 | 60 | if (threadIdx.xTEMP_RADIO_SIZE) 64 | smem[sidx-TEMP_RADIO_SIZE]=in[idx-TEMP_RADIO_SIZE]; 65 | if(idx=gridDim.x*blockDim.x-TEMP_RADIO_SIZE) 72 | return; 73 | float temp=.0f; 74 | #pragma unroll 75 | for(int i=1;i<=TEMP_RADIO_SIZE;i++) 76 | { 77 | temp+=dcoef[i-1]*(smem[sidx+i]-smem[sidx-i]); 78 | } 79 | out[idx]=temp; 80 | //printf("%d:GPU :%lf,\n",idx,temp); 81 | } 82 | 83 | int main(int argc,char** argv) 84 | { 85 | printf("strating...\n"); 86 | initDevice(0); 87 | int dimx=BDIM; 88 | 
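// Two kernels compute the same 1-D stencil below: stencil_1d reads its
// coefficients from the __constant__ array coef, which is served by the
// constant cache and works best when every thread of a warp reads the same
// element, while stencil_1d_readonly receives them through a const
// __restrict__ pointer so the compiler may route those loads through the
// read-only data cache. Both kernels stage their input tile in shared memory
// with a halo of TEMP_RADIO_SIZE cells on each side.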
unsigned int nxy=1<<16; 89 | int nBytes=nxy*sizeof(float); 90 | 91 | 92 | //Malloc 93 | float* in_host=(float*)malloc(nBytes); 94 | float* out_gpu=(float*)malloc(nBytes); 95 | float* out_cpu=(float*)malloc(nBytes); 96 | memset(out_cpu,0,nBytes); 97 | initialData(in_host,nxy); 98 | 99 | //cudaMalloc 100 | float *in_dev=NULL; 101 | float *out_dev=NULL; 102 | 103 | initialData(in_host,nxy); 104 | float templ_[]={-1.0,-2.0,2.0,1.0}; 105 | CHECK(cudaMemcpyToSymbol(coef,templ_,TEMP_RADIO_SIZE*sizeof(float))); 106 | 107 | CHECK(cudaMalloc((void**)&in_dev,nBytes)); 108 | CHECK(cudaMalloc((void**)&out_dev,nBytes)); 109 | CHECK(cudaMemcpy(in_dev,in_host,nBytes,cudaMemcpyHostToDevice)); 110 | CHECK(cudaMemset(out_dev,0,nBytes)); 111 | 112 | 113 | 114 | // cpu compute 115 | double iStart=cpuSecond(); 116 | convolution(in_host,out_cpu,templ_,nxy); 117 | double iElaps=cpuSecond()-iStart; 118 | //printf("CPU Execution Time elapsed %f sec\n",iElaps); 119 | 120 | // stencil 1d 121 | dim3 block(dimx); 122 | dim3 grid((nxy-1)/block.x+1); 123 | stencil_1d<<>>(in_dev,out_dev); 124 | CHECK(cudaDeviceSynchronize()); 125 | iElaps=cpuSecond()-iStart; 126 | printf("stencil_1d Time elapsed %f sec\n",iElaps); 127 | CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost)); 128 | checkResult(out_cpu,out_gpu,nxy); 129 | CHECK(cudaMemset(out_dev,0,nBytes)); 130 | // stencil 1d read only 131 | float * dcoef_ro; 132 | CHECK(cudaMalloc((void**)&dcoef_ro,TEMP_RADIO_SIZE * sizeof(float))); 133 | CHECK(cudaMemcpy(dcoef_ro,templ_,TEMP_RADIO_SIZE * sizeof(float),cudaMemcpyHostToDevice)); 134 | stencil_1d_readonly<<>>(in_dev,out_dev,dcoef_ro); 135 | CHECK(cudaDeviceSynchronize()); 136 | iElaps=cpuSecond()-iStart; 137 | printf("stencil_1d_readonly Time elapsed %f sec\n",iElaps); 138 | CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost)); 139 | checkResult(out_cpu,out_gpu,nxy); 140 | 141 | cudaFree(dcoef_ro); 142 | cudaFree(in_dev); 143 | cudaFree(out_dev); 144 | free(out_gpu); 145 | free(out_cpu); 146 | free(in_host); 147 | cudaDeviceReset(); 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /28_shfl_test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(shfl_test shfl_test.cu) -------------------------------------------------------------------------------- /28_shfl_test/shfl_test.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | #define BDIM 16 5 | #define SEGM 4 6 | __global__ void test_shfl_broadcast(int *in,int*out,int const srcLans) 7 | { 8 | int value=in[threadIdx.x]; 9 | value=__shfl(value,srcLans,BDIM); 10 | out[threadIdx.x]=value; 11 | 12 | } 13 | 14 | __global__ void test_shfl_up(int *in,int*out,int const delta) 15 | { 16 | int value=in[threadIdx.x]; 17 | value=__shfl_up(value,delta,BDIM); 18 | out[threadIdx.x]=value; 19 | 20 | } 21 | 22 | __global__ void test_shfl_down(int *in,int*out,int const delta) 23 | { 24 | int value=in[threadIdx.x]; 25 | value=__shfl_down(value,delta,BDIM); 26 | out[threadIdx.x]=value; 27 | 28 | } 29 | 30 | __global__ void test_shfl_wrap(int *in,int*out,int const offset) 31 | { 32 | int value=in[threadIdx.x]; 33 | value=__shfl(value,threadIdx.x+offset,BDIM); 34 | out[threadIdx.x]=value; 35 | 36 | } 37 | 38 | __global__ void test_shfl_xor(int *in,int*out,int const mask) 39 | { 40 | int value=in[threadIdx.x]; 41 | value=__shfl_xor(value,mask,BDIM); 42 | 
out[threadIdx.x]=value; 43 | 44 | } 45 | 46 | __global__ void test_shfl_xor_array(int *in,int*out,int const mask) 47 | { 48 | int idx=threadIdx.x*SEGM; 49 | int value[SEGM]; 50 | for(int i=0;i=2) 101 | kernel_num=atoi(argv[1]); 102 | 103 | //Malloc 104 | //int * in_host=(int*)malloc(nBytes); 105 | int in_host[]={0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; 106 | int * out_gpu=(int*)malloc(nBytes); 107 | //initialData_int(in_host,data_size); 108 | 109 | //cudaMalloc 110 | int * in_dev=NULL; 111 | int * out_dev=NULL; 112 | 113 | CHECK(cudaMalloc((void**)&in_dev,nBytes)); 114 | CHECK(cudaMalloc((void**)&out_dev,nBytes)); 115 | CHECK(cudaMemcpy(in_dev,in_host,nBytes,cudaMemcpyHostToDevice)); 116 | CHECK(cudaMemset(out_dev,0,nBytes)); 117 | 118 | 119 | // test _shfl broadcast 120 | dim3 block(dimx); 121 | dim3 grid((data_size-1)/block.x+1); 122 | switch(kernel_num) 123 | { 124 | case 0: 125 | test_shfl_broadcast<<>>(in_dev,out_dev,2); 126 | printf("test_shfl_broadcast\n"); 127 | break; 128 | case 1: 129 | test_shfl_up<<>>(in_dev,out_dev,2); 130 | printf("test_shfl_up\n"); 131 | break; 132 | case 2: 133 | test_shfl_down<<>>(in_dev,out_dev,2); 134 | printf("test_shfl_down\n"); 135 | break; 136 | case 3: 137 | test_shfl_wrap<<>>(in_dev,out_dev,2); 138 | printf("test_shfl_wrap\n"); 139 | break; 140 | case 4: 141 | test_shfl_xor<<>>(in_dev,out_dev,1); 142 | printf("test_shfl_xor\n"); 143 | break; 144 | case 5: 145 | test_shfl_xor_array<<<1,block.x/SEGM>>>(in_dev,out_dev,1); 146 | printf("test_shfl_xor_array\n"); 147 | break; 148 | case 6: 149 | test_shfl_swap<<<1,block.x/SEGM>>>(in_dev,out_dev,1,0,3); 150 | printf("test_shfl_swap\n"); 151 | break; 152 | default: 153 | break; 154 | } 155 | CHECK(cudaMemcpy(out_gpu,out_dev,nBytes,cudaMemcpyDeviceToHost)); 156 | //show result 157 | printf("input:\t"); 158 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define DIM 1024 5 | 6 | int recursiveReduce(int *data, int const size) 7 | { 8 | // terminate check 9 | if (size == 1) return data[0]; 10 | // renew the stride 11 | int const stride = size / 2; 12 | if (size % 2 == 1) 13 | { 14 | for (int i = 0; i < stride; i++) 15 | { 16 | data[i] += data[i + stride]; 17 | } 18 | data[0] += data[size - 1]; 19 | } 20 | else 21 | { 22 | for (int i = 0; i < stride; i++) 23 | { 24 | data[i] += data[i + stride]; 25 | } 26 | } 27 | // call 28 | return recursiveReduce(data, stride); 29 | } 30 | __global__ void warmup(int * g_idata, int * g_odata, unsigned int n) 31 | { 32 | 33 | //set thread ID 34 | unsigned int tid = threadIdx.x; 35 | //boundary check 36 | if (tid >= n) return; 37 | //convert global data pointer to the 38 | int *idata = g_idata + blockIdx.x*blockDim.x; 39 | //in-place reduction in global memory 40 | for (int stride = 1; stride < blockDim.x; stride *= 2) 41 | { 42 | if ((tid % (2 * stride)) == 0) 43 | { 44 | idata[tid] += idata[tid + stride]; 45 | } 46 | //synchronize within block 47 | __syncthreads(); 48 | } 49 | //write result for this block to global mem 50 | if (tid == 0) 51 | g_odata[blockIdx.x] = idata[0]; 52 | 53 | } 54 | 55 | 56 | __global__ void reduceGmem(int * g_idata,int * g_odata,unsigned int n) 57 | { 58 | //set thread ID 59 | unsigned int tid = threadIdx.x; 60 | unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 61 | //boundary check 62 | if (tid >= n) return; 63 | //convert global data pointer to the 64 | int *idata = g_idata + blockIdx.x*blockDim.x; 65 | 66 | __syncthreads(); 67 | //in-place reduction in global memory 68 | if(blockDim.x>=1024 && tid <512) 69 | 
idata[tid]+=idata[tid+512]; 70 | __syncthreads(); 71 | if(blockDim.x>=512 && tid <256) 72 | idata[tid]+=idata[tid+256]; 73 | __syncthreads(); 74 | if(blockDim.x>=256 && tid <128) 75 | idata[tid]+=idata[tid+128]; 76 | __syncthreads(); 77 | if(blockDim.x>=128 && tid <64) 78 | idata[tid]+=idata[tid+64]; 79 | __syncthreads(); 80 | //write result for this block to global mem 81 | if(tid<32) 82 | { 83 | volatile int *vmem = idata; 84 | vmem[tid]+=vmem[tid+32]; 85 | vmem[tid]+=vmem[tid+16]; 86 | vmem[tid]+=vmem[tid+8]; 87 | vmem[tid]+=vmem[tid+4]; 88 | vmem[tid]+=vmem[tid+2]; 89 | vmem[tid]+=vmem[tid+1]; 90 | 91 | } 92 | 93 | if (tid == 0) 94 | g_odata[blockIdx.x] = idata[0]; 95 | 96 | } 97 | 98 | 99 | __global__ void reduceSmem(int * g_idata,int * g_odata,unsigned int n) 100 | { 101 | //set thread ID 102 | __shared__ int smem[DIM]; 103 | unsigned int tid = threadIdx.x; 104 | //unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 105 | //boundary check 106 | if (tid >= n) return; 107 | //convert global data pointer to the 108 | int *idata = g_idata + blockIdx.x*blockDim.x; 109 | 110 | smem[tid]=idata[tid]; 111 | __syncthreads(); 112 | //in-place reduction in global memory 113 | if(blockDim.x>=1024 && tid <512) 114 | smem[tid]+=smem[tid+512]; 115 | __syncthreads(); 116 | if(blockDim.x>=512 && tid <256) 117 | smem[tid]+=smem[tid+256]; 118 | __syncthreads(); 119 | if(blockDim.x>=256 && tid <128) 120 | smem[tid]+=smem[tid+128]; 121 | __syncthreads(); 122 | if(blockDim.x>=128 && tid <64) 123 | smem[tid]+=smem[tid+64]; 124 | __syncthreads(); 125 | //write result for this block to global mem 126 | if(tid<32) 127 | { 128 | volatile int *vsmem = smem; 129 | vsmem[tid]+=vsmem[tid+32]; 130 | vsmem[tid]+=vsmem[tid+16]; 131 | vsmem[tid]+=vsmem[tid+8]; 132 | vsmem[tid]+=vsmem[tid+4]; 133 | vsmem[tid]+=vsmem[tid+2]; 134 | vsmem[tid]+=vsmem[tid+1]; 135 | 136 | } 137 | 138 | if (tid == 0) 139 | g_odata[blockIdx.x] = smem[0]; 140 | 141 | } 142 | __inline__ __device__ int warpReduce(int localSum) 143 | { 144 | localSum += __shfl_xor(localSum, 16); 145 | localSum += __shfl_xor(localSum, 8); 146 | localSum += __shfl_xor(localSum, 4); 147 | localSum += __shfl_xor(localSum, 2); 148 | localSum += __shfl_xor(localSum, 1); 149 | 150 | return localSum; 151 | } 152 | __global__ void reduceShfl(int * g_idata,int * g_odata,unsigned int n) 153 | { 154 | //set thread ID 155 | __shared__ int smem[DIM]; 156 | unsigned int idx = blockDim.x*blockIdx.x+threadIdx.x; 157 | //convert global data pointer to the 158 | 159 | int mySum=g_idata[idx]; 160 | int laneIdx=threadIdx.x%warpSize; 161 | int warpIdx=threadIdx.x/warpSize; 162 | 163 | mySum=warpReduce(mySum); 164 | 165 | if(laneIdx==0) 166 | smem[warpIdx]=mySum; 167 | __syncthreads(); 168 | mySum=(threadIdx.x 1) 189 | { 190 | blocksize = atoi(argv[1]); 191 | } 192 | dim3 block(blocksize, 1); 193 | dim3 grid((size - 1) / block.x + 1, 1); 194 | printf("grid %d block %d \n", grid.x, block.x); 195 | 196 | //allocate host memory 197 | size_t bytes = size * sizeof(int); 198 | int *idata_host = (int*)malloc(bytes); 199 | int *odata_host = (int*)malloc(grid.x * sizeof(int)); 200 | int * tmp = (int*)malloc(bytes); 201 | 202 | //initialize the array 203 | initialData_int(idata_host, size); 204 | 205 | memcpy(tmp, idata_host, bytes); 206 | double iStart, iElaps; 207 | int gpu_sum = 0; 208 | 209 | // device memory 210 | int * idata_dev = NULL; 211 | int * odata_dev = NULL; 212 | CHECK(cudaMalloc((void**)&idata_dev, bytes)); 213 | CHECK(cudaMalloc((void**)&odata_dev, grid.x * sizeof(int))); 214 | 
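// reduceShfl above does the first reduction stage entirely inside each warp:
// warpReduce() uses the pre-CUDA-9 __shfl_xor intrinsic as a butterfly
// exchange over offsets 16, 8, 4, 2 and 1, lane 0 of every warp then writes
// its partial sum into shared memory, and after a single __syncthreads() the
// per-warp partials are combined into the block's result. On CUDA 9 and newer
// the same pattern is written with __shfl_xor_sync and an explicit lane mask.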
215 | //cpu reduction 216 | int cpu_sum = 0; 217 | iStart = cpuSecond(); 218 | //cpu_sum = recursiveReduce(tmp, size); 219 | for (int i = 0; i < size; i++) 220 | cpu_sum += tmp[i]; 221 | iElaps = cpuSecond() - iStart; 222 | printf("cpu reduce elapsed %lf ms cpu_sum: %d\n", iElaps, cpu_sum); 223 | 224 | 225 | //kernel 1:warmup 226 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 227 | CHECK(cudaDeviceSynchronize()); 228 | iStart = cpuSecond(); 229 | warmup <<>>(idata_dev, odata_dev, size); 230 | cudaDeviceSynchronize(); 231 | iElaps = cpuSecond() - iStart; 232 | printf("gpu warmup elapsed %lf ms\n",iElaps); 233 | 234 | 235 | 236 | //reduceGmem 237 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 238 | CHECK(cudaDeviceSynchronize()); 239 | iStart = cpuSecond(); 240 | reduceGmem <<>>(idata_dev, odata_dev, size); 241 | cudaDeviceSynchronize(); 242 | iElaps = cpuSecond() - iStart; 243 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 244 | gpu_sum = 0; 245 | for (int i = 0; i < grid.x; i++) 246 | gpu_sum += odata_host[i]; 247 | printf("reduceGmem elapsed %lf ms gpu_sum: %d\n",iElaps, gpu_sum); 248 | 249 | //reduceSmem 250 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 251 | CHECK(cudaDeviceSynchronize()); 252 | iStart = cpuSecond(); 253 | reduceSmem <<>>(idata_dev, odata_dev, size); 254 | cudaDeviceSynchronize(); 255 | iElaps = cpuSecond() - iStart; 256 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 257 | gpu_sum = 0; 258 | for (int i = 0; i < grid.x; i++) 259 | gpu_sum += odata_host[i]; 260 | printf("reduceSmem elapsed %lf ms gpu_sum: %d\n",iElaps, gpu_sum); 261 | 262 | 263 | 264 | //reduceShfl 265 | CHECK(cudaMemcpy(idata_dev, idata_host, bytes, cudaMemcpyHostToDevice)); 266 | CHECK(cudaDeviceSynchronize()); 267 | iStart = cpuSecond(); 268 | reduceShfl<<>>(idata_dev, odata_dev, size); 269 | cudaDeviceSynchronize(); 270 | iElaps = cpuSecond() - iStart; 271 | cudaMemcpy(odata_host, odata_dev, grid.x * sizeof(int), cudaMemcpyDeviceToHost); 272 | gpu_sum = 0; 273 | for (int i = 0; i < grid.x; i++) 274 | gpu_sum += odata_host[i]; 275 | printf("reduceShfl elapsed %lf ms gpu_sum: %d\n",iElaps, gpu_sum); 276 | 277 | 278 | free(idata_host); 279 | free(odata_host); 280 | CHECK(cudaFree(idata_dev)); 281 | CHECK(cudaFree(odata_dev)); 282 | //reset device 283 | cudaDeviceReset(); 284 | return EXIT_SUCCESS; 285 | 286 | } 287 | -------------------------------------------------------------------------------- /2_grid_block/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(grid_block grid_block.cu) 2 | -------------------------------------------------------------------------------- /2_grid_block/grid_block.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | int main(int argc,char ** argv) 4 | { 5 | int nElem=1024; 6 | dim3 block(1024); 7 | dim3 grid((nElem-1)/block.x+1); 8 | printf("grid.x %d block.x %d\n",grid.x,block.x); 9 | 10 | block.x=512; 11 | grid.x=(nElem-1)/block.x+1; 12 | printf("grid.x %d block.x %d\n",grid.x,block.x); 13 | 14 | block.x=256; 15 | grid.x=(nElem-1)/block.x+1; 16 | printf("grid.x %d block.x %d\n",grid.x,block.x); 17 | 18 | block.x=128; 19 | grid.x=(nElem-1)/block.x+1; 20 | printf("grid.x %d block.x %d\n",grid.x,block.x); 21 | 22 | cudaDeviceReset(); 23 | return 0; 24 | } 25 | 
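The stream examples that follow (30_stream through 38_stream_call_back) all build on the same skeleton: create several non-default streams, launch independent work into them so the GPU is free to overlap it, and time the whole batch with a pair of CUDA events. The sketch below is a minimal, self-contained illustration of that skeleton rather than code taken from those files; busy_kernel, d_out and the stream count of 4 are placeholder names and values chosen for the example.

#include <stdio.h>
#include <cuda_runtime.h>

// Placeholder workload: loops long enough for any overlap to show up on a profiler timeline.
__global__ void busy_kernel(double *out)
{
    double sum = 0.0;
    for (int i = 0; i < 100000; i++)
        sum += sin((double)i);
    out[blockIdx.x] = sum;   // write the result so the loop is not optimized away
}

int main()
{
    const int n_stream = 4;                              // placeholder stream count
    cudaStream_t stream[n_stream];
    for (int i = 0; i < n_stream; i++)
        cudaStreamCreate(&stream[i]);

    double *d_out = NULL;
    cudaMalloc((void **)&d_out, n_stream * sizeof(double));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    for (int i = 0; i < n_stream; i++)
        busy_kernel<<<1, 1, 0, stream[i]>>>(d_out + i);  // one small kernel per stream, so they can overlap
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float elapsed_ms = 0.0f;
    cudaEventElapsedTime(&elapsed_ms, start, stop);
    printf("elapsed time: %f ms\n", elapsed_ms);

    for (int i = 0; i < n_stream; i++)
        cudaStreamDestroy(stream[i]);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_out);
    cudaDeviceReset();
    return 0;
}

Whether the launches really overlap depends on the device (Hyper-Q capable GPUs run them concurrently), which is exactly what the following examples are written to demonstrate on a profiler timeline.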
-------------------------------------------------------------------------------- /30_stream/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(stream stream.cu) 2 | -------------------------------------------------------------------------------- /30_stream/stream.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | #define N 300000 6 | __global__ void kernel_1() 7 | { 8 | double sum=0.0; 9 | for(int i=0;i>>(); 50 | kernel_2<<>>(); 51 | kernel_3<<>>(); 52 | kernel_4<<>>(); 53 | } 54 | cudaEventRecord(stop,0); 55 | CHECK(cudaEventSynchronize(stop)); 56 | float elapsed_time; 57 | cudaEventElapsedTime(&elapsed_time,start,stop); 58 | printf("elapsed time:%f ms\n",elapsed_time); 59 | 60 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #include 5 | #define N 300000 6 | __global__ void kernel_1() 7 | { 8 | double sum=0.0; 9 | for(int i=0;i>>(); 49 | kernel_2<<>>(); 50 | kernel_3<<>>(); 51 | kernel_4<<>>(); 52 | } 53 | cudaEventRecord(stop,0); 54 | CHECK(cudaEventSynchronize(stop)); 55 | float elapsed_time; 56 | cudaEventElapsedTime(&elapsed_time,start,stop); 57 | printf("elapsed time:%f ms\n",elapsed_time); 58 | 59 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N 100 5 | __global__ void kernel_1() 6 | { 7 | double sum=0.0; 8 | for(int i=0;i>>(); 47 | kernel_2<<>>(); 48 | kernel_3<<>>(); 49 | kernel_4<<>>(); 50 | } 51 | cudaEventRecord(stop); 52 | CHECK(cudaEventSynchronize(stop)); 53 | float elapsed_time; 54 | cudaEventElapsedTime(&elapsed_time,start,stop); 55 | printf("elapsed time:%f ms\n",elapsed_time); 56 | 57 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N 300000 5 | __global__ void kernel_1() 6 | { 7 | double sum=0.0; 8 | for(int i=0;i>>(); 46 | kernel_2<<>>(); 47 | kernel_3<<>>(); 48 | kernel_4<<>>(); 49 | } 50 | cudaEventRecord(stop); 51 | CHECK(cudaEventSynchronize(stop)); 52 | float elapsed_time; 53 | cudaEventElapsedTime(&elapsed_time,start,stop); 54 | 55 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N 300000 5 | __global__ void kernel_1() 6 | { 7 | double sum=0.0; 8 | for(int i=0;i>>(); 51 | kernel_2<<>>(); 52 | kernel_3<<>>(); 53 | kernel_4<<>>(); 54 | cudaEventRecord(event[i],stream[i]); 55 | cudaStreamWaitEvent(stream[n_stream-1],event[i],0); 56 | } 57 | cudaEventRecord(stop); 58 | CHECK(cudaEventSynchronize(stop)); 59 | float elapsed_time; 60 | cudaEventElapsedTime(&elapsed_time,start,stop); 61 | 62 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 4 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 76 | CHECK(cudaMemcpyAsync(&res_from_gpu_h[ioffset],&res_d[ioffset],nByte/N_SEGMENT,cudaMemcpyDeviceToHost,stream[i])); 77 | } 78 | //timer 79 | CHECK(cudaEventRecord(stop, 0)); 80 | CHECK(cudaEventSynchronize(stop)); 81 | iElaps=cpuSecond()-iStart; 82 | printf("Asynchronous Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 83 | checkResult(res_h,res_from_gpu_h,nElem); 84 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 4 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 80 | } 81 | for(int i=0;i>> Time elapsed %f 
sec\n",grid.x,block.x,iElaps); 91 | checkResult(res_h,res_from_gpu_h,nElem); 92 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 1 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 76 | CHECK(cudaMemcpyAsync(&res_from_gpu_h[ioffset],&res_d[ioffset],nByte/N_SEGMENT,cudaMemcpyDeviceToHost,stream[i])); 77 | } 78 | //timer 79 | CHECK(cudaEventRecord(stop, 0)); 80 | int counter=0; 81 | while (cudaEventQuery(stop)==cudaErrorNotReady) 82 | { 83 | counter++; 84 | } 85 | printf("cpu counter:%d\n",counter); 86 | iElaps=cpuSecond()-iStart; 87 | printf("Asynchronous Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 88 | checkResult(res_h,res_from_gpu_h,nElem); 89 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | #define N_REPEAT 10 5 | #define N_SEGMENT 16 6 | void CUDART_CB my_callback(cudaStream_t stream,cudaError_t status,void * data) 7 | { 8 | printf("call back from stream:%d\n",*((int *)data)); 9 | } 10 | void sumArrays(float * a,float * b,float * res,const int size) 11 | { 12 | for(int i=0;i>>(&a_d[ioffset],&b_d[ioffset],&res_d[ioffset],iElem); 79 | CHECK(cudaMemcpyAsync(&res_from_gpu_h[ioffset],&res_d[ioffset],nByte/N_SEGMENT,cudaMemcpyDeviceToHost,stream[i])); 80 | CHECK(cudaStreamAddCallback(stream[i],my_callback,(void *)(stream+i),0)); 81 | } 82 | //timer 83 | CHECK(cudaEventRecord(stop, 0)); 84 | int counter=0; 85 | while (cudaEventQuery(stop)==cudaErrorNotReady) 86 | { 87 | counter++; 88 | } 89 | printf("cpu counter:%d\n",counter); 90 | iElaps=cpuSecond()-iStart; 91 | printf("Asynchronous Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 92 | checkResult(res_h,res_from_gpu_h,nElem); 93 | for(int i=0;i 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | void sumArrays(float * a,float * b,float * res,const int size) 7 | { 8 | for(int i=0;i>>(a_d,b_d,res_d); 51 | printf("Execution configuration<<<%d,%d>>>\n",grid.x,block.x); 52 | 53 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 54 | sumArrays(a_h,b_h,res_h,nElem); 55 | 56 | checkResult(res_h,res_from_gpu_h,nElem); 57 | cudaFree(a_d); 58 | cudaFree(b_d); 59 | cudaFree(res_d); 60 | 61 | free(a_h); 62 | free(b_h); 63 | free(res_h); 64 | free(res_from_gpu_h); 65 | 66 | return 0; 67 | } 68 | -------------------------------------------------------------------------------- /4_sum_arrays_timer/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_arrays_timer sum_arrays_timer.cu) 2 | -------------------------------------------------------------------------------- /4_sum_arrays_timer/sum_arrays_timer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | 6 | 7 | void sumArrays(float * a,float * b,float * res,const int size) 8 | { 9 | for(int i=0;i>>(a_d,b_d,res_d,nElem); 56 | 57 | 58 | 59 | CHECK(cudaMemcpy(res_from_gpu_h,res_d,nByte,cudaMemcpyDeviceToHost)); 60 | iElaps=cpuSecond()-iStart; 61 | printf("Execution configuration<<<%d,%d>>> Time elapsed %f sec\n",grid.x,block.x,iElaps); 62 | sumArrays(a_h,b_h,res_h,nElem); 63 | 64 | checkResult(res_h,res_from_gpu_h,nElem); 65 | cudaFree(a_d); 66 | cudaFree(b_d); 67 | cudaFree(res_d); 68 | 69 | free(a_h); 70 | free(b_h); 71 | free(res_h); 72 | free(res_from_gpu_h); 73 | 74 | return 0; 75 | } 76 | 
-------------------------------------------------------------------------------- /5_thread_index/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(thread_index thread_index.cu) 2 | -------------------------------------------------------------------------------- /5_thread_index/thread_index.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | 5 | __global__ void printThreadIndex(float *A,const int nx,const int ny) 6 | { 7 | int ix=threadIdx.x+blockIdx.x*blockDim.x; 8 | int iy=threadIdx.y+blockIdx.y*blockDim.y; 9 | unsigned int idx=iy*nx+ix; 10 | printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)" 11 | "global index %2d ival %f\n",threadIdx.x,threadIdx.y, 12 | blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]); 13 | } 14 | int main(int argc,char** argv) 15 | { 16 | initDevice(0); 17 | int nx=8,ny=6; 18 | int nxy=nx*ny; 19 | int nBytes=nxy*sizeof(float); 20 | 21 | //Malloc 22 | float* A_host=(float*)malloc(nBytes); 23 | initialData(A_host,nxy); 24 | printMatrix(A_host,nx,ny); 25 | 26 | //cudaMalloc 27 | float *A_dev=NULL; 28 | CHECK(cudaMalloc((void**)&A_dev,nBytes)); 29 | 30 | cudaMemcpy(A_dev,A_host,nBytes,cudaMemcpyHostToDevice); 31 | 32 | dim3 block(4,2); 33 | dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1); 34 | 35 | printThreadIndex<<>>(A_dev,nx,ny); 36 | 37 | CHECK(cudaDeviceSynchronize()); 38 | cudaFree(A_dev); 39 | free(A_host); 40 | 41 | cudaDeviceReset(); 42 | return 0; 43 | } 44 | -------------------------------------------------------------------------------- /6_sum_matrix/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_matrix sum_matrix.cu) 2 | -------------------------------------------------------------------------------- /6_sum_matrix/sum_matrix.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 5 | { 6 | float * a=MatA; 7 | float * b=MatB; 8 | float * c=MatC; 9 | for(int j=0;j>>(A_dev,B_dev,C_dev,nx,ny); 75 | CHECK(cudaDeviceSynchronize()); 76 | iElaps=cpuSecond()-iStart; 77 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 78 | grid_0.x,grid_0.y,block_0.x,block_0.y,iElaps); 79 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 80 | checkResult(C_host,C_from_gpu,nxy); 81 | // 1d block and 1d grid 82 | dimx=32; 83 | dim3 block_1(dimx); 84 | dim3 grid_1((nxy-1)/block_1.x+1); 85 | iStart=cpuSecond(); 86 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx*ny ,1); 87 | CHECK(cudaDeviceSynchronize()); 88 | iElaps=cpuSecond()-iStart; 89 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 90 | grid_1.x,grid_1.y,block_1.x,block_1.y,iElaps); 91 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 92 | checkResult(C_host,C_from_gpu,nxy); 93 | // 2d block and 1d grid 94 | dimx=32; 95 | dim3 block_2(dimx); 96 | dim3 grid_2((nx-1)/block_2.x+1,ny); 97 | iStart=cpuSecond(); 98 | sumMatrix<<>>(A_dev,B_dev,C_dev,nx,ny); 99 | CHECK(cudaDeviceSynchronize()); 100 | iElaps=cpuSecond()-iStart; 101 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 102 | grid_2.x,grid_2.y,block_2.x,block_2.y,iElaps); 103 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 104 | 
checkResult(C_host,C_from_gpu,nxy); 105 | 106 | 107 | cudaFree(A_dev); 108 | cudaFree(B_dev); 109 | cudaFree(C_dev); 110 | free(A_host); 111 | free(B_host); 112 | free(C_host); 113 | free(C_from_gpu); 114 | cudaDeviceReset(); 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /7_device_information/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(device_information device_information.cu) 2 | -------------------------------------------------------------------------------- /7_device_information/device_information.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc,char** argv) 5 | { 6 | printf("%s Starting ...\n",argv[0]); 7 | int deviceCount = 0; 8 | cudaError_t error_id = cudaGetDeviceCount(&deviceCount); 9 | if(error_id!=cudaSuccess) 10 | { 11 | printf("cudaGetDeviceCount returned %d\n ->%s\n", 12 | (int)error_id,cudaGetErrorString(error_id)); 13 | printf("Result = FAIL\n"); 14 | exit(EXIT_FAILURE); 15 | } 16 | if(deviceCount==0) 17 | { 18 | printf("There are no available device(s) that support CUDA\n"); 19 | } 20 | else 21 | { 22 | printf("Detected %d CUDA Capable device(s)\n",deviceCount); 23 | } 24 | int dev=0,driverVersion=0,runtimeVersion=0; 25 | cudaSetDevice(dev); 26 | cudaDeviceProp deviceProp; 27 | cudaGetDeviceProperties(&deviceProp,dev); 28 | printf("Device %d:\"%s\"\n",dev,deviceProp.name); 29 | cudaDriverGetVersion(&driverVersion); 30 | cudaRuntimeGetVersion(&runtimeVersion); 31 | printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", 32 | driverVersion/1000,(driverVersion%100)/10, 33 | runtimeVersion/1000,(runtimeVersion%100)/10); 34 | printf(" CUDA Capability Major/Minor version number: %d.%d\n", 35 | deviceProp.major,deviceProp.minor); 36 | printf(" Total amount of global memory: %.2f GBytes (%llu bytes)\n", 37 | (float)deviceProp.totalGlobalMem/pow(1024.0,3),deviceProp.totalGlobalMem); 38 | printf(" GPU Clock rate: %.0f MHz (%0.2f GHz)\n", 39 | deviceProp.clockRate*1e-3f,deviceProp.clockRate*1e-6f); 40 | printf(" Memory Bus width: %d-bits\n", 41 | deviceProp.memoryBusWidth); 42 | if (deviceProp.l2CacheSize) 43 | { 44 | printf(" L2 Cache Size: %d bytes\n", 45 | deviceProp.l2CacheSize); 46 | } 47 | printf(" Max Texture Dimension Size (x,y,z) 1D=(%d),2D=(%d,%d),3D=(%d,%d,%d)\n", 48 | deviceProp.maxTexture1D,deviceProp.maxTexture2D[0],deviceProp.maxTexture2D[1] 49 | ,deviceProp.maxTexture3D[0],deviceProp.maxTexture3D[1],deviceProp.maxTexture3D[2]); 50 | printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d,2D=(%d,%d) x %d\n", 51 | deviceProp.maxTexture1DLayered[0],deviceProp.maxTexture1DLayered[1], 52 | deviceProp.maxTexture2DLayered[0],deviceProp.maxTexture2DLayered[1], 53 | deviceProp.maxTexture2DLayered[2]); 54 | printf(" Total amount of constant memory %lu bytes\n", 55 | deviceProp.totalConstMem); 56 | printf(" Total amount of shared memory per block: %lu bytes\n", 57 | deviceProp.sharedMemPerBlock); 58 | printf(" Total number of registers available per block:%d\n", 59 | deviceProp.regsPerBlock); 60 | printf(" Wrap size: %d\n",deviceProp.warpSize); 61 | printf(" Maximun number of thread per multiprocesser: %d\n", 62 | deviceProp.maxThreadsPerMultiProcessor); 63 | printf(" Maximun number of thread per block: %d\n", 64 | deviceProp.maxThreadsPerBlock); 65 | printf(" Maximun size of each dimension of a block: %d x %d x %d\n", 66 | 
deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]); 67 | printf(" Maximun size of each dimension of a grid: %d x %d x %d\n", 68 | deviceProp.maxGridSize[0], 69 | deviceProp.maxGridSize[1], 70 | deviceProp.maxGridSize[2]); 71 | printf(" Maximu memory pitch %lu bytes\n",deviceProp.memPitch); 72 | printf("----------------------------------------------------------\n"); 73 | printf("Number of multiprocessors: %d\n", deviceProp.multiProcessorCount); 74 | printf("Total amount of constant memory: %4.2f KB\n", 75 | deviceProp.totalConstMem/1024.0); 76 | printf("Total amount of shared memory per block: %4.2f KB\n", 77 | deviceProp.sharedMemPerBlock/1024.0); 78 | printf("Total number of registers available per block: %d\n", 79 | deviceProp.regsPerBlock); 80 | printf("Warp size %d\n", deviceProp.warpSize); 81 | printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock); 82 | printf("Maximum number of threads per multiprocessor: %d\n", 83 | deviceProp.maxThreadsPerMultiProcessor); 84 | printf("Maximum number of warps per multiprocessor: %d\n", 85 | deviceProp.maxThreadsPerMultiProcessor/32); 86 | return EXIT_SUCCESS; 87 | 88 | } 89 | -------------------------------------------------------------------------------- /8_divergence/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(divergence divergence.cu) 2 | -------------------------------------------------------------------------------- /8_divergence/divergence.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "freshman.h" 5 | __global__ void warmup(float *c) 6 | { 7 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 8 | float a = 0.0; 9 | float b = 0.0; 10 | 11 | if ((tid/warpSize) % 2 == 0) 12 | { 13 | a = 100.0f; 14 | 15 | } 16 | else 17 | { 18 | b = 200.0f; 19 | } 20 | //printf("%d %d %f \n",tid,warpSize,a+b); 21 | c[tid] = a + b; 22 | } 23 | __global__ void mathKernel1(float *c) 24 | { 25 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 26 | 27 | float a = 0.0; 28 | float b = 0.0; 29 | if (tid % 2 == 0) 30 | { 31 | a = 100.0f; 32 | } 33 | else 34 | { 35 | b = 200.0f; 36 | } 37 | c[tid] = a + b; 38 | } 39 | 40 | __global__ void mathKernel2(float *c) 41 | { 42 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 43 | float a = 0.0; 44 | float b = 0.0; 45 | if ((tid/warpSize) % 2 == 0) 46 | { 47 | a = 100.0f; 48 | } 49 | else 50 | { 51 | b = 200.0f; 52 | } 53 | c[tid] = a + b; 54 | } 55 | __global__ void mathKernel3(float *c) 56 | { 57 | int tid = blockIdx.x* blockDim.x + threadIdx.x; 58 | float a = 0.0; 59 | float b = 0.0; 60 | bool ipred = (tid % 2 == 0); 61 | if (ipred) 62 | { 63 | a = 100.0f; 64 | } 65 | else 66 | { 67 | b = 200.0f; 68 | } 69 | c[tid] = a + b; 70 | } 71 | 72 | int main(int argc, char **argv) 73 | { 74 | int dev = 0; 75 | cudaDeviceProp deviceProp; 76 | cudaGetDeviceProperties(&deviceProp, dev); 77 | printf("%s using Device %d: %s\n", argv[0], dev, deviceProp.name); 78 | 79 | //set up data size 80 | int size = 64; 81 | int blocksize = 64; 82 | if (argc > 1) blocksize = atoi(argv[1]); 83 | if (argc > 2) size = atoi(argv[2]); 84 | printf("Data size %d ", size); 85 | 86 | //set up execution configuration 87 | dim3 block(blocksize,1); 88 | dim3 grid((size - 1) / block.x + 1,1); 89 | printf("Execution Configure (block %d grid %d)\n", block.x, grid.x); 90 | 91 | //allocate gpu memory 92 | float * C_dev; 93 | size_t nBytes = size * 
sizeof(float); 94 | float * C_host=(float*)malloc(nBytes); 95 | cudaMalloc((float**)&C_dev, nBytes); 96 | 97 | //run a warmup kernel to remove overhead 98 | double iStart, iElaps; 99 | cudaDeviceSynchronize(); 100 | iStart = cpuSecond(); 101 | warmup<<>> (C_dev); 102 | cudaDeviceSynchronize(); 103 | iElaps = cpuSecond() - iStart; 104 | 105 | printf("warmup <<<%d,%d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 106 | 107 | //run kernel 1 108 | iStart = cpuSecond(); 109 | mathKernel1 <<< grid,block >>> (C_dev); 110 | cudaDeviceSynchronize(); 111 | iElaps = cpuSecond() - iStart; 112 | printf("mathKernel1<<<%4d,%4d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 113 | cudaMemcpy(C_host,C_dev,nBytes,cudaMemcpyDeviceToHost); 114 | //for(int i=0;i>> (C_dev); 121 | cudaDeviceSynchronize(); 122 | iElaps = cpuSecond() - iStart; 123 | printf("mathKernel2<<<%4d,%4d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 124 | 125 | //run kernel 3 126 | iStart = cpuSecond(); 127 | mathKernel3 << > > (C_dev); 128 | cudaDeviceSynchronize(); 129 | iElaps = cpuSecond() - iStart; 130 | printf("mathKernel3<<<%4d,%4d>>>elapsed %lf sec \n", grid.x, block.x, iElaps); 131 | 132 | cudaFree(C_dev); 133 | free(C_host); 134 | cudaDeviceReset(); 135 | return EXIT_SUCCESS; 136 | } 137 | -------------------------------------------------------------------------------- /9_sum_matrix2D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(sum_matrix2D sum_matrix2D.cu) 2 | -------------------------------------------------------------------------------- /9_sum_matrix2D/sum_matrix2D.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "freshman.h" 4 | void sumMatrix2D_CPU(float * MatA,float * MatB,float * MatC,int nx,int ny) 5 | { 6 | float * a=MatA; 7 | float * b=MatB; 8 | float * c=MatC; 9 | for(int j=0;j>>(A_dev,B_dev,C_dev,nx,ny); 75 | CHECK(cudaDeviceSynchronize()); 76 | iElaps=cpuSecond()-iStart; 77 | printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n", 78 | grid_0.x,grid_0.y,block_0.x,block_0.y,iElaps); 79 | CHECK(cudaMemcpy(C_from_gpu,C_dev,nBytes,cudaMemcpyDeviceToHost)); 80 | checkResult(C_host,C_from_gpu,nxy); 81 | 82 | cudaFree(A_dev); 83 | cudaFree(B_dev); 84 | cudaFree(C_dev); 85 | free(A_host); 86 | free(B_host); 87 | free(C_host); 88 | free(C_from_gpu); 89 | cudaDeviceReset(); 90 | return 0; 91 | } 92 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9 FATAL_ERROR) 2 | Project(CUDA_Freshman CXX C CUDA) 3 | set(CMAKE_CUDA_FLAGS "-arch=compute_35 -g -G -O3") 4 | include_directories(./include) 5 | add_subdirectory(0_hello_world) 6 | add_subdirectory(1_check_dimension) 7 | add_subdirectory(2_grid_block) 8 | add_subdirectory(3_sum_arrays) 9 | add_subdirectory(4_sum_arrays_timer) 10 | add_subdirectory(5_thread_index) 11 | add_subdirectory(6_sum_matrix) 12 | add_subdirectory(7_device_information) 13 | add_subdirectory(8_divergence) 14 | add_subdirectory(9_sum_matrix2D) 15 | add_subdirectory(10_reduceInteger) 16 | add_subdirectory(11_simple_sum_matrix2D) 17 | add_subdirectory(12_reduce_unrolling) 18 | add_subdirectory(14_global_variable) 19 | add_subdirectory(15_pine_memory) 20 | add_subdirectory(16_zero_copy_memory) 21 | add_subdirectory(17_UVA) 22 | add_subdirectory(18_sum_array_offset) 23 | 
add_subdirectory(19_AoS) 24 | add_subdirectory(20_SoA) 25 | add_subdirectory(21_sum_array_offset_unrolling) 26 | add_subdirectory(22_transform_matrix2D) 27 | add_subdirectory(23_sum_array_uniform_memory) 28 | add_subdirectory(24_shared_memory_read_data) 29 | add_subdirectory(25_reduce_integer_shared_memory) 30 | add_subdirectory(26_transform_shared_memory) 31 | add_subdirectory(27_stencil_1d_constant_read_only) 32 | add_subdirectory(28_shfl_test) 33 | add_subdirectory(29_reduce_shfl) 34 | add_subdirectory(30_stream) 35 | add_subdirectory(32_stream_resource) 36 | add_subdirectory(33_stream_block) 37 | add_subdirectory(34_stream_dependence) 38 | add_subdirectory(35_multi_add_depth) 39 | add_subdirectory(36_multi_add_breadth) 40 | add_subdirectory(37_asyncAPI) 41 | add_subdirectory(38_stream_call_back) 42 | -------------------------------------------------------------------------------- /IMG_9066.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Tony-Tan/CUDA_Freshman/979938216fbbd8bc81ccbc525c4dd1f8c0c9fcbb/IMG_9066.JPG -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 联系我 3 | ![](./IMG_9066.JPG) 4 | ## 博客 5 | 具体内容可以访问博客: 6 | - [0.0 腾讯云CUDA环境搭建](http://www.face2ai.com/CUDA-F-0-0-Tencent-GPU-Cloud/) 7 | - [1.0 并行计算与计算机架构](http://www.face2ai.com/CUDA-F-1-0-并行计算与计算机架构/) 8 | - [1.1 异构计算与CUDA](http://www.face2ai.com/CUDA-F-1-1-异构计算-CUDA/) 9 | - [2.0 CUDA编程模型概述(一)](http://www.face2ai.com/CUDA-F-2-0-CUDA编程模型概述1/) 10 | - [2.1 CUDA编程模型概述(二)](http://www.face2ai.com/CUDA-F-2-1-CUDA编程模型概述2/) 11 | - [2.2 给核函数计时](http://www.face2ai.com/CUDA-F-2-2-核函数计时/) 12 | - [2.3 组织并行线程](http://www.face2ai.com/CUDA-F-2-3-组织并行线程/) 13 | - [2.4 设备信息查询](http://www.face2ai.com/CUDA-F-2-4-设备信息/) 14 | - [3.1 CUDA执行模型概述](http://www.face2ai.com/CUDA-F-3-1-CUDA执行模型概述/) 15 | - [3.2 理解线程束执行的本质(Part I)](http://www.face2ai.com/CUDA-F-3-2-理解线程束执行的本质-P1/) 16 | - [3.2 理解线程束执行的本质(Part II)](http://www.face2ai.com/CUDA-F-3-2-理解线程束执行的本质-P2/) 17 | - [3.3 并行性表现](http://www.face2ai.com/CUDA-F-3-3-并行性表现/) 18 | - [3.4 避免分支分化](http://www.face2ai.com/CUDA-F-3-4-避免分支分化/) 19 | - [3.5 循环展开](http://www.face2ai.com/CUDA-F-3-5-展开循环/) 20 | - [3.6 动态并行](http://www.face2ai.com/CUDA-F-3-6-动态并行/) 21 | - [4.0 全局内存](http://www.face2ai.com/CUDA-F-4-0-全局内存/) 22 | - [4.1 内存模型概述](http://www.face2ai.com/CUDA-F-4-1-内存模型概述/) 23 | - [4.2 内存管理](http://www.face2ai.com/CUDA-F-4-2-内存管理/) 24 | - [4.3 内存访问模式](http://www.face2ai.com/CUDA-F-4-3-内存访问模式/) 25 | - [4.4 核函数可达到的带宽](http://www.face2ai.com/CUDA-F-4-4-核函数可达到的带宽/) 26 | - [4.5 使用统一内存的向量加法](http://www.face2ai.com/CUDA-F-4-5-使用统一内存的向量加法/) 27 | - [5.0 共享内存和常量内存](http://www.face2ai.com/CUDA-F-5-0-共享内存和常量内存/) 28 | - [5.1 CUDA共享内存概述](http://www.face2ai.com/CUDA-F-5-1-CUDA共享内存概述/) 29 | - [5.2 共享内存的数据布局](http://www.face2ai.com/CUDA-F-5-2-共享内存的数据布局/) 30 | - [5.3 减少全局内存访问](http://www.face2ai.com/CUDA-F-5-3-减少全局内存访问/) 31 | - [5.4 合并的全局内存访问](http://www.face2ai.com/CUDA-F-5-4-合并的全局内存访问/) 32 | - [5.5 常量内存](http://www.face2ai.com/CUDA-F-5-5-常量内存/) 33 | - [5.6 线程束洗牌指令](http://www.face2ai.com/CUDA-F-5-6-线程束洗牌指令/) 34 | - [6.0 流和并发](http://www.face2ai.com/CUDA-F-6-0-流和并发/) 35 | - [6.1 流和事件概述](http://www.face2ai.com/CUDA-F-6-1-流和事件概述/) 36 | - [6.2 并发内核执行](http://www.face2ai.com/CUDA-F-6-2-并发内核执行/) 37 | - [6.3 重叠内核执行和数据传输](http://www.face2ai.com/CUDA-F-6-3-重叠内核执行和数据传输/) 38 | - [6.4 
重叠GPU和CPU的执行](http://www.face2ai.com/CUDA-F-6-4-重叠GPU和CPU的执行/) 39 | - [6.5 流回调](http://www.face2ai.com/CUDA-F-6-5-流回调/) 40 | 41 | 42 | ## CUDA Freshman 43 | 1. This project is a set of CUDA programs 44 | 2. Some of them are from the book "Professional CUDA C Programming" 45 | 3. The others are coded by myself 46 | 4. You can get more details from the website [www.face2ai.com](http://www.face2ai.com) 47 | -------------------------------------------------------------------------------- /include/freshman.h: -------------------------------------------------------------------------------- 1 | #ifndef FRESHMAN_H 2 | #define FRESHMAN_H 3 | #define CHECK(call)\ 4 | {\ 5 | const cudaError_t error=call;\ 6 | if(error!=cudaSuccess)\ 7 | {\ 8 | printf("ERROR: %s:%d,",__FILE__,__LINE__);\ 9 | printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\ 10 | exit(1);\ 11 | }\ 12 | } 13 | 14 | 15 | #include 16 | #ifdef _WIN32 17 | # include 18 | #else 19 | # include 20 | #endif 21 | #ifdef _WIN32 22 | int gettimeofday(struct timeval *tp, void *tzp) 23 | { 24 | time_t clock; 25 | struct tm tm; 26 | SYSTEMTIME wtm; 27 | GetLocalTime(&wtm); 28 | tm.tm_year = wtm.wYear - 1900; 29 | tm.tm_mon = wtm.wMonth - 1; 30 | tm.tm_mday = wtm.wDay; 31 | tm.tm_hour = wtm.wHour; 32 | tm.tm_min = wtm.wMinute; 33 | tm.tm_sec = wtm.wSecond; 34 | tm. tm_isdst = -1; 35 | clock = mktime(&tm); 36 | tp->tv_sec = clock; 37 | tp->tv_usec = wtm.wMilliseconds * 1000; 38 | return (0); 39 | } 40 | #endif 41 | double cpuSecond() 42 | { 43 | struct timeval tp; 44 | gettimeofday(&tp,NULL); 45 | return((double)tp.tv_sec+(double)tp.tv_usec*1e-6); 46 | 47 | } 48 | void initialData(float* ip,int size) 49 | { 50 | time_t t; 51 | srand((unsigned )time(&t)); 52 | for(int i=0;i:\n",ny,nx); 70 | for(int i=0;iepsilon) 96 | { 97 | printf("Results don\'t match!\n"); 98 | printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n",hostRef[i],i,gpuRef[i],i); 99 | return; 100 | } 101 | } 102 | printf("Check result success!\n"); 103 | } 104 | #endif//FRESHMAN_H 105 | --------------------------------------------------------------------------------
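freshman.h above is included by nearly every example in this repository; the sketch below shows how its helpers CHECK, cpuSecond and initialData are typically combined, assuming the header is on the include path that the top-level CMakeLists.txt sets up with include_directories(./include). The scale kernel and the sizes are placeholders for the example.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include "freshman.h"

// Placeholder kernel: double every element of the array.
__global__ void scale(float *data, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        data[idx] *= 2.0f;
}

int main()
{
    int n = 1 << 20;
    size_t nBytes = n * sizeof(float);

    float *h_data = (float *)malloc(nBytes);
    initialData(h_data, n);                       // random initialization, as in the examples above

    float *d_data = NULL;
    CHECK(cudaMalloc((void **)&d_data, nBytes));  // CHECK prints file/line and exits on any CUDA error
    CHECK(cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice));

    double iStart = cpuSecond();                  // wall-clock seconds
    scale<<<(n - 1) / 256 + 1, 256>>>(d_data, n);
    CHECK(cudaDeviceSynchronize());
    printf("kernel elapsed %f sec\n", cpuSecond() - iStart);

    CHECK(cudaMemcpy(h_data, d_data, nBytes, cudaMemcpyDeviceToHost));

    CHECK(cudaFree(d_data));
    free(h_data);
    cudaDeviceReset();
    return 0;
}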