├── README.md ├── chapter01 ├── Makefile ├── hello └── hello.cu ├── chapter02 ├── Makefile ├── a.out ├── check ├── checkDeviceInfor.cu ├── checkDimension.cu ├── checkThreadIndex.cu ├── defineGridBlock.cu ├── out ├── out2 ├── sum ├── sumArraysOnGPU-small-case.cu ├── sumArraysOnGPU-timer.cu ├── sumArraysOnHost.c ├── sumMatrixOnGPU-1D-grid-1D-block.cu ├── sumMatrixOnGPU-2D-grid-1D-block.cu ├── sumMatrixOnGPU-2D-grid-2D-block.cu └── sumMatrixOnGPU.cu ├── chapter03 ├── Makefile ├── nestedHelloWorld.cu ├── nestedReduce.cu ├── nestedReduce2.cu ├── nestedReduceNosync.cu ├── reduceInteger.cu ├── simpleDeviceQuery.cu ├── simpleDivergence.cu └── sumMatrix.cu ├── chapter04 ├── Makefile ├── globalVariable.cu ├── memTransfer.cu ├── pinMemTransfer.cu ├── readSegment.cu ├── readSegmentUnroll.cu ├── simpleMathAoS.cu ├── simpleMathSoA.cu ├── sumArrayZerocpy.cu ├── sumMatrixGPUManaged.cu ├── sumMatrixGPUManual.cu ├── transpose.cu └── writeSegment.cu ├── chapter05 ├── Makefile ├── checkSmemRectangle.cu ├── checkSmemSquare.cu ├── constantReadOnly.cu ├── constantStencil.cu ├── reduceInteger.cu ├── reduceIntegerShfl.cu ├── simpleShfl.cu └── transposeRectangle.cu ├── chapter06 ├── Makefile ├── asyncAPI.cu ├── simpleCallback.cu ├── simpleHyperqBreadth.cu ├── simpleHyperqDependence.cu ├── simpleHyperqDepth.cu ├── simpleHyperqOpenmp.cu ├── simpleMultiAddBreadth.cu └── simpleMultiAddDepth.cu ├── chapter07 ├── Makefile ├── atomic-ordering.cu ├── floating-point-accuracy.cu ├── floating-point-perf.cu ├── fmad.cu ├── intrinsic-standard-comp.cu ├── my-atomic-add.cu └── nbody.cu ├── chapter08 ├── Makefile ├── cublas.cu ├── cuda-openacc.cu ├── cufft-multi.cu ├── cufft.cu ├── cusparse.cu ├── drop-in.c ├── rand-kernel.cu ├── replace-rand-streams.cu ├── replace-rand.cu ├── simple-data.c ├── simple-kernels.c └── simple-parallel.c ├── chapter09 ├── Makefile ├── simple2DFD.cu ├── simpleC2C.c ├── simpleMultiGPU.cu ├── simpleP2P.c ├── simpleP2P_CUDA_Aware.c └── simpleP2P_PingPong.cu └── chapter10 ├── 
crypt.c ├── crypt.config.cu ├── crypt.constant.cu ├── crypt.flexible.cu ├── crypt.legacy.cu ├── crypt.openmp.cu ├── crypt.overlap.cu ├── crypt.parallelized.cu ├── debug-hazards.cu ├── debug-segfault.cu ├── debug-segfault.fixed.cu └── generate_data.c /README.md: -------------------------------------------------------------------------------- 1 | # CUDA_C Code 2 | CUDA_C编程权威指南示例代码 3 | -------------------------------------------------------------------------------- /chapter01/Makefile: -------------------------------------------------------------------------------- 1 | APPS=hello 2 | 3 | all: ${APPS} 4 | 5 | %: %.cu 6 | nvcc -O2 -arch=sm_20 -o $@ $< 7 | clean: 8 | rm -f ${APPS} 9 | -------------------------------------------------------------------------------- /chapter01/hello: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter01/hello -------------------------------------------------------------------------------- /chapter01/hello.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | 4 | /* 5 | * A simple introduction to programming in CUDA. This program prints "Hello 6 | * World from GPU!" from 10 CUDA threads running on the GPU. 
7 | */ 8 | 9 | __global__ void helloFromGPU() 10 | { 11 | printf("Hello World from GPU!\n"); 12 | } 13 | 14 | int main(int argc, char **argv) 15 | { 16 | printf("Hello World from CPU!\n"); 17 | 18 | helloFromGPU<<<1, 10>>>(); 19 | CHECK(cudaDeviceReset()); 20 | return 0; 21 | } 22 | 23 | 24 | -------------------------------------------------------------------------------- /chapter02/Makefile: -------------------------------------------------------------------------------- 1 | CU_APPS=checkDeviceInfor checkThreadIndex sumArraysOnGPU-timer \ 2 | sumMatrixOnGPU-1D-grid-1D-block sumMatrixOnGPU-2D-grid-2D-block \ 3 | checkDimension defineGridBlock sumArraysOnGPU-small-case \ 4 | sumMatrixOnGPU-2D-grid-1D-block sumMatrixOnGPU 5 | C_APPS=sumArraysOnHost 6 | 7 | all: ${C_APPS} ${CU_APPS} 8 | 9 | %: %.cu 10 | nvcc -O2 -arch=sm_20 -o $@ $< 11 | %: %.c 12 | gcc -O2 -std=c99 -o $@ $< 13 | clean: 14 | rm -f ${CU_APPS} ${C_APPS} 15 | -------------------------------------------------------------------------------- /chapter02/a.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/a.out -------------------------------------------------------------------------------- /chapter02/check: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/check -------------------------------------------------------------------------------- /chapter02/checkDeviceInfor.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * Display a variety of information on the first CUDA device in this system, 7 | * including driver version, runtime version, compute capability, bytes of 8 | * global memory, etc. 
9 | */ 10 | 11 | int main(int argc, char **argv) 12 | { 13 | printf("%s Starting...\n", argv[0]); 14 | 15 | int deviceCount = 0; 16 | cudaGetDeviceCount(&deviceCount); 17 | 18 | if (deviceCount == 0) 19 | { 20 | printf("There are no available device(s) that support CUDA\n"); 21 | } 22 | else 23 | { 24 | printf("Detected %d CUDA Capable device(s)\n", deviceCount); 25 | } 26 | 27 | int dev = 0, driverVersion = 0, runtimeVersion = 0; 28 | CHECK(cudaSetDevice(dev)); 29 | cudaDeviceProp deviceProp; 30 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 31 | printf("Device %d: \"%s\"\n", dev, deviceProp.name); 32 | 33 | cudaDriverGetVersion(&driverVersion); 34 | cudaRuntimeGetVersion(&runtimeVersion); 35 | printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", 36 | driverVersion / 1000, (driverVersion % 100) / 10, 37 | runtimeVersion / 1000, (runtimeVersion % 100) / 10); 38 | printf(" CUDA Capability Major/Minor version number: %d.%d\n", 39 | deviceProp.major, deviceProp.minor); 40 | printf(" Total amount of global memory: %.2f GBytes (%llu " 41 | "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3), 42 | (unsigned long long)deviceProp.totalGlobalMem); 43 | printf(" GPU Clock rate: %.0f MHz (%0.2f " 44 | "GHz)\n", deviceProp.clockRate * 1e-3f, 45 | deviceProp.clockRate * 1e-6f); 46 | printf(" Memory Clock rate: %.0f Mhz\n", 47 | deviceProp.memoryClockRate * 1e-3f); 48 | printf(" Memory Bus Width: %d-bit\n", 49 | deviceProp.memoryBusWidth); 50 | 51 | if (deviceProp.l2CacheSize) 52 | { 53 | printf(" L2 Cache Size: %d bytes\n", 54 | deviceProp.l2CacheSize); 55 | } 56 | 57 | printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), " 58 | "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D, 59 | deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1], 60 | deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], 61 | deviceProp.maxTexture3D[2]); 62 | printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, " 63 | "2D=(%d,%d) x %d\n", 
deviceProp.maxTexture1DLayered[0], 64 | deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0], 65 | deviceProp.maxTexture2DLayered[1], 66 | deviceProp.maxTexture2DLayered[2]); 67 | printf(" Total amount of constant memory: %lu bytes\n", 68 | deviceProp.totalConstMem); 69 | printf(" Total amount of shared memory per block: %lu bytes\n", 70 | deviceProp.sharedMemPerBlock); 71 | printf(" Total number of registers available per block: %d\n", 72 | deviceProp.regsPerBlock); 73 | printf(" Warp size: %d\n", 74 | deviceProp.warpSize); 75 | printf(" Maximum number of threads per multiprocessor: %d\n", 76 | deviceProp.maxThreadsPerMultiProcessor); 77 | printf(" Maximum number of threads per block: %d\n", 78 | deviceProp.maxThreadsPerBlock); 79 | printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", 80 | deviceProp.maxThreadsDim[0], 81 | deviceProp.maxThreadsDim[1], 82 | deviceProp.maxThreadsDim[2]); 83 | printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", 84 | deviceProp.maxGridSize[0], 85 | deviceProp.maxGridSize[1], 86 | deviceProp.maxGridSize[2]); 87 | printf(" Maximum memory pitch: %lu bytes\n", 88 | deviceProp.memPitch); 89 | 90 | exit(EXIT_SUCCESS); 91 | } 92 | -------------------------------------------------------------------------------- /chapter02/checkDimension.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * Display the dimensionality of a thread block and grid from the host and 7 | * device. 
8 | */ 9 | 10 | __global__ void checkIndex(void) 11 | { 12 | printf("threadIdx:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z); 13 | printf("blockIdx:(%d, %d, %d)\n", blockIdx.x, blockIdx.y, blockIdx.z); 14 | 15 | printf("blockDim:(%d, %d, %d)\n", blockDim.x, blockDim.y, blockDim.z); 16 | printf("gridDim:(%d, %d, %d)\n", gridDim.x, gridDim.y, gridDim.z); 17 | 18 | } 19 | 20 | int main(int argc, char **argv) 21 | { 22 | // define total data element 23 | int nElem = 6; 24 | 25 | // define grid and block structure 26 | dim3 block(3); 27 | dim3 grid((nElem + block.x - 1) / block.x); 28 | 29 | // check grid and block dimension from host side 30 | printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z); 31 | printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z); 32 | 33 | // check grid and block dimension from device side 34 | checkIndex<<>>(); 35 | 36 | // reset device before you leave 37 | CHECK(cudaDeviceReset()); 38 | 39 | return(0); 40 | } 41 | -------------------------------------------------------------------------------- /chapter02/checkThreadIndex.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example helps to visualize the relationship between thread/block IDs and 7 | * offsets into data. For each CUDA thread, this example displays the 8 | * intra-block thread ID, the inter-block block ID, the global coordinate of a 9 | * thread, the calculated offset into input data, and the input data at that 10 | * offset. 
11 | */ 12 | 13 | void printMatrix(int *C, const int nx, const int ny) 14 | { 15 | int *ic = C; 16 | printf("\nMatrix: (%d.%d)\n", nx, ny); 17 | 18 | for (int iy = 0; iy < ny; iy++) 19 | { 20 | for (int ix = 0; ix < nx; ix++) 21 | { 22 | printf("%3d", ic[ix]); 23 | 24 | } 25 | 26 | ic += nx; 27 | printf("\n"); 28 | } 29 | 30 | printf("\n"); 31 | return; 32 | } 33 | 34 | __global__ void printThreadIndex(int *A, const int nx, const int ny) 35 | { 36 | int ix = threadIdx.x + blockIdx.x * blockDim.x; 37 | int iy = threadIdx.y + blockIdx.y * blockDim.y; 38 | unsigned int idx = iy * nx + ix; 39 | 40 | printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index" 41 | " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, 42 | ix, iy, idx, A[idx]); 43 | } 44 | 45 | int main(int argc, char **argv) 46 | { 47 | printf("%s Starting...\n", argv[0]); 48 | 49 | // get device information 50 | int dev = 0; 51 | cudaDeviceProp deviceProp; 52 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 53 | printf("Using Device %d: %s\n", dev, deviceProp.name); 54 | CHECK(cudaSetDevice(dev)); 55 | 56 | // set matrix dimension 57 | int nx = 8; 58 | int ny = 6; 59 | int nxy = nx * ny; 60 | int nBytes = nxy * sizeof(float); 61 | 62 | // malloc host memory 63 | int *h_A; 64 | h_A = (int *)malloc(nBytes); 65 | 66 | // iniitialize host matrix with integer 67 | for (int i = 0; i < nxy; i++) 68 | { 69 | h_A[i] = i; 70 | } 71 | printMatrix(h_A, nx, ny); 72 | 73 | // malloc device memory 74 | int *d_MatA; 75 | CHECK(cudaMalloc((void **)&d_MatA, nBytes)); 76 | 77 | // transfer data from host to device 78 | CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice)); 79 | 80 | // set up execution configuration 81 | dim3 block(4, 2); 82 | dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); 83 | 84 | // invoke the kernel 85 | printThreadIndex<<>>(d_MatA, nx, ny); 86 | CHECK(cudaGetLastError()); 87 | 88 | // free host and devide memory 89 | 
CHECK(cudaFree(d_MatA)); 90 | free(h_A); 91 | 92 | // reset device 93 | CHECK(cudaDeviceReset()); 94 | 95 | return (0); 96 | } 97 | -------------------------------------------------------------------------------- /chapter02/defineGridBlock.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * Demonstrate defining the dimensions of a block of threads and a grid of 7 | * blocks from the host. 8 | */ 9 | 10 | int main(int argc, char **argv) 11 | { 12 | // define total data element 13 | int nElem = 1024; 14 | 15 | // define grid and block structure 16 | dim3 block (1024); 17 | dim3 grid ((nElem + block.x - 1) / block.x); 18 | printf("grid.x %d block.x %d \n", grid.x, block.x); 19 | 20 | // reset block 21 | block.x = 512; 22 | grid.x = (nElem + block.x - 1) / block.x; 23 | printf("grid.x %d block.x %d \n", grid.x, block.x); 24 | 25 | // reset block 26 | block.x = 256; 27 | grid.x = (nElem + block.x - 1) / block.x; 28 | printf("grid.x %d block.x %d \n", grid.x, block.x); 29 | 30 | // reset block 31 | block.x = 128; 32 | grid.x = (nElem + block.x - 1) / block.x; 33 | printf("grid.x %d block.x %d \n", grid.x, block.x); 34 | 35 | // reset device before you leave 36 | CHECK(cudaDeviceReset()); 37 | 38 | return(0); 39 | } 40 | 41 | -------------------------------------------------------------------------------- /chapter02/out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/out -------------------------------------------------------------------------------- /chapter02/out2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/out2 
-------------------------------------------------------------------------------- /chapter02/sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/sum -------------------------------------------------------------------------------- /chapter02/sumArraysOnGPU-small-case.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example demonstrates a simple vector sum on the GPU and on the host. 7 | * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the 8 | * GPU. Only a single thread block is used in this small case, for simplicity. 9 | * sumArraysOnHost sequentially iterates through vector elements on the host. 10 | */ 11 | 12 | void checkResult(float *hostRef, float *gpuRef, const int N) 13 | { 14 | double epsilon = 1.0E-8; 15 | bool match = 1; 16 | 17 | for (int i = 0; i < N; i++) 18 | { 19 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 20 | { 21 | match = 0; 22 | printf("Arrays do not match!\n"); 23 | printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], 24 | gpuRef[i], i); 25 | break; 26 | } 27 | } 28 | 29 | if (match) printf("Arrays match.\n\n"); 30 | 31 | return; 32 | } 33 | 34 | 35 | void initialData(float *ip, int size) 36 | { 37 | // generate different seed for random number 38 | time_t t; 39 | srand((unsigned) time(&t)); 40 | 41 | for (int i = 0; i < size; i++) 42 | { 43 | ip[i] = (float)(rand() & 0xFF) / 10.0f; 44 | } 45 | 46 | return; 47 | } 48 | 49 | 50 | void sumArraysOnHost(float *A, float *B, float *C, const int N) 51 | { 52 | for (int idx = 0; idx < N; idx++) 53 | C[idx] = A[idx] + B[idx]; 54 | } 55 | 56 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N) 57 | { 58 | int i = threadIdx.x; 59 | 60 | if (i < N) C[i] = A[i] + B[i]; 61 | } 62 | 63 | 64 | int 
main(int argc, char **argv)
{
    /*
     * Host driver for the small-case vector sum.
     *
     * Allocates two input vectors and two result vectors on the host, fills
     * the inputs, copies them to the device, launches sumArraysOnGPU with a
     * single block of nElem threads, copies the result back and compares it
     * against the host reference computed by sumArraysOnHost.
     *
     * Returns 0 on success and 1 if a host allocation fails; CUDA API
     * failures are handled by the CHECK macro.
     */
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // set up data size of vectors
    int nElem = 1 << 5;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A     = (float *)malloc(nBytes);
    h_B     = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef  = (float *)malloc(nBytes);

    // memset()/initialData() below dereference these pointers, so bail out
    // early instead of writing through NULL (free(NULL) is a no-op)
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef,  0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side: one block holding one thread per element
    dim3 block (nElem);
    dim3 grid  (1);

    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
    // a kernel launch returns no status itself; an invalid configuration is
    // only reported through cudaGetLastError()
    CHECK(cudaGetLastError());
    printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);

    // copy kernel result back to host side (blocking copy, so it also waits
    // for the kernel to finish)
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    CHECK(cudaDeviceReset());
    return(0);
}
-------------------------------------------------------------------------------- /chapter02/sumArraysOnGPU-timer.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example demonstrates a simple vector sum on the GPU and on the host. 7 | * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the 8 | * GPU. Only a single thread block is used in this small case, for simplicity. 9 | * sumArraysOnHost sequentially iterates through vector elements on the host. 10 | * This version of sumArrays adds host timers to measure GPU and CPU 11 | * performance. 12 | */ 13 | 14 | void checkResult(float *hostRef, float *gpuRef, const int N) 15 | { 16 | double epsilon = 1.0E-8; 17 | bool match = 1; 18 | 19 | for (int i = 0; i < N; i++) 20 | { 21 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 22 | { 23 | match = 0; 24 | printf("Arrays do not match!\n"); 25 | printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], 26 | gpuRef[i], i); 27 | break; 28 | } 29 | } 30 | 31 | if (match) printf("Arrays match.\n\n"); 32 | 33 | return; 34 | } 35 | 36 | void initialData(float *ip, int size) 37 | { 38 | // generate different seed for random number 39 | time_t t; 40 | srand((unsigned) time(&t)); 41 | 42 | for (int i = 0; i < size; i++) 43 | { 44 | ip[i] = (float)( rand() & 0xFF ) / 10.0f; 45 | } 46 | 47 | return; 48 | } 49 | 50 | void sumArraysOnHost(float *A, float *B, float *C, const int N) 51 | { 52 | for (int idx = 0; idx < N; idx++) 53 | { 54 | C[idx] = A[idx] + B[idx]; 55 | } 56 | } 57 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N) 58 | { 59 | int i = blockIdx.x * blockDim.x + threadIdx.x; 60 | 61 | if (i < N) C[i] = A[i] + B[i]; 62 | } 63 | 64 | int main(int argc, char **argv) 65 | { 66 | printf("%s Starting...\n", argv[0]); 67 | 68 | // set up device 69 | int dev = 0; 70 | cudaDeviceProp deviceProp; 71 | 
CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 72 | printf("Using Device %d: %s\n", dev, deviceProp.name); 73 | CHECK(cudaSetDevice(dev)); 74 | 75 | // set up data size of vectors 76 | int nElem = 1 << 24; 77 | printf("Vector size %d\n", nElem); 78 | 79 | // malloc host memory 80 | size_t nBytes = nElem * sizeof(float); 81 | 82 | float *h_A, *h_B, *hostRef, *gpuRef; 83 | h_A = (float *)malloc(nBytes); 84 | h_B = (float *)malloc(nBytes); 85 | hostRef = (float *)malloc(nBytes); 86 | gpuRef = (float *)malloc(nBytes); 87 | 88 | double iStart, iElaps; 89 | 90 | // initialize data at host side 91 | iStart = seconds(); 92 | initialData(h_A, nElem); 93 | initialData(h_B, nElem); 94 | iElaps = seconds() - iStart; 95 | printf("initialData Time elapsed %f sec\n", iElaps); 96 | memset(hostRef, 0, nBytes); 97 | memset(gpuRef, 0, nBytes); 98 | 99 | // add vector at host side for result checks 100 | iStart = seconds(); 101 | sumArraysOnHost(h_A, h_B, hostRef, nElem); 102 | iElaps = seconds() - iStart; 103 | printf("sumArraysOnHost Time elapsed %f sec\n", iElaps); 104 | 105 | // malloc device global memory 106 | float *d_A, *d_B, *d_C; 107 | CHECK(cudaMalloc((float**)&d_A, nBytes)); 108 | CHECK(cudaMalloc((float**)&d_B, nBytes)); 109 | CHECK(cudaMalloc((float**)&d_C, nBytes)); 110 | 111 | // transfer data from host to device 112 | CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); 113 | CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice)); 114 | CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice)); 115 | 116 | // invoke kernel at host side 117 | int iLen = 512; 118 | dim3 block (iLen); 119 | dim3 grid ((nElem + block.x - 1) / block.x); 120 | 121 | iStart = seconds(); 122 | sumArraysOnGPU<<>>(d_A, d_B, d_C, nElem); 123 | CHECK(cudaDeviceSynchronize()); 124 | iElaps = seconds() - iStart; 125 | printf("sumArraysOnGPU <<< %d, %d >>> Time elapsed %f sec\n", grid.x, 126 | block.x, iElaps); 127 | 128 | // check kernel error 129 | CHECK(cudaGetLastError()) ; 
130 | 131 | // copy kernel result back to host side 132 | CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 133 | 134 | // check device results 135 | checkResult(hostRef, gpuRef, nElem); 136 | 137 | // free device global memory 138 | CHECK(cudaFree(d_A)); 139 | CHECK(cudaFree(d_B)); 140 | CHECK(cudaFree(d_C)); 141 | 142 | // free host memory 143 | free(h_A); 144 | free(h_B); 145 | free(hostRef); 146 | free(gpuRef); 147 | 148 | return(0); 149 | } 150 | -------------------------------------------------------------------------------- /chapter02/sumArraysOnHost.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* 5 | * This example demonstrates a simple vector sum on the host. sumArraysOnHost 6 | * sequentially iterates through vector elements on the host. 7 | */ 8 | 9 | void sumArraysOnHost(float *A, float *B, float *C, const int N) 10 | { 11 | for (int idx = 0; idx < N; idx++) 12 | { 13 | C[idx] = A[idx] + B[idx]; 14 | } 15 | 16 | } 17 | 18 | void initialData(float *ip, int size) 19 | { 20 | // generate different seed for random number 21 | time_t t; 22 | srand((unsigned) time(&t)); 23 | 24 | for (int i = 0; i < size; i++) 25 | { 26 | ip[i] = (float)(rand() & 0xFF) / 10.0f; 27 | } 28 | 29 | return; 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | int nElem = 1024; 35 | size_t nBytes = nElem * sizeof(float); 36 | 37 | float *h_A, *h_B, *h_C; 38 | h_A = (float *)malloc(nBytes); 39 | h_B = (float *)malloc(nBytes); 40 | h_C = (float *)malloc(nBytes); 41 | 42 | initialData(h_A, nElem); 43 | initialData(h_B, nElem); 44 | 45 | sumArraysOnHost(h_A, h_B, h_C, nElem); 46 | 47 | free(h_A); 48 | free(h_B); 49 | free(h_C); 50 | 51 | return(0); 52 | } 53 | -------------------------------------------------------------------------------- /chapter02/sumMatrixOnGPU-1D-grid-1D-block.cu: -------------------------------------------------------------------------------- 1 | #include 
"../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example demonstrates a simple vector sum on the GPU and on the host. 7 | * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the 8 | * GPU. A 1D thread block and 1D grid are used. sumArraysOnHost sequentially 9 | * iterates through vector elements on the host. 10 | */ 11 | 12 | void initialData(float *ip, const int size) 13 | { 14 | int i; 15 | 16 | for(i = 0; i < size; i++) 17 | { 18 | ip[i] = (float)(rand() & 0xFF ) / 10.0f; 19 | } 20 | 21 | return; 22 | } 23 | 24 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, 25 | const int ny) 26 | { 27 | float *ia = A; 28 | float *ib = B; 29 | float *ic = C; 30 | 31 | for (int iy = 0; iy < ny; iy++) 32 | { 33 | for (int ix = 0; ix < nx; ix++) 34 | { 35 | ic[ix] = ia[ix] + ib[ix]; 36 | 37 | } 38 | 39 | ia += nx; 40 | ib += nx; 41 | ic += nx; 42 | } 43 | 44 | return; 45 | } 46 | 47 | 48 | void checkResult(float *hostRef, float *gpuRef, const int N) 49 | { 50 | double epsilon = 1.0E-8; 51 | bool match = 1; 52 | 53 | for (int i = 0; i < N; i++) 54 | { 55 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 56 | { 57 | match = 0; 58 | printf("host %f gpu %f\n", hostRef[i], gpuRef[i]); 59 | break; 60 | } 61 | } 62 | 63 | if (match) 64 | printf("Arrays match.\n\n"); 65 | else 66 | printf("Arrays do not match.\n\n"); 67 | } 68 | 69 | // grid 1D block 1D 70 | __global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx, 71 | int ny) 72 | { 73 | unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x; 74 | 75 | if (ix < nx ) 76 | for (int iy = 0; iy < ny; iy++) 77 | { 78 | int idx = iy * nx + ix; 79 | MatC[idx] = MatA[idx] + MatB[idx]; 80 | } 81 | 82 | 83 | } 84 | 85 | int main(int argc, char **argv) 86 | { 87 | printf("%s Starting...\n", argv[0]); 88 | 89 | // set up device 90 | int dev = 0; 91 | cudaDeviceProp deviceProp; 92 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 93 | printf("Using Device %d: %s\n", 
dev, deviceProp.name); 94 | CHECK(cudaSetDevice(dev)); 95 | 96 | // set up data size of matrix 97 | int nx = 1 << 14; 98 | int ny = 1 << 14; 99 | 100 | int nxy = nx * ny; 101 | int nBytes = nxy * sizeof(float); 102 | printf("Matrix size: nx %d ny %d\n", nx, ny); 103 | 104 | // malloc host memory 105 | float *h_A, *h_B, *hostRef, *gpuRef; 106 | h_A = (float *)malloc(nBytes); 107 | h_B = (float *)malloc(nBytes); 108 | hostRef = (float *)malloc(nBytes); 109 | gpuRef = (float *)malloc(nBytes); 110 | 111 | // initialize data at host side 112 | double iStart = seconds(); 113 | initialData(h_A, nxy); 114 | initialData(h_B, nxy); 115 | double iElaps = seconds() - iStart; 116 | printf("initialize matrix elapsed %f sec\n", iElaps); 117 | 118 | memset(hostRef, 0, nBytes); 119 | memset(gpuRef, 0, nBytes); 120 | 121 | // add matrix at host side for result checks 122 | iStart = seconds(); 123 | sumMatrixOnHost(h_A, h_B, hostRef, nx, ny); 124 | iElaps = seconds() - iStart; 125 | printf("sumMatrixOnHost elapsed %f sec\n", iElaps); 126 | 127 | // malloc device global memory 128 | float *d_MatA, *d_MatB, *d_MatC; 129 | CHECK(cudaMalloc((void **)&d_MatA, nBytes)); 130 | CHECK(cudaMalloc((void **)&d_MatB, nBytes)); 131 | CHECK(cudaMalloc((void **)&d_MatC, nBytes)); 132 | 133 | // transfer data from host to device 134 | CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice)); 135 | CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice)); 136 | 137 | // invoke kernel at host side 138 | int dimx = 32; 139 | dim3 block(dimx, 1); 140 | dim3 grid((nx + block.x - 1) / block.x, 1); 141 | 142 | iStart = seconds(); 143 | sumMatrixOnGPU1D<<>>(d_MatA, d_MatB, d_MatC, nx, ny); 144 | CHECK(cudaDeviceSynchronize()); 145 | iElaps = seconds() - iStart; 146 | printf("sumMatrixOnGPU1D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x, 147 | grid.y, 148 | block.x, block.y, iElaps); 149 | 150 | // check kernel error 151 | CHECK(cudaGetLastError()); 152 | 153 | // copy kernel result back to 
host side 154 | CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost)); 155 | 156 | // check device results 157 | checkResult(hostRef, gpuRef, nxy); 158 | 159 | // free device global memory 160 | CHECK(cudaFree(d_MatA)); 161 | CHECK(cudaFree(d_MatB)); 162 | CHECK(cudaFree(d_MatC)); 163 | 164 | // free host memory 165 | free(h_A); 166 | free(h_B); 167 | free(hostRef); 168 | free(gpuRef); 169 | 170 | // reset device 171 | CHECK(cudaDeviceReset()); 172 | 173 | return (0); 174 | } 175 | -------------------------------------------------------------------------------- /chapter02/sumMatrixOnGPU-2D-grid-1D-block.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example demonstrates a simple vector sum on the GPU and on the host. 7 | * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the 8 | * GPU. A 1D thread block and 2D grid are used. sumArraysOnHost sequentially 9 | * iterates through vector elements on the host. 
10 | */ 11 | 12 | void initialData(float *ip, const int size) 13 | { 14 | int i; 15 | 16 | for(i = 0; i < size; i++) 17 | { 18 | ip[i] = (float)(rand() & 0xFF) / 10.0f; 19 | } 20 | 21 | return; 22 | } 23 | 24 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, 25 | const int ny) 26 | { 27 | float *ia = A; 28 | float *ib = B; 29 | float *ic = C; 30 | 31 | for (int iy = 0; iy < ny; iy++) 32 | { 33 | for (int ix = 0; ix < nx; ix++) 34 | { 35 | ic[ix] = ia[ix] + ib[ix]; 36 | 37 | } 38 | 39 | ia += nx; 40 | ib += nx; 41 | ic += nx; 42 | } 43 | 44 | return; 45 | } 46 | 47 | 48 | void checkResult(float *hostRef, float *gpuRef, const int N) 49 | { 50 | double epsilon = 1.0E-8; 51 | bool match = 1; 52 | 53 | for (int i = 0; i < N; i++) 54 | { 55 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 56 | { 57 | match = 0; 58 | printf("host %f gpu %f\n", hostRef[i], gpuRef[i]); 59 | break; 60 | } 61 | } 62 | 63 | if (match) 64 | printf("Arrays match.\n\n"); 65 | else 66 | printf("Arrays do not match.\n\n"); 67 | } 68 | 69 | // grid 2D block 1D 70 | __global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx, 71 | int ny) 72 | { 73 | unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x; 74 | unsigned int iy = blockIdx.y; 75 | unsigned int idx = iy * nx + ix; 76 | 77 | if (ix < nx && iy < ny) 78 | MatC[idx] = MatA[idx] + MatB[idx]; 79 | } 80 | 81 | int main(int argc, char **argv) 82 | { 83 | printf("%s Starting...\n", argv[0]); 84 | 85 | // set up device 86 | int dev = 0; 87 | cudaDeviceProp deviceProp; 88 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 89 | printf("Using Device %d: %s\n", dev, deviceProp.name); 90 | CHECK(cudaSetDevice(dev)); 91 | 92 | // set up data size of matrix 93 | int nx = 1 << 14; 94 | int ny = 1 << 14; 95 | 96 | int nxy = nx * ny; 97 | int nBytes = nxy * sizeof(float); 98 | printf("Matrix size: nx %d ny %d\n", nx, ny); 99 | 100 | // malloc host memory 101 | float *h_A, *h_B, *hostRef, *gpuRef; 102 | h_A = (float 
*)malloc(nBytes); 103 | h_B = (float *)malloc(nBytes); 104 | hostRef = (float *)malloc(nBytes); 105 | gpuRef = (float *)malloc(nBytes); 106 | 107 | // initialize data at host side 108 | double iStart = seconds(); 109 | initialData(h_A, nxy); 110 | initialData(h_B, nxy); 111 | double iElaps = seconds() - iStart; 112 | printf("Matrix initialization elapsed %f sec\n", iElaps); 113 | 114 | memset(hostRef, 0, nBytes); 115 | memset(gpuRef, 0, nBytes); 116 | 117 | // add matrix at host side for result checks 118 | iStart = seconds(); 119 | sumMatrixOnHost(h_A, h_B, hostRef, nx, ny); 120 | iElaps = seconds() - iStart; 121 | printf("sumMatrixOnHost elapsed %f sec\n", iElaps); 122 | 123 | // malloc device global memory 124 | float *d_MatA, *d_MatB, *d_MatC; 125 | CHECK(cudaMalloc((void **)&d_MatA, nBytes)); 126 | CHECK(cudaMalloc((void **)&d_MatB, nBytes)); 127 | CHECK(cudaMalloc((void **)&d_MatC, nBytes)); 128 | 129 | // transfer data from host to device 130 | CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice)); 131 | CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice)); 132 | 133 | // invoke kernel at host side 134 | int dimx = 32; 135 | dim3 block(dimx, 1); 136 | dim3 grid((nx + block.x - 1) / block.x, ny); 137 | 138 | iStart = seconds(); 139 | sumMatrixOnGPUMix<<>>(d_MatA, d_MatB, d_MatC, nx, ny); 140 | CHECK(cudaDeviceSynchronize()); 141 | iElaps = seconds() - iStart; 142 | printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x, 143 | grid.y, 144 | block.x, block.y, iElaps); 145 | // check kernel error 146 | CHECK(cudaGetLastError()); 147 | 148 | // copy kernel result back to host side 149 | CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost)); 150 | 151 | // check device results 152 | checkResult(hostRef, gpuRef, nxy); 153 | 154 | // free device global memory 155 | CHECK(cudaFree(d_MatA)); 156 | CHECK(cudaFree(d_MatB)); 157 | CHECK(cudaFree(d_MatC)); 158 | 159 | // free host memory 160 | free(h_A); 161 | free(h_B); 
162 | free(hostRef); 163 | free(gpuRef); 164 | 165 | // reset device 166 | CHECK(cudaDeviceReset()); 167 | 168 | return (0); 169 | } 170 | -------------------------------------------------------------------------------- /chapter02/sumMatrixOnGPU-2D-grid-2D-block.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example demonstrates a simple vector sum on the GPU and on the host. 7 | * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the 8 | * GPU. A 2D thread block and 2D grid are used. sumArraysOnHost sequentially 9 | * iterates through vector elements on the host. 10 | */ 11 | 12 | void initialData(float *ip, const int size) 13 | { 14 | int i; 15 | 16 | for(i = 0; i < size; i++) 17 | { 18 | ip[i] = (float)(rand() & 0xFF) / 10.0f; 19 | } 20 | 21 | return; 22 | } 23 | 24 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, 25 | const int ny) 26 | { 27 | float *ia = A; 28 | float *ib = B; 29 | float *ic = C; 30 | 31 | for (int iy = 0; iy < ny; iy++) 32 | { 33 | for (int ix = 0; ix < nx; ix++) 34 | { 35 | ic[ix] = ia[ix] + ib[ix]; 36 | 37 | } 38 | 39 | ia += nx; 40 | ib += nx; 41 | ic += nx; 42 | } 43 | 44 | return; 45 | } 46 | 47 | 48 | void checkResult(float *hostRef, float *gpuRef, const int N) 49 | { 50 | double epsilon = 1.0E-8; 51 | bool match = 1; 52 | 53 | for (int i = 0; i < N; i++) 54 | { 55 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 56 | { 57 | match = 0; 58 | printf("host %f gpu %f\n", hostRef[i], gpuRef[i]); 59 | break; 60 | } 61 | } 62 | 63 | if (match) 64 | printf("Arrays match.\n\n"); 65 | else 66 | printf("Arrays do not match.\n\n"); 67 | } 68 | 69 | // grid 2D block 2D 70 | __global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, 71 | int ny) 72 | { 73 | unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x; 74 | unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y; 75 
| unsigned int idx = iy * nx + ix; 76 | 77 | if (ix < nx && iy < ny) 78 | MatC[idx] = MatA[idx] + MatB[idx]; 79 | } 80 | 81 | int main(int argc, char **argv) 82 | { 83 | printf("%s Starting...\n", argv[0]); 84 | 85 | // set up device 86 | int dev = 0; 87 | cudaDeviceProp deviceProp; 88 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 89 | printf("Using Device %d: %s\n", dev, deviceProp.name); 90 | CHECK(cudaSetDevice(dev)); 91 | 92 | // set up data size of matrix 93 | int nx = 1 << 14; 94 | int ny = 1 << 14; 95 | 96 | int nxy = nx * ny; 97 | int nBytes = nxy * sizeof(float); 98 | printf("Matrix size: nx %d ny %d\n", nx, ny); 99 | 100 | // malloc host memory 101 | float *h_A, *h_B, *hostRef, *gpuRef; 102 | h_A = (float *)malloc(nBytes); 103 | h_B = (float *)malloc(nBytes); 104 | hostRef = (float *)malloc(nBytes); 105 | gpuRef = (float *)malloc(nBytes); 106 | 107 | // initialize data at host side 108 | double iStart = seconds(); 109 | initialData(h_A, nxy); 110 | initialData(h_B, nxy); 111 | double iElaps = seconds() - iStart; 112 | printf("Matrix initialization elapsed %f sec\n", iElaps); 113 | 114 | memset(hostRef, 0, nBytes); 115 | memset(gpuRef, 0, nBytes); 116 | 117 | // add matrix at host side for result checks 118 | iStart = seconds(); 119 | sumMatrixOnHost(h_A, h_B, hostRef, nx, ny); 120 | iElaps = seconds() - iStart; 121 | printf("sumMatrixOnHost elapsed %f sec\n", iElaps); 122 | 123 | // malloc device global memory 124 | float *d_MatA, *d_MatB, *d_MatC; 125 | CHECK(cudaMalloc((void **)&d_MatA, nBytes)); 126 | CHECK(cudaMalloc((void **)&d_MatB, nBytes)); 127 | CHECK(cudaMalloc((void **)&d_MatC, nBytes)); 128 | 129 | // transfer data from host to device 130 | CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice)); 131 | CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice)); 132 | 133 | // invoke kernel at host side 134 | int dimx = 32; 135 | int dimy = 32; 136 | dim3 block(dimx, dimy); 137 | dim3 grid((nx + block.x - 1) / block.x, (ny + 
block.y - 1) / block.y); 138 | 139 | iStart = seconds(); 140 | sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny); 141 | CHECK(cudaDeviceSynchronize()); 142 | iElaps = seconds() - iStart; 143 | printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x, 144 | grid.y, 145 | block.x, block.y, iElaps); 146 | // check kernel error 147 | CHECK(cudaGetLastError()); 148 | 149 | // copy kernel result back to host side 150 | CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost)); 151 | 152 | // check device results 153 | checkResult(hostRef, gpuRef, nxy); 154 | 155 | // free device global memory 156 | CHECK(cudaFree(d_MatA)); 157 | CHECK(cudaFree(d_MatB)); 158 | CHECK(cudaFree(d_MatC)); 159 | 160 | // free host memory 161 | free(h_A); 162 | free(h_B); 163 | free(hostRef); 164 | free(gpuRef); 165 | 166 | // reset device 167 | CHECK(cudaDeviceReset()); 168 | 169 | return (0); 170 | } 171 | -------------------------------------------------------------------------------- /chapter03/Makefile: -------------------------------------------------------------------------------- 1 | CU_APPS=nestedHelloWorld nestedReduce nestedReduce2 nestedReduceNosync \ 2 | reduceInteger simpleDeviceQuery simpleDivergence sumMatrix 3 | C_APPS= 4 | 5 | all: ${C_APPS} ${CU_APPS} 6 | 7 | %: %.cu 8 | nvcc -O2 -arch=sm_35 -o $@ $< -lcudadevrt --relocatable-device-code true 9 | %: %.c 10 | gcc -O2 -std=c99 -o $@ $< 11 | clean: 12 | rm -f ${CU_APPS} ${C_APPS} 13 | -------------------------------------------------------------------------------- /chapter03/nestedHelloWorld.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * A simple example of nested kernel launches from the GPU. Each thread displays 7 | * its information when execution begins, and also diagnostics when the next 8 | * lowest nesting layer completes. 
9 | */ 10 | 11 | __global__ void nestedHelloWorld(int const iSize, int iDepth) 12 | { 13 | int tid = threadIdx.x; 14 | printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, tid, 15 | blockIdx.x); 16 | 17 | // condition to stop recursive execution 18 | if (iSize == 1) return; 19 | 20 | // reduce block size to half 21 | int nthreads = iSize >> 1; 22 | 23 | // thread 0 launches child grid recursively 24 | if(tid == 0 && nthreads > 0) 25 | { 26 | nestedHelloWorld<<<1, nthreads>>>(nthreads, ++iDepth); 27 | printf("-------> nested execution depth: %d\n", iDepth); 28 | } 29 | } 30 | 31 | int main(int argc, char **argv) 32 | { 33 | int size = 8; 34 | int blocksize = 8; // initial block size 35 | int igrid = 1; 36 | 37 | if(argc > 1) 38 | { 39 | igrid = atoi(argv[1]); // optional first argument: number of blocks in the parent grid 40 | size = igrid * blocksize; 41 | } 42 | 43 | dim3 block (blocksize, 1); 44 | dim3 grid ((size + block.x - 1) / block.x, 1); 45 | printf("%s Execution Configuration: grid %d block %d\n", argv[0], grid.x, 46 | block.x); 47 | 48 | nestedHelloWorld<<<grid, block>>>(block.x, 0); // parent launch: recursion depth starts at 0 49 | 50 | CHECK(cudaGetLastError()); 51 | CHECK(cudaDeviceReset()); 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /chapter03/nestedReduce.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #define LOG 0 5 | 6 | /* 7 | * An implementation of parallel reduction using nested kernel launches from 8 | * CUDA kernels. 
9 | */ 10 | 11 | // Recursive Implementation of Interleaved Pair Approach 12 | int cpuRecursiveReduce(int *data, int const size) 13 | { 14 | // stop condition 15 | if (size == 1) return data[0]; 16 | 17 | // renew the stride 18 | int const stride = size / 2; 19 | 20 | // in-place reduction 21 | for (int i = 0; i < stride; i++) 22 | { 23 | data[i] += data[i + stride]; 24 | } 25 | 26 | // call recursively 27 | return cpuRecursiveReduce(data, stride); 28 | } 29 | 30 | // Neighbored Pair Implementation with divergence 31 | __global__ void reduceNeighbored (int *g_idata, int *g_odata, unsigned int n) 32 | { 33 | // set thread ID 34 | unsigned int tid = threadIdx.x; 35 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 36 | 37 | // convert global data pointer to the local pointer of this block 38 | int *idata = g_idata + blockIdx.x * blockDim.x; 39 | 40 | // boundary check 41 | if (idx >= n) return; 42 | 43 | // in-place reduction in global memory 44 | for (int stride = 1; stride < blockDim.x; stride *= 2) 45 | { 46 | if ((tid % (2 * stride)) == 0) 47 | { 48 | idata[tid] += idata[tid + stride]; 49 | } 50 | 51 | // synchronize within threadblock 52 | __syncthreads(); 53 | } 54 | 55 | // write result for this block to global mem 56 | if (tid == 0) g_odata[blockIdx.x] = idata[0]; 57 | } 58 | 59 | __global__ void gpuRecursiveReduce (int *g_idata, int *g_odata, 60 | unsigned int isize) 61 | { 62 | // set thread ID 63 | unsigned int tid = threadIdx.x; 64 | 65 | // convert global data pointer to the local pointer of this block 66 | int *idata = g_idata + blockIdx.x * blockDim.x; 67 | int *odata = &g_odata[blockIdx.x]; 68 | 69 | // stop condition 70 | if (isize == 2 && tid == 0) 71 | { 72 | g_odata[blockIdx.x] = idata[0] + idata[1]; 73 | return; 74 | } 75 | 76 | // nested invocation 77 | int istride = isize >> 1; 78 | 79 | if(istride > 1 && tid < istride) 80 | { 81 | // in place reduction 82 | idata[tid] += idata[tid + istride]; 83 | } 84 | 85 | // sync at block level 
86 | __syncthreads(); 87 | 88 | // nested invocation to generate child grids 89 | if(tid == 0) 90 | { 91 | gpuRecursiveReduce<<<1, istride>>>(idata, odata, istride); 92 | 93 | // sync all child grids launched in this block 94 | cudaDeviceSynchronize(); 95 | } 96 | 97 | // sync at block level again 98 | __syncthreads(); 99 | } 100 | 101 | // main from here 102 | int main(int argc, char **argv) 103 | { 104 | // set up device 105 | int dev = 0, gpu_sum; 106 | cudaDeviceProp deviceProp; 107 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 108 | printf("%s starting reduction at ", argv[0]); 109 | printf("device %d: %s ", dev, deviceProp.name); 110 | CHECK(cudaSetDevice(dev)); 111 | 112 | bool bResult = false; 113 | 114 | // set up execution configuration 115 | int nblock = 2048; 116 | int nthread = 512; // initial block size 117 | 118 | if(argc > 1) 119 | { 120 | nblock = atoi(argv[1]); // block size from command line argument 121 | } 122 | 123 | if(argc > 2) 124 | { 125 | nthread = atoi(argv[2]); // block size from command line argument 126 | } 127 | 128 | int size = nblock * nthread; // total number of elements to reduceNeighbored 129 | 130 | dim3 block (nthread, 1); 131 | dim3 grid ((size + block.x - 1) / block.x, 1); 132 | printf("array %d grid %d block %d\n", size, grid.x, block.x); 133 | 134 | // allocate host memory 135 | size_t bytes = size * sizeof(int); 136 | int *h_idata = (int *) malloc(bytes); 137 | int *h_odata = (int *) malloc(grid.x * sizeof(int)); 138 | int *tmp = (int *) malloc(bytes); 139 | 140 | // initialize the array 141 | for (int i = 0; i < size; i++) 142 | { 143 | h_idata[i] = (int)( rand() & 0xFF ); 144 | h_idata[i] = 1; 145 | } 146 | 147 | memcpy (tmp, h_idata, bytes); 148 | 149 | // allocate device memory 150 | int *d_idata = NULL; 151 | int *d_odata = NULL; 152 | CHECK(cudaMalloc((void **) &d_idata, bytes)); 153 | CHECK(cudaMalloc((void **) &d_odata, grid.x * sizeof(int))); 154 | 155 | double iStart, iElaps; 156 | 157 | // cpu recursive 
reduction 158 | iStart = seconds(); 159 | int cpu_sum = cpuRecursiveReduce (tmp, size); 160 | iElaps = seconds() - iStart; 161 | printf("cpu reduce\t\telapsed %f sec cpu_sum: %d\n", iElaps, cpu_sum); 162 | 163 | // gpu reduceNeighbored 164 | CHECK(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); 165 | iStart = seconds(); 166 | reduceNeighbored<<<grid, block>>>(d_idata, d_odata, size); 167 | CHECK(cudaDeviceSynchronize()); 168 | CHECK(cudaGetLastError()); 169 | iElaps = seconds() - iStart; 170 | CHECK(cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), 171 | cudaMemcpyDeviceToHost)); 172 | gpu_sum = 0; 173 | 174 | for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i]; // finish the reduction on the host: one partial sum per block 175 | 176 | printf("gpu Neighbored\t\telapsed %f sec gpu_sum: %d <<<grid %d block " 177 | "%d>>>\n", iElaps, gpu_sum, grid.x, block.x); 178 | 179 | // gpu nested reduce kernel (input is re-uploaded because the previous kernel reduced d_idata in place) 180 | CHECK(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice)); 181 | iStart = seconds(); 182 | gpuRecursiveReduce<<<grid, block>>>(d_idata, d_odata, block.x); 183 | CHECK(cudaDeviceSynchronize()); 184 | CHECK(cudaGetLastError()); 185 | iElaps = seconds() - iStart; 186 | CHECK(cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int), 187 | cudaMemcpyDeviceToHost)); 188 | gpu_sum = 0; 189 | 190 | for (int i = 0; i < grid.x; i++) gpu_sum += h_odata[i]; 191 | 192 | printf("gpu nested\t\telapsed %f sec gpu_sum: %d <<<grid %d block %d>>>\n", 193 | iElaps, gpu_sum, grid.x, block.x); 194 | 195 | // free host memory 196 | free(h_idata); 197 | free(h_odata); free(tmp); // tmp (scratch copy used by the CPU reduction) was previously leaked 198 | 199 | // free device memory 200 | CHECK(cudaFree(d_idata)); 201 | CHECK(cudaFree(d_odata)); 202 | 203 | // reset device 204 | CHECK(cudaDeviceReset()); 205 | 206 | // check the results 207 | bResult = (gpu_sum == cpu_sum); 208 | 209 | if(!bResult) printf("Test failed!\n"); 210 | 211 | return EXIT_SUCCESS; 212 | } 213 | -------------------------------------------------------------------------------- /chapter03/simpleDeviceQuery.cu: -------------------------------------------------------------------------------- 1 | #include 
"../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * Fetches basic information on the first device in the current CUDA platform, 7 | * including number of SMs, bytes of constant memory, bytes of shared memory per 8 | * block, etc. 9 | */ 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | int iDev = 0; 14 | cudaDeviceProp iProp; 15 | CHECK(cudaGetDeviceProperties(&iProp, iDev)); 16 | 17 | printf("Device %d: %s\n", iDev, iProp.name); 18 | printf(" Number of multiprocessors: %d\n", 19 | iProp.multiProcessorCount); 20 | printf(" Total amount of constant memory: %4.2f KB\n", 21 | iProp.totalConstMem / 1024.0); 22 | printf(" Total amount of shared memory per block: %4.2f KB\n", 23 | iProp.sharedMemPerBlock / 1024.0); 24 | printf(" Total number of registers available per block: %d\n", 25 | iProp.regsPerBlock); 26 | printf(" Warp size: %d\n", 27 | iProp.warpSize); 28 | printf(" Maximum number of threads per block: %d\n", 29 | iProp.maxThreadsPerBlock); 30 | printf(" Maximum number of threads per multiprocessor: %d\n", 31 | iProp.maxThreadsPerMultiProcessor); 32 | printf(" Maximum number of warps per multiprocessor: %d\n", 33 | iProp.maxThreadsPerMultiProcessor / 32); 34 | return EXIT_SUCCESS; 35 | } 36 | -------------------------------------------------------------------------------- /chapter03/simpleDivergence.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * simpleDivergence demonstrates divergent code on the GPU and its impact on 7 | * performance and CUDA metrics. 
8 | */ 9 | 10 | __global__ void mathKernel1(float *c) // branch on thread parity: both paths occur inside every warp 11 | { 12 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 13 | float ia, ib; 14 | ia = ib = 0.0f; 15 | 16 | if (tid % 2 == 0) 17 | { 18 | ia = 100.0f; 19 | } 20 | else 21 | { 22 | ib = 200.0f; 23 | } 24 | 25 | c[tid] = ia + ib; 26 | } 27 | 28 | __global__ void mathKernel2(float *c) // branch on warp parity: all threads of a warp take the same path 29 | { 30 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 31 | float ia, ib; 32 | ia = ib = 0.0f; 33 | 34 | if ((tid / warpSize) % 2 == 0) 35 | { 36 | ia = 100.0f; 37 | } 38 | else 39 | { 40 | ib = 200.0f; 41 | } 42 | 43 | c[tid] = ia + ib; 44 | } 45 | 46 | __global__ void mathKernel3(float *c) // same parity test as mathKernel1, written as two separate predicated ifs 47 | { 48 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 49 | float ia, ib; 50 | ia = ib = 0.0f; 51 | 52 | bool ipred = (tid % 2 == 0); 53 | 54 | if (ipred) 55 | { 56 | ia = 100.0f; 57 | } 58 | 59 | if (!ipred) 60 | { 61 | ib = 200.0f; 62 | } 63 | 64 | c[tid] = ia + ib; 65 | } 66 | 67 | __global__ void mathKernel4(float *c) // warp-parity branch using shift/mask (tid >> 5 assumes 32 threads per warp) 68 | { 69 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 70 | float ia, ib; 71 | ia = ib = 0.0f; 72 | 73 | int itid = tid >> 5; 74 | 75 | if ((itid & 0x01) == 0) // parenthesized: == binds tighter than &, so the bare form was always false 76 | { 77 | ia = 100.0f; 78 | } 79 | else 80 | { 81 | ib = 200.0f; 82 | } 83 | 84 | c[tid] = ia + ib; 85 | } 86 | 87 | __global__ void warmingup(float *c) 88 | { 89 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 90 | float ia, ib; 91 | ia = ib = 0.0f; 92 | 93 | if ((tid / warpSize) % 2 == 0) 94 | { 95 | ia = 100.0f; 96 | } 97 | else 98 | { 99 | ib = 200.0f; 100 | } 101 | 102 | c[tid] = ia + ib; 103 | } 104 | 105 | 106 | int main(int argc, char **argv) 107 | { 108 | // set up device 109 | int dev = 0; 110 | cudaDeviceProp deviceProp; 111 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 112 | printf("%s using Device %d: %s\n", argv[0], dev, deviceProp.name); 113 | 114 | // set up data size 115 | int size = 64; 116 | int blocksize = 64; 117 | 118 | if(argc > 1) blocksize = atoi(argv[1]); 119 | 120 | if(argc > 2) size = atoi(argv[2]); 121 | 122 | 
printf("Data size %d ", size); 123 | 124 | // set up execution configuration 125 | dim3 block (blocksize, 1); 126 | dim3 grid ((size + block.x - 1) / block.x, 1); 127 | printf("Execution Configure (block %d grid %d)\n", block.x, grid.x); 128 | 129 | // allocate gpu memory: one float per launched thread, because the kernels store c[tid] without a bounds check 130 | float *d_C; 131 | size_t nBytes = (size_t)grid.x * block.x * sizeof(float); 132 | CHECK(cudaMalloc((float**)&d_C, nBytes)); 133 | 134 | // run a warmup kernel to remove overhead (timers are double so sub-second values are not truncated to 0) 135 | double iStart, iElaps; 136 | CHECK(cudaDeviceSynchronize()); 137 | iStart = seconds(); 138 | warmingup<<<grid, block>>>(d_C); 139 | CHECK(cudaDeviceSynchronize()); 140 | iElaps = seconds() - iStart; 141 | printf("warmup <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x, 142 | iElaps ); 143 | CHECK(cudaGetLastError()); 144 | 145 | // run kernel 1 146 | iStart = seconds(); 147 | mathKernel1<<<grid, block>>>(d_C); 148 | CHECK(cudaDeviceSynchronize()); 149 | iElaps = seconds() - iStart; 150 | printf("mathKernel1 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x, 151 | iElaps ); 152 | CHECK(cudaGetLastError()); 153 | 154 | // run kernel 2 155 | iStart = seconds(); 156 | mathKernel2<<<grid, block>>>(d_C); 157 | CHECK(cudaDeviceSynchronize()); 158 | iElaps = seconds() - iStart; 159 | printf("mathKernel2 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x, 160 | iElaps ); 161 | CHECK(cudaGetLastError()); 162 | 163 | // run kernel 3 164 | iStart = seconds(); 165 | mathKernel3<<<grid, block>>>(d_C); 166 | CHECK(cudaDeviceSynchronize()); 167 | iElaps = seconds() - iStart; 168 | printf("mathKernel3 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x, 169 | iElaps); 170 | CHECK(cudaGetLastError()); 171 | 172 | // run kernel 4 173 | iStart = seconds(); 174 | mathKernel4<<<grid, block>>>(d_C); 175 | CHECK(cudaDeviceSynchronize()); 176 | iElaps = seconds() - iStart; 177 | printf("mathKernel4 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x, 178 | iElaps); 179 | CHECK(cudaGetLastError()); 180 | 181 | // free gpu memory and reset device 182 | CHECK(cudaFree(d_C)); 183 | CHECK(cudaDeviceReset()); 184 | 
return EXIT_SUCCESS; 185 | } 186 | -------------------------------------------------------------------------------- /chapter03/sumMatrix.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example implements matrix element-wise addition on the host and GPU. 7 | * sumMatrixOnHost iterates over the rows and columns of each matrix, adding 8 | * elements from A and B together and storing the results in C. The current 9 | * offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D 10 | * implements the same logic, but using CUDA threads to process each matrix. 11 | */ 12 | 13 | void initialData(float *ip, const int size) 14 | { 15 | int i; 16 | 17 | for(i = 0; i < size; i++) 18 | { 19 | ip[i] = (float)( rand() & 0xFF ) / 10.0f; 20 | } 21 | } 22 | 23 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny) 24 | { 25 | float *ia = A; 26 | float *ib = B; 27 | float *ic = C; 28 | 29 | for (int iy = 0; iy < ny; iy++) 30 | { 31 | for (int ix = 0; ix < nx; ix++) 32 | { 33 | ic[ix] = ia[ix] + ib[ix]; 34 | } 35 | 36 | ia += nx; 37 | ib += nx; 38 | ic += nx; 39 | } 40 | 41 | return; 42 | } 43 | 44 | void checkResult(float *hostRef, float *gpuRef, const int N) 45 | { 46 | double epsilon = 1.0E-8; 47 | 48 | for (int i = 0; i < N; i++) 49 | { 50 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 51 | { 52 | printf("host %f gpu %f ", hostRef[i], gpuRef[i]); 53 | printf("Arrays do not match.\n\n"); 54 | break; 55 | } 56 | } 57 | } 58 | 59 | // grid 2D block 2D 60 | __global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY) 61 | { 62 | unsigned int ix = blockIdx.x * blockDim.x + threadIdx.x; 63 | unsigned int iy = blockIdx.y * blockDim.y + threadIdx.y; 64 | unsigned int idx = iy * NX + ix; 65 | 66 | if (ix < NX && iy < NY) 67 | { 68 | C[idx] = A[idx] + B[idx]; 69 | } 70 | } 71 | 72 | int main(int argc, char **argv) 73 
| { 74 | // set up device 75 | int dev = 0; 76 | cudaDeviceProp deviceProp; 77 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 78 | CHECK(cudaSetDevice(dev)); 79 | 80 | // set up data size of matrix 81 | int nx = 1 << 14; 82 | int ny = 1 << 14; 83 | 84 | int nxy = nx * ny; 85 | int nBytes = nxy * sizeof(float); 86 | 87 | // malloc host memory 88 | float *h_A, *h_B, *hostRef, *gpuRef; 89 | h_A = (float *)malloc(nBytes); 90 | h_B = (float *)malloc(nBytes); 91 | hostRef = (float *)malloc(nBytes); 92 | gpuRef = (float *)malloc(nBytes); 93 | 94 | // initialize data at host side (timers are double so fractional values are not truncated) 95 | double iStart = seconds(); 96 | initialData(h_A, nxy); 97 | initialData(h_B, nxy); 98 | double iElaps = seconds() - iStart; 99 | 100 | memset(hostRef, 0, nBytes); 101 | memset(gpuRef, 0, nBytes); 102 | 103 | // add matrix at host side for result checks 104 | iStart = seconds(); 105 | sumMatrixOnHost (h_A, h_B, hostRef, nx, ny); 106 | iElaps = seconds() - iStart; 107 | 108 | // malloc device global memory 109 | float *d_MatA, *d_MatB, *d_MatC; 110 | CHECK(cudaMalloc((void **)&d_MatA, nBytes)); 111 | CHECK(cudaMalloc((void **)&d_MatB, nBytes)); 112 | CHECK(cudaMalloc((void **)&d_MatC, nBytes)); 113 | 114 | // transfer data from host to device 115 | CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice)); 116 | CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice)); 117 | 118 | // invoke kernel at host side; block dimensions may be overridden by argv[1] and argv[2] 119 | int dimx = 32; 120 | int dimy = 32; 121 | 122 | if(argc > 2) 123 | { 124 | dimx = atoi(argv[1]); 125 | dimy = atoi(argv[2]); 126 | } 127 | 128 | dim3 block(dimx, dimy); 129 | dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y); 130 | 131 | // execute the kernel 132 | CHECK(cudaDeviceSynchronize()); 133 | iStart = seconds(); 134 | sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny); 135 | CHECK(cudaDeviceSynchronize()); 136 | iElaps = seconds() - iStart; 137 | printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x, 138 | grid.y, 
139 | block.x, block.y, iElaps); 140 | CHECK(cudaGetLastError()); 141 | 142 | // copy kernel result back to host side 143 | CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost)); 144 | 145 | // check device results 146 | checkResult(hostRef, gpuRef, nxy); 147 | 148 | // free device global memory 149 | CHECK(cudaFree(d_MatA)); 150 | CHECK(cudaFree(d_MatB)); 151 | CHECK(cudaFree(d_MatC)); 152 | 153 | // free host memory 154 | free(h_A); 155 | free(h_B); 156 | free(hostRef); 157 | free(gpuRef); 158 | 159 | // reset device 160 | CHECK(cudaDeviceReset()); 161 | 162 | return EXIT_SUCCESS; 163 | } 164 | -------------------------------------------------------------------------------- /chapter04/Makefile: -------------------------------------------------------------------------------- 1 | CU_APPS=globalVariable memTransfer pinMemTransfer readSegment \ 2 | readSegmentUnroll simpleMathAoS simpleMathSoA sumArrayZerocpy \ 3 | sumMatrixGPUManaged sumMatrixGPUManual transpose writeSegment 4 | C_APPS= 5 | 6 | all: ${C_APPS} ${CU_APPS} 7 | 8 | %: %.cu 9 | nvcc -O2 -arch=sm_20 -o $@ $< 10 | %: %.c 11 | gcc -O2 -std=c99 -o $@ $< 12 | clean: 13 | rm -f ${CU_APPS} ${C_APPS} 14 | -------------------------------------------------------------------------------- /chapter04/globalVariable.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * An example of using a statically declared global variable (devData) to store 7 | * a floating-point value on the device. 
8 | */ 9 | 10 | __device__ float devData; 11 | 12 | __global__ void checkGlobalVariable() 13 | { 14 | // display the original value 15 | printf("Device: the value of the global variable is %f\n", devData); 16 | 17 | // alter the value 18 | devData += 2.0f; 19 | } 20 | 21 | int main(void) 22 | { 23 | // initialize the global variable 24 | float value = 3.14f; 25 | CHECK(cudaMemcpyToSymbol(devData, &value, sizeof(float))); 26 | printf("Host: copied %f to the global variable\n", value); 27 | 28 | // invoke the kernel 29 | checkGlobalVariable<<<1, 1>>>(); 30 | 31 | // copy the global variable back to the host 32 | CHECK(cudaMemcpyFromSymbol(&value, devData, sizeof(float))); 33 | printf("Host: the value changed by the kernel to %f\n", value); 34 | 35 | CHECK(cudaDeviceReset()); 36 | return EXIT_SUCCESS; 37 | } 38 | -------------------------------------------------------------------------------- /chapter04/memTransfer.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * An example of using CUDA's memory copy API to transfer data to and from the 7 | * device. In this case, cudaMalloc is used to allocate memory on the GPU and 8 | * cudaMemcpy is used to transfer the contents of host memory to an array 9 | * allocated using cudaMalloc. 
10 | */ 11 | 12 | int main(int argc, char **argv) 13 | { 14 | // set up device 15 | int dev = 0; 16 | CHECK(cudaSetDevice(dev)); 17 | 18 | // memory size 19 | unsigned int isize = 1 << 22; 20 | unsigned int nbytes = isize * sizeof(float); 21 | 22 | // get device information 23 | cudaDeviceProp deviceProp; 24 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 25 | printf("%s starting at ", argv[0]); 26 | printf("device %d: %s memory size %d nbyte %5.2fMB\n", dev, 27 | deviceProp.name, isize, nbytes / (1024.0f * 1024.0f)); 28 | 29 | // allocate the host memory 30 | float *h_a = (float *)malloc(nbytes); 31 | 32 | // allocate the device memory 33 | float *d_a; 34 | CHECK(cudaMalloc((float **)&d_a, nbytes)); 35 | 36 | // initialize the host memory 37 | for(unsigned int i = 0; i < isize; i++) h_a[i] = 0.5f; 38 | 39 | // transfer data from the host to the device 40 | CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice)); 41 | 42 | // transfer data from the device to the host 43 | CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost)); 44 | 45 | // free memory 46 | CHECK(cudaFree(d_a)); 47 | free(h_a); 48 | 49 | // reset device 50 | CHECK(cudaDeviceReset()); 51 | return EXIT_SUCCESS; 52 | } 53 | -------------------------------------------------------------------------------- /chapter04/pinMemTransfer.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * An example of using CUDA's memory copy API to transfer data to and from the 7 | * device. In this case, cudaMalloc is used to allocate memory on the GPU and 8 | * cudaMemcpy is used to transfer the contents of host memory to an array 9 | * allocated using cudaMalloc. Host memory is allocated using cudaMallocHost to 10 | * create a page-locked host array. 
11 | */ 12 | 13 | int main(int argc, char **argv) 14 | { 15 | // set up device 16 | int dev = 0; 17 | CHECK(cudaSetDevice(dev)); 18 | 19 | // memory size 20 | unsigned int isize = 1 << 22; 21 | unsigned int nbytes = isize * sizeof(float); 22 | 23 | // get device information 24 | cudaDeviceProp deviceProp; 25 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 26 | 27 | if (!deviceProp.canMapHostMemory) 28 | { 29 | printf("Device %d does not support mapping CPU host memory!\n", dev); 30 | CHECK(cudaDeviceReset()); 31 | exit(EXIT_SUCCESS); 32 | } 33 | 34 | printf("%s starting at ", argv[0]); 35 | printf("device %d: %s memory size %d nbyte %5.2fMB canMap %d\n", dev, 36 | deviceProp.name, isize, nbytes / (1024.0f * 1024.0f), 37 | deviceProp.canMapHostMemory); 38 | 39 | // allocate pinned host memory 40 | float *h_a; 41 | CHECK(cudaMallocHost ((float **)&h_a, nbytes)); 42 | 43 | // allocate device memory 44 | float *d_a; 45 | CHECK(cudaMalloc((float **)&d_a, nbytes)); 46 | 47 | // initialize host memory 48 | memset(h_a, 0, nbytes); 49 | 50 | for (int i = 0; i < isize; i++) h_a[i] = 100.10f; 51 | 52 | // transfer data from the host to the device 53 | CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice)); 54 | 55 | // transfer data from the device to the host 56 | CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost)); 57 | 58 | // free memory 59 | CHECK(cudaFree(d_a)); 60 | CHECK(cudaFreeHost(h_a)); 61 | 62 | // reset device 63 | CHECK(cudaDeviceReset()); 64 | return EXIT_SUCCESS; 65 | } 66 | -------------------------------------------------------------------------------- /chapter04/readSegment.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /* 6 | * This example demonstrates the impact of misaligned reads on performance by 7 | * forcing misaligned reads to occur on a float*. 
8 | */ 9 | 10 | void checkResult(float *hostRef, float *gpuRef, const int N) 11 | { 12 | double epsilon = 1.0E-8; 13 | bool match = 1; 14 | 15 | for (int i = 0; i < N; i++) 16 | { 17 | if (abs(hostRef[i] - gpuRef[i]) > epsilon) 18 | { 19 | match = 0; 20 | printf("different on %dth element: host %f gpu %f\n", i, hostRef[i], 21 | gpuRef[i]); 22 | break; 23 | } 24 | } 25 | 26 | if (!match) printf("Arrays do not match.\n\n"); 27 | } 28 | 29 | void initialData(float *ip, int size) 30 | { 31 | for (int i = 0; i < size; i++) 32 | { 33 | ip[i] = (float)( rand() & 0xFF ) / 100.0f; 34 | } 35 | 36 | return; 37 | } 38 | 39 | 40 | void sumArraysOnHost(float *A, float *B, float *C, const int n, int offset) 41 | { 42 | for (int idx = offset, k = 0; idx < n; idx++, k++) 43 | { 44 | C[k] = A[idx] + B[idx]; 45 | } 46 | } 47 | 48 | __global__ void warmup(float *A, float *B, float *C, const int n, int offset) 49 | { 50 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 51 | unsigned int k = i + offset; 52 | 53 | if (k < n) C[i] = A[k] + B[k]; 54 | } 55 | 56 | __global__ void readOffset(float *A, float *B, float *C, const int n, 57 | int offset) 58 | { 59 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 60 | unsigned int k = i + offset; 61 | 62 | if (k < n) C[i] = A[k] + B[k]; 63 | } 64 | 65 | int main(int argc, char **argv) 66 | { 67 | // set up device 68 | int dev = 0; 69 | cudaDeviceProp deviceProp; 70 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 71 | printf("%s starting reduction at ", argv[0]); 72 | printf("device %d: %s ", dev, deviceProp.name); 73 | CHECK(cudaSetDevice(dev)); 74 | 75 | // set up array size 76 | int nElem = 1 << 20; // total number of elements to reduce 77 | printf(" with array size %d\n", nElem); 78 | size_t nBytes = nElem * sizeof(float); 79 | 80 | // set up offset for summary 81 | int blocksize = 512; 82 | int offset = 0; 83 | 84 | if (argc > 1) offset = atoi(argv[1]); 85 | 86 | if (argc > 2) blocksize = atoi(argv[2]); 87 | 88 | // 
execution configuration 89 | dim3 block (blocksize, 1); 90 | dim3 grid ((nElem + block.x - 1) / block.x, 1); 91 | 92 | // allocate host memory 93 | float *h_A = (float *)malloc(nBytes); 94 | float *h_B = (float *)malloc(nBytes); 95 | float *hostRef = (float *)malloc(nBytes); 96 | float *gpuRef = (float *)malloc(nBytes); 97 | 98 | // initialize host array 99 | initialData(h_A, nElem); 100 | memcpy(h_B, h_A, nBytes); 101 | 102 | // summary at host side 103 | sumArraysOnHost(h_A, h_B, hostRef, nElem, offset); 104 | 105 | // allocate device memory 106 | float *d_A, *d_B, *d_C; 107 | CHECK(cudaMalloc((float**)&d_A, nBytes)); 108 | CHECK(cudaMalloc((float**)&d_B, nBytes)); 109 | CHECK(cudaMalloc((float**)&d_C, nBytes)); 110 | 111 | // copy data from host to device 112 | CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); 113 | CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice)); 114 | 115 | // kernel 1: 116 | double iStart = seconds(); 117 | warmup<<<grid, block>>>(d_A, d_B, d_C, nElem, offset); 118 | CHECK(cudaDeviceSynchronize()); 119 | double iElaps = seconds() - iStart; 120 | printf("warmup <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x, 121 | block.x, offset, iElaps); 122 | CHECK(cudaGetLastError()); 123 | 124 | iStart = seconds(); 125 | readOffset<<<grid, block>>>(d_A, d_B, d_C, nElem, offset); 126 | CHECK(cudaDeviceSynchronize()); 127 | iElaps = seconds() - iStart; 128 | printf("readOffset <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x, 129 | block.x, offset, iElaps); 130 | CHECK(cudaGetLastError()); 131 | 132 | // copy kernel result back to host side and check device results (only the first nElem - offset outputs are written) 133 | CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost)); 134 | checkResult(hostRef, gpuRef, nElem - offset); 135 | 136 | // free host and device memory 137 | CHECK(cudaFree(d_A)); 138 | CHECK(cudaFree(d_B)); 139 | CHECK(cudaFree(d_C)); 140 | free(h_A); 141 | free(h_B); free(hostRef); free(gpuRef); // hostRef and gpuRef were previously leaked 142 | 143 | // reset device 144 | CHECK(cudaDeviceReset()); 145 | return EXIT_SUCCESS; 146 | } 147 | 
--------------------------------------------------------------------------------
/chapter04/simpleMathAoS.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * A simple example of using an array of structures to store data on the device.
 * This example is used to study the impact on performance of data layout on the
 * GPU.
 *
 * AoS: one contiguous 64-bit read to get x and y (up to 300 cycles)
 */

// Parenthesized so the macro expands safely inside larger expressions.
#define LEN (1 << 22)

struct innerStruct
{
    float x;
    float y;
};

struct innerArray
{
    float x[LEN];
    float y[LEN];
};

/* Fill both fields of ip[0..size) with pseudo-random values in [0.00, 2.55]. */
void initialInnerStruct(innerStruct *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i].x = (float)(rand() & 0xFF) / 100.0f;
        ip[i].y = (float)(rand() & 0xFF) / 100.0f;
    }
}

/* Host reference: C[i].x = A[i].x + 10 and C[i].y = A[i].y + 20 for i < n. */
void testInnerStructHost(innerStruct *A, innerStruct *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C[idx].x = A[idx].x + 10.f;
        C[idx].y = A[idx].y + 20.f;
    }
}

/*
 * Compare the first N elements field by field (x first, then y). On the first
 * field whose absolute difference exceeds the tolerance, print the mismatch
 * followed by "Arrays do not match." and stop.
 */
void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i].x - gpuRef[i].x) > epsilon)
        {
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].x, gpuRef[i].x);
            printf("Arrays do not match.\n\n");
            return;
        }

        if (abs(hostRef[i].y - gpuRef[i].y) > epsilon)
        {
            printf("different on %dth element: host %f gpu %f\n", i,
                   hostRef[i].y, gpuRef[i].y);
            printf("Arrays do not match.\n\n");
            return;
        }
    }
}

/*
 * Thread i copies data[i], adds 10 to x and 20 to y, and stores the result in
 * result[i]. Threads with i >= n do nothing. Expects a 1D grid of 1D blocks.
 */
__global__ void testInnerStruct(innerStruct *data, innerStruct * result,
                                const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

/* Warm-up kernel; performs the same work as testInnerStruct. */
__global__ void warmup(innerStruct *data, innerStruct * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        innerStruct tmp = data[i];
        tmp.x += 10.f;
        tmp.y += 20.f;
        result[i] = tmp;
    }
}

/*
 * Usage: simpleMathAoS [blocksize]
 *
 * Runs the warm-up and testInnerStruct kernels on LEN elements, prints the
 * elapsed time of each, and checks each result against a host reference.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up block size (optional argv[1])
    int blocksize = 128;

    if (argc > 1) blocksize = atoi(argv[1]);

    // A non-positive block size would divide by zero in the grid computation.
    if (blocksize <= 0)
    {
        fprintf(stderr, "blocksize must be positive (got %d)\n", blocksize);
        return EXIT_FAILURE;
    }

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef = (innerStruct *)malloc(nBytes);

    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize host array and compute the host reference
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    // allocate device memory
    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((innerStruct**)&d_A, nBytes));
    CHECK(cudaMalloc((innerStruct**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup
    double iStart = seconds();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());       // launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // execution errors
    double iElaps = seconds() - iStart;
    printf("warmup <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    // kernel 2: testInnerStruct
    iStart = seconds();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerstruct <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
--------------------------------------------------------------------------------
/chapter04/simpleMathSoA.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * A simple example of using a structure of arrays to store data on the device.
 * This example is used to study the impact on performance of data layout on the
 * GPU.
*
 * SoA: contiguous reads for x and y
 */

// Parenthesized so the macro expands safely inside larger expressions.
#define LEN (1 << 22)

struct InnerArray
{
    float x[LEN];
    float y[LEN];
};

// functions for inner array outer struct

/* Fill ip->x[0..size) and ip->y[0..size) with values in [0.00, 2.55]. */
void initialInnerArray(InnerArray *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip->x[i] = (float)( rand() & 0xFF ) / 100.0f;
        ip->y[i] = (float)( rand() & 0xFF ) / 100.0f;
    }
}

/* Host reference: C->x[i] = A->x[i] + 10 and C->y[i] = A->y[i] + 20, i < n. */
void testInnerArrayHost(InnerArray *A, InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        C->x[idx] = A->x[idx] + 10.f;
        C->y[idx] = A->y[idx] + 20.f;
    }
}

/* Print the first n (x, y) pairs stored in C, one pair per line. */
void printfHostResult(InnerArray *C, const int n)
{
    for (int idx = 0; idx < n; idx++)
    {
        printf("printout idx %d: x %f y %f\n", idx, C->x[idx], C->y[idx]);
    }
}

/*
 * Compare the first N elements of the x and y arrays (x first, then y, per
 * index). On the first value whose absolute difference exceeds the tolerance,
 * print the mismatch followed by "Arrays do not match." and stop.
 */
void checkInnerArray(InnerArray *hostRef, InnerArray *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon)
        {
            printf("different on x %dth element: host %f gpu %f\n", i,
                   hostRef->x[i], gpuRef->x[i]);
            printf("Arrays do not match.\n\n");
            return;
        }

        if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon)
        {
            printf("different on y %dth element: host %f gpu %f\n", i,
                   hostRef->y[i], gpuRef->y[i]);
            printf("Arrays do not match.\n\n");
            return;
        }
    }
}

/*
 * Thread i reads data->x[i] and data->y[i], adds 10 and 20 respectively, and
 * stores the sums in result->x[i] and result->y[i]. Threads with i >= n do
 * nothing. Expects a 1D grid of 1D blocks.
 */
__global__ void testInnerArray(InnerArray *data, InnerArray * result,
                               const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];

        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

/* Warm-up kernel; performs the same work as testInnerArray. */
__global__ void warmup2(InnerArray *data, InnerArray * result, const int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n)
    {
        float tmpx = data->x[i];
        float tmpy = data->y[i];
        tmpx += 10.f;
        tmpy += 20.f;
        result->x[i] = tmpx;
        result->y[i] = tmpy;
    }
}

/*
 * Test for struct of arrays.
 *
 * Usage: simpleMathSoA [blocksize]
 *
 * Runs the warm-up and testInnerArray kernels on LEN elements, prints the
 * elapsed time of each, and checks each result against a host reference.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up block size (optional argv[1])
    int blocksize = 128;

    if (argc > 1) blocksize = atoi(argv[1]);

    // A non-positive block size would divide by zero in the grid computation.
    if (blocksize <= 0)
    {
        fprintf(stderr, "blocksize must be positive (got %d)\n", blocksize);
        return EXIT_FAILURE;
    }

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = sizeof(InnerArray);
    InnerArray *h_A = (InnerArray *)malloc(nBytes);
    InnerArray *hostRef = (InnerArray *)malloc(nBytes);
    InnerArray *gpuRef = (InnerArray *)malloc(nBytes);

    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize host array and compute the host reference
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    // allocate device memory
    InnerArray *d_A, *d_C;
    CHECK(cudaMalloc((InnerArray**)&d_A, nBytes));
    CHECK(cudaMalloc((InnerArray**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup2
    double iStart = seconds();
    warmup2<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());       // launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // execution errors
    double iElaps = seconds() - iStart;
    printf("warmup2 <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    // kernel 2: testInnerArray
    iStart = seconds();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerarray <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
--------------------------------------------------------------------------------
/chapter04/sumArrayZerocpy.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * This example demonstrates the use of zero-copy memory to remove the need to
 * explicitly issue a memcpy operation between the host and device. By mapping
 * host, page-locked memory into the device's address space, the address can
 * directly reference a host array and transfer its contents over the PCIe bus.
 *
 * This example compares performing a vector addition with and without zero-copy
 * memory.
*/

/*
 * Compare the first N elements of hostRef and gpuRef. On the first pair whose
 * absolute difference exceeds the tolerance, report it and stop.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            return;
        }
    }
}

/* Fill ip[0..size) with pseudo-random values in the range [0.0, 25.5]. */
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

/* Host reference: C[i] = A[i] + B[i] for i < N. */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Thread i computes C[i] = A[i] + B[i]; threads with i >= N do nothing.
 * Expects a 1D grid of 1D blocks.
 */
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

/*
 * Same computation as sumArrays. main launches this one with A and B pointing
 * at mapped (zero-copy) host memory.
 */
__global__ void sumArraysZeroCopy(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

/*
 * Usage: sumArrayZerocpy [power]
 *
 * Adds two vectors of 2^power elements twice: once with the inputs in device
 * memory and once with the inputs in mapped host memory. Each result is
 * checked against a host reference.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // get device properties
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));

    // check if support mapped memory
    if (!deviceProp.canMapHostMemory)
    {
        printf("Device %d does not support mapping CPU host memory!\n", dev);
        CHECK(cudaDeviceReset());
        exit(EXIT_SUCCESS);
    }

    printf("Using Device %d: %s ", dev, deviceProp.name);

    // set up data size of vectors
    int ipower = 10;

    if (argc > 1) ipower = atoi(argv[1]);

    // 1 << ipower is undefined for a negative shift or one that does not fit
    // in an int.
    if (ipower < 0 || ipower > 30)
    {
        fprintf(stderr, "power must be in [0, 30] (got %d)\n", ipower);
        return EXIT_FAILURE;
    }

    int nElem = 1 << ipower;
    size_t nBytes = nElem * sizeof(float);

    if (ipower < 18)
    {
        printf("Vector size %d power %d nbytes %3.0f KB\n", nElem, ipower,
               (float)nBytes / (1024.0f));
    }
    else
    {
        printf("Vector size %d power %d nbytes %3.0f MB\n", nElem, ipower,
               (float)nBytes / (1024.0f * 1024.0f));
    }

    // part 1: using device memory
    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // transfer data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // set up execution configuration
    int iLen = 512;
    dim3 block (iLen);
    dim3 grid ((nElem + block.x - 1) / block.x);

    sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());  // launch-configuration errors

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory (d_C is kept for part 2)
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));

    // free host memory
    free(h_A);
    free(h_B);

    // part 2: using zerocopy memory for array A and B
    // allocate zerocpy memory
    CHECK(cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped));
    CHECK(cudaHostAlloc((void **)&h_B, nBytes, cudaHostAllocMapped));

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // pass the pointer to device
    CHECK(cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0));
    CHECK(cudaHostGetDevicePointer((void **)&d_B, (void *)h_B, 0));

    // add at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // execute kernel with zero copy memory
    sumArraysZeroCopy<<<grid, block>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());

    // copy kernel result back to host side
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free memory
    CHECK(cudaFree(d_C));
    CHECK(cudaFreeHost(h_A));
    CHECK(cudaFreeHost(h_B));

    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
--------------------------------------------------------------------------------
/chapter04/sumMatrixGPUManaged.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * This example demonstrates the use of CUDA managed memory to implement matrix
 * addition. In this example, arbitrary pointers can be dereferenced on the host
 * and device. CUDA will automatically manage the transfer of data to and from
 * the GPU as needed by the application. There is no need for the programmer to
 * use cudaMemcpy, cudaHostGetDevicePointer, or any other CUDA API involved with
 * explicitly transferring data. In addition, because CUDA managed memory is not
 * forced to reside in a single place it can be transferred to the optimal
 * memory space and not require round-trips over the PCIe bus every time a
 * cross-device reference is performed (as is required with zero copy and UVA).
*/

/* Fill ip[0..size) with pseudo-random values in the range [0.0, 25.5]. */
void initialData(float *ip, const int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

/*
 * Host reference: element-wise C = A + B for matrices of ny rows by nx
 * columns stored contiguously row after row.
 */
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }

        // advance all three pointers to the next row
        ia += nx;
        ib += nx;
        ic += nx;
    }
}

/*
 * Compare the first N elements of hostRef and gpuRef. On the first pair whose
 * absolute difference exceeds the tolerance, print both values followed by
 * "Arrays do not match." and stop.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
            printf("Arrays do not match.\n\n");
            return;
        }
    }
}

/*
 * grid 2D block 2D. Thread (ix, iy) computes
 * MatC[iy * nx + ix] = MatA[iy * nx + ix] + MatB[iy * nx + ix];
 * threads outside the nx-by-ny range do nothing.
 */
__global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx,
                             int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
    {
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}

/*
 * Usage: sumMatrixGPUManaged [shift]
 *
 * Adds two (2^shift x 2^shift) matrices held in managed memory on the host
 * and on the device, prints timings, and compares the two results.
 */
int main(int argc, char **argv)
{
    printf("%s Starting ", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx, ny;
    int ishift = 12;

    if (argc > 1) ishift = atoi(argv[1]);

    // nx * ny must fit in an int: (1 << 15) * (1 << 15) = 2^30 is the largest
    // power-of-two square that does.
    if (ishift < 0 || ishift > 15)
    {
        fprintf(stderr, "shift must be in [0, 15] (got %d)\n", ishift);
        return EXIT_FAILURE;
    }

    nx = ny = 1 << ishift;

    int nxy = nx * ny;
    // size_t: the byte count exceeds INT_MAX for the larger shifts
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);

    // allocate managed memory, accessible from both host and device
    float *A, *B, *hostRef, *gpuRef;
    CHECK(cudaMallocManaged((void **)&A, nBytes));
    CHECK(cudaMallocManaged((void **)&B, nBytes));
    CHECK(cudaMallocManaged((void **)&gpuRef, nBytes));
    CHECK(cudaMallocManaged((void **)&hostRef, nBytes));

    // initialize data at host side
    double iStart = seconds();
    initialData(A, nxy);
    initialData(B, nxy);
    double iElaps = seconds() - iStart;
    printf("initialization: \t %f sec\n", iElaps);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    iStart = seconds();
    sumMatrixOnHost(A, B, hostRef, nx, ny);
    iElaps = seconds() - iStart;
    printf("sumMatrix on host:\t %f sec\n", iElaps);

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // warm-up launch on a 1x1 problem; wait for it to finish so that it does
    // not overlap the timed run below
    sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, 1, 1);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // after warm-up, time with unified memory
    iStart = seconds();

    sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, nx, ny);

    // check kernel launch error
    CHECK(cudaGetLastError());

    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>> \n", iElaps,
           grid.x, grid.y, block.x, block.y);

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free managed memory
    CHECK(cudaFree(A));
    CHECK(cudaFree(B));
    CHECK(cudaFree(hostRef));
    CHECK(cudaFree(gpuRef));

    // reset device
    CHECK(cudaDeviceReset());

    return (0);
}
--------------------------------------------------------------------------------
/chapter04/sumMatrixGPUManual.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * This example demonstrates using explicit CUDA memory transfer to implement
 * matrix addition. This code contrasts with sumMatrixGPUManaged.cu, where CUDA
 * managed memory is used to remove all explicit memory transfers and abstract
 * away the concept of physically separate address spaces.
 */

/* Fill ip[0..size) with pseudo-random values in the range [0.0, 25.5]. */
void initialData(float *ip, const int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)( rand() & 0xFF ) / 10.0f;
    }
}

/*
 * Host reference: element-wise C = A + B for matrices of ny rows by nx
 * columns stored contiguously row after row.
 */
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++)
    {
        for (int ix = 0; ix < nx; ix++)
        {
            ic[ix] = ia[ix] + ib[ix];
        }

        // advance all three pointers to the next row
        ia += nx;
        ib += nx;
        ic += nx;
    }
}

/*
 * Compare the first N elements of hostRef and gpuRef. On the first pair whose
 * absolute difference exceeds the tolerance, print both values followed by
 * "Arrays do not match." and stop.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
            printf("Arrays do not match.\n\n");
            return;
        }
    }
}

/*
 * grid 2D block 2D. Thread (ix, iy) computes
 * MatC[iy * nx + ix] = MatA[iy * nx + ix] + MatB[iy * nx + ix];
 * threads outside the nx-by-ny range do nothing.
 */
__global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx,
                             int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
    unsigned int idx = iy * nx + ix;

    if (ix < nx && iy < ny)
    {
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}

/*
 * Usage: sumMatrixGPUManual [shift]
 *
 * Adds two (2^shift x 2^shift) matrices on the host and on the device using
 * explicit cudaMalloc/cudaMemcpy, prints timings, and compares the results.
 */
int main(int argc, char **argv)
{
    printf("%s Starting ", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size of matrix
    int nx, ny;
    int ishift = 12;

    if (argc > 1) ishift = atoi(argv[1]);

    // nx * ny must fit in an int: (1 << 15) * (1 << 15) = 2^30 is the largest
    // power-of-two square that does.
    if (ishift < 0 || ishift > 15)
    {
        fprintf(stderr, "shift must be in [0, 15] (got %d)\n", ishift);
        return EXIT_FAILURE;
    }

    nx = ny = 1 << ishift;

    int nxy = nx * ny;
    // size_t: the byte count exceeds INT_MAX for the larger shifts
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("Matrix size: nx %d ny %d\n", nx, ny);

    // malloc host memory
    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float *)malloc(nBytes);
    h_B = (float *)malloc(nBytes);
    hostRef = (float *)malloc(nBytes);
    gpuRef = (float *)malloc(nBytes);

    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize data at host side
    double iStart = seconds();
    initialData(h_A, nxy);
    initialData(h_B, nxy);
    double iElaps = seconds() - iStart;

    printf("initialization: \t %f sec\n", iElaps);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add matrix at host side for result checks
    iStart = seconds();
    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    iElaps = seconds() - iStart;
    printf("sumMatrix on host:\t %f sec\n", iElaps);

    // malloc device global memory
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((void **)&d_MatA, nBytes));
    CHECK(cudaMalloc((void **)&d_MatB, nBytes));
    CHECK(cudaMalloc((void **)&d_MatC, nBytes));

    // invoke kernel at host side
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    // zero the device inputs, then run a warm-up launch on a 1x1 problem.
    // cudaMemset takes a byte value; all-zero bytes represent 0.0f.
    CHECK(cudaMemset(d_MatA, 0, nBytes));
    CHECK(cudaMemset(d_MatB, 0, nBytes));
    sumMatrixGPU<<<grid, block>>>(d_MatA, d_MatB, d_MatC, 1, 1);
    CHECK(cudaGetLastError());

    // transfer data from host to device
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    iStart = seconds();
    sumMatrixGPU<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);

    // check kernel launch error
    CHECK(cudaGetLastError());

    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>> \n", iElaps,
           grid.x, grid.y, block.x, block.y);

    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nxy);

    // free device global memory
    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());

    return (0);
}
--------------------------------------------------------------------------------
/chapter05/Makefile:
--------------------------------------------------------------------------------
CU_APPS=checkSmemRectangle checkSmemSquare constantReadOnly constantStencil \
        reduceInteger reduceIntegerShfl simpleShfl transposeRectangle
C_APPS=

all: ${C_APPS} ${CU_APPS}

%: %.cu
	nvcc -O2 -arch=sm_20 -o $@ $<
%: %.c
	gcc -O2 -std=c99 -o $@ $<
clean:
	rm -f ${CU_APPS} ${C_APPS}
--------------------------------------------------------------------------------
/chapter05/constantReadOnly.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

#define RADIUS 4
#define BDIM 32

// constant memory
__constant__ float coef[RADIUS + 1];

// FD coefficient
#define a0 0.00000f
#define a1 0.80000f
#define a2 -0.20000f
#define a3 0.03809f
#define a4 -0.00357f

/* Fill in[0..size) with pseudo-random values in the range [0.00, 2.55]. */
void initialData(float *in, const int size)
{
    for (int i = 0; i < size; i++)
    {
        in[i] = (float)( rand() & 0xFF ) / 100.0f;
    }
}

/* Print in[RADIUS..size) on one line, separated by spaces. */
void printData(float *in, const int size)
{
    for (int i = RADIUS; i < size; i++)
    {
        printf("%f ", in[i]);
    }

    printf("\n");
}

/* Copy the stencil coefficients a0..a4 into the __constant__ array coef. */
void setup_coef_constant (void)
{
    const float h_coef[] = {a0, a1, a2, a3, a4};
    CHECK(cudaMemcpyToSymbol( coef, h_coef, (RADIUS + 1) * sizeof(float)));
}

/*
 * Host reference stencil. For each i in [RADIUS, isize] computes
 *   out[i] = sum over k = 1..4 of a_k * (in[i + k] - in[i - k]).
 * Both arrays must therefore be valid for indices [0, isize + RADIUS].
 */
void cpu_stencil_1d (float *in, float *out, int isize)
{
    for( int i = RADIUS; i <= isize; i++ )
    {
        float tmp = 0.0f;
        tmp += a1 * (in[i + 1] - in[i - 1])
               + a2 * (in[i + 2] - in[i - 2])
               + a3 * (in[i + 3] - in[i - 3])
               + a4 * (in[i + 4] - in[i - 4]);
        out[i] = tmp;
    }
}

/*
 * Compare elements [RADIUS, size) of hostRef and gpuRef. On the first pair
 * whose absolute difference exceeds the tolerance, print the mismatch
 * followed by "Arrays do not match." and stop.
 */
void checkResult(float *hostRef, float *gpuRef, const int size)
{
    const double epsilon = 1.0E-6;

    for (int i = RADIUS; i < size; i++)
    {
        if (abs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
                   gpuRef[i]);
            printf("Arrays do not match.\n\n");
            return;
        }
    }
}

/*
 * 1D stencil using coefficients from the __constant__ array coef.
 *
 * Each block stages BDIM elements plus RADIUS halo elements on each side in
 * shared memory, so the kernel must be launched with blockDim.x == BDIM.
 * There is no bounds check: every thread reads in[idx], the first RADIUS
 * threads of a block also read in[idx - RADIUS] and in[idx + BDIM], and every
 * thread writes out[idx]. The caller must pass pointers that leave RADIUS
 * valid elements before index 0 and after the last computed index.
 */
__global__ void stencil_1d(float *in, float *out)
{
    // shared memory
    __shared__ float smem[BDIM + 2 * RADIUS];

    // index to global memory
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // index to shared memory for stencil calculation
    int sidx = threadIdx.x + RADIUS;

    // Read data from global memory into shared memory
    smem[sidx] = in[idx];

    // read halo part to shared memory
    if (threadIdx.x < RADIUS)
    {
        smem[sidx - RADIUS] = in[idx - RADIUS];
        smem[sidx + BDIM] = in[idx + BDIM];
    }

    // Synchronize (ensure all the data is available)
    __syncthreads();

    // Apply the stencil
    float tmp = 0.0f;

#pragma unroll
    for (int i = 1; i <= RADIUS; i++)
    {
        tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
    }

    // Store the result
    out[idx] = tmp;
}

/*
 * Same stencil as stencil_1d, but the coefficients are read from the global
 * memory array dcoef (RADIUS + 1 floats, index 0 unused by the loop), which
 * is declared const __restrict__. The launch and pointer requirements are
 * the same as for stencil_1d.
 */
__global__ void stencil_1d_read_only (float* in,
                                      float* out,
                                      const float *__restrict__ dcoef)
{
    // shared memory
    __shared__ float smem[BDIM + 2 * RADIUS];

    // index to global memory
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // index to shared memory for stencil calculation
    int sidx = threadIdx.x + RADIUS;

    // Read data from global memory into shared memory
    smem[sidx] = in[idx];

    // read halo part to shared memory
    if (threadIdx.x < RADIUS)
    {
        smem[sidx - RADIUS] = in[idx - RADIUS];
        smem[sidx + BDIM] = in[idx + BDIM];
    }

    // Synchronize (ensure all the data is available)
    __syncthreads();

    // Apply the stencil
    float tmp = 0.0f;

#pragma unroll
    for (int i = 1; i <= RADIUS; i++)
    {
        tmp += dcoef[i] * (smem[sidx + i] - smem[sidx - i]);
    }

    // Store the result
    out[idx] = tmp;
}

/*
 * Runs the constant-memory stencil and the global-memory (read-only
 * coefficient) stencil on 2^24 elements and checks each result against the
 * host reference.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size
    int isize = 1 << 24;

    // RADIUS halo elements on each side of the isize interior elements
    size_t nBytes = (isize + 2 * RADIUS) * sizeof(float);
    printf("array size: %d ", isize);

    bool iprint = 0;

    // allocate host memory
    float *h_in = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    if (h_in == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_in);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // allocate device memory
    float *d_in, *d_out, *d_coef;
    CHECK(cudaMalloc((float**)&d_in, nBytes));
    CHECK(cudaMalloc((float**)&d_out, nBytes));
    CHECK(cudaMalloc((float**)&d_coef, (RADIUS + 1) * sizeof(float)));

    // set up coefficient to global memory
    const float h_coef[] = {a0, a1, a2, a3, a4};
    CHECK(cudaMemcpy(d_coef, h_coef, (RADIUS + 1) * sizeof(float),
                     cudaMemcpyHostToDevice));

    // initialize host array
    initialData(h_in, isize + 2 * RADIUS);

    // Copy to device
    CHECK(cudaMemcpy(d_in, h_in, nBytes, cudaMemcpyHostToDevice));

    // set up constant memory
    setup_coef_constant ();

    // launch configuration: isize is a multiple of BDIM, so the grid covers
    // the interior exactly
    dim3 block (BDIM, 1);
    dim3 grid (isize / block.x, 1);
    printf("(grid, block) %d,%d \n ", grid.x, block.x);

    // Launch stencil_1d() kernel on GPU; pointers are shifted past the
    // leading halo
    stencil_1d<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS);
    CHECK(cudaGetLastError());

    // Copy result back to host
    CHECK(cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost));

    // apply cpu stencil
    cpu_stencil_1d(h_in, hostRef, isize);

    // check results
    checkResult(hostRef, gpuRef, isize);

    // launch read only cache kernel
    stencil_1d_read_only<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS,
                                          d_coef);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost));
    checkResult(hostRef, gpuRef, isize);

    // print out results
    if(iprint)
    {
        printData(gpuRef, isize);
        printData(hostRef, isize);
    }

    // Cleanup
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));
    CHECK(cudaFree(d_coef));
    free(h_in);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
--------------------------------------------------------------------------------
/chapter05/constantStencil.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two header names below were lost in extraction and are
// restored from usage -- confirm against the upstream file.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * An example of using constant memory to optimize performance of a stencil
 * computation by storing coefficients of the computation in a constant memory
 * array (coef).
 */

#define RADIUS 4
#define BDIM 32

// constant memory
__constant__ float coef[RADIUS + 1];

// FD coefficients
#define a0 0.00000f
#define a1 0.80000f
#define a2 -0.20000f
#define a3 0.03809f
#define a4 -0.00357f

/* Fill in[0..size) with pseudo-random values (rand() & 0xFF) / 100. */
void initialData(float *in, const int size)
{
    for (int i = 0; i < size; i++)
    {
        in[i] = (float)(rand() & 0xFF) / 100.0f;
    }
}

/* Print in[RADIUS..size) on a single line. */
void printData(float *in, const int size)
{
    for (int i = RADIUS; i < size; i++)
    {
        printf("%f ", in[i]);
    }

    printf("\n");
}

/* Copy the coefficients a0..a4 into the __constant__ array coef. */
void setup_coef_constant (void)
{
    const float h_coef[] = {a0, a1, a2, a3, a4};
    CHECK(cudaMemcpyToSymbol( coef, h_coef, (RADIUS + 1) * sizeof(float)));
}

/*
 * Host reference stencil. Writes out[i] for i in [RADIUS, isize]; reads
 * in[i - RADIUS .. i + RADIUS], so both arrays need RADIUS elements of padding
 * on each side.
 */
void cpu_stencil_1d (float *in, float *out, int isize)
{
    for (int i = RADIUS; i <= isize; i++)
    {
        float tmp = a1 * (in[i + 1] - in[i - 1])
                    + a2 * (in[i + 2] - in[i - 2])
                    + a3 * (in[i + 3] - in[i - 3])
                    + a4 * (in[i + 4] - in[i - 4]);
        out[i] = tmp;
    }
}

/*
 * Compare hostRef and gpuRef over [RADIUS, size) and report the first element
 * whose absolute difference exceeds 1e-6. Prints nothing when they match.
 */
void checkResult(float *hostRef, float *gpuRef, const int size)
{
    double epsilon = 1.0E-6;
    bool match = 1;

    for (int i = RADIUS; i < size; i++)
    {
        // fabsf makes the floating-point comparison explicit
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = 0;
            printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
                   gpuRef[i]);
            break;
        }
    }

    if (!match) printf("Arrays do not match.\n\n");
}

/*
 * 1D stencil using shared memory for the data and constant memory (coef) for
 * the coefficients.
 *
 * Preconditions visible from the code:
 *   - blockDim.x must equal BDIM (smem holds BDIM + 2 * RADIUS floats).
 *   - in[idx - RADIUS] and in[idx + BDIM] are read, so `in` must point RADIUS
 *     elements into a buffer padded by RADIUS on both sides.
 *   - The loop contains block-wide barriers, so every thread of a block must
 *     run the same number of iterations (N a multiple of blockDim.x).
 */
__global__ void stencil_1d(float *in, float *out, int N)
{
    // shared memory
    __shared__ float smem[BDIM + 2 * RADIUS];

    // index to global memory
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    while (idx < N)
    {
        // index to shared memory for stencil calculation
        int sidx = threadIdx.x + RADIUS;

        // Read data from global memory into shared memory
        smem[sidx] = in[idx];

        // read halo part to shared memory
        if (threadIdx.x < RADIUS)
        {
            smem[sidx - RADIUS] = in[idx - RADIUS];
            smem[sidx + BDIM] = in[idx + BDIM];
        }

        // Synchronize (ensure all the data is available)
        __syncthreads();

        // Apply the stencil
        float tmp = 0.0f;

#pragma unroll
        for (int i = 1; i <= RADIUS; i++)
        {
            tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
        }

        // Store the result
        out[idx] = tmp;

        idx += gridDim.x * blockDim.x;

        // smem is refilled on the next iteration: wait until every thread has
        // finished reading the current tile before anyone overwrites it.
        __syncthreads();
    }
}


int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size
    int isize = 1 << 24;

    size_t nBytes = (isize + 2 * RADIUS) * sizeof(float);
    printf("array size: %d ", isize);

    bool iprint = 0;

    // allocate host memory
    float *h_in    = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);

    if (h_in == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(h_in);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // allocate device memory
    float *d_in, *d_out;
    CHECK(cudaMalloc((float**)&d_in, nBytes));
    CHECK(cudaMalloc((float**)&d_out, nBytes));

    // initialize host array
    initialData(h_in, isize + 2 * RADIUS);

    // Copy to device
    CHECK(cudaMemcpy(d_in, h_in, nBytes, cudaMemcpyHostToDevice));

    // set up constant memory
    setup_coef_constant();

    // launch configuration: one thread per element, capped at the device's
    // maximum grid width (the kernel loops over any remainder)
    dim3 block(BDIM, 1);
    int nBlocks = isize / block.x;

    if (deviceProp.maxGridSize[0] < nBlocks)
    {
        nBlocks = deviceProp.maxGridSize[0];
    }

    dim3 grid(nBlocks, 1);
    printf("(grid, block) %d,%d \n ", grid.x, block.x);

    // Launch stencil_1d() kernel on GPU
    stencil_1d<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS, isize);
    CHECK(cudaGetLastError());

    // Copy result back to host
    CHECK(cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost));

    // apply cpu stencil
    cpu_stencil_1d(h_in, hostRef, isize);

    // check results
    checkResult(hostRef, gpuRef, isize);

    // print out results
    if(iprint)
    {
        printData(gpuRef, isize);
        printData(hostRef, isize);
    }

    // Cleanup
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));
    free(h_in);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}

--------------------------------------------------------------------------------
/chapter06/Makefile:
--------------------------------------------------------------------------------
CU_APPS=asyncAPI simpleCallback simpleHyperqBreadth simpleHyperqDependence \
        simpleHyperqDepth simpleHyperqOpenmp simpleMultiAddBreadth \
        simpleMultiAddDepth
C_APPS=

all: ${C_APPS} ${CU_APPS}

%: %.cu
	nvcc -O2 -arch=sm_20 -Xcompiler -fopenmp -o $@ $< -lgomp
%: %.c
	gcc -O2 -std=c99 -o $@ $<
clean:
	rm -f ${CU_APPS} ${C_APPS}

--------------------------------------------------------------------------------
/chapter06/asyncAPI.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two system header names were lost in extraction; restored
// as the headers this file appears to need -- confirm against the repository.
#include <cuda_runtime.h>
#include <stdio.h>

/*
 * An example of using CUDA events to control asynchronous work launched on the
 * GPU. In this example, asynchronous copies and an asynchronous kernel are
 * used. A CUDA event is used to determine when that work has completed.
*/

/*
 * Add `value` to one element of g_data per thread. There is no bounds check,
 * so the launch must supply exactly one thread per element.
 */
__global__ void kernel(float *g_data, float value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + value;
}

/* Return 1 when all n elements of data equal x, otherwise report the first
 * mismatch and return 0. */
int checkResult(float *data, const int n, const float x)
{
    for (int i = 0; i < n; i++)
    {
        if (data[i] != x)
        {
            printf("Error! data[%d] = %f, ref = %f\n", i, data[i], x);
            return 0;
        }
    }

    return 1;
}

int main(int argc, char *argv[])
{
    int devID = 0;
    cudaDeviceProp deviceProps;
    CHECK(cudaGetDeviceProperties(&deviceProps, devID));
    printf("> %s running on", argv[0]);
    printf(" CUDA device [%s]\n", deviceProps.name);

    int num = 1 << 24;
    // the buffers hold floats, so size them by sizeof(float)
    size_t nbytes = num * sizeof(float);
    float value = 10.0f;

    // allocate host memory
    float *h_a = 0;
    CHECK(cudaMallocHost((void **)&h_a, nbytes));
    memset(h_a, 0, nbytes);

    // allocate device memory
    float *d_a = 0;
    CHECK(cudaMalloc((void **)&d_a, nbytes));
    CHECK(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration
    dim3 block = dim3(512);
    dim3 grid  = dim3((num + block.x - 1) / block.x);

    // create cuda event handles
    cudaEvent_t stop;
    CHECK(cudaEventCreate(&stop));

    // asynchronously issue work to the GPU (all to stream 0)
    CHECK(cudaMemcpyAsync(d_a, h_a, nbytes, cudaMemcpyHostToDevice));
    kernel<<<grid, block>>>(d_a, value);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpyAsync(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));
    CHECK(cudaEventRecord(stop));

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    cudaError_t status;

    while ((status = cudaEventQuery(stop)) == cudaErrorNotReady)
    {
        counter++;
    }

    // the loop also ends on a real error; do not mistake that for completion
    CHECK(status);

    // print the cpu and gpu times
    printf("CPU executed %lu iterations while waiting for GPU to finish\n",
           counter);

    // check the output for correctness
    bool bFinalResults = (bool) checkResult(h_a, num, value);

    // release resources
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFreeHost(h_a));
    CHECK(cudaFree(d_a));

    CHECK(cudaDeviceReset());

    return bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE;
}

--------------------------------------------------------------------------------
/chapter06/simpleCallback.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the two system header names were lost in extraction; restored
// as the headers this file appears to need -- confirm against the repository.
#include <stdio.h>
#include <cuda_runtime.h>

/*
 * An example of using CUDA callbacks to trigger work on the host after the
 * completion of asynchronous work on the device. In this example, n_streams
 * CUDA streams are created and 4 kernels are launched asynchronously in each.
 * Then, a callback is added at the completion of those asynchronous kernels
 * that prints diagnostic information.
 */

#define N 100000
#define NSTREAM 4

/* Host callback: `data` points at the int id of the stream that finished. */
void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void *data)
{
    printf("callback from stream %d\n", *((int *)data));
}

/*
 * kernel_1..kernel_4 are identical dummy workloads.
 * NOTE(review): `sum` is never stored, so an optimizing compiler may remove
 * the loop -- confirm the kernels still take measurable time.
 */
__global__ void kernel_1()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_2()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_3()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_4()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

int main(int argc, char **argv)
{
    int n_streams = NSTREAM;

    if (argc > 1) n_streams = atoi(argv[1]);

    if (n_streams < 1)
    {
        fprintf(stderr, "number of streams must be at least 1 (got %d)\n",
                n_streams);
        return EXIT_FAILURE;
    }

    // set up max connection; done before any CUDA call so the setting is in
    // place when the runtime initializes the device
    const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "8", 1);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> %s Starting...\n", argv[0]);
    printf("> Using Device %d: %s\n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // check if device support hyper-q
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                   "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    const char *ivalue = getenv (iname);
    printf ("> %s = %s\n", iname, ivalue);
    printf ("> with streams = %d\n", n_streams);

    // Allocate and initialize an array of stream handles, plus the ids that
    // are handed to the callbacks
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
                                cudaStream_t));
    int *stream_ids = (int *) malloc(n_streams * sizeof(int));

    if (streams == NULL || stream_ids == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(streams);
        free(stream_ids);
        return EXIT_FAILURE;
    }

    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
    }

    dim3 block (1);
    dim3 grid  (1);
    cudaEvent_t start_event, stop_event;
    CHECK(cudaEventCreate(&start_event));
    CHECK(cudaEventCreate(&stop_event));

    CHECK(cudaEventRecord(start_event, 0));

    for (int i = 0; i < n_streams; i++)
    {
        stream_ids[i] = i;
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_2<<<grid, block, 0, streams[i]>>>();
        kernel_3<<<grid, block, 0, streams[i]>>>();
        kernel_4<<<grid, block, 0, streams[i]>>>();
        CHECK(cudaGetLastError());
        CHECK(cudaStreamAddCallback(streams[i], my_callback,
                                    (void *)(stream_ids + i), 0));
    }

    CHECK(cudaEventRecord(stop_event, 0));
    CHECK(cudaEventSynchronize(stop_event));

    float elapsed_time;
    CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
    printf("Measured time for parallel execution = %.3fs\n",
           elapsed_time / 1000.0f);

    // release all stream
    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
    }

    free(streams);
    free(stream_ids);

    // destroy events
    CHECK(cudaEventDestroy(start_event));
    CHECK(cudaEventDestroy(stop_event));

    /*
     * cudaDeviceReset must be called before exiting in order for profiling and
     * tracing tools such as Nsight and Visual Profiler to show complete traces.
     */
    CHECK(cudaDeviceReset());

    return 0;
}

--------------------------------------------------------------------------------
/chapter06/simpleHyperqBreadth.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the three system header names were lost in extraction;
// restored as the headers this file appears to need -- confirm against the
// repository.
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

/*
 * This example demonstrates submitting work to a CUDA stream in breadth-first
 * order. Work submission in breadth-first order prevents false-dependencies
 * from reducing the parallelism of an application. kernel_1, kernel_2,
 * kernel_3, and kernel_4 simply implement identical, dummy computation.
 * Separate kernels are used to make the scheduling of these kernels simpler to
 * visualize in the Visual Profiler.
*/

#define N 300000
#define NSTREAM 4

/*
 * kernel_1..kernel_4 are identical dummy workloads.
 * NOTE(review): `sum` is never stored, so an optimizing compiler may remove
 * the loop -- confirm the kernels still take measurable time.
 */
__global__ void kernel_1()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_2()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_3()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_4()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

/*
 * Usage: simpleHyperqBreadth [n_streams] [bigcase]
 * Launches the four kernels into n_streams streams, one kernel type at a time
 * across all streams, and reports the elapsed time between two events.
 */
int main(int argc, char **argv)
{
    int n_streams = NSTREAM;
    int isize = 1;
    int iblock = 1;
    int bigcase = 0;

    // get argument from command line
    if (argc > 1) n_streams = atoi(argv[1]);

    if (argc > 2) bigcase = atoi(argv[2]);

    if (n_streams < 1)
    {
        fprintf(stderr, "number of streams must be at least 1 (got %d)\n",
                n_streams);
        return EXIT_FAILURE;
    }

    float elapsed_time;

    // set up max connection
    const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "32", 1);
    const char *ivalue = getenv (iname);
    printf ("%s = %s\n", iname, ivalue);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s with num_streams %d\n", dev, deviceProp.name,
           n_streams);
    CHECK(cudaSetDevice(dev));

    // check if device support hyper-q
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                   "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // Allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
                                cudaStream_t));

    if (streams == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        return EXIT_FAILURE;
    }

    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
    }

    // run kernel with more threads
    if (bigcase == 1)
    {
        iblock = 512;
        isize = 1 << 12;
    }

    // set up execution configuration
    dim3 block (iblock);
    dim3 grid  (isize / iblock);
    printf("> grid %d block %d\n", grid.x, block.x);

    // create events
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // record start event
    CHECK(cudaEventRecord(start, 0));

    // dispatch job with breadth first ordering
    for (int i = 0; i < n_streams; i++)
        kernel_1<<<grid, block, 0, streams[i]>>>();

    for (int i = 0; i < n_streams; i++)
        kernel_2<<<grid, block, 0, streams[i]>>>();

    for (int i = 0; i < n_streams; i++)
        kernel_3<<<grid, block, 0, streams[i]>>>();

    for (int i = 0; i < n_streams; i++)
        kernel_4<<<grid, block, 0, streams[i]>>>();

    // surface any launch-configuration error from the launches above
    CHECK(cudaGetLastError());

    // record stop event
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));

    // calculate elapsed time
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Measured time for parallel execution = %.3fs\n",
           elapsed_time / 1000.0f);

    // release all stream
    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
    }

    free(streams);

    // destroy events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // reset device
    CHECK(cudaDeviceReset());

    return 0;
}

--------------------------------------------------------------------------------
/chapter06/simpleHyperqDependence.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the three system header names were lost in extraction;
// restored as the headers this file appears to need -- confirm against the
// repository.
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

/*
 * A simple example of adding inter-stream dependencies using
 * cudaStreamWaitEvent. This code launches 4 kernels in each of n_streams
 * streams. An event is recorded at the completion of each stream (kernelEvent).
 * cudaStreamWaitEvent is then called on that event and the last stream
 * (streams[n_streams - 1]) to force all computation in the final stream to only
 * execute when all other streams have completed.
 */

#define N 300000
#define NSTREAM 4

/*
 * kernel_1..kernel_4 are identical dummy workloads.
 * NOTE(review): `sum` is never stored, so an optimizing compiler may remove
 * the loop -- confirm the kernels still take measurable time.
 */
__global__ void kernel_1()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_2()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_3()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_4()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

/*
 * Usage: simpleHyperqDependence [n_streams] [bigcase]
 * Launches the four kernels into each stream in turn, records one event per
 * stream and makes the last stream wait on every such event.
 */
int main(int argc, char **argv)
{
    int n_streams = NSTREAM;
    int isize = 1;
    int iblock = 1;
    int bigcase = 0;

    // get argument from command line
    if (argc > 1) n_streams = atoi(argv[1]);

    if (argc > 2) bigcase = atoi(argv[2]);

    // streams[n_streams - 1] is indexed below, so at least one stream is needed
    if (n_streams < 1)
    {
        fprintf(stderr, "number of streams must be at least 1 (got %d)\n",
                n_streams);
        return EXIT_FAILURE;
    }

    float elapsed_time;

    // set up max connection
    const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "32", 1);
    const char *ivalue = getenv (iname);
    printf ("%s = %s\n", iname, ivalue);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s with num_streams %d\n", dev, deviceProp.name,
           n_streams);
    CHECK(cudaSetDevice(dev));

    // check if device support hyper-q
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                   "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // Allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
                                cudaStream_t));
    cudaEvent_t *kernelEvent = (cudaEvent_t *) malloc(n_streams * sizeof(
                                   cudaEvent_t));

    if (streams == NULL || kernelEvent == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(streams);
        free(kernelEvent);
        return EXIT_FAILURE;
    }

    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
    }

    // run kernel with more threads
    if (bigcase == 1)
    {
        iblock = 512;
        isize = 1 << 12;
    }

    // set up execution configuration
    dim3 block (iblock);
    dim3 grid  (isize / iblock);
    printf("> grid %d block %d\n", grid.x, block.x);

    // create events
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    for (int i = 0; i < n_streams; i++)
    {
        CHECK(cudaEventCreateWithFlags(&(kernelEvent[i]),
                                       cudaEventDisableTiming));
    }

    // record start event
    CHECK(cudaEventRecord(start, 0));

    // dispatch job with depth first ordering
    for (int i = 0; i < n_streams; i++)
    {
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_2<<<grid, block, 0, streams[i]>>>();
        kernel_3<<<grid, block, 0, streams[i]>>>();
        kernel_4<<<grid, block, 0, streams[i]>>>();
        CHECK(cudaGetLastError());

        CHECK(cudaEventRecord(kernelEvent[i], streams[i]));
        CHECK(cudaStreamWaitEvent(streams[n_streams - 1], kernelEvent[i], 0));
    }

    // record stop event
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));

    // calculate elapsed time
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Measured time for parallel execution = %.3fs\n",
           elapsed_time / 1000.0f);

    // release all stream
    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
        CHECK(cudaEventDestroy(kernelEvent[i]));
    }

    free(streams);
    free(kernelEvent);

    // destroy the timing events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // reset device
    CHECK(cudaDeviceReset());

    return 0;
}

--------------------------------------------------------------------------------
/chapter06/simpleHyperqDepth.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the three system header names were lost in extraction;
// restored as the headers this file appears to need -- confirm against the
// repository.
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>

/*
 * This example demonstrates submitting work to a CUDA stream in depth-first
 * order. Work submission in depth-first order may introduce false-dependencies
 * between unrelated tasks in different CUDA streams, limiting the parallelism
 * of a CUDA application. kernel_1, kernel_2, kernel_3, and kernel_4 simply
 * implement identical, dummy computation. Separate kernels are used to make the
 * scheduling of these kernels simpler to visualize in the Visual Profiler.
*/

#define N 300000
#define NSTREAM 4

/*
 * kernel_1..kernel_4 are identical dummy workloads.
 * NOTE(review): `sum` is never stored, so an optimizing compiler may remove
 * the loop -- confirm the kernels still take measurable time.
 */
__global__ void kernel_1()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_2()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_3()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

__global__ void kernel_4()
{
    double sum = 0.0;

    for(int i = 0; i < N; i++)
    {
        sum = sum + tan(0.1) * tan(0.1);
    }
}

/*
 * Usage: simpleHyperqDepth [n_streams] [bigcase]
 * Launches all four kernels into one stream before moving on to the next
 * stream, and reports the elapsed time between two events.
 */
int main(int argc, char **argv)
{
    int n_streams = NSTREAM;
    int isize = 1;
    int iblock = 1;
    int bigcase = 0;

    // get argument from command line
    if (argc > 1) n_streams = atoi(argv[1]);

    if (argc > 2) bigcase = atoi(argv[2]);

    if (n_streams < 1)
    {
        fprintf(stderr, "number of streams must be at least 1 (got %d)\n",
                n_streams);
        return EXIT_FAILURE;
    }

    float elapsed_time;

    // set up max connection
    const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
    setenv (iname, "32", 1);
    const char *ivalue = getenv (iname);
    printf ("%s = %s\n", iname, ivalue);

    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name,
           n_streams);
    CHECK(cudaSetDevice(dev));

    // check if device support hyper-q
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
    {
        if (deviceProp.concurrentKernels == 0)
        {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 "
                   "or higher required)\n");
            printf("> CUDA kernel runs will be serialized\n");
        }
        else
        {
            printf("> GPU does not support HyperQ\n");
            printf("> CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);

    // Allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
                                cudaStream_t));

    if (streams == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        return EXIT_FAILURE;
    }

    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamCreate(&(streams[i])));
    }

    // run kernel with more threads
    if (bigcase == 1)
    {
        iblock = 512;
        isize = 1 << 12;
    }

    // set up execution configuration
    dim3 block (iblock);
    dim3 grid  (isize / iblock);
    printf("> grid %d block %d\n", grid.x, block.x);

    // create events
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // record start event
    CHECK(cudaEventRecord(start, 0));

    // dispatch job with depth first ordering
    for (int i = 0; i < n_streams; i++)
    {
        kernel_1<<<grid, block, 0, streams[i]>>>();
        kernel_2<<<grid, block, 0, streams[i]>>>();
        kernel_3<<<grid, block, 0, streams[i]>>>();
        kernel_4<<<grid, block, 0, streams[i]>>>();
        CHECK(cudaGetLastError());
    }

    // record stop event
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));

    // calculate elapsed time
    CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
    printf("Measured time for parallel execution = %.3fs\n",
           elapsed_time / 1000.0f);

    // release all stream
    for (int i = 0 ; i < n_streams ; i++)
    {
        CHECK(cudaStreamDestroy(streams[i]));
    }

    free(streams);

    // destroy events
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // reset device
    CHECK(cudaDeviceReset());

    return 0;
}

--------------------------------------------------------------------------------
/chapter06/simpleHyperqOpenmp.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
#include 3 | #include 4 | #include 5 | #include 6 | 7 | /* 8 | * An example of using OpenMP to parallelize the creation of CUDA work in 9 | * multiple streams. This example uses n_streams OpenMP threads to launch 4 10 | * kernels in each stream. Note the new pragma introduced, #pragma omp parallel. 11 | */ 12 | 13 | #define N 300000 14 | #define NSTREAM 4 15 | 16 | __global__ void kernel_1() 17 | { 18 | double sum = 0.0; 19 | 20 | for(int i = 0; i < N; i++) 21 | { 22 | sum = sum + tan(0.1) * tan(0.1); 23 | } 24 | } 25 | 26 | __global__ void kernel_2() 27 | { 28 | double sum = 0.0; 29 | 30 | for(int i = 0; i < N; i++) 31 | { 32 | sum = sum + tan(0.1) * tan(0.1); 33 | } 34 | } 35 | 36 | __global__ void kernel_3() 37 | { 38 | double sum = 0.0; 39 | 40 | for(int i = 0; i < N; i++) 41 | { 42 | sum = sum + tan(0.1) * tan(0.1); 43 | } 44 | } 45 | 46 | __global__ void kernel_4() 47 | { 48 | double sum = 0.0; 49 | 50 | for(int i = 0; i < N; i++) 51 | { 52 | sum = sum + tan(0.1) * tan(0.1); 53 | } 54 | } 55 | 56 | int main(int argc, char **argv) 57 | { 58 | int n_streams = NSTREAM; 59 | int isize = 1; 60 | int iblock = 1; 61 | int bigcase = 0; 62 | 63 | // get argument from command line 64 | if (argc > 1) n_streams = atoi(argv[1]); 65 | 66 | if (argc > 2) bigcase = atoi(argv[2]); 67 | 68 | float elapsed_time; 69 | 70 | // set up max connections 71 | char* iname = "CUDA_DEVICE_MAX_CONNECTIONS"; 72 | setenv (iname, "32", 1); 73 | char *ivalue = getenv (iname); 74 | printf ("%s = %s\n", iname, ivalue); 75 | 76 | int dev = 0; 77 | cudaDeviceProp deviceProp; 78 | CHECK(cudaGetDeviceProperties(&deviceProp, dev)); 79 | printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name, 80 | n_streams); 81 | CHECK(cudaSetDevice(dev)); 82 | 83 | // check if device supports hyper-q 84 | if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) 85 | { 86 | if (deviceProp.concurrentKernels == 0) 87 | { 88 | printf("> GPU does not support concurrent kernel
execution (SM 3.5 " 89 | "or higher required)\n"); 90 | printf("> CUDA kernel runs will be serialized\n"); 91 | } 92 | else 93 | { 94 | printf("> GPU does not support HyperQ\n"); 95 | printf("> CUDA kernel runs will have limited concurrency\n"); 96 | } 97 | } 98 | 99 | printf("> Compute Capability %d.%d hardware with %d multi-processors\n", 100 | deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount); 101 | 102 | // Allocate and initialize an array of stream handles 103 | cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof( 104 | cudaStream_t)); 105 | 106 | for (int i = 0 ; i < n_streams ; i++) 107 | { 108 | CHECK(cudaStreamCreate(&(streams[i]))); 109 | } 110 | 111 | // run kernel with more threads 112 | if (bigcase == 1) 113 | { 114 | iblock = 512; 115 | isize = 1 << 12; 116 | } 117 | 118 | // set up execution configuration 119 | dim3 block (iblock); 120 | dim3 grid (isize / iblock); 121 | printf("> grid %d block %d\n", grid.x, block.x); 122 | 123 | // create events 124 | cudaEvent_t start, stop; 125 | CHECK(cudaEventCreate(&start)); 126 | CHECK(cudaEventCreate(&stop)); 127 | 128 | // record start event 129 | CHECK(cudaEventRecord(start, 0)); 130 | 131 | // dispatch job with depth first ordering using OpenMP 132 | omp_set_num_threads(n_streams); 133 | #pragma omp parallel 134 | { 135 | int i = omp_get_thread_num(); 136 | kernel_1<<<grid, block, 0, streams[i]>>>(); 137 | kernel_2<<<grid, block, 0, streams[i]>>>(); 138 | kernel_3<<<grid, block, 0, streams[i]>>>(); 139 | kernel_4<<<grid, block, 0, streams[i]>>>(); 140 | } 141 | 142 | // record stop event 143 | CHECK(cudaEventRecord(stop, 0)); 144 | CHECK(cudaEventSynchronize(stop)); 145 | 146 | // calculate elapsed time 147 | CHECK(cudaEventElapsedTime(&elapsed_time, start, stop)); 148 | printf("Measured time for parallel execution = %.3fs\n", 149 | elapsed_time / 1000.0f); 150 | 151 | // release all streams 152 | for (int i = 0 ; i < n_streams ; i++) 153 | { 154 | CHECK(cudaStreamDestroy(streams[i])); 155 | } 156 | 157 | free(streams); 158 | 159 | // destroy events 160 |
CHECK(cudaEventDestroy(start)); 161 | CHECK(cudaEventDestroy(stop)); 162 | 163 | // reset device 164 | CHECK(cudaDeviceReset()); 165 | 166 | return 0; 167 | } 168 | -------------------------------------------------------------------------------- /chapter07/Makefile: -------------------------------------------------------------------------------- 1 | CU_APPS=atomic-ordering floating-point-accuracy floating-point-perf fmad \ 2 | intrinsic-standard-comp my-atomic-add nbody 3 | C_APPS= 4 | 5 | all: ${C_APPS} ${CU_APPS} 6 | 7 | %: %.cu 8 | nvcc -O2 -arch=sm_20 -o $@ $< 9 | %: %.c 10 | gcc -O2 -std=c99 -o $@ $< 11 | clean: 12 | rm -f ${CU_APPS} ${C_APPS} 13 | -------------------------------------------------------------------------------- /chapter07/atomic-ordering.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example illustrates the difference between using atomic operations and 7 | * using unsafe accesses to increment a shared variable. 8 | * 9 | * In both the atomics() and unsafe() kernels, each thread repeatedly increments 10 | * a globally shared variable by 1. Each thread also stores the value it reads 11 | * from the shared location for the first increment. 12 | **/ 13 | 14 | /** 15 | * This version of the kernel uses atomic operations to safely increment a 16 | * shared variable from multiple threads. 17 | **/ 18 | __global__ void atomics(int *shared_var, int *values_read, int N, int iters) 19 | { 20 | int i; 21 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 22 | 23 | if (tid >= N) return; 24 | 25 | values_read[tid] = atomicAdd(shared_var, 1); 26 | 27 | for (i = 0; i < iters; i++) 28 | { 29 | atomicAdd(shared_var, 1); 30 | } 31 | } 32 | 33 | /** 34 | * This version of the kernel performs the same increments as atomics() but in 35 | * an unsafe manner. 
36 | **/ 37 | __global__ void unsafe(int *shared_var, int *values_read, int N, int iters) 38 | { 39 | int i; 40 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 41 | 42 | if (tid >= N) return; 43 | 44 | int old = *shared_var; 45 | *shared_var = old + 1; 46 | values_read[tid] = old; 47 | 48 | for (i = 0; i < iters; i++) 49 | { 50 | int old = *shared_var; 51 | *shared_var = old + 1; 52 | } 53 | } 54 | 55 | /** 56 | * Utility function for printing the contents of an array. 57 | **/ 58 | static void print_read_results(int *h_arr, int *d_arr, int N, 59 | const char *label) 60 | { 61 | int i; 62 | int maxNumToPrint = 10; 63 | int nToPrint = N > maxNumToPrint ? maxNumToPrint : N; 64 | CHECK(cudaMemcpy(h_arr, d_arr, nToPrint * sizeof(int), 65 | cudaMemcpyDeviceToHost)); 66 | printf("Threads performing %s operations read values", label); 67 | 68 | for (i = 0; i < nToPrint; i++) 69 | { 70 | printf(" %d", h_arr[i]); 71 | } 72 | 73 | printf("\n"); 74 | } 75 | 76 | int main(int argc, char **argv) 77 | { 78 | int N = 64; 79 | int block = 32; 80 | int runs = 30; 81 | int iters = 100000; 82 | int r; 83 | int *d_shared_var; 84 | int h_shared_var_atomic, h_shared_var_unsafe; 85 | int *d_values_read_atomic; 86 | int *d_values_read_unsafe; 87 | int *h_values_read; 88 | 89 | CHECK(cudaMalloc((void **)&d_shared_var, sizeof(int))); 90 | CHECK(cudaMalloc((void **)&d_values_read_atomic, N * sizeof(int))); 91 | CHECK(cudaMalloc((void **)&d_values_read_unsafe, N * sizeof(int))); 92 | h_values_read = (int *)malloc(N * sizeof(int)); 93 | 94 | double atomic_mean_time = 0; 95 | double unsafe_mean_time = 0; 96 | 97 | for (r = 0; r < runs; r++) 98 | { 99 | double start_atomic = seconds(); 100 | CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int))); 101 | atomics<<>>(d_shared_var, d_values_read_atomic, N, 102 | iters); 103 | CHECK(cudaDeviceSynchronize()); 104 | atomic_mean_time += seconds() - start_atomic; 105 | CHECK(cudaMemcpy(&h_shared_var_atomic, d_shared_var, sizeof(int), 106 | 
cudaMemcpyDeviceToHost)); 107 | 108 | double start_unsafe = seconds(); 109 | CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int))); 110 | unsafe<<>>(d_shared_var, d_values_read_unsafe, N, 111 | iters); 112 | CHECK(cudaDeviceSynchronize()); 113 | unsafe_mean_time += seconds() - start_unsafe; 114 | CHECK(cudaMemcpy(&h_shared_var_unsafe, d_shared_var, sizeof(int), 115 | cudaMemcpyDeviceToHost)); 116 | } 117 | 118 | printf("In total, %d runs using atomic operations took %f s\n", 119 | runs, atomic_mean_time); 120 | printf(" Using atomic operations also produced an output of %d\n", 121 | h_shared_var_atomic); 122 | printf("In total, %d runs using unsafe operations took %f s\n", 123 | runs, unsafe_mean_time); 124 | printf(" Using unsafe operations also produced an output of %d\n", 125 | h_shared_var_unsafe); 126 | 127 | print_read_results(h_values_read, d_values_read_atomic, N, "atomic"); 128 | print_read_results(h_values_read, d_values_read_unsafe, N, "unsafe"); 129 | 130 | return 0; 131 | } 132 | -------------------------------------------------------------------------------- /chapter07/floating-point-accuracy.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example demonstrates floating-point's inability to represent certain 7 | * values with a specific value as an example. 8 | * 9 | * In this example, the value 12.1 is stored in single- and double-precision 10 | * floating-point variables on both the host and device. After retrieving the 11 | * results from the device, the actual values stored are printed to 20 decimal 12 | * places and the single- and double-precision results from the host and device 13 | * are compared to each other to verify that host and device are equally 14 | * accurate for the same type. 15 | **/ 16 | 17 | /** 18 | * Save the single- and double-precision representation of 12.1 from the device 19 | * into global memory. 
That global memory is then copied back to the host for 20 | * later analysis. 21 | **/ 22 | __global__ void kernel(float *F, double *D) 23 | { 24 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 25 | 26 | if (tid == 0) 27 | { 28 | *F = 12.1; 29 | *D = 12.1; 30 | } 31 | } 32 | 33 | int main(int argc, char **argv) 34 | { 35 | float *deviceF; 36 | float h_deviceF; 37 | double *deviceD; 38 | double h_deviceD; 39 | 40 | float hostF = 12.1; 41 | double hostD = 12.1; 42 | 43 | CHECK(cudaMalloc((void **)&deviceF, sizeof(float))); 44 | CHECK(cudaMalloc((void **)&deviceD, sizeof(double))); 45 | kernel<<<1, 32>>>(deviceF, deviceD); 46 | CHECK(cudaMemcpy(&h_deviceF, deviceF, sizeof(float), 47 | cudaMemcpyDeviceToHost)); 48 | CHECK(cudaMemcpy(&h_deviceD, deviceD, sizeof(double), 49 | cudaMemcpyDeviceToHost)); 50 | 51 | printf("Host single-precision representation of 12.1 = %.20f\n", hostF); 52 | printf("Host double-precision representation of 12.1 = %.20f\n", hostD); 53 | printf("Device single-precision representation of 12.1 = %.20f\n", hostF); 54 | printf("Device double-precision representation of 12.1 = %.20f\n", hostD); 55 | printf("Device and host single-precision representation equal? %s\n", 56 | hostF == h_deviceF ? "yes" : "no"); 57 | printf("Device and host double-precision representation equal? %s\n", 58 | hostD == h_deviceD ? "yes" : "no"); 59 | 60 | return 0; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /chapter07/fmad.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example illustrates the effect on numerical accuracy of fusing a 7 | * multiply-add into a single MAD instruction. 
8 | **/ 9 | 10 | __global__ void fmad_kernel(double x, double y, double *out) 11 | { 12 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 13 | 14 | if (tid == 0) 15 | { 16 | *out = x * x + y; 17 | } 18 | } 19 | 20 | double host_fmad_kernel(double x, double y) 21 | { 22 | return x * x + y; 23 | } 24 | 25 | int main(int argc, char **argv) 26 | { 27 | double *d_out, h_out; 28 | double x = 2.891903; 29 | double y = -3.980364; 30 | 31 | double host_value = host_fmad_kernel(x, y); 32 | CHECK(cudaMalloc((void **)&d_out, sizeof(double))); 33 | fmad_kernel<<<1, 32>>>(x, y, d_out); 34 | CHECK(cudaMemcpy(&h_out, d_out, sizeof(double), 35 | cudaMemcpyDeviceToHost)); 36 | 37 | if (host_value == h_out) 38 | { 39 | printf("The device output the same value as the host.\n"); 40 | } 41 | else 42 | { 43 | printf("The device output a different value than the host, diff=%e.\n", 44 | fabs(host_value - h_out)); 45 | } 46 | 47 | return 0; 48 | } 49 | -------------------------------------------------------------------------------- /chapter07/intrinsic-standard-comp.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example demonstrates the relative performance and accuracy of CUDA 7 | * standard and intrinsic functions. 8 | * 9 | * The computational kernel of this example is the iterative calculation of a 10 | * value squared. This computation is done on the host, on the device with a 11 | * standard function, and on the device with an intrinsic function. The results 12 | * from all three are compared for numerical accuracy (with the host as the 13 | * baseline), and the performance of standard and intrinsic functions is also 14 | * compared. 15 | **/ 16 | 17 | /** 18 | * Perform iters power operations using the standard powf function. 
19 | **/ 20 | __global__ void standard_kernel(float a, float *out, int iters) 21 | { 22 | int i; 23 | int tid = (blockDim.x * blockIdx.x) + threadIdx.x; 24 | 25 | if(tid == 0) 26 | { 27 | float tmp; 28 | 29 | for (i = 0; i < iters; i++) 30 | { 31 | tmp = powf(a, 2.0f); 32 | } 33 | 34 | *out = tmp; 35 | } 36 | } 37 | 38 | /** 39 | * Perform iters power operations using the intrinsic __powf function. 40 | **/ 41 | __global__ void intrinsic_kernel(float a, float *out, int iters) 42 | { 43 | int i; 44 | int tid = (blockDim.x * blockIdx.x) + threadIdx.x; 45 | 46 | if(tid == 0) 47 | { 48 | float tmp; 49 | 50 | for (i = 0; i < iters; i++) 51 | { 52 | tmp = __powf(a, 2.0f); 53 | } 54 | 55 | *out = tmp; 56 | } 57 | } 58 | 59 | int main(int argc, char **argv) 60 | { 61 | int i; 62 | int runs = 30; 63 | int iters = 1000; 64 | 65 | float *d_standard_out, h_standard_out; 66 | CHECK(cudaMalloc((void **)&d_standard_out, sizeof(float))); 67 | 68 | float *d_intrinsic_out, h_intrinsic_out; 69 | CHECK(cudaMalloc((void **)&d_intrinsic_out, sizeof(float))); 70 | 71 | float input_value = 8181.25; 72 | 73 | double mean_intrinsic_time = 0.0; 74 | double mean_standard_time = 0.0; 75 | 76 | for (i = 0; i < runs; i++) 77 | { 78 | double start_standard = seconds(); 79 | standard_kernel<<<1, 32>>>(input_value, d_standard_out, iters); 80 | CHECK(cudaDeviceSynchronize()); 81 | mean_standard_time += seconds() - start_standard; 82 | 83 | double start_intrinsic = seconds(); 84 | intrinsic_kernel<<<1, 32>>>(input_value, d_intrinsic_out, iters); 85 | CHECK(cudaDeviceSynchronize()); 86 | mean_intrinsic_time += seconds() - start_intrinsic; 87 | } 88 | 89 | CHECK(cudaMemcpy(&h_standard_out, d_standard_out, sizeof(float), 90 | cudaMemcpyDeviceToHost)); 91 | CHECK(cudaMemcpy(&h_intrinsic_out, d_intrinsic_out, sizeof(float), 92 | cudaMemcpyDeviceToHost)); 93 | float host_value = powf(input_value, 2.0f); 94 | 95 | printf("Host calculated\t\t\t%f\n", host_value); 96 | printf("Standard Device 
calculated\t%f\n", h_standard_out); 97 | printf("Intrinsic Device calculated\t%f\n", h_intrinsic_out); 98 | printf("Host equals Standard?\t\t%s diff=%e\n", 99 | host_value == h_standard_out ? "Yes" : "No", 100 | fabs(host_value - h_standard_out)); 101 | printf("Host equals Intrinsic?\t\t%s diff=%e\n", 102 | host_value == h_intrinsic_out ? "Yes" : "No", 103 | fabs(host_value - h_intrinsic_out)); 104 | printf("Standard equals Intrinsic?\t%s diff=%e\n", 105 | h_standard_out == h_intrinsic_out ? "Yes" : "No", 106 | fabs(h_standard_out - h_intrinsic_out)); 107 | printf("\n"); 108 | printf("Mean execution time for standard function powf: %f s\n", 109 | mean_standard_time); 110 | printf("Mean execution time for intrinsic function __powf: %f s\n", 111 | mean_intrinsic_time); 112 | 113 | return 0; 114 | } 115 | -------------------------------------------------------------------------------- /chapter07/my-atomic-add.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example illustrates implementation of custom atomic operations using 7 | * CUDA's built-in atomicCAS function to implement atomic signed 32-bit integer 8 | * addition. 9 | **/ 10 | 11 | __device__ int myAtomicAdd(int *address, int incr) 12 | { 13 | // Create an initial guess for the value stored at *address. 14 | int guess = *address; 15 | int oldValue = atomicCAS(address, guess, guess + incr); 16 | 17 | // Loop while the guess is incorrect. 
18 | while (oldValue != guess) 19 | { 20 | guess = oldValue; 21 | oldValue = atomicCAS(address, guess, guess + incr); 22 | } 23 | 24 | return oldValue; 25 | } 26 | 27 | __global__ void kernel(int *sharedInteger) 28 | { 29 | myAtomicAdd(sharedInteger, 1); 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | int h_sharedInteger; 35 | int *d_sharedInteger; 36 | CHECK(cudaMalloc((void **)&d_sharedInteger, sizeof(int))); 37 | CHECK(cudaMemset(d_sharedInteger, 0x00, sizeof(int))); 38 | 39 | kernel<<<4, 128>>>(d_sharedInteger); 40 | 41 | CHECK(cudaMemcpy(&h_sharedInteger, d_sharedInteger, sizeof(int), 42 | cudaMemcpyDeviceToHost)); 43 | printf("4 x 128 increments led to value of %d\n", h_sharedInteger); 44 | 45 | return 0; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /chapter08/Makefile: -------------------------------------------------------------------------------- 1 | CU_APPS=cublas cuda-openacc cufft-multi cufft cusparse rand-kernel \ 2 | replace-rand-streams replace-rand 3 | C_APPS=simple-data simple-kernels simple-parallel 4 | 5 | all: ${C_APPS} ${CU_APPS} 6 | 7 | cublas: cublas.cu 8 | nvcc -O2 -arch=sm_20 -lcublas -o cublas cublas.cu 9 | cuda-openacc: cuda-openacc.cu 10 | nvcc -O2 -arch=sm_20 -lcublas -lcurand -o cuda-openacc cuda-openacc.cu 11 | cufft-multi: cufft-multi.cu 12 | nvcc -O2 -arch=sm_20 -lcufft -o cufft-multi cufft-multi.cu 13 | cufft: cufft.cu 14 | nvcc -O2 -arch=sm_20 -lcufft -o cufft cufft.cu 15 | cusparse: cusparse.cu 16 | nvcc -O2 -arch=sm_20 -lcusparse -o cusparse cusparse.cu 17 | rand-kernel: rand-kernel.cu 18 | nvcc -O2 -arch=sm_20 -lcurand -o rand-kernel rand-kernel.cu 19 | replace-rand-streams: replace-rand-streams.cu 20 | nvcc -O2 -arch=sm_20 -lcurand -o replace-rand-streams replace-rand-streams.cu 21 | replace-rand: replace-rand.cu 22 | nvcc -O2 -arch=sm_20 -lcurand -o replace-rand replace-rand.cu 23 | %: %.cu 24 | nvcc -O2 -arch=sm_20 -o $@ $< 25 | %: %.c 26 | gcc -O2 -std=c99 -o 
$@ $< 27 | clean: 28 | rm -f ${CU_APPS} ${C_APPS} 29 | -------------------------------------------------------------------------------- /chapter08/cublas.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #include 5 | #include "cublas_v2.h" 6 | 7 | /* 8 | * A simple example of performing matrix-vector multiplication using the cuBLAS 9 | * library and some randomly generated inputs. 10 | */ 11 | 12 | /* 13 | * M = # of rows 14 | * N = # of columns 15 | */ 16 | int M = 1024; 17 | int N = 1024; 18 | 19 | /* 20 | * Generate a vector of length N with random single-precision floating-point 21 | * values between 0 and 100. 22 | */ 23 | void generate_random_vector(int N, float **outX) 24 | { 25 | int i; 26 | double rMax = (double)RAND_MAX; 27 | float *X = (float *)malloc(sizeof(float) * N); 28 | 29 | for (i = 0; i < N; i++) 30 | { 31 | int r = rand(); 32 | double dr = (double)r; 33 | X[i] = (dr / rMax) * 100.0; 34 | } 35 | 36 | *outX = X; 37 | } 38 | 39 | /* 40 | * Generate a matrix with M rows and N columns in column-major order. The matrix 41 | * will be filled with random single-precision floating-point values between 0 42 | * and 100. 
43 | */ 44 | void generate_random_dense_matrix(int M, int N, float **outA) 45 | { 46 | int i, j; 47 | double rMax = (double)RAND_MAX; 48 | float *A = (float *)malloc(sizeof(float) * M * N); 49 | 50 | // For each column 51 | for (j = 0; j < N; j++) 52 | { 53 | // For each row 54 | for (i = 0; i < M; i++) 55 | { 56 | double dr = (double)rand(); 57 | A[j * M + i] = (dr / rMax) * 100.0; 58 | } 59 | } 60 | 61 | *outA = A; 62 | } 63 | 64 | int main(int argc, char **argv) 65 | { 66 | int i; 67 | float *A, *dA; 68 | float *X, *dX; 69 | float *Y, *dY; 70 | float beta; 71 | float alpha; 72 | cublasHandle_t handle = 0; 73 | 74 | alpha = 3.0f; 75 | beta = 4.0f; 76 | 77 | // Generate inputs 78 | srand(9384); 79 | generate_random_dense_matrix(M, N, &A); 80 | generate_random_vector(N, &X); 81 | generate_random_vector(M, &Y); 82 | 83 | // Create the cuBLAS handle 84 | CHECK_CUBLAS(cublasCreate(&handle)); 85 | 86 | // Allocate device memory 87 | CHECK(cudaMalloc((void **)&dA, sizeof(float) * M * N)); 88 | CHECK(cudaMalloc((void **)&dX, sizeof(float) * N)); 89 | CHECK(cudaMalloc((void **)&dY, sizeof(float) * M)); 90 | 91 | // Transfer inputs to the device 92 | CHECK_CUBLAS(cublasSetVector(N, sizeof(float), X, 1, dX, 1)); 93 | CHECK_CUBLAS(cublasSetVector(M, sizeof(float), Y, 1, dY, 1)); 94 | CHECK_CUBLAS(cublasSetMatrix(M, N, sizeof(float), A, M, dA, M)); 95 | 96 | // Execute the matrix-vector multiplication 97 | CHECK_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, M, N, &alpha, dA, M, dX, 1, 98 | &beta, dY, 1)); 99 | 100 | // Retrieve the output vector from the device 101 | CHECK_CUBLAS(cublasGetVector(M, sizeof(float), dY, 1, Y, 1)); 102 | 103 | for (i = 0; i < 10; i++) 104 | { 105 | printf("%2.2f\n", Y[i]); 106 | } 107 | 108 | printf("...\n"); 109 | 110 | free(A); 111 | free(X); 112 | free(Y); 113 | 114 | CHECK(cudaFree(dA)); 115 | CHECK(cudaFree(dY)); 116 | CHECK_CUBLAS(cublasDestroy(handle)); 117 | 118 | return 0; 119 | } 120 | 
-------------------------------------------------------------------------------- /chapter08/cuda-openacc.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* 8 | * This example illustrates the use of OpenACC and CUDA libraries in the same 9 | * application. cuRAND is used to fill two input matrices with random values. 10 | * OpenACC is used to implement a matrix-multiply using the parallel and loop 11 | * directives. Finally, cuBLAS is used to first sum the values of every row, and 12 | * then sum those values together to calculate the sum of all values in the 13 | * output matrix. 14 | */ 15 | 16 | #define M 1024 17 | #define N 1024 18 | #define P 1024 19 | 20 | int main(int argc, char **argv) 21 | { 22 | int i, j, k; 23 | float *__restrict__ d_A; 24 | float *__restrict__ d_B; 25 | float *__restrict__ d_C; 26 | float *d_row_sums; 27 | float total_sum; 28 | curandGenerator_t rand_state = 0; 29 | cublasHandle_t cublas_handle = 0; 30 | 31 | // Initialize the cuRAND and cuBLAS handles. 32 | CHECK_CURAND(curandCreateGenerator(&rand_state, CURAND_RNG_PSEUDO_DEFAULT)); 33 | CHECK_CUBLAS(cublasCreate(&cublas_handle)); 34 | 35 | // Allocate GPU memory for the input matrices, output matrix, and row sums. 36 | CHECK(cudaMalloc((void **)&d_A, sizeof(float) * M * N)); 37 | CHECK(cudaMalloc((void **)&d_B, sizeof(float) * N * P)); 38 | CHECK(cudaMalloc((void **)&d_C, sizeof(float) * M * P)); 39 | CHECK(cudaMalloc((void **)&d_row_sums, sizeof(float) * M)); 40 | 41 | // Generate random values in both input matrices. 
42 | CHECK_CURAND(curandGenerateUniform(rand_state, d_A, M * N)); 43 | CHECK_CURAND(curandGenerateUniform(rand_state, d_B, N * P)); 44 | 45 | // Perform a matrix multiply parallelized across gangs and workers 46 | #pragma acc parallel loop gang deviceptr(d_A, d_B, d_C) 47 | 48 | for (i = 0; i < M; i++) 49 | { 50 | #pragma acc loop worker vector 51 | 52 | for (j = 0; j < P; j++) 53 | { 54 | float sum = 0.0f; 55 | 56 | for (k = 0; k < N; k++) 57 | { 58 | sum += d_A[i * N + k] * d_B[k * P + j]; 59 | } 60 | 61 | d_C[i * P + j] = sum; 62 | } 63 | } 64 | 65 | /* 66 | * Set cuBLAS to device pointer mode, indicating that all scalars are passed 67 | * as device pointers. 68 | */ 69 | CHECK_CUBLAS(cublasSetPointerMode(cublas_handle, 70 | CUBLAS_POINTER_MODE_DEVICE)); 71 | 72 | // Sum the values contained in each row. 73 | for (i = 0; i < M; i++) 74 | { 75 | CHECK_CUBLAS(cublasSasum(cublas_handle, P, d_C + (i * P), 1, 76 | d_row_sums + i)); 77 | } 78 | 79 | /* 80 | * Set cuBLAS back to host pointer mode, indicating that all scalars are 81 | * passed as host pointers. 82 | */ 83 | CHECK_CUBLAS(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST)); 84 | /* 85 | * Do the final sum of the sum of all rows to produce a total for the whole 86 | * output matrix. 87 | */ 88 | CHECK_CUBLAS(cublasSasum(cublas_handle, M, d_row_sums, 1, &total_sum)); 89 | CHECK(cudaDeviceSynchronize()); 90 | 91 | // Release device memory 92 | CHECK(cudaFree(d_A)); 93 | CHECK(cudaFree(d_B)); 94 | CHECK(cudaFree(d_C)); 95 | CHECK(cudaFree(d_row_sums)); 96 | 97 | printf("Total sum = %f\n", total_sum); 98 | 99 | return 0; 100 | } 101 | -------------------------------------------------------------------------------- /chapter08/cufft-multi.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* 8 | * An example usage of the Multi-GPU cuFFT XT library introduced in CUDA 6. 
This 9 | * example performs a 1D forward FFT across all devices detected in the system. 10 | */ 11 | 12 | /* 13 | * Create N fake samplings along the function cos(x). These samplings will be 14 | * stored as single-precision floating-point values. 15 | */ 16 | void generate_fake_samples(int N, float **out) 17 | { 18 | int i; 19 | float *result = (float *)malloc(sizeof(float) * N); 20 | double delta = M_PI / 4.0; 21 | 22 | for (i = 0; i < N; i++) 23 | { 24 | result[i] = cos(i * delta); 25 | } 26 | 27 | *out = result; 28 | } 29 | 30 | /* 31 | * Convert a real-valued vector r of length N to a complex-valued vector. 32 | */ 33 | void real_to_complex(float *r, cufftComplex **complx, int N) 34 | { 35 | int i; 36 | (*complx) = (cufftComplex *)malloc(sizeof(cufftComplex) * N); 37 | 38 | for (i = 0; i < N; i++) 39 | { 40 | (*complx)[i].x = r[i]; 41 | (*complx)[i].y = 0; 42 | } 43 | } 44 | 45 | /* 46 | * Retrieve device IDs for all CUDA devices in the current system. 47 | */ 48 | int getAllGpus(int **gpus) 49 | { 50 | int i; 51 | int nGpus; 52 | 53 | CHECK(cudaGetDeviceCount(&nGpus)); 54 | 55 | *gpus = (int *)malloc(sizeof(int) * nGpus); 56 | 57 | for (i = 0; i < nGpus; i++) 58 | { 59 | (*gpus)[i] = i; 60 | } 61 | 62 | return nGpus; 63 | } 64 | 65 | int main(int argc, char **argv) 66 | { 67 | int i; 68 | int N = 1024; 69 | float *samples; 70 | cufftComplex *complexSamples; 71 | int *gpus; 72 | size_t *workSize; 73 | cufftHandle plan = 0; 74 | cudaLibXtDesc *dComplexSamples; 75 | 76 | int nGPUs = getAllGpus(&gpus); 77 | nGPUs = nGPUs > 2 ?
2 : nGPUs; 78 | workSize = (size_t *)malloc(sizeof(size_t) * nGPUs); 79 | 80 | // Setup the cuFFT Multi-GPU plan 81 | CHECK_CUFFT(cufftCreate(&plan)); 82 | // CHECK_CUFFT(cufftPlan1d(&plan, N, CUFFT_C2C, 1)); 83 | CHECK_CUFFT(cufftXtSetGPUs(plan, 2, gpus)); 84 | CHECK_CUFFT(cufftMakePlan1d(plan, N, CUFFT_C2C, 1, workSize)); 85 | 86 | // Generate inputs 87 | generate_fake_samples(N, &samples); 88 | real_to_complex(samples, &complexSamples, N); 89 | cufftComplex *complexFreq = (cufftComplex *)malloc( 90 | sizeof(cufftComplex) * N); 91 | 92 | // Allocate memory across multiple GPUs and transfer the inputs into it 93 | CHECK_CUFFT(cufftXtMalloc(plan, &dComplexSamples, CUFFT_XT_FORMAT_INPLACE)); 94 | CHECK_CUFFT(cufftXtMemcpy(plan, dComplexSamples, complexSamples, 95 | CUFFT_COPY_HOST_TO_DEVICE)); 96 | 97 | // Execute a complex-to-complex 1D FFT across multiple GPUs 98 | CHECK_CUFFT(cufftXtExecDescriptorC2C(plan, dComplexSamples, dComplexSamples, 99 | CUFFT_FORWARD)); 100 | 101 | // Retrieve the results from multiple GPUs into host memory 102 | CHECK_CUFFT(cufftXtMemcpy(plan, complexSamples, dComplexSamples, 103 | CUFFT_COPY_DEVICE_TO_HOST)); 104 | 105 | printf("Fourier Coefficients:\n"); 106 | 107 | for (i = 0; i < 30; i++) 108 | { 109 | printf(" %d: (%2.4f, %2.4f)\n", i + 1, complexFreq[i].x, 110 | complexFreq[i].y); 111 | } 112 | 113 | free(gpus); 114 | free(samples); 115 | free(complexSamples); 116 | free(complexFreq); 117 | free(workSize); 118 | 119 | CHECK_CUFFT(cufftXtFree(dComplexSamples)); 120 | CHECK_CUFFT(cufftDestroy(plan)); 121 | 122 | return 0; 123 | } 124 | -------------------------------------------------------------------------------- /chapter08/cufft.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* 8 | * An example usage of the cuFFT library. This example performs a 1D forward 9 | * FFT. 
10 | */ 11 | 12 | int nprints = 30; 13 | 14 | /* 15 | * Create N fake samplings along the function cos(x). These samplings will be 16 | * stored as single-precision floating-point values. 17 | */ 18 | void generate_fake_samples(int N, float **out) 19 | { 20 | int i; 21 | float *result = (float *)malloc(sizeof(float) * N); 22 | double delta = M_PI / 20.0; 23 | 24 | for (i = 0; i < N; i++) 25 | { 26 | result[i] = cos(i * delta); 27 | } 28 | 29 | *out = result; 30 | } 31 | 32 | /* 33 | * Convert a real-valued vector r of length N to a complex-valued vector. 34 | */ 35 | void real_to_complex(float *r, cufftComplex **complx, int N) 36 | { 37 | int i; 38 | (*complx) = (cufftComplex *)malloc(sizeof(cufftComplex) * N); 39 | 40 | for (i = 0; i < N; i++) 41 | { 42 | (*complx)[i].x = r[i]; 43 | (*complx)[i].y = 0; 44 | } 45 | } 46 | 47 | int main(int argc, char **argv) 48 | { 49 | int i; 50 | int N = 2048; 51 | float *samples; 52 | cufftHandle plan = 0; 53 | cufftComplex *dComplexSamples, *complexSamples, *complexFreq; 54 | 55 | // Input Generation 56 | generate_fake_samples(N, &samples); 57 | real_to_complex(samples, &complexSamples, N); 58 | complexFreq = (cufftComplex *)malloc( 59 | sizeof(cufftComplex) * N); 60 | printf("Initial Samples:\n"); 61 | 62 | for (i = 0; i < nprints; i++) 63 | { 64 | printf(" %2.4f\n", samples[i]); 65 | } 66 | 67 | printf(" ...\n"); 68 | 69 | // Setup the cuFFT plan 70 | CHECK_CUFFT(cufftPlan1d(&plan, N, CUFFT_C2C, 1)); 71 | 72 | // Allocate device memory 73 | CHECK(cudaMalloc((void **)&dComplexSamples, 74 | sizeof(cufftComplex) * N)); 75 | 76 | // Transfer inputs into device memory 77 | CHECK(cudaMemcpy(dComplexSamples, complexSamples, 78 | sizeof(cufftComplex) * N, cudaMemcpyHostToDevice)); 79 | 80 | // Execute a complex-to-complex 1D FFT 81 | CHECK_CUFFT(cufftExecC2C(plan, dComplexSamples, dComplexSamples, 82 | CUFFT_FORWARD)); 83 | 84 | // Retrieve the results into host memory 85 | CHECK(cudaMemcpy(complexFreq, dComplexSamples, 86 |
sizeof(cufftComplex) * N, cudaMemcpyDeviceToHost)); 87 | 88 | printf("Fourier Coefficients:\n"); 89 | 90 | for (i = 0; i < nprints; i++) 91 | { 92 | printf(" %d: (%2.4f, %2.4f)\n", i + 1, complexFreq[i].x, 93 | complexFreq[i].y); 94 | } 95 | 96 | printf(" ...\n"); 97 | 98 | free(samples); 99 | free(complexSamples); 100 | free(complexFreq); 101 | 102 | CHECK(cudaFree(dComplexSamples)); 103 | CHECK_CUFFT(cufftDestroy(plan)); 104 | 105 | return 0; 106 | } 107 | -------------------------------------------------------------------------------- /chapter08/cusparse.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* 8 | * This is an example demonstrating usage of the cuSPARSE library to perform a 9 | * sparse matrix-vector multiplication on randomly generated data. 10 | */ 11 | 12 | /* 13 | * M = # of rows 14 | * N = # of columns 15 | */ 16 | int M = 1024; 17 | int N = 1024; 18 | 19 | /* 20 | * Generate a vector of length N with random single-precision floating-point 21 | * values between 0 and 100. 22 | */ 23 | void generate_random_vector(int N, float **outX) 24 | { 25 | int i; 26 | double rMax = (double)RAND_MAX; 27 | float *X = (float *)malloc(sizeof(float) * N); 28 | 29 | for (i = 0; i < N; i++) 30 | { 31 | int r = rand(); 32 | double dr = (double)r; 33 | X[i] = (dr / rMax) * 100.0; 34 | } 35 | 36 | *outX = X; 37 | } 38 | 39 | /* 40 | * Generate random dense matrix A in column-major order, while rounding some 41 | * elements down to zero to ensure it is sparse. 
42 | */ 43 | int generate_random_dense_matrix(int M, int N, float **outA) 44 | { 45 | int i, j; 46 | double rMax = (double)RAND_MAX; 47 | float *A = (float *)malloc(sizeof(float) * M * N); 48 | int totalNnz = 0; 49 | 50 | for (j = 0; j < N; j++) 51 | { 52 | for (i = 0; i < M; i++) 53 | { 54 | int r = rand(); 55 | float *curr = A + (j * M + i); 56 | 57 | if (r % 3 > 0) 58 | { 59 | *curr = 0.0f; 60 | } 61 | else 62 | { 63 | double dr = (double)r; 64 | *curr = (dr / rMax) * 100.0; 65 | } 66 | 67 | if (*curr != 0.0f) 68 | { 69 | totalNnz++; 70 | } 71 | } 72 | } 73 | 74 | *outA = A; 75 | return totalNnz; 76 | } 77 | 78 | int main(int argc, char **argv) 79 | { 80 | int row; 81 | float *A, *dA; 82 | int *dNnzPerRow; 83 | float *dCsrValA; 84 | int *dCsrRowPtrA; 85 | int *dCsrColIndA; 86 | int totalNnz; 87 | float alpha = 3.0f; 88 | float beta = 4.0f; 89 | float *dX, *X; 90 | float *dY, *Y; 91 | cusparseHandle_t handle = 0; 92 | cusparseMatDescr_t descr = 0; 93 | 94 | // Generate input 95 | srand(9384); 96 | int trueNnz = generate_random_dense_matrix(M, N, &A); 97 | generate_random_vector(N, &X); 98 | generate_random_vector(M, &Y); 99 | 100 | // Create the cuSPARSE handle 101 | CHECK_CUSPARSE(cusparseCreate(&handle)); 102 | 103 | // Allocate device memory for vectors and the dense form of the matrix A 104 | CHECK(cudaMalloc((void **)&dX, sizeof(float) * N)); 105 | CHECK(cudaMalloc((void **)&dY, sizeof(float) * M)); 106 | CHECK(cudaMalloc((void **)&dA, sizeof(float) * M * N)); 107 | CHECK(cudaMalloc((void **)&dNnzPerRow, sizeof(int) * M)); 108 | 109 | // Construct a descriptor of the matrix A 110 | CHECK_CUSPARSE(cusparseCreateMatDescr(&descr)); 111 | CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 112 | CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); 113 | 114 | // Transfer the input vectors and dense matrix A to the device 115 | CHECK(cudaMemcpy(dX, X, sizeof(float) * N, cudaMemcpyHostToDevice)); 116 | CHECK(cudaMemcpy(dY, 
Y, sizeof(float) * M, cudaMemcpyHostToDevice)); 117 | CHECK(cudaMemcpy(dA, A, sizeof(float) * M * N, cudaMemcpyHostToDevice)); 118 | 119 | // Compute the number of non-zero elements in A 120 | CHECK_CUSPARSE(cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA, 121 | M, dNnzPerRow, &totalNnz)); 122 | 123 | if (totalNnz != trueNnz) 124 | { 125 | fprintf(stderr, "Difference detected between cuSPARSE NNZ and true " 126 | "value: expected %d but got %d\n", trueNnz, totalNnz); 127 | return 1; 128 | } 129 | 130 | // Allocate device memory to store the sparse CSR representation of A 131 | CHECK(cudaMalloc((void **)&dCsrValA, sizeof(float) * totalNnz)); 132 | CHECK(cudaMalloc((void **)&dCsrRowPtrA, sizeof(int) * (M + 1))); 133 | CHECK(cudaMalloc((void **)&dCsrColIndA, sizeof(int) * totalNnz)); 134 | 135 | // Convert A from a dense formatting to a CSR formatting, using the GPU 136 | CHECK_CUSPARSE(cusparseSdense2csr(handle, M, N, descr, dA, M, dNnzPerRow, 137 | dCsrValA, dCsrRowPtrA, dCsrColIndA)); 138 | 139 | // Perform matrix-vector multiplication with the CSR-formatted matrix A 140 | CHECK_CUSPARSE(cusparseScsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 141 | M, N, totalNnz, &alpha, descr, dCsrValA, 142 | dCsrRowPtrA, dCsrColIndA, dX, &beta, dY)); 143 | 144 | // Copy the result vector back to the host 145 | CHECK(cudaMemcpy(Y, dY, sizeof(float) * M, cudaMemcpyDeviceToHost)); 146 | 147 | for (row = 0; row < 10; row++) 148 | { 149 | printf("%2.2f\n", Y[row]); 150 | } 151 | 152 | printf("...\n"); 153 | 154 | free(A); 155 | free(X); 156 | free(Y); 157 | 158 | CHECK(cudaFree(dX)); 159 | CHECK(cudaFree(dY)); 160 | CHECK(cudaFree(dA)); 161 | CHECK(cudaFree(dNnzPerRow)); 162 | CHECK(cudaFree(dCsrValA)); 163 | CHECK(cudaFree(dCsrRowPtrA)); 164 | CHECK(cudaFree(dCsrColIndA)); 165 | 166 | CHECK_CUSPARSE(cusparseDestroyMatDescr(descr)); 167 | CHECK_CUSPARSE(cusparseDestroy(handle)); 168 | 169 | 170 | return 0; 171 | } 172 | 
--------------------------------------------------------------------------------
/chapter08/drop-in.c:
--------------------------------------------------------------------------------
/*
 * NOTE(review): the header names below were lost when this listing was
 * extracted; they are reconstructed from the functions the file calls
 * (printf, malloc, rand) - confirm against the original source.
 */
#include <stdio.h>
#include <stdlib.h>

extern int sgemm_(char *transa, char *transb, int *m, int *n, int *k,
                  float *alpha, float *a, int *lda, float *b, int *ldb,
                  float *beta, float *c, int *ldc);

/*
 * A simple example of re-compiling legacy BLAS code to use the drop-in cuBLAS
 * library.
 */

/*
 * M = # of rows
 * N = # of columns
 */
int M = 1024;
int N = 1024;

/*
 * Generate a matrix with M rows and N columns in column-major order. The matrix
 * will be filled with random single-precision floating-point values between 0
 * and 100.
 *
 *   M, N - number of rows and columns
 *   outA - receives a malloc'ed array of M * N floats; the caller must free()
 *          it
 *
 * Terminates the program if the allocation fails.
 */
void generate_random_dense_matrix(int M, int N, float **outA)
{
    int i, j;
    double rMax = (double)RAND_MAX;
    float *A = (float *)malloc(sizeof(float) * M * N);

    if (A == NULL)
    {
        fprintf(stderr, "generate_random_dense_matrix: unable to allocate a "
                "%d x %d matrix\n", M, N);
        exit(EXIT_FAILURE);
    }

    /* For each column */
    for (j = 0; j < N; j++)
    {
        /* For each row */
        for (i = 0; i < M; i++)
        {
            double dr = (double)rand();
            A[j * M + i] = (float)((dr / rMax) * 100.0);
        }
    }

    *outA = A;
}

/*
 * Compute C = alpha * A * B + beta * C through the legacy Fortran-style
 * sgemm_ entry point and print the top-left corner of C.
 */
int main(int argc, char **argv)
{
    int i, j;
    float *A, *B, *C;
    float alpha = 3.0f;
    float beta = 4.0f;

    /* Generate inputs */
    srand(9384);
    generate_random_dense_matrix(M, N, &A);
    generate_random_dense_matrix(N, M, &B);
    /*
     * The call below passes m = M and n = M, so sgemm_ treats C as an M x M
     * matrix with leading dimension M. Allocate it with exactly that shape
     * rather than M x N, which would be too small whenever N < M.
     */
    generate_random_dense_matrix(M, M, &C);

    sgemm_("N", "N", &M, &M, &N, &alpha, A, &M, B, &N, &beta, C, &M);

    /* Print at most a 10 x 10 corner; C is M x M, column-major. */
    for (i = 0; i < 10 && i < M; i++)
    {
        for (j = 0; j < 10 && j < M; j++)
        {
            printf("%2.2f ", C[j * M + i]);
        }

        printf("...\n");
    }

    printf("...\n");

    free(A);
    free(B);
    free(C);

    return 0;
}
--------------------------------------------------------------------------------
/chapter08/rand-kernel.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the header names below were lost when this listing was
// extracted; they are reconstructed from the APIs the file uses - confirm
// against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand_kernel.h>

/*
 * This example demonstrates two techniques for using the cuRAND host and device
 * API to generate random numbers for CUDA kernels to consume.
 */

int threads_per_block = 256;
int blocks_per_grid = 30;

/*
 * host_api_kernel consumes pre-generated random values from the cuRAND host API
 * to perform some dummy computation.
 *
 * Launch layout: 1D grid of 1D blocks of any size; a grid-stride loop covers
 * all N elements. randomValues and out must each hold N floats in device
 * memory.
 */
__global__ void host_api_kernel(float *randomValues, float *out, int N)
{
    int i;
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int nthreads = gridDim.x * blockDim.x;

    for (i = tid; i < N; i += nthreads)
    {
        out[i] = randomValues[i] * 2.0f;
    }
}

/*
 * device_api_kernel uses the cuRAND device API to generate random numbers
 * on-the-fly on the GPU, and then performs some dummy computation using them.
 *
 * Launch layout: 1D grid of 1D blocks; a grid-stride loop covers all N
 * elements. states must hold one curandState per launched thread
 * (gridDim.x * blockDim.x entries) and out must hold N floats.
 */
__global__ void device_api_kernel(curandState *states, float *out, int N)
{
    int i;
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int nthreads = gridDim.x * blockDim.x;
    curandState *state = states + tid;

    // Same seed for every thread; the thread id selects the subsequence.
    curand_init(9384, tid, 0, state);

    for (i = tid; i < N; i += nthreads)
    {
        out[i] = curand_uniform(state) * 2.0f;
    }
}

/*
 * use_host_api is an example usage of the cuRAND host API to generate random
 * values to be consumed on the device.
 *
 *   N - number of random values to generate and process
 */
void use_host_api(int N)
{
    int i;
    curandGenerator_t randGen;
    float *dRand, *dOut, *hOut;

    // Create cuRAND generator (i.e. handle)
    CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_DEFAULT));

    // Allocate device memory to store the random values and output
    CHECK(cudaMalloc((void **)&dRand, sizeof(float) * N));
    CHECK(cudaMalloc((void **)&dOut, sizeof(float) * N));
    hOut = (float *)malloc(sizeof(float) * N);

    if (hOut == NULL)
    {
        fprintf(stderr, "use_host_api: unable to allocate %d floats\n", N);
        exit(EXIT_FAILURE);
    }

    // Generate N random values from a uniform distribution
    CHECK_CURAND(curandGenerateUniform(randGen, dRand, N));

    // Consume the values generated by curandGenerateUniform
    host_api_kernel<<<blocks_per_grid, threads_per_block>>>(dRand, dOut, N);
    // A launch does not return a status; pick up configuration errors here.
    CHECK(cudaGetLastError());

    // Retrieve outputs
    CHECK(cudaMemcpy(hOut, dOut, sizeof(float) * N, cudaMemcpyDeviceToHost));

    printf("Sampling of output from host API:\n");

    for (i = 0; i < 10 && i < N; i++)
    {
        printf("%2.4f\n", hOut[i]);
    }

    printf("...\n");

    free(hOut);
    CHECK(cudaFree(dRand));
    CHECK(cudaFree(dOut));
    CHECK_CURAND(curandDestroyGenerator(randGen));
}

/*
 * use_device_api is an example usage of the cuRAND device API to use the GPU
 * to generate random values on the fly from inside a CUDA kernel.
 *
 *   N - number of random values to generate and process
 */
void use_device_api(int N)
{
    int i;
    curandState *states = NULL;
    float *dOut, *hOut;

    /*
     * Allocate device memory to store the output and cuRAND device state
     * objects (which are analogous to handles, but on the GPU).
     */
    CHECK(cudaMalloc((void **)&dOut, sizeof(float) * N));
    CHECK(cudaMalloc((void **)&states, sizeof(curandState) *
                     threads_per_block * blocks_per_grid));
    hOut = (float *)malloc(sizeof(float) * N);

    if (hOut == NULL)
    {
        fprintf(stderr, "use_device_api: unable to allocate %d floats\n", N);
        exit(EXIT_FAILURE);
    }

    // Execute a kernel that generates and consumes its own random numbers.
    // The launch size must match the number of states allocated above.
    device_api_kernel<<<blocks_per_grid, threads_per_block>>>(states, dOut, N);
    CHECK(cudaGetLastError());

    // Retrieve the results
    CHECK(cudaMemcpy(hOut, dOut, sizeof(float) * N, cudaMemcpyDeviceToHost));

    printf("Sampling of output from device API:\n");

    for (i = 0; i < 10 && i < N; i++)
    {
        printf("%2.4f\n", hOut[i]);
    }

    printf("...\n");

    free(hOut);
    CHECK(cudaFree(dOut));
    CHECK(cudaFree(states));
}

int main(int argc, char **argv)
{
    int N = 8388608;

    use_host_api(N);
    use_device_api(N);

    return 0;
}
--------------------------------------------------------------------------------
/chapter08/replace-rand-streams.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the header names below were lost when this listing was
// extracted; they are reconstructed from the APIs the file uses - confirm
// against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand_kernel.h>

/*
 * This example is a clone of replace-rand.cu that uses CUDA streams to overlap
 * the generation of random numbers using cuRAND with any host computation.
 */

/*
 * initialize_state initializes cuRAND device state
 *
 * Launch layout: 1D grid of 1D blocks; states must hold one curandState per
 * launched thread (gridDim.x * blockDim.x entries).
 */
__global__ void initialize_state(curandState *states)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(9384, tid, 0, states + tid);
}

/*
 * refill_randoms uses the cuRAND device API to generate N random values using
 * the states passed to the kernel.
24 | */ 25 | __global__ void refill_randoms(float *dRand, int N, curandState *states) 26 | { 27 | int i; 28 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 29 | int nthreads = gridDim.x * blockDim.x; 30 | curandState *state = states + tid; 31 | 32 | for (i = tid; i < N; i += nthreads) 33 | { 34 | dRand[i] = curand_uniform(state); 35 | } 36 | } 37 | 38 | /* 39 | * An implementation of rand() that uses the cuRAND device API. 40 | */ 41 | float cuda_device_rand() 42 | { 43 | static cudaStream_t stream = 0; 44 | static curandState *states = NULL; 45 | static float *dRand = NULL; 46 | static float *hRand = NULL; 47 | static int dRand_length = 1000000; 48 | static int dRand_used = dRand_length; 49 | 50 | int threads_per_block = 256; 51 | int blocks_per_grid = 30; 52 | 53 | if (dRand == NULL) 54 | { 55 | /* 56 | * If the cuRAND state hasn't been initialized yet, create a CUDA stream 57 | * to execute operations in, pre-allocate device memory to store the 58 | * generated random values in, and asynchronously launch a 59 | * refill_randoms kernel to begin generating random numbers. 60 | */ 61 | CHECK(cudaStreamCreate(&stream)); 62 | CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length)); 63 | CHECK(cudaMalloc((void **)&states, sizeof(curandState) * 64 | threads_per_block * blocks_per_grid)); 65 | hRand = (float *)malloc(sizeof(float) * dRand_length); 66 | initialize_state<<>>( 67 | states); 68 | refill_randoms<<>>(dRand, 69 | dRand_length, states); 70 | } 71 | 72 | if (dRand_used == dRand_length) 73 | { 74 | /* 75 | * If all pre-generated random numbers have been consumed, wait for the 76 | * last launch of refill_randoms to complete, transfer those newly 77 | * generated random numbers back, and launch another batch random number 78 | * generation kernel asynchronously. 
79 | */ 80 | CHECK(cudaStreamSynchronize(stream)); 81 | CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length, 82 | cudaMemcpyDeviceToHost)); 83 | refill_randoms<<>>(dRand, 84 | dRand_length, states); 85 | dRand_used = 0; 86 | } 87 | 88 | // Return the next pre-generated random number 89 | return hRand[dRand_used++]; 90 | } 91 | 92 | /* 93 | * An implementation of rand() that uses the cuRAND host API. 94 | */ 95 | float cuda_host_rand() 96 | { 97 | static cudaStream_t stream = 0; 98 | static float *dRand = NULL; 99 | static float *hRand = NULL; 100 | curandGenerator_t randGen; 101 | static int dRand_length = 1000000; 102 | static int dRand_used = 1000000; 103 | 104 | if (dRand == NULL) 105 | { 106 | /* 107 | * If the cuRAND state hasn't been initialized yet, construct a cuRAND 108 | * generator and configure it to use a CUDA stream. Pre-allocate device 109 | * memory to store the output random numbers and asynchronously launch 110 | * curandGenerateUniform. Because curandGenerateUniform uses the randGen 111 | * handle, it will execute in the set stream. 112 | */ 113 | CHECK_CURAND(curandCreateGenerator(&randGen, 114 | CURAND_RNG_PSEUDO_DEFAULT)); 115 | CHECK(cudaStreamCreate(&stream)); 116 | CHECK_CURAND(curandSetStream(randGen, stream)); 117 | 118 | CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length)); 119 | hRand = (float *)malloc(sizeof(float) * dRand_length); 120 | CHECK_CURAND(curandGenerateUniform(randGen, dRand, dRand_length)); 121 | } 122 | 123 | if (dRand_used == dRand_length) 124 | { 125 | /* 126 | * If all pre-generated random numbers have been consumed, wait for the 127 | * last asynchronous curandGenerateUniform to complex, transfer the new 128 | * batch of random numbers back to the host, and relaunch 129 | * curandGenerateUniform. 
130 | */ 131 | CHECK(cudaStreamSynchronize(stream)); 132 | CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length, 133 | cudaMemcpyDeviceToHost)); 134 | CHECK_CURAND(curandGenerateUniform(randGen, dRand, dRand_length)); 135 | dRand_used = 0; 136 | } 137 | 138 | // Return the next pre-generated random number 139 | return hRand[dRand_used++]; 140 | } 141 | 142 | float host_rand() 143 | { 144 | return (float)rand() / (float)RAND_MAX; 145 | } 146 | 147 | int main(int argc, char **argv) 148 | { 149 | int i; 150 | int N = 8388608; 151 | 152 | for (i = 0; i < N; i++) 153 | { 154 | float h = host_rand(); 155 | float d = cuda_host_rand(); 156 | float dd = cuda_device_rand(); 157 | printf("%2.4f %2.4f %2.4f\n", h, d, dd); 158 | getchar(); 159 | } 160 | 161 | return 0; 162 | } 163 | -------------------------------------------------------------------------------- /chapter08/replace-rand.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* 8 | * This example uses the cuRAND host and device API to replace the system rand() 9 | * call by pre-generating large chunks of random numbers before fetching one at 10 | * at time. If there are no unused random numbers left, a new batch is generated 11 | * synchronously. 12 | */ 13 | 14 | /* 15 | * initialize_state initializes cuRAND device state 16 | */ 17 | __global__ void initialize_state(curandState *states) 18 | { 19 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 20 | curand_init(9384, tid, 0, states + tid); 21 | } 22 | 23 | /* 24 | * refill_randoms uses the cuRAND device API to generate N random values using 25 | * the states passed to the kernel. 
26 | */ 27 | __global__ void refill_randoms(float *dRand, int N, curandState *states) 28 | { 29 | int i; 30 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 31 | int nthreads = gridDim.x * blockDim.x; 32 | curandState *state = states + tid; 33 | 34 | for (i = tid; i < N; i += nthreads) 35 | { 36 | dRand[i] = curand_uniform(state); 37 | } 38 | } 39 | 40 | /* 41 | * An implementation of rand() that uses the cuRAND device API. 42 | */ 43 | float cuda_device_rand() 44 | { 45 | static curandState *states = NULL; 46 | static float *dRand = NULL; 47 | static float *hRand = NULL; 48 | static int dRand_length = 1000000; 49 | static int dRand_used = 1000000; 50 | 51 | int threads_per_block = 256; 52 | int blocks_per_grid = 30; 53 | 54 | if (dRand == NULL) 55 | { 56 | /* 57 | * If the cuRAND state hasn't been initialized yet, pre-allocate memory 58 | * to store the generated random values in as well as the cuRAND device 59 | * state objects. 60 | */ 61 | CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length)); 62 | CHECK(cudaMalloc((void **)&states, sizeof(curandState) * 63 | threads_per_block * blocks_per_grid)); 64 | hRand = (float *)malloc(sizeof(float) * dRand_length); 65 | // Initialize states on the device 66 | initialize_state<<>>(states); 67 | } 68 | 69 | if (dRand_used == dRand_length) 70 | { 71 | /* 72 | * If all pre-generated random numbers have been consumed, regenerate a 73 | * new batch. 74 | */ 75 | refill_randoms<<>>(dRand, 76 | dRand_length, states); 77 | CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length, 78 | cudaMemcpyDeviceToHost)); 79 | dRand_used = 0; 80 | } 81 | 82 | // Return the next pre-generated random number 83 | return hRand[dRand_used++]; 84 | } 85 | 86 | /* 87 | * An implementation of rand() that uses the cuRAND host API. 
88 | */ 89 | float cuda_host_rand() 90 | { 91 | static float *dRand = NULL; 92 | static float *hRand = NULL; 93 | curandGenerator_t randGen; 94 | static int dRand_length = 1000000; 95 | static int dRand_used = 1000000; 96 | 97 | if (dRand == NULL) 98 | { 99 | /* 100 | * If the cuRAND state hasn't been initialized yet, construct a cuRAND 101 | * host generator and pre-allocate memory to store the generated random 102 | * values in. 103 | */ 104 | CHECK_CURAND(curandCreateGenerator(&randGen, 105 | CURAND_RNG_PSEUDO_DEFAULT)); 106 | CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length)); 107 | hRand = (float *)malloc(sizeof(float) * dRand_length); 108 | } 109 | 110 | if (dRand_used == dRand_length) 111 | { 112 | /* 113 | * If all pre-generated random numbers have been consumed, regenerate a 114 | * new batch using curandGenerateUniform. 115 | */ 116 | CHECK_CURAND(curandGenerateUniform(randGen, dRand, dRand_length)); 117 | CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length, 118 | cudaMemcpyDeviceToHost)); 119 | dRand_used = 0; 120 | } 121 | 122 | // Return the next pre-generated random number 123 | return hRand[dRand_used++]; 124 | } 125 | 126 | /* 127 | * A reference implementation that uses system rand(). 128 | */ 129 | float host_rand() 130 | { 131 | return (float)rand() / (float)RAND_MAX; 132 | } 133 | 134 | int main(int argc, char **argv) 135 | { 136 | int i; 137 | int N = 8388608; 138 | 139 | /* 140 | * Allocate N random numbers from each of the random number generation 141 | * functions implemented. 
142 | */ 143 | for (i = 0; i < N; i++) 144 | { 145 | float h = host_rand(); 146 | float d = cuda_host_rand(); 147 | float dd = cuda_device_rand(); 148 | printf("%2.4f %2.4f %2.4f\n", h, d, dd); 149 | getchar(); 150 | } 151 | 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /chapter08/simple-data.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* 5 | * This example offers a brief introduction to the data directive. The 6 | * data directive allows the programmer to explicitly mark variables to be 7 | * transferred to or from the accelerator. This serves as a performance 8 | * optimization by eliminating redundant or unnecessary memcpys. 9 | */ 10 | 11 | #define N 1024 12 | 13 | int main(int argc, char **argv) 14 | { 15 | int i; 16 | int *A = (int *)malloc(N * sizeof(int)); 17 | int *B = (int *)malloc(N * sizeof(int)); 18 | int *C = (int *)malloc(N * sizeof(int)); 19 | int *D = (int *)malloc(N * sizeof(int)); 20 | 21 | // Initialize A and B 22 | for (i = 0; i < N; i++) 23 | { 24 | A[i] = i; 25 | B[i] = 2 * i; 26 | } 27 | 28 | /* 29 | * Transfer the full contents of A and B to the accelerator, and transfer 30 | * the full contents of C and D back. 
31 | */ 32 | #pragma acc data copyin(A[0:N], B[0:N]) copyout(C[0:N], D[0:N]) 33 | { 34 | #pragma acc parallel 35 | { 36 | #pragma acc loop 37 | 38 | for (i = 0; i < N; i++) 39 | { 40 | C[i] = A[i] + B[i]; 41 | } 42 | 43 | #pragma acc loop 44 | 45 | for (i = 0; i < N; i++) 46 | { 47 | D[i] = C[i] * A[i]; 48 | } 49 | } 50 | } 51 | 52 | // Display part of the results 53 | for (i = 0; i < 10; i++) 54 | { 55 | printf("%d ", D[i]); 56 | } 57 | 58 | printf("...\n"); 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /chapter08/simple-kernels.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* 5 | * This example offers a brief introduction to the kernels directive. The 6 | * kernels directive attempts to break the code block that follows into 7 | * accelerator kernels, generally by searching for parallelizable loops. It then 8 | * launches each kernel on the acclerator using an automatically configured 9 | * thread configuration. 10 | */ 11 | 12 | #define N 1024 13 | 14 | int main(int argc, char **argv) 15 | { 16 | int i; 17 | /* 18 | * restrict indicates to the compiler that the memory pointed to by A, B, C, 19 | * and D will only be accessed through those respective pointers or by 20 | * offsets from those pointers. This restriction makes it possible to 21 | * analyze the loops below for parallelization. 
22 | */ 23 | int *restrict A = (int *)malloc(N * sizeof(int)); 24 | int *restrict B = (int *)malloc(N * sizeof(int)); 25 | int *restrict C = (int *)malloc(N * sizeof(int)); 26 | int *restrict D = (int *)malloc(N * sizeof(int)); 27 | 28 | // Initialize A and B 29 | for (i = 0; i < N; i++) 30 | { 31 | A[i] = i; 32 | B[i] = 2 * i; 33 | } 34 | 35 | // Execute the following block of code on an accelerator 36 | #pragma acc kernels 37 | { 38 | for (i = 0; i < N; i++) 39 | { 40 | C[i] = A[i] + B[i]; 41 | } 42 | 43 | for (i = 0; i < N; i++) 44 | { 45 | D[i] = C[i] * A[i]; 46 | } 47 | } 48 | 49 | // Display part of the results 50 | for (i = 0; i < 10; i++) 51 | { 52 | printf("%d ", D[i]); 53 | } 54 | 55 | printf("...\n"); 56 | 57 | return 0; 58 | } 59 | -------------------------------------------------------------------------------- /chapter08/simple-parallel.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | /* 5 | * This example offers a brief introduction to the parallel directive. The 6 | * parallel directive executes a fixed number of threads throughout the code 7 | * block that follows it. The programmer is responsible for using that 8 | * parallelism. 9 | */ 10 | 11 | #define N 1024 12 | 13 | int main(int argc, char **argv) 14 | { 15 | int i; 16 | /* 17 | * Note that this example does not require the restrict keyword that 18 | * simple-kernels.cu did. Because the parallel directive relies on the 19 | * programmer to mark parallelism, the compiler does not need to be careful 20 | * about multiple pointers referencing the same memory locations. 
21 | */ 22 | int *A = (int *)malloc(N * sizeof(int)); 23 | int *B = (int *)malloc(N * sizeof(int)); 24 | int *C = (int *)malloc(N * sizeof(int)); 25 | int *D = (int *)malloc(N * sizeof(int)); 26 | 27 | // Initialize A and B 28 | for (i = 0; i < N; i++) 29 | { 30 | A[i] = i; 31 | B[i] = 2 * i; 32 | } 33 | 34 | /* 35 | * Execute the following block of code on an accelerator, parallelizing the 36 | * two loops marked. 37 | */ 38 | #pragma acc parallel 39 | { 40 | #pragma acc loop 41 | 42 | for (i = 0; i < N; i++) 43 | { 44 | C[i] = A[i] + B[i]; 45 | } 46 | 47 | #pragma acc loop 48 | 49 | for (i = 0; i < N; i++) 50 | { 51 | D[i] = C[i] * A[i]; 52 | } 53 | } 54 | 55 | // Display part of the results 56 | for (i = 0; i < 10; i++) 57 | { 58 | printf("%d ", D[i]); 59 | } 60 | 61 | printf("...\n"); 62 | 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /chapter09/Makefile: -------------------------------------------------------------------------------- 1 | CU_APPS=simple2DFD simpleMultiGPU simpleP2P_PingPong 2 | C_APPS=simpleC2C simpleP2P simpleP2P_CUDA_Aware 3 | 4 | all: ${C_APPS} ${CU_APPS} 5 | 6 | simpleC2C: simpleC2C.c 7 | gcc -O2 -std=c99 -I${MPI_HOME}/include -L${MPI_HOME}/lib -lmpi -o simpleC2C simpleC2C.c 8 | simpleP2P: simpleP2P.c 9 | gcc -O2 -std=c99 -I${MPI_HOME}/include -I${CUDA_HOME}/include -L${MPI_HOME}/lib -L${CUDA_HOME}/lib64 -lcudart -lmpi -o simpleP2P simpleP2P.c 10 | simpleP2P_CUDA_Aware: simpleP2P_CUDA_Aware.c 11 | gcc -O2 -std=c99 -I${MPI_HOME}/include -I${CUDA_HOME}/include -L${MPI_HOME}/lib -L${CUDA_HOME}/lib64 -lcudart -lmpi -o simpleP2P_CUDA_Aware simpleP2P_CUDA_Aware.c 12 | %: %.cu 13 | nvcc -O2 -arch=sm_20 -I${MPI_HOME}/include -o $@ $< 14 | %: %.c 15 | gcc -O2 -std=c99 -I${MPI_HOME}/include -o $@ $< 16 | clean: 17 | rm -f ${CU_APPS} ${C_APPS} 18 | -------------------------------------------------------------------------------- /chapter09/simpleC2C.c: 
--------------------------------------------------------------------------------
/*
 * NOTE(review): the header names below were lost when this listing was
 * extracted. The original had five #include lines; the four that the code
 * demonstrably needs are reconstructed here - confirm against the original
 * source.
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <mpi.h>

/*
 * A simple example of using non-blocking communication between multiple MPI
 * processes to send and receive a char*. The sends and receives are done
 * repeatedly and timing results allows inter-process bandwidth to be
 * calculated.
 */

#define MESSAGE_ALIGNMENT 64
#define MAX_MSG_SIZE (1<<22)
#define MYBUFSIZE MAX_MSG_SIZE
#define LOOP_LARGE 100

/*
 * Fill the first size bytes of the send buffer with 'a' and of the receive
 * buffer with 'b'.
 */
void initalData (void * sbuf, void * rbuf, size_t size)
{
    memset(sbuf, 'a', size);
    memset(rbuf, 'b', size);
}

/*
 * Exchange messages of increasing size (1 KB up to MAX_MSG_SIZE, x4 per step)
 * between exactly two MPI ranks and have rank 0 report the per-message time
 * and the resulting transfer rate.
 */
int main (int argc, char *argv[])
{
    int rank, nprocs, ilen;
    char processor[MPI_MAX_PROCESSOR_NAME];
    double tstart = 0.0, tend = 0.0;

    MPI_Status reqstat;
    MPI_Request send_request;
    MPI_Request recv_request;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor, &ilen);

    if(nprocs != 2)
    {
        if(rank == 0) printf("This test requires exactly two processes\n");

        MPI_Finalize();
        exit(EXIT_FAILURE);
    }

    char *s_buf, *r_buf;
    s_buf = (char *)malloc(MYBUFSIZE);
    r_buf = (char *)malloc(MYBUFSIZE);

    if(s_buf == NULL || r_buf == NULL)
    {
        fprintf(stderr, "node=%d: unable to allocate message buffers\n", rank);
        // Abort every rank so the peer is not left waiting in a receive.
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }

    int other_proc = (rank == 1 ? 0 : 1);

    if(rank == 0 )
    {
        printf("%s allocates %d MB dynamic memory aligned to 64 byte\n",
               argv[0], MAX_MSG_SIZE / 1024 / 1024);
    }

    printf("node=%d(%s): my other _proc = %d\n", rank, processor, other_proc);

    int loop = LOOP_LARGE;

    // latency test
    for(int size = 1024; size <= MAX_MSG_SIZE; size = size * 4)
    {
        initalData(s_buf, r_buf, size);

        MPI_Barrier(MPI_COMM_WORLD);

        if(rank == 0)
        {
            tstart = MPI_Wtime();

            // Rank 0 receives with tag 10 and sends with tag 100.
            for(int i = 0; i < loop; i++)
            {
                MPI_Irecv(r_buf, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
                          &recv_request);
                MPI_Isend(s_buf, size, MPI_CHAR, other_proc, 100,
                          MPI_COMM_WORLD, &send_request);
                MPI_Waitall(1, &send_request, &reqstat);
                MPI_Waitall(1, &recv_request, &reqstat);
            }

            tend = MPI_Wtime();
        }
        else
        {
            // The peer mirrors the tags: receives 100, sends 10.
            for(int i = 0; i < loop; i++)
            {
                MPI_Irecv(r_buf, size, MPI_CHAR, other_proc, 100,
                          MPI_COMM_WORLD, &recv_request);
                MPI_Isend(s_buf, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
                          &send_request);
                MPI_Waitall(1, &send_request, &reqstat);
                MPI_Waitall(1, &recv_request, &reqstat);
            }
        }

        MPI_Barrier(MPI_COMM_WORLD);

        if(rank == 0)
        {
            // Microseconds per message; each iteration moves two messages.
            double latency = (tend - tstart) * 1e6 / (2.0 * loop);
            // Bytes per microsecond, reported as MB/sec.
            float performance = (float) size / (float) latency;
            printf("%6d %s %10.2f μs %10.2f MB/sec\n",
                   (size >= 1024 * 1024) ? size / 1024 / 1024 : size / 1024,
                   (size >= 1024 * 1024) ? "MB" : "KB", latency, performance);
            fflush(stdout);
        }
    }

    free(s_buf);
    free(r_buf);

    MPI_Finalize();

    return EXIT_SUCCESS;
}
--------------------------------------------------------------------------------
/chapter09/simpleMultiGPU.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
// NOTE(review): the header names below were lost when this listing was
// extracted; they are reconstructed from the APIs the file uses (math.h is
// needed for fabs in checkResult) - confirm against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

/*
 * A simple example of a multi-GPU CUDA application implementing a vector sum.
 * Note that all communication and computation is done asynchronously in order
 * to overlap computation across multiple devices, and that this requires
 * allocating page-locked host memory associated with a specific device.
 */

/*
 * Element-wise vector sum C = A + B.
 *
 * Launch layout: 1D grid of 1D blocks with at least N threads in total; each
 * thread handles one element and threads past N do nothing.
 */
__global__ void iKernel(float *A, float *B, float *C, const int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N) C[i] = A[i] + B[i];
}

/*
 * Compare the host reference with the GPU result and report the first element
 * that differs by more than epsilon. Prints nothing when the arrays match.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: an integer abs would truncate sub-unit differences
        // to zero and hide mismatches.
        if (fabs((double)hostRef[i] - (double)gpuRef[i]) > epsilon)
        {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }
}

/*
 * Fill ip[0..size) with pseudo-random values in [0, 1].
 */
void initialData(float * const ip, int const size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)rand() / (float)RAND_MAX;
    }
}

/*
 * Host reference for the vector sum C = A + B over N elements.
 */
void sumOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++)
    {
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Usage: simpleMultiGPU [ngpus [ishift]]
 *
 *   ngpus  - number of devices to use (default: all devices found)
 *   ishift - total element count is 1 << ishift (default: 24)
 *
 * The array is split evenly across the devices; each device copies its slice
 * in, adds it and copies the result back in its own stream.
 */
int main(int argc, char **argv)
{
    int ngpus;

    printf("> starting %s", argv[0]);

    CHECK(cudaGetDeviceCount(&ngpus));
    printf(" CUDA-capable devices: %i\n", ngpus);

    int ishift = 24;

    if (argc > 2) ishift = atoi(argv[2]);

    // 1 << ishift must stay inside a positive int.
    if (ishift < 0 || ishift > 30)
    {
        fprintf(stderr, "Invalid array size shift %d: expected a value between "
                "0 and 30\n", ishift);
        exit(1);
    }

    int size = 1 << ishift;

    if (argc > 1)
    {
        if (atoi(argv[1]) > ngpus)
        {
            fprintf(stderr, "Invalid number of GPUs specified: %d is greater "
                    "than the total number of GPUs in this platform (%d)\n",
                    atoi(argv[1]), ngpus);
            exit(1);
        }

        ngpus = atoi(argv[1]);
    }

    // ngpus is used as a divisor and as an allocation count below.
    if (ngpus < 1)
    {
        fprintf(stderr, "Invalid number of GPUs: %d (at least one is "
                "required)\n", ngpus);
        exit(1);
    }

    int iSize = size / ngpus;
    size_t iBytes = iSize * sizeof(float);

    printf("> total array size %d M, using %d devices with each device "
           "handling %d M\n", size / 1024 / 1024, ngpus, iSize / 1024 / 1024);

    // allocate the per-device pointer tables
    float **d_A = (float **)malloc(sizeof(float *) * ngpus);
    float **d_B = (float **)malloc(sizeof(float *) * ngpus);
    float **d_C = (float **)malloc(sizeof(float *) * ngpus);

    float **h_A = (float **)malloc(sizeof(float *) * ngpus);
    float **h_B = (float **)malloc(sizeof(float *) * ngpus);
    float **hostRef = (float **)malloc(sizeof(float *) * ngpus);
    float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
    cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * ngpus);

    if (d_A == NULL || d_B == NULL || d_C == NULL || h_A == NULL ||
        h_B == NULL || hostRef == NULL || gpuRef == NULL || stream == NULL)
    {
        fprintf(stderr, "unable to allocate per-device pointer tables\n");
        exit(1);
    }

    for (int i = 0; i < ngpus; i++)
    {
        // set current device
        CHECK(cudaSetDevice(i));

        // allocate device memory
        CHECK(cudaMalloc((void **) &d_A[i], iBytes));
        CHECK(cudaMalloc((void **) &d_B[i], iBytes));
        CHECK(cudaMalloc((void **) &d_C[i], iBytes));

        // allocate page locked host memory for asynchronous data transfer
        CHECK(cudaMallocHost((void **) &h_A[i], iBytes));
        CHECK(cudaMallocHost((void **) &h_B[i], iBytes));
        CHECK(cudaMallocHost((void **) &hostRef[i], iBytes));
        CHECK(cudaMallocHost((void **) &gpuRef[i], iBytes));

        // create streams for timing and synchronizing
        CHECK(cudaStreamCreate(&stream[i]));
    }

    dim3 block (512);
    dim3 grid ((iSize + block.x - 1) / block.x);

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(cudaSetDevice(i));
        initialData(h_A[i], iSize);
        initialData(h_B[i], iSize);
    }

    // record start time
    double iStart = seconds();

    // distributing the workload across multiple devices
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(cudaSetDevice(i));

        CHECK(cudaMemcpyAsync(d_A[i], h_A[i], iBytes, cudaMemcpyHostToDevice,
                              stream[i]));
        CHECK(cudaMemcpyAsync(d_B[i], h_B[i], iBytes, cudaMemcpyHostToDevice,
                              stream[i]));

        // Same stream as the copies, so the kernel runs after both inputs
        // have arrived and before the result is copied back.
        iKernel<<<grid, block, 0, stream[i]>>>(d_A[i], d_B[i], d_C[i], iSize);
        CHECK(cudaGetLastError());

        CHECK(cudaMemcpyAsync(gpuRef[i], d_C[i], iBytes, cudaMemcpyDeviceToHost,
                              stream[i]));
    }

    // synchronize streams
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(cudaSetDevice(i));
        CHECK(cudaStreamSynchronize(stream[i]));
    }

    // calculate the elapsed time in seconds
    double iElaps = seconds() - iStart;
    printf("%d GPU timer elapsed: %8.2fms \n", ngpus, iElaps * 1000.0);

    // check results
    for (int i = 0; i < ngpus; i++)
    {
        //Set device
        CHECK(cudaSetDevice(i));
        sumOnHost(h_A[i], h_B[i], hostRef[i], iSize);
        checkResult(hostRef[i], gpuRef[i], iSize);
    }

    // Cleanup and shutdown
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(cudaSetDevice(i));
        CHECK(cudaFree(d_A[i]));
        CHECK(cudaFree(d_B[i]));
        CHECK(cudaFree(d_C[i]));

        CHECK(cudaFreeHost(h_A[i]));
        CHECK(cudaFreeHost(h_B[i]));
        CHECK(cudaFreeHost(hostRef[i]));

        CHECK(cudaFreeHost(gpuRef[i]));
        CHECK(cudaStreamDestroy(stream[i]));

        CHECK(cudaDeviceReset());
    }

    free(d_A);
    free(d_B);
    free(d_C);
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    free(stream);

    return EXIT_SUCCESS;
}
-------------------------------------------------------------------------------- /chapter09/simpleP2P.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* 10 | * A simple example of using the MPI and CUDA communication APIs to manually 11 | * transfer data from a GPU managed in one MPI process to a GPU managed in 12 | * another. The general steps performed are GPU0 -> cudaMemcpy -> rank0 -> 13 | * MPI_Isend -> rank1 -> cudaMemcpy -> GPU1. 14 | */ 15 | 16 | #define CHECK(call) \ 17 | { \ 18 | const cudaError_t error = call; \ 19 | if (error != cudaSuccess) \ 20 | { \ 21 | fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \ 22 | fprintf(stderr, "code: %d, reason: %s\n", error, \ 23 | cudaGetErrorString(error)); \ 24 | } \ 25 | } 26 | 27 | #define MESSAGE_ALIGNMENT 64 28 | #define MAX_MSG_SIZE (1<<22) 29 | #define MYBUFSIZE MAX_MSG_SIZE 30 | 31 | #define LOOP_LARGE 100 32 | #define SKIP_LARGE 10 33 | #define LARGE_MESSAGE_SIZE 8192 34 | 35 | int loop = LOOP_LARGE; 36 | 37 | void initalData (void * sbuf, void * rbuf, size_t size) 38 | { 39 | memset(sbuf, 'a', size); 40 | memset(rbuf, 'b', size); 41 | } 42 | 43 | int main (int argc, char *argv[]) 44 | { 45 | int rank, nprocs, ilen; 46 | char processor[MPI_MAX_PROCESSOR_NAME]; 47 | double tstart = 0.0, tend = 0.0; 48 | 49 | MPI_Status reqstat; 50 | MPI_Request send_request; 51 | MPI_Request recv_request; 52 | 53 | MPI_Init(&argc, &argv); 54 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs); 55 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 56 | MPI_Get_processor_name(processor, &ilen); 57 | 58 | if (nprocs != 2) 59 | { 60 | if(rank == 0) printf("This test requires exactly two processes\n"); 61 | 62 | MPI_Finalize(); 63 | exit(EXIT_FAILURE); 64 | } 65 | 66 | int other_proc = (rank == 1 ? 0 : 1); 67 | 68 | // Hard code GPU affinity since this example only works with 2 GPUs. 69 | int igpu = (rank == 1 ? 
0 : 1); 70 | 71 | if(rank == 0 ) 72 | printf("%s allocates %d MB pinned memory with regual mpi and " 73 | "bidirectional bandwidth\n", argv[0], 74 | MAX_MSG_SIZE / 1024 / 1024); 75 | 76 | printf("node=%d(%s): my other _proc = %d and using GPU=%d\n", rank, 77 | processor, other_proc, igpu); 78 | 79 | char *h_src, *h_rcv; 80 | CHECK(cudaSetDevice(igpu)); 81 | CHECK(cudaMallocHost((void**)&h_src, MYBUFSIZE)); 82 | CHECK(cudaMallocHost((void**)&h_rcv, MYBUFSIZE)); 83 | 84 | char *d_src, *d_rcv; 85 | CHECK(cudaSetDevice(igpu)); 86 | CHECK(cudaMalloc((void **)&d_src, MYBUFSIZE)); 87 | CHECK(cudaMalloc((void **)&d_rcv, MYBUFSIZE)); 88 | 89 | initalData(h_src, h_rcv, MYBUFSIZE); 90 | 91 | CHECK(cudaMemcpy(d_src, h_src, MYBUFSIZE, cudaMemcpyDefault)); 92 | CHECK(cudaMemcpy(d_rcv, h_rcv, MYBUFSIZE, cudaMemcpyDefault)); 93 | 94 | // latency test 95 | for(int size = 1024; size <= MAX_MSG_SIZE; size = size * 4) 96 | { 97 | MPI_Barrier(MPI_COMM_WORLD); 98 | 99 | if(rank == 0) 100 | { 101 | tstart = MPI_Wtime(); 102 | 103 | for(int i = 0; i < loop; i++) 104 | { 105 | /* 106 | * Transfer data from the GPU to the host to be transmitted to 107 | * the other MPI process. 108 | */ 109 | CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost)); 110 | 111 | // bi-directional transmission 112 | MPI_Irecv(h_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, 113 | &recv_request); 114 | MPI_Isend(h_src, size, MPI_CHAR, other_proc, 100, 115 | MPI_COMM_WORLD, &send_request); 116 | 117 | MPI_Waitall(1, &recv_request, &reqstat); 118 | MPI_Waitall(1, &send_request, &reqstat); 119 | 120 | /* 121 | * Transfer the data received from the other MPI process to 122 | * the device. 123 | */ 124 | CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice)); 125 | } 126 | 127 | tend = MPI_Wtime(); 128 | } 129 | else 130 | { 131 | for(int i = 0; i < loop; i++) 132 | { 133 | /* 134 | * Transfer data from the GPU to the host to be transmitted to 135 | * the other MPI process. 
136 | */ 137 | CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost)); 138 | 139 | // bi-directional transmission 140 | MPI_Irecv(h_rcv, size, MPI_CHAR, other_proc, 100, 141 | MPI_COMM_WORLD, &recv_request); 142 | MPI_Isend(h_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, 143 | &send_request); 144 | 145 | MPI_Waitall(1, &recv_request, &reqstat); 146 | MPI_Waitall(1, &send_request, &reqstat); 147 | 148 | /* 149 | * Transfer the data received from the other MPI process to 150 | * the device. 151 | */ 152 | CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice)); 153 | } 154 | } 155 | 156 | MPI_Barrier(MPI_COMM_WORLD); 157 | 158 | if(rank == 0) 159 | { 160 | double latency = (tend - tstart) * 1e6 / (2.0 * loop); 161 | float performance = (float) size / (float) latency; 162 | printf("%6d %s %10.2f μs %10.2f MB/sec\n", 163 | (size >= 1024 * 1024) ? size / 1024 / 1024 : size / 1024, 164 | (size >= 1024 * 1024) ? "MB" : "KB", latency, performance); 165 | 166 | fflush(stdout); 167 | } 168 | } 169 | 170 | CHECK(cudaFreeHost(h_src)); 171 | CHECK(cudaFreeHost(h_rcv)); 172 | 173 | CHECK(cudaSetDevice(igpu)); 174 | CHECK(cudaFree(d_src)); 175 | CHECK(cudaFree(d_rcv)); 176 | 177 | MPI_Finalize(); 178 | 179 | return EXIT_SUCCESS; 180 | } 181 | -------------------------------------------------------------------------------- /chapter09/simpleP2P_CUDA_Aware.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* 10 | * An example of using a CUDA-aware MPI implementation to transfer an array 11 | * directly from one GPU to another, between MPI processes. Note that no CUDA 12 | * transfer API calls are used here, and that device pointers are passed 13 | * directly to MPI_Isend and MPI_Irecv. 
14 | */ 15 | 16 | #define CHECK(call) \ 17 | { \ 18 | const cudaError_t error = call; \ 19 | if (error != cudaSuccess) \ 20 | { \ 21 | fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \ 22 | fprintf(stderr, "code: %d, reason: %s\n", error, \ 23 | cudaGetErrorString(error)); \ 24 | } \ 25 | } 26 | 27 | #define MESSAGE_ALIGNMENT 64 28 | #define MAX_MSG_SIZE (1<<22) 29 | #define MYBUFSIZE MAX_MSG_SIZE 30 | 31 | #define LOOP_LARGE 100 32 | #define FIELD_WIDTH 20 33 | #define FLOAT_PRECISION 2 34 | 35 | void SetDeviceBeforeInit() 36 | { 37 | int devCount = 0; 38 | int rank = atoi(getenv("MV2_COMM_WORLD_RANK")); 39 | int idev = (rank == 0 ? 1 : 0); 40 | CHECK(cudaSetDevice(idev)); 41 | 42 | printf("local rank=%d: and idev %d\n", rank, idev); 43 | } 44 | 45 | int main (int argc, char *argv[]) 46 | { 47 | int rank, nprocs, ilen; 48 | char processor[MPI_MAX_PROCESSOR_NAME]; 49 | double tstart = 0.0, tend = 0.0; 50 | 51 | MPI_Status reqstat; 52 | MPI_Request send_request; 53 | MPI_Request recv_request; 54 | 55 | MPI_Init(&argc, &argv); 56 | MPI_Comm_size(MPI_COMM_WORLD, &nprocs); 57 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 58 | 59 | MPI_Get_processor_name(processor, &ilen); 60 | 61 | if(nprocs != 2) 62 | { 63 | if(rank == 0) printf("This test requires exactly two processes\n"); 64 | 65 | MPI_Finalize(); 66 | exit(EXIT_FAILURE); 67 | } 68 | 69 | char *h_src, *h_rcv; 70 | 71 | int other_proc = (rank == 1 ? 0 : 1); 72 | int igpu = (rank == 1 ? 
0 : 1); 73 | 74 | int loop = LOOP_LARGE; 75 | 76 | printf("node=%d(%s): my other _proc = %d and using GPU=%d loop %d\n", rank, 77 | processor, other_proc, igpu, loop); 78 | 79 | char *d_src, *d_rcv; 80 | CHECK(cudaSetDevice(igpu)); 81 | CHECK(cudaMalloc((void **)&d_src, MYBUFSIZE)); 82 | CHECK(cudaMalloc((void **)&d_rcv, MYBUFSIZE)); 83 | 84 | for (int size = 1; size <= MAX_MSG_SIZE; size *= 2) 85 | { 86 | MPI_Barrier(MPI_COMM_WORLD); 87 | 88 | CHECK(cudaMemset(d_src, 'a', size)); 89 | CHECK(cudaMemset(d_rcv, 'b', size)); 90 | 91 | if(rank == 0) 92 | { 93 | tstart = MPI_Wtime(); 94 | 95 | for(int i = 0; i < loop; i++) 96 | { 97 | MPI_Isend(d_src, size, MPI_CHAR, other_proc, 100, 98 | MPI_COMM_WORLD, &send_request); 99 | MPI_Irecv(d_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, 100 | &recv_request); 101 | 102 | MPI_Waitall(1, &recv_request, &reqstat); 103 | MPI_Waitall(1, &send_request, &reqstat); 104 | 105 | } 106 | 107 | tend = MPI_Wtime(); 108 | } 109 | else 110 | { 111 | for(int i = 0; i < loop; i++) 112 | { 113 | MPI_Isend(d_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD, 114 | &send_request); 115 | MPI_Irecv(d_rcv, size, MPI_CHAR, other_proc, 100, 116 | MPI_COMM_WORLD, &recv_request); 117 | 118 | MPI_Waitall(1, &recv_request, &reqstat); 119 | MPI_Waitall(1, &send_request, &reqstat); 120 | } 121 | } 122 | 123 | MPI_Barrier(MPI_COMM_WORLD); 124 | 125 | if(rank == 0) 126 | { 127 | double tmp = size / 1e6 * loop * 2; 128 | double t = (tend - tstart); 129 | 130 | printf("%-*d%*.*f\n", 10, size, FIELD_WIDTH, FLOAT_PRECISION, 131 | tmp / t); 132 | fflush(stdout); 133 | } 134 | } 135 | 136 | CHECK(cudaSetDevice(igpu)); 137 | CHECK(cudaFree(d_src)); 138 | CHECK(cudaFree(d_rcv)); 139 | 140 | MPI_Finalize(); 141 | 142 | return EXIT_SUCCESS; 143 | } 144 | -------------------------------------------------------------------------------- /chapter10/debug-hazards.cu: -------------------------------------------------------------------------------- 1 | #include 
"../common/common.h" 2 | #include 3 | #include 4 | 5 | /** 6 | * This example illustrates different approaches to optimizing access to a 7 | * single shared variable by limiting conflicting, atomic operations on it. 8 | * 9 | * The first kernel, naive_reduction, simply performs an atomicAdd from every 10 | * thread on the same shared variable. 11 | * 12 | * simple_reduction first stores the values to be added together in shared 13 | * memory. Then, a single thread iterates over those values and computes a 14 | * partial sum. Finally, that partial sum is added to the global result using an 15 | * atomicAdd. 16 | * 17 | * parallel_reduction is the most complex example. It performs a parallel 18 | * reduction within each thread block. The partial result produced by that 19 | * local reduction is then added to the global result with an atomicAdd. 20 | * 21 | * The core of each of these kernels is wrapped in a loop to augment the amount 22 | * of work done and make timing the kernels at the millisecond granularity 23 | * feasible. 
24 | **/ 25 | 26 | /** 27 | * This implementation makes use of shared memory and local reduction to improve 28 | * performance and decrease contention 29 | **/ 30 | __global__ void simple_reduction(int *shared_var, int *input_values, int N, 31 | int iters) 32 | { 33 | __shared__ int local_mem[256]; 34 | int iter, i; 35 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 36 | int local_tid = threadIdx.x; 37 | int local_dim = blockDim.x; 38 | int minThreadInThisBlock = blockIdx.x * blockDim.x; 39 | int maxThreadInThisBlock = minThreadInThisBlock + (blockDim.x - 1); 40 | 41 | if (maxThreadInThisBlock >= N) 42 | { 43 | local_dim = N - minThreadInThisBlock; 44 | } 45 | 46 | for (iter = 0; iter < iters; iter++) 47 | { 48 | if (tid < N) 49 | { 50 | local_mem[local_tid] = input_values[tid]; 51 | } 52 | 53 | // Required for correctness 54 | // __syncthreads(); 55 | 56 | /* 57 | * Perform the local reduction across values written to shared memory 58 | * by threads in this thread block. 59 | */ 60 | if (local_tid == 0) 61 | { 62 | int sum = 0; 63 | 64 | for (i = 0; i < local_dim; i++) 65 | { 66 | sum = sum + local_mem[i]; 67 | } 68 | 69 | atomicAdd(shared_var, sum); 70 | } 71 | 72 | // Required for correctness 73 | // __syncthreads(); 74 | } 75 | } 76 | 77 | int main(int argc, char **argv) 78 | { 79 | int N = 20480; 80 | int block = 256; 81 | int device_iters = 3; 82 | int runs = 1; 83 | int i, true_value; 84 | int *d_shared_var, *d_input_values, *h_input_values; 85 | int h_sum; 86 | double mean_time = 0.0; 87 | 88 | CHECK(cudaMalloc((void **)&d_shared_var, sizeof(int))); 89 | CHECK(cudaMalloc((void **)&d_input_values, N * sizeof(int))); 90 | h_input_values = (int *)malloc(N * sizeof(int)); 91 | 92 | for (i = 0; i < N; i++) 93 | { 94 | h_input_values[i] = i; 95 | true_value += i; 96 | } 97 | 98 | true_value *= device_iters; 99 | 100 | for (i = 0; i < runs; i++) 101 | { 102 | CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int))); 103 | CHECK(cudaMemcpy(d_input_values, 
h_input_values, N * sizeof(int), 104 | cudaMemcpyHostToDevice)); 105 | double start = seconds(); 106 | 107 | simple_reduction<<>>(d_shared_var, 108 | d_input_values, N, device_iters); 109 | 110 | CHECK(cudaDeviceSynchronize()); 111 | mean_time += seconds() - start; 112 | CHECK(cudaMemcpy(&h_sum, d_shared_var, sizeof(int), 113 | cudaMemcpyDeviceToHost)); 114 | 115 | if (h_sum != true_value) 116 | { 117 | fprintf(stderr, "Validation failure: expected %d, got %d\n", 118 | true_value, h_sum); 119 | return 1; 120 | } 121 | } 122 | 123 | mean_time /= runs; 124 | 125 | printf("Mean execution time for reduction: %.4f ms\n", 126 | mean_time * 1000.0); 127 | 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /chapter10/debug-segfault.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | 4 | /* 5 | * This example purposefully introduces an invalid memory access on the GPU to 6 | * illustrate the use of cuda-gdb. 7 | */ 8 | 9 | #define N 1025 10 | #define M 12 11 | 12 | __device__ int foo(int row, int col) 13 | { 14 | return (2 * row); 15 | } 16 | 17 | __global__ void kernel(int **arr) 18 | { 19 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 20 | int i; 21 | 22 | /* 23 | * Iterate over each row in parallel and column sequentially, assigning a 24 | * value decided by foo. 
25 | */ 26 | for ( ; tid < N; tid++) 27 | { 28 | for (i = 0; i < M; i++) 29 | { 30 | arr[tid][i] = foo(tid, i); 31 | } 32 | } 33 | } 34 | 35 | int main(int argc, char **argv) 36 | { 37 | int i; 38 | // Host representation of a 2D matrix 39 | int **h_matrix; 40 | // A host array of device pointers to the matrix rows on the device 41 | int **d_ptrs; 42 | // A device array of device pointers, filled from d_ptrs 43 | int **d_matrix; 44 | 45 | h_matrix = (int **)malloc(N * sizeof(int *)); 46 | d_ptrs = (int **)malloc(N * sizeof(int *)); 47 | CHECK(cudaMalloc((void **)&d_matrix, N * sizeof(int *))); 48 | CHECK(cudaMemset(d_matrix, 0x00, N * sizeof(int *))); 49 | 50 | // Allocate rows on the host and device 51 | for (i = 0; i < N; i++) 52 | { 53 | h_matrix[i] = (int *)malloc(M * sizeof(int)); 54 | CHECK(cudaMalloc((void **)&d_ptrs[i], M * sizeof(int))); 55 | CHECK(cudaMemset(d_ptrs[i], 0x00, M * sizeof(int))); 56 | } 57 | 58 | int threadsPerBlock = 256; 59 | int blocksPerGrid = 1024; 60 | kernel<<>>(d_matrix); 61 | 62 | // Copy rows back 63 | for (i = 0; i < N; i++) 64 | { 65 | CHECK(cudaMemcpy(h_matrix[i], d_ptrs[i], M * sizeof(int), 66 | cudaMemcpyDeviceToHost)); 67 | CHECK(cudaFree(d_ptrs[i])); 68 | free(h_matrix[i]); 69 | } 70 | 71 | CHECK(cudaFree(d_matrix)); 72 | free(h_matrix); 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /chapter10/debug-segfault.fixed.cu: -------------------------------------------------------------------------------- 1 | #include "../common/common.h" 2 | #include 3 | 4 | /* 5 | * This example purposefully introduces an invalid memory access on the GPU to 6 | * illustrate the use of cuda-gdb. 
7 | */ 8 | 9 | #define N 1025 10 | #define M 12 11 | 12 | __device__ int foo(int row, int col) 13 | { 14 | return (2 * row); 15 | } 16 | 17 | __global__ void kernel(int **arr) 18 | { 19 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 20 | int i; 21 | 22 | /* 23 | * Iterate over each row in parallel and column sequentially, assigning a 24 | * value decided by foo. 25 | */ 26 | for ( ; tid < N; tid++) 27 | { 28 | for (i = 0; i < M; i++) 29 | { 30 | arr[tid][i] = foo(tid, i); 31 | } 32 | } 33 | } 34 | 35 | int main(int argc, char **argv) 36 | { 37 | int i; 38 | // Host representation of a 2D matrix 39 | int **h_matrix; 40 | // A host array of device pointers to the matrix rows on the device 41 | int **d_ptrs; 42 | // A device array of device pointers, filled from d_ptrs 43 | int **d_matrix; 44 | 45 | h_matrix = (int **)malloc(N * sizeof(int *)); 46 | d_ptrs = (int **)malloc(N * sizeof(int *)); 47 | CHECK(cudaMalloc((void **)&d_matrix, N * sizeof(int *))); 48 | CHECK(cudaMemset(d_matrix, 0x00, N * sizeof(int *))); 49 | 50 | // Allocate rows on the host and device 51 | for (i = 0; i < N; i++) 52 | { 53 | h_matrix[i] = (int *)malloc(M * sizeof(int)); 54 | CHECK(cudaMalloc((void **)&d_ptrs[i], M * sizeof(int))); 55 | CHECK(cudaMemset(d_ptrs[i], 0x00, M * sizeof(int))); 56 | } 57 | 58 | CHECK(cudaMemcpy(d_matrix, d_ptrs, N * sizeof(int *), 59 | cudaMemcpyHostToDevice)); 60 | 61 | int threadsPerBlock = 256; 62 | int blocksPerGrid = 1024; 63 | kernel<<>>(d_matrix); 64 | 65 | // Copy rows back 66 | for (i = 0; i < N; i++) 67 | { 68 | CHECK(cudaMemcpy(h_matrix[i], d_ptrs[i], M * sizeof(int), 69 | cudaMemcpyDeviceToHost)); 70 | CHECK(cudaFree(d_ptrs[i])); 71 | free(h_matrix[i]); 72 | } 73 | 74 | CHECK(cudaFree(d_matrix)); 75 | free(h_matrix); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /chapter10/generate_data.c: -------------------------------------------------------------------------------- 1 | #include 2 
| #include 3 | 4 | /* 5 | * Generate a sample input for crypt to encrypt and decrypt. generate_data 6 | * allows the user to specify the output file and length in bytes. 7 | */ 8 | 9 | #define CHUNK_SIZE 1024 10 | signed char chunk[CHUNK_SIZE]; 11 | 12 | int main(int argc, char **argv) 13 | { 14 | int i, j; 15 | FILE *out; 16 | int outLength; 17 | int *ichunk; 18 | 19 | if (argc != 3) 20 | { 21 | printf("usage: %s \n", argv[0]); 22 | return (1); 23 | } 24 | 25 | out = fopen(argv[1], "w"); 26 | 27 | if (out == NULL) 28 | { 29 | fprintf(stderr, "Failed opening %s for writing\n", argv[1]); 30 | return (1); 31 | } 32 | 33 | outLength = atoi(argv[2]); 34 | 35 | if (outLength % 8 != 0) 36 | { 37 | fprintf(stderr, "The specified length (%d) must be evenly divisible " 38 | "by 8\n", outLength); 39 | return (1); 40 | } 41 | 42 | // Write in chunks of CHUNK_SIZE. 43 | for (i = 0; i < outLength; i += CHUNK_SIZE) 44 | { 45 | int toWrite = CHUNK_SIZE; 46 | 47 | if (i + toWrite > outLength) 48 | { 49 | toWrite = outLength - i; 50 | } 51 | 52 | for (j = 0; j < toWrite; j++) 53 | { 54 | chunk[j] = (i * CHUNK_SIZE + j); 55 | } 56 | 57 | if (fwrite(chunk, 1, toWrite, out) != toWrite) 58 | { 59 | fprintf(stderr, "Error writing chunk of length %d\n", toWrite); 60 | return (1); 61 | } 62 | } 63 | 64 | fclose(out); 65 | 66 | return (0); 67 | } 68 | --------------------------------------------------------------------------------