├── README.md
├── chapter01
    ├── Makefile
    ├── hello
    └── hello.cu
├── chapter02
    ├── Makefile
    ├── a.out
    ├── check
    ├── checkDeviceInfor.cu
    ├── checkDimension.cu
    ├── checkThreadIndex.cu
    ├── defineGridBlock.cu
    ├── out
    ├── out2
    ├── sum
    ├── sumArraysOnGPU-small-case.cu
    ├── sumArraysOnGPU-timer.cu
    ├── sumArraysOnHost.c
    ├── sumMatrixOnGPU-1D-grid-1D-block.cu
    ├── sumMatrixOnGPU-2D-grid-1D-block.cu
    ├── sumMatrixOnGPU-2D-grid-2D-block.cu
    └── sumMatrixOnGPU.cu
├── chapter03
    ├── Makefile
    ├── nestedHelloWorld.cu
    ├── nestedReduce.cu
    ├── nestedReduce2.cu
    ├── nestedReduceNosync.cu
    ├── reduceInteger.cu
    ├── simpleDeviceQuery.cu
    ├── simpleDivergence.cu
    └── sumMatrix.cu
├── chapter04
    ├── Makefile
    ├── globalVariable.cu
    ├── memTransfer.cu
    ├── pinMemTransfer.cu
    ├── readSegment.cu
    ├── readSegmentUnroll.cu
    ├── simpleMathAoS.cu
    ├── simpleMathSoA.cu
    ├── sumArrayZerocpy.cu
    ├── sumMatrixGPUManaged.cu
    ├── sumMatrixGPUManual.cu
    ├── transpose.cu
    └── writeSegment.cu
├── chapter05
    ├── Makefile
    ├── checkSmemRectangle.cu
    ├── checkSmemSquare.cu
    ├── constantReadOnly.cu
    ├── constantStencil.cu
    ├── reduceInteger.cu
    ├── reduceIntegerShfl.cu
    ├── simpleShfl.cu
    └── transposeRectangle.cu
├── chapter06
    ├── Makefile
    ├── asyncAPI.cu
    ├── simpleCallback.cu
    ├── simpleHyperqBreadth.cu
    ├── simpleHyperqDependence.cu
    ├── simpleHyperqDepth.cu
    ├── simpleHyperqOpenmp.cu
    ├── simpleMultiAddBreadth.cu
    └── simpleMultiAddDepth.cu
├── chapter07
    ├── Makefile
    ├── atomic-ordering.cu
    ├── floating-point-accuracy.cu
    ├── floating-point-perf.cu
    ├── fmad.cu
    ├── intrinsic-standard-comp.cu
    ├── my-atomic-add.cu
    └── nbody.cu
├── chapter08
    ├── Makefile
    ├── cublas.cu
    ├── cuda-openacc.cu
    ├── cufft-multi.cu
    ├── cufft.cu
    ├── cusparse.cu
    ├── drop-in.c
    ├── rand-kernel.cu
    ├── replace-rand-streams.cu
    ├── replace-rand.cu
    ├── simple-data.c
    ├── simple-kernels.c
    └── simple-parallel.c
├── chapter09
    ├── Makefile
    ├── simple2DFD.cu
    ├── simpleC2C.c
    ├── simpleMultiGPU.cu
    ├── simpleP2P.c
    ├── simpleP2P_CUDA_Aware.c
    └── simpleP2P_PingPong.cu
└── chapter10
    ├── crypt.c
    ├── crypt.config.cu
    ├── crypt.constant.cu
    ├── crypt.flexible.cu
    ├── crypt.legacy.cu
    ├── crypt.openmp.cu
    ├── crypt.overlap.cu
    ├── crypt.parallelized.cu
    ├── debug-hazards.cu
    ├── debug-segfault.cu
    ├── debug-segfault.fixed.cu
    └── generate_data.c


/README.md:
--------------------------------------------------------------------------------
1 | # CUDA_C Code
2 | CUDA_C编程权威指南示例代码
3 | 


--------------------------------------------------------------------------------
/chapter01/Makefile:
--------------------------------------------------------------------------------
1 | # Builds the chapter samples; override the target GPU with `make ARCH=sm_xx`.
2 | APPS=hello
3 | ARCH ?= sm_20
4 | .PHONY: all clean
5 | all: ${APPS}
6 | %: %.cu
7 | 	nvcc -O2 -arch=${ARCH} -o $@ $<
8 | clean:
9 | 	rm -f ${APPS}


--------------------------------------------------------------------------------
/chapter01/hello:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter01/hello


--------------------------------------------------------------------------------
/chapter01/hello.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | 
 4 | /*
 5 |  * A simple introduction to programming in CUDA. This program prints "Hello
 6 |  * World from GPU!" from 10 CUDA threads running on the GPU.
 7 |  */
 8 | 
 9 | __global__ void helloFromGPU()   // kernel: run once by every launched thread
10 | {
11 |     printf("Hello World from GPU!\n");   // device-side printf, one line per thread
12 | }
13 | 
14 | /* Host entry point: greets from the CPU, then launches the greeting kernel. */
15 | int main(int argc, char **argv)
16 | {
17 |     printf("Hello World from CPU!\n");
18 | 
19 |     helloFromGPU<<<1, 10>>>();          // one block of 10 threads
20 |     CHECK(cudaGetLastError());          // a rejected launch is otherwise silent
21 |     CHECK(cudaDeviceSynchronize());     // wait for the kernel and its error status
22 |     CHECK(cudaDeviceReset());
23 |     return 0;
24 | }


--------------------------------------------------------------------------------
/chapter02/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the chapter samples; override the target GPU with `make ARCH=sm_xx`.
 2 | CU_APPS=checkDeviceInfor checkThreadIndex sumArraysOnGPU-timer \
 3 |         sumMatrixOnGPU-1D-grid-1D-block sumMatrixOnGPU-2D-grid-2D-block \
 4 |         checkDimension defineGridBlock sumArraysOnGPU-small-case \
 5 |         sumMatrixOnGPU-2D-grid-1D-block sumMatrixOnGPU
 6 | C_APPS=sumArraysOnHost
 7 | ARCH ?= sm_20
 8 | .PHONY: all clean
 9 | all: ${C_APPS} ${CU_APPS}
10 | %: %.cu
11 | 	nvcc -O2 -arch=${ARCH} -o $@ $<
12 | %: %.c
13 | 	gcc -O2 -std=c99 -o $@ $<
14 | clean:
15 | 	rm -f ${CU_APPS} ${C_APPS}


--------------------------------------------------------------------------------
/chapter02/a.out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/a.out


--------------------------------------------------------------------------------
/chapter02/check:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/check


--------------------------------------------------------------------------------
/chapter02/checkDeviceInfor.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * Display a variety of information on the first CUDA device in this system,
 7 |  * including driver version, runtime version, compute capability, bytes of
 8 |  * global memory, etc.
 9 |  */
10 | 
11 | /*
12 |  * Prints the properties of CUDA device 0. Stops with EXIT_FAILURE when the
13 |  * device count cannot be read or is zero, rather than querying a missing device.
14 |  */
15 | int main(int argc, char **argv)
16 | {
17 |     printf("%s Starting...\n", argv[0]);
18 | 
19 |     int deviceCount = 0;
20 |     cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
21 |     if (countStatus != cudaSuccess || deviceCount == 0)
22 |     {
23 |         printf("There are no available device(s) that support CUDA\n");
24 |         exit(EXIT_FAILURE);
25 |     }
26 |     printf("Detected %d CUDA Capable device(s)\n", deviceCount);
27 | 
28 |     int dev = 0, driverVersion = 0, runtimeVersion = 0;
29 |     CHECK(cudaSetDevice(dev));
30 |     cudaDeviceProp deviceProp;
31 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
32 |     printf("Device %d: \"%s\"\n", dev, deviceProp.name);
33 | 
34 |     // versions are reported as 1000 * major + 10 * minor
35 |     CHECK(cudaDriverGetVersion(&driverVersion));
36 |     CHECK(cudaRuntimeGetVersion(&runtimeVersion));
37 |     printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n",
38 |            driverVersion / 1000, (driverVersion % 100) / 10,
39 |            runtimeVersion / 1000, (runtimeVersion % 100) / 10);
40 |     printf("  CUDA Capability Major/Minor version number:    %d.%d\n",
41 |            deviceProp.major, deviceProp.minor);
42 |     printf("  Total amount of global memory:                 %.2f GBytes (%llu "
43 |            "bytes)\n", (float)deviceProp.totalGlobalMem / pow(1024.0, 3),
44 |            (unsigned long long)deviceProp.totalGlobalMem);
45 |     printf("  GPU Clock rate:                                %.0f MHz (%0.2f "
46 |            "GHz)\n", deviceProp.clockRate * 1e-3f,
47 |            deviceProp.clockRate * 1e-6f);
48 |     printf("  Memory Clock rate:                             %.0f Mhz\n",
49 |            deviceProp.memoryClockRate * 1e-3f);
50 |     printf("  Memory Bus Width:                              %d-bit\n",
51 |            deviceProp.memoryBusWidth);
52 | 
53 |     if (deviceProp.l2CacheSize)
54 |         printf("  L2 Cache Size:                                 %d bytes\n",
55 |                deviceProp.l2CacheSize);
56 | 
57 |     printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), "
58 |            "2D=(%d,%d), 3D=(%d,%d,%d)\n", deviceProp.maxTexture1D,
59 |            deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
60 |            deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1],
61 |            deviceProp.maxTexture3D[2]);
62 |     printf("  Max Layered Texture Size (dim) x layers        1D=(%d) x %d, "
63 |            "2D=(%d,%d) x %d\n", deviceProp.maxTexture1DLayered[0],
64 |            deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
65 |            deviceProp.maxTexture2DLayered[1],
66 |            deviceProp.maxTexture2DLayered[2]);
67 |     printf("  Total amount of constant memory:               %llu bytes\n",
68 |            (unsigned long long)deviceProp.totalConstMem);   // size_t field
69 |     printf("  Total amount of shared memory per block:       %llu bytes\n",
70 |            (unsigned long long)deviceProp.sharedMemPerBlock);
71 |     printf("  Total number of registers available per block: %d\n",
72 |            deviceProp.regsPerBlock);
73 |     printf("  Warp size:                                     %d\n",
74 |            deviceProp.warpSize);
75 |     printf("  Maximum number of threads per multiprocessor:  %d\n",
76 |            deviceProp.maxThreadsPerMultiProcessor);
77 |     printf("  Maximum number of threads per block:           %d\n",
78 |            deviceProp.maxThreadsPerBlock);
79 |     printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n",
80 |            deviceProp.maxThreadsDim[0],
81 |            deviceProp.maxThreadsDim[1],
82 |            deviceProp.maxThreadsDim[2]);
83 |     printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n",
84 |            deviceProp.maxGridSize[0],
85 |            deviceProp.maxGridSize[1],
86 |            deviceProp.maxGridSize[2]);
87 |     printf("  Maximum memory pitch:                          %llu bytes\n",
88 |            (unsigned long long)deviceProp.memPitch);
89 | 
90 |     exit(EXIT_SUCCESS);
91 | }
92 | 


--------------------------------------------------------------------------------
/chapter02/checkDimension.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * Display the dimensionality of a thread block and grid from the host and
 7 |  * device.
 8 |  */
 9 | 
10 | // Kernel: each thread prints its own indices plus the launch dimensions.
11 | __global__ void checkIndex(void)
12 | {
13 |     // the built-in index and dimension components are unsigned, hence %u
14 |     printf("threadIdx:(%u, %u, %u)\n", threadIdx.x, threadIdx.y, threadIdx.z);
15 |     printf("blockIdx:(%u, %u, %u)\n", blockIdx.x, blockIdx.y, blockIdx.z);
16 |     printf("blockDim:(%u, %u, %u)\n", blockDim.x, blockDim.y, blockDim.z);
17 |     printf("gridDim:(%u, %u, %u)\n", gridDim.x, gridDim.y, gridDim.z);
18 | }
19 | 
20 | // Prints the launch geometry on the host, then from every thread on the device.
21 | int main(int argc, char **argv)
22 | {
23 |     // define total data element
24 |     int nElem = 6;
25 | 
26 |     // define grid and block structure
27 |     dim3 block(3);
28 |     dim3 grid((nElem + block.x - 1) / block.x);   // ceiling division
29 | 
30 |     // check grid and block dimension from host side
31 |     printf("grid.x %u grid.y %u grid.z %u\n", grid.x, grid.y, grid.z);
32 |     printf("block.x %u block.y %u block.z %u\n", block.x, block.y, block.z);
33 | 
34 |     // check grid and block dimension from device side
35 |     checkIndex<<<grid, block>>>();
36 |     CHECK(cudaGetLastError());   // a rejected launch is otherwise silent
37 | 
38 |     // reset device before you leave
39 |     CHECK(cudaDeviceReset());
40 |     return(0);
41 | }


--------------------------------------------------------------------------------
/chapter02/checkThreadIndex.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * This example helps to visualize the relationship between thread/block IDs and
 7 |  * offsets into data. For each CUDA thread, this example displays the
 8 |  * intra-block thread ID, the inter-block block ID, the global coordinate of a
 9 |  * thread, the calculated offset into input data, and the input data at that
10 |  * offset.
11 |  */
12 | 
13 | /*
14 |  * Prints the nx-by-ny matrix stored row by row in C, one text line per row.
15 |  */
16 | void printMatrix(int *C, const int nx, const int ny)
17 | {
18 |     printf("\nMatrix: (%d.%d)\n", nx, ny);
19 | 
20 |     for (int row = 0; row < ny; row++)
21 |     {
22 |         // row `row` starts row * nx elements into C
23 |         const int *rowStart = C + row * nx;
24 | 
25 |         for (int col = 0; col < nx; col++)
26 |             printf("%3d", rowStart[col]);
27 | 
28 |         printf("\n");
29 |     }
30 | 
31 |     printf("\n");
32 | }
33 | 
34 | // Kernel: prints each in-range thread's ids, coordinate, offset and A[offset].
35 | __global__ void printThreadIndex(int *A, const int nx, const int ny)
36 | {
37 |     int ix = threadIdx.x + blockIdx.x * blockDim.x;
38 |     int iy = threadIdx.y + blockIdx.y * blockDim.y;
39 |     if (ix >= nx || iy >= ny) return;   // grid may overhang the matrix
40 |     printf("thread_id (%u,%u) block_id (%u,%u) coordinate (%d,%d) global index"
41 |            " %2d ival %2d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y,
42 |            ix, iy, iy * nx + ix, A[iy * nx + ix]);
43 | }
44 | 
45 | // Uploads an nx-by-ny matrix holding 0, 1, 2, ... and launches one printing
46 | // thread per element.
47 | int main(int argc, char **argv)
48 | {
49 |     printf("%s Starting...\n", argv[0]);
50 | 
51 |     // get device information
52 |     int dev = 0;
53 |     cudaDeviceProp deviceProp;
54 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
55 |     printf("Using Device %d: %s\n", dev, deviceProp.name);
56 |     CHECK(cudaSetDevice(dev));
57 | 
58 |     // set matrix dimension
59 |     int nx = 8;
60 |     int ny = 6;
61 |     int nxy = nx * ny;
62 |     int nBytes = nxy * sizeof(int);   // the elements are int, not float
63 | 
64 |     // malloc host memory
65 |     int *h_A = (int *)malloc(nBytes);
66 |     if (h_A == NULL) exit(EXIT_FAILURE);   // host allocation failed
67 | 
68 |     // initialize host matrix with integer
69 |     for (int i = 0; i < nxy; i++) h_A[i] = i;
70 |     printMatrix(h_A, nx, ny);
71 | 
72 |     // malloc device memory
73 |     int *d_MatA;
74 |     CHECK(cudaMalloc((void **)&d_MatA, nBytes));
75 | 
76 |     // transfer data from host to device
77 |     CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
78 | 
79 |     // set up execution configuration
80 |     dim3 block(4, 2);
81 |     dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
82 | 
83 |     // invoke the kernel
84 |     printThreadIndex<<<grid, block>>>(d_MatA, nx, ny);
85 |     CHECK(cudaGetLastError());
86 |     CHECK(cudaDeviceSynchronize());   // surface errors raised while the kernel runs
87 | 
88 |     // free host and device memory
89 |     CHECK(cudaFree(d_MatA));
90 |     free(h_A);
91 | 
92 |     // reset device
93 |     CHECK(cudaDeviceReset());
94 | 
95 |     return (0);
96 | }
97 | 


--------------------------------------------------------------------------------
/chapter02/defineGridBlock.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * Demonstrate defining the dimensions of a block of threads and a grid of
 7 |  * blocks from the host.
 8 |  */
 9 | 
10 | /*
11 |  * Prints the 1D grid size needed to cover nElem elements for block sizes of
12 |  * 1024, 512, 256 and 128 threads, in that order.
13 |  */
14 | int main(int argc, char **argv)
15 | {
16 |     // define total data element
17 |     int nElem = 1024;
18 | 
19 |     // block sizes to report, largest first
20 |     const unsigned int blockSizes[] = {1024, 512, 256, 128};
21 |     const int nSizes = sizeof(blockSizes) / sizeof(blockSizes[0]);
22 | 
23 |     // define grid and block structure once, then resize them per block size
24 |     dim3 block;
25 |     dim3 grid;
26 | 
27 |     for (int s = 0; s < nSizes; s++)
28 |     {
29 |         block.x = blockSizes[s];
30 |         // ceiling division: enough blocks to cover every element
31 |         grid.x = (nElem + block.x - 1) / block.x;
32 |         printf("grid.x %d block.x %d \n", grid.x, block.x);
33 |     }
34 | 
35 |     // reset device before you leave
36 |     CHECK(cudaDeviceReset());
37 | 
38 |     return(0);
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/chapter02/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/out


--------------------------------------------------------------------------------
/chapter02/out2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/out2


--------------------------------------------------------------------------------
/chapter02/sum:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ElvisCheny/CUDA_C-Code/7d5b2f9f5c9e26748256ccf4dad495d561e496c1/chapter02/sum


--------------------------------------------------------------------------------
/chapter02/sumArraysOnGPU-small-case.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates a simple vector sum on the GPU and on the host.
  7 |  * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the
  8 |  * GPU. Only a single thread block is used in this small case, for simplicity.
  9 |  * sumArraysOnHost sequentially iterates through vector elements on the host.
 10 |  */
 11 | 
 12 | // Reports the first of the N element pairs that differ by more than 1.0E-8,
 13 | // or prints "Arrays match." when no pair does.
 14 | void checkResult(float *hostRef, float *gpuRef, const int N)
 15 | {
 16 |     const double epsilon = 1.0E-8;
 17 |     for (int i = 0; i < N; i++)
 18 |     {
 19 |         // take the magnitude in floating point: an integer abs() overload
 20 |         // would truncate any difference below 1 to 0 and hide the mismatch
 21 |         double diff = (double)hostRef[i] - (double)gpuRef[i];
 22 |         if (diff < 0.0) diff = -diff;
 23 |         if (diff > epsilon)
 24 |         {
 25 |             printf("Arrays do not match!\n");
 26 |             printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
 27 |                    gpuRef[i], i);
 28 |             return;
 29 |         }
 30 |     }
 31 |     printf("Arrays match.\n\n");
 32 | }
 33 | 
 34 | 
 35 | // Fills ip[0..size) with pseudo-random values in [0, 25.5] in steps of 0.1.
 36 | void initialData(float *ip, int size)
 37 | {
 38 |     // seed only on the first call: reseeding from time() on every call gives
 39 |     // back-to-back calls the same seed, and therefore identical arrays
 40 |     static bool seeded = false;
 41 |     if (!seeded)
 42 |     {
 43 |         srand((unsigned) time(NULL));
 44 |         seeded = true;
 45 |     }
 46 |     for (int i = 0; i < size; i++) ip[i] = (float)(rand() & 0xFF) / 10.0f;
 47 | }
 48 | 
 49 | 
 50 | // Host reference: stores A[k] + B[k] in C[k] for k = 0 .. N-1, front to back.
 51 | void sumArraysOnHost(float *A, float *B, float *C, const int N)
 52 | {
 53 |     for (int left = N; left > 0; left--) *C++ = *A++ + *B++;
 54 | }
 55 | 
 56 | // Kernel for a single-block launch: thread t adds element t of A and B into C.
 57 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
 58 | {
 59 |     // only threadIdx is used, so every block would repeat the same elements
 60 |     const int t = threadIdx.x;
 61 |     if (t >= N) return;   // thread has no element to add
 62 |     C[t] = A[t] + B[t];
 63 | }
 64 | // Sums two 32-element vectors with one thread block and checks against the host.
 65 | int main(int argc, char **argv)
 66 | {
 67 |     printf("%s Starting...\n", argv[0]);
 68 | 
 69 |     // set up device
 70 |     int dev = 0;
 71 |     CHECK(cudaSetDevice(dev));
 72 | 
 73 |     // set up data size of vectors
 74 |     int nElem = 1 << 5;
 75 |     printf("Vector size %d\n", nElem);
 76 | 
 77 |     // malloc host memory
 78 |     size_t nBytes = nElem * sizeof(float);
 79 |     float *h_A, *h_B, *hostRef, *gpuRef;
 80 |     h_A     = (float *)malloc(nBytes);
 81 |     h_B     = (float *)malloc(nBytes);
 82 |     hostRef = (float *)malloc(nBytes);
 83 |     gpuRef  = (float *)malloc(nBytes);
 84 | 
 85 |     // initialize data at host side
 86 |     initialData(h_A, nElem);
 87 |     initialData(h_B, nElem);
 88 | 
 89 |     memset(hostRef, 0, nBytes);
 90 |     memset(gpuRef,  0, nBytes);
 91 | 
 92 |     // malloc device global memory
 93 |     float *d_A, *d_B, *d_C;
 94 |     CHECK(cudaMalloc((float**)&d_A, nBytes));
 95 |     CHECK(cudaMalloc((float**)&d_B, nBytes));
 96 |     CHECK(cudaMalloc((float**)&d_C, nBytes));
 97 | 
 98 |     // transfer data from host to device
 99 |     CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
100 |     CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
101 |     CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));
102 | 
103 |     // invoke kernel at host side
104 |     dim3 block (nElem);
105 |     dim3 grid  (1);
106 | 
107 |     sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
108 |     CHECK(cudaGetLastError());   // a rejected launch is otherwise silent
109 |     printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);
110 | 
111 |     // copy kernel result back to host side
112 |     CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
113 | 
114 |     // add vector at host side for result checks
115 |     sumArraysOnHost(h_A, h_B, hostRef, nElem);
116 | 
117 |     // check device results
118 |     checkResult(hostRef, gpuRef, nElem);
119 | 
120 |     // free device global memory
121 |     CHECK(cudaFree(d_A));
122 |     CHECK(cudaFree(d_B));
123 |     CHECK(cudaFree(d_C));
124 | 
125 |     // free host memory
126 |     free(h_A);
127 |     free(h_B);
128 |     free(hostRef);
129 |     free(gpuRef);
130 |     CHECK(cudaDeviceReset());
131 |     return(0);
132 | }
133 | 


--------------------------------------------------------------------------------
/chapter02/sumArraysOnGPU-timer.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates a simple vector sum on the GPU and on the host.
  7 |  * sumArraysOnGPU splits the work of the vector sum across CUDA threads on the
  8 |  * GPU, using a 1D grid of 1D thread blocks sized to cover the whole vector.
  9 |  * sumArraysOnHost sequentially iterates through vector elements on the host.
 10 |  * This version of sumArrays adds host timers to measure GPU and CPU
 11 |  * performance.
 12 |  */
 13 | 
 14 | // Reports the first of the N element pairs that differ by more than 1.0E-8,
 15 | // or prints "Arrays match." when no pair does.
 16 | void checkResult(float *hostRef, float *gpuRef, const int N)
 17 | {
 18 |     const double epsilon = 1.0E-8;
 19 |     for (int i = 0; i < N; i++)
 20 |     {
 21 |         // take the magnitude in floating point: an integer abs() overload
 22 |         // would truncate any difference below 1 to 0 and hide the mismatch
 23 |         double diff = (double)hostRef[i] - (double)gpuRef[i];
 24 |         if (diff < 0.0) diff = -diff;
 25 |         if (diff > epsilon)
 26 |         {
 27 |             printf("Arrays do not match!\n");
 28 |             printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
 29 |                    gpuRef[i], i);
 30 |             return;
 31 |         }
 32 |     }
 33 |     printf("Arrays match.\n\n");
 34 | }
 35 | 
 36 | // Fills ip[0..size) with pseudo-random values in [0, 25.5] in steps of 0.1.
 37 | void initialData(float *ip, int size)
 38 | {
 39 |     // seed only on the first call: reseeding from time() on every call gives
 40 |     // back-to-back calls the same seed, and therefore identical arrays
 41 |     static bool seeded = false;
 42 |     if (!seeded)
 43 |     {
 44 |         srand((unsigned) time(NULL));
 45 |         seeded = true;
 46 |     }
 47 |     for (int i = 0; i < size; i++) ip[i] = (float)( rand() & 0xFF ) / 10.0f;
 48 | }
 49 | 
 50 | // Host reference: stores A[k] + B[k] in C[k] for k = 0 .. N-1, front to back.
 51 | void sumArraysOnHost(float *A, float *B, float *C, const int N)
 52 | {
 53 |     // walk all three arrays in step; nothing runs when N <= 0
 54 |     for (int left = N; left > 0; left--)
 55 |         *C++ = *A++ + *B++;
 56 | }
 57 | // Kernel: the thread with global x index g adds element g; threads past N idle.
 58 | __global__ void sumArraysOnGPU(float *A, float *B, float *C, const int N)
 59 | {
 60 |     const int g = blockIdx.x * blockDim.x + threadIdx.x;
 61 |     if (g >= N) return;
 62 |     C[g] = A[g] + B[g];
 63 | }
 64 | // Times the host sum and the GPU kernel on a 2^24-element vector, then compares them.
 65 | int main(int argc, char **argv)
 66 | {
 67 |     printf("%s Starting...\n", argv[0]);
 68 | 
 69 |     // set up device
 70 |     int dev = 0;
 71 |     cudaDeviceProp deviceProp;
 72 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 73 |     printf("Using Device %d: %s\n", dev, deviceProp.name);
 74 |     CHECK(cudaSetDevice(dev));
 75 | 
 76 |     // set up data size of vectors
 77 |     int nElem = 1 << 24;
 78 |     printf("Vector size %d\n", nElem);
 79 | 
 80 |     // malloc host memory
 81 |     size_t nBytes = nElem * sizeof(float);
 82 |     float *h_A     = (float *)malloc(nBytes);
 83 |     float *h_B     = (float *)malloc(nBytes);
 84 |     float *hostRef = (float *)malloc(nBytes);
 85 |     float *gpuRef  = (float *)malloc(nBytes);
 86 |     // four 64 MiB buffers: stop here rather than write through a NULL pointer
 87 |     if (!h_A || !h_B || !hostRef || !gpuRef)
 88 |     {
 89 |         fprintf(stderr, "host memory allocation failed\n");
 90 |         exit(EXIT_FAILURE);
 91 |     }
 92 | 
 93 |     // initialize data at host side
 94 |     double iStart = seconds();
 95 |     initialData(h_A, nElem);
 96 |     initialData(h_B, nElem);
 97 |     double iElaps = seconds() - iStart;
 98 |     printf("initialData Time elapsed %f sec\n", iElaps);
 99 |     memset(hostRef, 0, nBytes);
100 |     memset(gpuRef,  0, nBytes);
101 | 
102 |     // add vector at host side for result checks
103 |     iStart = seconds();
104 |     sumArraysOnHost(h_A, h_B, hostRef, nElem);
105 |     iElaps = seconds() - iStart;
106 |     printf("sumArraysOnHost Time elapsed %f sec\n", iElaps);
107 | 
108 |     // malloc device global memory
109 |     float *d_A, *d_B, *d_C;
110 |     CHECK(cudaMalloc((float**)&d_A, nBytes));
111 |     CHECK(cudaMalloc((float**)&d_B, nBytes));
112 |     CHECK(cudaMalloc((float**)&d_C, nBytes));
113 | 
114 |     // transfer data from host to device
115 |     CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
116 |     CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
117 |     CHECK(cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice));
118 | 
119 |     // invoke kernel at host side
120 |     int iLen = 512;
121 |     dim3 block (iLen);
122 |     dim3 grid  ((nElem + block.x - 1) / block.x);
123 | 
124 |     iStart = seconds();
125 |     sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
126 |     CHECK(cudaGetLastError());        // launch-configuration errors
127 |     CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
128 |     iElaps = seconds() - iStart;
129 |     printf("sumArraysOnGPU <<<  %d, %d  >>>  Time elapsed %f sec\n", grid.x,
130 |            block.x, iElaps);
131 | 
132 |     // copy kernel result back to host side
133 |     CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
134 | 
135 |     // check device results
136 |     checkResult(hostRef, gpuRef, nElem);
137 | 
138 |     // free device global memory
139 |     CHECK(cudaFree(d_A));
140 |     CHECK(cudaFree(d_B));
141 |     CHECK(cudaFree(d_C));
142 |     // free host memory
143 |     free(h_A);
144 |     free(h_B);
145 |     free(hostRef);
146 |     free(gpuRef);
147 |     CHECK(cudaDeviceReset());   // reset device, as the other samples do
148 |     return(0);
149 | }
150 | 


--------------------------------------------------------------------------------
/chapter02/sumArraysOnHost.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <time.h>
 3 | 
 4 | /*
 5 |  * This example demonstrates a simple vector sum on the host. sumArraysOnHost
 6 |  * sequentially iterates through vector elements on the host.
 7 |  */
 8 | 
 9 | /* Host vector sum: stores A[k] + B[k] in C[k] for k = 0 .. N-1, walking the
10 |  * arrays front to back. */
11 | void sumArraysOnHost(float *A, float *B, float *C, const int N)
12 | {
13 |     // advance all three arrays in step; nothing runs when N <= 0
14 |     for (int left = N; left > 0; left--)
15 |         *C++ = *A++ + *B++;
16 | }
17 | 
18 | /* Fills ip[0..size) with pseudo-random values in [0, 25.5] in steps of 0.1. */
19 | void initialData(float *ip, int size)
20 | {
21 |     // seed only on the first call: reseeding from time() on every call gives
22 |     // back-to-back calls the same seed, and therefore identical arrays
23 |     static int seeded = 0;
24 |     if (!seeded)
25 |     {
26 |         srand((unsigned) time(NULL));
27 |         seeded = 1;
28 |     }
29 |     for (int i = 0; i < size; i++) ip[i] = (float)(rand() & 0xFF) / 10.0f;
30 | }
31 | 
32 | /* Allocates three host vectors, fills two of them and sums them into the third. */
33 | int main(int argc, char **argv)
34 | {
35 |     int nElem = 1024;
36 |     size_t nBytes = nElem * sizeof(float);
37 |     float *h_A = (float *)malloc(nBytes);
38 |     float *h_B = (float *)malloc(nBytes);
39 |     float *h_C = (float *)malloc(nBytes);
40 | 
41 |     // touch the buffers only when all three exist; free(NULL) below is a no-op
42 |     int allocated = (h_A != NULL && h_B != NULL && h_C != NULL);
43 |     if (allocated)
44 |     {
45 |         initialData(h_A, nElem);
46 |         initialData(h_B, nElem);
47 |         sumArraysOnHost(h_A, h_B, h_C, nElem);
48 |     }
49 |     free(h_A);
50 |     free(h_B);
51 |     free(h_C);
52 |     return allocated ? EXIT_SUCCESS : EXIT_FAILURE;
53 | }


--------------------------------------------------------------------------------
/chapter02/sumMatrixOnGPU-1D-grid-1D-block.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates a simple matrix sum on the GPU and on the host.
  7 |  * sumMatrixOnGPU1D splits the work of the matrix sum across CUDA threads on the
  8 |  * GPU. A 1D thread block and 1D grid are used. sumMatrixOnHost sequentially
  9 |  * iterates through matrix elements on the host.
 10 |  */
 11 | 
 12 | /*
 13 |  * Fills ip[0..size) with (rand() & 0xFF) / 10.0f, drawing from rand() in order.
 14 |  */
 15 | void initialData(float *ip, const int size)
 16 | {
 17 |     for (int left = size; left > 0; left--)
 18 |     {
 19 |         // low 8 bits of rand(), scaled to tenths
 20 |         *ip++ = (float)(rand() & 0xFF ) / 10.0f;
 21 |     }
 22 | }
 23 | 
 24 | /*
 25 |  * Host reference sum of two nx-by-ny matrices stored row by row:
 26 |  * C[iy * nx + ix] = A[iy * nx + ix] + B[iy * nx + ix].
 27 |  */
 28 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx,
 29 |                      const int ny)
 30 | {
 31 |     for (int row = 0; row < ny; row++)
 32 |     {
 33 |         // offset of the first element of this row
 34 |         const long long base = (long long)row * nx;
 35 | 
 36 |         const float *rowA = A + base;
 37 |         const float *rowB = B + base;
 38 |         float *rowC = C + base;
 39 | 
 40 |         for (int col = 0; col < nx; col++)
 41 |         {
 42 |             rowC[col] = rowA[col] + rowB[col];
 43 |         }
 44 |     }
 45 | }
 46 | 
 47 | 
 48 | // Prints "Arrays match." when all N pairs agree within 1.0E-8; otherwise prints
 49 | // the first differing pair followed by "Arrays do not match.".
 50 | void checkResult(float *hostRef, float *gpuRef, const int N)
 51 | {
 52 |     const double epsilon = 1.0E-8;
 53 |     for (int i = 0; i < N; i++)
 54 |     {
 55 |         // take the magnitude in floating point: an integer abs() overload
 56 |         // would truncate any difference below 1 to 0 and hide the mismatch
 57 |         double diff = (double)hostRef[i] - (double)gpuRef[i];
 58 |         if (diff < 0.0) diff = -diff;
 59 |         if (diff > epsilon)
 60 |         {
 61 |             printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
 62 |             printf("Arrays do not match.\n\n");
 63 |             return;
 64 |         }
 65 |     }
 66 |     printf("Arrays match.\n\n");
 67 | }
 68 | 
 69 | // grid 1D block 1D
 70 | // Kernel: thread ix (global x index) sums column ix of every row; ix >= nx idles.
 71 | __global__ void sumMatrixOnGPU1D(float *MatA, float *MatB, float *MatC, int nx,
 72 |                                  int ny)
 73 | {
 74 |     unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
 75 |     if (ix >= nx) return;
 76 | 
 77 |     for (int iy = 0; iy < ny; iy++)
 78 |     {
 79 |         // size_t offset: iy * nx + ix is not limited to the range of int
 80 |         size_t idx = (size_t)iy * nx + ix;
 81 |         MatC[idx] = MatA[idx] + MatB[idx];
 82 |     }
 83 | }
 84 | 
 85 | // Times a 2^14 x 2^14 matrix sum on the host and on the GPU (1D grid of 1D
 86 | // blocks) and compares the results.
 87 | int main(int argc, char **argv)
 88 | {
 89 |     printf("%s Starting...\n", argv[0]);
 90 | 
 91 |     // set up device
 92 |     int dev = 0;
 93 |     cudaDeviceProp deviceProp;
 94 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 95 |     printf("Using Device %d: %s\n", dev, deviceProp.name);
 96 |     CHECK(cudaSetDevice(dev));
 97 | 
 98 |     // set up data size of matrix
 99 |     int nx = 1 << 14;
100 |     int ny = 1 << 14;
101 |     int nxy = nx * ny;
102 |     size_t nBytes = (size_t)nxy * sizeof(float);   // byte counts kept in size_t
103 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
104 | 
105 |     // malloc host memory
106 |     float *h_A = (float *)malloc(nBytes);
107 |     float *h_B = (float *)malloc(nBytes);
108 |     float *hostRef = (float *)malloc(nBytes);
109 |     float *gpuRef = (float *)malloc(nBytes);
110 |     // each buffer is 1 GiB: stop here rather than write through a NULL pointer
111 |     if (!h_A || !h_B || !hostRef || !gpuRef)
112 |     {
113 |         fprintf(stderr, "host memory allocation failed\n");
114 |         exit(EXIT_FAILURE);
115 |     }
116 | 
117 |     // initialize data at host side
118 |     double iStart = seconds();
119 |     initialData(h_A, nxy);
120 |     initialData(h_B, nxy);
121 |     double iElaps = seconds() - iStart;
122 |     printf("initialize matrix elapsed %f sec\n", iElaps);
123 |     memset(hostRef, 0, nBytes);
124 |     memset(gpuRef, 0, nBytes);
125 | 
126 |     // add matrix at host side for result checks
127 |     iStart = seconds();
128 |     sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
129 |     iElaps = seconds() - iStart;
130 |     printf("sumMatrixOnHost elapsed %f sec\n", iElaps);
131 | 
132 |     // malloc device global memory
133 |     float *d_MatA, *d_MatB, *d_MatC;
134 |     CHECK(cudaMalloc((void **)&d_MatA, nBytes));
135 |     CHECK(cudaMalloc((void **)&d_MatB, nBytes));
136 |     CHECK(cudaMalloc((void **)&d_MatC, nBytes));
137 | 
138 |     // transfer data from host to device
139 |     CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
140 |     CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
141 | 
142 |     // invoke kernel at host side
143 |     int dimx = 32;
144 |     dim3 block(dimx, 1);
145 |     dim3 grid((nx + block.x - 1) / block.x, 1);
146 | 
147 |     iStart = seconds();
148 |     sumMatrixOnGPU1D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
149 |     CHECK(cudaGetLastError());        // launch-configuration errors
150 |     CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran
151 |     iElaps = seconds() - iStart;
152 |     printf("sumMatrixOnGPU1D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x,
153 |            grid.y, block.x, block.y, iElaps);
154 | 
155 |     // copy kernel result back to host side
156 |     CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
157 | 
158 |     // check device results
159 |     checkResult(hostRef, gpuRef, nxy);
160 | 
161 |     // free device global memory
162 |     CHECK(cudaFree(d_MatA));
163 |     CHECK(cudaFree(d_MatB));
164 |     CHECK(cudaFree(d_MatC));
165 | 
166 |     // free host memory
167 |     free(h_A);
168 |     free(h_B);
169 |     free(hostRef);
170 |     free(gpuRef);
171 |     // reset device
172 |     CHECK(cudaDeviceReset());
173 |     return (0);
174 | }
175 | 


--------------------------------------------------------------------------------
/chapter02/sumMatrixOnGPU-2D-grid-1D-block.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates a simple matrix sum on the GPU and on the host.
  7 |  * sumMatrixOnGPUMix splits the work of the matrix sum across CUDA threads on
  8 |  * the GPU, using a 1D thread block and a 2D grid. sumMatrixOnHost
  9 |  * sequentially iterates through the matrix elements on the host.
 10 |  */
 11 | 
 12 | // Fill ip[0..size-1] with pseudo-random values: the low 8 bits of rand()
 13 | // divided by 10, so every element lies in [0.0, 25.5].
 14 | void initialData(float *ip, const int size)
 15 | {
 16 |     float *out = ip;
 17 | 
 18 |     for (int left = size; left > 0; --left)
 19 |     {
 20 |         *out++ = (float)(rand() & 0xFF) / 10.0f;
 21 |     }
 22 | }
 23 | 
 24 | // Host reference: C = A + B element by element for ny rows of nx floats each,
 25 | // stored contiguously so that one row follows the previous one.
 26 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx,
 27 |                      const int ny)
 28 | {
 29 |     const float *rowA = A;
 30 |     const float *rowB = B;
 31 |     float *rowC = C;
 32 | 
 33 |     for (int row = 0; row < ny; ++row)
 34 |     {
 35 |         // add one row, then slide all three cursors to the next row
 36 |         for (int col = 0; col < nx; ++col)
 37 |         {
 38 |             rowC[col] = rowA[col] + rowB[col];
 39 |         }
 40 | 
 41 |         rowA += nx;
 42 |         rowB += nx;
 43 |         rowC += nx;
 44 |     }
 45 | }
 46 | 
 47 | 
 48 | // Compare the first N elements of hostRef and gpuRef. Prints the first pair
 49 | // whose absolute difference exceeds 1.0E-8, then a match / no-match verdict.
 50 | void checkResult(float *hostRef, float *gpuRef, const int N)
 51 | {
 52 |     const double epsilon = 1.0E-8;
 53 |     int i = 0;
 54 | 
 55 |     // advance to the first mismatching element (or to N if there is none)
 56 |     while (i < N && !(abs(hostRef[i] - gpuRef[i]) > epsilon)) i++;
 57 | 
 58 |     if (i >= N)
 59 |     {
 60 |         printf("Arrays match.\n\n");
 61 |         return;
 62 |     }
 63 | 
 64 |     // report the offending pair before the verdict
 65 |     printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
 66 |     printf("Arrays do not match.\n\n");
 67 | }
 68 | 
 69 | // Element-wise MatC = MatA + MatB. Launch with a 2D grid of 1D blocks: the x
 70 | // threads cover the columns of a row and blockIdx.y selects the row.
 71 | __global__ void sumMatrixOnGPUMix(float *MatA, float *MatB, float *MatC, int nx,
 72 |                                   int ny)
 73 | {
 74 |     const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
 75 |     const unsigned int row = blockIdx.y;
 76 |     if (col >= nx || row >= ny) return;   // thread falls outside the matrix
 77 |     const unsigned int offset = row * nx + col;   // linear element index
 78 |     MatC[offset] = MatA[offset] + MatB[offset];
 79 | }
 80 | 
 81 | // Driver: sums two 16384x16384 float matrices on host and GPU and compares.
 82 | int main(int argc, char **argv)
 83 | {
 84 |     printf("%s Starting...\n", argv[0]);
 85 | 
 86 |     // set up device
 87 |     int dev = 0;
 88 |     cudaDeviceProp deviceProp;
 89 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 90 |     printf("Using Device %d: %s\n", dev, deviceProp.name);
 91 |     CHECK(cudaSetDevice(dev));
 92 | 
 93 |     // set up data size of matrix
 94 |     int nx = 1 << 14;
 95 |     int ny = 1 << 14;
 96 |     int nxy = nx * ny;
 97 |     int nBytes = nxy * sizeof(float);
 98 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
 99 | 
100 |     // malloc host memory; each buffer is nBytes, so verify every request
101 |     float *h_A = (float *)malloc(nBytes);
102 |     float *h_B = (float *)malloc(nBytes);
103 |     float *hostRef = (float *)malloc(nBytes);
104 |     float *gpuRef = (float *)malloc(nBytes);
105 | 
106 |     if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
107 |     {
108 |         fprintf(stderr, "host malloc of %d bytes failed\n", nBytes);
109 |         return (1);
110 |     }
111 | 
112 |     // initialize data at host side
113 |     double iStart = seconds();
114 |     initialData(h_A, nxy);
115 |     initialData(h_B, nxy);
116 |     double iElaps = seconds() - iStart;
117 |     printf("Matrix initialization elapsed %f sec\n", iElaps);
118 | 
119 |     memset(hostRef, 0, nBytes);
120 |     memset(gpuRef, 0, nBytes);
121 | 
122 |     // add matrix at host side for result checks
123 |     iStart = seconds();
124 |     sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
125 |     iElaps = seconds() - iStart;
126 |     printf("sumMatrixOnHost elapsed %f sec\n", iElaps);
127 | 
128 |     // malloc device global memory
129 |     float *d_MatA, *d_MatB, *d_MatC;
130 |     CHECK(cudaMalloc((void **)&d_MatA, nBytes));
131 |     CHECK(cudaMalloc((void **)&d_MatB, nBytes));
132 |     CHECK(cudaMalloc((void **)&d_MatC, nBytes));
133 | 
134 |     // transfer data from host to device
135 |     CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
136 |     CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
137 | 
138 |     // invoke kernel at host side: one row of blocks per matrix row (grid.y = ny)
139 |     int dimx = 32;
140 |     dim3 block(dimx, 1);
141 |     dim3 grid((nx + block.x - 1) / block.x, ny);
142 | 
143 |     iStart = seconds();
144 |     sumMatrixOnGPUMix<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
145 |     CHECK(cudaDeviceSynchronize());
146 |     iElaps = seconds() - iStart;
147 |     printf("sumMatrixOnGPUMix <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x,
148 |            grid.y, block.x, block.y, iElaps);
149 |     // check kernel error
150 |     CHECK(cudaGetLastError());
151 | 
152 |     // copy kernel result back to host side and check it against the host sum
153 |     CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
154 |     checkResult(hostRef, gpuRef, nxy);
155 | 
156 |     // free device global memory, then host memory
157 |     CHECK(cudaFree(d_MatA));
158 |     CHECK(cudaFree(d_MatB));
159 |     CHECK(cudaFree(d_MatC));
160 |     free(h_A);
161 |     free(h_B);
162 |     free(hostRef);
163 |     free(gpuRef);
164 | 
165 |     // reset device
166 |     CHECK(cudaDeviceReset());
167 | 
168 |     return (0);
169 | }
170 | 


--------------------------------------------------------------------------------
/chapter02/sumMatrixOnGPU-2D-grid-2D-block.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates a simple matrix sum on the GPU and on the host.
  7 |  * sumMatrixOnGPU2D splits the work of the matrix sum across CUDA threads on
  8 |  * the GPU, using a 2D thread block and a 2D grid. sumMatrixOnHost
  9 |  * sequentially iterates through the matrix elements on the host.
 10 |  */
 11 | 
 12 | // Fill ip[0..size-1] with pseudo-random values: the low 8 bits of rand()
 13 | // divided by 10, so every element lies in [0.0, 25.5].
 14 | void initialData(float *ip, const int size)
 15 | {
 16 |     float *out = ip;
 17 | 
 18 |     for (int left = size; left > 0; --left)
 19 |     {
 20 |         *out++ = (float)(rand() & 0xFF) / 10.0f;
 21 |     }
 22 | }
 23 | 
 24 | // Host reference: C = A + B element by element for ny rows of nx floats each,
 25 | // stored contiguously so that one row follows the previous one.
 26 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx,
 27 |                      const int ny)
 28 | {
 29 |     const float *rowA = A;
 30 |     const float *rowB = B;
 31 |     float *rowC = C;
 32 | 
 33 |     for (int row = 0; row < ny; ++row)
 34 |     {
 35 |         // add one row, then slide all three cursors to the next row
 36 |         for (int col = 0; col < nx; ++col)
 37 |         {
 38 |             rowC[col] = rowA[col] + rowB[col];
 39 |         }
 40 | 
 41 |         rowA += nx;
 42 |         rowB += nx;
 43 |         rowC += nx;
 44 |     }
 45 | }
 46 | 
 47 | 
 48 | // Compare the first N elements of hostRef and gpuRef. Prints the first pair
 49 | // whose absolute difference exceeds 1.0E-8, then a match / no-match verdict.
 50 | void checkResult(float *hostRef, float *gpuRef, const int N)
 51 | {
 52 |     const double epsilon = 1.0E-8;
 53 |     int i = 0;
 54 | 
 55 |     // advance to the first mismatching element (or to N if there is none)
 56 |     while (i < N && !(abs(hostRef[i] - gpuRef[i]) > epsilon)) i++;
 57 | 
 58 |     if (i >= N)
 59 |     {
 60 |         printf("Arrays match.\n\n");
 61 |         return;
 62 |     }
 63 | 
 64 |     // report the offending pair before the verdict
 65 |     printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
 66 |     printf("Arrays do not match.\n\n");
 67 | }
 68 | 
 69 | // Element-wise MatC = MatA + MatB. Launch with a 2D grid of 2D blocks: x
 70 | // indexes the column and y indexes the row of the element a thread handles.
 71 | __global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx,
 72 |                                  int ny)
 73 | {
 74 |     const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
 75 |     const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
 76 |     if (col >= nx || row >= ny) return;   // thread falls outside the matrix
 77 |     const unsigned int offset = row * nx + col;   // linear element index
 78 |     MatC[offset] = MatA[offset] + MatB[offset];
 79 | }
 80 | 
 81 | // Driver: sums two 16384x16384 float matrices on host and GPU and compares.
 82 | int main(int argc, char **argv)
 83 | {
 84 |     printf("%s Starting...\n", argv[0]);
 85 | 
 86 |     // set up device
 87 |     int dev = 0;
 88 |     cudaDeviceProp deviceProp;
 89 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 90 |     printf("Using Device %d: %s\n", dev, deviceProp.name);
 91 |     CHECK(cudaSetDevice(dev));
 92 | 
 93 |     // set up data size of matrix
 94 |     int nx = 1 << 14;
 95 |     int ny = 1 << 14;
 96 |     int nxy = nx * ny;
 97 |     int nBytes = nxy * sizeof(float);
 98 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
 99 | 
100 |     // malloc host memory; each buffer is nBytes, so verify every request
101 |     float *h_A = (float *)malloc(nBytes);
102 |     float *h_B = (float *)malloc(nBytes);
103 |     float *hostRef = (float *)malloc(nBytes);
104 |     float *gpuRef = (float *)malloc(nBytes);
105 | 
106 |     if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
107 |     {
108 |         fprintf(stderr, "host malloc of %d bytes failed\n", nBytes);
109 |         return (1);
110 |     }
111 | 
112 |     // initialize data at host side
113 |     double iStart = seconds();
114 |     initialData(h_A, nxy);
115 |     initialData(h_B, nxy);
116 |     double iElaps = seconds() - iStart;
117 |     printf("Matrix initialization elapsed %f sec\n", iElaps);
118 | 
119 |     memset(hostRef, 0, nBytes);
120 |     memset(gpuRef, 0, nBytes);
121 | 
122 |     // add matrix at host side for result checks
123 |     iStart = seconds();
124 |     sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
125 |     iElaps = seconds() - iStart;
126 |     printf("sumMatrixOnHost elapsed %f sec\n", iElaps);
127 | 
128 |     // malloc device global memory
129 |     float *d_MatA, *d_MatB, *d_MatC;
130 |     CHECK(cudaMalloc((void **)&d_MatA, nBytes));
131 |     CHECK(cudaMalloc((void **)&d_MatB, nBytes));
132 |     CHECK(cudaMalloc((void **)&d_MatC, nBytes));
133 | 
134 |     // transfer data from host to device
135 |     CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
136 |     CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
137 | 
138 |     // invoke kernel at host side: ceil-divide the matrix into 32x32 blocks
139 |     int dimx = 32;
140 |     int dimy = 32;
141 |     dim3 block(dimx, dimy);
142 |     dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
143 | 
144 |     iStart = seconds();
145 |     sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
146 |     CHECK(cudaDeviceSynchronize());
147 |     iElaps = seconds() - iStart;
148 |     printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f sec\n", grid.x,
149 |            grid.y, block.x, block.y, iElaps);
150 |     // check kernel error
151 |     CHECK(cudaGetLastError());
152 | 
153 |     // copy kernel result back to host side and check it against the host sum
154 |     CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
155 |     checkResult(hostRef, gpuRef, nxy);
156 | 
157 |     // free device global memory, then host memory
158 |     CHECK(cudaFree(d_MatA));
159 |     CHECK(cudaFree(d_MatB));
160 |     CHECK(cudaFree(d_MatC));
161 |     free(h_A);
162 |     free(h_B);
163 |     free(hostRef);
164 |     free(gpuRef);
165 | 
166 |     // reset device
167 |     CHECK(cudaDeviceReset());
168 | 
169 |     return (0);
170 | }
171 | 


--------------------------------------------------------------------------------
/chapter03/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=nestedHelloWorld nestedReduce nestedReduce2 nestedReduceNosync \
 2 | 	    reduceInteger simpleDeviceQuery simpleDivergence sumMatrix
 3 | C_APPS=
 4 | .PHONY: all clean
 5 | all: ${C_APPS} ${CU_APPS}
 6 | # all/clean are phony so the match-anything rules below never apply to them
 7 | %: %.cu
 8 | 	nvcc -O2 -arch=sm_35 -o $@ $< -lcudadevrt --relocatable-device-code true
 9 | %: %.c
10 | 	gcc -O2 -std=c99 -o $@ $<
11 | clean:
12 | 	rm -f ${CU_APPS} ${C_APPS}
13 | 


--------------------------------------------------------------------------------
/chapter03/nestedHelloWorld.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | #include <cuda_runtime.h>
 4 | 
 5 | /*
 6 |  * A simple example of nested kernel launches from the GPU. Each thread displays
 7 |  * its information when execution begins, and also diagnostics when the next
 8 |  * lowest nesting layer completes.
 9 |  */
10 | 
11 | // Every thread prints a greeting; thread 0 of each block then launches a child
12 | // grid of one block with half as many threads, until the size reaches 1.
13 | __global__ void nestedHelloWorld(int const iSize, int iDepth)
14 | {
15 |     const int lane = threadIdx.x;
16 |     printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, lane,
17 |            blockIdx.x);
18 | 
19 |     // halve the block size for the next nesting level
20 |     const int childThreads = iSize >> 1;
21 | 
22 |     // only thread 0 recurses, and only while a non-empty child block remains
23 |     if (iSize == 1 || lane != 0 || childThreads <= 0) return;
24 | 
25 |     // the child is launched with depth + 1, the same value reported below
26 |     const int childDepth = iDepth + 1;
27 |     nestedHelloWorld<<<1, childThreads>>>(childThreads, childDepth);
28 |     printf("-------> nested execution depth: %d\n", childDepth);
29 | }
30 | 
31 | // Launches nestedHelloWorld with 8-thread blocks; argv[1] sets the grid size.
32 | int main(int argc, char **argv)
33 | {
34 |     int blocksize = 8;   // initial block size
35 |     int igrid = 1;       // number of parent blocks
36 | 
37 |     if(argc > 1) igrid = atoi(argv[1]);
38 | 
39 |     // a launch with no blocks is an invalid configuration: fall back to one
40 |     if(igrid < 1) igrid = 1;
41 | 
42 |     int size = igrid * blocksize;
43 |     dim3 block (blocksize, 1);
44 |     dim3 grid  ((size + block.x - 1) / block.x, 1);
45 |     printf("%s Execution Configuration: grid %d block %d\n", argv[0], grid.x,
46 |            block.x);
47 | 
48 |     nestedHelloWorld<<<grid, block>>>(block.x, 0);
49 |     CHECK(cudaGetLastError());        // launch-configuration errors
50 |     CHECK(cudaDeviceSynchronize());   // errors raised while the grids execute
51 |     CHECK(cudaDeviceReset());
52 |     return 0;
53 | }
54 | 


--------------------------------------------------------------------------------
/chapter03/nestedReduce.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <cuda_runtime.h>
  4 | #define LOG 0
  5 | 
  6 | /*
  7 |  * An implementation of parallel reduction using nested kernel launches from
  8 |  * CUDA kernels.
  9 |  */
 10 | 
 11 | // Recursive interleaved-pair sum of data[0..size-1], reduced in place (data is
 12 | // overwritten). Returns 0 for size <= 0; for odd sizes the unpaired last
 13 | // element is folded into data[0] so that no input is dropped.
 14 | int cpuRecursiveReduce(int *data, int const size)
 15 | {
 16 |     // stop condition (also ends the recursion for empty or negative sizes)
 17 |     if (size <= 1) return (size == 1) ? data[0] : 0;
 18 | 
 19 |     // renew the stride
 20 |     int const stride = size / 2;
 21 | 
 22 |     // in-place reduction
 23 |     for (int i = 0; i < stride; i++) data[i] += data[i + stride];
 24 |     if (size % 2 != 0) data[0] += data[size - 1];
 25 | 
 26 |     // call recursively
 27 |     return cpuRecursiveReduce(data, stride);
 28 | }
 29 | 
 30 | // Neighbored Pair Implementation with divergence. Each block reduces its own
 31 | // blockDim.x-element slice of g_idata in place and writes the partial sum to
 32 | // g_odata[blockIdx.x]; elements at or beyond index n are never read.
 33 | __global__ void reduceNeighbored (int *g_idata, int *g_odata, unsigned int n)
 34 | {
 35 |     unsigned int tid = threadIdx.x;
 36 |     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
 37 | 
 38 |     // convert global data pointer to the local pointer of this block
 39 |     int *idata = g_idata + blockIdx.x * blockDim.x;
 40 | 
 41 |     // in-place reduction in global memory
 42 |     for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
 43 |     {
 44 |         // bounds are tested here rather than by an early return so that every
 45 |         // thread of the block still reaches the barrier below
 46 |         if ((tid % (2 * stride)) == 0 && idx + stride < n)
 47 |         {
 48 |             idata[tid] += idata[tid + stride];
 49 |         }
 50 | 
 51 |         // synchronize within threadblock
 52 |         __syncthreads();
 53 |     }
 54 | 
 55 |     // write result for this block to global mem
 56 |     if (tid == 0 && idx < n) g_odata[blockIdx.x] = idata[0];
 57 | }
 58 | 
 59 | __global__ void gpuRecursiveReduce (int *g_idata, int *g_odata,
 60 |                                     unsigned int isize)
 61 | {
 62 |     // Nested-launch reduction of this block's isize-element slice of g_idata.
 63 |     unsigned int tid = threadIdx.x;
 64 |     // NOTE(review): presumably isize is a power of two -- confirm with callers
 65 |     // convert global data pointer to the local pointer of this block
 66 |     int *idata = g_idata + blockIdx.x * blockDim.x;
 67 |     int *odata = &g_odata[blockIdx.x];
 68 | 
 69 |     // stop condition: two elements left, thread 0 writes their sum and exits
 70 |     if (isize == 2 && tid == 0)
 71 |     {
 72 |         g_odata[blockIdx.x] = idata[0] + idata[1];
 73 |         return;
 74 |     }
 75 | 
 76 |     // half of the current size: the stride here and the size of the child grid
 77 |     int istride = isize >> 1;
 78 |     // fold the upper half onto the lower half (skipped when istride is 1)
 79 |     if(istride > 1 && tid < istride)
 80 |     {
 81 |         // in place reduction
 82 |         idata[tid] += idata[tid + istride];
 83 |     }
 84 | 
 85 |     // sync at block level so the folded values are complete before the launch
 86 |     __syncthreads();
 87 | 
 88 |     // nested invocation: thread 0 launches one child block of istride threads
 89 |     if(tid == 0)
 90 |     {
 91 |         gpuRecursiveReduce<<<1, istride>>>(idata, odata, istride);
 92 |         // sync all child grids launched in this block
 93 |         // NOTE(review): return value ignored; confirm device-side sync support
 94 |         cudaDeviceSynchronize();
 95 |     }
 96 | 
 97 |     // sync at block level again
 98 |     __syncthreads();
 99 | }
100 | 
101 | // Runs the CPU, neighbored-pair and nested GPU reductions on an all-ones array
102 | int main(int argc, char **argv)
103 | {
104 |     // set up device
105 |     int dev = 0, gpu_sum;
106 |     cudaDeviceProp deviceProp;
107 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
108 |     printf("%s starting reduction at ", argv[0]);
109 |     printf("device %d: %s ", dev, deviceProp.name);
110 |     CHECK(cudaSetDevice(dev));
111 | 
112 |     bool bResult = false;
113 | 
114 |     // set up execution configuration
115 |     int nblock  = 2048;
116 |     int nthread = 512;   // initial block size
117 | 
118 |     if(argc > 1)
119 |     {
120 |         nblock = atoi(argv[1]);   // grid size from command line argument
121 |     }
122 | 
123 |     if(argc > 2)
124 |     {
125 |         nthread = atoi(argv[2]);   // block size from command line argument
126 |     }
127 | 
128 |     int size = nblock * nthread; // total number of elements to reduce
129 | 
130 |     dim3 block (nthread, 1);
131 |     dim3 grid  ((size + block.x - 1) / block.x, 1);
132 |     printf("array %d grid %d block %d\n", size, grid.x, block.x);
133 | 
134 |     // allocate host memory
135 |     size_t bytes = size * sizeof(int);
136 |     int *h_idata = (int *) malloc(bytes);
137 |     int *h_odata = (int *) malloc(grid.x * sizeof(int));
138 |     int *tmp     = (int *) malloc(bytes);
139 | 
140 |     // initialize the array
141 |     for (int i = 0; i < size; i++)
142 |     {
143 |         h_idata[i] = (int)( rand() & 0xFF );
144 |         h_idata[i] = 1;   // overrides the random value: every element is 1
145 |     }
146 | 
147 |     memcpy (tmp, h_idata, bytes);
148 | 
149 |     // allocate device memory
150 |     int *d_idata = NULL;
151 |     int *d_odata = NULL;
152 |     CHECK(cudaMalloc((void **) &d_idata, bytes));
153 |     CHECK(cudaMalloc((void **) &d_odata, grid.x * sizeof(int)));
154 | 
155 |     double iStart, iElaps;
156 | 
157 |     // cpu recursive reduction (works on the tmp copy, which it overwrites)
158 |     iStart = seconds();
159 |     int cpu_sum = cpuRecursiveReduce (tmp, size);
160 |     iElaps = seconds() - iStart;
161 |     printf("cpu reduce\t\telapsed %f sec cpu_sum: %d\n", iElaps, cpu_sum);
162 | 
163 |     // gpu reduceNeighbored
164 |     CHECK(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
165 |     iStart = seconds();
166 |     reduceNeighbored<<<grid, block>>>(d_idata, d_odata, size);
167 |     CHECK(cudaDeviceSynchronize());
168 |     CHECK(cudaGetLastError());
169 |     iElaps = seconds() - iStart;
170 |     CHECK(cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),
171 |                      cudaMemcpyDeviceToHost));
172 |     gpu_sum = 0;
173 | 
174 |     for (unsigned int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];
175 | 
176 |     printf("gpu Neighbored\t\telapsed %f sec gpu_sum: %d <<<grid %d block "
177 |            "%d>>>\n", iElaps, gpu_sum, grid.x, block.x);
178 | 
179 |     // gpu nested reduce kernel (input re-copied: the kernels reduce in place)
180 |     CHECK(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
181 |     iStart = seconds();
182 |     gpuRecursiveReduce<<<grid, block>>>(d_idata, d_odata, block.x);
183 |     CHECK(cudaDeviceSynchronize());
184 |     CHECK(cudaGetLastError());
185 |     iElaps = seconds() - iStart;
186 |     CHECK(cudaMemcpy(h_odata, d_odata, grid.x * sizeof(int),
187 |                      cudaMemcpyDeviceToHost));
188 |     gpu_sum = 0;
189 | 
190 |     for (unsigned int i = 0; i < grid.x; i++) gpu_sum += h_odata[i];
191 | 
192 |     printf("gpu nested\t\telapsed %f sec gpu_sum: %d <<<grid %d block %d>>>\n",
193 |            iElaps, gpu_sum, grid.x, block.x);
194 | 
195 |     // free host memory, including the scratch copy used by the CPU reduction
196 |     free(h_idata);
197 |     free(h_odata);
198 |     free(tmp);
199 | 
200 |     // free device memory
201 |     CHECK(cudaFree(d_idata));
202 |     CHECK(cudaFree(d_odata));
203 | 
204 |     // reset device
205 |     CHECK(cudaDeviceReset());
206 | 
207 |     // check the results (gpu_sum holds the nested kernel's total at this point)
208 |     bResult = (gpu_sum == cpu_sum);
209 |     if(!bResult) printf("Test failed!\n");
210 | 
211 |     return EXIT_SUCCESS;
212 | }
213 | 


--------------------------------------------------------------------------------
/chapter03/simpleDeviceQuery.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | #include <cuda_runtime.h>
 4 | 
 5 | /*
 6 |  * Fetches basic information on the first device in the current CUDA platform,
 7 |  * including number of SMs, bytes of constant memory, bytes of shared memory per
 8 |  * block, etc.
 9 |  */
10 | 
11 | int main(int argc, char *argv[])
12 | {
13 |     int iDev = 0;
14 |     cudaDeviceProp iProp;
15 |     CHECK(cudaGetDeviceProperties(&iProp, iDev));
16 |     // report selected properties; warps per SM uses the reported warp size
17 |     printf("Device %d: %s\n", iDev, iProp.name);
18 |     printf("  Number of multiprocessors:                     %d\n",
19 |            iProp.multiProcessorCount);
20 |     printf("  Total amount of constant memory:               %4.2f KB\n",
21 |            iProp.totalConstMem / 1024.0);
22 |     printf("  Total amount of shared memory per block:       %4.2f KB\n",
23 |            iProp.sharedMemPerBlock / 1024.0);
24 |     printf("  Total number of registers available per block: %d\n",
25 |            iProp.regsPerBlock);
26 |     printf("  Warp size:                                     %d\n",
27 |            iProp.warpSize);
28 |     printf("  Maximum number of threads per block:           %d\n",
29 |            iProp.maxThreadsPerBlock);
30 |     printf("  Maximum number of threads per multiprocessor:  %d\n",
31 |            iProp.maxThreadsPerMultiProcessor);
32 |     printf("  Maximum number of warps per multiprocessor:    %d\n",
33 |            iProp.maxThreadsPerMultiProcessor / iProp.warpSize);
34 |     return EXIT_SUCCESS;
35 | }
36 | 


--------------------------------------------------------------------------------
/chapter03/simpleDivergence.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * simpleDivergence demonstrates divergent code on the GPU and its impact on
  7 |  * performance and CUDA metrics.
  8 |  */
  9 | 
 10 | // Even global thread IDs store 100.0f in c, odd ones store 200.0f.
 11 | __global__ void mathKernel1(float *c)
 12 | {
 13 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 14 |     float evenPart = 0.0f;
 15 |     float oddPart = 0.0f;
 16 |     if (gid % 2 != 0)
 17 |     {
 18 |         oddPart = 200.0f;
 19 |     }
 20 |     else
 21 |     {
 22 |         evenPart = 100.0f;
 23 |     }
 24 | 
 25 |     c[gid] = evenPart + oddPart;
 26 | }
 27 | 
 28 | // Threads whose (tid / warpSize) is even store 100.0f; the others 200.0f.
 29 | __global__ void mathKernel2(float *c)
 30 | {
 31 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 32 |     float evenPart = 0.0f;
 33 |     float oddPart = 0.0f;
 34 |     if ((gid / warpSize) % 2 != 0)
 35 |     {
 36 |         oddPart = 200.0f;
 37 |     }
 38 |     else
 39 |     {
 40 |         evenPart = 100.0f;
 41 |     }
 42 | 
 43 |     c[gid] = evenPart + oddPart;
 44 | }
 45 | 
 46 | // Even/odd test on the global thread ID, kept in a predicate and applied
 47 | // through two separate if statements instead of one if/else.
 48 | __global__ void mathKernel3(float *c)
 49 | {
 50 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 51 |     const bool isOdd = (gid % 2 != 0);
 52 |     float evenPart = 0.0f;
 53 |     float oddPart = 0.0f;
 54 | 
 55 |     if (!isOdd)
 56 |     {
 57 |         evenPart = 100.0f;
 58 |     }
 59 |     if (isOdd)
 60 |     {
 61 |         oddPart = 200.0f;
 62 |     }
 63 | 
 64 |     c[gid] = evenPart + oddPart;
 65 | }
 66 | 
 67 | // Threads whose (tid >> 5) is even store 100.0f in c; the others 200.0f.
 68 | __global__ void mathKernel4(float *c)
 69 | {
 70 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 71 |     float ia = 0.0f, ib = 0.0f;
 72 |     int itid = tid >> 5;
 73 | 
 74 |     // parenthesized on purpose: == binds tighter than &, so the bare form
 75 |     // evaluated itid & (0x01 == 0), which is always 0 (else branch only)
 76 |     if ((itid & 0x01) == 0)
 77 |     {
 78 |         ia = 100.0f;
 79 |     }
 80 |     else
 81 |     {
 82 |         ib = 200.0f;
 83 |     }
 84 |     c[tid] = ia + ib;
 85 | }
 86 | 
 87 | // Warm-up kernel: stores 100.0f where (tid / warpSize) is even, else 200.0f.
 88 | __global__ void warmingup(float *c)
 89 | {
 90 |     const int gid = blockIdx.x * blockDim.x + threadIdx.x;
 91 |     float evenPart = 0.0f;
 92 |     float oddPart = 0.0f;
 93 |     if ((gid / warpSize) % 2 != 0)
 94 |     {
 95 |         oddPart = 200.0f;
 96 |     }
 97 |     else
 98 |     {
 99 |         evenPart = 100.0f;
100 |     }
101 | 
102 |     c[gid] = evenPart + oddPart;
103 | }
104 | 
105 | 
106 | // Times the warm-up kernel and mathKernel1..4; argv[1]=block size, argv[2]=size
107 | int main(int argc, char **argv)
108 | {
109 |     // set up device
110 |     int dev = 0;
111 |     cudaDeviceProp deviceProp;
112 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
113 |     printf("%s using Device %d: %s\n", argv[0], dev, deviceProp.name);
114 | 
115 |     // set up data size
116 |     int size = 64;
117 |     int blocksize = 64;
118 | 
119 |     if(argc > 1) blocksize = atoi(argv[1]);
120 |     if(argc > 2) size      = atoi(argv[2]);
121 | 
122 |     printf("Data size %d ", size);
123 | 
124 |     // set up execution configuration
125 |     dim3 block (blocksize, 1);
126 |     dim3 grid  ((size + block.x - 1) / block.x, 1);
127 |     printf("Execution Configure (block %d grid %d)\n", block.x, grid.x);
128 | 
129 |     // allocate one float per launched thread: the kernels do no bounds check
130 |     float *d_C;
131 |     size_t nBytes = (size_t)grid.x * block.x * sizeof(float);
132 |     CHECK(cudaMalloc((float**)&d_C, nBytes));
133 | 
134 |     // run a warmup kernel to remove overhead; keep timings in double precision
135 |     double iStart, iElaps;
136 |     CHECK(cudaDeviceSynchronize());
137 |     iStart = seconds();
138 |     warmingup<<<grid, block>>>(d_C);
139 |     CHECK(cudaDeviceSynchronize());
140 |     iElaps = seconds() - iStart;
141 |     printf("warmup      <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x,
142 |            iElaps );
143 |     CHECK(cudaGetLastError());
144 | 
145 |     // run kernel 1
146 |     iStart = seconds();
147 |     mathKernel1<<<grid, block>>>(d_C);
148 |     CHECK(cudaDeviceSynchronize());
149 |     iElaps = seconds() - iStart;
150 |     printf("mathKernel1 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x,
151 |            iElaps );
152 |     CHECK(cudaGetLastError());
153 | 
154 |     // run kernel 2
155 |     iStart = seconds();
156 |     mathKernel2<<<grid, block>>>(d_C);
157 |     CHECK(cudaDeviceSynchronize());
158 |     iElaps = seconds() - iStart;
159 |     printf("mathKernel2 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x,
160 |            iElaps );
161 |     CHECK(cudaGetLastError());
162 | 
163 |     // run kernel 3
164 |     iStart = seconds();
165 |     mathKernel3<<<grid, block>>>(d_C);
166 |     CHECK(cudaDeviceSynchronize());
167 |     iElaps = seconds() - iStart;
168 |     printf("mathKernel3 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x,
169 |            iElaps);
170 |     CHECK(cudaGetLastError());
171 | 
172 |     // run kernel 4
173 |     iStart = seconds();
174 |     mathKernel4<<<grid, block>>>(d_C);
175 |     CHECK(cudaDeviceSynchronize());
176 |     iElaps = seconds() - iStart;
177 |     printf("mathKernel4 <<< %4d %4d >>> elapsed %f sec \n", grid.x, block.x,
178 |            iElaps);
179 |     CHECK(cudaGetLastError());
180 | 
181 |     // free gpu memory and reset device
182 |     CHECK(cudaFree(d_C));
183 |     CHECK(cudaDeviceReset());
184 |     return EXIT_SUCCESS;
185 | }
186 | 


--------------------------------------------------------------------------------
/chapter03/sumMatrix.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example implements matrix element-wise addition on the host and GPU.
  7 |  * sumMatrixOnHost iterates over the rows and columns of each matrix, adding
  8 |  * elements from A and B together and storing the results in C. The current
  9 |  * offset in each matrix is stored using pointer arithmetic. sumMatrixOnGPU2D
 10 |  * implements the same logic, but using CUDA threads to process each matrix.
 11 |  */
 12 | 
 13 | // Fill ip[0..size-1] with (rand() & 0xFF) / 10, i.e. values in [0.0, 25.5].
 14 | void initialData(float *ip, const int size)
 15 | {
 16 |     float *out = ip;
 17 |     for (int left = size; left > 0; --left)
 18 |     {
 19 |         *out++ = (float)( rand() & 0xFF ) / 10.0f;
 20 |     }
 21 | }
 22 | 
 23 | // Host reference: C = A + B element by element for ny rows of nx floats each,
 24 | // advancing one row-sized step through all three arrays per outer iteration.
 25 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
 26 | {
 27 |     const float *rowA = A;
 28 |     const float *rowB = B;
 29 |     float *rowC = C;
 30 | 
 31 |     for (int row = 0; row < ny; ++row)
 32 |     {
 33 |         for (int col = 0; col < nx; ++col)
 34 |         {
 35 |             rowC[col] = rowA[col] + rowB[col];
 36 |         }
 37 | 
 38 |         rowA += nx;
 39 |         rowB += nx;
 40 |         rowC += nx;
 41 |     }
 42 | }
 43 | 
 44 | // Print the first pair (if any) among the first N elements that differs by
 45 | // more than 1.0E-8; matching arrays produce no output.
 46 | void checkResult(float *hostRef, float *gpuRef, const int N)
 47 | {
 48 |     const double epsilon = 1.0E-8;
 49 |     int i = 0;
 50 | 
 51 |     while (i < N && !(abs(hostRef[i] - gpuRef[i]) > epsilon)) i++;
 52 | 
 53 |     if (i >= N) return;   // every element matched
 54 | 
 55 |     printf("host %f gpu %f ", hostRef[i], gpuRef[i]);
 56 |     printf("Arrays do not match.\n\n");
 57 | }
 58 | 
 59 | // Element-wise C = A + B. Launch with a 2D grid of 2D blocks: x indexes the
 60 | // column and y indexes the row of the element a thread handles.
 61 | __global__ void sumMatrixOnGPU2D(float *A, float *B, float *C, int NX, int NY)
 62 | {
 63 |     const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
 64 |     const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
 65 | 
 66 |     if (col >= NX || row >= NY) return;   // thread falls outside the matrix
 67 | 
 68 |     const unsigned int offset = row * NX + col;   // linear element index
 69 |     C[offset] = A[offset] + B[offset];
 70 | }
 71 | 
 72 | // Times sumMatrixOnGPU2D on 16384x16384 floats; argv[1], argv[2] set the block.
 73 | int main(int argc, char **argv)
 74 | {
 75 |     // set up device
 76 |     int dev = 0;
 77 |     cudaDeviceProp deviceProp;
 78 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 79 |     CHECK(cudaSetDevice(dev));
 80 | 
 81 |     // set up data size of matrix
 82 |     int nx = 1 << 14;
 83 |     int ny = 1 << 14;
 84 | 
 85 |     int nxy = nx * ny;
 86 |     int nBytes = nxy * sizeof(float);
 87 | 
 88 |     // malloc host memory and verify every request before touching it
 89 |     float *h_A = (float *)malloc(nBytes);
 90 |     float *h_B = (float *)malloc(nBytes);
 91 |     float *hostRef = (float *)malloc(nBytes);
 92 |     float *gpuRef = (float *)malloc(nBytes);
 93 | 
 94 |     if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL)
 95 |     {
 96 |         fprintf(stderr, "host malloc of %d bytes failed\n", nBytes);
 97 |         return EXIT_FAILURE;
 98 |     }
 99 | 
100 |     // initialize data at host side
101 |     initialData(h_A, nxy);
102 |     initialData(h_B, nxy);
103 | 
104 |     memset(hostRef, 0, nBytes);
105 |     memset(gpuRef, 0, nBytes);
106 | 
107 |     // add matrix at host side for result checks
108 |     sumMatrixOnHost (h_A, h_B, hostRef, nx, ny);
109 | 
110 |     // malloc device global memory
111 |     float *d_MatA, *d_MatB, *d_MatC;
112 |     CHECK(cudaMalloc((void **)&d_MatA, nBytes));
113 |     CHECK(cudaMalloc((void **)&d_MatB, nBytes));
114 |     CHECK(cudaMalloc((void **)&d_MatC, nBytes));
115 | 
116 |     // transfer data from host to device
117 |     CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
118 |     CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
119 | 
120 |     // invoke kernel at host side
121 |     int dimx = 32;
122 |     int dimy = 32;
123 | 
124 |     if(argc > 2)
125 |     {
126 |         dimx = atoi(argv[1]);
127 |         dimy = atoi(argv[2]);
128 |     }
129 | 
130 |     dim3 block(dimx, dimy);
131 |     dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
132 | 
133 |     // execute the kernel; keep the timer in double so sub-second time survives
134 |     // NOTE(review): assumes seconds() returns seconds, hence * 1000.0 for ms
135 |     CHECK(cudaDeviceSynchronize());
136 |     double iStart = seconds();
137 |     sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
138 |     CHECK(cudaDeviceSynchronize());
139 |     double iElaps = seconds() - iStart;
140 |     printf("sumMatrixOnGPU2D <<<(%d,%d), (%d,%d)>>> elapsed %f ms\n", grid.x,
141 |            grid.y, block.x, block.y, iElaps * 1000.0);
142 |     CHECK(cudaGetLastError());
143 | 
144 |     // copy kernel result back to host side and check it against the host sum
145 |     CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
146 |     checkResult(hostRef, gpuRef, nxy);
147 | 
148 |     // free device global memory
149 |     CHECK(cudaFree(d_MatA));
150 |     CHECK(cudaFree(d_MatB));
151 |     CHECK(cudaFree(d_MatC));
152 | 
153 |     // free host memory
154 |     free(h_A);
155 |     free(h_B);
156 |     free(hostRef);
157 |     free(gpuRef);
158 | 
159 |     // reset device
160 |     CHECK(cudaDeviceReset());
161 | 
162 |     return EXIT_SUCCESS;
163 | }
164 | 


--------------------------------------------------------------------------------
/chapter04/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=globalVariable memTransfer pinMemTransfer readSegment \
 2 | 		readSegmentUnroll simpleMathAoS simpleMathSoA sumArrayZerocpy \
 3 | 		sumMatrixGPUManaged sumMatrixGPUManual transpose writeSegment
 4 | C_APPS=
 5 | 
 6 | all: ${C_APPS} ${CU_APPS}
 7 | 
 8 | %: %.cu
 9 | 	nvcc -O2 -arch=sm_20 -o $@ $<
10 | %: %.c
11 | 	gcc -O2 -std=c99 -o $@ $<
12 | clean:
13 | 	rm -f ${CU_APPS} ${C_APPS}
14 | 


--------------------------------------------------------------------------------
/chapter04/globalVariable.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * An example of using a statically declared global variable (devData) to store
 7 |  * a floating-point value on the device.
 8 |  */
 9 | 
10 | __device__ float devData;
11 | 
/*
 * Prints the value currently held in the device variable devData and then
 * increases it by 2.0.
 */
__global__ void checkGlobalVariable()
{
    // report the value as the kernel found it
    printf("Device: the value of the global variable is %f\n", devData);

    // update it in place so the host can read back the new value
    devData = devData + 2.0f;
}
20 | 
/*
 * Copies a value into the device variable devData, runs a single-thread
 * kernel that modifies it, and reads the modified value back.
 */
int main(void)
{
    // initialize the global variable
    float value = 3.14f;
    CHECK(cudaMemcpyToSymbol(devData, &value, sizeof(float)));
    printf("Host:   copied %f to the global variable\n", value);

    // invoke the kernel
    checkGlobalVariable<<<1, 1>>>();
    // a launch does not return a status itself; pick up launch errors here
    CHECK(cudaGetLastError());

    // copy the global variable back to the host
    CHECK(cudaMemcpyFromSymbol(&value, devData, sizeof(float)));
    printf("Host:   the value changed by the kernel to %f\n", value);

    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
38 | 


--------------------------------------------------------------------------------
/chapter04/memTransfer.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * An example of using CUDA's memory copy API to transfer data to and from the
 7 |  * device. In this case, cudaMalloc is used to allocate memory on the GPU and
 8 |  * cudaMemcpy is used to transfer the contents of host memory to an array
 9 |  * allocated using cudaMalloc.
10 |  */
11 | 
/*
 * Allocates a float array on the host with malloc and on the device with
 * cudaMalloc, then copies the array to the device and back again.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // memory size
    unsigned int isize = 1 << 22;
    unsigned int nbytes = isize * sizeof(float);

    // get device information
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s starting at ", argv[0]);
    // isize is unsigned, so it is printed with %u
    printf("device %d: %s memory size %u nbyte %5.2fMB\n", dev,
           deviceProp.name, isize, nbytes / (1024.0f * 1024.0f));

    // allocate the host memory
    float *h_a = (float *)malloc(nbytes);

    // the buffer is written immediately below, so a failed allocation must
    // stop the program here
    if (h_a == NULL)
    {
        fprintf(stderr, "failed to allocate %u bytes of host memory\n",
                nbytes);
        return EXIT_FAILURE;
    }

    // allocate the device memory
    float *d_a;
    CHECK(cudaMalloc((float **)&d_a, nbytes));

    // initialize the host memory
    for(unsigned int i = 0; i < isize; i++) h_a[i] = 0.5f;

    // transfer data from the host to the device
    CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice));

    // transfer data from the device to the host
    CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));

    // free memory
    CHECK(cudaFree(d_a));
    free(h_a);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
53 | 


--------------------------------------------------------------------------------
/chapter04/pinMemTransfer.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | /*
 6 |  * An example of using CUDA's memory copy API to transfer data to and from the
 7 |  * device. In this case, cudaMalloc is used to allocate memory on the GPU and
 8 |  * cudaMemcpy is used to transfer the contents of host memory to an array
 9 |  * allocated using cudaMalloc. Host memory is allocated using cudaMallocHost to
10 |  * create a page-locked host array.
11 |  */
12 | 
/*
 * Allocates a page-locked host array with cudaMallocHost and a device array
 * with cudaMalloc, then copies the array to the device and back again.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    // memory size
    unsigned int isize = 1 << 22;
    unsigned int nbytes = isize * sizeof(float);

    // get device information
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));

    if (!deviceProp.canMapHostMemory)
    {
        printf("Device %d does not support mapping CPU host memory!\n", dev);
        CHECK(cudaDeviceReset());
        exit(EXIT_SUCCESS);
    }

    printf("%s starting at ", argv[0]);
    // isize is unsigned, so it is printed with %u
    printf("device %d: %s memory size %u nbyte %5.2fMB canMap %d\n", dev,
           deviceProp.name, isize, nbytes / (1024.0f * 1024.0f),
           deviceProp.canMapHostMemory);

    // allocate pinned host memory
    float *h_a;
    CHECK(cudaMallocHost ((float **)&h_a, nbytes));

    // allocate device memory
    float *d_a;
    CHECK(cudaMalloc((float **)&d_a, nbytes));

    // initialize host memory; every element is assigned here, so no separate
    // zero-fill pass is needed first. The index is unsigned to match isize.
    for (unsigned int i = 0; i < isize; i++) h_a[i] = 100.10f;

    // transfer data from the host to the device
    CHECK(cudaMemcpy(d_a, h_a, nbytes, cudaMemcpyHostToDevice));

    // transfer data from the device to the host
    CHECK(cudaMemcpy(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));

    // free memory
    CHECK(cudaFree(d_a));
    CHECK(cudaFreeHost(h_a));

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
66 | 


--------------------------------------------------------------------------------
/chapter04/readSegment.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates the impact of misaligned reads on performance by
  7 |  * forcing misaligned reads to occur on a float*.
  8 |  */
  9 | 
 10 | void checkResult(float *hostRef, float *gpuRef, const int N)
 11 | {
 12 |     double epsilon = 1.0E-8;
 13 |     bool match = 1;
 14 | 
 15 |     for (int i = 0; i < N; i++)
 16 |     {
 17 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 18 |         {
 19 |             match = 0;
 20 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
 21 |                     gpuRef[i]);
 22 |             break;
 23 |         }
 24 |     }
 25 | 
 26 |     if (!match)  printf("Arrays do not match.\n\n");
 27 | }
 28 | 
 29 | void initialData(float *ip,  int size)
 30 | {
 31 |     for (int i = 0; i < size; i++)
 32 |     {
 33 |         ip[i] = (float)( rand() & 0xFF ) / 100.0f;
 34 |     }
 35 | 
 36 |     return;
 37 | }
 38 | 
 39 | 
 40 | void sumArraysOnHost(float *A, float *B, float *C, const int n, int offset)
 41 | {
 42 |     for (int idx = offset, k = 0; idx < n; idx++, k++)
 43 |     {
 44 |         C[k] = A[idx] + B[idx];
 45 |     }
 46 | }
 47 | 
 48 | __global__ void warmup(float *A, float *B, float *C, const int n, int offset)
 49 | {
 50 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 51 |     unsigned int k = i + offset;
 52 | 
 53 |     if (k < n) C[i] = A[k] + B[k];
 54 | }
 55 | 
 56 | __global__ void readOffset(float *A, float *B, float *C, const int n,
 57 |                            int offset)
 58 | {
 59 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 60 |     unsigned int k = i + offset;
 61 | 
 62 |     if (k < n) C[i] = A[k] + B[k];
 63 | }
 64 | 
 65 | int main(int argc, char **argv)
 66 | {
 67 |     // set up device
 68 |     int dev = 0;
 69 |     cudaDeviceProp deviceProp;
 70 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 71 |     printf("%s starting reduction at ", argv[0]);
 72 |     printf("device %d: %s ", dev, deviceProp.name);
 73 |     CHECK(cudaSetDevice(dev));
 74 | 
 75 |     // set up array size
 76 |     int nElem = 1 << 20; // total number of elements to reduce
 77 |     printf(" with array size %d\n", nElem);
 78 |     size_t nBytes = nElem * sizeof(float);
 79 | 
 80 |     // set up offset for summary
 81 |     int blocksize = 512;
 82 |     int offset = 0;
 83 | 
 84 |     if (argc > 1) offset    = atoi(argv[1]);
 85 | 
 86 |     if (argc > 2) blocksize = atoi(argv[2]);
 87 | 
 88 |     // execution configuration
 89 |     dim3 block (blocksize, 1);
 90 |     dim3 grid  ((nElem + block.x - 1) / block.x, 1);
 91 | 
 92 |     // allocate host memory
 93 |     float *h_A = (float *)malloc(nBytes);
 94 |     float *h_B = (float *)malloc(nBytes);
 95 |     float *hostRef = (float *)malloc(nBytes);
 96 |     float *gpuRef  = (float *)malloc(nBytes);
 97 | 
 98 |     //  initialize host array
 99 |     initialData(h_A, nElem);
100 |     memcpy(h_B, h_A, nBytes);
101 | 
102 |     //  summary at host side
103 |     sumArraysOnHost(h_A, h_B, hostRef, nElem, offset);
104 | 
105 |     // allocate device memory
106 |     float *d_A, *d_B, *d_C;
107 |     CHECK(cudaMalloc((float**)&d_A, nBytes));
108 |     CHECK(cudaMalloc((float**)&d_B, nBytes));
109 |     CHECK(cudaMalloc((float**)&d_C, nBytes));
110 | 
111 |     // copy data from host to device
112 |     CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
113 |     CHECK(cudaMemcpy(d_B, h_A, nBytes, cudaMemcpyHostToDevice));
114 | 
115 |     //  kernel 1:
116 |     double iStart = seconds();
117 |     warmup<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
118 |     CHECK(cudaDeviceSynchronize());
119 |     double iElaps = seconds() - iStart;
120 |     printf("warmup     <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
121 |            block.x, offset, iElaps);
122 |     CHECK(cudaGetLastError());
123 | 
124 |     iStart = seconds();
125 |     readOffset<<<grid, block>>>(d_A, d_B, d_C, nElem, offset);
126 |     CHECK(cudaDeviceSynchronize());
127 |     iElaps = seconds() - iStart;
128 |     printf("readOffset <<< %4d, %4d >>> offset %4d elapsed %f sec\n", grid.x,
129 |            block.x, offset, iElaps);
130 |     CHECK(cudaGetLastError());
131 | 
132 |     // copy kernel result back to host side and check device results
133 |     CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
134 |     checkResult(hostRef, gpuRef, nElem - offset);
135 | 
136 |     // free host and device memory
137 |     CHECK(cudaFree(d_A));
138 |     CHECK(cudaFree(d_B));
139 |     CHECK(cudaFree(d_C));
140 |     free(h_A);
141 |     free(h_B);
142 | 
143 |     // reset device
144 |     CHECK(cudaDeviceReset());
145 |     return EXIT_SUCCESS;
146 | }
147 | 


--------------------------------------------------------------------------------
/chapter04/simpleMathAoS.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * A simple example of using an array of structures to store data on the device.
  7 |  * This example is used to study the impact on performance of data layout on the
  8 |  * GPU.
  9 |  *
 10 |  * AoS: one contiguous 64-bit read to get x and y (up to 300 cycles)
 11 |  */
 12 | 
 13 | #define LEN 1<<22
 14 | 
 15 | struct innerStruct
 16 | {
 17 |     float x;
 18 |     float y;
 19 | };
 20 | 
 21 | struct innerArray
 22 | {
 23 |     float x[LEN];
 24 |     float y[LEN];
 25 | };
 26 | 
 27 | void initialInnerStruct(innerStruct *ip,  int size)
 28 | {
 29 |     for (int i = 0; i < size; i++)
 30 |     {
 31 |         ip[i].x = (float)(rand() & 0xFF) / 100.0f;
 32 |         ip[i].y = (float)(rand() & 0xFF) / 100.0f;
 33 |     }
 34 | 
 35 |     return;
 36 | }
 37 | 
 38 | void testInnerStructHost(innerStruct *A, innerStruct *C, const int n)
 39 | {
 40 |     for (int idx = 0; idx < n; idx++)
 41 |     {
 42 |         C[idx].x = A[idx].x + 10.f;
 43 |         C[idx].y = A[idx].y + 20.f;
 44 |     }
 45 | 
 46 |     return;
 47 | }
 48 | 
 49 | void checkInnerStruct(innerStruct *hostRef, innerStruct *gpuRef, const int N)
 50 | {
 51 |     double epsilon = 1.0E-8;
 52 |     bool match = 1;
 53 | 
 54 |     for (int i = 0; i < N; i++)
 55 |     {
 56 |         if (abs(hostRef[i].x - gpuRef[i].x) > epsilon)
 57 |         {
 58 |             match = 0;
 59 |             printf("different on %dth element: host %f gpu %f\n", i,
 60 |                     hostRef[i].x, gpuRef[i].x);
 61 |             break;
 62 |         }
 63 | 
 64 |         if (abs(hostRef[i].y - gpuRef[i].y) > epsilon)
 65 |         {
 66 |             match = 0;
 67 |             printf("different on %dth element: host %f gpu %f\n", i,
 68 |                     hostRef[i].y, gpuRef[i].y);
 69 |             break;
 70 |         }
 71 |     }
 72 | 
 73 |     if (!match)  printf("Arrays do not match.\n\n");
 74 | }
 75 | 
 76 | __global__ void testInnerStruct(innerStruct *data, innerStruct * result,
 77 |                                 const int n)
 78 | {
 79 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 80 | 
 81 |     if (i < n)
 82 |     {
 83 |         innerStruct tmp = data[i];
 84 |         tmp.x += 10.f;
 85 |         tmp.y += 20.f;
 86 |         result[i] = tmp;
 87 |     }
 88 | }
 89 | 
 90 | __global__ void warmup(innerStruct *data, innerStruct * result, const int n)
 91 | {
 92 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 93 | 
 94 |     if (i < n)
 95 |     {
 96 |         innerStruct tmp = data[i];
 97 |         tmp.x += 10.f;
 98 |         tmp.y += 20.f;
 99 |         result[i] = tmp;
100 |     }
101 | }
102 | 
/*
 * Runs the warmup and testInnerStruct kernels over an array of innerStruct,
 * prints the elapsed time of each, and checks each device result against a
 * host reference.
 *
 * Optional argument: argv[1] is the thread-block size (default 128).
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up block size, optionally overridden from the command line
    int blocksize = 128;

    if (argc > 1) blocksize = atoi(argv[1]);

    // a zero or negative block size would divide by zero in the grid
    // computation below, so reject it before allocating anything
    if (blocksize <= 0)
    {
        fprintf(stderr, "blocksize must be positive\n");
        return EXIT_FAILURE;
    }

    // allocate host memory
    int nElem = LEN;
    size_t nBytes = nElem * sizeof(innerStruct);
    innerStruct     *h_A = (innerStruct *)malloc(nBytes);
    innerStruct *hostRef = (innerStruct *)malloc(nBytes);
    innerStruct *gpuRef  = (innerStruct *)malloc(nBytes);

    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize host array
    initialInnerStruct(h_A, nElem);
    testInnerStructHost(h_A, hostRef, nElem);

    // allocate device memory
    innerStruct *d_A, *d_C;
    CHECK(cudaMalloc((innerStruct**)&d_A, nBytes));
    CHECK(cudaMalloc((innerStruct**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup
    double iStart = seconds();
    warmup<<<grid, block>>>(d_A, d_C, nElem);
    // report a bad launch configuration before waiting on the kernel
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    double iElaps = seconds() - iStart;
    printf("warmup      <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    // kernel 2: testInnerStruct
    iStart = seconds();
    testInnerStruct<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerstruct <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerStruct(hostRef, gpuRef, nElem);

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
174 | 


--------------------------------------------------------------------------------
/chapter04/simpleMathSoA.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * A simple example of using a structure of arrays to store data on the device.
  7 |  * This example is used to study the impact on performance of data layout on the
  8 |  * GPU.
  9 |  *
 10 |  * SoA: contiguous reads for x and y
 11 |  */
 12 | 
 13 | #define LEN 1<<22
 14 | 
 15 | struct InnerArray
 16 | {
 17 |     float x[LEN];
 18 |     float y[LEN];
 19 | };
 20 | 
 21 | // functions for inner array outer struct
 22 | void initialInnerArray(InnerArray *ip,  int size)
 23 | {
 24 |     for (int i = 0; i < size; i++)
 25 |     {
 26 |         ip->x[i] = (float)( rand() & 0xFF ) / 100.0f;
 27 |         ip->y[i] = (float)( rand() & 0xFF ) / 100.0f;
 28 |     }
 29 | 
 30 |     return;
 31 | }
 32 | 
 33 | void testInnerArrayHost(InnerArray *A, InnerArray *C, const int n)
 34 | {
 35 |     for (int idx = 0; idx < n; idx++)
 36 |     {
 37 |         C->x[idx] = A->x[idx] + 10.f;
 38 |         C->y[idx] = A->y[idx] + 20.f;
 39 |     }
 40 | 
 41 |     return;
 42 | }
 43 | 
 44 | 
 45 | void printfHostResult(InnerArray *C, const int n)
 46 | {
 47 |     for (int idx = 0; idx < n; idx++)
 48 |     {
 49 |         printf("printout idx %d:  x %f y %f\n", idx, C->x[idx], C->y[idx]);
 50 |     }
 51 | 
 52 |     return;
 53 | }
 54 | 
 55 | void checkInnerArray(InnerArray *hostRef, InnerArray *gpuRef, const int N)
 56 | {
 57 |     double epsilon = 1.0E-8;
 58 |     bool match = 1;
 59 | 
 60 |     for (int i = 0; i < N; i++)
 61 |     {
 62 |         if (abs(hostRef->x[i] - gpuRef->x[i]) > epsilon)
 63 |         {
 64 |             match = 0;
 65 |             printf("different on x %dth element: host %f gpu %f\n", i,
 66 |                    hostRef->x[i], gpuRef->x[i]);
 67 |             break;
 68 |         }
 69 | 
 70 |         if (abs(hostRef->y[i] - gpuRef->y[i]) > epsilon)
 71 |         {
 72 |             match = 0;
 73 |             printf("different on y %dth element: host %f gpu %f\n", i,
 74 |                    hostRef->y[i], gpuRef->y[i]);
 75 |             break;
 76 |         }
 77 |     }
 78 | 
 79 |     if (!match)  printf("Arrays do not match.\n\n");
 80 | }
 81 | 
 82 | __global__ void testInnerArray(InnerArray *data, InnerArray * result,
 83 |                                const int n)
 84 | {
 85 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 86 | 
 87 |     if (i < n)
 88 |     {
 89 |         float tmpx = data->x[i];
 90 |         float tmpy = data->y[i];
 91 | 
 92 |         tmpx += 10.f;
 93 |         tmpy += 20.f;
 94 |         result->x[i] = tmpx;
 95 |         result->y[i] = tmpy;
 96 |     }
 97 | }
 98 | 
 99 | __global__ void warmup2(InnerArray *data, InnerArray * result, const int n)
100 | {
101 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
102 | 
103 |     if (i < n)
104 |     {
105 |         float tmpx = data->x[i];
106 |         float tmpy = data->y[i];
107 |         tmpx += 10.f;
108 |         tmpy += 20.f;
109 |         result->x[i] = tmpx;
110 |         result->y[i] = tmpy;
111 |     }
112 | }
113 | 
/*
 * Test for struct of arrays: runs the warmup2 and testInnerArray kernels over
 * one InnerArray, prints the elapsed time of each, and checks each device
 * result against a host reference.
 *
 * Optional argument: argv[1] is the thread-block size (default 128).
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    printf("%s test struct of array at ", argv[0]);
    printf("device %d: %s \n", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up block size, optionally overridden from the command line
    int blocksize = 128;

    if (argc > 1) blocksize = atoi(argv[1]);

    // a zero or negative block size would divide by zero in the grid
    // computation below, so reject it before allocating anything
    if (blocksize <= 0)
    {
        fprintf(stderr, "blocksize must be positive\n");
        return EXIT_FAILURE;
    }

    // allocate host memory; one InnerArray holds all LEN x and y values
    int nElem = LEN;
    size_t nBytes = sizeof(InnerArray);
    InnerArray     *h_A = (InnerArray *)malloc(nBytes);
    InnerArray *hostRef = (InnerArray *)malloc(nBytes);
    InnerArray *gpuRef  = (InnerArray *)malloc(nBytes);

    if (h_A == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(h_A);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // initialize host array
    initialInnerArray(h_A, nElem);
    testInnerArrayHost(h_A, hostRef, nElem);

    // allocate device memory
    InnerArray *d_A, *d_C;
    CHECK(cudaMalloc((InnerArray**)&d_A, nBytes));
    CHECK(cudaMalloc((InnerArray**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    // execution configuration
    dim3 block (blocksize, 1);
    dim3 grid  ((nElem + block.x - 1) / block.x, 1);

    // kernel 1: warmup2
    double iStart = seconds();
    warmup2<<<grid, block>>>(d_A, d_C, nElem);
    // report a bad launch configuration before waiting on the kernel
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    double iElaps = seconds() - iStart;
    printf("warmup2      <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    // kernel 2: testInnerArray
    iStart = seconds();
    testInnerArray<<<grid, block>>>(d_A, d_C, nElem);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    iElaps = seconds() - iStart;
    printf("innerarray   <<< %3d, %3d >>> elapsed %f sec\n", grid.x, block.x,
           iElaps);
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
    checkInnerArray(hostRef, gpuRef, nElem);

    // free memories both host and device
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
184 | 


--------------------------------------------------------------------------------
/chapter04/sumArrayZerocpy.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates the use of zero-copy memory to remove the need to
  7 |  * explicitly issue a memcpy operation between the host and device. By mapping
  8 |  * host, page-locked memory into the device's address space, the address can
  9 |  * directly reference a host array and transfer its contents over the PCIe bus.
 10 |  *
 11 |  * This example compares performing a vector addition with and without zero-copy
 12 |  * memory.
 13 |  */
 14 | 
 15 | void checkResult(float *hostRef, float *gpuRef, const int N)
 16 | {
 17 |     double epsilon = 1.0E-8;
 18 | 
 19 |     for (int i = 0; i < N; i++)
 20 |     {
 21 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 22 |         {
 23 |             printf("Arrays do not match!\n");
 24 |             printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
 25 |                     gpuRef[i], i);
 26 |             break;
 27 |         }
 28 |     }
 29 | 
 30 |     return;
 31 | }
 32 | 
 33 | void initialData(float *ip, int size)
 34 | {
 35 |     int i;
 36 | 
 37 |     for (i = 0; i < size; i++)
 38 |     {
 39 |         ip[i] = (float)( rand() & 0xFF ) / 10.0f;
 40 |     }
 41 | 
 42 |     return;
 43 | }
 44 | 
 45 | void sumArraysOnHost(float *A, float *B, float *C, const int N)
 46 | {
 47 |     for (int idx = 0; idx < N; idx++)
 48 |     {
 49 |         C[idx] = A[idx] + B[idx];
 50 |     }
 51 | }
 52 | 
 53 | __global__ void sumArrays(float *A, float *B, float *C, const int N)
 54 | {
 55 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
 56 | 
 57 |     if (i < N) C[i] = A[i] + B[i];
 58 | }
 59 | 
 60 | __global__ void sumArraysZeroCopy(float *A, float *B, float *C, const int N)
 61 | {
 62 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
 63 | 
 64 |     if (i < N) C[i] = A[i] + B[i];
 65 | }
 66 | 
 67 | int main(int argc, char **argv)
 68 | {
 69 |     // set up device
 70 |     int dev = 0;
 71 |     CHECK(cudaSetDevice(dev));
 72 | 
 73 |     // get device properties
 74 |     cudaDeviceProp deviceProp;
 75 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 76 | 
 77 |     // check if support mapped memory
 78 |     if (!deviceProp.canMapHostMemory)
 79 |     {
 80 |         printf("Device %d does not support mapping CPU host memory!\n", dev);
 81 |         CHECK(cudaDeviceReset());
 82 |         exit(EXIT_SUCCESS);
 83 |     }
 84 | 
 85 |     printf("Using Device %d: %s ", dev, deviceProp.name);
 86 | 
 87 |     // set up data size of vectors
 88 |     int ipower = 10;
 89 | 
 90 |     if (argc > 1) ipower = atoi(argv[1]);
 91 | 
 92 |     int nElem = 1 << ipower;
 93 |     size_t nBytes = nElem * sizeof(float);
 94 | 
 95 |     if (ipower < 18)
 96 |     {
 97 |         printf("Vector size %d power %d  nbytes  %3.0f KB\n", nElem, ipower,
 98 |                (float)nBytes / (1024.0f));
 99 |     }
100 |     else
101 |     {
102 |         printf("Vector size %d power %d  nbytes  %3.0f MB\n", nElem, ipower,
103 |                (float)nBytes / (1024.0f * 1024.0f));
104 |     }
105 | 
106 |     // part 1: using device memory
107 |     // malloc host memory
108 |     float *h_A, *h_B, *hostRef, *gpuRef;
109 |     h_A     = (float *)malloc(nBytes);
110 |     h_B     = (float *)malloc(nBytes);
111 |     hostRef = (float *)malloc(nBytes);
112 |     gpuRef  = (float *)malloc(nBytes);
113 | 
114 |     // initialize data at host side
115 |     initialData(h_A, nElem);
116 |     initialData(h_B, nElem);
117 |     memset(hostRef, 0, nBytes);
118 |     memset(gpuRef,  0, nBytes);
119 | 
120 |     // add vector at host side for result checks
121 |     sumArraysOnHost(h_A, h_B, hostRef, nElem);
122 | 
123 |     // malloc device global memory
124 |     float *d_A, *d_B, *d_C;
125 |     CHECK(cudaMalloc((float**)&d_A, nBytes));
126 |     CHECK(cudaMalloc((float**)&d_B, nBytes));
127 |     CHECK(cudaMalloc((float**)&d_C, nBytes));
128 | 
129 |     // transfer data from host to device
130 |     CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
131 |     CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
132 | 
133 |     // set up execution configuration
134 |     int iLen = 512;
135 |     dim3 block (iLen);
136 |     dim3 grid  ((nElem + block.x - 1) / block.x);
137 | 
138 |     sumArrays<<<grid, block>>>(d_A, d_B, d_C, nElem);
139 | 
140 |     // copy kernel result back to host side
141 |     CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
142 | 
143 |     // check device results
144 |     checkResult(hostRef, gpuRef, nElem);
145 | 
146 |     // free device global memory
147 |     CHECK(cudaFree(d_A));
148 |     CHECK(cudaFree(d_B));
149 | 
150 |     // free host memory
151 |     free(h_A);
152 |     free(h_B);
153 | 
154 |     // part 2: using zerocopy memory for array A and B
155 |     // allocate zerocpy memory
156 |     CHECK(cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped));
157 |     CHECK(cudaHostAlloc((void **)&h_B, nBytes, cudaHostAllocMapped));
158 | 
159 |     // initialize data at host side
160 |     initialData(h_A, nElem);
161 |     initialData(h_B, nElem);
162 |     memset(hostRef, 0, nBytes);
163 |     memset(gpuRef,  0, nBytes);
164 | 
165 |     // pass the pointer to device
166 |     CHECK(cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0));
167 |     CHECK(cudaHostGetDevicePointer((void **)&d_B, (void *)h_B, 0));
168 | 
169 |     // add at host side for result checks
170 |     sumArraysOnHost(h_A, h_B, hostRef, nElem);
171 | 
172 |     // execute kernel with zero copy memory
173 |     sumArraysZeroCopy<<<grid, block>>>(d_A, d_B, d_C, nElem);
174 | 
175 |     // copy kernel result back to host side
176 |     CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
177 | 
178 |     // check device results
179 |     checkResult(hostRef, gpuRef, nElem);
180 | 
181 |     // free  memory
182 |     CHECK(cudaFree(d_C));
183 |     CHECK(cudaFreeHost(h_A));
184 |     CHECK(cudaFreeHost(h_B));
185 | 
186 |     free(hostRef);
187 |     free(gpuRef);
188 | 
189 |     // reset device
190 |     CHECK(cudaDeviceReset());
191 |     return EXIT_SUCCESS;
192 | }
193 | 


--------------------------------------------------------------------------------
/chapter04/sumMatrixGPUManaged.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates the use of CUDA managed memory to implement matrix
  7 |  * addition. In this example, arbitrary pointers can be dereferenced on the host
  8 |  * and device. CUDA will automatically manage the transfer of data to and from
  9 |  * the GPU as needed by the application. There is no need for the programmer to
 10 |  * use cudaMemcpy, cudaHostGetDevicePointer, or any other CUDA API involved with
 11 |  * explicitly transferring data. In addition, because CUDA managed memory is not
 12 |  * forced to reside in a single place it can be transferred to the optimal
 13 |  * memory space and not require round-trips over the PCIe bus every time a
 14 |  * cross-device reference is performed (as is required with zero copy and UVA).
 15 |  */
 16 | 
 17 | void initialData(float *ip, const int size)
 18 | {
 19 |     int i;
 20 | 
 21 |     for (i = 0; i < size; i++)
 22 |     {
 23 |         ip[i] = (float)( rand() & 0xFF ) / 10.0f;
 24 |     }
 25 | 
 26 |     return;
 27 | }
 28 | 
 29 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
 30 | {
 31 |     float *ia = A;
 32 |     float *ib = B;
 33 |     float *ic = C;
 34 | 
 35 |     for (int iy = 0; iy < ny; iy++)
 36 |     {
 37 |         for (int ix = 0; ix < nx; ix++)
 38 |         {
 39 |             ic[ix] = ia[ix] + ib[ix];
 40 |         }
 41 | 
 42 |         ia += nx;
 43 |         ib += nx;
 44 |         ic += nx;
 45 |     }
 46 | 
 47 |     return;
 48 | }
 49 | 
 50 | void checkResult(float *hostRef, float *gpuRef, const int N)
 51 | {
 52 |     double epsilon = 1.0E-8;
 53 |     bool match = 1;
 54 | 
 55 |     for (int i = 0; i < N; i++)
 56 |     {
 57 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 58 |         {
 59 |             match = 0;
 60 |             printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
 61 |             break;
 62 |         }
 63 |     }
 64 | 
 65 |     if (!match)
 66 |     {
 67 |         printf("Arrays do not match.\n\n");
 68 |     }
 69 | }
 70 | 
 71 | // grid 2D block 2D
 72 | __global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx,
 73 |                              int ny)
 74 | {
 75 |     unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
 76 |     unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
 77 |     unsigned int idx = iy * nx + ix;
 78 | 
 79 |     if (ix < nx && iy < ny)
 80 |     {
 81 |         MatC[idx] = MatA[idx] + MatB[idx];
 82 |     }
 83 | }
 84 | 
 85 | int main(int argc, char **argv)
 86 | {
 87 |     printf("%s Starting ", argv[0]);
 88 | 
 89 |     // set up device
 90 |     int dev = 0;
 91 |     cudaDeviceProp deviceProp;
 92 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 93 |     printf("using Device %d: %s\n", dev, deviceProp.name);
 94 |     CHECK(cudaSetDevice(dev));
 95 | 
 96 |     // set up data size of matrix
 97 |     int nx, ny;
 98 |     int ishift = 12;
 99 | 
100 |     if  (argc > 1) ishift = atoi(argv[1]);
101 | 
102 |     nx = ny = 1 << ishift;
103 | 
104 |     int nxy = nx * ny;
105 |     int nBytes = nxy * sizeof(float);
106 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
107 | 
108 |     // malloc host memory
109 |     float *A, *B, *hostRef, *gpuRef;
110 |     CHECK(cudaMallocManaged((void **)&A, nBytes));
111 |     CHECK(cudaMallocManaged((void **)&B, nBytes));
112 |     CHECK(cudaMallocManaged((void **)&gpuRef,  nBytes);  );
113 |     CHECK(cudaMallocManaged((void **)&hostRef, nBytes););
114 | 
115 |     // initialize data at host side
116 |     double iStart = seconds();
117 |     initialData(A, nxy);
118 |     initialData(B, nxy);
119 |     double iElaps = seconds() - iStart;
120 |     printf("initialization: \t %f sec\n", iElaps);
121 | 
122 |     memset(hostRef, 0, nBytes);
123 |     memset(gpuRef, 0, nBytes);
124 | 
125 |     // add matrix at host side for result checks
126 |     iStart = seconds();
127 |     sumMatrixOnHost(A, B, hostRef, nx, ny);
128 |     iElaps = seconds() - iStart;
129 |     printf("sumMatrix on host:\t %f sec\n", iElaps);
130 | 
131 |     // invoke kernel at host side
132 |     int dimx = 32;
133 |     int dimy = 32;
134 |     dim3 block(dimx, dimy);
135 |     dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
136 | 
137 |     // warm-up kernel, with unified memory all pages will migrate from host to
138 |     // device
139 |     sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, 1, 1);
140 | 
141 |     // after warm-up, time with unified memory
142 |     iStart = seconds();
143 | 
144 |     sumMatrixGPU<<<grid, block>>>(A, B, gpuRef, nx, ny);
145 | 
146 |     CHECK(cudaDeviceSynchronize());
147 |     iElaps = seconds() - iStart;
148 |     printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>> \n", iElaps,
149 |             grid.x, grid.y, block.x, block.y);
150 | 
151 |     // check kernel error
152 |     CHECK(cudaGetLastError());
153 | 
154 |     // check device results
155 |     checkResult(hostRef, gpuRef, nxy);
156 | 
157 |     // free device global memory
158 |     CHECK(cudaFree(A));
159 |     CHECK(cudaFree(B));
160 |     CHECK(cudaFree(hostRef));
161 |     CHECK(cudaFree(gpuRef));
162 | 
163 |     // reset device
164 |     CHECK(cudaDeviceReset());
165 | 
166 |     return (0);
167 | }
168 | 


--------------------------------------------------------------------------------
/chapter04/sumMatrixGPUManual.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * This example demonstrates using explicit CUDA memory transfer to implement
  7 |  * matrix addition. This code contrasts with sumMatrixGPUManaged.cu, where CUDA
  8 |  * managed memory is used to remove all explicit memory transfers and abstract
  9 |  * away the concept of physically separate address spaces.
 10 |  */
 11 | 
 12 | void initialData(float *ip, const int size)
 13 | {
 14 |     int i;
 15 | 
 16 |     for(i = 0; i < size; i++)
 17 |     {
 18 |         ip[i] = (float)( rand() & 0xFF ) / 10.0f;
 19 |     }
 20 | 
 21 |     return;
 22 | }
 23 | 
 24 | void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
 25 | {
 26 |     float *ia = A;
 27 |     float *ib = B;
 28 |     float *ic = C;
 29 | 
 30 |     for (int iy = 0; iy < ny; iy++)
 31 |     {
 32 |         for (int ix = 0; ix < nx; ix++)
 33 |         {
 34 |             ic[ix] = ia[ix] + ib[ix];
 35 |         }
 36 | 
 37 |         ia += nx;
 38 |         ib += nx;
 39 |         ic += nx;
 40 |     }
 41 | 
 42 |     return;
 43 | }
 44 | 
 45 | void checkResult(float *hostRef, float *gpuRef, const int N)
 46 | {
 47 |     double epsilon = 1.0E-8;
 48 |     bool match = 1;
 49 | 
 50 |     for (int i = 0; i < N; i++)
 51 |     {
 52 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 53 |         {
 54 |             match = 0;
 55 |             printf("host %f gpu %f\n", hostRef[i], gpuRef[i]);
 56 |             break;
 57 |         }
 58 |     }
 59 | 
 60 |     if (!match)
 61 |     {
 62 |         printf("Arrays do not match.\n\n");
 63 |     }
 64 | }
 65 | 
 66 | // grid 2D block 2D
 67 | __global__ void sumMatrixGPU(float *MatA, float *MatB, float *MatC, int nx,
 68 |                              int ny)
 69 | {
 70 |     unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
 71 |     unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
 72 |     unsigned int idx = iy * nx + ix;
 73 | 
 74 |     if (ix < nx && iy < ny)
 75 |     {
 76 |         MatC[idx] = MatA[idx] + MatB[idx];
 77 |     }
 78 | }
 79 | 
 80 | int main(int argc, char **argv)
 81 | {
 82 |     printf("%s Starting ", argv[0]);
 83 | 
 84 |     // set up device
 85 |     int dev = 0;
 86 |     cudaDeviceProp deviceProp;
 87 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 88 |     printf("using Device %d: %s\n", dev, deviceProp.name);
 89 |     CHECK(cudaSetDevice(dev));
 90 | 
 91 |     // set up data size of matrix
 92 |     int nx, ny;
 93 |     int ishift = 12;
 94 | 
 95 |     if  (argc > 1) ishift = atoi(argv[1]);
 96 | 
 97 |     nx = ny = 1 << ishift;
 98 | 
 99 |     int nxy = nx * ny;
100 |     int nBytes = nxy * sizeof(float);
101 |     printf("Matrix size: nx %d ny %d\n", nx, ny);
102 | 
103 |     // malloc host memory
104 |     float *h_A, *h_B, *hostRef, *gpuRef;
105 |     h_A = (float *)malloc(nBytes);
106 |     h_B = (float *)malloc(nBytes);
107 |     hostRef = (float *)malloc(nBytes);
108 |     gpuRef = (float *)malloc(nBytes);
109 | 
110 |     // initialize data at host side
111 |     double iStart = seconds();
112 |     initialData(h_A, nxy);
113 |     initialData(h_B, nxy);
114 |     double iElaps = seconds() - iStart;
115 | 
116 |     printf("initialization: \t %f sec\n", iElaps);
117 | 
118 |     memset(hostRef, 0, nBytes);
119 |     memset(gpuRef, 0, nBytes);
120 | 
121 |     // add matrix at host side for result checks
122 |     iStart = seconds();
123 |     sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
124 |     iElaps = seconds() - iStart;
125 |     printf("sumMatrix on host:\t %f sec\n", iElaps);
126 | 
127 |     // malloc device global memory
128 |     float *d_MatA, *d_MatB, *d_MatC;
129 |     CHECK(cudaMalloc((void **)&d_MatA, nBytes));
130 |     CHECK(cudaMalloc((void **)&d_MatB, nBytes));
131 |     CHECK(cudaMalloc((void **)&d_MatC, nBytes));
132 | 
133 |     // invoke kernel at host side
134 |     int dimx = 32;
135 |     int dimy = 32;
136 |     dim3 block(dimx, dimy);
137 |     dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
138 | 
139 |     // init device data to 0.0f, then warm-up kernel to obtain accurate timing
140 |     // result
141 |     CHECK(cudaMemset(d_MatA, 0.0f, nBytes));
142 |     CHECK(cudaMemset(d_MatB, 0.0f, nBytes));
143 |     sumMatrixGPU<<<grid, block>>>(d_MatA, d_MatB, d_MatC, 1, 1);
144 | 
145 | 
146 |     // transfer data from host to device
147 |     CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
148 |     CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));
149 | 
150 |     iStart =  seconds();
151 |     sumMatrixGPU<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
152 | 
153 |     CHECK(cudaDeviceSynchronize());
154 |     iElaps = seconds() - iStart;
155 |     printf("sumMatrix on gpu :\t %f sec <<<(%d,%d), (%d,%d)>>> \n", iElaps,
156 |             grid.x, grid.y, block.x, block.y);
157 | 
158 |     CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));
159 | 
160 |     // check kernel error
161 |     CHECK(cudaGetLastError());
162 | 
163 |     // check device results
164 |     checkResult(hostRef, gpuRef, nxy);
165 | 
166 |     // free device global memory
167 |     CHECK(cudaFree(d_MatA));
168 |     CHECK(cudaFree(d_MatB));
169 |     CHECK(cudaFree(d_MatC));
170 | 
171 |     // free host memory
172 |     free(h_A);
173 |     free(h_B);
174 |     free(hostRef);
175 |     free(gpuRef);
176 | 
177 |     // reset device
178 |     CHECK(cudaDeviceReset());
179 | 
180 |     return (0);
181 | }
182 | 


--------------------------------------------------------------------------------
/chapter05/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=checkSmemRectangle checkSmemSquare constantReadOnly constantStencil \
 2 |         reduceInteger reduceIntegerShfl simpleShfl transposeRectangle
 3 | C_APPS=
 4 | 
 5 | all: ${C_APPS} ${CU_APPS}
 6 | 
 7 | %: %.cu
 8 | 	nvcc -O2 -arch=sm_20 -o $@ $<
 9 | %: %.c
10 | 	gcc -O2 -std=c99 -o $@ $<
11 | clean:
12 | 	rm -f ${CU_APPS} ${C_APPS}
13 | 


--------------------------------------------------------------------------------
/chapter05/constantReadOnly.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | #define RADIUS 4
  6 | #define BDIM 32
  7 | 
  8 | // constant memory
  9 | __constant__ float coef[RADIUS + 1];
 10 | 
 11 | // FD coefficient
 12 | #define a0     0.00000f
 13 | #define a1     0.80000f
 14 | #define a2    -0.20000f
 15 | #define a3     0.03809f
 16 | #define a4    -0.00357f
 17 | 
 18 | void initialData(float *in,  const int size)
 19 | {
 20 |     for (int i = 0; i < size; i++)
 21 |     {
 22 |         in[i] = (float)( rand() & 0xFF ) / 100.0f;
 23 |     }
 24 | }
 25 | 
 26 | void printData(float *in,  const int size)
 27 | {
 28 |     for (int i = RADIUS; i < size; i++)
 29 |     {
 30 |         printf("%f ", in[i]);
 31 |     }
 32 | 
 33 |     printf("\n");
 34 | }
 35 | 
 36 | void setup_coef_constant (void)
 37 | {
 38 |     const float h_coef[] = {a0, a1, a2, a3, a4};
 39 |     CHECK(cudaMemcpyToSymbol( coef, h_coef, (RADIUS + 1) * sizeof(float)));
 40 | }
 41 | 
 42 | void cpu_stencil_1d (float *in, float *out, int isize)
 43 | {
 44 |     for( int i = RADIUS; i <= isize; i++ )
 45 |     {
 46 |         float tmp = 0.0f;
 47 |         tmp += a1 * (in[i + 1] - in[i - 1])
 48 |                + a2 * (in[i + 2] - in[i - 2])
 49 |                + a3 * (in[i + 3] - in[i - 3])
 50 |                + a4 * (in[i + 4] - in[i - 4]);
 51 |         out[i] = tmp;
 52 |     }
 53 | }
 54 | 
 55 | void checkResult(float *hostRef, float *gpuRef, const int size)
 56 | {
 57 |     double epsilon = 1.0E-6;
 58 |     bool match = 1;
 59 | 
 60 |     for (int i = RADIUS; i < size; i++)
 61 |     {
 62 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 63 |         {
 64 |             match = 0;
 65 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
 66 |                    gpuRef[i]);
 67 |             break;
 68 |         }
 69 |     }
 70 | 
 71 |     if (!match)  printf("Arrays do not match.\n\n");
 72 | }
 73 | 
 74 | __global__ void stencil_1d(float *in, float *out)
 75 | {
 76 |     // shared memory
 77 |     __shared__ float smem[BDIM + 2 * RADIUS];
 78 | 
 79 |     // index to global memory
 80 |     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 81 | 
 82 |     // index to shared memory for stencil calculatioin
 83 |     int sidx = threadIdx.x + RADIUS;
 84 | 
 85 |     // Read data from global memory into shared memory
 86 |     smem[sidx] = in[idx];
 87 | 
 88 |     // read halo part to shared memory
 89 |     if (threadIdx.x < RADIUS)
 90 |     {
 91 |         smem[sidx - RADIUS] = in[idx - RADIUS];
 92 |         smem[sidx + BDIM] = in[idx + BDIM];
 93 |     }
 94 | 
 95 |     // Synchronize (ensure all the data is available)
 96 |     __syncthreads();
 97 | 
 98 |     // Apply the stencil
 99 |     float tmp = 0.0f;
100 | #pragma unroll
101 | 
102 |     for (int i = 1; i <= RADIUS; i++)
103 |     {
104 |         tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
105 |     }
106 | 
107 |     // Store the result
108 |     out[idx] = tmp;
109 | }
110 | 
/*
 * 1D stencil that takes its coefficients from the global-memory array dcoef,
 * declared const __restrict__, instead of from constant memory.
 *
 * Each block stages BDIM input elements plus RADIUS halo elements on either
 * side in a static shared array of BDIM + 2 * RADIUS floats, then every
 * thread writes one output element. There is no bounds check: the kernel
 * reads in[idx - RADIUS] and in[idx + BDIM], so callers must pass pointers
 * into padded buffers. The halo indexing uses BDIM, i.e. it assumes
 * blockDim.x == BDIM.
 */
__global__ void stencil_1d_read_only (float* in,
                                      float* out,
                                      const float *__restrict__ dcoef)
{
    __shared__ float tile[BDIM + 2 * RADIUS];

    // position of this thread's element in global memory and in the tile
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int lid = threadIdx.x + RADIUS;

    // interior element
    tile[lid] = in[gid];

    // the first RADIUS threads also fetch the left and right halos
    if (threadIdx.x < RADIUS)
    {
        tile[lid - RADIUS] = in[gid - RADIUS];
        tile[lid + BDIM] = in[gid + BDIM];
    }

    // the whole tile must be loaded before any thread reads its neighbours
    __syncthreads();

    float acc = 0.0f;

#pragma unroll
    for (int k = 1; k <= RADIUS; k++)
    {
        acc += dcoef[k] * (tile[lid + k] - tile[lid - k]);
    }

    out[gid] = acc;
}
149 | 
/*
 * Runs the 1D stencil twice on the GPU, once with coefficients in constant
 * memory (stencil_1d) and once with coefficients read from global memory
 * (stencil_1d_read_only), and checks both results against the host
 * reference produced by cpu_stencil_1d.
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE when a host allocation
 * fails.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    // NOTE(review): the banner says "transpose" although this program runs a
    // stencil; the text is left unchanged
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size
    int isize = 1 << 24;

    // isize interior points plus RADIUS halo elements on each side
    size_t nBytes = (isize + 2 * RADIUS) * sizeof(float);
    printf("array size: %d ", isize);

    bool iprint = 0;

    // allocate host memory
    float *h_in    = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);

    if (h_in == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(h_in);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // allocate device memory
    float *d_in, *d_out, *d_coef;
    CHECK(cudaMalloc((float**)&d_in, nBytes));
    CHECK(cudaMalloc((float**)&d_out, nBytes));
    CHECK(cudaMalloc((float**)&d_coef, (RADIUS + 1) * sizeof(float)));

    // set up coefficient to global memory
    const float h_coef[] = {a0, a1, a2, a3, a4};
    CHECK(cudaMemcpy(d_coef, h_coef, (RADIUS + 1) * sizeof(float),
                     cudaMemcpyHostToDevice));

    // initialize host array
    initialData(h_in, isize + 2 * RADIUS);

    // Copy to device
    CHECK(cudaMemcpy(d_in, h_in, nBytes, cudaMemcpyHostToDevice));

    // set up constant memory
    setup_coef_constant ();

    // launch configuration: isize is a multiple of BDIM, so the grid covers
    // the interior exactly (the kernels have no bounds check)
    dim3 block (BDIM, 1);
    dim3 grid  (isize / block.x, 1);
    printf("(grid, block) %d,%d \n ", grid.x, block.x);

    // Launch stencil_1d() kernel on GPU; pointers are offset by RADIUS so
    // that the kernel's halo reads stay inside the buffers
    stencil_1d<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS);
    CHECK(cudaGetLastError());

    // Copy result back to host
    CHECK(cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost));

    // apply cpu stencil
    cpu_stencil_1d(h_in, hostRef, isize);

    // check results
    checkResult(hostRef, gpuRef, isize);

    // launch read only cache kernel
    stencil_1d_read_only<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS,
            d_coef);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost));
    checkResult(hostRef, gpuRef, isize);

    // print out results
    if(iprint)
    {
        printData(gpuRef, isize);
        printData(hostRef, isize);
    }

    // Cleanup
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));
    CHECK(cudaFree(d_coef));
    free(h_in);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
235 | 


--------------------------------------------------------------------------------
/chapter05/constantStencil.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <cuda_runtime.h>
  3 | #include <stdio.h>
  4 | 
  5 | /*
  6 |  * An example of using constant memory to optimize performance of a stencil
  7 |  * computation by storing coefficients of the computation in a constant memory
  8 |  * array (coef).
  9 |  */
 10 | 
 11 | #define RADIUS 4
 12 | #define BDIM 32
 13 | 
 14 | // constant memory
 15 | __constant__ float coef[RADIUS + 1];
 16 | 
 17 | // FD coefficient
 18 | #define a0     0.00000f
 19 | #define a1     0.80000f
 20 | #define a2    -0.20000f
 21 | #define a3     0.03809f
 22 | #define a4    -0.00357f
 23 | 
 24 | void initialData(float *in,  const int size)
 25 | {
 26 |     for (int i = 0; i < size; i++)
 27 |     {
 28 |         in[i] = (float)(rand() & 0xFF) / 100.0f;
 29 |     }
 30 | }
 31 | 
 32 | void printData(float *in,  const int size)
 33 | {
 34 |     for (int i = RADIUS; i < size; i++)
 35 |     {
 36 |         printf("%f ", in[i]);
 37 |     }
 38 | 
 39 |     printf("\n");
 40 | }
 41 | 
 42 | void setup_coef_constant (void)
 43 | {
 44 |     const float h_coef[] = {a0, a1, a2, a3, a4};
 45 |     CHECK(cudaMemcpyToSymbol( coef, h_coef, (RADIUS + 1) * sizeof(float)));
 46 | }
 47 | 
 48 | void cpu_stencil_1d (float *in, float *out, int isize)
 49 | {
 50 |     for (int i = RADIUS; i <= isize; i++)
 51 |     {
 52 |         float tmp = a1 * (in[i + 1] - in[i - 1])
 53 |                     + a2 * (in[i + 2] - in[i - 2])
 54 |                     + a3 * (in[i + 3] - in[i - 3])
 55 |                     + a4 * (in[i + 4] - in[i - 4]);
 56 |         out[i] = tmp;
 57 |     }
 58 | }
 59 | 
 60 | void checkResult(float *hostRef, float *gpuRef, const int size)
 61 | {
 62 |     double epsilon = 1.0E-6;
 63 |     bool match = 1;
 64 | 
 65 |     for (int i = RADIUS; i < size; i++)
 66 |     {
 67 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 68 |         {
 69 |             match = 0;
 70 |             printf("different on %dth element: host %f gpu %f\n", i, hostRef[i],
 71 |                    gpuRef[i]);
 72 |             break;
 73 |         }
 74 |     }
 75 | 
 76 |     if (!match) printf("Arrays do not match.\n\n");
 77 | }
 78 | 
 79 | __global__ void stencil_1d(float *in, float *out, int N)
 80 | {
 81 |     // shared memory
 82 |     __shared__ float smem[BDIM + 2 * RADIUS];
 83 | 
 84 |     // index to global memory
 85 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 86 | 
 87 |     while (idx < N)
 88 |     {
 89 | 
 90 |         // index to shared memory for stencil calculatioin
 91 |         int sidx = threadIdx.x + RADIUS;
 92 | 
 93 |         // Read data from global memory into shared memory
 94 |         smem[sidx] = in[idx];
 95 | 
 96 |         // read halo part to shared memory
 97 |         if (threadIdx.x < RADIUS)
 98 |         {
 99 |             smem[sidx - RADIUS] = in[idx - RADIUS];
100 |             smem[sidx + BDIM] = in[idx + BDIM];
101 |         }
102 | 
103 |         // Synchronize (ensure all the data is available)
104 |         __syncthreads();
105 | 
106 |         // Apply the stencil
107 |         float tmp = 0.0f;
108 | 
109 | #pragma unroll
110 |         for (int i = 1; i <= RADIUS; i++)
111 |         {
112 |             tmp += coef[i] * (smem[sidx + i] - smem[sidx - i]);
113 |         }
114 | 
115 |         // Store the result
116 |         out[idx] = tmp;
117 | 
118 |         idx += gridDim.x * blockDim.x;
119 |     }
120 | }
121 | 
122 | 
/*
 * Runs the constant-memory 1D stencil on the GPU and checks the result
 * against the host reference produced by cpu_stencil_1d.
 *
 * Returns EXIT_SUCCESS on completion, EXIT_FAILURE when a host allocation
 * fails.
 */
int main(int argc, char **argv)
{
    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    CHECK(cudaGetDeviceProperties(&deviceProp, dev));
    // NOTE(review): the banner says "transpose" although this program runs a
    // stencil; the text is left unchanged
    printf("%s starting transpose at ", argv[0]);
    printf("device %d: %s ", dev, deviceProp.name);
    CHECK(cudaSetDevice(dev));

    // set up data size
    int isize = 1 << 24;

    // isize interior points plus RADIUS halo elements on each side
    size_t nBytes = (isize + 2 * RADIUS) * sizeof(float);
    printf("array size: %d ", isize);

    bool iprint = 0;

    // allocate host memory
    float *h_in    = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef  = (float *)malloc(nBytes);

    if (h_in == NULL || hostRef == NULL || gpuRef == NULL)
    {
        fprintf(stderr, "failed to allocate host memory\n");
        free(h_in);
        free(hostRef);
        free(gpuRef);
        return EXIT_FAILURE;
    }

    // allocate device memory
    float *d_in, *d_out;
    CHECK(cudaMalloc((float**)&d_in, nBytes));
    CHECK(cudaMalloc((float**)&d_out, nBytes));

    // initialize host array
    initialData(h_in, isize + 2 * RADIUS);

    // Copy to device
    CHECK(cudaMemcpy(d_in, h_in, nBytes, cudaMemcpyHostToDevice));

    // set up constant memory
    setup_coef_constant();

    // launch configuration: one block per BDIM elements, capped at the
    // device's maximum grid width (the kernel strides over any remainder).
    // deviceProp was queried for the same device (0) above, so it is reused.
    const int blocksNeeded = isize / BDIM;
    const int gridWidth = deviceProp.maxGridSize[0] < blocksNeeded ?
                          deviceProp.maxGridSize[0] : blocksNeeded;
    dim3 block(BDIM, 1);
    dim3 grid(gridWidth, 1);
    printf("(grid, block) %d,%d \n ", grid.x, block.x);

    // Launch stencil_1d() kernel on GPU; pointers are offset by RADIUS so
    // that the kernel's halo reads stay inside the buffers
    stencil_1d<<<grid, block>>>(d_in + RADIUS, d_out + RADIUS, isize);
    CHECK(cudaGetLastError());

    // Copy result back to host
    CHECK(cudaMemcpy(gpuRef, d_out, nBytes, cudaMemcpyDeviceToHost));

    // apply cpu stencil
    cpu_stencil_1d(h_in, hostRef, isize);

    // check results
    checkResult(hostRef, gpuRef, isize);

    // print out results
    if(iprint)
    {
        printData(gpuRef, isize);
        printData(hostRef, isize);
    }

    // Cleanup
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));
    free(h_in);
    free(hostRef);
    free(gpuRef);

    // reset device
    CHECK(cudaDeviceReset());
    return EXIT_SUCCESS;
}
198 | 


--------------------------------------------------------------------------------
/chapter06/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=asyncAPI simpleCallback simpleHyperqBreadth simpleHyperqDependence \
 2 |         simpleHyperqDepth simpleHyperqOpenmp simpleMultiAddBreadth \
 3 |         simpleMultiAddDepth
 4 | C_APPS=
 5 | 
 6 | all: ${C_APPS} ${CU_APPS}
 7 | 
 8 | %: %.cu
 9 | 	nvcc -O2 -arch=sm_20 -Xcompiler -fopenmp -o $@ $< -lgomp
10 | %: %.c
11 | 	gcc -O2 -std=c99 -o $@ $<
12 | clean:
13 | 	rm -f ${CU_APPS} ${C_APPS}
14 | 


--------------------------------------------------------------------------------
/chapter06/asyncAPI.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | #include <cuda_runtime.h>
 4 | 
 5 | /*
 6 |  * An example of using CUDA events to control asynchronous work launched on the
 7 |  * GPU. In this example, asynchronous copies and an asynchronous kernel are
 8 |  * used. A CUDA event is used to determine when that work has completed.
 9 |  */
10 | 
// Adds `value` to one element of g_data per thread. The element index is the
// thread's flat 1D global index; there is no bounds check, so the launch must
// not create more threads than g_data has elements.
__global__ void kernel(float *g_data, float value)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const float updated = g_data[gid] + value;

    g_data[gid] = updated;
}
16 | 
/*
 * Check that every one of the first n elements of data equals x exactly.
 * Returns 1 when they all do (including n <= 0); otherwise prints the first
 * offending element and returns 0.
 */
int checkResult(float *data, const int n, const float x)
{
    int k = 0;

    // advance past the leading run of matching elements
    while (k < n && !(data[k] != x))
    {
        ++k;
    }

    if (k >= n)
    {
        return 1;
    }

    printf("Error! data[%d] = %f, ref = %f\n", k, data[k], x);
    return 0;
}
30 | 
/*
 * Issues an asynchronous host-to-device copy, a kernel and a device-to-host
 * copy to the default stream, records an event behind them and counts how
 * many times the CPU can poll that event before the GPU work has finished.
 * Exits with EXIT_SUCCESS when every element of the result equals `value`,
 * EXIT_FAILURE otherwise.
 */
int main(int argc, char *argv[])
{
    int devID = 0;
    cudaDeviceProp deviceProps;
    CHECK(cudaGetDeviceProperties(&deviceProps, devID));
    printf("> %s running on", argv[0]);
    printf(" CUDA device [%s]\n", deviceProps.name);
    CHECK(cudaSetDevice(devID));

    int num = 1 << 24;
    // the buffers hold floats, so size them with sizeof(float)
    size_t nbytes = (size_t)num * sizeof(float);
    float value = 10.0f;

    // allocate pinned host memory
    float *h_a = 0;
    CHECK(cudaMallocHost((void **)&h_a, nbytes));
    memset(h_a, 0, nbytes);

    // allocate device memory
    float *d_a = 0;
    CHECK(cudaMalloc((void **)&d_a, nbytes));
    CHECK(cudaMemset(d_a, 255, nbytes));

    // set kernel launch configuration; num is a multiple of the block size,
    // which matters because the kernel has no bounds check
    dim3 block = dim3(512);
    dim3 grid  = dim3((num + block.x - 1) / block.x);

    // create cuda event handles
    cudaEvent_t stop;
    CHECK(cudaEventCreate(&stop));

    // asynchronously issue work to the GPU (all to stream 0)
    CHECK(cudaMemcpyAsync(d_a, h_a, nbytes, cudaMemcpyHostToDevice));
    kernel<<<grid, block>>>(d_a, value);
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpyAsync(h_a, d_a, nbytes, cudaMemcpyDeviceToHost));
    CHECK(cudaEventRecord(stop));

    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    cudaError_t status;

    while ((status = cudaEventQuery(stop)) == cudaErrorNotReady) {
        counter++;
    }

    // the loop also ends when the query reports a real error; surface it
    CHECK(status);

    // print the number of polling iterations
    printf("CPU executed %lu iterations while waiting for GPU to finish\n",
           counter);

    // check the output for correctness
    bool bFinalResults = (bool) checkResult(h_a, num, value);

    // release resources
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFreeHost(h_a));
    CHECK(cudaFree(d_a));

    CHECK(cudaDeviceReset());

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
90 | 


--------------------------------------------------------------------------------
/chapter06/simpleCallback.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <cuda_runtime.h>
  4 | 
  5 | /*
  6 |  * An example of using CUDA callbacks to trigger work on the host after the
  7 |  * completion of asynchronous work on the device. In this example, n_streams
  8 |  * CUDA streams are created and 4 kernels are launched asynchronously in each.
  9 |  * Then, a callback is added at the completion of those asynchronous kernels
 10 |  * that prints diagnostic information.
 11 |  */
 12 | 
 13 | #define N 100000
 14 | #define NSTREAM 4
 15 | 
 16 | void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void *data)
 17 | {
 18 |     printf("callback from stream %d\n", *((int *)data));
 19 | }
 20 | 
 21 | __global__ void kernel_1()
 22 | {
 23 |     double sum = 0.0;
 24 | 
 25 |     for(int i = 0; i < N; i++)
 26 |     {
 27 |         sum = sum + tan(0.1) * tan(0.1);
 28 |     }
 29 | }
 30 | 
 31 | __global__ void kernel_2()
 32 | {
 33 |     double sum = 0.0;
 34 | 
 35 |     for(int i = 0; i < N; i++)
 36 |     {
 37 |         sum = sum + tan(0.1) * tan(0.1);
 38 |     }
 39 | }
 40 | 
 41 | __global__ void kernel_3()
 42 | {
 43 |     double sum = 0.0;
 44 | 
 45 |     for(int i = 0; i < N; i++)
 46 |     {
 47 |         sum = sum + tan(0.1) * tan(0.1);
 48 |     }
 49 | }
 50 | 
 51 | __global__ void kernel_4()
 52 | {
 53 |     double sum = 0.0;
 54 | 
 55 |     for(int i = 0; i < N; i++)
 56 |     {
 57 |         sum = sum + tan(0.1) * tan(0.1);
 58 |     }
 59 | }
 60 | 
 61 | int main(int argc, char **argv)
 62 | {
 63 |     int n_streams = NSTREAM;
 64 | 
 65 |     if (argc > 1) n_streams = atoi(argv[1]);
 66 | 
 67 |     int dev = 0;
 68 |     cudaDeviceProp deviceProp;
 69 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
 70 |     printf("> %s Starting...\n", argv[0]);
 71 |     printf("> Using Device %d: %s\n", dev, deviceProp.name);
 72 |     CHECK(cudaSetDevice(dev));
 73 | 
 74 |     // check if device support hyper-q
 75 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
 76 |     {
 77 |         if (deviceProp.concurrentKernels == 0)
 78 |         {
 79 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 "
 80 |                    "or higher required)\n");
 81 |             printf("> CUDA kernel runs will be serialized\n");
 82 |         }
 83 |         else
 84 |         {
 85 |             printf("> GPU does not support HyperQ\n");
 86 |             printf("> CUDA kernel runs will have limited concurrency\n");
 87 |         }
 88 |     }
 89 | 
 90 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
 91 |            deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
 92 | 
 93 |     // set up max connectioin
 94 |     char * iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 95 |     setenv (iname, "8", 1);
 96 |     char *ivalue =  getenv (iname);
 97 |     printf ("> %s = %s\n", iname, ivalue);
 98 |     printf ("> with streams = %d\n", n_streams);
 99 | 
100 |     // Allocate and initialize an array of stream handles
101 |     cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
102 |                                 cudaStream_t));
103 | 
104 |     for (int i = 0 ; i < n_streams ; i++)
105 |     {
106 |         CHECK(cudaStreamCreate(&(streams[i])));
107 |     }
108 | 
109 |     dim3 block (1);
110 |     dim3 grid  (1);
111 |     cudaEvent_t start_event, stop_event;
112 |     CHECK(cudaEventCreate(&start_event));
113 |     CHECK(cudaEventCreate(&stop_event));
114 | 
115 |     int stream_ids[n_streams];
116 | 
117 |     CHECK(cudaEventRecord(start_event, 0));
118 | 
119 |     for (int i = 0; i < n_streams; i++)
120 |     {
121 |         stream_ids[i] = i;
122 |         kernel_1<<<grid, block, 0, streams[i]>>>();
123 |         kernel_2<<<grid, block, 0, streams[i]>>>();
124 |         kernel_3<<<grid, block, 0, streams[i]>>>();
125 |         kernel_4<<<grid, block, 0, streams[i]>>>();
126 |         CHECK(cudaStreamAddCallback(streams[i], my_callback,
127 |                     (void *)(stream_ids + i), 0));
128 |     }
129 | 
130 |     CHECK(cudaEventRecord(stop_event, 0));
131 |     CHECK(cudaEventSynchronize(stop_event));
132 | 
133 |     float elapsed_time;
134 |     CHECK(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
135 |     printf("Measured time for parallel execution = %.3fs\n",
136 |            elapsed_time / 1000.0f);
137 | 
138 |     // release all stream
139 |     for (int i = 0 ; i < n_streams ; i++)
140 |     {
141 |         CHECK(cudaStreamDestroy(streams[i]));
142 |     }
143 | 
144 |     free(streams);
145 | 
146 |     /*
147 |      * cudaDeviceReset must be called before exiting in order for profiling and
148 |      * tracing tools such as Nsight and Visual Profiler to show complete traces.
149 |      */
150 |     CHECK(cudaDeviceReset());
151 | 
152 |     return 0;
153 | }
154 | 


--------------------------------------------------------------------------------
/chapter06/simpleHyperqBreadth.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <cuda_runtime.h>
  4 | #include <stdlib.h>
  5 | 
  6 | /*
  7 |  * This example demonstrates submitting work to a CUDA stream in breadth-first
  8 |  * order. Work submission in breadth-first order prevents false-dependencies
  9 |  * from reducing the parallelism of an application. kernel_1, kernel_2,
 10 |  * kernel_3, and kernel_4 simply implement identical, dummy computation.
 11 |  * Separate kernels are used to make the scheduling of these kernels simpler to
 12 |  * visualize in the Visual Profiler.
 13 |  */
 14 | 
 15 | #define N 300000
 16 | #define NSTREAM 4
 17 | 
 18 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 19 | __global__ void kernel_1()
 20 | {
 21 |     double acc = 0.0;
 22 |     int left = N;
 23 | 
 24 |     while (left-- > 0)
 25 |         acc += tan(0.1) * tan(0.1);
 26 | }
 27 | 
 28 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 29 | __global__ void kernel_2()
 30 | {
 31 |     double acc = 0.0;
 32 |     int left = N;
 33 | 
 34 |     while (left-- > 0)
 35 |         acc += tan(0.1) * tan(0.1);
 36 | }
 37 | 
 38 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 39 | __global__ void kernel_3()
 40 | {
 41 |     double acc = 0.0;
 42 |     int left = N;
 43 | 
 44 |     while (left-- > 0)
 45 |         acc += tan(0.1) * tan(0.1);
 46 | }
 47 | 
 48 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 49 | __global__ void kernel_4()
 50 | {
 51 |     double acc = 0.0;
 52 |     int left = N;
 53 | 
 54 |     while (left-- > 0)
 55 |         acc += tan(0.1) * tan(0.1);
 56 | }
 57 | 
 58 | /*
 59 |  * Launch kernel_1..kernel_4 into n_streams streams in breadth-first order and
 60 |  * report the elapsed time measured with CUDA events.
 61 |  *
 62 |  * Usage: <program> [n_streams] [bigcase]
 63 |  *   n_streams  number of streams to create (default NSTREAM, must be >= 1)
 64 |  *   bigcase    pass 1 to launch 8 blocks of 512 threads instead of 1 thread
 65 |  */
 66 | int main(int argc, char **argv)
 67 | {
 68 |     int n_streams = NSTREAM;
 69 |     int isize = 1;
 70 |     int iblock = 1;
 71 |     int bigcase = 0;
 72 | 
 73 |     // get arguments from the command line
 74 |     if (argc > 1) n_streams = atoi(argv[1]);
 75 | 
 76 |     if (argc > 2) bigcase = atoi(argv[2]);
 77 | 
 78 |     // atoi returns 0 for non-numeric text, so reject unusable stream counts
 79 |     if (n_streams < 1)
 80 |     {
 81 |         fprintf(stderr, "number of streams must be >= 1 (got %d)\n",
 82 |                 n_streams);
 83 |         return EXIT_FAILURE;
 84 |     }
 85 | 
 86 |     float elapsed_time;
 87 | 
 88 |     // set up max connections (const: string literals are not writable)
 89 |     const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 90 | 
 91 |     if (setenv (iname, "32", 1) != 0)
 92 |     {
 93 |         perror("setenv");
 94 |         return EXIT_FAILURE;
 95 |     }
 96 | 
 97 |     const char *ivalue = getenv (iname);
 98 |     printf ("%s = %s\n", iname, ivalue);
 99 | 
100 |     int dev = 0;
101 |     cudaDeviceProp deviceProp;
102 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
103 |     printf("> Using Device %d: %s with num_streams %d\n", dev, deviceProp.name,
104 |            n_streams);
105 |     CHECK(cudaSetDevice(dev));
106 | 
107 |     // check if the device supports Hyper-Q
108 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
109 |     {
110 |         if (deviceProp.concurrentKernels == 0)
111 |         {
112 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 "
113 |                     "or higher required)\n");
114 |             printf("> CUDA kernel runs will be serialized\n");
115 |         }
116 |         else
117 |         {
118 |             printf("> GPU does not support HyperQ\n");
119 |             printf("> CUDA kernel runs will have limited concurrency\n");
120 |         }
121 |     }
122 | 
123 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
124 |            deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
125 | 
126 |     // Allocate and initialize an array of stream handles
127 |     cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
128 |                                 cudaStream_t));
129 | 
130 |     if (streams == NULL)
131 |     {
132 |         fprintf(stderr, "failed to allocate %d stream handles\n", n_streams);
133 |         return EXIT_FAILURE;
134 |     }
135 | 
136 |     for (int i = 0 ; i < n_streams ; i++)
137 |     {
138 |         CHECK(cudaStreamCreate(&(streams[i])));
139 |     }
140 | 
141 |     // run kernel with more threads
142 |     if (bigcase == 1)
143 |     {
144 |         iblock = 512;
145 |         isize = 1 << 12;
146 |     }
147 | 
148 |     // set up execution configuration
149 |     dim3 block (iblock);
150 |     dim3 grid  (isize / iblock);
151 |     printf("> grid %d block %d\n", grid.x, block.x);
152 | 
153 |     // create events
154 |     cudaEvent_t start, stop;
155 |     CHECK(cudaEventCreate(&start));
156 |     CHECK(cudaEventCreate(&stop));
157 | 
158 |     // record start event
159 |     CHECK(cudaEventRecord(start, 0));
160 | 
161 |     // dispatch job with breadth first ordering
162 |     for (int i = 0; i < n_streams; i++)
163 |         kernel_1<<<grid, block, 0, streams[i]>>>();
164 | 
165 |     for (int i = 0; i < n_streams; i++)
166 |         kernel_2<<<grid, block, 0, streams[i]>>>();
167 | 
168 |     for (int i = 0; i < n_streams; i++)
169 |         kernel_3<<<grid, block, 0, streams[i]>>>();
170 | 
171 |     for (int i = 0; i < n_streams; i++)
172 |         kernel_4<<<grid, block, 0, streams[i]>>>();
173 | 
174 |     // kernel launches return no status; fetch any launch failure explicitly
175 |     CHECK(cudaGetLastError());
176 | 
177 |     // record stop event
178 |     CHECK(cudaEventRecord(stop, 0));
179 |     CHECK(cudaEventSynchronize(stop));
180 | 
181 |     // calculate elapsed time
182 |     CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
183 |     printf("Measured time for parallel execution = %.3fs\n",
184 |            elapsed_time / 1000.0f);
185 | 
186 |     // release all streams
187 |     for (int i = 0 ; i < n_streams ; i++)
188 |     {
189 |         CHECK(cudaStreamDestroy(streams[i]));
190 |     }
191 | 
192 |     free(streams);
193 | 
194 |     // destroy events
195 |     CHECK(cudaEventDestroy(start));
196 |     CHECK(cudaEventDestroy(stop));
197 | 
198 |     // reset device
199 |     CHECK(cudaDeviceReset());
200 | 
201 |     return 0;
202 | }
203 | 


--------------------------------------------------------------------------------
/chapter06/simpleHyperqDependence.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <cuda_runtime.h>
  4 | #include <stdlib.h>
  5 | 
  6 | /*
  7 |  * A simple example of adding inter-stream dependencies using
  8 |  * cudaStreamWaitEvent. This code launches 4 kernels in each of n_streams
  9 |  * streams. An event is recorded at the completion of each stream (kernelEvent).
 10 |  * cudaStreamWaitEvent is then called on that event and the last stream
 11 |  * (streams[n_streams - 1]) to force all computation in the final stream to only
 12 |  * execute when all other streams have completed.
 13 |  */
 14 | 
 15 | #define N 300000
 16 | #define NSTREAM 4
 17 | 
 18 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 19 | __global__ void kernel_1()
 20 | {
 21 |     double acc = 0.0;
 22 |     int left = N;
 23 | 
 24 |     while (left-- > 0)
 25 |         acc += tan(0.1) * tan(0.1);
 26 | }
 27 | 
 28 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 29 | __global__ void kernel_2()
 30 | {
 31 |     double acc = 0.0;
 32 |     int left = N;
 33 | 
 34 |     while (left-- > 0)
 35 |         acc += tan(0.1) * tan(0.1);
 36 | }
 37 | 
 38 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 39 | __global__ void kernel_3()
 40 | {
 41 |     double acc = 0.0;
 42 |     int left = N;
 43 | 
 44 |     while (left-- > 0)
 45 |         acc += tan(0.1) * tan(0.1);
 46 | }
 47 | 
 48 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 49 | __global__ void kernel_4()
 50 | {
 51 |     double acc = 0.0;
 52 |     int left = N;
 53 | 
 54 |     while (left-- > 0)
 55 |         acc += tan(0.1) * tan(0.1);
 56 | }
 57 | 
 58 | /*
 59 |  * Launch kernel_1..kernel_4 into n_streams streams, make the last stream wait
 60 |  * on an event recorded in every stream, and report the elapsed time.
 61 |  *
 62 |  * Usage: <program> [n_streams] [bigcase]
 63 |  *   n_streams  number of streams to create (default NSTREAM, must be >= 1)
 64 |  *   bigcase    pass 1 to launch 8 blocks of 512 threads instead of 1 thread
 65 |  */
 66 | int main(int argc, char **argv)
 67 | {
 68 |     int n_streams = NSTREAM;
 69 |     int isize = 1;
 70 |     int iblock = 1;
 71 |     int bigcase = 0;
 72 | 
 73 |     // get arguments from the command line
 74 |     if (argc > 1) n_streams = atoi(argv[1]);
 75 | 
 76 |     if (argc > 2) bigcase = atoi(argv[2]);
 77 | 
 78 |     // atoi returns 0 for non-numeric text, so reject unusable stream counts
 79 |     if (n_streams < 1)
 80 |     {
 81 |         fprintf(stderr, "number of streams must be >= 1 (got %d)\n",
 82 |                 n_streams);
 83 |         return EXIT_FAILURE;
 84 |     }
 85 | 
 86 |     float elapsed_time;
 87 | 
 88 |     // set up max connections (const: string literals are not writable)
 89 |     const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 90 | 
 91 |     if (setenv (iname, "32", 1) != 0)
 92 |     {
 93 |         perror("setenv");
 94 |         return EXIT_FAILURE;
 95 |     }
 96 | 
 97 |     const char *ivalue = getenv (iname);
 98 |     printf ("%s = %s\n", iname, ivalue);
 99 | 
100 |     int dev = 0;
101 |     cudaDeviceProp deviceProp;
102 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
103 |     printf("> Using Device %d: %s with num_streams %d\n", dev, deviceProp.name,
104 |            n_streams);
105 |     CHECK(cudaSetDevice(dev));
106 | 
107 |     // check if the device supports Hyper-Q
108 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
109 |     {
110 |         if (deviceProp.concurrentKernels == 0)
111 |         {
112 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 "
113 |                     "or higher required)\n");
114 |             printf("> CUDA kernel runs will be serialized\n");
115 |         }
116 |         else
117 |         {
118 |             printf("> GPU does not support HyperQ\n");
119 |             printf("> CUDA kernel runs will have limited concurrency\n");
120 |         }
121 |     }
122 | 
123 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
124 |            deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
125 | 
126 |     // Allocate and initialize an array of stream handles
127 |     cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
128 |                                 cudaStream_t));
129 | 
130 |     if (streams == NULL)
131 |     {
132 |         fprintf(stderr, "failed to allocate %d stream handles\n", n_streams);
133 |         return EXIT_FAILURE;
134 |     }
135 | 
136 |     for (int i = 0 ; i < n_streams ; i++)
137 |     {
138 |         CHECK(cudaStreamCreate(&(streams[i])));
139 |     }
140 | 
141 |     // run kernel with more threads
142 |     if (bigcase == 1)
143 |     {
144 |         iblock = 512;
145 |         isize = 1 << 12;
146 |     }
147 | 
148 |     // set up execution configuration
149 |     dim3 block (iblock);
150 |     dim3 grid  (isize / iblock);
151 |     printf("> grid %d block %d\n", grid.x, block.x);
152 | 
153 |     // create timing events
154 |     cudaEvent_t start, stop;
155 |     CHECK(cudaEventCreate(&start));
156 |     CHECK(cudaEventCreate(&stop));
157 | 
158 |     // one event per stream, created with timing disabled
159 |     cudaEvent_t *kernelEvent;
160 |     kernelEvent = (cudaEvent_t *) malloc(n_streams * sizeof(cudaEvent_t));
161 | 
162 |     if (kernelEvent == NULL)
163 |     {
164 |         fprintf(stderr, "failed to allocate %d event handles\n", n_streams);
165 |         return EXIT_FAILURE;
166 |     }
167 | 
168 |     for (int i = 0; i < n_streams; i++)
169 |     {
170 |         CHECK(cudaEventCreateWithFlags(&(kernelEvent[i]),
171 |                     cudaEventDisableTiming));
172 |     }
173 | 
174 |     // record start event
175 |     CHECK(cudaEventRecord(start, 0));
176 | 
177 |     // dispatch job with depth first ordering
178 |     for (int i = 0; i < n_streams; i++)
179 |     {
180 |         kernel_1<<<grid, block, 0, streams[i]>>>();
181 |         kernel_2<<<grid, block, 0, streams[i]>>>();
182 |         kernel_3<<<grid, block, 0, streams[i]>>>();
183 |         kernel_4<<<grid, block, 0, streams[i]>>>();
184 | 
185 |         // mark the end of stream i and make the last stream wait for it
186 |         CHECK(cudaEventRecord(kernelEvent[i], streams[i]));
187 |         CHECK(cudaStreamWaitEvent(streams[n_streams - 1], kernelEvent[i], 0));
188 |     }
189 | 
190 |     // kernel launches return no status; fetch any launch failure explicitly
191 |     CHECK(cudaGetLastError());
192 | 
193 |     // record stop event
194 |     CHECK(cudaEventRecord(stop, 0));
195 |     CHECK(cudaEventSynchronize(stop));
196 | 
197 |     // calculate elapsed time
198 |     CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
199 |     printf("Measured time for parallel execution = %.3fs\n",
200 |            elapsed_time / 1000.0f);
201 | 
202 |     // release all streams and their per-stream events
203 |     for (int i = 0 ; i < n_streams ; i++)
204 |     {
205 |         CHECK(cudaStreamDestroy(streams[i]));
206 |         CHECK(cudaEventDestroy(kernelEvent[i]));
207 |     }
208 | 
209 |     free(streams);
210 |     free(kernelEvent);
211 | 
212 |     // destroy timing events
213 |     CHECK(cudaEventDestroy(start));
214 |     CHECK(cudaEventDestroy(stop));
215 | 
216 |     // reset device
217 |     CHECK(cudaDeviceReset());
218 | 
219 |     return 0;
220 | }
221 | 


--------------------------------------------------------------------------------
/chapter06/simpleHyperqDepth.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <cuda_runtime.h>
  4 | #include <stdlib.h>
  5 | 
  6 | /*
  7 |  * This example demonstrates submitting work to a CUDA stream in depth-first
  8 |  * order. Work submission in depth-first order may introduce false-dependencies
  9 |  * between unrelated tasks in different CUDA streams, limiting the parallelism
 10 |  * of a CUDA application. kernel_1, kernel_2, kernel_3, and kernel_4 simply
 11 |  * implement identical, dummy computation. Separate kernels are used to make the
 12 |  * scheduling of these kernels simpler to visualize in the Visual Profiler.
 13 |  */
 14 | 
 15 | #define N 300000
 16 | #define NSTREAM 4
 17 | 
 18 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 19 | __global__ void kernel_1()
 20 | {
 21 |     double acc = 0.0;
 22 |     int left = N;
 23 | 
 24 |     while (left-- > 0)
 25 |         acc += tan(0.1) * tan(0.1);
 26 | }
 27 | 
 28 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 29 | __global__ void kernel_2()
 30 | {
 31 |     double acc = 0.0;
 32 |     int left = N;
 33 | 
 34 |     while (left-- > 0)
 35 |         acc += tan(0.1) * tan(0.1);
 36 | }
 37 | 
 38 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 39 | __global__ void kernel_3()
 40 | {
 41 |     double acc = 0.0;
 42 |     int left = N;
 43 | 
 44 |     while (left-- > 0)
 45 |         acc += tan(0.1) * tan(0.1);
 46 | }
 47 | 
 48 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 49 | __global__ void kernel_4()
 50 | {
 51 |     double acc = 0.0;
 52 |     int left = N;
 53 | 
 54 |     while (left-- > 0)
 55 |         acc += tan(0.1) * tan(0.1);
 56 | }
 57 | 
 58 | /*
 59 |  * Launch kernel_1..kernel_4 into n_streams streams in depth-first order and
 60 |  * report the elapsed time measured with CUDA events.
 61 |  *
 62 |  * Usage: <program> [n_streams] [bigcase]
 63 |  *   n_streams  number of streams to create (default NSTREAM, must be >= 1)
 64 |  *   bigcase    pass 1 to launch 8 blocks of 512 threads instead of 1 thread
 65 |  */
 66 | int main(int argc, char **argv)
 67 | {
 68 |     int n_streams = NSTREAM;
 69 |     int isize = 1;
 70 |     int iblock = 1;
 71 |     int bigcase = 0;
 72 | 
 73 |     // get arguments from the command line
 74 |     if (argc > 1) n_streams = atoi(argv[1]);
 75 | 
 76 |     if (argc > 2) bigcase = atoi(argv[2]);
 77 | 
 78 |     // atoi returns 0 for non-numeric text, so reject unusable stream counts
 79 |     if (n_streams < 1)
 80 |     {
 81 |         fprintf(stderr, "number of streams must be >= 1 (got %d)\n",
 82 |                 n_streams);
 83 |         return EXIT_FAILURE;
 84 |     }
 85 | 
 86 |     float elapsed_time;
 87 | 
 88 |     // set up max connections (const: string literals are not writable)
 89 |     const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 90 | 
 91 |     if (setenv (iname, "32", 1) != 0)
 92 |     {
 93 |         perror("setenv");
 94 |         return EXIT_FAILURE;
 95 |     }
 96 | 
 97 |     const char *ivalue = getenv (iname);
 98 |     printf ("%s = %s\n", iname, ivalue);
 99 | 
100 |     int dev = 0;
101 |     cudaDeviceProp deviceProp;
102 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
103 |     printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name,
104 |            n_streams);
105 |     CHECK(cudaSetDevice(dev));
106 | 
107 |     // check if the device supports Hyper-Q
108 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
109 |     {
110 |         if (deviceProp.concurrentKernels == 0)
111 |         {
112 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 "
113 |                     "or higher required)\n");
114 |             printf("> CUDA kernel runs will be serialized\n");
115 |         }
116 |         else
117 |         {
118 |             printf("> GPU does not support HyperQ\n");
119 |             printf("> CUDA kernel runs will have limited concurrency\n");
120 |         }
121 |     }
122 | 
123 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
124 |            deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
125 | 
126 |     // Allocate and initialize an array of stream handles
127 |     cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
128 |                                 cudaStream_t));
129 | 
130 |     if (streams == NULL)
131 |     {
132 |         fprintf(stderr, "failed to allocate %d stream handles\n", n_streams);
133 |         return EXIT_FAILURE;
134 |     }
135 | 
136 |     for (int i = 0 ; i < n_streams ; i++)
137 |     {
138 |         CHECK(cudaStreamCreate(&(streams[i])));
139 |     }
140 | 
141 |     // run kernel with more threads
142 |     if (bigcase == 1)
143 |     {
144 |         iblock = 512;
145 |         isize = 1 << 12;
146 |     }
147 | 
148 |     // set up execution configuration
149 |     dim3 block (iblock);
150 |     dim3 grid  (isize / iblock);
151 |     printf("> grid %d block %d\n", grid.x, block.x);
152 | 
153 |     // create events
154 |     cudaEvent_t start, stop;
155 |     CHECK(cudaEventCreate(&start));
156 |     CHECK(cudaEventCreate(&stop));
157 | 
158 |     // record start event
159 |     CHECK(cudaEventRecord(start, 0));
160 | 
161 |     // dispatch job with depth first ordering
162 |     for (int i = 0; i < n_streams; i++)
163 |     {
164 |         kernel_1<<<grid, block, 0, streams[i]>>>();
165 |         kernel_2<<<grid, block, 0, streams[i]>>>();
166 |         kernel_3<<<grid, block, 0, streams[i]>>>();
167 |         kernel_4<<<grid, block, 0, streams[i]>>>();
168 |     }
169 | 
170 |     // kernel launches return no status; fetch any launch failure explicitly
171 |     CHECK(cudaGetLastError());
172 | 
173 |     // record stop event
174 |     CHECK(cudaEventRecord(stop, 0));
175 |     CHECK(cudaEventSynchronize(stop));
176 | 
177 |     // calculate elapsed time
178 |     CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
179 |     printf("Measured time for parallel execution = %.3fs\n",
180 |            elapsed_time / 1000.0f);
181 | 
182 |     // release all streams
183 |     for (int i = 0 ; i < n_streams ; i++)
184 |     {
185 |         CHECK(cudaStreamDestroy(streams[i]));
186 |     }
187 | 
188 |     free(streams);
189 | 
190 |     // destroy events
191 |     CHECK(cudaEventDestroy(start));
192 |     CHECK(cudaEventDestroy(stop));
193 | 
194 |     // reset device
195 |     CHECK(cudaDeviceReset());
196 | 
197 |     return 0;
198 | }
199 | 


--------------------------------------------------------------------------------
/chapter06/simpleHyperqOpenmp.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <cuda_runtime.h>
  4 | #include <stdlib.h>
  5 | #include <omp.h>
  6 | 
  7 | /*
  8 |  * An example of using OpenMP to parallelize the creation of CUDA work in
  9 |  * multiple streams. This example uses n_streams OpenMP threads to launch 4
 10 |  * kernels in each stream. Note the new pragma introduced, #pragma omp parallel.
 11 |  */
 12 | 
 13 | #define N 300000
 14 | #define NSTREAM 4
 15 | 
 16 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 17 | __global__ void kernel_1()
 18 | {
 19 |     double acc = 0.0;
 20 |     int left = N;
 21 | 
 22 |     while (left-- > 0)
 23 |         acc += tan(0.1) * tan(0.1);
 24 | }
 25 | 
 26 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 27 | __global__ void kernel_2()
 28 | {
 29 |     double acc = 0.0;
 30 |     int left = N;
 31 | 
 32 |     while (left-- > 0)
 33 |         acc += tan(0.1) * tan(0.1);
 34 | }
 35 | 
 36 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 37 | __global__ void kernel_3()
 38 | {
 39 |     double acc = 0.0;
 40 |     int left = N;
 41 | 
 42 |     while (left-- > 0)
 43 |         acc += tan(0.1) * tan(0.1);
 44 | }
 45 | 
 46 | // Dummy load: adds tan(0.1) * tan(0.1) to a local N times; nothing is stored.
 47 | __global__ void kernel_4()
 48 | {
 49 |     double acc = 0.0;
 50 |     int left = N;
 51 | 
 52 |     while (left-- > 0)
 53 |         acc += tan(0.1) * tan(0.1);
 54 | }
 55 | 
 56 | /*
 57 |  * Launch kernel_1..kernel_4 into each of n_streams streams from OpenMP threads
 58 |  * and report the elapsed time measured with CUDA events.
 59 |  *
 60 |  * Usage: <program> [n_streams] [bigcase]
 61 |  *   n_streams  number of streams to create (default NSTREAM, must be >= 1)
 62 |  *   bigcase    pass 1 to launch 8 blocks of 512 threads instead of 1 thread
 63 |  */
 64 | int main(int argc, char **argv)
 65 | {
 66 |     int n_streams = NSTREAM;
 67 |     int isize = 1;
 68 |     int iblock = 1;
 69 |     int bigcase = 0;
 70 | 
 71 |     // get arguments from the command line
 72 |     if (argc > 1) n_streams = atoi(argv[1]);
 73 | 
 74 |     if (argc > 2) bigcase = atoi(argv[2]);
 75 | 
 76 |     // atoi returns 0 for non-numeric text, so reject unusable stream counts
 77 |     if (n_streams < 1)
 78 |     {
 79 |         fprintf(stderr, "number of streams must be >= 1 (got %d)\n",
 80 |                 n_streams);
 81 |         return EXIT_FAILURE;
 82 |     }
 83 | 
 84 |     float elapsed_time;
 85 | 
 86 |     // set up max connections (const: string literals are not writable)
 87 |     const char *iname = "CUDA_DEVICE_MAX_CONNECTIONS";
 88 | 
 89 |     if (setenv (iname, "32", 1) != 0)
 90 |     {
 91 |         perror("setenv");
 92 |         return EXIT_FAILURE;
 93 |     }
 94 | 
 95 |     const char *ivalue = getenv (iname);
 96 |     printf ("%s = %s\n", iname, ivalue);
 97 | 
 98 |     int dev = 0;
 99 |     cudaDeviceProp deviceProp;
100 |     CHECK(cudaGetDeviceProperties(&deviceProp, dev));
101 |     printf("> Using Device %d: %s with num_streams=%d\n", dev, deviceProp.name,
102 |            n_streams);
103 |     CHECK(cudaSetDevice(dev));
104 | 
105 |     // check if the device supports Hyper-Q
106 |     if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5))
107 |     {
108 |         if (deviceProp.concurrentKernels == 0)
109 |         {
110 |             printf("> GPU does not support concurrent kernel execution (SM 3.5 "
111 |                     "or higher required)\n");
112 |             printf("> CUDA kernel runs will be serialized\n");
113 |         }
114 |         else
115 |         {
116 |             printf("> GPU does not support HyperQ\n");
117 |             printf("> CUDA kernel runs will have limited concurrency\n");
118 |         }
119 |     }
120 | 
121 |     printf("> Compute Capability %d.%d hardware with %d multi-processors\n",
122 |            deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
123 | 
124 |     // Allocate and initialize an array of stream handles
125 |     cudaStream_t *streams = (cudaStream_t *) malloc(n_streams * sizeof(
126 |                                 cudaStream_t));
127 | 
128 |     if (streams == NULL)
129 |     {
130 |         fprintf(stderr, "failed to allocate %d stream handles\n", n_streams);
131 |         return EXIT_FAILURE;
132 |     }
133 | 
134 |     for (int i = 0 ; i < n_streams ; i++)
135 |     {
136 |         CHECK(cudaStreamCreate(&(streams[i])));
137 |     }
138 | 
139 |     // run kernel with more threads
140 |     if (bigcase == 1)
141 |     {
142 |         iblock = 512;
143 |         isize = 1 << 12;
144 |     }
145 | 
146 |     // set up execution configuration
147 |     dim3 block (iblock);
148 |     dim3 grid  (isize / iblock);
149 |     printf("> grid %d block %d\n", grid.x, block.x);
150 | 
151 |     // create events
152 |     cudaEvent_t start, stop;
153 |     CHECK(cudaEventCreate(&start));
154 |     CHECK(cudaEventCreate(&stop));
155 | 
156 |     // record start event
157 |     CHECK(cudaEventRecord(start, 0));
158 | 
159 |     // dispatch job with depth first ordering using OpenMP. A worksharing
160 |     // loop (one stream per iteration) still covers every stream when the
161 |     // runtime grants fewer threads than requested.
162 |     omp_set_num_threads(n_streams);
163 |     #pragma omp parallel for schedule(static, 1)
164 |     for (int i = 0; i < n_streams; i++)
165 |     {
166 |         kernel_1<<<grid, block, 0, streams[i]>>>();
167 |         kernel_2<<<grid, block, 0, streams[i]>>>();
168 |         kernel_3<<<grid, block, 0, streams[i]>>>();
169 |         kernel_4<<<grid, block, 0, streams[i]>>>();
170 | 
171 |         // launches return no status; check in the thread that issued them
172 |         CHECK(cudaGetLastError());
173 |     }
174 | 
175 |     // record stop event
176 |     CHECK(cudaEventRecord(stop, 0));
177 |     CHECK(cudaEventSynchronize(stop));
178 | 
179 |     // calculate elapsed time
180 |     CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
181 |     printf("Measured time for parallel execution = %.3fs\n",
182 |            elapsed_time / 1000.0f);
183 | 
184 |     // release all streams
185 |     for (int i = 0 ; i < n_streams ; i++)
186 |     {
187 |         CHECK(cudaStreamDestroy(streams[i]));
188 |     }
189 | 
190 |     free(streams);
191 | 
192 |     // destroy events
193 |     CHECK(cudaEventDestroy(start));
194 |     CHECK(cudaEventDestroy(stop));
195 | 
196 |     // reset device
197 |     CHECK(cudaDeviceReset());
198 | 
199 |     return 0;
200 | }
201 | 


--------------------------------------------------------------------------------
/chapter07/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=atomic-ordering floating-point-accuracy floating-point-perf fmad \
 2 |         intrinsic-standard-comp my-atomic-add nbody
 3 | C_APPS=
 4 | # Target GPU architecture; override on the command line, e.g. `make ARCH=sm_70`.
 5 | ARCH ?= sm_20
 6 | 
 7 | # all and clean name actions, not files.
 8 | .PHONY: all clean
 9 | 
10 | all: ${C_APPS} ${CU_APPS}
11 | 
12 | %: %.cu
13 | 	nvcc -O2 -arch=${ARCH} -o $@ $<
14 | %: %.c
15 | 	gcc -O2 -std=c99 -o $@ $<
16 | clean:
17 | 	rm -f ${CU_APPS} ${C_APPS}
18 | 


--------------------------------------------------------------------------------
/chapter07/atomic-ordering.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | 
  5 | /**
  6 |  * This example illustrates the difference between using atomic operations and
  7 |  * using unsafe accesses to increment a shared variable.
  8 |  *
  9 |  * In both the atomics() and unsafe() kernels, each thread repeatedly increments
 10 |  * a globally shared variable by 1. Each thread also stores the value it reads
 11 |  * from the shared location for the first increment.
 12 |  **/
 13 | 
 14 | /**
 15 |  * This version of the kernel uses atomic operations to safely increment a
 16 |  * shared variable from multiple threads.
 17 |  **/
 18 | __global__ void atomics(int *shared_var, int *values_read, int N, int iters)
 19 | {
 20 |     const int gid = threadIdx.x + blockDim.x * blockIdx.x;
 21 | 
 22 |     // Threads with a global index of N or more do nothing.
 23 |     if (gid < N)
 24 |     {
 25 |         // atomicAdd returns the value held before this thread's increment.
 26 |         values_read[gid] = atomicAdd(shared_var, 1);
 27 | 
 28 |         for (int remaining = iters; remaining > 0; --remaining)
 29 |             atomicAdd(shared_var, 1);
 30 |     }
 31 | }
 32 | 
 33 | /**
 34 |  * This version of the kernel performs the same increments as atomics() but in
 35 |  * an unsafe manner.
 36 |  **/
 37 | __global__ void unsafe(int *shared_var, int *values_read, int N, int iters)
 38 | {
 39 |     int i;
 40 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 41 |     // Threads with a global index of N or more exit without touching memory.
 42 |     if (tid >= N) return;
 43 |     // Plain load followed by plain store: no atomic is used for this update.
 44 |     int old = *shared_var;
 45 |     *shared_var = old + 1;
 46 |     values_read[tid] = old; // value this thread loaded before its first store
 47 |     // Repeat the same non-atomic increment iters more times.
 48 |     for (i = 0; i < iters; i++)
 49 |     {
 50 |         int old = *shared_var; // shadows the outer 'old'
 51 |         *shared_var = old + 1;
 52 |     }
 53 | }
 54 | 
 55 | /**
 56 |  * Utility function for printing the contents of an array.
 57 |  **/
 58 | static void print_read_results(int *h_arr, int *d_arr, int N,
 59 |                                const char *label)
 60 | {
 61 |     // Copy back and print at most the first ten entries of d_arr.
 62 |     const int limit = 10;
 63 |     const int count = (N < limit) ? N : limit;
 64 |     CHECK(cudaMemcpy(h_arr, d_arr, count * sizeof(int),
 65 |                      cudaMemcpyDeviceToHost));
 66 |     printf("Threads performing %s operations read values", label);
 67 | 
 68 |     for (int k = 0; k < count; ++k)
 69 |     {
 70 |         printf(" %d", h_arr[k]);
 71 |     }
 72 | 
 73 |     printf("\n");
 74 | }
 75 | 
 76 | /*
 77 |  * Runs the atomics() and unsafe() kernels `runs` times each, accumulating the
 78 |  * wall-clock time of both, then prints the totals, the final counter values of
 79 |  * the last run and the first values each kernel's threads read.
 80 |  */
 81 | int main(int argc, char **argv)
 82 | {
 83 |     int N = 64;
 84 |     int block = 32;
 85 |     int runs = 30;
 86 |     int iters = 100000;
 87 |     int r;
 88 |     int *d_shared_var;
 89 |     int h_shared_var_atomic = 0, h_shared_var_unsafe = 0;
 90 |     int *d_values_read_atomic;
 91 |     int *d_values_read_unsafe;
 92 |     int *h_values_read;
 93 | 
 94 |     // round up so every element is covered; both kernels guard tid >= N
 95 |     int grid = (N + block - 1) / block;
 96 | 
 97 |     h_values_read = (int *)malloc(N * sizeof(int));
 98 | 
 99 |     if (h_values_read == NULL)
100 |     {
101 |         fprintf(stderr, "failed to allocate host buffer\n");
102 |         return EXIT_FAILURE;
103 |     }
104 | 
105 |     CHECK(cudaMalloc((void **)&d_shared_var, sizeof(int)));
106 |     CHECK(cudaMalloc((void **)&d_values_read_atomic, N * sizeof(int)));
107 |     CHECK(cudaMalloc((void **)&d_values_read_unsafe, N * sizeof(int)));
108 | 
109 |     // time accumulated over all runs (reported as a total, not a mean)
110 |     double atomic_total_time = 0;
111 |     double unsafe_total_time = 0;
112 | 
113 |     for (r = 0; r < runs; r++)
114 |     {
115 |         double start_atomic = seconds();
116 |         CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int)));
117 |         atomics<<<grid, block>>>(d_shared_var, d_values_read_atomic, N,
118 |                                  iters);
119 |         CHECK(cudaGetLastError());
120 |         CHECK(cudaDeviceSynchronize());
121 |         atomic_total_time += seconds() - start_atomic;
122 |         CHECK(cudaMemcpy(&h_shared_var_atomic, d_shared_var, sizeof(int),
123 |                          cudaMemcpyDeviceToHost));
124 | 
125 |         double start_unsafe = seconds();
126 |         CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int)));
127 |         unsafe<<<grid, block>>>(d_shared_var, d_values_read_unsafe, N,
128 |                                 iters);
129 |         CHECK(cudaGetLastError());
130 |         CHECK(cudaDeviceSynchronize());
131 |         unsafe_total_time += seconds() - start_unsafe;
132 |         CHECK(cudaMemcpy(&h_shared_var_unsafe, d_shared_var, sizeof(int),
133 |                          cudaMemcpyDeviceToHost));
134 |     }
135 | 
136 |     printf("In total, %d runs using atomic operations took %f s\n",
137 |            runs, atomic_total_time);
138 |     printf("  Using atomic operations also produced an output of %d\n",
139 |            h_shared_var_atomic);
140 |     printf("In total, %d runs using unsafe operations took %f s\n",
141 |            runs, unsafe_total_time);
142 |     printf("  Using unsafe operations also produced an output of %d\n",
143 |            h_shared_var_unsafe);
144 | 
145 |     print_read_results(h_values_read, d_values_read_atomic, N, "atomic");
146 |     print_read_results(h_values_read, d_values_read_unsafe, N, "unsafe");
147 | 
148 |     // release device and host buffers
149 |     CHECK(cudaFree(d_shared_var));
150 |     CHECK(cudaFree(d_values_read_atomic));
151 |     CHECK(cudaFree(d_values_read_unsafe));
152 |     free(h_values_read);
153 | 
154 |     return 0;
155 | }
156 | 


--------------------------------------------------------------------------------
/chapter07/floating-point-accuracy.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | /**
 6 |  * This example demonstrates floating-point's inability to represent certain
 7 |  * values with a specific value as an example.
 8 |  *
 9 |  * In this example, the value 12.1 is stored in single- and double-precision
10 |  * floating-point variables on both the host and device. After retrieving the
11 |  * results from the device, the actual values stored are printed to 20 decimal
12 |  * places and the single- and double-precision results from the host and device
13 |  * are compared to each other to verify that host and device are equally
14 |  * accurate for the same type.
15 |  **/
16 | 
17 | /**
18 |  * Save the single- and double-precision representation of 12.1 from the device
19 |  * into global memory. That global memory is then copied back to the host for
20 |  * later analysis.
21 |  **/
22 | __global__ void kernel(float *F, double *D)
23 | {
24 |     // Only the thread with global index 0 writes; all others return at once.
25 |     const int gid = threadIdx.x + blockDim.x * blockIdx.x;
26 |     if (gid != 0) return;
27 | 
28 |     // The same literal 12.1 is stored through the float and double pointers.
29 |     *F = 12.1;
30 |     *D = 12.1;
31 | }
32 | 
33 | /*
34 |  * Stores 12.1 as float and double on host and device, prints all four stored
35 |  * values to 20 decimal places and reports whether host and device agree.
36 |  */
37 | int main(int argc, char **argv)
38 | {
39 |     float *deviceF;
40 |     float h_deviceF;
41 |     double *deviceD;
42 |     double h_deviceD;
43 | 
44 |     float hostF = 12.1;
45 |     double hostD = 12.1;
46 | 
47 |     CHECK(cudaMalloc((void **)&deviceF, sizeof(float)));
48 |     CHECK(cudaMalloc((void **)&deviceD, sizeof(double)));
49 |     kernel<<<1, 32>>>(deviceF, deviceD);
50 |     // a launch returns no status; fetch any launch failure explicitly
51 |     CHECK(cudaGetLastError());
52 |     // copy the device-side results back to the host
53 |     CHECK(cudaMemcpy(&h_deviceF, deviceF, sizeof(float),
54 |                      cudaMemcpyDeviceToHost));
55 |     CHECK(cudaMemcpy(&h_deviceD, deviceD, sizeof(double),
56 |                      cudaMemcpyDeviceToHost));
57 | 
58 |     printf("Host single-precision representation of 12.1   = %.20f\n", hostF);
59 |     printf("Host double-precision representation of 12.1   = %.20f\n", hostD);
60 |     // print the values copied back from the device, not the host copies
61 |     printf("Device single-precision representation of 12.1 = %.20f\n", h_deviceF);
62 |     printf("Device double-precision representation of 12.1 = %.20f\n", h_deviceD);
63 |     printf("Device and host single-precision representation equal? %s\n",
64 |            hostF == h_deviceF ? "yes" : "no");
65 |     printf("Device and host double-precision representation equal? %s\n",
66 |            hostD == h_deviceD ? "yes" : "no");
67 | 
68 |     CHECK(cudaFree(deviceF));
69 |     CHECK(cudaFree(deviceD));
70 | 
71 |     return 0;
72 | }
73 | 
74 | 


--------------------------------------------------------------------------------
/chapter07/fmad.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | /**
 6 |  * This example illustrates the effect on numerical accuracy of fusing a
 7 |  * multiply-add into a single MAD instruction.
 8 |  **/
 9 | 
10 | __global__ void fmad_kernel(double x, double y, double *out)
11 | {
12 |     // Only the thread with global index 0 stores a result; all others
13 |     // return without writing.
14 |     const int gid = threadIdx.x + blockDim.x * blockIdx.x;
15 |     if (gid != 0) return;
16 | 
17 |     *out = x * x + y;
18 | }
19 | 
/**
 * Host-side counterpart of fmad_kernel: returns x * x + y.
 **/
double host_fmad_kernel(double x, double y)
{
    // Same single multiply-add expression as the device kernel.
    const double result = x * x + y;
    return result;
}
24 | 
/*
 * Evaluate x * x + y once on the host and once on the device, then report
 * whether the two results are bit-for-bit equal and, if not, the absolute
 * difference.
 */
int main(int argc, char **argv)
{
    double *d_out, h_out;
    double x = 2.891903;
    double y = -3.980364;

    double host_value = host_fmad_kernel(x, y);
    CHECK(cudaMalloc((void **)&d_out, sizeof(double)));
    fmad_kernel<<<1, 32>>>(x, y, d_out);
    // A kernel launch returns no status itself; query it explicitly.
    CHECK(cudaGetLastError());
    CHECK(cudaMemcpy(&h_out, d_out, sizeof(double),
                     cudaMemcpyDeviceToHost));

    if (host_value == h_out)
    {
        printf("The device output the same value as the host.\n");
    }
    else
    {
        printf("The device output a different value than the host, diff=%e.\n",
               fabs(host_value - h_out));
    }

    // Release the device allocation made above.
    CHECK(cudaFree(d_out));

    return 0;
}
49 | 


--------------------------------------------------------------------------------
/chapter07/intrinsic-standard-comp.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | 
  5 | /**
  6 |  * This example demonstrates the relative performance and accuracy of CUDA
  7 |  * standard and intrinsic functions.
  8 |  *
  9 |  * The computational kernel of this example is the iterative calculation of a
 10 |  * value squared. This computation is done on the host, on the device with a
 11 |  * standard function, and on the device with an intrinsic function. The results
 12 |  * from all three are compared for numerical accuracy (with the host as the
 13 |  * baseline), and the performance of standard and intrinsic functions is also
 14 |  * compared.
 15 |  **/
 16 | 
 17 | /**
 18 |  * Perform iters power operations using the standard powf function.
 19 |  **/
 20 | __global__ void standard_kernel(float a, float *out, int iters)
 21 | {
 22 |     int i;
 23 |     int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
 24 | 
 25 |     if(tid == 0)
 26 |     {
 27 |         float tmp;
 28 | 
 29 |         for (i = 0; i < iters; i++)
 30 |         {
 31 |             tmp = powf(a, 2.0f);
 32 |         }
 33 | 
 34 |         *out = tmp;
 35 |     }
 36 | }
 37 | 
 38 | /**
 39 |  * Perform iters power operations using the intrinsic __powf function.
 40 |  **/
 41 | __global__ void intrinsic_kernel(float a, float *out, int iters)
 42 | {
 43 |     int i;
 44 |     int tid = (blockDim.x * blockIdx.x) + threadIdx.x;
 45 | 
 46 |     if(tid == 0)
 47 |     {
 48 |         float tmp;
 49 | 
 50 |         for (i = 0; i < iters; i++)
 51 |         {
 52 |             tmp = __powf(a, 2.0f);
 53 |         }
 54 | 
 55 |         *out = tmp;
 56 |     }
 57 | }
 58 | 
 59 | int main(int argc, char **argv)
 60 | {
 61 |     int i;
 62 |     int runs = 30;
 63 |     int iters = 1000;
 64 | 
 65 |     float *d_standard_out, h_standard_out;
 66 |     CHECK(cudaMalloc((void **)&d_standard_out, sizeof(float)));
 67 | 
 68 |     float *d_intrinsic_out, h_intrinsic_out;
 69 |     CHECK(cudaMalloc((void **)&d_intrinsic_out, sizeof(float)));
 70 | 
 71 |     float input_value = 8181.25;
 72 | 
 73 |     double mean_intrinsic_time = 0.0;
 74 |     double mean_standard_time = 0.0;
 75 | 
 76 |     for (i = 0; i < runs; i++)
 77 |     {
 78 |         double start_standard = seconds();
 79 |         standard_kernel<<<1, 32>>>(input_value, d_standard_out, iters);
 80 |         CHECK(cudaDeviceSynchronize());
 81 |         mean_standard_time += seconds() - start_standard;
 82 | 
 83 |         double start_intrinsic = seconds();
 84 |         intrinsic_kernel<<<1, 32>>>(input_value, d_intrinsic_out, iters);
 85 |         CHECK(cudaDeviceSynchronize());
 86 |         mean_intrinsic_time += seconds() - start_intrinsic;
 87 |     }
 88 | 
 89 |     CHECK(cudaMemcpy(&h_standard_out, d_standard_out, sizeof(float),
 90 |                      cudaMemcpyDeviceToHost));
 91 |     CHECK(cudaMemcpy(&h_intrinsic_out, d_intrinsic_out, sizeof(float),
 92 |                      cudaMemcpyDeviceToHost));
 93 |     float host_value = powf(input_value, 2.0f);
 94 | 
 95 |     printf("Host calculated\t\t\t%f\n", host_value);
 96 |     printf("Standard Device calculated\t%f\n", h_standard_out);
 97 |     printf("Intrinsic Device calculated\t%f\n", h_intrinsic_out);
 98 |     printf("Host equals Standard?\t\t%s diff=%e\n",
 99 |            host_value == h_standard_out ? "Yes" : "No",
100 |            fabs(host_value - h_standard_out));
101 |     printf("Host equals Intrinsic?\t\t%s diff=%e\n",
102 |            host_value == h_intrinsic_out ? "Yes" : "No",
103 |            fabs(host_value - h_intrinsic_out));
104 |     printf("Standard equals Intrinsic?\t%s diff=%e\n",
105 |            h_standard_out == h_intrinsic_out ? "Yes" : "No",
106 |            fabs(h_standard_out - h_intrinsic_out));
107 |     printf("\n");
108 |     printf("Mean execution time for standard function powf:    %f s\n",
109 |            mean_standard_time);
110 |     printf("Mean execution time for intrinsic function __powf: %f s\n",
111 |            mean_intrinsic_time);
112 | 
113 |     return 0;
114 | }
115 | 


--------------------------------------------------------------------------------
/chapter07/my-atomic-add.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | /**
 6 |  * This example illustrates implementation of custom atomic operations using
 7 |  * CUDA's built-in atomicCAS function to implement atomic signed 32-bit integer
 8 |  * addition.
 9 |  **/
10 | 
/**
 * Atomically add incr to *address using a compare-and-swap retry loop.
 * Returns the value that was stored at *address immediately before the
 * successful swap.
 **/
__device__ int myAtomicAdd(int *address, int incr)
{
    // First guess: whatever is currently visible at *address.
    int expected = *address;

    for (;;)
    {
        // atomicCAS writes expected + incr only if *address still holds
        // expected, and always returns the value it found there.
        const int observed = atomicCAS(address, expected, expected + incr);

        if (observed == expected)
        {
            return observed;
        }

        // Another update got in first; retry with the value just observed.
        expected = observed;
    }
}
26 | 
/**
 * Every launched thread increments *sharedInteger by one through
 * myAtomicAdd; the previous value returned by the helper is not needed.
 **/
__global__ void kernel(int *sharedInteger)
{
    const int increment = 1;

    myAtomicAdd(sharedInteger, increment);
}
31 | 
/*
 * Zero a device integer, let 4 blocks of 128 threads each increment it via
 * myAtomicAdd, and print the final value.
 */
int main(int argc, char **argv)
{
    int h_sharedInteger;
    int *d_sharedInteger;
    CHECK(cudaMalloc((void **)&d_sharedInteger, sizeof(int)));
    CHECK(cudaMemset(d_sharedInteger, 0x00, sizeof(int)));

    kernel<<<4, 128>>>(d_sharedInteger);
    // A kernel launch returns no status itself; query it explicitly.
    CHECK(cudaGetLastError());

    CHECK(cudaMemcpy(&h_sharedInteger, d_sharedInteger, sizeof(int),
                     cudaMemcpyDeviceToHost));
    printf("4 x 128 increments led to value of %d\n", h_sharedInteger);

    // Release the device allocation made above.
    CHECK(cudaFree(d_sharedInteger));

    return 0;
}
47 | 
48 | 


--------------------------------------------------------------------------------
/chapter08/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=cublas cuda-openacc cufft-multi cufft cusparse rand-kernel \
 2 |         replace-rand-streams replace-rand
 3 | C_APPS=simple-data simple-kernels simple-parallel
 4 | 
 5 | all: ${C_APPS} ${CU_APPS}
 6 | 
 7 | cublas: cublas.cu
 8 | 	nvcc -O2 -arch=sm_20 -lcublas -o cublas cublas.cu
 9 | cuda-openacc: cuda-openacc.cu
10 | 	nvcc -O2 -arch=sm_20 -lcublas -lcurand -o cuda-openacc cuda-openacc.cu
11 | cufft-multi: cufft-multi.cu
12 | 	nvcc -O2 -arch=sm_20 -lcufft -o cufft-multi cufft-multi.cu
13 | cufft: cufft.cu
14 | 	nvcc -O2 -arch=sm_20 -lcufft -o cufft cufft.cu
15 | cusparse: cusparse.cu
16 | 	nvcc -O2 -arch=sm_20 -lcusparse -o cusparse cusparse.cu
17 | rand-kernel: rand-kernel.cu
18 | 	nvcc -O2 -arch=sm_20 -lcurand -o rand-kernel rand-kernel.cu
19 | replace-rand-streams: replace-rand-streams.cu
20 | 	nvcc -O2 -arch=sm_20 -lcurand -o replace-rand-streams replace-rand-streams.cu
21 | replace-rand: replace-rand.cu
22 | 	nvcc -O2 -arch=sm_20 -lcurand -o replace-rand replace-rand.cu
23 | %: %.cu
24 | 	nvcc -O2 -arch=sm_20 -o $@ $<
25 | %: %.c
26 | 	gcc -O2 -std=c99 -o $@ $<
27 | clean:
28 | 	rm -f ${CU_APPS} ${C_APPS}
29 | 


--------------------------------------------------------------------------------
/chapter08/cublas.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cuda.h>
  5 | #include "cublas_v2.h"
  6 | 
  7 | /*
  8 |  * A simple example of performing matrix-vector multiplication using the cuBLAS
  9 |  * library and some randomly generated inputs.
 10 |  */
 11 | 
/*
 * Problem dimensions used by main() for the matrix A and the vectors X and Y:
 * M = # of rows
 * N = # of columns
 */
int M = 1024;
int N = 1024;
 18 | 
 19 | /*
 20 |  * Generate a vector of length N with random single-precision floating-point
 21 |  * values between 0 and 100.
 22 |  */
 23 | void generate_random_vector(int N, float **outX)
 24 | {
 25 |     int i;
 26 |     double rMax = (double)RAND_MAX;
 27 |     float *X = (float *)malloc(sizeof(float) * N);
 28 | 
 29 |     for (i = 0; i < N; i++)
 30 |     {
 31 |         int r = rand();
 32 |         double dr = (double)r;
 33 |         X[i] = (dr / rMax) * 100.0;
 34 |     }
 35 | 
 36 |     *outX = X;
 37 | }
 38 | 
 39 | /*
 40 |  * Generate a matrix with M rows and N columns in column-major order. The matrix
 41 |  * will be filled with random single-precision floating-point values between 0
 42 |  * and 100.
 43 |  */
 44 | void generate_random_dense_matrix(int M, int N, float **outA)
 45 | {
 46 |     int i, j;
 47 |     double rMax = (double)RAND_MAX;
 48 |     float *A = (float *)malloc(sizeof(float) * M * N);
 49 | 
 50 |     // For each column
 51 |     for (j = 0; j < N; j++)
 52 |     {
 53 |         // For each row
 54 |         for (i = 0; i < M; i++)
 55 |         {
 56 |             double dr = (double)rand();
 57 |             A[j * M + i] = (dr / rMax) * 100.0;
 58 |         }
 59 |     }
 60 | 
 61 |     *outA = A;
 62 | }
 63 | 
 64 | int main(int argc, char **argv)
 65 | {
 66 |     int i;
 67 |     float *A, *dA;
 68 |     float *X, *dX;
 69 |     float *Y, *dY;
 70 |     float beta;
 71 |     float alpha;
 72 |     cublasHandle_t handle = 0;
 73 | 
 74 |     alpha = 3.0f;
 75 |     beta = 4.0f;
 76 | 
 77 |     // Generate inputs
 78 |     srand(9384);
 79 |     generate_random_dense_matrix(M, N, &A);
 80 |     generate_random_vector(N, &X);
 81 |     generate_random_vector(M, &Y);
 82 | 
 83 |     // Create the cuBLAS handle
 84 |     CHECK_CUBLAS(cublasCreate(&handle));
 85 | 
 86 |     // Allocate device memory
 87 |     CHECK(cudaMalloc((void **)&dA, sizeof(float) * M * N));
 88 |     CHECK(cudaMalloc((void **)&dX, sizeof(float) * N));
 89 |     CHECK(cudaMalloc((void **)&dY, sizeof(float) * M));
 90 | 
 91 |     // Transfer inputs to the device
 92 |     CHECK_CUBLAS(cublasSetVector(N, sizeof(float), X, 1, dX, 1));
 93 |     CHECK_CUBLAS(cublasSetVector(M, sizeof(float), Y, 1, dY, 1));
 94 |     CHECK_CUBLAS(cublasSetMatrix(M, N, sizeof(float), A, M, dA, M));
 95 | 
 96 |     // Execute the matrix-vector multiplication
 97 |     CHECK_CUBLAS(cublasSgemv(handle, CUBLAS_OP_N, M, N, &alpha, dA, M, dX, 1,
 98 |                              &beta, dY, 1));
 99 | 
100 |     // Retrieve the output vector from the device
101 |     CHECK_CUBLAS(cublasGetVector(M, sizeof(float), dY, 1, Y, 1));
102 | 
103 |     for (i = 0; i < 10; i++)
104 |     {
105 |         printf("%2.2f\n", Y[i]);
106 |     }
107 | 
108 |     printf("...\n");
109 | 
110 |     free(A);
111 |     free(X);
112 |     free(Y);
113 | 
114 |     CHECK(cudaFree(dA));
115 |     CHECK(cudaFree(dY));
116 |     CHECK_CUBLAS(cublasDestroy(handle));
117 | 
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/chapter08/cuda-openacc.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <curand.h>
  5 | #include <cublas_v2.h>
  6 | 
  7 | /*
  8 |  * This example illustrates the use of OpenACC and CUDA libraries in the same
  9 |  * application. cuRAND is used to fill two input matrices with random values.
 10 |  * OpenACC is used to implement a matrix-multiply using the parallel and loop
 11 |  * directives. Finally, cuBLAS is used to first sum the values of every row, and
 12 |  * then sum those values together to calculate the sum of all values in the
 13 |  * output matrix.
 14 |  */
 15 | 
/*
 * Matrix dimensions: A is M x N, B is N x P and the product C is M x P
 * (see the allocation sizes and index expressions in main).
 */
#define M   1024
#define N   1024
#define P   1024
 19 | 
 20 | int main(int argc, char **argv)
 21 | {
 22 |     int i, j, k;
 23 |     float *__restrict__ d_A;
 24 |     float *__restrict__ d_B;
 25 |     float *__restrict__ d_C;
 26 |     float *d_row_sums;
 27 |     float total_sum;
 28 |     curandGenerator_t rand_state = 0;
 29 |     cublasHandle_t cublas_handle = 0;
 30 | 
 31 |     // Initialize the cuRAND and cuBLAS handles.
 32 |     CHECK_CURAND(curandCreateGenerator(&rand_state, CURAND_RNG_PSEUDO_DEFAULT));
 33 |     CHECK_CUBLAS(cublasCreate(&cublas_handle));
 34 | 
 35 |     // Allocate GPU memory for the input matrices, output matrix, and row sums.
 36 |     CHECK(cudaMalloc((void **)&d_A, sizeof(float) * M * N));
 37 |     CHECK(cudaMalloc((void **)&d_B, sizeof(float) * N * P));
 38 |     CHECK(cudaMalloc((void **)&d_C, sizeof(float) * M * P));
 39 |     CHECK(cudaMalloc((void **)&d_row_sums, sizeof(float) * M));
 40 | 
 41 |     // Generate random values in both input matrices.
 42 |     CHECK_CURAND(curandGenerateUniform(rand_state, d_A, M * N));
 43 |     CHECK_CURAND(curandGenerateUniform(rand_state, d_B, N * P));
 44 | 
 45 |     // Perform a matrix multiply parallelized across gangs and workers
 46 | #pragma acc parallel loop gang deviceptr(d_A, d_B, d_C)
 47 | 
 48 |     for (i = 0; i < M; i++)
 49 |     {
 50 | #pragma acc loop worker vector
 51 | 
 52 |         for (j = 0; j < P; j++)
 53 |         {
 54 |             float sum = 0.0f;
 55 | 
 56 |             for (k = 0; k < N; k++)
 57 |             {
 58 |                 sum += d_A[i * N + k] * d_B[k * P + j];
 59 |             }
 60 | 
 61 |             d_C[i * P + j] = sum;
 62 |         }
 63 |     }
 64 | 
 65 |     /*
 66 |      * Set cuBLAS to device pointer mode, indicating that all scalars are passed
 67 |      * as device pointers.
 68 |      */
 69 |     CHECK_CUBLAS(cublasSetPointerMode(cublas_handle,
 70 |                                       CUBLAS_POINTER_MODE_DEVICE));
 71 | 
 72 |     // Sum the values contained in each row.
 73 |     for (i = 0; i < M; i++)
 74 |     {
 75 |         CHECK_CUBLAS(cublasSasum(cublas_handle, P, d_C + (i * P), 1,
 76 |                                  d_row_sums + i));
 77 |     }
 78 | 
 79 |     /*
 80 |      * Set cuBLAS back to host pointer mode, indicating that all scalars are
 81 |      * passed as host pointers.
 82 |      */
 83 |     CHECK_CUBLAS(cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST));
 84 |     /*
 85 |      * Do the final sum of the sum of all rows to produce a total for the whole
 86 |      * output matrix.
 87 |      */
 88 |     CHECK_CUBLAS(cublasSasum(cublas_handle, M, d_row_sums, 1, &total_sum));
 89 |     CHECK(cudaDeviceSynchronize());
 90 | 
 91 |     // Release device memory
 92 |     CHECK(cudaFree(d_A));
 93 |     CHECK(cudaFree(d_B));
 94 |     CHECK(cudaFree(d_C));
 95 |     CHECK(cudaFree(d_row_sums));
 96 | 
 97 |     printf("Total sum = %f\n", total_sum);
 98 | 
 99 |     return 0;
100 | }
101 | 


--------------------------------------------------------------------------------
/chapter08/cufft-multi.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cuda.h>
  5 | #include <cufftXt.h>
  6 | 
  7 | /*
  8 |  * An example usage of the Multi-GPU cuFFT XT library introduced in CUDA 6. This
  9 |  * example performs a 1D forward FFT across all devices detected in the system.
 10 |  */
 11 | 
 12 | /*
 13 |  * Create N fake samplings along the function cos(x). These samplings will be
 14 |  * stored as single-precision floating-point values.
 15 |  */
 16 | void generate_fake_samples(int N, float **out)
 17 | {
 18 |     int i;
 19 |     float *result = (float *)malloc(sizeof(float) * N);
 20 |     double delta = M_PI / 4.0;
 21 | 
 22 |     for (i = 0; i < N; i++)
 23 |     {
 24 |         result[i] = cos(i * delta);
 25 |     }
 26 | 
 27 |     *out = result;
 28 | }
 29 | 
 30 | /*
 31 |  * Convert a real-valued vector r of length Nto a complex-valued vector.
 32 |  */
 33 | void real_to_complex(float *r, cufftComplex **complx, int N)
 34 | {
 35 |     int i;
 36 |     (*complx) = (cufftComplex *)malloc(sizeof(cufftComplex) * N);
 37 | 
 38 |     for (i = 0; i < N; i++)
 39 |     {
 40 |         (*complx)[i].x = r[i];
 41 |         (*complx)[i].y = 0;
 42 |     }
 43 | }
 44 | 
 45 | /*
 46 |  * Retrieve device IDs for all CUDA devices in the current system.
 47 |  */
 48 | int getAllGpus(int **gpus)
 49 | {
 50 |     int i;
 51 |     int nGpus;
 52 | 
 53 |     CHECK(cudaGetDeviceCount(&nGpus));
 54 | 
 55 |     *gpus = (int *)malloc(sizeof(int) * nGpus);
 56 | 
 57 |     for (i = 0; i < nGpus; i++)
 58 |     {
 59 |         (*gpus)[i] = i;
 60 |     }
 61 | 
 62 |     return nGpus;
 63 | }
 64 | 
 65 | int main(int argc, char **argv)
 66 | {
 67 |     int i;
 68 |     int N = 1024;
 69 |     float *samples;
 70 |     cufftComplex *complexSamples;
 71 |     int *gpus;
 72 |     size_t *workSize;
 73 |     cufftHandle plan = 0;
 74 |     cudaLibXtDesc *dComplexSamples;
 75 | 
 76 |     int nGPUs = getAllGpus(&gpus);
 77 |     nGPUs = nGPUs > 2 ? 2 : nGPUs;
 78 |     workSize = (size_t *)malloc(sizeof(size_t) * nGPUs);
 79 | 
 80 |     // Setup the cuFFT Multi-GPU plan
 81 |     CHECK_CUFFT(cufftCreate(&plan));
 82 |     // CHECK_CUFFT(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
 83 |     CHECK_CUFFT(cufftXtSetGPUs(plan, 2, gpus));
 84 |     CHECK_CUFFT(cufftMakePlan1d(plan, N, CUFFT_C2C, 1, workSize));
 85 | 
 86 |     // Generate inputs
 87 |     generate_fake_samples(N, &samples);
 88 |     real_to_complex(samples, &complexSamples, N);
 89 |     cufftComplex *complexFreq = (cufftComplex *)malloc(
 90 |                                     sizeof(cufftComplex) * N);
 91 | 
 92 |     // Allocate memory across multiple GPUs and transfer the inputs into it
 93 |     CHECK_CUFFT(cufftXtMalloc(plan, &dComplexSamples, CUFFT_XT_FORMAT_INPLACE));
 94 |     CHECK_CUFFT(cufftXtMemcpy(plan, dComplexSamples, complexSamples,
 95 |                               CUFFT_COPY_HOST_TO_DEVICE));
 96 | 
 97 |     // Execute a complex-to-complex 1D FFT across multiple GPUs
 98 |     CHECK_CUFFT(cufftXtExecDescriptorC2C(plan, dComplexSamples, dComplexSamples,
 99 |                                          CUFFT_FORWARD));
100 | 
101 |     // Retrieve the results from multiple GPUs into host memory
102 |     CHECK_CUFFT(cufftXtMemcpy(plan, complexSamples, dComplexSamples,
103 |                               CUFFT_COPY_DEVICE_TO_HOST));
104 | 
105 |     printf("Fourier Coefficients:\n");
106 | 
107 |     for (i = 0; i < 30; i++)
108 |     {
109 |         printf("  %d: (%2.4f, %2.4f)\n", i + 1, complexFreq[i].x,
110 |                complexFreq[i].y);
111 |     }
112 | 
113 |     free(gpus);
114 |     free(samples);
115 |     free(complexSamples);
116 |     free(complexFreq);
117 |     free(workSize);
118 | 
119 |     CHECK_CUFFT(cufftXtFree(dComplexSamples));
120 |     CHECK_CUFFT(cufftDestroy(plan));
121 | 
122 |     return 0;
123 | }
124 | 


--------------------------------------------------------------------------------
/chapter08/cufft.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cuda.h>
  5 | #include <cufft.h>
  6 | 
  7 | /*
  8 |  * An example usage of the cuFFT library. This example performs a 1D forward
  9 |  * FFT.
 10 |  */
 11 | 
// Number of leading samples and Fourier coefficients that main() prints.
int nprints = 30;
 13 | 
 14 | /*
 15 |  * Create N fake samplings along the function cos(x). These samplings will be
 16 |  * stored as single-precision floating-point values.
 17 |  */
 18 | void generate_fake_samples(int N, float **out)
 19 | {
 20 |     int i;
 21 |     float *result = (float *)malloc(sizeof(float) * N);
 22 |     double delta = M_PI / 20.0;
 23 | 
 24 |     for (i = 0; i < N; i++)
 25 |     {
 26 |         result[i] = cos(i * delta);
 27 |     }
 28 | 
 29 |     *out = result;
 30 | }
 31 | 
 32 | /*
 33 |  * Convert a real-valued vector r of length Nto a complex-valued vector.
 34 |  */
 35 | void real_to_complex(float *r, cufftComplex **complx, int N)
 36 | {
 37 |     int i;
 38 |     (*complx) = (cufftComplex *)malloc(sizeof(cufftComplex) * N);
 39 | 
 40 |     for (i = 0; i < N; i++)
 41 |     {
 42 |         (*complx)[i].x = r[i];
 43 |         (*complx)[i].y = 0;
 44 |     }
 45 | }
 46 | 
 47 | int main(int argc, char **argv)
 48 | {
 49 |     int i;
 50 |     int N = 2048;
 51 |     float *samples;
 52 |     cufftHandle plan = 0;
 53 |     cufftComplex *dComplexSamples, *complexSamples, *complexFreq;
 54 | 
 55 |     // Input Generation
 56 |     generate_fake_samples(N, &samples);
 57 |     real_to_complex(samples, &complexSamples, N);
 58 |     complexFreq = (cufftComplex *)malloc(
 59 |                       sizeof(cufftComplex) * N);
 60 |     printf("Initial Samples:\n");
 61 | 
 62 |     for (i = 0; i < nprints; i++)
 63 |     {
 64 |         printf("  %2.4f\n", samples[i]);
 65 |     }
 66 | 
 67 |     printf("  ...\n");
 68 | 
 69 |     // Setup the cuFFT plan
 70 |     CHECK_CUFFT(cufftPlan1d(&plan, N, CUFFT_C2C, 1));
 71 | 
 72 |     // Allocate device memory
 73 |     CHECK(cudaMalloc((void **)&dComplexSamples,
 74 |             sizeof(cufftComplex) * N));
 75 | 
 76 |     // Transfer inputs into device memory
 77 |     CHECK(cudaMemcpy(dComplexSamples, complexSamples,
 78 |             sizeof(cufftComplex) * N, cudaMemcpyHostToDevice));
 79 | 
 80 |     // Execute a complex-to-complex 1D FFT
 81 |     CHECK_CUFFT(cufftExecC2C(plan, dComplexSamples, dComplexSamples,
 82 |                              CUFFT_FORWARD));
 83 | 
 84 |     // Retrieve the results into host memory
 85 |     CHECK(cudaMemcpy(complexFreq, dComplexSamples,
 86 |             sizeof(cufftComplex) * N, cudaMemcpyDeviceToHost));
 87 | 
 88 |     printf("Fourier Coefficients:\n");
 89 | 
 90 |     for (i = 0; i < nprints; i++)
 91 |     {
 92 |         printf("  %d: (%2.4f, %2.4f)\n", i + 1, complexFreq[i].x,
 93 |                complexFreq[i].y);
 94 |     }
 95 | 
 96 |     printf("  ...\n");
 97 | 
 98 |     free(samples);
 99 |     free(complexSamples);
100 |     free(complexFreq);
101 | 
102 |     CHECK(cudaFree(dComplexSamples));
103 |     CHECK_CUFFT(cufftDestroy(plan));
104 | 
105 |     return 0;
106 | }
107 | 


--------------------------------------------------------------------------------
/chapter08/cusparse.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cusparse_v2.h>
  5 | #include <cuda.h>
  6 | 
  7 | /*
  8 |  * This is an example demonstrating usage of the cuSPARSE library to perform a
  9 |  * sparse matrix-vector multiplication on randomly generated data.
 10 |  */
 11 | 
/*
 * Problem dimensions used by main() for the matrix A and the vectors X and Y:
 * M = # of rows
 * N = # of columns
 */
int M = 1024;
int N = 1024;
 18 | 
 19 | /*
 20 |  * Generate a vector of length N with random single-precision floating-point
 21 |  * values between 0 and 100.
 22 |  */
 23 | void generate_random_vector(int N, float **outX)
 24 | {
 25 |     int i;
 26 |     double rMax = (double)RAND_MAX;
 27 |     float *X = (float *)malloc(sizeof(float) * N);
 28 | 
 29 |     for (i = 0; i < N; i++)
 30 |     {
 31 |         int r = rand();
 32 |         double dr = (double)r;
 33 |         X[i] = (dr / rMax) * 100.0;
 34 |     }
 35 | 
 36 |     *outX = X;
 37 | }
 38 | 
 39 | /*
 40 |  * Generate random dense matrix A in column-major order, while rounding some
 41 |  * elements down to zero to ensure it is sparse.
 42 |  */
 43 | int generate_random_dense_matrix(int M, int N, float **outA)
 44 | {
 45 |     int i, j;
 46 |     double rMax = (double)RAND_MAX;
 47 |     float *A = (float *)malloc(sizeof(float) * M * N);
 48 |     int totalNnz = 0;
 49 | 
 50 |     for (j = 0; j < N; j++)
 51 |     {
 52 |         for (i = 0; i < M; i++)
 53 |         {
 54 |             int r = rand();
 55 |             float *curr = A + (j * M + i);
 56 | 
 57 |             if (r % 3 > 0)
 58 |             {
 59 |                 *curr = 0.0f;
 60 |             }
 61 |             else
 62 |             {
 63 |                 double dr = (double)r;
 64 |                 *curr = (dr / rMax) * 100.0;
 65 |             }
 66 | 
 67 |             if (*curr != 0.0f)
 68 |             {
 69 |                 totalNnz++;
 70 |             }
 71 |         }
 72 |     }
 73 | 
 74 |     *outA = A;
 75 |     return totalNnz;
 76 | }
 77 | 
 78 | int main(int argc, char **argv)
 79 | {
 80 |     int row;
 81 |     float *A, *dA;
 82 |     int *dNnzPerRow;
 83 |     float *dCsrValA;
 84 |     int *dCsrRowPtrA;
 85 |     int *dCsrColIndA;
 86 |     int totalNnz;
 87 |     float alpha = 3.0f;
 88 |     float beta = 4.0f;
 89 |     float *dX, *X;
 90 |     float *dY, *Y;
 91 |     cusparseHandle_t handle = 0;
 92 |     cusparseMatDescr_t descr = 0;
 93 | 
 94 |     // Generate input
 95 |     srand(9384);
 96 |     int trueNnz = generate_random_dense_matrix(M, N, &A);
 97 |     generate_random_vector(N, &X);
 98 |     generate_random_vector(M, &Y);
 99 | 
100 |     // Create the cuSPARSE handle
101 |     CHECK_CUSPARSE(cusparseCreate(&handle));
102 | 
103 |     // Allocate device memory for vectors and the dense form of the matrix A
104 |     CHECK(cudaMalloc((void **)&dX, sizeof(float) * N));
105 |     CHECK(cudaMalloc((void **)&dY, sizeof(float) * M));
106 |     CHECK(cudaMalloc((void **)&dA, sizeof(float) * M * N));
107 |     CHECK(cudaMalloc((void **)&dNnzPerRow, sizeof(int) * M));
108 | 
109 |     // Construct a descriptor of the matrix A
110 |     CHECK_CUSPARSE(cusparseCreateMatDescr(&descr));
111 |     CHECK_CUSPARSE(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
112 |     CHECK_CUSPARSE(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
113 | 
114 |     // Transfer the input vectors and dense matrix A to the device
115 |     CHECK(cudaMemcpy(dX, X, sizeof(float) * N, cudaMemcpyHostToDevice));
116 |     CHECK(cudaMemcpy(dY, Y, sizeof(float) * M, cudaMemcpyHostToDevice));
117 |     CHECK(cudaMemcpy(dA, A, sizeof(float) * M * N, cudaMemcpyHostToDevice));
118 | 
119 |     // Compute the number of non-zero elements in A
120 |     CHECK_CUSPARSE(cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, M, N, descr, dA,
121 |                                 M, dNnzPerRow, &totalNnz));
122 | 
123 |     if (totalNnz != trueNnz)
124 |     {
125 |         fprintf(stderr, "Difference detected between cuSPARSE NNZ and true "
126 |                 "value: expected %d but got %d\n", trueNnz, totalNnz);
127 |         return 1;
128 |     }
129 | 
130 |     // Allocate device memory to store the sparse CSR representation of A
131 |     CHECK(cudaMalloc((void **)&dCsrValA, sizeof(float) * totalNnz));
132 |     CHECK(cudaMalloc((void **)&dCsrRowPtrA, sizeof(int) * (M + 1)));
133 |     CHECK(cudaMalloc((void **)&dCsrColIndA, sizeof(int) * totalNnz));
134 | 
135 |     // Convert A from a dense formatting to a CSR formatting, using the GPU
136 |     CHECK_CUSPARSE(cusparseSdense2csr(handle, M, N, descr, dA, M, dNnzPerRow,
137 |                                       dCsrValA, dCsrRowPtrA, dCsrColIndA));
138 | 
139 |     // Perform matrix-vector multiplication with the CSR-formatted matrix A
140 |     CHECK_CUSPARSE(cusparseScsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
141 |                                   M, N, totalNnz, &alpha, descr, dCsrValA,
142 |                                   dCsrRowPtrA, dCsrColIndA, dX, &beta, dY));
143 | 
144 |     // Copy the result vector back to the host
145 |     CHECK(cudaMemcpy(Y, dY, sizeof(float) * M, cudaMemcpyDeviceToHost));
146 | 
147 |     for (row = 0; row < 10; row++)
148 |     {
149 |         printf("%2.2f\n", Y[row]);
150 |     }
151 | 
152 |     printf("...\n");
153 | 
154 |     free(A);
155 |     free(X);
156 |     free(Y);
157 | 
158 |     CHECK(cudaFree(dX));
159 |     CHECK(cudaFree(dY));
160 |     CHECK(cudaFree(dA));
161 |     CHECK(cudaFree(dNnzPerRow));
162 |     CHECK(cudaFree(dCsrValA));
163 |     CHECK(cudaFree(dCsrRowPtrA));
164 |     CHECK(cudaFree(dCsrColIndA));
165 | 
166 |     CHECK_CUSPARSE(cusparseDestroyMatDescr(descr));
167 |     CHECK_CUSPARSE(cusparseDestroy(handle));
168 | 
169 | 
170 |     return 0;
171 | }
172 | 


--------------------------------------------------------------------------------
/chapter08/drop-in.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | extern int sgemm_(char *transa, char *transb, int *m, int *
 5 |                   n, int *k, float *alpha, float *a, int *lda, float *b, int *
 6 |                   ldb, float *beta, float *c, int *ldc);
 7 | 
 8 | /*
 9 |  * A simple example of re-compiling legacy BLAS code to use the drop-in cuBLAS
10 |  * library.
11 |  */
12 | 
/*
 * Problem dimensions used by main(). They are plain (non-const) ints because
 * main passes their addresses to sgemm_.
 * M = # of rows
 * N = # of columns
 */
int M = 1024;
int N = 1024;
19 | 
/*
 * Generate a matrix with M rows and N columns in column-major order. The matrix
 * will be filled with random single-precision floating-point values between 0
 * and 100.
 *
 * The matrix is allocated with malloc and returned through *outA; the caller
 * owns it and releases it with free(). If the allocation fails for a
 * non-empty matrix, an error is printed and the program exits.
 */
void generate_random_dense_matrix(int M, int N, float **outA)
{
    int i, j;
    double rMax = (double)RAND_MAX;
    float *A = (float *)malloc(sizeof(float) * M * N);

    // malloc may legitimately return NULL for a zero-sized request, so only
    // treat NULL as a failure when elements were actually requested.
    if (A == NULL && M > 0 && N > 0)
    {
        fprintf(stderr, "generate_random_dense_matrix: failed to allocate a "
                "%d x %d matrix\n", M, N);
        exit(EXIT_FAILURE);
    }

    // For each column
    for (j = 0; j < N; j++)
    {
        // For each row
        for (i = 0; i < M; i++)
        {
            double dr = (double)rand();
            // Column-major: element (i, j) lives at j * M + i. The index is
            // computed in size_t so it cannot overflow int for large sizes.
            A[(size_t)j * M + i] = (dr / rMax) * 100.0;
        }
    }

    *outA = A;
}
44 | 
45 | int main(int argc, char **argv)
46 | {
47 |     int i, j;
48 |     float *A, *B, *C;
49 |     float alpha = 3.0f;
50 |     float beta = 4.0f;
51 | 
52 |     // Generate inputs
53 |     srand(9384);
54 |     generate_random_dense_matrix(M, N, &A);
55 |     generate_random_dense_matrix(N, M, &B);
56 |     generate_random_dense_matrix(M, N, &C);
57 | 
58 |     sgemm_("N", "N", &M, &M, &N, &alpha, A, &M, B, &N, &beta, C, &M);
59 | 
60 |     for (i = 0; i < 10; i++)
61 |     {
62 |         for (j = 0; j < 10; j++)
63 |         {
64 |             printf("%2.2f ", C[j * M + i]);
65 |         }
66 | 
67 |         printf("...\n");
68 |     }
69 | 
70 |     printf("...\n");
71 | 
72 |     free(A);
73 |     free(B);
74 |     free(C);
75 | 
76 |     return 0;
77 | }
78 | 


--------------------------------------------------------------------------------
/chapter08/rand-kernel.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cuda.h>
  5 | #include <curand_kernel.h>
  6 | 
  7 | /*
  8 |  * This example demonstrates two techniques for using the cuRAND host and device
  9 |  * API to generate random numbers for CUDA kernels to consume.
 10 |  */
 11 | 
// Launch configuration used for the kernel launches in this file
int threads_per_block = 256;
int blocks_per_grid = 30;
 14 | 
 15 | /*
 16 |  * host_api_kernel consumes pre-generated random values from the cuRAND host API
 17 |  * to perform some dummy computation.
 18 |  */
 19 | __global__ void host_api_kernel(float *randomValues, float *out, int N)
 20 | {
 21 |     int i;
 22 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 23 |     int nthreads = gridDim.x * blockDim.x;
 24 | 
 25 |     for (i = tid; i < N; i += nthreads)
 26 |     {
 27 |         float rand = randomValues[i];
 28 |         rand = rand * 2;
 29 |         out[i] = rand;
 30 |     }
 31 | }
 32 | 
 33 | /*
 34 |  * device_api_kernel uses the cuRAND device API to generate random numbers
 35 |  * on-the-fly on the GPU, and then performs some dummy computation using them.
 36 |  */
 37 | __global__ void device_api_kernel(curandState *states, float *out, int N)
 38 | {
 39 |     int i;
 40 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 41 |     int nthreads = gridDim.x * blockDim.x;
 42 |     curandState *state = states + tid;
 43 | 
 44 |     curand_init(9384, tid, 0, state);
 45 | 
 46 |     for (i = tid; i < N; i += nthreads)
 47 |     {
 48 |         float rand = curand_uniform(state);
 49 |         rand = rand * 2;
 50 |         out[i] = rand;
 51 |     }
 52 | }
 53 | 
 54 | /*
  55 |  * use_host_api is an example usage of the cuRAND host API to generate random
 56 |  * values to be consumed on the device.
 57 |  */
 58 | void use_host_api(int N)
 59 | {
 60 |     int i;
 61 |     curandGenerator_t randGen;
 62 |     float *dRand, *dOut, *hOut;
 63 | 
 64 |     // Create cuRAND generator (i.e. handle)
 65 |     CHECK_CURAND(curandCreateGenerator(&randGen, CURAND_RNG_PSEUDO_DEFAULT));
 66 | 
 67 |     // Allocate device memory to store the random values and output
 68 |     CHECK(cudaMalloc((void **)&dRand, sizeof(float) * N));
 69 |     CHECK(cudaMalloc((void **)&dOut, sizeof(float) * N));
 70 |     hOut = (float *)malloc(sizeof(float) * N);
 71 | 
 72 |     // Generate N random values from a uniform distribution
 73 |     CHECK_CURAND(curandGenerateUniform(randGen, dRand, N));
 74 | 
 75 |     // Consume the values generated by curandGenerateUniform
 76 |     host_api_kernel<<<blocks_per_grid, threads_per_block>>>(dRand, dOut, N);
 77 | 
 78 |     // Retrieve outputs
 79 |     CHECK(cudaMemcpy(hOut, dOut, sizeof(float) * N, cudaMemcpyDeviceToHost));
 80 | 
 81 |     printf("Sampling of output from host API:\n");
 82 | 
 83 |     for (i = 0; i < 10; i++)
 84 |     {
 85 |         printf("%2.4f\n", hOut[i]);
 86 |     }
 87 | 
 88 |     printf("...\n");
 89 | 
 90 |     free(hOut);
 91 |     CHECK(cudaFree(dRand));
 92 |     CHECK(cudaFree(dOut));
 93 |     CHECK_CURAND(curandDestroyGenerator(randGen));
 94 | }
 95 | 
 96 | /*
  97 |  * use_device_api is an example usage of the cuRAND device API to use the GPU
 98 |  * to generate random values on the fly from inside a CUDA kernel.
 99 |  */
void use_device_api(int N)
{
    int i;
    /*
     * The state array is allocated and freed within this call, so it does not
     * need static storage.
     */
    curandState *states = NULL;
    float *dOut, *hOut;
    const size_t nBytes = sizeof(float) * N;
    // Never print more samples than were actually generated
    const int nSamples = (N < 10) ? N : 10;

    /*
     * Allocate device memory to store the output and cuRAND device state
     * objects (which are analogous to handles, but on the GPU). One state is
     * allocated per launched thread.
     */
    CHECK(cudaMalloc((void **)&dOut, nBytes));
    CHECK(cudaMalloc((void **)&states, sizeof(curandState) *
                threads_per_block * blocks_per_grid));
    hOut = (float *)malloc(nBytes);

    if (hOut == NULL)
    {
        fprintf(stderr, "Failed to allocate %zu bytes of host memory\n",
                nBytes);
        exit(EXIT_FAILURE);
    }

    // Execute a kernel that generates and consumes its own random numbers
    device_api_kernel<<<blocks_per_grid, threads_per_block>>>(states, dOut, N);
    // A launch returns no status itself; a bad configuration shows up here
    CHECK(cudaGetLastError());

    // Retrieve the results
    CHECK(cudaMemcpy(hOut, dOut, nBytes, cudaMemcpyDeviceToHost));

    printf("Sampling of output from device API:\n");

    for (i = 0; i < nSamples; i++)
    {
        printf("%2.4f\n", hOut[i]);
    }

    printf("...\n");

    free(hOut);
    CHECK(cudaFree(dOut));
    CHECK(cudaFree(states));
}
134 | 
int main(int argc, char **argv)
{
    // Number of random values each example generates: 2^23 = 8388608
    const int N = 1 << 23;

    // Run the host-API example first, then the device-API example
    use_host_api(N);
    use_device_api(N);

    return 0;
}
144 | 


--------------------------------------------------------------------------------
/chapter08/replace-rand-streams.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cuda.h>
  5 | #include <curand_kernel.h>
  6 | 
  7 | /*
  8 |  * This example is a clone of replace-rand.cu that uses CUDA streams to overlap
   9 |  * the generation of random numbers using cuRAND with any host computation.
 10 |  */
 11 | 
 12 | /*
 13 |  * initialize_state initializes cuRAND device state
 14 |  */
 15 | __global__ void initialize_state(curandState *states)
 16 | {
 17 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 18 |     curand_init(9384, tid, 0, states + tid);
 19 | }
 20 | 
 21 | /*
 22 |  * refill_randoms uses the cuRAND device API to generate N random values using
 23 |  * the states passed to the kernel.
 24 |  */
 25 | __global__ void refill_randoms(float *dRand, int N, curandState *states)
 26 | {
 27 |     int i;
 28 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 29 |     int nthreads = gridDim.x * blockDim.x;
 30 |     curandState *state = states + tid;
 31 | 
 32 |     for (i = tid; i < N; i += nthreads)
 33 |     {
 34 |         dRand[i] = curand_uniform(state);
 35 |     }
 36 | }
 37 | 
 38 | /*
 39 |  * An implementation of rand() that uses the cuRAND device API.
 40 |  */
 41 | float cuda_device_rand()
 42 | {
 43 |     static cudaStream_t stream = 0;
 44 |     static curandState *states = NULL;
 45 |     static float *dRand = NULL;
 46 |     static float *hRand = NULL;
 47 |     static int dRand_length = 1000000;
 48 |     static int dRand_used = dRand_length;
 49 | 
 50 |     int threads_per_block = 256;
 51 |     int blocks_per_grid = 30;
 52 | 
 53 |     if (dRand == NULL)
 54 |     {
 55 |         /*
 56 |          * If the cuRAND state hasn't been initialized yet, create a CUDA stream
 57 |          * to execute operations in, pre-allocate device memory to store the
 58 |          * generated random values in, and asynchronously launch a
 59 |          * refill_randoms kernel to begin generating random numbers.
 60 |          */
 61 |         CHECK(cudaStreamCreate(&stream));
 62 |         CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length));
 63 |         CHECK(cudaMalloc((void **)&states, sizeof(curandState) *
 64 |                         threads_per_block * blocks_per_grid));
 65 |         hRand = (float *)malloc(sizeof(float) * dRand_length);
 66 |         initialize_state<<<blocks_per_grid, threads_per_block, 0, stream>>>(
 67 |             states);
 68 |         refill_randoms<<<blocks_per_grid, threads_per_block>>>(dRand,
 69 |                 dRand_length, states);
 70 |     }
 71 | 
 72 |     if (dRand_used == dRand_length)
 73 |     {
 74 |         /*
 75 |          * If all pre-generated random numbers have been consumed, wait for the
 76 |          * last launch of refill_randoms to complete, transfer those newly
 77 |          * generated random numbers back, and launch another batch random number
 78 |          * generation kernel asynchronously.
 79 |          */
 80 |         CHECK(cudaStreamSynchronize(stream));
 81 |         CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length,
 82 |                     cudaMemcpyDeviceToHost));
 83 |         refill_randoms<<<blocks_per_grid, threads_per_block, 0, stream>>>(dRand,
 84 |                 dRand_length, states);
 85 |         dRand_used = 0;
 86 |     }
 87 | 
 88 |     // Return the next pre-generated random number
 89 |     return hRand[dRand_used++];
 90 | }
 91 | 
 92 | /*
 93 |  * An implementation of rand() that uses the cuRAND host API.
 94 |  */
 95 | float cuda_host_rand()
 96 | {
 97 |     static cudaStream_t stream = 0;
 98 |     static float *dRand = NULL;
 99 |     static float *hRand = NULL;
100 |     curandGenerator_t randGen;
101 |     static int dRand_length = 1000000;
102 |     static int dRand_used = 1000000;
103 | 
104 |     if (dRand == NULL)
105 |     {
106 |         /*
107 |          * If the cuRAND state hasn't been initialized yet, construct a cuRAND
108 |          * generator and configure it to use a CUDA stream. Pre-allocate device
109 |          * memory to store the output random numbers and asynchronously launch
110 |          * curandGenerateUniform. Because curandGenerateUniform uses the randGen
111 |          * handle, it will execute in the set stream.
112 |          */
113 |         CHECK_CURAND(curandCreateGenerator(&randGen,
114 |                                            CURAND_RNG_PSEUDO_DEFAULT));
115 |         CHECK(cudaStreamCreate(&stream));
116 |         CHECK_CURAND(curandSetStream(randGen, stream));
117 | 
118 |         CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length));
119 |         hRand = (float *)malloc(sizeof(float) * dRand_length);
120 |         CHECK_CURAND(curandGenerateUniform(randGen, dRand, dRand_length));
121 |     }
122 | 
123 |     if (dRand_used == dRand_length)
124 |     {
125 |         /*
126 |          * If all pre-generated random numbers have been consumed, wait for the
127 |          * last asynchronous curandGenerateUniform to complex, transfer the new
128 |          * batch of random numbers back to the host, and relaunch
129 |          * curandGenerateUniform.
130 |          */
131 |         CHECK(cudaStreamSynchronize(stream));
132 |         CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length,
133 |                         cudaMemcpyDeviceToHost));
134 |         CHECK_CURAND(curandGenerateUniform(randGen, dRand, dRand_length));
135 |         dRand_used = 0;
136 |     }
137 | 
138 |     // Return the next pre-generated random number
139 |     return hRand[dRand_used++];
140 | }
141 | 
/*
 * A reference implementation that uses system rand(), scaled by RAND_MAX.
 */
float host_rand()
{
    const float sample = (float)rand();
    const float upper = (float)RAND_MAX;

    return sample / upper;
}
146 | 
/*
 * Draw N values from each of the three generators, printing one triple per
 * iteration.
 */
int main(int argc, char **argv)
{
    const int N = 8388608;
    int drawn = 0;

    while (drawn < N)
    {
        const float h = host_rand();
        const float d = cuda_host_rand();
        const float dd = cuda_device_rand();
        printf("%2.4f %2.4f %2.4f\n", h, d, dd);
        // Block until a character arrives on stdin before the next draw
        getchar();
        drawn++;
    }

    return 0;
}
163 | 


--------------------------------------------------------------------------------
/chapter08/replace-rand.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <cuda.h>
  5 | #include <curand_kernel.h>
  6 | 
  7 | /*
  8 |  * This example uses the cuRAND host and device API to replace the system rand()
  9 |  * call by pre-generating large chunks of random numbers before fetching one at
  10 |  * a time. If there are no unused random numbers left, a new batch is generated
 11 |  * synchronously.
 12 |  */
 13 | 
 14 | /*
 15 |  * initialize_state initializes cuRAND device state
 16 |  */
 17 | __global__ void initialize_state(curandState *states)
 18 | {
 19 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 20 |     curand_init(9384, tid, 0, states + tid);
 21 | }
 22 | 
 23 | /*
 24 |  * refill_randoms uses the cuRAND device API to generate N random values using
 25 |  * the states passed to the kernel.
 26 |  */
 27 | __global__ void refill_randoms(float *dRand, int N, curandState *states)
 28 | {
 29 |     int i;
 30 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 31 |     int nthreads = gridDim.x * blockDim.x;
 32 |     curandState *state = states + tid;
 33 | 
 34 |     for (i = tid; i < N; i += nthreads)
 35 |     {
 36 |         dRand[i] = curand_uniform(state);
 37 |     }
 38 | }
 39 | 
 40 | /*
 41 |  * An implementation of rand() that uses the cuRAND device API.
 42 |  */
 43 | float cuda_device_rand()
 44 | {
 45 |     static curandState *states = NULL;
 46 |     static float *dRand = NULL;
 47 |     static float *hRand = NULL;
 48 |     static int dRand_length = 1000000;
 49 |     static int dRand_used = 1000000;
 50 | 
 51 |     int threads_per_block = 256;
 52 |     int blocks_per_grid = 30;
 53 | 
 54 |     if (dRand == NULL)
 55 |     {
 56 |         /*
 57 |          * If the cuRAND state hasn't been initialized yet, pre-allocate memory
 58 |          * to store the generated random values in as well as the cuRAND device
 59 |          * state objects.
 60 |          */
 61 |         CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length));
 62 |         CHECK(cudaMalloc((void **)&states, sizeof(curandState) *
 63 |                         threads_per_block * blocks_per_grid));
 64 |         hRand = (float *)malloc(sizeof(float) * dRand_length);
 65 |         // Initialize states on the device
 66 |         initialize_state<<<blocks_per_grid, threads_per_block>>>(states);
 67 |     }
 68 | 
 69 |     if (dRand_used == dRand_length)
 70 |     {
 71 |         /*
 72 |          * If all pre-generated random numbers have been consumed, regenerate a
 73 |          * new batch.
 74 |          */
 75 |         refill_randoms<<<blocks_per_grid, threads_per_block>>>(dRand,
 76 |                 dRand_length, states);
 77 |         CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length,
 78 |                         cudaMemcpyDeviceToHost));
 79 |         dRand_used = 0;
 80 |     }
 81 | 
 82 |     // Return the next pre-generated random number
 83 |     return hRand[dRand_used++];
 84 | }
 85 | 
 86 | /*
 87 |  * An implementation of rand() that uses the cuRAND host API.
 88 |  */
 89 | float cuda_host_rand()
 90 | {
 91 |     static float *dRand = NULL;
 92 |     static float *hRand = NULL;
 93 |     curandGenerator_t randGen;
 94 |     static int dRand_length = 1000000;
 95 |     static int dRand_used = 1000000;
 96 | 
 97 |     if (dRand == NULL)
 98 |     {
 99 |         /*
100 |          * If the cuRAND state hasn't been initialized yet, construct a cuRAND
101 |          * host generator and pre-allocate memory to store the generated random
102 |          * values in.
103 |          */
104 |         CHECK_CURAND(curandCreateGenerator(&randGen,
105 |                                            CURAND_RNG_PSEUDO_DEFAULT));
106 |         CHECK(cudaMalloc((void **)&dRand, sizeof(float) * dRand_length));
107 |         hRand = (float *)malloc(sizeof(float) * dRand_length);
108 |     }
109 | 
110 |     if (dRand_used == dRand_length)
111 |     {
112 |         /*
113 |          * If all pre-generated random numbers have been consumed, regenerate a
114 |          * new batch using curandGenerateUniform.
115 |          */
116 |         CHECK_CURAND(curandGenerateUniform(randGen, dRand, dRand_length));
117 |         CHECK(cudaMemcpy(hRand, dRand, sizeof(float) * dRand_length,
118 |                         cudaMemcpyDeviceToHost));
119 |         dRand_used = 0;
120 |     }
121 | 
122 |     // Return the next pre-generated random number
123 |     return hRand[dRand_used++];
124 | }
125 | 
126 | /*
127 |  * A reference implementation that uses system rand().
128 |  */
float host_rand()
{
    // Scale a system rand() sample by RAND_MAX
    const float sample = (float)rand();
    const float upper = (float)RAND_MAX;

    return sample / upper;
}
133 | 
int main(int argc, char **argv)
{
    const int N = 8388608;
    int drawn = 0;

    /*
     * Draw N random numbers from each of the random number generation
     * functions implemented, printing one triple per iteration.
     */
    while (drawn < N)
    {
        const float h = host_rand();
        const float d = cuda_host_rand();
        const float dd = cuda_device_rand();
        printf("%2.4f %2.4f %2.4f\n", h, d, dd);
        // Block until a character arrives on stdin before the next draw
        getchar();
        drawn++;
    }

    return 0;
}
154 | 


--------------------------------------------------------------------------------
/chapter08/simple-data.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | /*
 5 |  * This example offers a brief introduction to the data directive. The
 6 |  * data directive allows the programmer to explicitly mark variables to be
 7 |  * transferred to or from the accelerator. This serves as a performance
 8 |  * optimization by eliminating redundant or unnecessary memcpys.
 9 |  */
10 | 
11 | #define N   1024
12 | 
13 | int main(int argc, char **argv)
14 | {
15 |     int i;
16 |     int *A = (int *)malloc(N * sizeof(int));
17 |     int *B = (int *)malloc(N * sizeof(int));
18 |     int *C = (int *)malloc(N * sizeof(int));
19 |     int *D = (int *)malloc(N * sizeof(int));
20 | 
21 |     // Initialize A and B
22 |     for (i = 0; i < N; i++)
23 |     {
24 |         A[i] = i;
25 |         B[i] = 2 * i;
26 |     }
27 | 
28 |     /*
29 |      * Transfer the full contents of A and B to the accelerator, and transfer
30 |      * the full contents of C and D back.
31 |      */
32 | #pragma acc data copyin(A[0:N], B[0:N]) copyout(C[0:N], D[0:N])
33 |     {
34 | #pragma acc parallel
35 |         {
36 | #pragma acc loop
37 | 
38 |             for (i = 0; i < N; i++)
39 |             {
40 |                 C[i] = A[i] + B[i];
41 |             }
42 | 
43 | #pragma acc loop
44 | 
45 |             for (i = 0; i < N; i++)
46 |             {
47 |                 D[i] = C[i] * A[i];
48 |             }
49 |         }
50 |     }
51 | 
52 |     // Display part of the results
53 |     for (i = 0; i < 10; i++)
54 |     {
55 |         printf("%d ", D[i]);
56 |     }
57 | 
58 |     printf("...\n");
59 | 
60 |     return 0;
61 | }
62 | 


--------------------------------------------------------------------------------
/chapter08/simple-kernels.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | /*
 5 |  * This example offers a brief introduction to the kernels directive. The
 6 |  * kernels directive attempts to break the code block that follows into
 7 |  * accelerator kernels, generally by searching for parallelizable loops. It then
 8 |  * launches each kernel on the accelerator using an automatically configured
 9 |  * thread configuration.
10 |  */
11 | 
12 | #define N   1024
13 | 
14 | int main(int argc, char **argv)
15 | {
16 |     int i;
17 |     /*
18 |      * restrict indicates to the compiler that the memory pointed to by A, B, C,
19 |      * and D will only be accessed through those respective pointers or by
20 |      * offsets from those pointers. This restriction makes it possible to
21 |      * analyze the loops below for parallelization.
22 |      */
23 |     int *restrict A = (int *)malloc(N * sizeof(int));
24 |     int *restrict B = (int *)malloc(N * sizeof(int));
25 |     int *restrict C = (int *)malloc(N * sizeof(int));
26 |     int *restrict D = (int *)malloc(N * sizeof(int));
27 | 
28 |     // Initialize A and B
29 |     for (i = 0; i < N; i++)
30 |     {
31 |         A[i] = i;
32 |         B[i] = 2 * i;
33 |     }
34 | 
35 |     // Execute the following block of code on an accelerator
36 | #pragma acc kernels
37 |     {
38 |         for (i = 0; i < N; i++)
39 |         {
40 |             C[i] = A[i] + B[i];
41 |         }
42 | 
43 |         for (i = 0; i < N; i++)
44 |         {
45 |             D[i] = C[i] * A[i];
46 |         }
47 |     }
48 | 
49 |     // Display part of the results
50 |     for (i = 0; i < 10; i++)
51 |     {
52 |         printf("%d ", D[i]);
53 |     }
54 | 
55 |     printf("...\n");
56 | 
57 |     return 0;
58 | }
59 | 


--------------------------------------------------------------------------------
/chapter08/simple-parallel.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | /*
 5 |  * This example offers a brief introduction to the parallel directive. The
 6 |  * parallel directive executes a fixed number of threads throughout the code
 7 |  * block that follows it.  The programmer is responsible for using that
 8 |  * parallelism.
 9 |  */
10 | 
11 | #define N   1024
12 | 
13 | int main(int argc, char **argv)
14 | {
15 |     int i;
16 |     /*
17 |      * Note that this example does not require the restrict keyword that
18 |      * simple-kernels.cu did. Because the parallel directive relies on the
19 |      * programmer to mark parallelism, the compiler does not need to be careful
20 |      * about multiple pointers referencing the same memory locations.
21 |      */
22 |     int *A = (int *)malloc(N * sizeof(int));
23 |     int *B = (int *)malloc(N * sizeof(int));
24 |     int *C = (int *)malloc(N * sizeof(int));
25 |     int *D = (int *)malloc(N * sizeof(int));
26 | 
27 |     // Initialize A and B
28 |     for (i = 0; i < N; i++)
29 |     {
30 |         A[i] = i;
31 |         B[i] = 2 * i;
32 |     }
33 | 
34 |     /*
35 |      * Execute the following block of code on an accelerator, parallelizing the
36 |      * two loops marked.
37 |      */
38 | #pragma acc parallel
39 |     {
40 | #pragma acc loop
41 | 
42 |         for (i = 0; i < N; i++)
43 |         {
44 |             C[i] = A[i] + B[i];
45 |         }
46 | 
47 | #pragma acc loop
48 | 
49 |         for (i = 0; i < N; i++)
50 |         {
51 |             D[i] = C[i] * A[i];
52 |         }
53 |     }
54 | 
55 |     // Display part of the results
56 |     for (i = 0; i < 10; i++)
57 |     {
58 |         printf("%d ", D[i]);
59 |     }
60 | 
61 |     printf("...\n");
62 | 
63 |     return 0;
64 | }
65 | 


--------------------------------------------------------------------------------
/chapter09/Makefile:
--------------------------------------------------------------------------------
 1 | CU_APPS=simple2DFD simpleMultiGPU simpleP2P_PingPong
 2 | C_APPS=simpleC2C simpleP2P simpleP2P_CUDA_Aware
 3 | 
 4 | all: ${C_APPS} ${CU_APPS}
 5 | 
 6 | simpleC2C: simpleC2C.c
 7 | 	gcc -O2 -std=c99 -I${MPI_HOME}/include -L${MPI_HOME}/lib -lmpi -o simpleC2C simpleC2C.c
 8 | simpleP2P: simpleP2P.c
 9 | 	gcc -O2 -std=c99 -I${MPI_HOME}/include -I${CUDA_HOME}/include -L${MPI_HOME}/lib -L${CUDA_HOME}/lib64 -lcudart -lmpi -o simpleP2P simpleP2P.c
10 | simpleP2P_CUDA_Aware: simpleP2P_CUDA_Aware.c
11 | 	gcc -O2 -std=c99 -I${MPI_HOME}/include -I${CUDA_HOME}/include -L${MPI_HOME}/lib -L${CUDA_HOME}/lib64 -lcudart -lmpi -o simpleP2P_CUDA_Aware simpleP2P_CUDA_Aware.c
12 | %: %.cu
13 | 	nvcc -O2 -arch=sm_20 -I${MPI_HOME}/include -o $@ $<
14 | %: %.c
15 | 	gcc -O2 -std=c99 -I${MPI_HOME}/include -o $@ $<
16 | clean:
17 | 	rm -f ${CU_APPS} ${C_APPS}
18 | 


--------------------------------------------------------------------------------
/chapter09/simpleC2C.c:
--------------------------------------------------------------------------------
  1 | #include <mpi.h>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | #include <unistd.h>
  6 | 
  7 | /*
  8 |  * A simple example of using non-blocking communication between multiple MPI
  9 |  * processes to send and receive a char*. The sends and receives are done
 10 |  * repeatedly and the timing results allow inter-process bandwidth to be
 11 |  * calculated.
 12 |  */
 13 | 
 14 | #define MESSAGE_ALIGNMENT 64
 15 | #define MAX_MSG_SIZE (1<<22)
 16 | #define MYBUFSIZE MAX_MSG_SIZE
 17 | #define LOOP_LARGE  100
 18 | 
 19 | void initalData (void * sbuf, void * rbuf, size_t size)
 20 | {
 21 |     memset(sbuf, 'a', size);
 22 |     memset(rbuf, 'b', size);
 23 | }
 24 | 
 25 | int main (int argc, char *argv[])
 26 | {
 27 |     int rank, nprocs, ilen;
 28 |     char processor[MPI_MAX_PROCESSOR_NAME];
 29 |     double tstart = 0.0, tend = 0.0;
 30 | 
 31 |     MPI_Status reqstat;
 32 |     MPI_Request send_request;
 33 |     MPI_Request recv_request;
 34 | 
 35 |     MPI_Init(&argc, &argv);
 36 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 37 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 38 |     MPI_Get_processor_name(processor, &ilen);
 39 | 
 40 |     if(nprocs != 2)
 41 |     {
 42 |         if(rank == 0) printf("This test requires exactly two processes\n");
 43 | 
 44 |         MPI_Finalize();
 45 |         exit(EXIT_FAILURE);
 46 |     }
 47 | 
 48 |     char *s_buf, *r_buf;
 49 |     s_buf = (char *)malloc(MYBUFSIZE);
 50 |     r_buf = (char *)malloc(MYBUFSIZE);
 51 | 
 52 |     int other_proc = (rank == 1 ? 0 : 1);
 53 | 
 54 |     if(rank == 0 )
 55 |     {
 56 |         printf("%s allocates %d MB dynamic memory aligned to 64 byte\n",
 57 |                               argv[0], MAX_MSG_SIZE / 1024 / 1024);
 58 |     }
 59 | 
 60 |     printf("node=%d(%s): my other _proc = %d\n", rank, processor, other_proc);
 61 | 
 62 |     int loop = LOOP_LARGE;
 63 | 
 64 |     // latency test
 65 |     for(int size = 1024; size <= MAX_MSG_SIZE; size = size * 4)
 66 |     {
 67 |         initalData(s_buf, r_buf, size);
 68 | 
 69 |         MPI_Barrier(MPI_COMM_WORLD);
 70 | 
 71 |         if(rank == 0)
 72 |         {
 73 |             tstart = MPI_Wtime();
 74 | 
 75 |             for(int i = 0; i < loop; i++)
 76 |             {
 77 |                 MPI_Irecv(r_buf, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
 78 |                         &recv_request);
 79 |                 MPI_Isend(s_buf, size, MPI_CHAR, other_proc, 100,
 80 |                         MPI_COMM_WORLD, &send_request);
 81 |                 MPI_Waitall(1, &send_request, &reqstat);
 82 |                 MPI_Waitall(1, &recv_request, &reqstat);
 83 |             }
 84 | 
 85 |             tend = MPI_Wtime();
 86 |         }
 87 |         else
 88 |         {
 89 |             for(int i = 0; i < loop; i++)
 90 |             {
 91 |                 MPI_Irecv(r_buf, size, MPI_CHAR, other_proc, 100,
 92 |                         MPI_COMM_WORLD, &recv_request);
 93 |                 MPI_Isend(s_buf, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
 94 |                         &send_request);
 95 |                 MPI_Waitall(1, &send_request, &reqstat);
 96 |                 MPI_Waitall(1, &recv_request, &reqstat);
 97 |             }
 98 |         }
 99 | 
100 |         MPI_Barrier(MPI_COMM_WORLD);
101 | 
102 |         if(rank == 0)
103 |         {
104 |             double latency = (tend - tstart) * 1e6 / (2.0 * loop);
105 |             float performance = (float) size / (float) latency;
106 |             printf("%6d %s %10.2f μs %10.2f MB/sec\n",
107 |                    (size >= 1024 * 1024) ? size / 1024 / 1024 : size / 1024,
108 |                    (size >= 1024 * 1024) ? "MB" : "KB", latency, performance);
109 |             fflush(stdout);
110 |         }
111 |     }
112 | 
113 |     free(s_buf);
114 |     free(r_buf);
115 | 
116 |     MPI_Finalize();
117 | 
118 |     return EXIT_SUCCESS;
119 | }
120 | 


--------------------------------------------------------------------------------
/chapter09/simpleMultiGPU.cu:
--------------------------------------------------------------------------------
#include "../common/common.h"
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <cuda_runtime.h>
  5 | 
  6 | /*
  7 |  * A simple example of a multi-GPU CUDA application implementing a vector sum.
  8 |  * Note that all communication and computation is done asynchronously in order
  9 |  * to overlap computation across multiple devices, and that this requires
 10 |  * allocating page-locked host memory associated with a specific device.
 11 |  */
 12 | 
 13 | __global__ void iKernel(float *A, float *B, float *C, const int N)
 14 | {
 15 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
 16 | 
 17 |     if (i < N) C[i] = A[i] + B[i];
 18 | }
 19 | 
 20 | void checkResult(float *hostRef, float *gpuRef, const int N)
 21 | {
 22 |     double epsilon = 1.0E-8;
 23 | 
 24 |     for (int i = 0; i < N; i++)
 25 |     {
 26 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon)
 27 |         {
 28 |             printf("Arrays do not match!\n");
 29 |             printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
 30 |                     gpuRef[i], i);
 31 |             break;
 32 |         }
 33 |     }
 34 | }
 35 | 
 36 | void initialData(float * const ip, int const  size)
 37 | {
 38 |     for (int i = 0; i < size; i++)
 39 |     {
 40 |         ip[i] = (float)rand() / (float)RAND_MAX;
 41 |     }
 42 | }
 43 | 
 44 | void sumOnHost(float *A, float *B, float *C, const int N)
 45 | {
 46 |     for (int idx = 0; idx < N; idx++)
 47 |     {
 48 |         C[idx] = A[idx] + B[idx];
 49 |     }
 50 | }
 51 | 
 52 | int main(int argc, char **argv)
 53 | {
 54 |     int ngpus;
 55 | 
 56 |     printf("> starting %s", argv[0]);
 57 | 
 58 |     CHECK(cudaGetDeviceCount(&ngpus));
 59 |     printf(" CUDA-capable devices: %i\n", ngpus);
 60 | 
 61 |     int ishift = 24;
 62 | 
 63 |     if (argc > 2) ishift = atoi(argv[2]);
 64 | 
 65 |     int size = 1 << ishift;
 66 | 
 67 |     if (argc > 1)
 68 |     {
 69 |         if (atoi(argv[1]) > ngpus)
 70 |         {
 71 |             fprintf(stderr, "Invalid number of GPUs specified: %d is greater "
 72 |                     "than the total number of GPUs in this platform (%d)\n",
 73 |                     atoi(argv[1]), ngpus);
 74 |             exit(1);
 75 |         }
 76 | 
 77 |         ngpus  = atoi(argv[1]);
 78 |     }
 79 | 
 80 |     int    iSize  = size / ngpus;
 81 |     size_t iBytes = iSize * sizeof(float);
 82 | 
 83 |     printf("> total array size %d M, using %d devices with each device "
 84 |             "handling %d M\n", size / 1024 / 1024, ngpus, iSize / 1024 / 1024);
 85 | 
 86 |     // allocat device emory
 87 |     float **d_A = (float **)malloc(sizeof(float *) * ngpus);
 88 |     float **d_B = (float **)malloc(sizeof(float *) * ngpus);
 89 |     float **d_C = (float **)malloc(sizeof(float *) * ngpus);
 90 | 
 91 |     float **h_A = (float **)malloc(sizeof(float *) * ngpus);
 92 |     float **h_B = (float **)malloc(sizeof(float *) * ngpus);
 93 |     float **hostRef = (float **)malloc(sizeof(float *) * ngpus);
 94 |     float **gpuRef = (float **)malloc(sizeof(float *) * ngpus);
 95 |     cudaStream_t *stream = (cudaStream_t *)malloc(sizeof(cudaStream_t) * ngpus);
 96 | 
 97 |     for (int i = 0; i < ngpus; i++)
 98 |     {
 99 |         // set current device
100 |         CHECK(cudaSetDevice(i));
101 | 
102 |         // allocate device memory
103 |         CHECK(cudaMalloc((void **) &d_A[i], iBytes));
104 |         CHECK(cudaMalloc((void **) &d_B[i], iBytes));
105 |         CHECK(cudaMalloc((void **) &d_C[i], iBytes));
106 | 
107 |         // allocate page locked host memory for asynchronous data transfer
108 |         CHECK(cudaMallocHost((void **) &h_A[i],     iBytes));
109 |         CHECK(cudaMallocHost((void **) &h_B[i],     iBytes));
110 |         CHECK(cudaMallocHost((void **) &hostRef[i], iBytes));
111 |         CHECK(cudaMallocHost((void **) &gpuRef[i],  iBytes));
112 | 
113 |         // create streams for timing and synchronizing
114 |         CHECK(cudaStreamCreate(&stream[i]));
115 |     }
116 | 
117 |     dim3 block (512);
118 |     dim3 grid  ((iSize + block.x - 1) / block.x);
119 | 
120 |     for (int i = 0; i < ngpus; i++)
121 |     {
122 |         CHECK(cudaSetDevice(i));
123 |         initialData(h_A[i], iSize);
124 |         initialData(h_B[i], iSize);
125 |     }
126 | 
127 |     // record start time
128 |     double iStart = seconds();
129 | 
130 |     // distributing the workload across multiple devices
131 |     for (int i = 0; i < ngpus; i++)
132 |     {
133 |         CHECK(cudaSetDevice(i));
134 | 
135 |         CHECK(cudaMemcpyAsync(d_A[i], h_A[i], iBytes, cudaMemcpyHostToDevice,
136 |                               stream[i]));
137 |         CHECK(cudaMemcpyAsync(d_B[i], h_B[i], iBytes, cudaMemcpyHostToDevice,
138 |                               stream[i]));
139 | 
140 |         iKernel<<<grid, block, 0, stream[i]>>>(d_A[i], d_B[i], d_C[i], iSize);
141 | 
142 |         CHECK(cudaMemcpyAsync(gpuRef[i], d_C[i], iBytes, cudaMemcpyDeviceToHost,
143 |                               stream[i]));
144 |     }
145 | 
146 |     // synchronize streams
147 |     for (int i = 0; i < ngpus; i++)
148 |     {
149 |         CHECK(cudaSetDevice(i));
150 |         CHECK(cudaStreamSynchronize(stream[i]));
151 |     }
152 | 
153 |     // calculate the elapsed time in seconds
154 |     double iElaps = seconds() - iStart;
155 |     printf("%d GPU timer elapsed: %8.2fms \n", ngpus, iElaps * 1000.0);
156 | 
157 |     // check results
158 |     for (int i = 0; i < ngpus; i++)
159 |     {
160 |         //Set device
161 |         CHECK(cudaSetDevice(i));
162 |         sumOnHost(h_A[i], h_B[i], hostRef[i], iSize);
163 |         checkResult(hostRef[i], gpuRef[i], iSize);
164 |     }
165 | 
166 |     // Cleanup and shutdown
167 |     for (int i = 0; i < ngpus; i++)
168 |     {
169 |         CHECK(cudaSetDevice(i));
170 |         CHECK(cudaFree(d_A[i]));
171 |         CHECK(cudaFree(d_B[i]));
172 |         CHECK(cudaFree(d_C[i]));
173 | 
174 |         CHECK(cudaFreeHost(h_A[i]));
175 |         CHECK(cudaFreeHost(h_B[i]));
176 |         CHECK(cudaFreeHost(hostRef[i]));
177 | 
178 |         CHECK(cudaFreeHost(gpuRef[i]));
179 |         CHECK(cudaStreamDestroy(stream[i]));
180 | 
181 |         CHECK(cudaDeviceReset());
182 |     }
183 | 
184 |     free(d_A);
185 |     free(d_B);
186 |     free(d_C);
187 |     free(h_A);
188 |     free(h_B);
189 |     free(hostRef);
190 |     free(gpuRef);
191 |     free(stream);
192 | 
193 |     return EXIT_SUCCESS;
194 | }
195 | 


--------------------------------------------------------------------------------
/chapter09/simpleP2P.c:
--------------------------------------------------------------------------------
  1 | #include <mpi.h>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | #include <unistd.h>
  6 | #include <semaphore.h>
  7 | #include <cuda_runtime_api.h>
  8 | 
  9 | /*
 10 |  * A simple example of using the MPI and CUDA communication APIs to manually
 11 |  * transfer data from a GPU managed in one MPI process to a GPU managed in
 12 |  * another. The general steps performed are GPU0 -> cudaMemcpy -> rank0 ->
 13 |  * MPI_Isend -> rank1 -> cudaMemcpy -> GPU1.
 14 |  */
 15 | 
 16 | #define CHECK(call)                                                            \
 17 | {                                                                              \
 18 |     const cudaError_t error = call;                                            \
 19 |     if (error != cudaSuccess)                                                  \
 20 |     {                                                                          \
 21 |         fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
 22 |         fprintf(stderr, "code: %d, reason: %s\n", error,                       \
 23 |                 cudaGetErrorString(error));                                    \
 24 |     }                                                                          \
 25 | }
 26 | 
 27 | #define MESSAGE_ALIGNMENT 64
 28 | #define MAX_MSG_SIZE (1<<22)
 29 | #define MYBUFSIZE MAX_MSG_SIZE
 30 | 
 31 | #define LOOP_LARGE  100
 32 | #define SKIP_LARGE  10
 33 | #define LARGE_MESSAGE_SIZE  8192
 34 | 
 35 | int loop = LOOP_LARGE;
 36 | 
 37 | void initalData (void * sbuf, void * rbuf, size_t size)
 38 | {
 39 |     memset(sbuf, 'a', size);
 40 |     memset(rbuf, 'b', size);
 41 | }
 42 | 
 43 | int main (int argc, char *argv[])
 44 | {
 45 |     int rank, nprocs, ilen;
 46 |     char processor[MPI_MAX_PROCESSOR_NAME];
 47 |     double tstart = 0.0, tend = 0.0;
 48 | 
 49 |     MPI_Status reqstat;
 50 |     MPI_Request send_request;
 51 |     MPI_Request recv_request;
 52 | 
 53 |     MPI_Init(&argc, &argv);
 54 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 55 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 56 |     MPI_Get_processor_name(processor, &ilen);
 57 | 
 58 |     if (nprocs != 2)
 59 |     {
 60 |         if(rank == 0) printf("This test requires exactly two processes\n");
 61 | 
 62 |         MPI_Finalize();
 63 |         exit(EXIT_FAILURE);
 64 |     }
 65 | 
 66 |     int other_proc = (rank == 1 ? 0 : 1);
 67 | 
 68 |     // Hard code GPU affinity since this example only works with 2 GPUs.
 69 |     int igpu = (rank == 1 ? 0 : 1);
 70 | 
 71 |     if(rank == 0 )
 72 |         printf("%s allocates %d MB pinned memory with regual mpi and "
 73 |                "bidirectional bandwidth\n", argv[0],
 74 |                MAX_MSG_SIZE / 1024 / 1024);
 75 | 
 76 |     printf("node=%d(%s): my other _proc = %d and using GPU=%d\n", rank,
 77 |             processor, other_proc, igpu);
 78 | 
 79 |     char *h_src, *h_rcv;
 80 |     CHECK(cudaSetDevice(igpu));
 81 |     CHECK(cudaMallocHost((void**)&h_src, MYBUFSIZE));
 82 |     CHECK(cudaMallocHost((void**)&h_rcv, MYBUFSIZE));
 83 | 
 84 |     char *d_src, *d_rcv;
 85 |     CHECK(cudaSetDevice(igpu));
 86 |     CHECK(cudaMalloc((void **)&d_src, MYBUFSIZE));
 87 |     CHECK(cudaMalloc((void **)&d_rcv, MYBUFSIZE));
 88 | 
 89 |     initalData(h_src, h_rcv, MYBUFSIZE);
 90 | 
 91 |     CHECK(cudaMemcpy(d_src, h_src, MYBUFSIZE, cudaMemcpyDefault));
 92 |     CHECK(cudaMemcpy(d_rcv, h_rcv, MYBUFSIZE, cudaMemcpyDefault));
 93 | 
 94 |     // latency test
 95 |     for(int size = 1024; size <= MAX_MSG_SIZE; size = size * 4)
 96 |     {
 97 |         MPI_Barrier(MPI_COMM_WORLD);
 98 | 
 99 |         if(rank == 0)
100 |         {
101 |             tstart = MPI_Wtime();
102 | 
103 |             for(int i = 0; i < loop; i++)
104 |             {
105 |                 /*
106 |                  * Transfer data from the GPU to the host to be transmitted to
107 |                  * the other MPI process.
108 |                  */
109 |                 CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost));
110 | 
111 |                 // bi-directional transmission
112 |                 MPI_Irecv(h_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
113 |                           &recv_request);
114 |                 MPI_Isend(h_src, size, MPI_CHAR, other_proc, 100,
115 |                           MPI_COMM_WORLD, &send_request);
116 | 
117 |                 MPI_Waitall(1, &recv_request, &reqstat);
118 |                 MPI_Waitall(1, &send_request, &reqstat);
119 | 
120 |                 /*
121 |                  * Transfer the data received from the other MPI process to
122 |                  * the device.
123 |                  */
124 |                 CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice));
125 |             }
126 | 
127 |             tend = MPI_Wtime();
128 |         }
129 |         else
130 |         {
131 |             for(int i = 0; i < loop; i++)
132 |             {
133 |                 /*
134 |                  * Transfer data from the GPU to the host to be transmitted to
135 |                  * the other MPI process.
136 |                  */
137 |                 CHECK(cudaMemcpy(h_src, d_src, size, cudaMemcpyDeviceToHost));
138 | 
139 |                 // bi-directional transmission
140 |                 MPI_Irecv(h_rcv, size, MPI_CHAR, other_proc, 100,
141 |                           MPI_COMM_WORLD, &recv_request);
142 |                 MPI_Isend(h_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
143 |                           &send_request);
144 | 
145 |                 MPI_Waitall(1, &recv_request, &reqstat);
146 |                 MPI_Waitall(1, &send_request, &reqstat);
147 | 
148 |                 /*
149 |                  * Transfer the data received from the other MPI process to
150 |                  * the device.
151 |                  */
152 |                 CHECK(cudaMemcpy(d_rcv, h_rcv, size, cudaMemcpyHostToDevice));
153 |             }
154 |         }
155 | 
156 |         MPI_Barrier(MPI_COMM_WORLD);
157 | 
158 |         if(rank == 0)
159 |         {
160 |             double latency = (tend - tstart) * 1e6 / (2.0 * loop);
161 |             float performance = (float) size / (float) latency;
162 |             printf("%6d %s %10.2f μs %10.2f MB/sec\n",
163 |                    (size >= 1024 * 1024) ? size / 1024 / 1024 : size / 1024,
164 |                    (size >= 1024 * 1024) ? "MB" : "KB", latency, performance);
165 | 
166 |             fflush(stdout);
167 |         }
168 |     }
169 | 
170 |     CHECK(cudaFreeHost(h_src));
171 |     CHECK(cudaFreeHost(h_rcv));
172 | 
173 |     CHECK(cudaSetDevice(igpu));
174 |     CHECK(cudaFree(d_src));
175 |     CHECK(cudaFree(d_rcv));
176 | 
177 |     MPI_Finalize();
178 | 
179 |     return EXIT_SUCCESS;
180 | }
181 | 


--------------------------------------------------------------------------------
/chapter09/simpleP2P_CUDA_Aware.c:
--------------------------------------------------------------------------------
  1 | #include <mpi.h>
  2 | #include <stdlib.h>
  3 | #include <stdio.h>
  4 | #include <string.h>
  5 | #include <unistd.h>
  6 | #include <cuda.h>
  7 | #include <cuda_runtime_api.h>
  8 | 
  9 | /*
 10 |  * An example of using a CUDA-aware MPI implementation to transfer an array
 11 |  * directly from one GPU to another, between MPI processes. Note that no CUDA
 12 |  * transfer API calls are used here, and that device pointers are passed
 13 |  * directly to MPI_Isend and MPI_Irecv.
 14 |  */
 15 | 
 16 | #define CHECK(call)                                                            \
 17 | {                                                                              \
 18 |     const cudaError_t error = call;                                            \
 19 |     if (error != cudaSuccess)                                                  \
 20 |     {                                                                          \
 21 |         fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
 22 |         fprintf(stderr, "code: %d, reason: %s\n", error,                       \
 23 |                 cudaGetErrorString(error));                                    \
 24 |     }                                                                          \
 25 | }
 26 | 
 27 | #define MESSAGE_ALIGNMENT 64
 28 | #define MAX_MSG_SIZE (1<<22)
 29 | #define MYBUFSIZE MAX_MSG_SIZE
 30 | 
 31 | #define LOOP_LARGE  100
 32 | #define FIELD_WIDTH 20
 33 | #define FLOAT_PRECISION 2
 34 | 
 35 | void SetDeviceBeforeInit()
 36 | {
 37 |     int devCount = 0;
 38 |     int rank = atoi(getenv("MV2_COMM_WORLD_RANK"));
 39 |     int idev = (rank == 0 ? 1 : 0);
 40 |     CHECK(cudaSetDevice(idev));
 41 | 
 42 |     printf("local rank=%d: and idev %d\n", rank, idev);
 43 | }
 44 | 
 45 | int main (int argc, char *argv[])
 46 | {
 47 |     int rank, nprocs, ilen;
 48 |     char processor[MPI_MAX_PROCESSOR_NAME];
 49 |     double tstart = 0.0, tend = 0.0;
 50 | 
 51 |     MPI_Status reqstat;
 52 |     MPI_Request send_request;
 53 |     MPI_Request recv_request;
 54 | 
 55 |     MPI_Init(&argc, &argv);
 56 |     MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
 57 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 58 | 
 59 |     MPI_Get_processor_name(processor, &ilen);
 60 | 
 61 |     if(nprocs != 2)
 62 |     {
 63 |         if(rank == 0) printf("This test requires exactly two processes\n");
 64 | 
 65 |         MPI_Finalize();
 66 |         exit(EXIT_FAILURE);
 67 |     }
 68 | 
 69 |     char *h_src, *h_rcv;
 70 | 
 71 |     int other_proc = (rank == 1 ? 0 : 1);
 72 |     int igpu = (rank == 1 ? 0 : 1);
 73 | 
 74 |     int loop = LOOP_LARGE;
 75 | 
 76 |     printf("node=%d(%s): my other _proc = %d and using GPU=%d loop %d\n", rank,
 77 |            processor, other_proc, igpu, loop);
 78 | 
 79 |     char *d_src, *d_rcv;
 80 |     CHECK(cudaSetDevice(igpu));
 81 |     CHECK(cudaMalloc((void **)&d_src, MYBUFSIZE));
 82 |     CHECK(cudaMalloc((void **)&d_rcv, MYBUFSIZE));
 83 | 
 84 |     for (int size = 1; size <= MAX_MSG_SIZE; size *= 2)
 85 |     {
 86 |         MPI_Barrier(MPI_COMM_WORLD);
 87 | 
 88 |         CHECK(cudaMemset(d_src, 'a', size));
 89 |         CHECK(cudaMemset(d_rcv, 'b', size));
 90 | 
 91 |         if(rank == 0)
 92 |         {
 93 |             tstart = MPI_Wtime();
 94 | 
 95 |             for(int i = 0; i < loop; i++)
 96 |             {
 97 |                 MPI_Isend(d_src, size, MPI_CHAR, other_proc, 100,
 98 |                         MPI_COMM_WORLD, &send_request);
 99 |                 MPI_Irecv(d_rcv, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
100 |                         &recv_request);
101 | 
102 |                 MPI_Waitall(1, &recv_request, &reqstat);
103 |                 MPI_Waitall(1, &send_request, &reqstat);
104 | 
105 |             }
106 | 
107 |             tend = MPI_Wtime();
108 |         }
109 |         else
110 |         {
111 |             for(int i = 0; i < loop; i++)
112 |             {
113 |                 MPI_Isend(d_src, size, MPI_CHAR, other_proc, 10, MPI_COMM_WORLD,
114 |                         &send_request);
115 |                 MPI_Irecv(d_rcv, size, MPI_CHAR, other_proc, 100,
116 |                         MPI_COMM_WORLD, &recv_request);
117 | 
118 |                 MPI_Waitall(1, &recv_request, &reqstat);
119 |                 MPI_Waitall(1, &send_request, &reqstat);
120 |             }
121 |         }
122 | 
123 |         MPI_Barrier(MPI_COMM_WORLD);
124 | 
125 |         if(rank == 0)
126 |         {
127 |             double tmp = size / 1e6 * loop  * 2;
128 |             double t = (tend - tstart);
129 | 
130 |             printf("%-*d%*.*f\n", 10, size, FIELD_WIDTH, FLOAT_PRECISION,
131 |                     tmp / t);
132 |             fflush(stdout);
133 |         }
134 |     }
135 | 
136 |     CHECK(cudaSetDevice(igpu));
137 |     CHECK(cudaFree(d_src));
138 |     CHECK(cudaFree(d_rcv));
139 | 
140 |     MPI_Finalize();
141 | 
142 |     return EXIT_SUCCESS;
143 | }
144 | 


--------------------------------------------------------------------------------
/chapter10/debug-hazards.cu:
--------------------------------------------------------------------------------
  1 | #include "../common/common.h"
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | 
  5 | /**
  6 |  * This example illustrates different approaches to optimizing access to a
  7 |  * single shared variable by limiting conflicting, atomic operations on it.
  8 |  *
  9 |  * The first kernel, naive_reduction, simply performs an atomicAdd from every
 10 |  * thread on the same shared variable.
 11 |  *
 12 |  * simple_reduction first stores the values to be added together in shared
 13 |  * memory. Then, a single thread iterates over those values and computes a
 14 |  * partial sum. Finally, that partial sum is added to the global result using an
 15 |  * atomicAdd.
 16 |  *
 17 |  * parallel_reduction is the most complex example. It performs a parallel
 18 |  * reduction within each thread block. The partial result produced by that
 19 |  * local reduction is then added to the global result with an atomicAdd.
 20 |  *
 21 |  * The core of each of these kernels is wrapped in a loop to augment the amount
 22 |  * of work done and make timing the kernels at the millisecond granularity
 23 |  * feasible.
 24 |  **/
 25 | 
 26 | /**
 27 |  * This implementation makes use of shared memory and local reduction to improve
 28 |  * performance and decrease contention
 29 |  **/
 30 | __global__ void simple_reduction(int *shared_var, int *input_values, int N,
 31 |                                  int iters)
 32 | {
 33 |     __shared__ int local_mem[256];
 34 |     int iter, i;
 35 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 36 |     int local_tid = threadIdx.x;
 37 |     int local_dim = blockDim.x;
 38 |     int minThreadInThisBlock = blockIdx.x * blockDim.x;
 39 |     int maxThreadInThisBlock = minThreadInThisBlock + (blockDim.x - 1);
 40 | 
 41 |     if (maxThreadInThisBlock >= N)
 42 |     {
 43 |         local_dim = N - minThreadInThisBlock;
 44 |     }
 45 | 
 46 |     for (iter = 0; iter < iters; iter++)
 47 |     {
 48 |         if (tid < N)
 49 |         {
 50 |             local_mem[local_tid] = input_values[tid];
 51 |         }
 52 | 
 53 |         // Required for correctness
 54 |         // __syncthreads();
 55 | 
 56 |         /*
 57 |          * Perform the local reduction across values written to shared memory
 58 |          * by threads in this thread block.
 59 |          */
 60 |         if (local_tid == 0)
 61 |         {
 62 |             int sum = 0;
 63 | 
 64 |             for (i = 0; i < local_dim; i++)
 65 |             {
 66 |                 sum = sum + local_mem[i];
 67 |             }
 68 | 
 69 |             atomicAdd(shared_var, sum);
 70 |         }
 71 | 
 72 |         // Required for correctness
 73 |         // __syncthreads();
 74 |     }
 75 | }
 76 | 
 77 | int main(int argc, char **argv)
 78 | {
 79 |     int N = 20480;
 80 |     int block = 256;
 81 |     int device_iters = 3;
 82 |     int runs = 1;
 83 |     int i, true_value;
 84 |     int *d_shared_var, *d_input_values, *h_input_values;
 85 |     int h_sum;
 86 |     double mean_time = 0.0;
 87 | 
 88 |     CHECK(cudaMalloc((void **)&d_shared_var, sizeof(int)));
 89 |     CHECK(cudaMalloc((void **)&d_input_values, N * sizeof(int)));
 90 |     h_input_values = (int *)malloc(N * sizeof(int));
 91 | 
 92 |     for (i = 0; i < N; i++)
 93 |     {
 94 |         h_input_values[i] = i;
 95 |         true_value += i;
 96 |     }
 97 | 
 98 |     true_value *= device_iters;
 99 | 
100 |     for (i = 0; i < runs; i++)
101 |     {
102 |         CHECK(cudaMemset(d_shared_var, 0x00, sizeof(int)));
103 |         CHECK(cudaMemcpy(d_input_values, h_input_values, N * sizeof(int),
104 |                          cudaMemcpyHostToDevice));
105 |         double start = seconds();
106 | 
107 |         simple_reduction<<<N / block, block>>>(d_shared_var,
108 |                 d_input_values, N, device_iters);
109 | 
110 |         CHECK(cudaDeviceSynchronize());
111 |         mean_time += seconds() - start;
112 |         CHECK(cudaMemcpy(&h_sum, d_shared_var, sizeof(int),
113 |                          cudaMemcpyDeviceToHost));
114 | 
115 |         if (h_sum != true_value)
116 |         {
117 |             fprintf(stderr, "Validation failure: expected %d, got %d\n",
118 |                     true_value, h_sum);
119 |             return 1;
120 |         }
121 |     }
122 | 
123 |     mean_time /= runs;
124 | 
125 |     printf("Mean execution time for reduction: %.4f ms\n",
126 |            mean_time * 1000.0);
127 | 
128 |     return 0;
129 | }
130 | 


--------------------------------------------------------------------------------
/chapter10/debug-segfault.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | 
 4 | /*
 5 |  * This example purposefully introduces an invalid memory access on the GPU to
 6 |  * illustrate the use of cuda-gdb.
 7 |  */
 8 | 
 9 | #define N   1025
10 | #define M   12
11 | 
/*
 * Value stored in the matrix cell at (row, col): twice the row index. The
 * column index is accepted but does not influence the result.
 */
__device__ int foo(int row, int col)
{
    const int value = 2 * row;

    return value;
}
16 | 
/*
 * Fill an N x M matrix, given as an array of N row pointers, with the values
 * produced by foo. Rows are distributed over the threads of the launch and
 * the M columns of a row are written sequentially by the owning thread.
 *
 * Every entry arr[0..N-1] is dereferenced, so each must be a valid device
 * pointer to at least M ints. Any 1D launch configuration is accepted.
 */
__global__ void kernel(int **arr)
{
    // Total number of threads in the launch; 64-bit so the product cannot wrap.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    size_t row;
    int col;

    /*
     * Each thread starts at its own global index and advances by the launch
     * width, so every row is written by exactly one thread. (Advancing by one
     * made each thread rewrite all rows from its index up to N - 1.)
     */
    for (row = (size_t)blockIdx.x * blockDim.x + threadIdx.x; row < N;
            row += stride)
    {
        for (col = 0; col < M; col++)
        {
            arr[row][col] = foo((int)row, col);
        }
    }
}
34 | 
/*
 * Host driver for the invalid-memory-access example. It allocates N rows of
 * M ints on the host and on the device plus a zero-filled device table of row
 * pointers, launches the kernel on that table, then copies each device row
 * back to the host and releases the rows.
 *
 * The device row pointers collected on the host are never copied into the
 * device table, so the kernel receives a table of zeroed pointers. This is
 * the invalid access the file header says is introduced on purpose.
 */
int main(int argc, char **argv)
{
    const size_t tableBytes = N * sizeof(int *);
    const size_t rowBytes = M * sizeof(int);
    int row;

    // Host representation of a 2D matrix
    int **h_matrix = (int **)malloc(tableBytes);
    // A host array of device pointers to the matrix rows on the device
    int **d_ptrs = (int **)malloc(tableBytes);
    // A device array of device pointers, meant to be filled from d_ptrs
    int **d_matrix;

    CHECK(cudaMalloc((void **)&d_matrix, tableBytes));
    CHECK(cudaMemset(d_matrix, 0x00, tableBytes));

    // Allocate rows on the host and device
    row = 0;

    while (row < N)
    {
        h_matrix[row] = (int *)malloc(rowBytes);
        CHECK(cudaMalloc((void **)&d_ptrs[row], rowBytes));
        CHECK(cudaMemset(d_ptrs[row], 0x00, rowBytes));
        row++;
    }

    const int threadsPerBlock = 256;
    const int blocksPerGrid = 1024;
    kernel<<<blocksPerGrid, threadsPerBlock>>>(d_matrix);

    // Copy rows back, releasing each row once it has been transferred
    for (row = 0; row < N; row++)
    {
        CHECK(cudaMemcpy(h_matrix[row], d_ptrs[row], rowBytes,
                         cudaMemcpyDeviceToHost));
        CHECK(cudaFree(d_ptrs[row]));
        free(h_matrix[row]);
    }

    CHECK(cudaFree(d_matrix));
    free(h_matrix);

    return 0;
}
76 | 


--------------------------------------------------------------------------------
/chapter10/debug-segfault.fixed.cu:
--------------------------------------------------------------------------------
 1 | #include "../common/common.h"
 2 | #include <stdio.h>
 3 | 
 4 | /*
 5 |  * Fixed version of debug-segfault.cu (an example that purposefully makes an
 6 |  * invalid GPU memory access to illustrate cuda-gdb): d_matrix is now filled.
 7 |  */
 8 | 
 9 | #define N   1025
10 | #define M   12
11 | 
/*
 * Value stored in the matrix cell at (row, col): twice the row index. The
 * column index is accepted but does not influence the result.
 */
__device__ int foo(int row, int col)
{
    const int value = 2 * row;

    return value;
}
16 | 
/*
 * Fill an N x M matrix, given as an array of N row pointers, with the values
 * produced by foo. Rows are distributed over the threads of the launch and
 * the M columns of a row are written sequentially by the owning thread.
 *
 * Every entry arr[0..N-1] is dereferenced, so each must be a valid device
 * pointer to at least M ints. Any 1D launch configuration is accepted.
 */
__global__ void kernel(int **arr)
{
    // Total number of threads in the launch; 64-bit so the product cannot wrap.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    size_t row;
    int col;

    /*
     * Each thread starts at its own global index and advances by the launch
     * width, so every row is written by exactly one thread. (Advancing by one
     * made each thread rewrite all rows from its index up to N - 1.)
     */
    for (row = (size_t)blockIdx.x * blockDim.x + threadIdx.x; row < N;
            row += stride)
    {
        for (col = 0; col < M; col++)
        {
            arr[row][col] = foo((int)row, col);
        }
    }
}
34 | 
/*
 * Host driver for the corrected example. It allocates N rows of M ints on the
 * host and on the device, copies the table of device row pointers to the
 * device, launches the kernel on that table, then copies each row back to the
 * host and releases everything.
 *
 * Returns 0 on completion and 1 if a host allocation fails.
 */
int main(int argc, char **argv)
{
    int i;
    // Host representation of a 2D matrix
    int **h_matrix;
    // A host array of device pointers to the matrix rows on the device
    int **d_ptrs;
    // A device array of device pointers, filled from d_ptrs
    int **d_matrix;

    h_matrix = (int **)malloc(N * sizeof(int *));
    d_ptrs = (int **)malloc(N * sizeof(int *));

    if (h_matrix == NULL || d_ptrs == NULL)
    {
        fprintf(stderr, "Failed allocating host pointer tables\n");
        free(h_matrix);
        free(d_ptrs);
        return 1;
    }

    CHECK(cudaMalloc((void **)&d_matrix, N * sizeof(int *)));
    CHECK(cudaMemset(d_matrix, 0x00, N * sizeof(int *)));

    // Allocate rows on the host and device
    for (i = 0; i < N; i++)
    {
        h_matrix[i] = (int *)malloc(M * sizeof(int));

        if (h_matrix[i] == NULL)
        {
            fprintf(stderr, "Failed allocating host row %d\n", i);
            return 1;
        }

        CHECK(cudaMalloc((void **)&d_ptrs[i], M * sizeof(int)));
        CHECK(cudaMemset(d_ptrs[i], 0x00, M * sizeof(int)));
    }

    // Publish the row pointers to the device before the kernel reads them.
    CHECK(cudaMemcpy(d_matrix, d_ptrs, N * sizeof(int *),
                    cudaMemcpyHostToDevice));

    int threadsPerBlock = 256;
    int blocksPerGrid = 1024;
    kernel<<<blocksPerGrid, threadsPerBlock>>>(d_matrix);

    // Report launch errors, then execution errors, at the point they occur.
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Copy rows back
    for (i = 0; i < N; i++)
    {
        CHECK(cudaMemcpy(h_matrix[i], d_ptrs[i], M * sizeof(int),
                        cudaMemcpyDeviceToHost));
        CHECK(cudaFree(d_ptrs[i]));
        free(h_matrix[i]);
    }

    CHECK(cudaFree(d_matrix));
    free(h_matrix);
    // The host-side table of device pointers was previously never released.
    free(d_ptrs);

    return 0;
}
79 | 


--------------------------------------------------------------------------------
/chapter10/generate_data.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | 
 4 | /*
 5 |  * Generate a sample input for crypt to encrypt and decrypt. generate_data
 6 |  * allows the user to specify the output file and length in bytes.
 7 |  */
 8 | 
 9 | #define CHUNK_SIZE 1024
10 | signed char chunk[CHUNK_SIZE];
11 | 
12 | int main(int argc, char **argv)
13 | {
14 |     int i, j;
15 |     FILE *out;
16 |     int outLength;
17 |     int *ichunk;
18 | 
19 |     if (argc != 3)
20 |     {
21 |         printf("usage: %s <output-file> <output-file-length>\n", argv[0]);
22 |         return (1);
23 |     }
24 | 
25 |     out = fopen(argv[1], "w");
26 | 
27 |     if (out == NULL)
28 |     {
29 |         fprintf(stderr, "Failed opening %s for writing\n", argv[1]);
30 |         return (1);
31 |     }
32 | 
33 |     outLength = atoi(argv[2]);
34 | 
35 |     if (outLength % 8 != 0)
36 |     {
37 |         fprintf(stderr, "The specified length (%d) must be evenly divisible "
38 |                 "by 8\n", outLength);
39 |         return (1);
40 |     }
41 | 
42 |     // Write in chunks of CHUNK_SIZE.
43 |     for (i = 0; i < outLength; i += CHUNK_SIZE)
44 |     {
45 |         int toWrite = CHUNK_SIZE;
46 | 
47 |         if (i + toWrite > outLength)
48 |         {
49 |             toWrite = outLength - i;
50 |         }
51 | 
52 |         for (j = 0; j < toWrite; j++)
53 |         {
54 |             chunk[j] = (i * CHUNK_SIZE + j);
55 |         }
56 | 
57 |         if (fwrite(chunk, 1, toWrite, out) != toWrite)
58 |         {
59 |             fprintf(stderr, "Error writing chunk of length %d\n", toWrite);
60 |             return (1);
61 |         }
62 |     }
63 | 
64 |     fclose(out);
65 | 
66 |     return (0);
67 | }
68 | 


--------------------------------------------------------------------------------