├── RmagmaExample.R~ ├── README.md ├── calc_loglik.cu ├── RmagmaExample.R ├── gpuArrayExample.py ├── random-single.cu ├── random.cu ├── RNGexample-single.R ├── RNGexample.R ├── RCUDAexample.R ├── helloWorld.cu ├── PyCUDAexample.py ├── cudaBlasExample.c ├── kernelExample-pinned.cu ├── kernelExample-single.cu ├── kernelExample.cu ├── magmaExample.c ├── makeGPUimage.sh ├── gpu.Rmd └── gpu.html /RmagmaExample.R~: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gpu-workshop-2014 2 | ================= 3 | 4 | Repository for materials for Chris Paciorek's workshop on GPU computation, April 2014. 5 | 6 | The demo that will be given in the workshop is gpu.html, which is created by running "library(knitr); knit2html('gpu.Rmd')" in R. 7 | -------------------------------------------------------------------------------- /calc_loglik.cu: -------------------------------------------------------------------------------- 1 | #define SQRT_TWO_PI 2.506628274631000 2 | extern "C" 3 | __global__ void calc_loglik(double* vals, int N, double mu, double sigma) { 4 | // note that this assumes no third dimension to the grid 5 | // id of the block 6 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 7 | // size of each block (within grid of blocks) 8 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 9 | // id of thread in a given block 10 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 11 | // assign overall id/index of the thread 12 | int idx = myblock * blocksize + subthread; 13 | 14 | if(idx < N) { 15 | double std = (vals[idx] - mu)/ sigma; 16 | double e = exp( - 0.5 * std * std); 17 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /RmagmaExample.R: -------------------------------------------------------------------------------- 1 | library(magma) 2 | # create a MAGMA matrix and do an operation via the CPU interface 3 | for(n in c(4096, 8192)) { 4 | x <- matrix(rnorm(n^2), n) 5 | mX <- magma(x) 6 | v <- rnorm(n) 7 | mV <- magma(v) 8 | gpu(mV) # will indicate that we are using M 9 | 10 | gpu_time <- system.time({ 11 | mY <- crossprod(mX); 12 | mU <- chol(mY); 13 | mR <- backsolve(mU, mV) 14 | }) 15 | # 2.8 for n=4096; 18.3 for n=8192 16 | 17 | cpu_time <- system.time({ 18 | 19 | Y <- crossprod(x); 20 | U <- chol(Y); 21 | R <- backsolve(U, v) 22 | }) 23 | # 5.8 for n=4096; 45.2 for n=8192 24 | cat("Timing for n=", n, "\n") 25 | cat("GPU time: ", gpu_time[3], "\n") 26 | cat("CPU time: ", cpu_time[3], "\n") 27 | 28 | } 29 | 30 | cat("Check for use of double precision empirically\n") 31 | print(range(abs(mY - Y))) 32 | options(digits = 16) 33 | print(mY[1:3, 1]) 34 | print(Y[1:3, 1]) 35 | -------------------------------------------------------------------------------- /gpuArrayExample.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import pycuda.gpuarray as gpuarray 4 | import pycuda.cumath as cumath 5 | import numpy as np 6 | 7 | N = np.int32(134931456) 8 | 9 | start = drv.Event() 10 | end = drv.Event() 11 | 12 | x = np.random.normal(size = N) 13 | 14 | start.record() 15 | dX = gpuarray.to_gpu(x) 16 | end.record() 17 | end.synchronize() 18 | print "Transfer to 
GPU time: %fs" %(start.time_till(end)*1e-3) 19 | 20 | 21 | print "Timing vectorized exponentiation:" 22 | 23 | start.record() 24 | dexpX = cumath.exp(dX) 25 | end.record() 26 | end.synchronize() 27 | print "GPU array calc time: %fs" %(start.time_till(end)*1e-3) 28 | 29 | start.record() 30 | expX = np.exp(x) 31 | end.record() 32 | end.synchronize() 33 | print "CPU calc time: %fs" %(start.time_till(end)*1e-3) 34 | 35 | print "Timing vectorized dot product/sum of squares:" 36 | 37 | start.record() 38 | gpuarray.dot(dX,dX) 39 | end.record() 40 | end.synchronize() 41 | print "GPU array calc time: %fs" %(start.time_till(end)*1e-3) 42 | 43 | start.record() 44 | np.dot(x, x) 45 | end.record() 46 | end.synchronize() 47 | print "CPU calc time: %fs" %(start.time_till(end)*1e-3) 48 | -------------------------------------------------------------------------------- /random-single.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | extern "C" 8 | { 9 | 10 | __global__ void setup_kernel(curandState *state, int seed, int n, int verbose) 11 | { 12 | // Usual block/thread indexing... 13 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 14 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 15 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 16 | int idx = myblock * blocksize + subthread; 17 | if (verbose){ 18 | printf("Setting up RNG in thread %d (n=%d)...\n",idx,n); 19 | } 20 | curand_init(seed, idx, 0, &state[idx]); 21 | return; 22 | } 23 | 24 | __global__ void rnorm_basic_kernel(curandState *state, float *vals, int n, float mu, float sigma) 25 | { 26 | // Usual block/thread indexing... 27 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 28 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 29 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 30 | int idx = myblock * blocksize + subthread; 31 | if (idx < n) { 32 | vals[idx] = mu + sigma * curand_normal(&state[idx]); 33 | } 34 | return; 35 | } 36 | 37 | 38 | __global__ void rnorm_kernel(curandState *state, float *vals, int n, float mu, float sigma, int numSamples) 39 | { 40 | // Usual block/thread indexing... 41 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 42 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 43 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 44 | int idx = myblock * blocksize + subthread; 45 | int k; 46 | int startIdx = idx*numSamples; 47 | for(k = 0; k < numSamples; k++) { 48 | if(startIdx + k < n) 49 | vals[startIdx + k] = mu + sigma * curand_normal(&state[idx]); 50 | } 51 | return; 52 | } 53 | 54 | } // END extern 55 | 56 | -------------------------------------------------------------------------------- /random.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | extern "C" 8 | { 9 | 10 | __global__ void setup_kernel(curandState *state, int seed, int n, int verbose) 11 | { 12 | // Usual block/thread indexing... 
13 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 14 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 15 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 16 | int idx = myblock * blocksize + subthread; 17 | if (verbose){ 18 | printf("Setting up RNG in thread %d (n=%d)...\n",idx,n); 19 | } 20 | curand_init(seed, idx, 0, &state[idx]); 21 | return; 22 | } 23 | 24 | __global__ void rnorm_basic_kernel(curandState *state, double *vals, int n, double mu, double sigma) 25 | { 26 | // Usual block/thread indexing... 27 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 28 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 29 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 30 | int idx = myblock * blocksize + subthread; 31 | if (idx < n) { 32 | vals[idx] = mu + sigma * curand_normal_double(&state[idx]); 33 | } 34 | return; 35 | } 36 | 37 | 38 | __global__ void rnorm_kernel(curandState *state, double *vals, int n, double mu, double sigma, int numSamples) 39 | { 40 | // Usual block/thread indexing... 41 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 42 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 43 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 44 | int idx = myblock * blocksize + subthread; 45 | int k; 46 | int startIdx = idx*numSamples; 47 | for(k = 0; k < numSamples; k++) { 48 | if(startIdx + k < n) 49 | vals[startIdx + k] = mu + sigma * curand_normal_double(&state[idx]); 50 | } 51 | return; 52 | } 53 | 54 | } // END extern 55 | 56 | -------------------------------------------------------------------------------- /RNGexample-single.R: -------------------------------------------------------------------------------- 1 | library(RCUDA) 2 | 3 | cat("Setting cuGetContext(TRUE)...\n") 4 | cuGetContext(TRUE) 5 | 6 | ptx = nvcc("random-single.cu", out = "random-single.ptx", target = "ptx", 7 | "-arch=compute_20", "-code=sm_20,compute_20") 8 | 9 | m = loadModule(ptx) 10 | 11 | setup = m$setup_kernel 12 | rnorm = m$rnorm_kernel 13 | 14 | N = 1e8L # NOTE 'N' is of type integer 15 | N_per_thread = 1000L 16 | 17 | mu = 0.3 18 | sigma = 1.5 19 | 20 | verbose = FALSE 21 | 22 | # setting grid and block dimensions 23 | threads_per_block <- 1024L 24 | block_dims <- c(threads_per_block, 1L, 1L) 25 | grid_d <- as.integer(ceiling(sqrt((N/N_per_thread)/threads_per_block))) 26 | 27 | grid_dims <- c(grid_d, grid_d, 1L) 28 | 29 | cat("Grid size:\n") 30 | print(grid_dims) 31 | 32 | nthreads <- as.integer(prod(grid_dims)*prod(block_dims)) 33 | cat("Total number of threads to launch = ", nthreads, "\n") 34 | if (nthreads*N_per_thread < N){ 35 | stop("Grid is not large enough...!") 36 | } 37 | 38 | cat("Running CUDA kernel...\n") 39 | 40 | seed = 0L 41 | 42 | 43 | tRNGinit <- system.time({ 44 | rng_states <- cudaMalloc(numEls=nthreads, sizeof=48L, elType="curandState") 45 | .cuda(setup, rng_states, seed, nthreads, as.integer(verbose), gridDim=grid_dims, blockDim=block_dims) 46 | cudaDeviceSynchronize() 47 | }) 48 | 49 | tAlloc <- system.time({ 50 | dX = cudaMalloc(N, sizeof = 4L, elType = "float", strict = FALSE) 51 | cudaDeviceSynchronize() 52 | }) 53 | 54 | tCalc <- system.time({ 55 | .cuda(rnorm, rng_states, dX, N, mu, sigma, N_per_thread, gridDim=grid_dims, blockDim=block_dims) 56 | cudaDeviceSynchronize() 57 | }) 58 | 59 | tTransferFromGPU <- system.time({ 60 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "float") 61 | 
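# note: R itself has no single-precision type, so the float values computed on
# the GPU are returned here as an ordinary (double-precision) numeric vector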
cudaDeviceSynchronize() 62 | }) 63 | 64 | 65 | tCPU <- system.time({ 66 | out2 <- rnorm(N, mu, sigma) 67 | }) 68 | 69 | # having RCUDA determine gridding 70 | tCalc_gridby <- system.time({ 71 | .cuda(rnorm, rng_states, dX, N, mu, sigma, N_per_thread, gridBy = as.integer(ceiling(N/N_per_thread))) 72 | cudaDeviceSynchronize() 73 | }) 74 | 75 | 76 | cat("RNG initiation time: ", tRNGinit[3], "\n") 77 | cat("GPU memory allocation time: ", tAlloc[3], "\n") 78 | cat("Calculation time (GPU): ", tCalc[3], "\n") 79 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 80 | cat("Calculation time (CPU): ", tCPU[3], "\n") 81 | -------------------------------------------------------------------------------- /RNGexample.R: -------------------------------------------------------------------------------- 1 | library(RCUDA) 2 | 3 | cat("Setting cuGetContext(TRUE)...\n") 4 | cuGetContext(TRUE) 5 | 6 | ptx = nvcc("random.cu", out = "random.ptx", target = "ptx", 7 | "-arch=compute_20", "-code=sm_20,compute_20") 8 | 9 | m = loadModule(ptx) 10 | 11 | setup = m$setup_kernel 12 | rnorm = m$rnorm_kernel 13 | 14 | N = 1e8L # NOTE 'N' is of type integer 15 | N_per_thread = 1000L 16 | 17 | mu = 0.3 18 | sigma = 1.5 19 | 20 | verbose = FALSE 21 | 22 | # setting grid and block dimensions 23 | threads_per_block <- 1024L 24 | block_dims <- c(threads_per_block, 1L, 1L) 25 | grid_d <- as.integer(ceiling(sqrt((N/N_per_thread)/threads_per_block))) 26 | 27 | grid_dims <- c(grid_d, grid_d, 1L) 28 | 29 | cat("Grid size:\n") 30 | print(grid_dims) 31 | 32 | nthreads <- as.integer(prod(grid_dims)*prod(block_dims)) 33 | cat("Total number of threads to launch = ", nthreads, "\n") 34 | if (nthreads*N_per_thread < N){ 35 | stop("Grid is not large enough...!") 36 | } 37 | 38 | cat("Running CUDA kernel...\n") 39 | 40 | seed = 0L 41 | 42 | 43 | tRNGinit <- system.time({ 44 | rng_states <- cudaMalloc(numEls=nthreads, sizeof=48L, elType="curandState") 45 | .cuda(setup, rng_states, seed, nthreads, as.integer(verbose), gridDim=grid_dims, blockDim=block_dims) 46 | cudaDeviceSynchronize() 47 | }) 48 | 49 | tAlloc <- system.time({ 50 | dX = cudaMalloc(N, sizeof = 8L, elType = "double", strict = TRUE) 51 | cudaDeviceSynchronize() 52 | }) 53 | 54 | tCalc <- system.time({ 55 | .cuda(rnorm, rng_states, dX, N, mu, sigma, N_per_thread, gridDim=grid_dims, blockDim=block_dims,.numericAsDouble = getOption("CUDA.useDouble", TRUE)) 56 | cudaDeviceSynchronize() 57 | }) 58 | 59 | tTransferFromGPU <- system.time({ 60 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "double") 61 | cudaDeviceSynchronize() 62 | }) 63 | 64 | 65 | tCPU <- system.time({ 66 | out2 <- rnorm(N, mu, sigma) 67 | }) 68 | 69 | # having RCUDA determine gridding 70 | tCalc_gridby <- system.time({ 71 | .cuda(rnorm, rng_states, dX, N, mu, sigma, N_per_thread, gridBy = as.integer(ceiling(N/N_per_thread)), .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 72 | cudaDeviceSynchronize() 73 | }) 74 | 75 | 76 | cat("RNG initiation time: ", tRNGinit[3], "\n") 77 | cat("GPU memory allocation time: ", tAlloc[3], "\n") 78 | cat("Calculation time (GPU): ", tCalc[3], "\n") 79 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 80 | cat("Calculation time (CPU): ", tCPU[3], "\n") 81 | -------------------------------------------------------------------------------- /RCUDAexample.R: -------------------------------------------------------------------------------- 1 | # modification of one of the RCUDA examples to use use double precision 2 | 3 | library(RCUDA) 4 | 5 | cat("Setting 
cuGetContext(TRUE)...\n") 6 | cuGetContext(TRUE) 7 | 8 | # compile the kernel into a form that RCUDA can load 9 | # system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu") 10 | ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', 11 | target = "ptx", "-arch=compute_20", "-code=sm_20,compute_20") 12 | 13 | m = loadModule(ptx) 14 | calc_loglik = m$calc_loglik 15 | 16 | N = 134217728L # NOTE 'N' is of type integer 17 | 18 | set.seed(0) 19 | x = runif(N) 20 | mu = 0.3 21 | sigma = 1.5 22 | 23 | # setting grid and block dimensions 24 | threads_per_block <- 1024L 25 | block_dims <- c(threads_per_block, 1L, 1L) 26 | grid_d <- as.integer(ceiling(sqrt(N/threads_per_block))) 27 | 28 | grid_dims <- c(grid_d, grid_d, 1L) 29 | 30 | cat("Grid size:\n") 31 | print(grid_dims) 32 | 33 | nthreads <- prod(grid_dims)*prod(block_dims) 34 | cat("Total number of threads to launch = ", nthreads, "\n") 35 | if (nthreads < N){ 36 | stop("Grid is not large enough...!") 37 | } 38 | 39 | cat("Running CUDA kernel...\n") 40 | 41 | # basic usage with manual transfer 42 | tTransferToGPU <- system.time({ 43 | dX = copyToDevice(x, strict = TRUE) 44 | cudaDeviceSynchronize() 45 | }) 46 | tCalc <- system.time({ 47 | .cuda(calc_loglik, dX, N, mu, sigma, gridDim = grid_dims, blockDim = block_dims, .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 48 | cudaDeviceSynchronize() 49 | }) 50 | tTransferFromGPU <- system.time({ 51 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "double") 52 | cudaDeviceSynchronize() 53 | }) 54 | 55 | cat("Input values: ", x[1:3], "\n") 56 | cat("Output values: ", out[1:3], "\n") 57 | 58 | # implicit transfer done by RCUDA behind the scenes 59 | tFull <- system.time({ 60 | out <- .cuda(calc_loglik, "x"=x, N, mu, sigma, gridDim=grid_dims, blockDim=block_dims, outputs="x", .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 61 | cudaDeviceSynchronize() 62 | }) 63 | 64 | 65 | cat("Output values (implicit transfer): ", out[1:3], "\n") 66 | 67 | tCalc_R <- system.time({ 68 | out <- dnorm(x, mu, sigma) 69 | }) 70 | 71 | cat("Output values (CPU with R): ", out[1:3], "\n") 72 | 73 | cat("Transfer to GPU time: ", tTransferToGPU[3], "\n") 74 | cat("Calculation time (GPU): ", tCalc[3], "\n") 75 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 76 | cat("Calculation time (CPU): ", tCalc_R[3], "\n") 77 | cat("Combined calculation/transfer via .cuda time (GPU): ", tFull[3], "\n") 78 | 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /helloWorld.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Note: Needs compute capability >= 2.0, so compile with: 6 | // nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld 7 | 8 | // number of computations: 9 | #define N 20000 10 | // constants for grid and block sizes 11 | #define GRID_D1 20 12 | #define GRID_D2 2 13 | #define BLOCK_D1 512 14 | #define BLOCK_D2 1 15 | #define BLOCK_D3 1 16 | 17 | // this is the kernel function called for each thread 18 | // we use the CUDA variables {threadIdx, blockIdx, blockDim, gridDim} to determine a unique ID for each thread 19 | __global__ void hello(void) 20 | { 21 | // id of the block 22 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 23 | // size of each block (within grid of blocks) 24 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 25 | // id of thread in a given block 26 | int subthread = 
threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 27 | // assign overall id/index of the thread 28 | int idx = myblock * blocksize + subthread; 29 | if(idx < 2000 || idx > 19000) { 30 | // print buffer from within the kernel is limited so only print for first and last chunks of threads 31 | if (idx < N){ 32 | printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => \ 33 | thread index=%d\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, threadIdx.x, threadIdx.y, threadIdx.z, idx); 34 | } else { 35 | printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => \ 36 | thread index=%d [### this thread would not be used for N=%d ###]\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, 37 | threadIdx.x, threadIdx.y, threadIdx.z, idx, N); 38 | } 39 | } 40 | } 41 | 42 | 43 | int main(int argc,char **argv) 44 | { 45 | // objects containing the block and grid info 46 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 47 | const dim3 gridSize(GRID_D1, GRID_D2, 1); 48 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*GRID_D1*GRID_D2; 49 | if (nthreads < N){ 50 | printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N); 51 | } else { 52 | printf("Launching %d threads (N=%d)\n",nthreads,N); 53 | } 54 | 55 | // launch the kernel on the specified grid of thread blocks 56 | hello<<>>(); 57 | 58 | // Need to flush prints, otherwise none of the prints from within the kernel will show up 59 | // as program exit does not flush the print buffer. 60 | cudaError_t cudaerr = cudaDeviceSynchronize(); 61 | if (cudaerr){ 62 | printf("kernel launch failed with error \"%s\".\n", 63 | cudaGetErrorString(cudaerr)); 64 | } else { 65 | printf("kernel launch success!\n"); 66 | } 67 | 68 | printf("That's all!\n"); 69 | 70 | return 0; 71 | } 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /PyCUDAexample.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | import scipy as sp 5 | from scipy.stats import norm 6 | from pycuda.compiler import SourceModule 7 | import math 8 | 9 | # Here's the kernel, essentially identical to that used in the CUDA and RCUDA examples 10 | 11 | m = SourceModule(""" 12 | #include 13 | #define SQRT_TWO_PI 2.506628274631000 14 | __global__ void dnorm_kernel(double *vals, double *x, int N, double mu, double sigma, int dbg) 15 | { 16 | // note that this assumes no third dimension to the grid 17 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 18 | // size of each block (within grid of blocks) 19 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 20 | // id of thread in a given block 21 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 22 | // assign overall id/index of the thread 23 | int idx = myblock * blocksize + subthread; 24 | 25 | if (idx < N) { 26 | if (dbg){ 27 | printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",idx,idx,x[idx],N,mu,sigma); 28 | } 29 | double std = (x[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } else { 33 | if (dbg){ 34 | printf("thread idx: %04d\\t (>=N=%d)\\n",idx,N); 35 | } 36 | } 37 | return; 38 | } 39 | """) 40 | 41 | dnorm = m.get_function("dnorm_kernel") 42 | 43 | # Arguments must be numpy datatypes i.e., n = 1000 will 
not work! 44 | 45 | N = np.int32(134931456) 46 | 47 | # Threads per block and number of blocks: 48 | threads_per_block = int(1024) 49 | block_dims = (threads_per_block, 1, 1) 50 | grid_d = int(math.ceil(math.sqrt(N/threads_per_block))) 51 | grid_dims = (grid_d, grid_d, 1) 52 | 53 | 54 | print("Generating random normals...") 55 | x = np.random.normal(size = N) 56 | 57 | # Evaluate at N(0.3, 1.5) 58 | 59 | mu = np.float64(0.3) 60 | sigma = np.float64(1.5) 61 | dbg = False # True 62 | verbose = np.int32(dbg) 63 | 64 | # Allocate storage for the result: 65 | 66 | out = np.zeros_like(x) 67 | 68 | # Create two timers: 69 | start = drv.Event() 70 | end = drv.Event() 71 | 72 | # Launch the kernel 73 | print("Running GPU code...") 74 | start.record() 75 | 76 | dnorm(drv.Out(out), drv.In(x), N, mu, sigma, verbose, block= block_dims, grid = grid_dims) 77 | 78 | end.record() # end timing 79 | # calculate the run length 80 | end.synchronize() 81 | 82 | gpu_secs = start.time_till(end)*1e-3 83 | print "Time for calculation (GPU): %fs" % gpu_secs 84 | 85 | # Scipy version: 86 | print("Running Scipy CPU code...") 87 | start.record() 88 | out2 = norm.pdf(x, loc = mu, scale = sigma) 89 | end.record() # end timing 90 | # calculate the run length 91 | end.synchronize() 92 | cpu_secs = start.time_till(end)*1e-3 93 | print "Time for calculation (CPU): %fs" % cpu_secs 94 | 95 | print "Output from GPU: %f %f %f" % (out[0], out[1], out[2]) 96 | print "Output from CPU: %f %f %f" % (out2[0], out2[1], out2[2]) 97 | 98 | 99 | -------------------------------------------------------------------------------- /cudaBlasExample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include "cublas_v2.h" 8 | 9 | // compile as: 10 | // export PATH=$PATH:/usr/local/cuda/bin 11 | // nvcc cudaExample.C -I/usr/local/cuda/include -lcublas -o cudaExample 12 | 13 | 14 | double read_timer() { 15 | struct timeval end; 16 | gettimeofday( &end, NULL ); 17 | return end.tv_sec+1.e-6*end.tv_usec; 18 | } 19 | 20 | void fillMatrix( double *p, int n ) { 21 | int i; 22 | srand48(0); 23 | for( i = 0; i < n; i++ ) 24 | p[i] = 2*drand48()-1; 25 | } 26 | 27 | int main( int argc, char **argv ) { 28 | printf("Starting\n"); 29 | int size; 30 | cudaError_t cudaStat; 31 | cublasStatus_t stat; 32 | cublasHandle_t handle; 33 | int it; 34 | 35 | cublasOperation_t N = 'N'; 36 | cublasOperation_t T = 'T'; 37 | double one = 1., zero=0.; 38 | 39 | for( size = 256; size <= 8192; size*=2 ) { 40 | 41 | // allocate memory on host (CPU) 42 | double *A = (double*) malloc( sizeof(double)*size*size ); 43 | double *B = (double*) malloc( sizeof(double)*size*size ); 44 | 45 | cudaDeviceSynchronize(); 46 | double tInit = read_timer(); 47 | 48 | double *dA,*dB; 49 | // allocate memory on device (GPU) 50 | cudaStat = cudaMalloc((void**)&dA, sizeof(double)*size*size); 51 | if(cudaStat != cudaSuccess) { 52 | printf ("device memory allocation failed"); 53 | return EXIT_FAILURE; 54 | } 55 | cudaStat = cudaMalloc((void**)&dB, sizeof(double)*size*size); 56 | if(cudaStat != cudaSuccess) { 57 | printf ("device memory allocation failed"); 58 | return EXIT_FAILURE; 59 | } 60 | 61 | // wait until previous CUDA commands on GPU threads have finished 62 | // this allows us to do the timing correctly 63 | cudaDeviceSynchronize(); 64 | 65 | double tAlloc = read_timer(); 66 | 67 | 68 | // initialization of CUBLAS 69 | stat = cublasCreate(&handle); 70 | if(stat != CUBLAS_STATUS_SUCCESS) { 
71 | printf ("CUBLAS initialization failed\n"); 72 | return EXIT_FAILURE; 73 | } 74 | 75 | // create our test matrix on the CPU 76 | fillMatrix(B, size*size); 77 | 78 | cudaDeviceSynchronize(); 79 | double tInit2 = read_timer(); 80 | 81 | 82 | // copy matrix to GPU, with dB the pointer to the object on the GPU 83 | stat = cublasSetMatrix (size, size, sizeof(double), B, size, dB, size); 84 | if(stat != CUBLAS_STATUS_SUCCESS) { 85 | printf ("data download failed"); 86 | cudaFree (dB); 87 | cublasDestroy(handle); 88 | return EXIT_FAILURE; 89 | } 90 | 91 | cudaDeviceSynchronize(); 92 | double tTransferToGPU = read_timer(); 93 | 94 | // call cublas matrix multiply (dA = dB * dB) 95 | cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &one, dB, size, dB, size, &zero, dA, size ); 96 | 97 | cudaDeviceSynchronize(); 98 | double tMatMult = read_timer(); 99 | 100 | // transfer matrix back to CPU 101 | stat = cublasGetMatrix (size, size, sizeof(double), dA, size, A, size); 102 | if(stat != CUBLAS_STATUS_SUCCESS) { 103 | printf ("data upload failed"); 104 | cudaFree(dA); 105 | cublasDestroy(handle); 106 | return EXIT_FAILURE; 107 | } 108 | 109 | cudaDeviceSynchronize(); 110 | double tTransferFromGPU = read_timer(); 111 | 112 | printf("====================================================\n"); 113 | printf("Timing results for n = %d\n", size); 114 | printf("GPU memory allocation time: %f\n", tAlloc - tInit); 115 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); 116 | printf("Matrix multiply time: %f\n", tMatMult - tTransferToGPU); 117 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tMatMult); 118 | 119 | 120 | // free memory on GPU and CPU 121 | cudaFree(dA); 122 | cudaFree(dB); 123 | cublasDestroy(handle); 124 | free(A); 125 | free(B); 126 | 127 | } 128 | return EXIT_SUCCESS; 129 | } 130 | -------------------------------------------------------------------------------- /kernelExample-pinned.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | //#define N 1000000 8 | #define SQRT_TWO_PI 2.506628274631000 9 | #define BLOCK_D1 1024 10 | #define BLOCK_D2 1 11 | #define BLOCK_D3 1 12 | 13 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 14 | // nvcc kernelExample-pinned.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample-pinned 15 | // -use_fast_math 16 | 17 | // CUDA kernel: 18 | __global__ void calc_loglik(double* vals, int N, double mu, double sigma) { 19 | // note that this assumes no third dimension to the grid 20 | // id of the block 21 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 22 | // size of each block (within grid of blocks) 23 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 24 | // id of thread in a given block 25 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 26 | // assign overall id/index of the thread 27 | int idx = myblock * blocksize + subthread; 28 | 29 | if(idx < N) { 30 | double std = (vals[idx] - mu)/sigma; 31 | double e = exp( - 0.5 * std * std); 32 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 33 | } 34 | } 35 | 36 | int calc_loglik_cpu(double* vals, int N, double mu, double sigma) { 37 | double std, e; 38 | for(int idx = 0; idx < N; idx++) { 39 | std = (vals[idx] - mu)/sigma; 40 | e = exp( - 0.5 * std * std); 41 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 42 | } 43 | return 0; 44 | } 45 | 46 | 47 | /* --------------------------- host 
code ------------------------------*/ 48 | void fill( double *p, int n ) { 49 | int i; 50 | srand48(0); 51 | for( i = 0; i < n; i++ ) 52 | p[i] = 2*drand48()-1; 53 | } 54 | 55 | double read_timer() { 56 | struct timeval end; 57 | gettimeofday( &end, NULL ); 58 | return end.tv_sec+1.e-6*end.tv_usec; 59 | } 60 | 61 | int main (int argc, char *argv[]) { 62 | double* cpu_vals; 63 | double* gpu_vals; 64 | int N; 65 | cudaError_t cudaStat; 66 | 67 | printf("====================================================\n"); 68 | for( N = 32768; N <= 134217728; N*=8 ) { 69 | // allocated pinned and mapped memory on CPU 70 | cudaSetDeviceFlags(cudaDeviceMapHost); 71 | cudaHostAlloc((void**)&cpu_vals, N*sizeof(double), cudaHostAllocMapped); 72 | 73 | // map the CPU storage to the GPU to the CPU storage 74 | cudaStat = cudaHostGetDevicePointer(&gpu_vals, cpu_vals, 0); 75 | 76 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 77 | 78 | int tmp = ceil(pow(N/BLOCK_D1, 0.5)); 79 | printf("Grid dimension is %i x %i\n", tmp, tmp); 80 | dim3 gridSize(tmp, tmp, 1); 81 | 82 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 83 | if (nthreads < N){ 84 | printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N); 85 | } else { 86 | printf("Launching %d threads (N=%d)\n", nthreads, N); 87 | } 88 | 89 | double mu = 0.0; 90 | double sigma = 1.0; 91 | 92 | // simulate 'data' 93 | fill(cpu_vals, N); 94 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 95 | 96 | cudaDeviceSynchronize(); 97 | double tInit = read_timer(); 98 | 99 | // do the calculation 100 | calc_loglik<<>>(gpu_vals, N, mu, sigma); 101 | 102 | cudaDeviceSynchronize(); 103 | double tCalc = read_timer(); 104 | 105 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 106 | 107 | // do calculation on CPU for comparison (unfair as this will only use one core) 108 | fill(cpu_vals, N); 109 | double tInit2 = read_timer(); 110 | calc_loglik_cpu(cpu_vals, N, mu, sigma); 111 | double tCalcCPU = read_timer(); 112 | 113 | printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 114 | 115 | printf("Timing results for n = %d\n", N); 116 | printf("Calculation time (GPU): %f\n", tCalc - tInit); 117 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 118 | 119 | printf("Freeing memory...\n"); 120 | printf("====================================================\n"); 121 | cudaFreeHost(cpu_vals); 122 | 123 | } 124 | printf("\n\nFinished.\n\n"); 125 | return 0; 126 | } 127 | 128 | -------------------------------------------------------------------------------- /kernelExample-single.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | //#define N 1000000 8 | #define SQRT_TWO_PI 2.506628274631000 9 | #define BLOCK_D1 1024 10 | #define BLOCK_D2 1 11 | #define BLOCK_D3 1 12 | 13 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 14 | // nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 15 | // -use_fast_math doesn't seem to have any effect on speed 16 | 17 | // CUDA kernel: 18 | __global__ void calc_loglik(float* vals, int N, float mu, float sigma) { 19 | // note that this assumes no third dimension to the grid 20 | // id of the block 21 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 22 | // size of each block (within grid of blocks) 23 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 
24 | // id of thread in a given block 25 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 26 | // assign overall id/index of the thread 27 | int idx = myblock * blocksize + subthread; 28 | 29 | if(idx < N) { 30 | float std = (vals[idx] - mu)/sigma; 31 | float e = exp( - 0.5 * std * std); 32 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 33 | } 34 | } 35 | 36 | // CPU analog for speed comparison 37 | int calc_loglik_cpu(float* vals, int N, float mu, float sigma) { 38 | float std, e; 39 | for(int idx = 0; idx < N; idx++) { 40 | std = (vals[idx] - mu)/sigma; 41 | e = exp( - 0.5 * std * std); 42 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 43 | } 44 | return 0; 45 | } 46 | 47 | 48 | /* --------------------------- host code ------------------------------*/ 49 | void fill( float *p, int n ) { 50 | int i; 51 | srand48(0); 52 | for( i = 0; i < n; i++ ) 53 | p[i] = 2*drand48()-1; 54 | } 55 | 56 | double read_timer() { 57 | struct timeval end; 58 | gettimeofday( &end, NULL ); 59 | return end.tv_sec+1.e-6*end.tv_usec; 60 | } 61 | 62 | int main (int argc, char *argv[]) { 63 | float* cpu_vals; 64 | float* gpu_vals; 65 | int N; 66 | cudaError_t cudaStat; 67 | 68 | printf("====================================================\n"); 69 | for( N = 32768; N <= 134217728; N*=8 ) { 70 | cpu_vals = (float*) malloc( sizeof(float)*N ); 71 | cudaStat = cudaMalloc(&gpu_vals, sizeof(float)*N); 72 | if(cudaStat != cudaSuccess) { 73 | printf ("device memory allocation failed"); 74 | return EXIT_FAILURE; 75 | } 76 | 77 | // fixed block dimensions (1024x1x1 threads) 78 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 79 | 80 | // determine number of blocks we need for a given problem size 81 | int tmp = ceil(pow(N/BLOCK_D1, 0.5)); 82 | printf("Grid dimension is %i x %i\n", tmp, tmp); 83 | dim3 gridSize(tmp, tmp, 1); 84 | 85 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 86 | if (nthreads < N){ 87 | printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N); 88 | } else { 89 | printf("Launching %d threads (N=%d)\n", nthreads, N); 90 | } 91 | 92 | float mu = 0.0; 93 | float sigma = 1.0; 94 | 95 | // simulate 'data' 96 | fill(cpu_vals, N); 97 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 98 | 99 | cudaDeviceSynchronize(); 100 | double tInit = read_timer(); 101 | 102 | // copy input data to the GPU 103 | cudaStat = cudaMemcpy(gpu_vals, cpu_vals, N*sizeof(float), cudaMemcpyHostToDevice); 104 | printf("Memory Copy from Host to Device "); 105 | if (cudaStat){ 106 | printf("failed.\n"); 107 | } else { 108 | printf("successful.\n"); 109 | } 110 | cudaDeviceSynchronize(); 111 | double tTransferToGPU = read_timer(); 112 | 113 | // do the calculation 114 | calc_loglik<<>>(gpu_vals, N, mu, sigma); 115 | 116 | cudaDeviceSynchronize(); 117 | double tCalc = read_timer(); 118 | 119 | cudaStat = cudaMemcpy(cpu_vals, gpu_vals, N, cudaMemcpyDeviceToHost); 120 | printf("Memory Copy from Device to Host "); 121 | if (cudaStat){ 122 | printf("failed.\n"); 123 | } else { 124 | printf("successful.\n"); 125 | } 126 | cudaDeviceSynchronize(); 127 | double tTransferFromGPU = read_timer(); 128 | 129 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 130 | 131 | // do calculation on CPU for comparison (unfair as this will only use one core) 132 | fill(cpu_vals, N); 133 | double tInit2 = read_timer(); 134 | calc_loglik_cpu(cpu_vals, N, mu, sigma); 135 | double tCalcCPU = read_timer(); 136 | 137 | printf("Output values 
(CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 138 | 139 | printf("Timing results for n = %d\n", N); 140 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit); 141 | printf("Calculation time (GPU): %f\n", tCalc - tTransferToGPU); 142 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 143 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tCalc); 144 | 145 | printf("Freeing memory...\n"); 146 | printf("====================================================\n"); 147 | free(cpu_vals); 148 | cudaFree(gpu_vals); 149 | 150 | } 151 | printf("\n\nFinished.\n\n"); 152 | return 0; 153 | } 154 | 155 | 156 | -------------------------------------------------------------------------------- /kernelExample.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | //#define N 1000000 8 | #define SQRT_TWO_PI 2.506628274631000 9 | #define BLOCK_D1 1024 10 | #define BLOCK_D2 1 11 | #define BLOCK_D3 1 12 | 13 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 14 | // nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 15 | // -use_fast_math doesn't seem to have any effect on speed 16 | 17 | // CUDA kernel: 18 | __global__ void calc_loglik(double* vals, int N, double mu, double sigma) { 19 | // note that this assumes no third dimension to the grid 20 | // id of the block 21 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 22 | // size of each block (within grid of blocks) 23 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 24 | // id of thread in a given block 25 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 26 | // assign overall id/index of the thread 27 | int idx = myblock * blocksize + subthread; 28 | 29 | if(idx < N) { 30 | double std = (vals[idx] - mu)/sigma; 31 | double e = exp( - 0.5 * std * std); 32 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 33 | } 34 | } 35 | 36 | // CPU analog for speed comparison 37 | int calc_loglik_cpu(double* vals, int N, double mu, double sigma) { 38 | double std, e; 39 | for(int idx = 0; idx < N; idx++) { 40 | std = (vals[idx] - mu)/sigma; 41 | e = exp( - 0.5 * std * std); 42 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 43 | } 44 | return 0; 45 | } 46 | 47 | 48 | /* --------------------------- host code ------------------------------*/ 49 | void fill( double *p, int n ) { 50 | int i; 51 | srand48(0); 52 | for( i = 0; i < n; i++ ) 53 | p[i] = 2*drand48()-1; 54 | } 55 | 56 | double read_timer() { 57 | struct timeval end; 58 | gettimeofday( &end, NULL ); 59 | return end.tv_sec+1.e-6*end.tv_usec; 60 | } 61 | 62 | int main (int argc, char *argv[]) { 63 | double* cpu_vals; 64 | double* gpu_vals; 65 | int N; 66 | cudaError_t cudaStat; 67 | 68 | printf("====================================================\n"); 69 | for( N = 32768; N <= 134217728; N*=8 ) { 70 | cpu_vals = (double*) malloc( sizeof(double)*N ); 71 | cudaStat = cudaMalloc(&gpu_vals, sizeof(double)*N); 72 | if(cudaStat != cudaSuccess) { 73 | printf ("device memory allocation failed"); 74 | return EXIT_FAILURE; 75 | } 76 | 77 | // fixed block dimensions (1024x1x1 threads) 78 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 79 | 80 | // determine number of blocks we need for a given problem size 81 | int tmp = ceil(pow(N/BLOCK_D1, 0.5)); 82 | printf("Grid dimension is %i x %i\n", tmp, tmp); 83 | dim3 gridSize(tmp, tmp, 1); 84 | 85 | int nthreads = 
BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 86 | if (nthreads < N){ 87 | printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N); 88 | } else { 89 | printf("Launching %d threads (N=%d)\n", nthreads, N); 90 | } 91 | 92 | double mu = 0.0; 93 | double sigma = 1.0; 94 | 95 | // simulate 'data' 96 | fill(cpu_vals, N); 97 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 98 | 99 | cudaDeviceSynchronize(); 100 | double tInit = read_timer(); 101 | 102 | // copy input data to the GPU 103 | cudaStat = cudaMemcpy(gpu_vals, cpu_vals, N*sizeof(double), cudaMemcpyHostToDevice); 104 | printf("Memory Copy from Host to Device "); 105 | if (cudaStat){ 106 | printf("failed.\n"); 107 | } else { 108 | printf("successful.\n"); 109 | } 110 | cudaDeviceSynchronize(); 111 | double tTransferToGPU = read_timer(); 112 | 113 | // do the calculation 114 | calc_loglik<<>>(gpu_vals, N, mu, sigma); 115 | 116 | cudaDeviceSynchronize(); 117 | double tCalc = read_timer(); 118 | 119 | cudaStat = cudaMemcpy(cpu_vals, gpu_vals, N, cudaMemcpyDeviceToHost); 120 | printf("Memory Copy from Device to Host "); 121 | if (cudaStat){ 122 | printf("failed.\n"); 123 | } else { 124 | printf("successful.\n"); 125 | } 126 | cudaDeviceSynchronize(); 127 | double tTransferFromGPU = read_timer(); 128 | 129 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 130 | 131 | // do calculation on CPU for comparison (unfair as this will only use one core) 132 | fill(cpu_vals, N); 133 | double tInit2 = read_timer(); 134 | calc_loglik_cpu(cpu_vals, N, mu, sigma); 135 | double tCalcCPU = read_timer(); 136 | 137 | printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 138 | 139 | printf("Timing results for n = %d\n", N); 140 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit); 141 | printf("Calculation time (GPU): %f\n", tCalc - tTransferToGPU); 142 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 143 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tCalc); 144 | 145 | printf("Freeing memory...\n"); 146 | printf("====================================================\n"); 147 | free(cpu_vals); 148 | cudaFree(gpu_vals); 149 | 150 | } 151 | printf("\n\nFinished.\n\n"); 152 | return 0; 153 | } 154 | 155 | 156 | -------------------------------------------------------------------------------- /magmaExample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include "cublas_v2.h" 8 | 9 | #include "magma.h" 10 | #include "magma_lapack.h" 11 | 12 | // compile as: 13 | // gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma -llapack -lblas -lcublas -o magmaExample 14 | 15 | 16 | double read_timer() { 17 | struct timeval end; 18 | gettimeofday( &end, NULL ); 19 | return end.tv_sec+1.e-6*end.tv_usec; 20 | } 21 | 22 | // BLAS/LAPACK functions for matrix multiply and Cholesky 23 | // not needed as these are in magma_dlapack.h 24 | // void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); 25 | // int dpotrf_(char* uplo, int* n, double* a, int* lda, int* info); 26 | 27 | void fillMatrix( double *p, int n ) { 28 | int i; 29 | srand48(0); 30 | for( i = 0; i < n; i++ ) 31 | p[i] = 2*drand48()-1; 32 | } 33 | 34 | 35 | int main( int argc, char **argv ) { 36 | 
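// Overview of main(): for each matrix size (256 up to 8192), allocate host memory
// (pinned if the first command-line argument is 1) and device memory, fill a test
// matrix, copy it to the GPU, and then time a matrix multiply and Cholesky
// factorizations done (a) with MAGMA on the GPU copy, (b) via MAGMA's CPU
// interface, and (c) with standard BLAS/LAPACK on the CPU, along with the
// host-device transfer times in each direction.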
printf("Starting\n"); 37 | int size; 38 | cudaError_t cudaStat; 39 | magma_err_t magmaStat; 40 | cublasStatus_t stat; 41 | cublasHandle_t handle; 42 | int it,i; 43 | 44 | cublasOperation_t N = 'N'; 45 | cublasOperation_t T = 'T'; 46 | char N2 = 'N'; 47 | char T2 = 'T'; 48 | double one = 1., zero=0.; 49 | char uplo = 'L'; 50 | int info; 51 | 52 | int err; double* A; double* B; 53 | magmaStat = magma_init(); 54 | 55 | int use_pinned; 56 | if(argc > 1) { 57 | use_pinned = atoi(argv[1]); 58 | } else use_pinned = 0; 59 | printf("Setting use_pinned to %d\n", use_pinned); 60 | 61 | for( size = 256; size <= 8192; size*=2 ) { 62 | 63 | if(use_pinned) { 64 | // allocate pinned memory on CPU 65 | err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); 66 | err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); 67 | } else { 68 | // allocate standard memory on CPU 69 | A = (double*) malloc( sizeof(double)*size*size ); 70 | B = (double*) malloc( sizeof(double)*size*size ); 71 | } 72 | 73 | cudaDeviceSynchronize(); 74 | double tInit = read_timer(); 75 | double *dA,*dB; 76 | // allocate memory on GPU 77 | magma_malloc( (void**) &dA, sizeof(double)*size*size ); 78 | magma_malloc( (void**) &dB, sizeof(double)*size*size ); 79 | 80 | cudaDeviceSynchronize(); 81 | double tAlloc = read_timer(); 82 | 83 | fillMatrix(B, size*size); 84 | 85 | 86 | cudaDeviceSynchronize(); 87 | double tInit2 = read_timer(); 88 | 89 | // transfer data to GPU 90 | magma_dsetmatrix( size, size, B, size, dB, size ); 91 | 92 | cudaDeviceSynchronize(); 93 | double tTransferToGPU = read_timer(); 94 | 95 | // matrix multiply 96 | magmablas_dgemm('N', 'T', size, size, size, one, dB, size, dB, size, zero, dA, size ); 97 | // magma_dgemm is apparently synonymous with magmablas_dgemm 98 | 99 | cudaDeviceSynchronize(); 100 | double tMatMult = read_timer(); 101 | 102 | // Cholesky decomposition on GPU with GPU interface (called with object on GPU) 103 | magma_dpotrf_gpu( 'L', size, dA, size, &info ); 104 | 105 | cudaDeviceSynchronize(); 106 | double tChol = read_timer(); 107 | 108 | // transfer data back to CPU 109 | magma_dgetmatrix( size, size, dA, size, A, size ); 110 | cudaDeviceSynchronize(); 111 | double tTransferFromGPU = read_timer(); 112 | 113 | // standard BLAS matrix multiply on CPU 114 | dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); 115 | 116 | cudaDeviceSynchronize(); 117 | double tMatMultBlas = read_timer(); 118 | 119 | // Cholesky decomposition on GPU with CPU interface (called with object on CPU) 120 | magma_dpotrf( 'L', size, A, size, &info ); 121 | 122 | cudaDeviceSynchronize(); 123 | double tCholCpuInterface = read_timer(); 124 | 125 | // recreate A = B * B (could just do a save and copy instead....) 
126 | dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); 127 | 128 | cudaDeviceSynchronize(); 129 | double tInit3 = read_timer(); 130 | 131 | // standard Lapack Cholesky decomposition on CPU 132 | dpotrf_(&uplo, &size, A, &size, &info); 133 | 134 | cudaDeviceSynchronize(); 135 | double tCholCpu= read_timer(); 136 | 137 | 138 | printf("====================================================\n"); 139 | printf("Timing results for n = %d\n", size); 140 | printf("GPU memory allocation time: %f\n", tAlloc - tInit); 141 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); 142 | printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); 143 | printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); 144 | printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); 145 | printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); 146 | printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); 147 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); 148 | 149 | if(use_pinned) { 150 | magma_free_pinned(A); 151 | magma_free_pinned(B); 152 | } else { 153 | free(A); 154 | free(B); 155 | } 156 | magma_free(dA); 157 | magma_free(dB); 158 | 159 | } 160 | return EXIT_SUCCESS; 161 | } 162 | -------------------------------------------------------------------------------- /makeGPUimage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # at moment can't use g2 from starcluster, so... 4 | # search AMIs in region of interest for ubuntu/images/hvm/ubuntu-precise-12.04 5 | # choose latest and try to launch; may need to work backwards as some AMIs don't support g2.2xlarge 6 | # oregon as of feb 2014: ami-d4d8b8e4, but doesn't work 7 | # ami-52b22962 (20131114) did work 8 | # launch from console 9 | # look in "connect" button to find the ssh command: it will be like the following: 10 | # ssh -i ~/.ssh/ec2star.rsa ec2-user@ec2-54-203-81-145.us-west-2.compute.amazonaws.com 11 | # ecstar.rsa for oregon region, ecstar.rsa-east for east region 12 | export ip=54-184-69-23 13 | 14 | ssh -i ~/.ssh/ec2star.rsa ubuntu@ec2-${ip}.us-west-2.compute.amazonaws.com 15 | 16 | sudo su 17 | 18 | # CRAN repo 19 | apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9 20 | echo "deb http://cran.cnr.berkeley.edu/bin/linux/ubuntu precise/" > \ 21 | /etc/apt/sources.list.d/cran.list 22 | 23 | wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1204/x86_64/cuda-repo-ubuntu1204_5.5-0_amd64.deb 24 | dpkg -i cuda-repo-ubuntu1204_5.5-0_amd64.deb 25 | 26 | apt-get update 27 | apt-get -y upgrade 28 | # keep grub as is 29 | 30 | apt-get install -y nfs-kernel-server nfs-common 31 | ln -s /etc/init.d/nfs-kernel-server /etc/init.d/nfs 32 | 33 | 34 | # apt-get install r-cran-rmpi # for Rmpi & openMPI - try this? 
35 | apt-get install -y emacs git r-recommended libopenmpi-dev libeigen3-dev curl curlftpfs libcurl4-openssl-dev openmpi-bin libopenblas-base libopenblas-dev octave3.2 ipython python-numpy python-scipy python-pandas python-matplotlib r-mathlib sqlite3 36 | 37 | # cp libopenblas-base_0.1alpha2.2-3.2_amd64.deb libopenblas-dev_0.1alpha2.2-3.2_amd64.deb from scf to cloud machine: 38 | #rsync -av paciorek@gandalf.berkeley.edu:/server/install/linux/PACKAGES-12.04/libopenblas*2.2-3.2*deb /tmp/ 39 | #cd /tmp 40 | #PKGS="libopenblas-base_0.1alpha2.2-3.2_amd64.deb libopenblas-dev_0.1alpha2.2-3.2_amd64.deb" 41 | #dpkg -i ${PKGS} 42 | 43 | update-alternatives --set liblapack.so.3gf /usr/lib/lapack/liblapack.so.3gf 44 | # need this or R tries to load an ATLAS function because liblapack.so.3gf points to atlas lapack (which is not installed) 45 | 46 | R --no-save <> ~root/.bashrc 70 | echo "export PATH=${PATH}:/usr/local/cuda/bin" >> ~root/.bashrc 71 | echo "" >> ~ubuntu/.bashrc 72 | echo "export PATH=${PATH}:/usr/local/cuda/bin" >> ~ubuntu/.bashrc 73 | echo "" >> ~root/.bashrc 74 | echo "alias gtop=\"nvidia-smi -q -g 0 -d UTILIZATION -l 1\"" >> ~root/.bashrc 75 | echo "" >> ~ubuntu/.bashrc 76 | echo "alias gtop=\"nvidia-smi -q -g 0 -d UTILIZATION -l 1\"" >> ~ubuntu/.bashrc 77 | echo "" >> ~ubuntu/.bashrc 78 | 79 | # create deviceQuery executable 80 | nvcc deviceQuery.cpp -I/usr/local/cuda/include -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery 81 | 82 | 83 | source ~/.bashrc 84 | 85 | 86 | echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf 87 | ldconfig 88 | 89 | nvidia-smi -q 90 | gtop 91 | # this checks we can access the gpu 92 | 93 | cd /usr/src 94 | mkdir magma 95 | cd magma 96 | wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-1.4.1.tar.gz 97 | tar -xvzf magma-1.4.1.tar.gz 98 | cd magma-1.4.1 99 | # note I added -fPIC per the magma README to enable creation of a shared object 100 | scp paciorek@smeagol.berkeley.edu:~/staff/projects/gpus/make.inc.ubuntu.openblas.kepler make.inc 101 | 102 | make 2>&1 | tee make.log 103 | 104 | make shared 2>&1 | tee make.shared.log 105 | 106 | # good to test dgemm: 107 | # ./testing/testing_dgemm 108 | 109 | mkdir /usr/local/magma 110 | make install prefix=/usr/local/magma 111 | 112 | # also need magma-1.3.0 for R's magma pkg 113 | cd /usr/src 114 | mkdir magma-1.3.0 115 | cd magma-1.3.0 116 | wget http://icl.cs.utk.edu/projectsfiles/magma/pubs/magma-1.3.0.tar.gz 117 | tar -xvzf magma-1.3.0.tar.gz 118 | cd magma-1.3.0 119 | # for kepler only: per bug report at http://icl.cs.utk.edu/magma/forum/viewtopic.php?f=2&t=900 120 | sed 's/-DGPUSHMEM=300 -arch sm_35/-DGPUSHMEM=300 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35/g' Makefile.internal > /tmp/tmp 121 | mv /tmp/tmp Makefile.internal 122 | # note I added -fPIC per the magma README to enable creation of a shared object 123 | scp paciorek@smeagol.berkeley.edu:~/staff/projects/gpus/make.inc.ubuntu.openblas.1.3.0.kepler make.inc 124 | # for 1.3.0 I needed to add -DCUBLAS_GFORTRAN (like in mkl-gcc make.inc) 125 | 126 | make 2>&1 | tee make.log 127 | 128 | sed 's/\$(MAGMA_DIR)\/lib\/pkgconfig\/magma.pc/\$(MAGMA_DIR)\/lib\/pkgconfig\/magma.pc.in/g' Makefile > /tmp/tmp 129 | mv /tmp/tmp Makefile 130 | 131 | make install prefix=/usr/local/magma-1.3.0 132 | 133 | # good to test dgemm: 134 | # ./testing/testing_dgemm 135 | 136 | echo "/usr/local/magma/lib" >> /etc/ld.so.conf.d/magma.conf 137 | ldconfig 138 | # I don't think this will cause an issue with 
the R magma .so as R magma uses .a from magma 1.3.0 so no run-time linking for magma 139 | 140 | 141 | cd /tmp 142 | wget http://cran.r-project.org/src/contrib/magma_1.3.0-2.tar.gz 143 | R CMD INSTALL --configure-args=" 144 | --with-cuda-home=/usr/local/cuda \ 145 | --with-magma-lib=/usr/local/magma-1.3.0/lib" \ 146 | magma_1.3.0-2.tar.gz 2>&1 | tee Rmagma.log 147 | 148 | 149 | 150 | cd /usr/src 151 | 152 | git clone https://github.com/duncantl/RCUDA 153 | git clone https://github.com/omegahat/RAutoGenRunTime 154 | 155 | cd RCUDA/src 156 | ln -s ../../RAutoGenRunTime/src/RConverters.c . 157 | ln -s ../../RAutoGenRunTime/inst/include/RConverters.h . 158 | ln -s ../../RAutoGenRunTime/inst/include/RError.h . 159 | 160 | cd ../.. 161 | 162 | R CMD build RCUDA 163 | R CMD build RAutoGenRunTime 164 | R CMD INSTALL RAutoGenRunTime_0.3-0.tar.gz 165 | R CMD INSTALL RCUDA_0.4-0.tar.gz 166 | 167 | apt-get install -y python-pip 168 | 169 | pip install pycuda 170 | # ignore warning msg 171 | 172 | echo " This is an Ubuntu 12.04 (Precise) based image with GPU support" >> /etc/motd.tail 173 | echo " for use with EC2." >> /etc/motd.tail 174 | echo " Developed by the Berkeley Statistical Computing Facility, February 2014." >> /etc/motd.tail 175 | echo " Comments and questions can be sent to consult@stat.berkeley.edu." >> /etc/motd.tail 176 | echo " " >> /etc/motd.tail 177 | echo " It contains the following computational software," >> /etc/motd.tail 178 | echo " optimized for statistical computation." >> /etc/motd.tail 179 | echo " " >> /etc/motd.tail 180 | echo " * R 3.0.2 linked to OpenBLAS and with a core set of parallel packages:" >> /etc/motd.tail 181 | echo " (foreach, doParallel, doMPI, Rmpi, pbd)." >> /etc/motd.tail 182 | echo " * iPython (0.12.1) and Octave (3.2.4)" >> /etc/motd.tail 183 | echo " * CUDA (5.5) and MAGMA (1.4.1)" >> /etc/motd.tail 184 | echo " * R and Python packages for GPU computation:" >> /etc/motd.tail 185 | echo " R's magma package linked to MAGMA 1.3.0" >> /etc/motd.tail 186 | echo " RCUDA (development version)" >> /etc/motd.tail 187 | echo " PyCUDA" >> /etc/motd.tail 188 | echo " " >> /etc/motd.tail 189 | 190 | ### for image used for GPU workshop April 2014, put the public key in the ubuntu user's .ssh directory so others can log in: 191 | cd /home/ubuntu 192 | scp paciorek@smeagol.berkeley.edu:~/staff/workshops/gpu-workshop-2014/gpu_rsa.pub .ssh/. 
193 | cat .ssh/gpu_rsa.pub >> .ssh/authorized_keys
194 | echo "Some rules for using this VM, since everyone is sharing the 'ubuntu' user name:" >> README_BEFORE_DOING_ANYTHING_ON_THIS_VM
195 | echo "1) create a unique directory here in /home/ubuntu and place your files only in that directory so that there are no conflicts with other users; e.g., create and put your files in /home/ubuntu/sarah if your name is sarah" >> README_BEFORE_DOING_ANYTHING_ON_THIS_VM
196 | echo "2) you have sudo privileges but you should not use them to alter the system" >> README_BEFORE_DOING_ANYTHING_ON_THIS_VM
197 |
198 | chown ubuntu:ubuntu README_BEFORE_DOING_ANYTHING_ON_THIS_VM .ssh/gpu_rsa.pub
199 |
200 |
201 | #### Create image ##########################
202 |
203 | # 1) now save the image in us-west-2 via point and click on the VM page under Actions
204 | # 2) make it public
205 | # 3) test w/ starcluster 0.95.2 - I now seem to be able to use StarCluster to start it (formerly it didn't like g2.2xlarge)
206 |
207 | # need to put /usr/local/cuda/bin in the path for paciorek as it's not
208 | # need to copy authorized_keys from either root or ubuntu to paciorek
-------------------------------------------------------------------------------- /gpu.Rmd: --------------------------------------------------------------------------------
1 | An Introduction to Using GPUs for Computation
2 | ==================================================================
3 | Chris Paciorek, Statistical Computing Facility, Department of Statistics, UC Berkeley
4 |
5 | Presented: April 25, 2014
6 |
7 | Last Revised: April 30, 2014
8 |
9 |
10 | ```{r setup, include=FALSE}
11 | opts_chunk$set(cache = TRUE) # because the compilation takes time, let's cache it
12 | ```
13 |
14 | # 0) This Tutorial
15 |
16 | Materials for this tutorial, including the R Markdown file used to create this document, are available on github at `https://github.com/berkeley-scf/gpu-workshop-2014`. You can download the files by doing a git clone:
17 | ```{clone, eval=FALSE, engine='bash'}
18 | git clone https://github.com/berkeley-scf/gpu-workshop-2014
19 | ```
20 |
21 | To create this HTML document, simply compile the corresponding R Markdown file in R:
22 | ```{rmd-compile, eval=FALSE}
23 | library(knitr)
24 | knit2html('gpu.Rmd')
25 | ```
26 |
27 | # 1) Introduction
28 |
29 | ### 1.1) Overview
30 |
31 | GPUs (Graphics Processing Units) are processing units originally designed for rendering graphics on a computer quickly. This is done by having a large number of simple processing units for massively parallel calculation. The idea of general purpose GPU (GPGPU) computing is to exploit this capability for general computation.
32 |
33 | We'll see some high-level and somewhat lower-level ways to program calculations for implementation on the GPU. The basic context of GPU programming is "data parallelism", in which the same calculation is done to lots of pieces of data. This could be a mathematical calculation on millions of entries in a vector or a simulation with many independent replicates. Some examples of data parallelism include matrix multiplication (doing the multiplication task on many separate matrix elements) or numerical integration (doing a numerical estimate of the piecewise integral on many intervals/regions), as well as standard statistical calculations such as simulation studies, bootstrapping, random forests, etc. This kind of computation also goes by the name `SIMD` (single instruction, multiple data).
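To make the idea of data parallelism concrete before turning to the hardware, here is a minimal CPU-side sketch in R of the calculation that several of the later examples (e.g., the `calc_loglik.cu` kernel) move onto the GPU: evaluating the normal density at a large number of points, where each element is processed independently. The vector length and parameter values below are just illustrative.

```{dnorm-cpu, eval=FALSE}
N <- 1e7
x <- runif(N)
mu <- 0.3
sigma <- 1.5
# the same arithmetic is applied independently to each of the N elements;
# on the GPU, each thread would handle one (or a few) of the elements
dens <- exp(-0.5 * ((x - mu)/sigma)^2) / (sigma * sqrt(2*pi))
```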
34 | 
35 | ### 1.2) Hardware
36 | 
37 | Two of the main suppliers of GPUs are NVIDIA and AMD. `CUDA` is a platform for programming on GPUs specifically for NVIDIA GPUs that allows you to send C/C++/Fortran code for execution on the GPU. `OpenCL` is an alternative that will work with a broader variety of GPUs. However, CUDA is quite popular, and since Amazon EC2 provides NVIDIA GPUs, we'll use CUDA here.
38 | 
39 | GPUs have many processing units but limited memory. Also, they can only use data in their own memory, not in the CPU's memory, so one must transfer data back and forth between the CPU (the `host`) and the GPU (the `device`). This copying can, in some computations, constitute a very large fraction of the overall computation. So it is best to create the data and/or leave the data (for subsequent calculations) on the GPU when possible and to limit transfers.
40 | 
41 | The `g2.2xlarge` Amazon EC2 instance types have 1536 cores and 4 GB of memory. They're of the `Kepler` architecture (3rd generation). The 2nd generation was `Fermi` and the 1st was `Tesla`. (However, note that `Tesla` is also used by NVIDIA as a product name for certain chips; for example, the `cg1.4xlarge` Amazon EC2 instances have `NVIDIA Tesla M2050 ("Fermi" GF100)` chips, which are of the `Fermi` architecture.) Originally GPUs supported only single precision (i.e., `float` calculations), but fortunately they now support double precision operations, and all of the examples here will use doubles to avoid potential numerical issues, in particular with linear algebra calculations.
42 | 
43 | #### Demonstration Using Amazon's EC2
44 | 
45 | Since the SCF does not have any machines with a GPU, we'll need to use a cloud-based machine. Amazon's EC2 provides two types of GPU instances: `g2.2xlarge` and `cg1.4xlarge`. The first is more recent, though in some of my tests cg1.4xlarge was actually faster. However, given that the price for g2.2xlarge is 65 cents per hour and cg1.4xlarge is more than $2 per hour, we'll use g2.2xlarge.
46 | 
47 | I've created an Amazon machine image (an AMI) that is the binary representation of the Linux Ubuntu operating system for a machine with support for GPU calculations. The AMI contains the following software and packages: R and RCUDA, Python and PyCUDA, CUDA, and MAGMA. In other respects the AMI is similar to the SCF and EML Linux machines but with a reduced set of software.
48 | 
49 | Based on this AMI I've started a virtual machine (VM) that we can log in to (see below for instructions) via SSH, just like any SCF/EML Linux server.
50 | 
51 | If you are willing to pay Amazon and have an account, you can start a VM (in the Oregon [us-west-2] region) using the SCF AMI by searching among the "Public Images" at the [EC2 console](https://console.aws.amazon.com/ec2/v2/home?region=us-west-2#Images:) for `scf-gpu_0.3`. Then just launch a VM, selecting `g2.2xlarge` under the `GPU instances` tab.
Alternatively, if you are using StarCluster (e.g., [this tutorial](http://statistics.berkeley.edu/computing/cloud-computation) provides some info on using StarCluster with EC2 to start up VMs or clusters of VMs), you can start a VM using the SCF AMI by setting the following in the StarCluster `config` file: 52 | 53 | ```{starcluster-config, eval=FALSE, engine='bash'} 54 | AWS_REGION_NAME = us-west-2 55 | AWS_REGION_HOST = ec2.us-west-2.amazonaws.com 56 | NODE_IMAGE_ID = ami-b0374280 57 | NODE_INSTANCE_TYPE = g2.2xlarge 58 | ``` 59 | 60 | 61 | Note that the EML (Economics) has a GPU on one of the EML Linux servers that EML users can access. If this is of interest to you, email consult@econ.berkeley.edu, and I will work to get it set up analogously to the Amazon VM and to help you get started. 62 | 63 | And note that Biostatistics has a GPU on one of its servers. Talk to Burke for more information. 64 | 65 | #### The Amazon VM 66 | 67 | I'll start up the Amazon VM, calling it `gpuvm` and ssh to it using my own Starcluster config file: 68 | 69 | ```{start-VM, engine='bash'} 70 | starcluster start -c gpu gpuvm 71 | starcluster sshmaster -u paciorek gpuvm 72 | ``` 73 | 74 | I also need to make sure that CUDA-related executables are in my path (they should already be set up for the `ubuntu` default user): 75 | 76 | ```{path, engine='bash'} 77 | export PATH=${PATH}:/usr/local/cuda/bin 78 | echo "" >> ~/.bashrc 79 | echo "export PATH=${PATH}:/usr/local/cuda/bin" >> ~/.bashrc 80 | echo "" >> ~/.bashrc 81 | echo "alias gtop=\"nvidia-smi -q -g 0 -d UTILIZATION -l 1\"" >> ~/.bashrc 82 | 83 | ``` 84 | 85 | For the moment, you can connect to the Amazon VM I am using yourself. Here's what you need to do. 86 | 87 | * copy the ssh key file, `gpu_rsa` that SCF provided access to (via email) to your computer (on a UNIX-like machine, including Macs), put it in `~/.ssh`) 88 | * open a terminal window on a UNIX-alike machine (you might be able to ssh via putty or the like if you can point it to the key file you just copied to your machine) and ssh to the VM as follows, using the IP info provided by SCF (via email): 89 | 90 | ```{ssh, engine='bash'} 91 | export ip=VALUE_OBTAINED_FROM_SCF 92 | ssh -i ~/.ssh/gpu_rsa ubuntu@${ip}.us-west-2.compute.amazonaws.com 93 | ``` 94 | 95 | * since multiple people are sharing this VM and are all logging in as the 'ubuntu' user, please make a directory ~/ubuntu/YourUserName and only work within that directory 96 | 97 | #### Observing Performance on the GPU 98 | 99 | The following command will allow you to see some information analogous to `top` on the CPU. 100 | 101 | ```{gtop, engine='bash'} 102 | gtop 103 | ``` 104 | 105 | Here's some example output when the GPU is idle: 106 | 107 | ```{gtop output, engine='bash', eval=FALSE} 108 | ==============NVSMI LOG============== 109 | 110 | Timestamp : Mon Apr 7 21:15:39 2014 111 | Driver Version : 319.37 112 | 113 | Attached GPUs : 1 114 | GPU 0000:00:03.0 115 | Utilization 116 | Gpu : 0 % 117 | Memory : 0 % 118 | ``` 119 | 120 | 121 | 122 | ### 1.4) Software Tools 123 | 124 | Here are some of the useful software tools for doing computations on the GPU. 
125 | 
126 | * CUDA - platform for programming on an NVIDIA GPU using C/C++/Fortran code
127 | * CUBLAS - a BLAS implementation for matrix-vector calculations on an NVIDIA GPU
128 | * CURAND - random number generation on an NVIDIA GPU
129 | * MAGMA - a package for combined CPU-GPU linear algebra, intended to be analogous to LAPACK + BLAS
130 | * RCUDA - an R package providing a front-end for CUDA
131 | * R's magma package - a front-end for MAGMA
132 | * PyCUDA - a Python package providing a front-end for CUDA
133 | 
134 | Note that RCUDA is still in development and is on Github, but should be high-quality as it is developed by Duncan Temple Lang at UC Davis.
135 | 
136 | We'll see all of these in action.
137 | 
138 | There are also:
139 | * OpenCL - an alternative to CUDA that can also be used with non-NVIDIA GPUs
140 | * PyOpenCL
141 | * R's OpenCL package
142 | 
143 | #### A Note on Synchronization
144 | 
145 | Note that in the various examples when I want to assess computational time, I make sure to synchronize the GPU via an appropriate function call. This ensures that all of the kernels have finished their calculations before I mark the end of the time interval. In general, a function call to do a calculation on the GPU will simply start the calculation and then return, with the calculation continuing on the GPU.
146 | 
147 | # 2) Using Kernels for Parallel Computation
148 | 
149 | Kernels are functions that encode the core computational operations done on individual pieces of data. The basic mode of operation in this Section will be to write a kernel and then call the kernel on all the elements of a data object via C, R, or Python code. We'll need to pass the data from the CPU to the GPU and do the same in reverse to get the result. We'll also need to allocate memory on the GPU. However, in some cases the transfer and allocation will be done automatically behind the scenes.
150 | 
151 | A note on the speed comparisons in this section: these compare a fully serial CPU calculation on a single core to calculation on the GPU. On a multicore machine, we could speed up the CPU calculation by writing code to parallelize the calculation (e.g., via threading in C/OpenMP or various parallelization tools in R or Python).
152 | 
153 | See my comments in the last Section regarding some tips and references that may enable you to get more impressive speedups than I show in the demos here.
154 | 
155 | ### 2.1) Background
156 | 
157 | #### Threads and Grids
158 | 
159 | Each individual computation or series of computations on the GPU is done in a thread. Threads are organized into blocks, and blocks of threads are organized in a grid. The blocks and grids can be 1-, 2-, or 3-dimensional. E.g., you might have a 1-d block of 500 threads, with a grid of 3 x 3 such blocks, for a total of $500 \times 9 = 4500$ threads. The choice of the grid/block arrangement can affect efficiency. I can't provide much guidance on that, so you'd need to experiment or do some additional research. For our purposes, we'll often use a 2-d grid of 1-d blocks. In general you'd want each independent calculation done in a separate thread, though as we'll see in Section 3 on simulation, one might want to do a sequence of calculations on each thread. In general, you'll want to pipeline together multiple operations within a computation to avoid copying from CPU to GPU and back. Alternatively, this can be done by keeping the data on the GPU and calling a second kernel.
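
To make the block/grid bookkeeping concrete, here is a minimal, hypothetical sketch (not one of the files in this repository) of host code that chooses a 2-d grid of 1-d blocks large enough to cover `n` elements; the kernel name `myKernel` and the doubling operation are placeholders, and the data are left uninitialized since the point is only the launch configuration.

```
// Hypothetical sketch (not one of the repo's files): choosing a 2-d grid of
// 1-d blocks to cover n elements. Compile with something like:
//   nvcc gridSketch.cu -o gridSketch
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>

// placeholder kernel: double each element, using 1-d blocks within a 2-d grid
__global__ void myKernel(double* vals, int n) {
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;  // which block in the grid
    int idx = blockId * blockDim.x + threadIdx.x;       // overall thread index
    if (idx < n)
        vals[idx] = 2.0 * vals[idx];
}

int main(void) {
    int n = 20000;
    int threadsPerBlock = 1024;  // the per-block maximum reported by deviceQuery below
    int blocksNeeded = (n + threadsPerBlock - 1) / threadsPerBlock;
    int gridSide = (int) ceil(sqrt((double) blocksNeeded));  // square 2-d grid

    dim3 blockDims(threadsPerBlock, 1, 1);
    dim3 gridDims(gridSide, gridSide, 1);
    printf("Launching %d threads (n=%d)\n", gridSide * gridSide * threadsPerBlock, n);

    double* dVals;
    cudaMalloc((void**) &dVals, n * sizeof(double));   // a real example would copy inputs over
    myKernel<<<gridDims, blockDims>>>(dVals, n);
    cudaDeviceSynchronize();  // wait for the kernel to finish
    cudaFree(dVals);
    return 0;
}
```

This ceiling-and-square-root pattern is what produces grid dimensions like the 46 x 46 and 363 x 363 grids reported in the example output later in this section.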
160 | 
161 | Threads are quick to start, and to get efficiency you want to have thousands of threads to exploit the parallelism of the GPU hardware. In general, your calculations will have more threads than GPU cores.
162 | 
163 | This can all get quite complicated, with the possibility for communication amongst threads. We won't go into this, but threads within a block share memory (distinct from the main GPU memory) and can synchronize with each other, while threads in different blocks cannot cooperate. The Suchard et al. paper referenced in the last Section discusses how to get more efficiency by having threads within a block cooperate and access shared memory, which is much faster than accessing the main GPU (device) memory.
164 | 
165 | Executing the following code as root will create an executable that will show you details on the GPU, including the possible block and grid dimensions.
166 | ```{deviceQuery, engine='bash', eval=FALSE}
167 | cd /usr/local/cuda/samples/1_Utilities/deviceQuery
168 | nvcc deviceQuery.cpp -I/usr/local/cuda/include \
169 | -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery
170 | cd -
171 | ```
172 | 
173 | Now running `deviceQuery` will show output like the following (on the SCF VM):
174 | ```{deviceQuery output, engine='bash', eval=FALSE}
175 | paciorek@master:~$ deviceQuery
176 | deviceQuery Starting...
177 | 
178 | CUDA Device Query (Runtime API) version (CUDART static linking)
179 | 
180 | Detected 1 CUDA Capable device(s)
181 | 
182 | Device 0: "GRID K520"
183 | CUDA Driver Version / Runtime Version 5.5 / 5.5
184 | CUDA Capability Major/Minor version number: 3.0
185 | Total amount of global memory: 4096 MBytes (4294770688 bytes)
186 | ( 8) Multiprocessors, (192) CUDA Cores/MP: 1536 CUDA Cores
187 | GPU Clock rate: 797 MHz (0.80 GHz)
188 | Memory Clock rate: 2500 Mhz
189 | Memory Bus Width: 256-bit
190 | L2 Cache Size: 524288 bytes
191 | Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
192 | Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
193 | Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
194 | Total amount of constant memory: 65536 bytes
195 | Total amount of shared memory per block: 49152 bytes
196 | Total number of registers available per block: 65536
197 | Warp size: 32
198 | Maximum number of threads per multiprocessor: 2048
199 | Maximum number of threads per block: 1024
200 | Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
201 | Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
202 | Maximum memory pitch: 2147483647 bytes
203 | Texture alignment: 512 bytes
204 | Concurrent copy and kernel execution: Yes with 2 copy engine(s)
205 | Run time limit on kernels: No
206 | Integrated GPU sharing Host Memory: No
207 | Support host page-locked memory mapping: Yes
208 | Alignment requirement for Surfaces: Yes
209 | Device has ECC support: Disabled
210 | Device supports Unified Addressing (UVA): Yes
211 | Device PCI Bus ID / PCI location ID: 0 / 3
212 | Compute Mode:
213 | < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
214 | 
215 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GRID K520
216 | Result = PASS
217 | ```
218 | 
219 | In particular, note the information on the number of CUDA cores, the GPU's memory, and the maximum threads per block and maximum dimensions of thread blocks and grids.
220 | 
221 | #### GPU Calculations and Kernels
222 | 
223 | The basic series of operations is:
224 | * allocate memory on the GPU
225 | * transfer data from CPU to GPU
226 | * launch the kernel to operate on the threads, with a given block/grid arrangement
227 | * [optionally] launch another kernel, which can access data stored on the GPU, including results from the previous kernel
228 | * transfer results back to CPU
229 | 
230 | Some of this is obscured because CUDA, RCUDA, and PyCUDA do some of the work for you (and also obscured if you use pinned memory).
231 | 
232 | When we write a kernel, we will need to have some initial code that determines a unique ID for that thread that allows the thread to access the appropriate part(s) of the data object(s) on the GPU. This is done based on information stored in variables that CUDA provides that have information about the thread and block indices and block and grid dimensions.
233 | 
234 | ### 2.2) Using CUDA Directly
235 | 
236 | #### Hello, world
237 | 
238 | First let's see a 'Hello, World' example that illustrates blocks of threads and grids of blocks.
239 | 
240 | The idea is to have at least as many threads as the number of computations you are doing. Our kernel function contains the core calculation we want to do (in this case printing 'Hello world!') and code that figures out the unique ID of each thread, as this ID is often used within a calculation.
241 | 
242 | Here's the [example code (helloWorld.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/helloWorld.cu).
243 | 
244 | In this case, compilation is as follows. Given the CUDA functionality used in the code (in particular the call to `printf` within the kernel), we need to specify compilation for a `compute capability` >= 2.0 (corresponding to the Fermi generation of NVIDIA GPUs). Note that our query above indicated that the GPU we are using has capability 3.0, so compiling for 2.0 works here:
245 | 
246 | ```{helloWorld-compile, engine='bash', eval=FALSE}
247 | nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld
248 | ```
249 | 
250 | The result of this looks like:
251 | ```{helloWorld-output, eval=FALSE, engine='bash'}
252 | Launching 20480 threads (N=20000)
253 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(448,0,0) => thread index=1984
254 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(449,0,0) => thread index=1985
255 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(450,0,0) => thread index=1986
256 | ....
257 | 
258 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(220,0,0) => thread index=20188
259 | [### this thread would not be used for N=20000 ###]
260 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(221,0,0) => thread index=20189
261 | [### this thread would not be used for N=20000 ###]
262 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(222,0,0) => thread index=20190
263 | [### this thread would not be used for N=20000 ###]
264 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(223,0,0) => thread index=20191
265 | [### this thread would not be used for N=20000 ###]
266 | kernel launch success!
267 | That's all!
268 | ``` 269 | 270 | Note that because of some buffering issues, with this many threads, we can't see the output for all of them, hence the `if` statement in the kernel code. It is possible to retrieve info about the limit and change the limit using `cudaDeviceGetLimit()` and `cudaDeviceSetLimit()`. 271 | 272 | #### Example of a 'Real' Computation 273 | 274 | Now let's see an example of a distributed calculation using CUDA code, including memory allocation on the GPU and transfer between the GPU and CPU. Our example will be computing terms in an IID log-likelihood calculation. In this case we'll just use the normal density, but real applications would of course have more involved calculation. 275 | 276 | Note that here, I'll use 1024 (the maximum based on `deviceQuery`) threads per block and then a grid (2-d for simplicity) sufficiently large so that we have at least as many threads as computational chunks. 277 | 278 | Here's the [code (kernelExample.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/kernelExample.cu). 279 | 280 | 281 | Compilation is as follows. We again need to specify a compute capability >= 2.0, in this case in order to do calculations with doubles rather than floats. 282 | 283 | 284 | ```{kernelExample-compile, engine='bash', eval=FALSE} 285 | nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 286 | ``` 287 | 288 | Here are some results: 289 | ```{kernelExample-output, eval=FALSE, engine='bash'} 290 | ==================================================== 291 | Grid dimension is 46 x 46 292 | Launching 2166784 threads (N=2097152) 293 | Input values: -0.658344 0.499804 -0.807257... 294 | Memory Copy from Host to Device successful. 295 | Memory Copy from Device to Host successful. 296 | Output values: 0.321214 0.352100 0.288007... 297 | Output values (CPU): 0.321214 0.352100 0.288007... 298 | Timing results for n = 2097152 299 | Transfer to GPU time: 0.008920 300 | Calculation time (GPU): 0.001766 301 | Calculation time (CPU): 0.070951 302 | Transfer from GPU time: 0.001337 303 | Freeing memory... 304 | ==================================================== 305 | ... 306 | ==================================================== 307 | Grid dimension is 363 x 363 308 | Launching 134931456 threads (N=134217728) 309 | Input values: -0.658344 0.499804 -0.807257... 310 | Memory Copy from Host to Device successful. 311 | Memory Copy from Device to Host successful. 312 | Output values: 0.321214 0.352100 0.288007... 313 | Output values (CPU): 0.321214 0.352100 0.288007... 314 | Timing results for n = 134217728 315 | Transfer to GPU time: 0.556857 316 | Calculation time (GPU): 0.110254 317 | Calculation time (CPU): 4.605744 318 | Transfer from GPU time: 0.068865 319 | Freeing memory... 320 | ==================================================== 321 | ``` 322 | 323 | We see that the time for transferring to and from (particularly to) the GPU exceeds the calculation time, reinforcing the idea of keeping data on the GPU when possible. 324 | 325 | #### Using Pinned Memory 326 | 327 | Here's some code where we use pinned memory that is 'mapped' to the GPU such that the GPU directly accesses CPU memory. This can be advantageous if one exceeds the GPU's memory and, according to some sources, is best when you load the data only once. Another approach, using pinned but not mapped memory allows for more efficient transfer but without the direct access from the GPU, with a hidden transfer done behind the scenes. 
This may be better if the data is loaded multiple times on the GPU.
328 | 
329 | Here's the [code (kernelExample-pinned.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/kernelExample-pinned.cu).
330 | 
331 | 
332 | Here are some results:
333 | ```{kernelExample-pinned-output, eval=FALSE, engine='bash'}
334 | ====================================================
335 | Grid dimension is 46 x 46
336 | Launching 2166784 threads (N=2097152)
337 | Input values: -0.658344 0.499804 -0.807257...
338 | Output values: 0.321214 0.352100 0.288007...
339 | Output values (CPU): 0.321214 0.352100 0.288007...
340 | Timing results for n = 2097152
341 | Calculation time (GPU): 0.002080
342 | Calculation time (CPU): 0.071038
343 | Freeing memory...
344 | ====================================================
345 | ...
346 | ====================================================
347 | Grid dimension is 363 x 363
348 | Launching 134931456 threads (N=134217728)
349 | Input values: -0.658344 0.499804 -0.807257...
350 | Output values: 0.321214 0.352100 0.288007...
351 | Output values (CPU): 0.321214 0.352100 0.288007...
352 | Timing results for n = 134217728
353 | Calculation time (GPU): 0.255367
354 | Calculation time (CPU): 4.635453
355 | Freeing memory...
356 | ====================================================
357 | ```
358 | 
359 | So using pinned mapped memory seems to help quite a bit in this case, as the total time with pinned memory is less than the time used for transfer plus calculation in the previous examples.
360 | 
361 | ### 2.3) Calling CUDA Kernels from R (RCUDA)
362 | 
363 | When we want to use CUDA from R, the kernel function will remain the same, but the pre- and post-processing is done in R rather than in C. Here's an example, with the same log-likelihood kernel. The CUDA kernel code is saved in a [separate file (calc_loglik.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/calc_loglik.cu) but is identical to that in the full CUDA+C example above (with the exception that we need to wrap the kernel function in `extern "C"`).
364 | 
365 | Here's the [code (RCUDAexample.R on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/RCUDAexample.R).
366 | 
367 | In this example we see that we can either transfer data between CPU and GPU manually or have RCUDA do it for us. If we didn't want to overwrite the input, but rather to allocate separate space for the output on the GPU, we could use `cudaAlloc()`. See `help(.cuda)` for some example code.
368 | 
369 | We need to compile the kernel into a ptx object file, either outside of R:
370 | 
371 | ```{RCUDAexample-compile, engine='bash', eval=FALSE}
372 | nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu
373 | ```
374 | 
375 | or inside of R:
376 | ```{RCUDAexample-compile-inR, engine='R', eval=FALSE}
377 | ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', target = 'ptx', '-arch=compute_20', '-code=sm_20,compute_20')
378 | ```
379 | 
380 | Here are some results:
381 | ```{RCUDAexample-output, eval=FALSE, engine='bash'}
382 | Setting cuGetContext(TRUE)...
383 | Grid size:
384 | [1] 363 363 1
385 | Total number of threads to launch = 134931456
386 | Running CUDA kernel...
387 | Input values: 0.8966972 0.2655087 0.3721239
388 | Output values: 0.2457292 0.2658912 0.2656543
389 | Output values (implicit transfer): 0.2457292 0.2658912 0.2656543
390 | Output values (CPU with R): 0.2457292 0.2658912 0.2656543
391 | Transfer to GPU time: 0.374
392 | Calculation time (GPU): 0.078
393 | Transfer from GPU time: 0.689
394 | Calculation time (CPU): 9.981
395 | Combined calculation+transfer via .cuda time (GPU): 4.303
396 | ```
397 | 
398 | So the transfer time is again substantial in relative terms. Without that time, the speedup would be much larger. Strangely, the streamlined call in which RCUDA handles the transfer is quite a bit slower for reasons that are not clear to me, but the RCUDA developer (Duncan Temple Lang at UC Davis) is looking into this.
399 | 
400 | We can avoid explicitly specifying block and grid dimensions by using the `gridBy` argument to `.cuda`, which we'll see in a later example.
401 | 
402 | 
403 | WARNING #1: be very careful that the types of the R objects passed to the kernel match what the kernel is expecting. Otherwise the code can hang without an informative error message.
404 | 
405 | WARNING #2: Note the use of the `strict=TRUE` argument when passing values to the GPU. This ensures that numeric values are kept as doubles and not coerced to floats.
406 | 
407 | ### 2.4) Calling CUDA Kernels from Python plus GPU-vectorized Calculations (PyCUDA)
408 | 
409 | With PyCUDA the kernel code can be directly embedded in the Python script. Otherwise it's fairly similar to the use of RCUDA. Here's the [code (PyCUDAexample.py on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/PyCUDAexample.py).
410 | 
411 | Here are some results:
412 | ```{PyCUDAexample-output, eval=FALSE, engine='bash'}
413 | Generating random normals...
414 | Running GPU code...
415 | Time for calculation (GPU): 1.512139s
416 | Running Scipy CPU code...
417 | Time for calculation (CPU): 21.398803s
418 | Output from GPU: 0.168458 0.174912 0.252148
419 | Output from CPU: 0.168458 0.174912 0.252148
420 | ```
421 | 
422 | WARNING: As was the case with R, be careful that the types of the Python objects passed to the kernel match what the kernel is expecting.
423 | 
424 | PyCUDA also provides high-level functionality for vectorized calculations on the GPU. Basically you create a vector stored in GPU memory and then operate on it with a variety of mathematical functions. The modules that do this are `gpuarray` and `cumath`.
425 | 
426 | Here's the [code (gpuArrayExample.py on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/gpuArrayExample.py).
427 | 
428 | Here are the timing results.
429 | ```{gpuArrayExample-output, eval=FALSE, engine='bash'}
430 | Transfer to GPU time: 0.314641s
431 | Timing vectorized exponentiation:
432 | GPU array calc time: 0.226006s
433 | CPU calc time: 3.155150s
434 | Timing vectorized dot product/sum of squares:
435 | GPU array calc time: 0.254579s
436 | CPU calc time: 0.088157s
437 | ```
438 | 
439 | So the fully-vectorized calculation sees a pretty good speed-up, but the dot product, which involves a reduction (the summation), does not. Also note that some compilation gets done when the code is run the first time, which causes the GPU calculation to be slow the first time the code is run but not the second.
440 | 
441 | 
442 | 
443 | # 3) Random Number Generation (RNG) on the GPU
444 | 
445 | RNG is done via the CURAND (CUDA Random Number Generation) library.
CURAND provides several different generators including the Mersenne Twister (the default in R). 446 | 447 | ### 3.1) Seeds and Sequences 448 | 449 | From the CUDA documentation: 450 | 451 | `For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches to avoid state setup time.` 452 | 453 | A lot of important info... we'll interpret/implement much of it in the demo below. 454 | 455 | Recall that RNG on a computer involves generation of pseudo-random numbers from a deterministic, periodic sequence. The seed determines where one starts generating from within that sequence. The idea of the sequence numbers is to generate from non-overlapping blocks within the sequence, with each thread getting a different block. 456 | 457 | 458 | 459 | ### 3.2) Calling CURAND via RCUDA 460 | 461 | For RNG, we need a kernel to initialize the RNG on each thread and one to do the sampling (though they could be combined in a single kernel). Note that the time involved in initializing the RNG for each thread is substantial. This shouldn't be a problem if one is doing a lot of calculations over time. To amortize this one-time expense, I generate multiple random numbers per thread. Here's the [kernel code (random.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/random.cu). 462 | 463 | And here's the [R code (RNGexample.R on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/RNGexample.R) to call the kernel, which looks very similar to the RCUDA code we've already seen. 464 | 465 | We get a pretty good speed up, which would be even more impressive if we can set up the calculations such that we don't need to transfer the whole large vector back to the CPU. 466 | 467 | ```{RNGexample-output, eval=FALSE, engine='bash'} 468 | RNG initiation time: 0.115 469 | GPU memory allocation time: 0.003 470 | Calculation time (GPU): 0.256 471 | Transfer from GPU time: 0.501 472 | -------------------------------------- 473 | Total time (GPU): 0.875 474 | -------------------------------------- 475 | Calculation time (CPU): 9.963 476 | ``` 477 | 478 | Also note the memory cost of the RNG states for the threads, 48 bytes per thread, which could easily exceed GPU memory if one starts up many threads. 479 | 480 | One more note on RCUDA: we can have RCUDA decide on the gridding. Here's a modification of the RNG example to do this: 481 | 482 | ```{gridBy, eval=FALSE} 483 | .cuda(rnorm, rng_states, dX, N, mu, sigma, N_per_thread, gridBy = nthreads, .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 484 | ``` 485 | 486 | At the moment, I'm not sure how to choose the RNG generator from within R. 487 | 488 | ### 3.3) Calling CURAND from C and from Python 489 | 490 | I may flesh this out at some point, but by looking at the RNG example via RCUDA and the examples of calling kernels from C and Python in the previous section, it should be straightforward to do RNG on the GPU controlled by C or Python. 
491 | 492 | To choose the generator in C this should work (in this case choosing the Mersenne Twister): 493 | `curandCreateGenerator(CURAND_RNG_PSEUDO_MTGP32)`. 494 | 495 | # 4) Linear Algebra on the GPU 496 | 497 | We'll start with very high-level use of the GPU by simply calling linear algebra routines that use the GPU. The simplest approach for this is to use R's `magma` package. 498 | 499 | Note that in the timing results, I am comparing to timing with the CPU on the VM. The VM reports 8 virtual CPUs but in some of the calculations does not seem to exploit all of the CPUs, so be wary of the head to head comparisons. 500 | 501 | ### 4.1) Using MAGMA via R 502 | 503 | The MAGMA library provides a drop-in for the functionality of the BLAS and LAPACK that carries out linear algebra on both the CPU and GPU, choosing smartly where to do various aspects of the calculation. 504 | 505 | R's magma package provides a front-end to MAGMA, with functionality for arithmetic operations, backsolve, matrix multiplication, Cholesky, inverse, crossproduct, LU, QR, and solve. See `help("magma-class")` for a list, as `library(help = magma)` only lists a few of the functions in the package. 506 | 507 | [Note for Demo: As we run the calculations on the GPU, let's look at the computation with our gtop utility.] 508 | 509 | Here's the [example code (RmagmaExample.R on the github repository)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/RmagmaExample.R). 510 | 511 | Note that by default we are using MAGMA's GPU interface. For more about this, see the next section of this document. 512 | 513 | Here are some timing results: 514 | ```{magma-R-output, eval=FALSE, engine='bash'} 515 | Timing for n= 4096 516 | GPU time: 3.27 517 | CPU time: 5.92 518 | Timing for n= 8192 519 | GPU time: 20.19 520 | CPU time: 47.04 521 | Check for use of double precision empirically 522 | [1] 0.000000000000000e+00 8.185452315956354e-11 523 | [1] 8433.16034596550344 -20.63245489979067 13.58046013130892 524 | [1] 8433.16034596551981 -20.63245489979058 13.58046013130881 525 | ``` 526 | 527 | Remember to be careful of memory use as GPU's memory may be limited (on the EC2 instance, it's 4 Gb). 528 | 529 | Testing the same computation on an 8-core SCF physical machine gives 2.6 seconds for n=4096 and 20.8 seconds for n=8192 using threaded OpenBLAS. On the VM the CPU-based BLAS/LAPACK calculations seems to only be using 2 cores for some reason that is not clear to me, even though the VM has 8 virtual cores. 530 | 531 | ### 4.2) Using C to Call CUDA, CUDABLAS, and MAGMA 532 | 533 | Next let's use CUDA and MAGMA calls directly in C code. Both CUDA (through CUDABLAS) and MAGMA provide access to BLAS functionality, but only MAGMA provides LAPACK-like functionality (i.e., matrix factorizations/decompositions). Note that we'll now need to directly manage memory allocation on the GPU and transferring data back and forth from CPU to GPU. 534 | 535 | #### CUDA and CUDABLAS 536 | 537 | The code doesn't look too different than C code or calls to BLAS/LAPACK, but we use some CUDA functions and CUDA types. Here's the [example code (cudaBlasExample.c on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/cudaBlasExample.c). 538 | 539 | 540 | Compilation goes as follows using `nvcc`, the analog to `gcc` when compiling for the GPU. As when compiling standard C code we need to be careful about compiler flags, header files, and linking. Note that in this case nvcc does not want the file to have .C or .cu extension. 
541 | ```{cuda-compile, eval=FALSE, engine='bash'} 542 | nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample 543 | ``` 544 | 545 | And here are (some of) the results: 546 | ```{cudaBlas-example-output, eval=FALSE, engine='bash'} 547 | Starting 548 | ==================================================== 549 | Timing results for n = 512 550 | GPU memory allocation time: 0.001930 551 | Transfer to GPU time: 0.000777 552 | Matrix multiply time: 0.003121 553 | Transfer from GPU time: 0.001484 554 | ==================================================== 555 | Timing results for n = 4096 556 | GPU memory allocation time: 0.002925 557 | Transfer to GPU time: 0.040283 558 | Matrix multiply time: 1.476518 559 | Transfer from GPU time: 0.144702 560 | ==================================================== 561 | Timing results for n = 8192 562 | GPU memory allocation time: 0.002807 563 | Transfer to GPU time: 0.159034 564 | Matrix multiply time: 11.807786 565 | Transfer from GPU time: 0.535246 566 | ``` 567 | 568 | For (rough) comparison, the $n=8192$ multiplication on one of the SCF cluster nodes in R (using ACML as the BLAS) takes 74 seconds with one core and 11 seconds with 8 cores. 569 | 570 | #### MAGMA 571 | 572 | Now let's see the use of MAGMA. MAGMA provides analogous calls as CUDA/CUDABLAS for allocating memory, transferring data, and BLAS calls, as well as LAPACK type calls. Unfortunately the MAGMA documentation online appears to be seriously out-of-date, documenting version 0.2 whereas the current version of the software is 1.4.0. 573 | 574 | Note that the LAPACK type calls have a CPU interface and a GPU interface. The GPU interface calls have function names ending in '_gpu' and operate on data objects in GPU memory. The CPU interface calls operate on data objects in CPU memory, handling the transfer to GPU memory as part of the calculation. 575 | 576 | Also, one can use 'pinned' memory on the CPU, which can reduce the transfer time for data to and from the GPU. However, it can involve an increase in time for doing the original memory allocation on the CPU. In the example I can control which is used. 577 | 578 | Here we'll compare timing for the GPU vs. standard BLAS/LAPACK as well as the CPU and GPU interfaces for the Cholesky. 579 | 580 | Here's the [example code (magmaExample.c on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/magmaExample.c). 581 | 582 | 583 | Compilation and execution (with and without pinned memory) go as follows. Note we can use gcc and that we need to link in the CPU BLAS and LAPACK since MAGMA uses both CPU and GPU for calculations (plus in this example I directly call BLAS and LAPACK functions). 
584 | ```{magma-compile, eval=FALSE, engine='bash'} 585 | gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include \ 586 | -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma \ 587 | -llapack -lblas -lcublas -o magmaExample 588 | ./magmaExample 1 589 | ./magmaExample 0 590 | ``` 591 | 592 | And here are (some of) the results: 593 | ```{magma-example-output, eval=FALSE, engine='bash'} 594 | Starting 595 | Setting use_pinned to 1 596 | ==================================================== 597 | Timing results for n = 512 598 | GPU memory allocation time: 0.001107 599 | Transfer to GPU time: 0.000235 600 | Matrix multiply time (GPU): 0.002965 601 | Matrix multiply time (BLAS): 0.025640 602 | Cholesky factorization time (GPU w/ GPU interface): 0.006872 603 | Cholesky factorization time (GPU w/ CPU interface): 0.006856 604 | Cholesky factorization time (LAPACK): 0.004908 605 | Transfer from GPU time: 0.000252 606 | ==================================================== 607 | Timing results for n = 4096 608 | GPU memory allocation time: 0.001535 609 | Transfer to GPU time: 0.014882 610 | Matrix multiply time (GPU): 1.471109 611 | Matrix multiply time (BLAS): 9.641088 612 | Cholesky factorization time (GPU w/ GPU interface): 0.303083 613 | Cholesky factorization time (GPU w/ CPU interface): 0.316703 614 | Cholesky factorization time (LAPACK): 1.537566 615 | Transfer from GPU time: 0.016509 616 | ./magmaExample 0 617 | ==================================================== 618 | Timing results for n = 8192 619 | GPU memory allocation time: 0.004967 620 | Transfer to GPU time: 0.063750 621 | Matrix multiply time (GPU): 11.766860 622 | Matrix multiply time (BLAS): 77.529439 623 | Cholesky factorization time (GPU w/ GPU interface): 2.126884 624 | Cholesky factorization time (GPU w/ CPU interface): 2.161343 625 | Cholesky factorization time (LAPACK): 12.017636 626 | Transfer from GPU time: 0.072997 627 | 628 | Setting use_pinned to 0 629 | ==================================================== 630 | Timing results for n = 512 631 | GPU memory allocation time: 0.002136 632 | Transfer to GPU time: 0.001055 633 | Matrix multiply time (GPU): 0.002969 634 | Matrix multiply time (BLAS): 0.029986 635 | Cholesky factorization time (GPU w/ GPU interface): 0.009177 636 | Cholesky factorization time (GPU w/ CPU interface): 0.011693 637 | Cholesky factorization time (LAPACK): 0.004929 638 | Transfer from GPU time: 0.002238 639 | ==================================================== 640 | Timing results for n = 4096 641 | GPU memory allocation time: 0.002929 642 | Transfer to GPU time: 0.056978 643 | Matrix multiply time (GPU): 1.471277 644 | Matrix multiply time (BLAS): 9.951325 645 | Cholesky factorization time (GPU w/ GPU interface): 0.308102 646 | Cholesky factorization time (GPU w/ CPU interface): 0.356540 647 | Cholesky factorization time (LAPACK): 1.551262 648 | Transfer from GPU time: 0.136033 649 | ==================================================== 650 | Timing results for n = 8192 651 | GPU memory allocation time: 0.004951 652 | Transfer to GPU time: 0.226058 653 | Matrix multiply time (GPU): 11.767153 654 | Matrix multiply time (BLAS): 78.473712 655 | Cholesky factorization time (GPU w/ GPU interface): 2.125327 656 | Cholesky factorization time (GPU w/ CPU interface): 2.286922 657 | Cholesky factorization time (LAPACK): 12.059586 658 | Transfer from GPU time: 0.545454 659 | 660 | ``` 661 | 662 | So we see decent speed-ups both for the matrix multiplication 
and the Cholesky factorization; however, this is in comparison to a CPU calculation that only seems to use 2 of the 8 cores available.
663 | 
664 | Using the CPU interface seems to provide a modest speedup (compared to the manual transfer + calculation time), as does using pinned memory. Note that in this example the transfer time is non-negligible relative to some of the computations (e.g., the Cholesky factorization) but not others (the matrix multiplication).
665 | 
666 | 
667 | 
668 | # 5) Some Final Comments
669 | 
670 | ### 5.1) Some Thoughts on Improving Computational Speed
671 | 
672 | [Suchard et al. (2010; Journal of Computational and Graphical Statistics 19:419)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10016#.U2GTuBUgoWk) and [Lee et al. (2010; Journal of Computational and Graphical Statistics 19:769)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10039#.U2GT9BUgoWk) talk about the use of GPUs for statistics. The speedups they see can get as high as 120 times and 500 times the speed of a single CPU core, respectively. Some possible reasons for these more impressive speedups relative to those seen here include:
673 | 
674 | * Use of single precision floating point calculations. If single precision doesn't affect your calculation substantively, this is worth trying. Particularly on older GPUs (but perhaps still true), single precision was much faster than double precision. However, in tests in which I switched the kernelExample.cu and RNGexample.R/random.cu examples to single precision there was very little change in speed.
675 | 
676 | * Careful use of shared memory (shared amongst the threads in a block) in place of the main GPU memory (see the Suchard et al. paper)
677 | 
678 | * Computational tasks that are very arithmetically intensive but with limited memory access (see the Lee et al. paper)
679 | 
680 | So for some tasks and possibly with some additional coding effort, you may see speedups of 100-200 fold compared to a single CPU core, rather than the 40-fold speedup that is about the best seen in the demos here.
681 | 
682 | Finally, rather than bringing a large chunk of data back to the CPU, you might do a reduction/aggregation operation (e.g., summing over values) in GPU memory. To do this, here's a [presentation](http://will-landau.com/gpu/lectures/cudac-atomics/cudac-atomics.pdf) that has some useful information.
683 | 
684 | ### 5.2) A Comment on Compilation
685 | 
686 | * If you compile CUDA code into an object file, you can link that with other object files (e.g., from C or C++ code) into an executable that can operate on CPU and GPU. This also means you could compile a shared object (i.e., a library) that you could call from R with .C, .Call, or Rcpp; a sketch of such a wrapper follows below.
687 | 
688 | 
689 | 
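As an illustration of that last point, here is a hypothetical sketch (not a file in this repository) of a kernel plus an `extern "C"` wrapper that one might compile into a shared object and call from R via `dyn.load()` and `.C()`; the file and function names are made up, error checking is omitted, and the exact nvcc flags may need adjusting for your CUDA installation.

```
// Hypothetical sketch (not one of the repo's files): gpuDouble.cu
// A kernel plus an extern "C" wrapper that could be compiled into a shared
// object callable from R via .C(). Compile roughly as:
//   nvcc -Xcompiler -fPIC -shared gpuDouble.cu -o gpuDouble.so
#include <cuda_runtime.h>

__global__ void double_kernel(double* vals, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        vals[idx] = 2.0 * vals[idx];  // placeholder elementwise operation
}

extern "C" void gpu_double(double* vals, int* n) {
    // .C() passes pointers to (copies of) the R vectors
    int N = *n;
    double* dVals;
    cudaMalloc((void**) &dVals, N * sizeof(double));
    cudaMemcpy(dVals, vals, N * sizeof(double), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int nBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    double_kernel<<<nBlocks, threadsPerBlock>>>(dVals, N);
    cudaDeviceSynchronize();

    cudaMemcpy(vals, dVals, N * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(dVals);
}
```

From R one would then do something like `dyn.load('gpuDouble.so')` followed by `.C('gpu_double', as.numeric(x), as.integer(length(x)))`, taking the first element of the returned list as the result.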

An Introduction to Using GPUs for Computation

193 | 194 |

Chris Paciorek, Statistical Computing Facility, Department of Statistics, UC Berkeley

195 | 196 |

Presented: April 25, 2014

197 | 198 |

Last Revised: April 30, 2014

199 | 200 |

0) This Tutorial

201 | 202 |

Materials for this tutorial, including the R markdown file that was used to create this document are available on github at https://github.com/berkeley-scf/git-workshop-2014. You can download the files by doing a git clone:

203 | 204 |
git clone https://github.com/berkeley-scf/gpu-workshop-2014
205 | 
206 | 207 |

To create this HTML document, simply compile the corresponding R Markdown file in R:

208 | 209 |
library(knitr)
210 | knit2html("gpu.Rmd")
211 | 
212 | 213 |

1) Introduction

214 | 215 |

1.1) Overview

216 | 217 |

GPUs (Graphics Processing Units) are processing units originally designed for rendering graphics on a computer quickly. This is done by having a large number of simple processing units for massively parallel calculation. The idea of general purpose GPU (GPGPU) computing is to exploit this capability for general computation.

218 | 219 |

We'll see some high-level and somewhat lower-level ways to program calculations for implementation on the GPU. The basic context of GPU programming is “data parallelism”, in which the same calculation is done to lots of pieces of data. This could be a mathematical calcuation on millions of entries in a vector or a simulation with many independent simulations. Some examples of data parallelism include matrix multiplication (doing the multiplication task on many separate matrix elements) or numerical integration (doing a numerical estimate of the piecewise integral on many intervals/regions), as well as standard statistical calculations such as simulation studies, bootstrapping, random forests, etc. This kind of computation also goes by the name SIMD (single instruction, multiple data).

220 | 221 |

1.2) Hardware

222 | 223 |

Two of the main suppliers of GPUs are NVIDIA and AMD. CUDA is a platform for programming on GPUs specifically for NVIDIA GPUs that allows you to send C/C++/Fortran code for execution on the GPU. OpenCL is an alternative that will work with a broader variety of GPUs. However, CUDA is quite popular, and since Amazon EC2 provides NVIDIA GPUs we'll use CUDA here.

224 | 225 |

GPUs have many processing units but limited memory. Also, they can only use data in their own memory, not in the CPU's memory, so one must transfer data back and forth between the CPU (the host) and the GPU (the device). This copying can, in some computations, constitute a very large fraction of the overall computation. So it is best to create the data and/or leave the data (for subsequent calculations) on the GPU when possible and to limit transfers.

226 | 227 |

The g2.2xlarge Amazon EC2 instance types have 1536 cores and 4 Gb memory. They're of the Kepler architecture (3rd generation). The 2nd generation was Fermi and the 1st was Tesla. (However note that Tesla is also used by NVIDIA to refer to different chip types, so for example the cg1.4xlarge Amazon EC2 instances have chips that are NVIDIA Tesla M2050 GPUs ("Fermi" GF100), but are the Fermi architecture.) Originally GPUs supported only single precision (i.e., float calculations) but fortunately they now support double precision operations and all of the examples here will use doubles to avoid potential numerical issues, in particular with linear algebra calculations.

228 | 229 |

Demonstration Using Amazon's EC2

230 | 231 |

Since the SCF does not have any machines with a GPU, we'll need to use a cloud-based machine. Amazon's EC2 provide two types of GPU instances: g2.2xlarge and cg1.4xlarge. The first is more recent, though in some of my tests cg1.4xlarge was actually faster. However given that the price for g2.2xlarge is 65 cents per hour and cg1.4xlarge is more than $2 per hour, we'll use g2.2xlarge.

232 | 233 |

I've created an Amazon machine image (an AMI) that is the binary representation of the Linux Ubuntu operating system for a machine with support for GPU calculations. The AMI contains the following software and packages: R and RCUDA, Python and PyCUDA, CUDA, and MAGMA. In other respects the AMI is similar to the SCF and EML Linux machines but with a reduced set of software.

234 | 235 |

Based on this AMI I've started a virtual machine (VM) that we can login to (see below for instructions) via SSH, just like any SCF/EML Linux server.

236 | 237 |

If you were willing to pay Amazon and had an account you can start a VM (in the Oregon [us-west-2] region) using the SCF AMI by searching for “Public Images” at the EC2 console for scf-gpu_0.3. Then just launch a VM, selecting g2.2xlarge under the GPU instances tab. Alternatively, if you are using StarCluster (e.g., this tutorial provides some info on using StarCluster with EC2 to start up VMs or clusters of VMs), you can start a VM using the SCF AMI by setting the following in the StarCluster config file:

238 | 239 |
AWS_REGION_NAME = us-west-2
240 | AWS_REGION_HOST = ec2.us-west-2.amazonaws.com
241 | NODE_IMAGE_ID = ami-b0374280 
242 | NODE_INSTANCE_TYPE = g2.2xlarge
243 | 
244 | 245 |

Note that the EML (Economics) has a GPU on one of the EML Linux servers that EML users can access. If this is of interest to you, email consult@econ.berkeley.edu, and I will work to get it set up analogously to the Amazon VM and to help you get started.

246 | 247 |

And note that Biostatistics has a GPU on one of its servers. Talk to Burke for more information.

248 | 249 |

The Amazon VM

250 | 251 |

I'll start up the Amazon VM, calling it gpuvm and ssh to it using my own Starcluster config file:

252 | 253 |
starcluster start -c gpu gpuvm
254 | starcluster sshmaster -u paciorek gpuvm
255 | 
256 | 257 |

I also need to make sure that CUDA-related executables are in my path (they should already be set up for the ubuntu default user):

258 | 259 |
export PATH=${PATH}:/usr/local/cuda/bin
260 | echo "" >> ~/.bashrc
261 | echo "export PATH=${PATH}:/usr/local/cuda/bin" >> ~/.bashrc
262 | echo "" >> ~/.bashrc
263 | echo "alias gtop=\"nvidia-smi -q -g 0 -d UTILIZATION -l 1\"" >> ~/.bashrc
264 | 
265 | 
266 | 267 |

For the moment, you can connect to the Amazon VM I am using yourself. Here's what you need to do.

268 | 269 |
    270 |
  • copy the ssh key file, gpu_rsa that SCF provided access to (via email) to your computer (on a UNIX-like machine, including Macs), put it in ~/.ssh)
  • 271 |
  • open a terminal window on a UNIX-alike machine (you might be able to ssh via putty or the like if you can point it to the key file you just copied to your machine) and ssh to the VM as follows, using the IP info provided by SCF (via email):
  • 272 |
273 | 274 |
export ip=VALUE_OBTAINED_FROM_SCF
275 | ssh -i ~/.ssh/gpu_rsa ubuntu@${ip}.us-west-2.compute.amazonaws.com
276 | 
277 | 278 |
    279 |
  • since multiple people are sharing this VM and are all logging in as the 'ubuntu' user, please make a directory ~/ubuntu/YourUserName and only work within that directory
  • 280 |
281 | 282 |

Observing Performance on the GPU

283 | 284 |

The following command will allow you to see some information analogous to top on the CPU.

285 | 286 |
gtop
287 | 
288 | 289 |

Here's some example output when the GPU is idle:

290 | 291 |
==============NVSMI LOG==============
292 | 
293 | Timestamp                           : Mon Apr  7 21:15:39 2014
294 | Driver Version                      : 319.37
295 | 
296 | Attached GPUs                       : 1
297 | GPU 0000:00:03.0
298 |     Utilization
299 |         Gpu                         : 0 %
300 |         Memory                      : 0 %
301 | 
302 | 303 |

1.4) Software Tools

304 | 305 |

Here are some of the useful software tools for doing computations on the GPU.

306 | 307 |
    308 |
  • CUDA - platform for programming on an NVIDIA GPU using C/C++/Fortran code
  • 309 |
  • CUBLAS - a BLAS implementation for matrix-vector calculations on an NVIDIA GPU
  • 310 |
  • CURANDOM - random number generation on an NVIDIA GPU
  • 311 |
  • MAGMA - a package for combined CPU-GPU linear algebra, intended to be analogous to LAPACK + BLAS
  • 312 |
  • RCUDA - an R package providing a front-end for CUDA
  • 313 |
  • R's magma package - a front-end for MAGMA
  • 314 |
  • PyCUDA - a Python package providing a front-end for CUDA
  • 315 |
316 | 317 |

Note that RCUDA is still in development and is on Github, but should be high-quality as it is developed by Duncan Temple Lang at UC-Davis.

318 | 319 |

We'll see all of these in action.

320 | 321 |

There are also:

322 | 323 |
    324 |
  • openCL - an alternative to CUDA that can also be used with non-NVIDIA GPUs
  • 325 |
  • PyOpenCL
  • 326 |
  • R's OpenCL package
  • 327 |
328 | 329 |

A Note on Synchronization

330 | 331 |

Note that in the various examples when I want to assess computational time, I make sure to synchronize the GPU via an appropriate function call. This ensures that all of the kernels have finished their calculations before I mark the end of the time interval. In general a function call to do a calculation on the GPU will simply start the calculation and then return, with the calculation continuing on the GPU.

332 | 333 |

2) Using Kernels for Parallel Computation

334 | 335 |

Kernels are functions that encode the core computational operations done on individual pieces of data. The basic mode of operation in this Section will be to write a kernel and then call the kernel on all the elements of a data object via C, R, or Python code. We'll need to pass the data from the CPU to the GPU and do the same in reverse to get the result. We'll also need to allocate memory on the GPU. However in some cases the transfer and allocation will be done automatically behind the scenes.

336 | 337 |

A note on the speed comparisons in this section. These compare a fully serial CPU calculation on a single core to calculation on the GPU. On a multicore machine, we could speed up the CPU calculation by writing code to parallelize the calculation (e.g., via threading in C/openMP or various parallelization tools in R or Python).

338 | 339 |

See my comments in the last Section regarding some tips and references that may enable you to get more impressive speedups than I show in the demos here.

340 | 341 |

2.1) Background:

342 | 343 |

Threads and Grids

344 | 345 |

Each individual computation or series of computations on the GPU is done in a thread. Threads are organized into blocks and blocks of threads are organized in a grid. The blocks and grids can be 1-, 2-, or 3-dimensional. E.g., you might have a 1-d block of 500 threads, with a grid of 3 x 3 such blocks, for a total of \(500 \times 9 = 4500\) threads. The choice of the grid/block arrangement can affect efficiency. I can't provide much guidance on that so you'd need to experiment or do some additional research. For our purposes, we'll often use a 2-d grid of 1-d blocks. In general you'd want each independent calculation done in a separate thread, though as we'll see in Section 3 on simulation, one might want to do a sequence of calculations on each thread. In general, you'll want to pipeline together multiple operations within a computation to avoid copying from CPU to GPU and back. Alternatively, this can be done by keeping the data on the GPU and calling a second kernel.

346 | 347 |

Threads are quick to start, and to get efficiency you want to have thousands of threads to exploit the parallelism of the GPU hardware. In general your calculations will have more threads than GPU cores.

348 | 349 |

This can all get quite complicated, with the possibility for communication amongst threads. We won't go into this, but threads within a block shared memory (distinct from the main GPU memory) and can synchronize with each other, while threads in different blocks cannot cooperate. The Suchard et al. paper referenced in the last Section discusses how to get more efficiency by having threads within a block cooperate and access shared memory, which is much faster than accessing the main GPU (device) memory.

350 | 351 |

Executing the following code as root will create an executable that will show you details on the GPU, including the possible block and grid dimensions.

352 | 353 |
cd  /usr/local/cuda/samples/1_Utilities/deviceQuery
354 | nvcc deviceQuery.cpp -I/usr/local/cuda/include \
355 |    -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery
356 | cd -
357 | 
358 | 359 |

Now running deviceQuery will show output like the following (on the SCF VM):

paciorek@master:~$ deviceQuery
deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GRID K520"
  CUDA Driver Version / Runtime Version          5.5 / 5.5
  CUDA Capability Major/Minor version number:    3.0
  Total amount of global memory:                 4096 MBytes (4294770688 bytes)
  ( 8) Multiprocessors, (192) CUDA Cores/MP:     1536 CUDA Cores
  GPU Clock rate:                                797 MHz (0.80 GHz)
  Memory Clock rate:                             2500 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           0 / 3
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.5, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GRID K520
Result = PASS

In particular note the information on the number of CUDA cores, the GPU's memory, and the information on the maximum threads per block and the maximum dimensions of thread blocks and grids.


GPU Calculations and Kernels


The basic series of operations is:

  • allocate memory on the GPU
  • transfer data from CPU to GPU
  • launch the kernel to operate on the threads, with a given block/grid arrangement
  • [optionally] launch another kernel, which can access data stored on the GPU, including results from the previous kernel
  • transfer results back to CPU
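
A minimal sketch of these steps in CUDA C might look like the following (error checking omitted; mykernel and the sizes are hypothetical placeholders):

int N = 1000000;
double *vals, *dvals;
vals = (double*) malloc(N * sizeof(double));
// ... fill vals on the CPU ...
dim3 blockDims(1024);
dim3 gridDims(1024);                                                  // enough blocks to cover N values
cudaMalloc((void**) &dvals, N * sizeof(double));                      // allocate memory on the GPU
cudaMemcpy(dvals, vals, N * sizeof(double), cudaMemcpyHostToDevice);  // transfer data from CPU to GPU
mykernel<<<gridDims, blockDims>>>(dvals, N);                          // launch the kernel
cudaMemcpy(vals, dvals, N * sizeof(double), cudaMemcpyDeviceToHost);  // transfer results back to CPU
cudaFree(dvals);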

Some of this is obscured because CUDA, RCUDA, and PyCUDA do some of the work for you (and also obscured if you use pinned memory).


When we write a kernel, we will need to have some initial code that determines a unique ID for that thread, allowing the thread to access the appropriate part(s) of the data object(s) on the GPU. This is done using built-in variables that CUDA provides giving the thread and block indices and the block and grid dimensions.
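
For example, with a 1-d grid of 1-d blocks the standard idiom is shown below (assuming N, the number of computations, is passed to the kernel); the kernels in the demos use a more general version that handles 2-d grids and 3-d blocks.

// global thread index for a 1-d grid of 1-d blocks
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
    // operate on the idx'th element of the data
}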


2.2) Using CUDA Directly


Hello, world


First let's see a 'Hello, World' example that illustrates blocks of threads and grids of blocks.


The idea is to have at least as many threads as the number of computations you are doing. Our kernel function contains the core calculation we want to do (in this case printing 'Hello world!') and code that figures out the unique ID of each thread, as this is often used within a calculation.


Here's the example code (helloWorld.cu on the github repo).


In this case, compilation is as follows. Given the CUDA functionality used in the code (in particular the call to printf within the kernel), we need to specify compilation for a compute capability >= 2.0 (corresponding to the Fermi generation of NVIDIA GPUs). Note that our query above indicated that the GPU we are using has capability 3.0, so this requirement is satisfied:

nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld

The result of this looks like:

Launching 20480 threads (N=20000)
Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(448,0,0) => thread index=1984
Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(449,0,0) => thread index=1985
Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(450,0,0) => thread index=1986
....

Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(220,0,0) => thread index=20188 
[### this thread would not be used for N=20000 ###]
Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(221,0,0) => thread index=20189 
[### this thread would not be used for N=20000 ###]
Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(222,0,0) => thread index=20190 
[### this thread would not be used for N=20000 ###]
Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(223,0,0) => thread index=20191 
[### this thread would not be used for N=20000 ###]
kernel launch success!
That's all!

Note that because of some buffering issues, with this many threads, we can't see the output for all of them, hence the if statement in the kernel code. It is possible to retrieve info about the limit and change the limit using cudaDeviceGetLimit() and cudaDeviceSetLimit().
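
For example, the printf buffer can be queried and enlarged with the standard runtime calls sketched below (the 10 MB value is just an illustrative choice):

size_t limit;
cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize);              // current kernel printf buffer size, in bytes
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 10 * 1024 * 1024);    // enlarge it to 10 MB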


Example of a 'Real' Computation


Now let's see an example of a distributed calculation using CUDA code, including memory allocation on the GPU and transfer between the GPU and CPU. Our example will be computing terms in an IID log-likelihood calculation. In this case we'll just use the normal density, but real applications would of course involve more complicated calculations.


Note that here, I'll use 1024 (the maximum based on deviceQuery) threads per block and then a grid (2-d for simplicity) sufficiently large so that we have at least as many threads as computational chunks.
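
The grid dimension can be chosen as the smallest square providing at least N threads; a sketch of that calculation (the variable names are illustrative and not necessarily those used in kernelExample.cu):

// requires <math.h> for ceil() and sqrt()
int threadsPerBlock = 1024;
// smallest g such that g * g * threadsPerBlock >= N
int g = (int) ceil(sqrt((double) N / threadsPerBlock));
dim3 gridDims(g, g);
dim3 blockDims(threadsPerBlock);
// e.g., N = 2097152 gives g = 46, i.e., 46 * 46 * 1024 = 2166784 threads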


Here's the code (kernelExample.cu on the github repo).


Compilation is as follows. We again need to specify a compute capability >= 2.0, in this case in order to do calculations with doubles rather than floats.

nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample

Here are some results:

====================================================
Grid dimension is 46 x 46
Launching 2166784 threads (N=2097152)
Input values: -0.658344 0.499804 -0.807257...
Memory Copy from Host to Device successful.
Memory Copy from Device to Host successful.
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 2097152
Transfer to GPU time: 0.008920
Calculation time (GPU): 0.001766
Calculation time (CPU): 0.070951
Transfer from GPU time: 0.001337
Freeing memory...
====================================================
...
====================================================
Grid dimension is 363 x 363
Launching 134931456 threads (N=134217728)
Input values: -0.658344 0.499804 -0.807257...
Memory Copy from Host to Device successful.
Memory Copy from Device to Host successful.
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 134217728
Transfer to GPU time: 0.556857
Calculation time (GPU): 0.110254
Calculation time (CPU): 4.605744
Transfer from GPU time: 0.068865
Freeing memory...
====================================================

We see that the time for transferring to and from (particularly to) the GPU exceeds the calculation time, reinforcing the idea of keeping data on the GPU when possible.


Using Pinned Memory


Here's some code where we use pinned memory that is 'mapped' to the GPU, such that the GPU directly accesses CPU memory. This can be advantageous if one exceeds the GPU's memory and, according to some sources, is best when you load the data only once. Another approach, using pinned but not mapped memory, allows for more efficient transfer but without the direct access from the GPU, with a hidden transfer done behind the scenes. This may be better if the data is loaded multiple times on the GPU.
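
The key runtime calls for pinned, mapped memory are sketched below (a minimal sketch, assuming N, the kernel, and the grid/block dimensions are defined as before; kernelExample-pinned.cu may differ in the details):

double *vals, *dvals;
cudaSetDeviceFlags(cudaDeviceMapHost);                                   // enable mapped memory; must precede the allocation
cudaHostAlloc((void**) &vals, N * sizeof(double), cudaHostAllocMapped);  // pinned CPU memory, visible to the GPU
cudaHostGetDevicePointer((void**) &dvals, vals, 0);                      // device pointer aliasing the same memory
mykernel<<<gridDims, blockDims>>>(dvals, N);                             // the kernel reads/writes CPU memory directly
cudaDeviceSynchronize();                                                 // wait for the kernel before using vals on the CPU
cudaFreeHost(vals);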


Here's the code (kernelExample-pinned.cu on the github repo).


Here are some results:

====================================================
Grid dimension is 46 x 46
Launching 2166784 threads (N=2097152)
Input values: -0.658344 0.499804 -0.807257...
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 2097152
Calculation time (GPU): 0.002080
Calculation time (CPU): 0.071038
Freeing memory...
====================================================
...
====================================================
Grid dimension is 363 x 363
Launching 134931456 threads (N=134217728)
Input values: -0.658344 0.499804 -0.807257...
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 134217728
Calculation time (GPU): 0.255367
Calculation time (CPU): 4.635453
Freeing memory...
====================================================

So using pinned mapped memory seems to help quite a bit in this case, as the total time with pinned memory is less than the time used for transfer plus calculation in the previous examples.


2.3) Calling CUDA Kernels from R (RCUDA)


When we want to use CUDA from R, the kernel function will remain the same, but the pre- and post-processing is done in R rather than in C. Here's an example, with the same log-likelihood kernel. The CUDA kernel code is saved in a separate file (calc_loglik.cu on the github repo) but is identical to that in the full CUDA+C example above (with the exception that we need to wrap the kernel function in extern "C").


Here's the code (RCUDAexample.R on the github repo)


In this example we see that we can either transfer data between CPU and GPU manually or have RCUDA do it for us. If we didn't want to overwrite the input, but rather to allocate separate space for the output on the GPU, we could use cudaAlloc(). See help(.cuda) for some example code.


We need to compile the kernel into a ptx object file, either outside of R:

nvcc --ptx  -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu

or inside of R:

ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', target = 'ptx', '-arch=compute_20', '-code=sm_20,compute_20')

Here are some results:

Setting cuGetContext(TRUE)...
Grid size:
[1] 363 363   1
Total number of threads to launch =  134931456 
Running CUDA kernel...
Input values:  0.8966972 0.2655087 0.3721239 
Output values:  0.2457292 0.2658912 0.2656543 
Output values (implicit transfer):  0.2457292 0.2658912 0.2656543 
Output values (CPU with R):  0.2457292 0.2658912 0.2656543 
Transfer to GPU time:  0.374 
Calculation time (GPU):  0.078 
Transfer from GPU time:  0.689 
Calculation time (CPU):  9.981 
Combined calculation+transfer via .cuda time (GPU):  4.303 

So the transfer time is again substantial in relative terms; without it, the speedup would be impressive. Strangely, the streamlined call in which RCUDA handles the transfer is quite a bit slower, for reasons that are not clear to me, but the RCUDA developer (Duncan Temple Lang at UC Davis) is looking into this.


We can avoid explicitly specifying block and grid dimensions by using the gridBy argument to .cuda, which we'll see in a later example.


WARNING #1: be very careful that the types of the R objects passed to the kernel match what the kernel is expecting. Otherwise the code can hang without an informative error message.


WARNING #2: Note the use of the strict=TRUE argument when passing values to the GPU. This ensures that numeric values are kept as doubles and not coerced to floats.


2.4) Calling CUDA Kernels from Python plus GPU-vectorized Calculations (PyCUDA)


With PyCUDA the kernel code can be directly embedded in the Python script. Otherwise it's fairly similar to the use of RCUDA. Here's the code (PyCUDAexample.py on the github repo)
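
The basic pattern looks something like the following schematic sketch (this is not the contents of PyCUDAexample.py; the kernel, sizes, and names here are placeholders):

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
# compile a kernel whose source is embedded directly in the Python script
mod = SourceModule("""
__global__ void scale(double *x, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) x[idx] = 2.0 * x[idx];
}
""")
scale = mod.get_function("scale")
n = np.int32(1048576)
x = np.random.normal(size = int(n))        # float64, matching the kernel's double*
# drv.InOut handles the transfer to and from the GPU for us
scale(drv.InOut(x), n, block = (1024, 1, 1), grid = (int(n) // 1024, 1))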


Here are some results:

Generating random normals...
Running GPU code...
Time for calculation (GPU): 1.512139s
Running Scipy CPU code...
Time for calculation (CPU): 21.398803s
Output from GPU: 0.168458 0.174912 0.252148
Output from CPU: 0.168458 0.174912 0.252148

WARNING: As was the case with R, be careful that the types of the Python objects passed to the kernel match what the kernel is expecting.


PyCUDA also provides high-level functionality for vectorized calculations on the GPU. Basically you create a vector stored in GPU memory and then operate on it with a variety of mathematical functions. The modules that do this are gpuarray and cumath.


Here's the code (gpuArrayExample.py on the github repo)


Here are the timing results.

Transfer to GPU time: 0.314641s
Timing vectorized exponentiation:
GPU array calc time: 0.226006s
CPU calc time: 3.155150s
Timing vectorized dot product/sum of squares:
GPU array calc time: 0.254579s
CPU calc time: 0.088157s

So the fully-vectorized calculation sees a pretty good speed-up, but the dot product, which involves a reduction (the summation), does not. Also note that some compilation gets done the first time the code is run, which makes the first GPU calculation slow but not subsequent runs.


3) Random Number Generation (RNG) on the GPU


RNG is done via the CURAND (CUDA Random Number Generation) library. CURAND provides several different generators including the Mersenne Twister (the default in R).


3.1) Seeds and Sequences


From the CUDA documentation:


For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches to avoid state setup time.


A lot of important info… we'll interpret/implement much of it in the demo below.


Recall that RNG on a computer involves generation of pseudo-random numbers from a deterministic, periodic sequence. The seed determines where one starts generating from within that sequence. The idea of the sequence numbers is to generate from non-overlapping blocks within the sequence, with each thread getting a different block.


3.2) Calling CURAND via RCUDA


For RNG, we need a kernel to initialize the RNG on each thread and one to do the sampling (though they could be combined in a single kernel). Note that the time involved in initializing the RNG for each thread is substantial. This shouldn't be a problem if one is doing a lot of calculations over time. To amortize this one-time expense, I generate multiple random numbers per thread. Here's the kernel code (random.cu on the github repo).


And here's the R code (RNGexample.R on the github repo) to call the kernel, which looks very similar to the RCUDA code we've already seen.


We get a pretty good speed up, which would be even more impressive if we can set up the calculations such that we don't need to transfer the whole large vector back to the CPU.

RNG initiation time:  0.115 
GPU memory allocation time:  0.003 
Calculation time (GPU):  0.256 
Transfer from GPU time:  0.501 
--------------------------------------
Total time (GPU): 0.875
--------------------------------------
Calculation time (CPU):  9.963 

Also note the memory cost of the RNG states for the threads, 48 bytes per thread, which could easily exceed GPU memory if one starts up many threads. For example, launching one thread per sample for 10^8 samples would require nearly 5 GB just for the states, more than the 4 GB on this GPU; this is another reason to generate multiple samples per thread.


One more note on RCUDA: we can have RCUDA decide on the gridding. Here's a modification of the RNG example to do this:

.cuda(rnorm, rng_states, dX, N, mu, sigma, N_per_thread, gridBy = nthreads, .numericAsDouble = getOption("CUDA.useDouble", TRUE))

At the moment, I'm not sure how to choose the RNG generator from within R.


3.3) Calling CURAND from C and from Python


I may flesh this out at some point, but by looking at the RNG example via RCUDA and the examples of calling kernels from C and Python in the previous section, it should be straightforward to do RNG on the GPU controlled by C or Python.


To choose the generator in C, this should work (in this case choosing the Mersenne Twister): curandCreateGenerator(CURAND_RNG_PSEUDO_MTGP32).
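
Note that curandCreateGenerator() is part of CURAND's host API and takes the generator handle as its first argument. A minimal sketch of its use (after including curand.h, and assuming n, mu, and sigma are defined; error checking omitted):

curandGenerator_t gen;
double *dvals;
cudaMalloc((void**) &dvals, n * sizeof(double));            // output vector in GPU memory
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);      // the MTGP32 Mersenne Twister generator
curandSetPseudoRandomGeneratorSeed(gen, 0ULL);              // set the seed
curandGenerateNormalDouble(gen, dvals, n, mu, sigma);       // fill dvals with N(mu, sigma^2) draws
curandDestroyGenerator(gen);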


4) Linear Algebra on the GPU


We'll start with very high-level use of the GPU by simply calling linear algebra routines that use the GPU. The simplest approach for this is to use R's magma package.


Note that in the timing results, I am comparing to timing with the CPU on the VM. The VM reports 8 virtual CPUs but in some of the calculations does not seem to exploit all of the CPUs, so be wary of the head to head comparisons.


4.1) Using MAGMA via R


The MAGMA library provides a drop-in for the functionality of the BLAS and LAPACK that carries out linear algebra on both the CPU and GPU, choosing smartly where to do various aspects of the calculation.


R's magma package provides a front-end to MAGMA, with functionality for arithmetic operations, backsolve, matrix multiplication, Cholesky, inverse, crossproduct, LU, QR, and solve. See help("magma-class") for a list, as library(help = magma) only lists a few of the functions in the package.


[Note for Demo: As we run the calculations on the GPU, let's look at the computation with our gtop utility.]


Here's the example code (RmagmaExample.R on the github repository).


Note that by default we are using MAGMA's GPU interface. For more about this, see the next section of this document.


Here are some timing results:

Timing for n= 4096 
GPU time:  3.27
CPU time:  5.92 
Timing for n= 8192 
GPU time:  20.19 
CPU time:  47.04
Check for use of double precision empirically
[1] 0.000000000000000e+00 8.185452315956354e-11
[1] 8433.16034596550344  -20.63245489979067   13.58046013130892
[1] 8433.16034596551981  -20.63245489979058   13.58046013130881

Remember to be careful of memory use, as GPU memory may be limited (on the EC2 instance, it's 4 GB).


Testing the same computation on an 8-core SCF physical machine gives 2.6 seconds for n=4096 and 20.8 seconds for n=8192 using threaded OpenBLAS. On the VM, the CPU-based BLAS/LAPACK calculations seem to use only 2 cores, for reasons that are not clear to me, even though the VM has 8 virtual cores.


4.2) Using C to Call CUDA, CUDABLAS, and MAGMA


Next let's use CUDA and MAGMA calls directly in C code. Both CUDA (through CUDABLAS) and MAGMA provide access to BLAS functionality, but only MAGMA provides LAPACK-like functionality (i.e., matrix factorizations/decompositions). Note that we'll now need to directly manage memory allocation on the GPU and transferring data back and forth from CPU to GPU.


CUDA and CUDABLAS


The code doesn't look too different than C code or calls to BLAS/LAPACK, but we use some CUDA functions and CUDA types. Here's the example code (cudaBlasExample.c on the github repo).
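
For orientation, the core of a GPU matrix multiply via CUBLAS looks roughly like the following sketch. This uses the cuBLAS v2 API (cublas_v2.h), whereas cudaBlasExample.c may use a somewhat different interface; A, B, C, and n are assumed to be defined, and error checking is omitted.

cublasHandle_t handle;
cublasCreate(&handle);
double alpha = 1.0, beta = 0.0;
double *dA, *dB, *dC;                                    // n x n matrices in GPU memory (column-major)
cudaMalloc((void**) &dA, n * n * sizeof(double));
cudaMalloc((void**) &dB, n * n * sizeof(double));
cudaMalloc((void**) &dC, n * n * sizeof(double));
cublasSetMatrix(n, n, sizeof(double), A, n, dA, n);      // transfer A and B to the GPU
cublasSetMatrix(n, n, sizeof(double), B, n, dB, n);
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
            &alpha, dA, n, dB, n, &beta, dC, n);         // C = A B on the GPU
cublasGetMatrix(n, n, sizeof(double), dC, n, C, n);      // transfer the result back to the CPU
cublasDestroy(handle);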


Compilation goes as follows using nvcc, the analog to gcc when compiling for the GPU. As when compiling standard C code we need to be careful about compiler flags, header files, and linking. Note that in this case nvcc does not want the file to have .C or .cu extension.

nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample

And here are (some of) the results:

Starting
====================================================
Timing results for n = 512
GPU memory allocation time: 0.001930
Transfer to GPU time: 0.000777
Matrix multiply time: 0.003121
Transfer from GPU time: 0.001484
====================================================
Timing results for n = 4096
GPU memory allocation time: 0.002925
Transfer to GPU time: 0.040283
Matrix multiply time: 1.476518
Transfer from GPU time: 0.144702
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.002807
Transfer to GPU time: 0.159034
Matrix multiply time: 11.807786
Transfer from GPU time: 0.535246

For (rough) comparison, the \(n=8192\) multiplication on one of the SCF cluster nodes in R (using ACML as the BLAS) takes 74 seconds with one core and 11 seconds with 8 cores.


MAGMA


Now let's see the use of MAGMA. MAGMA provides calls analogous to those of CUDA/CUDABLAS for allocating memory, transferring data, and BLAS calls, as well as LAPACK-type calls. Unfortunately the MAGMA documentation online appears to be seriously out-of-date, documenting version 0.2 whereas the current version of the software is 1.4.0.


Note that the LAPACK type calls have a CPU interface and a GPU interface. The GPU interface calls have function names ending in '_gpu' and operate on data objects in GPU memory. The CPU interface calls operate on data objects in CPU memory, handling the transfer to GPU memory as part of the calculation.
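
For example, the Cholesky factorization can be called through either interface along roughly these lines (a sketch only; the exact argument types vary across MAGMA versions, so check the headers for the version you have; A, dA, n, and info are assumed to be defined):

// CPU interface: A is in CPU memory; MAGMA handles the GPU transfer internally
magma_dpotrf('L', n, A, n, &info);
// GPU interface (note the _gpu suffix): dA is already in GPU memory
magma_dpotrf_gpu('L', n, dA, n, &info);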


Also, one can use 'pinned' memory on the CPU, which can reduce the transfer time for data to and from the GPU. However, it can involve an increase in time for doing the original memory allocation on the CPU. In the example I can control which is used.


Here we'll compare timing for the GPU vs. standard BLAS/LAPACK as well as the CPU and GPU interfaces for the Cholesky.


Here's the example code (magmaExample.c on the github repo).


Compilation and execution (with and without pinned memory) go as follows. Note we can use gcc and that we need to link in the CPU BLAS and LAPACK since MAGMA uses both CPU and GPU for calculations (plus in this example I directly call BLAS and LAPACK functions).

gcc magmaExample.c -O3 -DADD_ -fopenmp  -DHAVE_CUBLAS -I/usr/local/cuda/include \
   -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma \
   -llapack -lblas -lcublas -o magmaExample
./magmaExample 1
./magmaExample 0

And here are (some of) the results:

Starting
Setting use_pinned to 1
====================================================
Timing results for n = 512
GPU memory allocation time: 0.001107
Transfer to GPU time: 0.000235
Matrix multiply time (GPU): 0.002965
Matrix multiply time (BLAS): 0.025640
Cholesky factorization time (GPU w/ GPU interface): 0.006872
Cholesky factorization time (GPU w/ CPU interface): 0.006856
Cholesky factorization time (LAPACK): 0.004908
Transfer from GPU time: 0.000252
====================================================
Timing results for n = 4096
GPU memory allocation time: 0.001535
Transfer to GPU time: 0.014882
Matrix multiply time (GPU): 1.471109
Matrix multiply time (BLAS): 9.641088
Cholesky factorization time (GPU w/ GPU interface): 0.303083
Cholesky factorization time (GPU w/ CPU interface): 0.316703
Cholesky factorization time (LAPACK): 1.537566
Transfer from GPU time: 0.016509
./magmaExample 0
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.004967
Transfer to GPU time: 0.063750
Matrix multiply time (GPU): 11.766860
Matrix multiply time (BLAS): 77.529439
Cholesky factorization time (GPU w/ GPU interface): 2.126884
Cholesky factorization time (GPU w/ CPU interface): 2.161343
Cholesky factorization time (LAPACK): 12.017636
Transfer from GPU time: 0.072997

Setting use_pinned to 0
====================================================
Timing results for n = 512
GPU memory allocation time: 0.002136
Transfer to GPU time: 0.001055
Matrix multiply time (GPU): 0.002969
Matrix multiply time (BLAS): 0.029986
Cholesky factorization time (GPU w/ GPU interface): 0.009177
Cholesky factorization time (GPU w/ CPU interface): 0.011693
Cholesky factorization time (LAPACK): 0.004929
Transfer from GPU time: 0.002238
====================================================
Timing results for n = 4096
GPU memory allocation time: 0.002929
Transfer to GPU time: 0.056978
Matrix multiply time (GPU): 1.471277
Matrix multiply time (BLAS): 9.951325
Cholesky factorization time (GPU w/ GPU interface): 0.308102
Cholesky factorization time (GPU w/ CPU interface): 0.356540
Cholesky factorization time (LAPACK): 1.551262
Transfer from GPU time: 0.136033
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.004951
Transfer to GPU time: 0.226058
Matrix multiply time (GPU): 11.767153
Matrix multiply time (BLAS): 78.473712
Cholesky factorization time (GPU w/ GPU interface): 2.125327
Cholesky factorization time (GPU w/ CPU interface): 2.286922
Cholesky factorization time (LAPACK): 12.059586
Transfer from GPU time: 0.545454

So we see decent speed-ups both for the matrix multiplication and the Cholesky factorization; however this is in comparison to a CPU calculation that only seems to use 2 of the 8 cores available.


Using the CPU interface seems to provide a modest speedup (compared to the manual transfer plus calculation time), as does using pinned memory. Note that the transfer time is non-negligible in some of these comparisons (particularly without pinned memory), though not in others.


5) Some Final Comments


5.1) Some Thoughts on Improving Computational Speed


Suchard et al. (2010; Journal of Computational and Graphical Statistics 19:419) and Lee et al. (2010; Journal of Computational and Graphical Statistics 19:769) talk about the use of GPUs for statistics. The speedups they see can get as high as 120 times and 500 times the speed of a single CPU core, respectively. Some possible reasons for these more impressive speedups relative to those seen here include:

  • Use of single precision floating point calculations. If single precision doesn't affect your calculation substantively, this is worth trying. Particularly on older GPUs (but perhaps still true), single precision was much faster than double precision. However, in tests in which I switched the kernelExample.cu and RNGexample.R/random.cu examples to single precision there was very little change in speed.
  • Careful use of shared memory (shared amongst the threads in a block) in place of the main GPU memory (see the Suchard et al. paper).
  • Computational tasks that are very arithmetically intensive but with limited memory access (see the Lee et al. paper).

So for some tasks and possibly with some additional coding effort, you may see speedups of 100-200 fold compared to a single CPU core, rather than the 40 fold speedup that is about the best seen in the demos here.


Finally, rather than bringing a large chunk of data back to the CPU, you might do a reduction/aggregation operation (e.g., summing over values) in GPU memory. To do this, here's a presentation that has some useful information.
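
As a sketch of the idea (my own illustration, not taken from that presentation), a basic block-level sum reduction uses shared memory so that each block writes a single partial sum to global memory; the partial sums, one per block, can then be combined in a second kernel launch or on the CPU:

extern "C"
__global__ void block_sum(double* vals, double* partialSums, int N) {
    // assumes a 1-d block whose size is a power of two, at most 1024
    __shared__ double buf[1024];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    buf[threadIdx.x] = (idx < N) ? vals[idx] : 0.0;
    __syncthreads();
    // tree-style reduction within the block, using the fast shared memory
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride)
            buf[threadIdx.x] += buf[threadIdx.x + stride];
        __syncthreads();
    }
    // thread 0 writes this block's partial sum to global memory
    if (threadIdx.x == 0)
        partialSums[blockIdx.x] = buf[0];
}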


5.2) A Comment on Compilation

  • If you compile CUDA code into an object file, you can link that with other object files (e.g., from C or C++ code) into an executable that can operate on the CPU and GPU. This also means you could compile a shared object (i.e., a library) that you could call from R with .C, .Call, or Rcpp.