├── README.md ├── RNGexample.R ├── build-bce-gpu.sh ├── calc_loglik.cu ├── cudaBlasExample.c ├── example ├── alphas.csv ├── compute_probs.cu ├── compute_probs_float.cu ├── compute_probs_unitStrides.cu ├── compute_probs_unitStrides_sharedMem.cu ├── compute_probs_unitStrides_sharedMem_float.cu ├── example_RCUDA.R ├── example_Rcpp.R ├── example_pureR.R └── setup_calc.R ├── gpu.Rmd ├── gpu.html ├── gpuArrayExample.py ├── helloWorld.cu ├── kernelExample-pinned.cu ├── kernelExample.R ├── kernelExample.cu ├── kernelExample.py ├── magmaExample.c ├── random.cu ├── savio-job-template.sh └── savio.sh /README.md: -------------------------------------------------------------------------------- 1 | # gpu-workshop-2016 2 | Materials for workshop on GPU computation for statistics, data science, machine learning applications. Please see gpu.html to be guided through the materials. 3 | 4 | Session 1: Monday, Feb. 1, 4:10 - 5:30 pm in Evans 1011 5 | * Introduction to GPU resources that are available (Savio, Amazon EC2) 6 | * Basics of using GPUs with C, R, and Python 7 | 8 | Session 2: Monday Feb. 8, 4:10 - 5:30 pm in Evans 1011 9 | * Use of packages such as Caffe, TensorFlow, etc. that use GPUs for 10 | back-end computation 11 | * Discussion of use cases by those using GPUs currently 12 | * Optimizing GPU usage 13 | 14 | The workshop will be an introduction to using GPUs and will assume no 15 | previous knowledge of GPUs. I will assume familiarity with either R, 16 | C, or Python and at least modest familiarity with operating in a UNIX 17 | environment. The goal is to get folks up to speed on using GPUs, and 18 | we'll cover basic techniques for using a GPU with R, C, and Python. 19 | 20 | -------------------------------------------------------------------------------- /RNGexample.R: -------------------------------------------------------------------------------- 1 | library(RCUDA) 2 | 3 | cat("Setting cuGetContext(TRUE)...\n") 4 | cuGetContext(TRUE) 5 | 6 | ptx = nvcc("random.cu", out = "random.ptx", target = "ptx", 7 | "-arch=compute_20", "-code=sm_20,compute_20") 8 | 9 | mod = loadModule(ptx) 10 | 11 | setup = mod$setup_kernel 12 | rnorm = mod$rnorm_kernel 13 | 14 | n = as.integer(1e8) # NOTE 'n' is of type integer 15 | n_per_thread = as.integer(1000) 16 | 17 | mu = 0.3 18 | sigma = 1.5 19 | 20 | verbose = FALSE 21 | 22 | # setting grid and block dimensions 23 | threads_per_block <- as.integer(1024) 24 | block_dims <- c(threads_per_block, as.integer(1), as.integer(1)) 25 | grid_d <- as.integer(ceiling(sqrt((n/n_per_thread)/threads_per_block))) 26 | 27 | grid_dims <- c(grid_d, grid_d, as.integer(1)) 28 | 29 | cat("Grid size:\n") 30 | print(grid_dims) 31 | 32 | nthreads <- as.integer(prod(grid_dims)*prod(block_dims)) 33 | cat("Total number of threads to launch = ", nthreads, "\n") 34 | if (nthreads*n_per_thread < n){ 35 | stop("Grid is not large enough...!") 36 | } 37 | 38 | cat("Running CUDA kernel...\n") 39 | 40 | seed = as.integer(0) 41 | 42 | 43 | tRNGinit <- system.time({ 44 | rng_states <- cudaMalloc(numEls=nthreads, sizeof=as.integer(48), elType="curandState") 45 | .cuda(setup, rng_states, seed, nthreads, as.integer(verbose), gridDim=grid_dims, blockDim=block_dims) 46 | cudaDeviceSynchronize() 47 | }) 48 | 49 | tAlloc <- system.time({ 50 | dX = cudaMalloc(n, sizeof = as.integer(8), elType = "double", strict = TRUE) 51 | cudaDeviceSynchronize() 52 | }) 53 | 54 | tCalc <- system.time({ 55 | .cuda(rnorm, rng_states, dX, n, mu, sigma, n_per_thread, gridDim=grid_dims, blockDim=block_dims,.numericAsDouble 
= getOption("CUDA.useDouble", TRUE)) 56 | cudaDeviceSynchronize() 57 | }) 58 | 59 | tTransferFromGPU <- system.time({ 60 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "double") 61 | cudaDeviceSynchronize() 62 | }) 63 | 64 | 65 | tCPU <- system.time({ 66 | out2 <- rnorm(n, mu, sigma) 67 | }) 68 | 69 | 70 | 71 | cat("RNG initiation time: ", tRNGinit[3], "\n") 72 | cat("GPU memory allocation time: ", tAlloc[3], "\n") 73 | cat("Calculation time (GPU): ", tCalc[3], "\n") 74 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 75 | cat("Calculation time (CPU): ", tCPU[3], "\n") 76 | 77 | -------------------------------------------------------------------------------- /build-bce-gpu.sh: -------------------------------------------------------------------------------- 1 | # start BCE-2015-fall from AWS console on a g2.2xlarge 2 | # $0.65/hour; 4 Gb video RAM, 1536 CUDA cores 3 | 4 | # make sure to increase space for home directory by requesting more when start instance, e.g. 30 Gb 5 | 6 | # set variable holding IP address 7 | # export ip=52-32-169-154 8 | 9 | # ssh to the Amazon instance 10 | # ssh -i ~/.ssh/ec2_rsa ubuntu@ec2-${ip}.us-west-2.compute.amazonaws.com 11 | 12 | sudo su 13 | 14 | # install CUDA 15 | wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1504/x86_64/cuda-repo-ubuntu1504_7.5-18_amd64.deb 16 | dpkg -i cuda-repo-ubuntu1504_7.5-18_amd64.deb 17 | 18 | apt-get update 19 | date >> /tmp/date 20 | apt-get install -y cuda # a bit less than 10 mins 21 | date >> /tmp/date 22 | 23 | rm -rf cuda-repo-ubuntu1504_7.5-18_amd64.deb 24 | 25 | 26 | # set up some utilities for monitoring the GPU 27 | echo "" >> ~ubuntu/.bashrc 28 | echo "export PATH=${PATH}:/usr/local/cuda/bin" >> ~ubuntu/.bashrc 29 | echo "" >> ~ubuntu/.bashrc 30 | echo "alias gtop=\"nvidia-smi -q -d UTILIZATION -l 1\"" >> ~ubuntu/.bashrc 31 | echo "" >> ~ubuntu/.bashrc 32 | echo "alias gmem=\"nvidia-smi -q -d MEMORY -l 1\"" >> ~ubuntu/.bashrc 33 | 34 | # set up access to CUDA shared libraries 35 | echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf 36 | ldconfig 37 | 38 | exit # back to ubuntu user 39 | 40 | # reboot the instance 41 | 42 | gtop 43 | 44 | # gtop result without reboot will error: 45 | #modprobe: ERROR: could not insert 'nvidia_352': No such device 46 | #NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running. 47 | 48 | # create deviceQuery executable 49 | sudo /usr/local/cuda/bin/nvcc /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery.cpp -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -o /usr/local/cuda/bin/deviceQuery 50 | 51 | deviceQuery 52 | 53 | # install PyCUDA 54 | pip install pycuda 55 | 56 | # install RCUDA 57 | sudo su 58 | 59 | cd /tmp 60 | git clone https://github.com/duncantl/RCUDA 61 | git clone https://github.com/omegahat/RAutoGenRunTime 62 | 63 | cd RCUDA/src 64 | ln -s ../../RAutoGenRunTime/src/RConverters.c . 65 | ln -s ../../RAutoGenRunTime/inst/include/RConverters.h . 66 | ln -s ../../RAutoGenRunTime/inst/include/RError.h . 67 | 68 | cd ../.. 
69 | 70 | Rscript -e "install.packages('bitops', repos = 'https://cran.cnr.berkeley.edu')" 71 | 72 | R CMD build RCUDA 73 | R CMD build RAutoGenRunTime 74 | R CMD INSTALL RAutoGenRunTime_0.3-0.tar.gz 75 | R CMD INSTALL RCUDA_0.4-0.tar.gz 76 | 77 | # install MAGMA 78 | export PATH=${PATH}:/usr/local/cuda/bin 79 | 80 | MAGMA_VERSION=1.7.0 81 | cd /usr/local 82 | mkdir magma-${MAGMA_VERSION} 83 | ln -s magma-${MAGMA_VERSION} magma 84 | cd /usr/src 85 | mkdir magma-${MAGMA_VERSION} 86 | ln -s magma-${MAGMA_VERSION} magma 87 | cd magma 88 | wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-${MAGMA_VERSION}.tar.gz 89 | tar -xvzf magma-${MAGMA_VERSION}.tar.gz 90 | cd magma-${MAGMA_VERSION} 91 | # note I added -fPIC per the magma README to enable creation of a shared object 92 | cp make.inc.openblas make.inc 93 | sed -i 's/-lopenblas/-llapack -lblas -lstdc++ -lm -lgfortran/' make.inc 94 | sed -i 's/#GPU_TARGET.*/GPU_TARGET = Kepler/' make.inc 95 | sed -i 's/.*(CUDADIR)\/lib64/LIBDIR\t\= -L$(CUDADIR)\/lib64/' make.inc 96 | sed -i 's/.*OPENBLASDIR.*//' make.inc 97 | sed -i 's/.*make.check-openblas.*//' make.inc 98 | # make NVCCFLAGS look like: 99 | # NVCCFLAGS = -O3 -DADD_ -Xcompiler "-fno-strict-aliasing $(FPIC)" 100 | 101 | export CUDADIR=/usr/local/cuda 102 | make shared 2>&1 | tee ../make.shared.log 103 | make test 2>&1 | tee ../make.test.log 104 | make install prefix=/usr/local/magma 2>&1 | tee ../make.install.log 105 | 106 | cd /usr/local/magma 107 | chmod ugo+r include/* 108 | 109 | echo "/usr/local/magma/lib" >> /etc/ld.so.conf.d/SITE-magma.conf 110 | ldconfig 111 | 112 | 113 | #### Create image ########################## 114 | 115 | # 1) now save the image in us-west-2 via point and click on VM page under Actions 116 | # 2) make it public 117 | 118 | -------------------------------------------------------------------------------- /calc_loglik.cu: -------------------------------------------------------------------------------- 1 | #define SQRT_TWO_PI 2.506628274631000 2 | extern "C" 3 | __global__ void calc_loglik(double* vals, int N, double mu, double sigma) { 4 | // note that this assumes no third dimension to the grid 5 | // id of the block 6 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 7 | // size of each block (within grid of blocks) 8 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 9 | // id of thread in a given block 10 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 11 | // assign overall id/index of the thread 12 | int idx = myblock * blocksize + subthread; 13 | 14 | if(idx < N) { 15 | double std = (vals[idx] - mu)/ sigma; 16 | double e = exp( - 0.5 * std * std); 17 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /cudaBlasExample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include "cublas_v2.h" 8 | 9 | // compile as: 10 | // export PATH=$PATH:/usr/local/cuda/bin 11 | // nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample 12 | 13 | 14 | double read_timer() { 15 | struct timeval end; 16 | gettimeofday( &end, NULL ); 17 | return end.tv_sec+1.e-6*end.tv_usec; 18 | } 19 | 20 | void fillMatrix( double *p, int n ) { 21 | int i; 22 | srand48(0); 23 | for( i = 0; i < n; i++ ) 24 | p[i] = 2*drand48()-1; 25 | } 26 | 27 | int main( int argc, char **argv ) { 28 | printf("Starting\n"); 29 | 
int size; 30 | cudaError_t cudaStat; 31 | cublasStatus_t stat; 32 | cublasHandle_t handle; 33 | int it; 34 | 35 | cublasOperation_t N = 'N'; 36 | cublasOperation_t T = 'T'; 37 | double one = 1., zero=0.; 38 | 39 | for( size = 512; size <= 8192; size*=4 ) { 40 | 41 | // allocate memory on host (CPU) 42 | double *A = (double*) malloc( sizeof(double)*size*size ); 43 | double *B = (double*) malloc( sizeof(double)*size*size ); 44 | 45 | cudaDeviceSynchronize(); 46 | double tInit = read_timer(); 47 | 48 | double *dA,*dB; 49 | // allocate memory on device (GPU) 50 | cudaStat = cudaMalloc((void**)&dA, sizeof(double)*size*size); 51 | if(cudaStat != cudaSuccess) { 52 | printf ("device memory allocation failed"); 53 | return EXIT_FAILURE; 54 | } 55 | cudaStat = cudaMalloc((void**)&dB, sizeof(double)*size*size); 56 | if(cudaStat != cudaSuccess) { 57 | printf ("device memory allocation failed"); 58 | return EXIT_FAILURE; 59 | } 60 | 61 | // wait until previous CUDA commands on GPU threads have finished 62 | // this allows us to do the timing correctly 63 | cudaDeviceSynchronize(); 64 | 65 | double tAlloc = read_timer(); 66 | 67 | 68 | // initialization of CUBLAS 69 | stat = cublasCreate(&handle); 70 | if(stat != CUBLAS_STATUS_SUCCESS) { 71 | printf ("CUBLAS initialization failed\n"); 72 | return EXIT_FAILURE; 73 | } 74 | 75 | // create our test matrix on the CPU 76 | fillMatrix(B, size*size); 77 | 78 | cudaDeviceSynchronize(); 79 | double tInit2 = read_timer(); 80 | 81 | 82 | // copy matrix to GPU, with dB the pointer to the object on the GPU 83 | stat = cublasSetMatrix (size, size, sizeof(double), B, size, dB, size); 84 | if(stat != CUBLAS_STATUS_SUCCESS) { 85 | printf ("data download failed"); 86 | cudaFree (dB); 87 | cublasDestroy(handle); 88 | return EXIT_FAILURE; 89 | } 90 | 91 | cudaDeviceSynchronize(); 92 | double tTransferToGPU = read_timer(); 93 | 94 | // call cublas matrix multiply (dA = dB * dB) 95 | cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &one, dB, size, dB, size, &zero, dA, size ); 96 | 97 | cudaDeviceSynchronize(); 98 | double tMatMult = read_timer(); 99 | 100 | // transfer matrix back to CPU 101 | stat = cublasGetMatrix (size, size, sizeof(double), dA, size, A, size); 102 | if(stat != CUBLAS_STATUS_SUCCESS) { 103 | printf ("data upload failed"); 104 | cudaFree(dA); 105 | cublasDestroy(handle); 106 | return EXIT_FAILURE; 107 | } 108 | 109 | cudaDeviceSynchronize(); 110 | double tTransferFromGPU = read_timer(); 111 | 112 | printf("====================================================\n"); 113 | printf("Timing results for n = %d\n", size); 114 | printf("GPU memory allocation time: %f\n", tAlloc - tInit); 115 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); 116 | printf("Matrix multiply time: %f\n", tMatMult - tTransferToGPU); 117 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tMatMult); 118 | 119 | 120 | // free memory on GPU and CPU 121 | cudaFree(dA); 122 | cudaFree(dB); 123 | cublasDestroy(handle); 124 | free(A); 125 | free(B); 126 | 127 | } 128 | return EXIT_SUCCESS; 129 | } 130 | -------------------------------------------------------------------------------- /example/compute_probs.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(double* alphas, double* rands, double* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | 6 | if(i < n) { 7 | double maxval; 8 | int 
m, k; 9 | int maxind; 10 | double M_d = (double) M; 11 | double* w = new double[K]; 12 | 13 | for(k = 0; k < K; ++k){ // initialize probs (though already done on CPU) 14 | probs[i*K + k] = 0.0; 15 | } 16 | 17 | // core computations 18 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 19 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 20 | w[k] = alphas[i*K + k] + rands[m*K + k]; 21 | } 22 | 23 | // determine which category has max W 24 | maxind = K-1; 25 | maxval = w[K-1]; 26 | for(k = 0; k < (K-1); ++k){ 27 | if(w[k] > maxval){ 28 | maxind = k; 29 | maxval = w[k]; 30 | } 31 | } 32 | probs[i*K + maxind] += 1.0; 33 | } 34 | 35 | // compute final proportions 36 | for(k = 0; k < K; ++k) { 37 | probs[i*K + k] /= M_d; 38 | } 39 | free(w); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /example/compute_probs_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(float* alphas, float* rands, float* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | 6 | if(i < n) { 7 | float maxval; 8 | int m, k; 9 | int maxind; 10 | float M_d = (float) M; 11 | float* w = new float[K]; 12 | 13 | for(k = 0; k < K; ++k){ // initialize probs (though already done on CPU) 14 | probs[i*K + k] = 0.0; 15 | } 16 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 17 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 18 | w[k] = alphas[i*K + k] + rands[m*K + k]; 19 | } 20 | 21 | // determine which category has max W 22 | maxind = K-1; 23 | maxval = w[K-1]; 24 | for(k = 0; k < (K-1); ++k){ 25 | if(w[k] > maxval){ 26 | maxind = k; 27 | maxval = w[k]; 28 | } 29 | } 30 | probs[i*K + maxind] += 1.0; 31 | } 32 | // compute final proportions 33 | for(k = 0; k < K; ++k) { 34 | probs[i*K + k] /= M_d; 35 | } 36 | free(w); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /example/compute_probs_unitStrides.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(double* alphas, double* rands, double* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | 6 | if(i < n) { 7 | double maxval; 8 | int m, k; 9 | int maxind; 10 | double M_d = (double) M; 11 | double* w = new double[K]; 12 | 13 | for(k = 0; k < K; ++k){ // initialize probs (though already done on CPU) 14 | probs[k*n + i] = 0.0; 15 | } 16 | 17 | // core computations 18 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 19 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 20 | // with +i we now have unit strides in inner loop 21 | w[k] = alphas[k*n + i] + rands[k*M + m]; 22 | } 23 | 24 | // determine which category has max W 25 | maxind = K-1; 26 | maxval = w[K-1]; 27 | for(k = 0; k < (K-1); ++k){ 28 | if(w[k] > maxval){ 29 | maxind = k; 30 | maxval = w[k]; 31 | } 32 | } 33 | probs[maxind*n + i] += 1.0; 34 | } 35 | 36 | // compute final proportions 37 | for(k = 0; k < K; ++k) { 38 | // unit strides 39 | probs[k*n + i] /= M_d; 40 | } 41 | free(w); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /example/compute_probs_unitStrides_sharedMem.cu: -------------------------------------------------------------------------------- 1 | extern 
"C" 2 | __global__ void compute_probs(double* alphas, double* rands, double* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | int threads_per_block = blockDim.x; 6 | 7 | // set up shared memory: half for probs and half for w 8 | extern __shared__ double shared[]; 9 | double* probs_shared = shared; 10 | double* w = &shared[K*threads_per_block]; // shared mem is one big block, so need to index into latter portion of it to use for w 11 | 12 | 13 | if(i < n) { 14 | double maxval; 15 | int m, k; 16 | int maxind; 17 | double M_d = (double) M; 18 | 19 | // initialize shared memory probs 20 | for(k = 0; k < K; ++k) { 21 | probs_shared[k*threads_per_block + threadIdx.x] = 0.0; 22 | } 23 | 24 | // core computation 25 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 26 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 27 | w[k*threads_per_block + threadIdx.x] = alphas[k*n + i] + rands[k*M + m]; 28 | } 29 | maxind = K-1; 30 | maxval = w[(K-1)*threads_per_block + threadIdx.x]; 31 | for(k = 0; k < (K-1); ++k){ 32 | if(w[k*threads_per_block + threadIdx.x] > maxval){ 33 | maxind = k; 34 | maxval = w[k*threads_per_block + threadIdx.x]; 35 | } 36 | } 37 | probs_shared[maxind*threads_per_block + threadIdx.x] += 1.0; 38 | } 39 | 40 | for(k = 0; k < K; ++k) { 41 | probs_shared[k*threads_per_block + threadIdx.x] /= M_d; 42 | } 43 | 44 | // copy to device memory so can be returned to CPU 45 | for(k = 0; k < K; ++k) { 46 | probs[k*n + i] = probs_shared[k*threads_per_block + threadIdx.x]; 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /example/compute_probs_unitStrides_sharedMem_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(float* alphas, float* rands, float* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | int threads_per_block = blockDim.x; 6 | 7 | // set up shared memory: half for probs and half for w 8 | extern __shared__ float shared[]; 9 | float* probs_shared = shared; 10 | float* w = &shared[K*threads_per_block]; // shared mem is one big block, so need to index into latter portion of it to use for w 11 | 12 | 13 | if(i < n) { 14 | float maxval; 15 | int m, k; 16 | int maxind; 17 | float M_d = (float) M; 18 | 19 | // initialize shared memory probs 20 | for(k = 0; k < K; ++k) { 21 | probs_shared[k*threads_per_block + threadIdx.x] = 0.0; 22 | } 23 | 24 | // core computations 25 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 26 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 27 | w[k*threads_per_block + threadIdx.x] = alphas[k*n + i] + rands[k*M + m]; 28 | } 29 | maxind = K-1; 30 | maxval = w[(K-1)*threads_per_block + threadIdx.x]; 31 | for(k = 0; k < (K-1); ++k){ 32 | if(w[k*threads_per_block + threadIdx.x] > maxval){ 33 | maxind = k; 34 | maxval = w[k*threads_per_block + threadIdx.x]; 35 | } 36 | } 37 | probs_shared[maxind*threads_per_block + threadIdx.x] += 1.0; 38 | } 39 | 40 | for(k = 0; k < K; ++k) { 41 | probs_shared[k*threads_per_block + threadIdx.x] /= M_d; 42 | } 43 | 44 | // copy to device memory so can be returned to CPU 45 | for(k = 0; k < K; ++k) { 46 | probs[k*n + i] = probs_shared[k*threads_per_block + threadIdx.x]; 47 | } 48 | } 49 | 50 | } 51 | 
-------------------------------------------------------------------------------- /example/example_RCUDA.R: -------------------------------------------------------------------------------- 1 | # modification of one of the RCUDA examples to use use double precision 2 | 3 | library(RCUDA) 4 | 5 | if(!exists('unitStrides') || is.null(unitStrides)) unitStrides <- FALSE 6 | if(!exists('sharedMem') || is.null(sharedMem)) sharedMem <- FALSE 7 | if(!exists('float') || is.null(float)) float <- FALSE 8 | 9 | M <- as.integer(1e4) # important to have as integer! 10 | 11 | # get the alphas and generate the random numbers 12 | source('setup_calc.R') 13 | 14 | cat("Setting cuGetContext(TRUE)...\n") 15 | cuGetContext(TRUE) 16 | 17 | # compile the kernel into a form that RCUDA can load; equivalent to this nvcc call: 18 | # system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o compute_probs.ptx compute_probs.cu") 19 | 20 | fn <- "compute_probs" 21 | if(unitStrides) fn <- paste0(fn, "_unitStrides") 22 | if(sharedMem) fn <- paste0(fn, "_sharedMem") 23 | if(float) fn <- paste0(fn, "_float") 24 | ptx = nvcc(file = paste0(fn, ".cu"), out = 'compute_probs.ptx', 25 | target = "ptx", "-arch=compute_20", "-code=sm_20,compute_20") 26 | 27 | mod = loadModule(ptx) 28 | compute_probs = mod$compute_probs 29 | 30 | # setting grid and block dimensions 31 | threads_per_block <- as.integer(192) 32 | if(sharedMem) threads_per_block <- as.integer(96) # need fewer threads so that have enough room in 48Kb of shared memory 33 | block_dims <- c(threads_per_block, as.integer(1), as.integer(1)) 34 | grid_d <- as.integer(ceiling(n/threads_per_block)) 35 | 36 | grid_dims <- c(grid_d, as.integer(1), as.integer(1)) 37 | 38 | cat("Grid size:\n") 39 | print(grid_dims) 40 | 41 | nthreads <- prod(grid_dims)*prod(block_dims) 42 | cat("Total number of threads to launch = ", nthreads, "\n") 43 | if (nthreads < n){ 44 | stop("Grid is not large enough...!") 45 | } 46 | 47 | cat("Running CUDA kernel...\n") 48 | 49 | if(unitStrides) { 50 | probs <- matrix(0, nrow = n, ncol = K) 51 | tmp <- matrix(0, nrow = n, ncol = K) 52 | rands <- t(rands) 53 | alphas <- t(alphas) 54 | } else { 55 | probs <- matrix(0, nrow = K, ncol = n) 56 | tmp <- matrix(0, nrow = K, ncol = n) 57 | } 58 | 59 | if(!float) { 60 | strict = TRUE # for double 61 | cuCtxSetSharedMemConfig("CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE") 62 | } else strict = FALSE 63 | 64 | sharedMemSize <- as.integer( 65 | ifelse(float, 4, 8)*K*threads_per_block*2 66 | ) 67 | 68 | if(sharedMem && sharedMemSize > 48000) stop("trying to use too much shared memory") 69 | 70 | # basic usage with manual transfer 71 | tTransferToGPU <- system.time({ 72 | devAlphas = copyToDevice(alphas, strict = strict) 73 | devRands = copyToDevice(rands, strict = strict) 74 | devProbs = copyToDevice(probs, strict = strict) 75 | cudaDeviceSynchronize() 76 | }) 77 | tCalc <- system.time({ 78 | if(float) { 79 | .cuda(compute_probs, devAlphas, devRands, devProbs, 80 | n, K, M, gridDim = grid_dims, blockDim = block_dims, sharedMemBytes = ifelse(sharedMem, sharedMemSize, as.integer(0))) 81 | } else 82 | .cuda(compute_probs, devAlphas, devRands, devProbs, 83 | n, K, M, gridDim = grid_dims, blockDim = block_dims, sharedMemBytes = ifelse(sharedMem, sharedMemSize, as.integer(0)), .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 84 | cudaDeviceSynchronize() 85 | }) 86 | tTransferFromGPU <- system.time({ 87 | out = copyFromDevice(obj = devProbs, nels = devProbs@nels, type = "double") 88 | cudaDeviceSynchronize() 89 | }) 90 | 
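# Added note (not part of the original script): copyFromDevice returns a flat vector;
# to inspect the results as a matrix, it can be reshaped to match the layout used on the GPU, e.g.
#   probsGPU <- if (unitStrides) matrix(out, nrow = n, ncol = K) else matrix(out, nrow = K, ncol = n)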
91 | 92 | cat("Input values: ", alphas[1:3], "\n") 93 | cat("Output values: ", out[1:3], "\n") 94 | 95 | cat("Transfer to GPU time: ", tTransferToGPU[3], "\n") 96 | cat("Calculation time (GPU): ", tCalc[3], "\n") 97 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /example/example_Rcpp.R: -------------------------------------------------------------------------------- 1 | M <- 1e4 2 | 3 | source('setup_calc.R') 4 | 5 | require(Rcpp) 6 | require(inline) 7 | 8 | cppFunction(' 9 | NumericMatrix compute_probs_mp(NumericMatrix alpha, NumericMatrix rands, int M, int n, int K, int nProc){ 10 | 11 | NumericMatrix probs(n, K); 12 | int i; 13 | 14 | omp_set_num_threads(nProc); 15 | 16 | #pragma omp parallel for 17 | for(i = 0; i < n; ++i){ 18 | double max; 19 | int m, k; 20 | int maxind; 21 | NumericVector w(K); 22 | 23 | for(k = 0; k < K; ++k){ 24 | probs(i,k) = 0.0; 25 | } 26 | 27 | // core computation 28 | for(m = 0; m < M; ++m){ 29 | for(k = 0; k < K; ++k){ 30 | w(k) = alpha(i, k) + rands(m, k); 31 | } 32 | maxind = K-1; 33 | max = w(K-1); 34 | for(k = 0; k < (K-1); ++k){ 35 | if(w(k) > max){ 36 | maxind = k; 37 | max = w(k); 38 | } 39 | } 40 | probs(i,maxind) += 1.0; 41 | } 42 | 43 | for(k = 0; k < K; ++k) { 44 | probs(i,k) /= M; 45 | } 46 | } 47 | return probs; 48 | } 49 | ', plugins = c("openmp"), includes = c('#include ')) 50 | 51 | alphas <- t(alphas) 52 | rands <- t(rands) 53 | 54 | # 47 sec for 10000 55 | system.time({ 56 | props1 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 1) 57 | }) 58 | # 11.9 sec for 10000 59 | system.time({ 60 | props2 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 4) 61 | }) 62 | # 6.0 sec for 10000 63 | system.time({ 64 | props3 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 8) 65 | }) 66 | 67 | # using transposed alpha,rands doesn't change things much: 68 | 69 | cppFunction(' 70 | NumericMatrix compute_probs_mp2(NumericMatrix alpha, NumericMatrix rands, int M, int n, int K, int nProc){ 71 | 72 | NumericMatrix probs(K, n); 73 | int i; 74 | 75 | omp_set_num_threads(nProc); 76 | 77 | #pragma omp parallel for 78 | for(i = 0; i < n; ++i){ 79 | double max; 80 | int k, m; 81 | int maxind; 82 | NumericVector w(K); 83 | 84 | for(k = 0; k < K; ++k){ 85 | probs(k,i) = 0.0; 86 | } 87 | for(m = 0; m < M; ++m){ 88 | for(k = 0; k < K; ++k){ 89 | w(k) = alpha(k,i) + rands(k, m); 90 | } 91 | maxind = K-1; 92 | max = w(K-1); 93 | for(k = 0; k < (K-1); ++k){ 94 | if(w(k) > max){ 95 | maxind = k; 96 | max = w(k); 97 | } 98 | } 99 | probs(maxind, i) += 1.0; 100 | } 101 | for(k = 0; k < K; ++k) { 102 | probs(k,i) /= M; 103 | } 104 | } 105 | return probs; 106 | } 107 | ', plugins = c("openmp"), includes = c('#include ')) 108 | 109 | alphas <- t(alphas) 110 | rands <- t(rands) 111 | 112 | # 50 sec. 
for 10000 113 | system.time({ 114 | props1 <- compute_probs_mp2(alphas, rands, M, n, K, nProc = 1) 115 | }) 116 | # 12.3 sec for 10000 117 | system.time({ 118 | props2 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 4) 119 | }) 120 | # 6.2 sec for 10000 121 | system.time({ 122 | props3 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 8) 123 | }) 124 | 125 | 126 | -------------------------------------------------------------------------------- /example/example_pureR.R: -------------------------------------------------------------------------------- 1 | M <- 1e3 2 | 3 | source('setup_calc.R') 4 | 5 | props2 <- props3 <- props 6 | 7 | # 2-3 sec per iteration for M = 1e6 8 | # 72 sec for M=1000 9 | system.time({ 10 | for(i in 1:n) { 11 | tmp <- alphas[ , i] + rands 12 | id <- apply(tmp, 2, which.max) 13 | tbl <- table(id) 14 | props[as.integer(names(tbl)) , i] <- tbl / n 15 | if(i %% 1000 == 0) print(c(i, date())) 16 | } 17 | }) 18 | 19 | # 57 sec for M=1000 20 | system.time({ 21 | for(i in 1:n) { 22 | tmp <- t(alphas[ , i] + rands) 23 | id <- rep(1, M) 24 | for(k in 2:K) { 25 | wh <- tmp[, k ] > tmp[ , 1 ] 26 | id[wh] <- k 27 | tmp[wh, 1] <- tmp[wh, k ] 28 | } 29 | tbl <- table(id) 30 | props2[as.integer(names(tbl)) , i] <- tbl / n 31 | if(i %% 1000 == 0) print(c(i, date())) 32 | } 33 | }) 34 | 35 | nProc <- 4 36 | 37 | library(doParallel) 38 | registerDoParallel(nProc) 39 | 40 | # 29 sec for M=1000, 4 cores 41 | # 14 sec for M=1000, 8 cores 42 | system.time({ 43 | props3 <- foreach(i = 1:n, .combine = cbind) %dopar% { 44 | tmp <- t(alphas[ , i] + rands) 45 | id <- rep(1, M) 46 | for(k in 2:K) { 47 | wh <- tmp[, k ] > tmp[ , 1 ] 48 | id[wh] <- k 49 | tmp[wh, 1] <- tmp[wh, k ] 50 | } 51 | tbl <- table(id) 52 | out <- rep(0, K) 53 | out[as.integer(names(tbl))] <- tbl / M 54 | if(i %% 1000 == 0) print(c(i, date())) 55 | out 56 | } 57 | }) 58 | -------------------------------------------------------------------------------- /example/setup_calc.R: -------------------------------------------------------------------------------- 1 | alphas <- t(as.matrix(read.csv('alphas.csv', header = FALSE))) 2 | 3 | n <- ncol(alphas) 4 | K <- nrow(alphas) 5 | 6 | props <- matrix(0, K, n) 7 | 8 | set.seed(0) 9 | 10 | system.time({ 11 | rands <- matrix(rnorm(M*K), nrow = K, ncol = M) 12 | }) 13 | 14 | -------------------------------------------------------------------------------- /gpu.Rmd: -------------------------------------------------------------------------------- 1 | Introduction to Computing with GPUs for Data Science 2 | ================================================================== 3 | Chris Paciorek, Statistical Computing Facility, Department of Statistics and Berkeley Research Computing, UC Berkeley 4 | 5 | Presented: February 1 and 8, 2016 6 | 7 | Last Revised: February 1, 2016 8 | 9 | 10 | ```{r setup, include=FALSE} 11 | opts_chunk$set(cache = TRUE) # because the compilation takes time, let's cache it 12 | ``` 13 | 14 | # 0) This Tutorial 15 | 16 | Materials for this tutorial, including the R markdown file that was used to create this document are available on github at [https://github.com/berkeley-scf/gpu-workshop-2016](https://github.com/berkeley-scf/gpu-workshop-2016). 
You can download the files by doing a git clone: 17 | ```{r clone, eval=FALSE, engine='bash'} 18 | git clone https://github.com/berkeley-scf/gpu-workshop-2016 19 | ``` 20 | 21 | To create this HTML document, simply compile the corresponding R Markdown file in R: 22 | ```{r rmd-compile, eval=FALSE} 23 | library(knitr) 24 | knit2html('gpu.Rmd') 25 | ``` 26 | 27 | or from the UNIX command line: 28 | ```{r rmd-compile-bash, engine='bash', eval=FALSE} 29 | Rscript -e "library(knitr); knit2html('gpu.Rmd')" 30 | ``` 31 | 32 | 33 | # 1) Introduction 34 | 35 | ### 1.1) Overview 36 | 37 | GPUs (Graphics Processing Units) are processing units originally designed for rendering graphics on a computer quickly. This is done by having a large number of simple processing units for massively parallel calculation. The idea of general purpose GPU (GPGPU) computing is to exploit this capability for general computation. 38 | 39 | We'll see both high-level and low-level ways to program calculations for implementation on the GPU. The basic context of GPU programming is "data parallelism", in which the same calculation is done to lots of pieces of data. This could be a mathematical calculation on millions of entries in a vector or a simulation with many independent simulations. Some examples of data parallelism include matrix multiplication (doing the multiplication task on many separate matrix elements) or numerical integration (doing a numerical estimate of the piecewise integral on many intervals/regions), as well as standard statistical calculations such as simulation studies, bootstrapping, random forests, etc. This kind of computation also goes by the name "SIMD" (single instruction, multiple data). 40 | 41 | ### 1.2) Hardware 42 | 43 | Two of the main suppliers of GPUs are NVIDIA and AMD. *CUDA* is a platform for programming on GPUs specifically for NVIDIA GPUs that allows you to send C/C++/Fortran code for execution on the GPU. *OpenCL* is an alternative that will work with a broader variety of GPUs. However, CUDA is quite popular, and there are a lot of tools designed for working with NVIDIA GPUs and based on CUDA, so we'll focus on CUDA here. 44 | 45 | GPUs have many processing units but somewhat limited memory. Also, they can only use data in their own memory, not in the CPU's memory, so one must transfer data back and forth between the CPU (the *host*) and the GPU (the *device*). This copying can, in some computations, constitute a very large fraction of the overall computation. So it is best to create the data and/or leave the data (for subsequent calculations) on the GPU when possible and to limit transfers. 46 | 47 | The current generation of NVIDIA GPUs is of the *Kepler* architecture (3rd generation). The 2nd generation was *Fermi* and the 1st was *Tesla*. (However note that *Tesla* is also used by NVIDIA to refer to different chip types). 48 | 49 | Originally GPUs supported only single precision (i.e., *float* calculations) but fortunately they now support double precision operations, and most of the examples here will use doubles to reduce the possibility of potential numerical issues, in particular with linear algebra calculations. But in many contexts, single precision will be fine, and the GPU will do computations more quickly with single precision. We'll explore this a bit later in the tutorial. 50 | 51 | ### 1.3) Software Tools 52 | 53 | Here are some of the useful software tools for doing computations on the GPU. 
54 | 55 | * CUDA - an extension of C/C++ for programming on an NVIDIA GPU 56 | * CUBLAS - a BLAS implementation for matrix-vector calculations on an NVIDIA GPU 57 | * CURAND - random number generation on an NVIDIA GPU 58 | * PyCUDA - a Python package providing a front-end for CUDA 59 | * RCUDA - an R package providing a front-end for CUDA 60 | * MAGMA - a package for combined CPU-GPU linear algebra, intended to be analogous to LAPACK + BLAS 61 | 62 | Note that RCUDA is still in development and is available on GitHub but not CRAN; it should be high-quality, as it is developed by Duncan Temple Lang at UC Davis. 63 | 64 | We'll see all of these in action. 65 | 66 | There are also: 67 | * OpenCL - an alternative to CUDA that can also be used with non-NVIDIA GPUs 68 | * CUDA Python (from Anaconda, but free for academic use) 69 | * PyOpenCL 70 | * R packages: OpenCL, gpuR, gmatrix, gputools 71 | * BIDMach - software for fast machine learning with a GPU back end available 72 | 73 | Finally, many of the popular machine learning packages focused on neural networks and deep learning can use GPUs behind the scenes; these include Theano, Caffe, Torch, TensorFlow, and mocha.jl, among others. 74 | 75 | Some of these, such as PyCUDA and RCUDA, allow you to easily interface to core CUDA code that you write yourself. Others, such as the other R packages and CUDA Python, allow you to program within R and Python but still use the GPU for some of the computation. Finally, tools such as the various machine learning packages hide the details of GPU usage from you and allow you to simply program in the environment of the software, with computations done on the GPU behind the scenes if a GPU is available. 76 | 77 | # 2) GPU hardware available at Berkeley 78 | 79 | ### 2.1) Department-specific GPUs 80 | 81 | #### Statistics 82 | 83 | The Statistical Computing Facility has a GPU on our high-priority cluster. We'll use this GPU in the demos here, though it is only available for Statistics affiliates. More details on using the GPU are available [here](http://statistics.berkeley.edu/computing/servers/gpu). 84 | 85 | #### Biostatistics 86 | 87 | Biostatistics has a GPU on one of its servers. Talk to Burke for more information. 88 | 89 | #### Economics 90 | 91 | The EML (Economics) has a GPU on one of the EML Linux servers that EML users can access. If this is of interest to you, email consult@econ.berkeley.edu, and I will work to get it set up analogously to the Statistics GPU and the Amazon virtual machine (see below) and to help you get started. 92 | 93 | 94 | ### 2.2) GPUs on the campus Linux cluster, Savio 95 | 96 | Savio recently purchased some nodes with GPUs. These are not yet available to the general public, but will soon be available to users affiliated with researchers who have purchased nodes on Savio and to users who are affiliated with faculty members using the faculty compute allowance. 97 | 98 | The general syntax for submitting a GPU-based job to Savio's SLURM-based scheduler is as follows. 99 | ``` 100 | sbatch -A account_name -p savio2_gpu -N 1 -t 60:0 job.sh 101 | ``` 102 | 103 | Alternatively, simply do `sbatch job.sh` and include the scheduling flags in your *job.sh*, as demonstrated in [savio-job-template.sh](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/savio-job-template.sh).
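For concreteness, a minimal *job.sh* along these lines might look like the sketch below. This is just an illustration, not the contents of *savio-job-template.sh*; substitute your own account name, time limit, and command (here the compiled *kernelExample* from Section 4.1).

```
#!/bin/bash
#SBATCH --job-name=gpu-example
#SBATCH --account=account_name
#SBATCH --partition=savio2_gpu
#SBATCH --nodes=1
#SBATCH --time=60:0
module unload intel
module load cuda
./kernelExample
```
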
104 | 105 | To figure out what to fill in for *account_name*, you can look up your accounts with 106 | ``` 107 | sacctmgr -p show associations user=${USER} 108 | ``` 109 | 110 | For an interactive session: 111 | ``` 112 | srun -A account_name --pty -p savio2_gpu -N1 -t 30:0 /bin/bash 113 | ``` 114 | 115 | Before doing any compilation involving CUDA code you generally want to change your environment modules: 116 | ``` 117 | module unload intel 118 | module load cuda 119 | ``` 120 | 121 | ### 2.3) GPUs through Amazon's EC2 service 122 | 123 | The *g2.2xlarge* Amazon EC2 instance types have a GPU with 1536 cores and 4 Gb memory, along with 8 CPU cores. There is also a *g2.8xlarge* that has four GPUs and 32 CPU cores. They can be pretty expensive unless you use spot instances - currently 65 cents per hour for g2.2xlarge and $2.60 per hour for g2.8xlarge in the us-west-2 region. The g2.2xlarge GPUs are pretty old chips, and I found that some of the examples included here ran a lot slower on the EC2 instance than on the Statistics GPU (and likely than Savio, but I haven't checked that as much). 124 | 125 | I've created an Amazon machine image (an AMI) that is the binary representation of the Linux Ubuntu operating system with support for GPU calculations. The AMI is based off of the [BCE virtual machine](bce.berkeley.edu) in use for a variety of projects and classes on campus. BCE provides a common set of software used in various data analysis/data science focused contexts, including Python and R. The BCE GPU AMI inherits this software and adds on various GPU-related software (in particular CUDA). Note also that the AMI is also similar to the SCF and EML Linux machines but with a reduced set of software. 126 | 127 | Based on the BCE-GPU AMI one can start up a virtual Linux machine that one can login to (see below for instructions) via SSH, just like any SCF/EML Linux server. If you were willing to pay Amazon and have an account, you can start a VM (in the Oregon [us-west-2] region) using the BCE GPU AMI by searching for *BCE-2015-fall-gpu* under "Public Images" at the [EC2 console](https://console.aws.amazon.com/ec2/v2/home?region=us-west-2#Images:). Then just launch a VM, selecting *g2.2xlarge* under the *GPU instances* tab. 128 | 129 | If you're interested in how to install CUDA-related software on an Ubuntu Linux machine, see [build-bce-gpu.sh](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/build-bce-gpu.sh) for the details of how I built the *BCE-2015-fall-gpu* image based on the *BCE-2015-fall* image. 130 | 131 | # 3) Some basics of GPU use 132 | 133 | ### 3.1) Getting information about the GPU 134 | 135 | First let's see how we get information about the GPU and activity on the GPU. 136 | 137 | #### Hardware specifications 138 | 139 | 140 | First, executing the following code as root will create an executable that will show you details on the GPU, including the possible block and grid dimensions (described shortly). 141 | 142 | ```{r deviceQuery, engine='bash', eval=FALSE} 143 | cd /usr/local/cuda/samples/1_Utilities/deviceQuery 144 | nvcc deviceQuery.cpp -I/usr/local/cuda/include \ 145 | -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery 146 | cd - 147 | ``` 148 | 149 | Once the *deviceQuery* executable is created, you can run it whenever you want. 150 | 151 | You'll see information such as the following. 152 | 153 | ``` 154 | paciorek@scf-sm20:~> deviceQuery 155 | deviceQuery Starting... 
156 | 157 | CUDA Device Query (Runtime API) version (CUDART static linking) 158 | 159 | Detected 1 CUDA Capable device(s) 160 | 161 | Device 0: "Tesla K20Xm" 162 | CUDA Driver Version / Runtime Version 7.0 / 7.0 163 | CUDA Capability Major/Minor version number: 3.5 164 | Total amount of global memory: 5760 MBytes (6039339008 bytes) 165 | (14) Multiprocessors, (192) CUDA Cores/MP: 2688 CUDA Cores 166 | GPU Max Clock rate: 732 MHz (0.73 GHz) 167 | Memory Clock rate: 2600 Mhz 168 | Memory Bus Width: 384-bit 169 | L2 Cache Size: 1572864 bytes 170 | Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) 171 | Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers 172 | Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers 173 | Total amount of constant memory: 65536 bytes 174 | Total amount of shared memory per block: 49152 bytes 175 | Total number of registers available per block: 65536 176 | Warp size: 32 177 | Maximum number of threads per multiprocessor: 2048 178 | Maximum number of threads per block: 1024 179 | Max dimension size of a thread block (x,y,z): (1024, 1024, 64) 180 | Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) 181 | Maximum memory pitch: 2147483647 bytes 182 | Texture alignment: 512 bytes 183 | Concurrent copy and kernel execution: Yes with 2 copy engine(s) 184 | Run time limit on kernels: No 185 | Integrated GPU sharing Host Memory: No 186 | Support host page-locked memory mapping: Yes 187 | Alignment requirement for Surfaces: Yes 188 | Device has ECC support: Enabled 189 | Device supports Unified Addressing (UVA): Yes 190 | Device PCI Domain ID / Bus ID / location ID: 0 / 2 / 0 191 | Compute Mode: 192 | < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > 193 | 194 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 1, Device0 = Tesla K20Xm 195 | Result = PASS 196 | ``` 197 | 198 | #### Observing Performance on the GPU 199 | 200 | The following command will allow you to see some information analogous to *top* on the CPU. 201 | 202 | ```{r gtop, engine='bash', eval=FALSE} 203 | nvidia-smi -q -d UTILIZATION -l 1 204 | ``` 205 | 206 | Here's some example output when the GPU is idle: 207 | 208 | ```{r gtop output, engine='bash', eval=FALSE} 209 | ==============NVSMI LOG============== 210 | 211 | Timestamp : Mon Jan 25 17:45:12 2016 212 | Driver Version : 346.46 213 | 214 | Attached GPUs : 1 215 | GPU 0000:02:00.0 216 | Utilization 217 | Gpu : 0 % 218 | Memory : 0 % 219 | Encoder : 0 % 220 | Decoder : 0 % 221 | 222 | ``` 223 | 224 | Memory use based on the above does not seem to actually indicate how much of the overall GPU memory is in use for some reason. 
225 | 226 | Instead, to see how much memory is used on the GPU, the following will work: 227 | 228 | ```{r gmem, engine='bash', eval=FALSE} 229 | nvidia-smi -q -d MEMORY -l 1 230 | ``` 231 | 232 | Here's some example output when not much memory is in use on the GPU: 233 | 234 | ```{r gmem-output, engine='bash', eval=FALSE} 235 | ==============NVSMI LOG============== 236 | 237 | Timestamp : Thu Jan 28 12:06:24 2016 238 | Driver Version : 346.46 239 | 240 | Attached GPUs : 1 241 | GPU 0000:02:00.0 242 | FB Memory Usage 243 | Total : 5759 MiB 244 | Used : 12 MiB 245 | Free : 5747 MiB 246 | BAR1 Memory Usage 247 | Total : 256 MiB 248 | Used : 2 MiB 249 | Free : 254 MiB 250 | ``` 251 | 252 | 253 | ### 3.2) Overview of computation on a GPU 254 | 255 | The basic series of operations to use a GPU when writing your own GPU code is: 256 | * allocate memory on the GPU 257 | * transfer data from CPU to GPU 258 | * launch the CUDA kernel to operate on the threads, with a given block/grid arrangement 259 | * (optionally) launch another kernel, which can access data stored on the GPU, including results from the previous kernel 260 | * transfer results back to CPU 261 | 262 | The key computations are done in the *kernel*. Kernels are functions that encode the core computational operations that are executed in parallel. The basic mode of operation with a GPU when you are writing your own GPU code is to write a kernel using CUDA code and then call the kernel in parallel via C, R, or Python code. 263 | 264 | As outlined above, we need to pass any data from the CPU to the GPU and do the same in reverse to get the result. We'll also need to allocate memory on the GPU. However, in some cases the transfer and allocation will be done automatically behind the scenes. 265 | 266 | ### 3.3) Threads, Blocks, and Grids 267 | 268 | Programming on a GPU (in particular programming for efficiency) requires some understanding of how parallelization works on the GPU. Each individual computation or series of computations on the GPU is done in a thread. Threads are organized into blocks, and blocks of threads are organized in a grid. The blocks and grids can be 1-, 2-, or 3-dimensional. E.g., you might have a 1-d block of 256 threads, with a grid of 3 x 3 such blocks, for a total of $256 \times 9 = 2304$ threads. The choice of the grid/block arrangement can affect efficiency. I'm not an expert at this level of detail, but we'll see a bit about this in the worked example. Note that using more than 1-dimensional grids and blocks is purely for the conceptual convenience of the programmer and doesn't correspond to anything on the hardware. So for the most part we'll use a one-dimensional grid of blocks and one-dimensional blocks of threads. 269 | In general you'd want each independent calculation done in a separate thread, though as we'll see in Section 5 on simulation, one might want to do a sequence of calculations on each thread. In general, you'll want to pipeline together multiple operations within a computation to avoid copying from CPU to GPU and back. Alternatively, this can be done by keeping the data on the GPU and calling a second kernel. 270 | 271 | Threads are quick to start, and to get efficiency you want to have thousands of threads to exploit the parallelism of the GPU hardware. In general your calculations will have more threads than GPU cores; the GPU will manage the process of executing all the threads. 272 | 273 | This can all get quite complicated, with the possibility for communication amongst threads.
Threads within a block have some (48Kb) of shared memory (distinct from the main GPU memory) and can synchronize with each other, while threads in different blocks cannot cooperate. We'll see some basic examples of this in our working example later. The Suchard et al. paper referenced in the last Section discusses how to get more efficiency by having threads within a block cooperate and access shared memory, which is much faster than accessing the main GPU (device) memory. 274 | 275 | If we go back to the *deviceQuery* output, we'll see information on the number of physical CUDA cores and main GPU memory as well as information about the maximum threads per block and the maximum dimensions of thread blocks and grids. 276 | 277 | ### 3.4) "Hello, world" using CUDA directly 278 | 279 | First let's see a 'Hello, World' example that illustrates blocks of threads and grids of blocks. 280 | 281 | The idea is to have at least as many threads as the number of computations you are doing. Our kernel function contains the core calculation we want to do (in this case printing 'Hello world!') and code that figures out identifying information for each thread as discussed next. 282 | 283 | When we write a kernel, we will need to have some initial code that determines a unique ID for that thread that allows the thread to access the appropriate part(s) of the data object(s) on the GPU and 'know' what part of the computation it should do. This is done based on information stored in variables that CUDA provides that have information about the thread and block indices and block and grid dimensions. 284 | 285 | Here's the [example code (helloWorld.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/helloWorld.cu). 286 | 287 | In this case, compilation is as follows. Given the CUDA functionality used in the code (in particular the call to *printf* within the kernel), we need to specify compilation for a *compute capability* >= 2.0 (corresponding to the Fermi generation of NVIDIA GPUs) (more below). Note that our query above indicated that the GPU we are using has capability 3.5, so this constraint is fine. 288 | 289 | ```{r, helloWorld-compile, engine='bash', eval=FALSE} 290 | nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld 291 | ``` 292 | 293 | The result of this looks like: 294 | ```{r, helloWorld-output, eval=FALSE, engine='bash'} 295 | Launching 20480 threads (N=20000) 296 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(448,0,0) => thread index=1984 297 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(449,0,0) => thread index=1985 298 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(450,0,0) => thread index=1986 299 | .... 300 | 301 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(220,0,0) => thread index=20188 302 | [### this thread would not be used for N=20000 ###] 303 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(221,0,0) => thread index=20189 304 | [### this thread would not be used for N=20000 ###] 305 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(222,0,0) => thread index=20190 306 | [### this thread would not be used for N=20000 ###] 307 | Hello world! 
My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(223,0,0) => thread index=20191 308 | [### this thread would not be used for N=20000 ###] 309 | kernel launch success! 310 | That's all! 311 | ``` 312 | 313 | Note that because of some buffering issues, with this many threads, we can't see the output for all of them, hence the *if* statement in the kernel code. It is possible to retrieve info about the limit and change the limit using *cudaDeviceGetLimit()* and *cudaDeviceSetLimit()*. 314 | 315 | ### 3.5) CUDA *compute capability* 316 | 317 | The *compute capability* basically refers to the evolving functionality of the NVIDIA architecture. Higher numbers provide more functionality but will only run on newer GPU hardware. 318 | 319 | For example, to use doubles rather than floats you need compute capability of at least 1.3. This required compute capability needs to be specified when you are compiling CUDA code. 320 | 321 | 322 | # 4) Executing kernels 323 | 324 | A note on the speed comparisons in the remaining section. These compare a fully serial CPU calculation on a single core to calculation on the GPU. On a multicore machine, we could speed up the CPU calculation by writing code to parallelize the calculation (e.g., via threading in C/openMP or various parallelization tools in R or Python). 325 | 326 | Also, note that in the various examples when I want to assess computational time, I make sure to synchronize all the threads via an appropriate function call. This ensures that all of the threads have finished their kernel calculations before I mark the end of the time interval. In general a function call to do a calculation on the GPU will simply start the calculation and then return, with the calculation continuing on the GPU. 327 | 328 | In this section, I'll demonstrate calling a kernel that simply computes the normal density function (PDF) on a vector of values in parallel, one value per thread. 329 | 330 | ### 4.1) Running a kernel from C/CUDA 331 | 332 | Now let's see our example implemented using CUDA code, including memory allocation on the GPU and transfer between the GPU and CPU. 333 | 334 | My kernel code allocates memory on the CPU and the device (GPU) memory and the kernel function uses the device memory for the alphas, random numbers, and the output values (the probability estimates). 335 | 336 | Note that here, I'll use 1024 threads per block and then a grid sufficiently large so that we have at least as many threads as computational chunks. 337 | 338 | Here's the [code (kernelExample.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample.cu). 339 | 340 | Compilation is as follows. 341 | 342 | ```{r, kernelExample-compile, engine='bash', eval=FALSE} 343 | nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 344 | ``` 345 | 346 | Here are some results: 347 | ```{r, kernelExample-output, eval=FALSE, engine='bash'} 348 | ==================================================== 349 | Grid dimension is 46 x 46 350 | Launching 2166784 threads (N=2097152) 351 | Input values: -0.658344 0.499804 -0.807257... 352 | Memory Copy from Host to Device successful. 353 | Memory Copy from Device to Host successful. 354 | Output values: 0.321214 0.352100 0.288007... 355 | Output values (CPU): 0.321214 0.352100 0.288007... 
356 | Timing results for n = 2097152 357 | Transfer to GPU time: 0.009988 358 | Calculation time (GPU): 0.000366 359 | Calculation time (CPU): 0.058541 360 | Transfer from GPU time: 0.001716 361 | Freeing memory... 362 | ==================================================== 363 | ... 364 | ... 365 | ==================================================== 366 | Grid dimension is 363 x 363 367 | Launching 134931456 threads (N=134217728) 368 | Input values: -0.658344 0.499804 -0.807257... 369 | Memory Copy from Host to Device successful. 370 | Memory Copy from Device to Host successful. 371 | Output values: 0.321214 0.352100 0.288007... 372 | Output values (CPU): 0.321214 0.352100 0.288007... 373 | Timing results for n = 134217728 374 | Transfer to GPU time: 0.638223 375 | Calculation time (GPU): 0.021684 376 | Calculation time (CPU): 3.470199 377 | Transfer from GPU time: 0.055798 378 | Freeing memory... 379 | ==================================================== 380 | ``` 381 | 382 | The speedup in pure computation time is very impressive (175x); surprisingly, when I did this same benchmark two years ago with the EC2 g2.2xlarge instance the speedup was 'only' 40x. However, importantly, we do see that the time for transferring to and from (particularly to) the GPU exceeds the calculation time, reinforcing the idea of keeping data on the GPU when possible. 383 | 384 | #### Using Pinned Memory 385 | 386 | Here's some code where we use pinned memory that is 'mapped' to the GPU such that the GPU directly accesses CPU memory. This can be advantageous if one exceeds the GPU's memory and, according to some sources, is best when you load the data only once. Another approach, using pinned but not mapped memory, allows for more efficient transfer but without the direct access from the GPU, with a hidden transfer done behind the scenes. This may be better if the data is loaded multiple times on the GPU. 387 | 388 | Here's the [code (kernelExample-pinned.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample-pinned.cu). 389 | 390 | 391 | Here are some results: 392 | ```{r, kernelExample-pinned-output, eval=FALSE, engine='bash'} 393 | 394 | ==================================================== 395 | Grid dimension is 46 x 46 396 | Launching 2166784 threads (N=2097152) 397 | Input values: -0.658344 0.499804 -0.807257... 398 | Output values: 0.321214 0.352100 0.288007... 399 | Output values (CPU): 0.321214 0.352100 0.288007... 400 | Timing results for n = 2097152 401 | Calculation time (GPU): 0.003245 402 | Calculation time (CPU): 0.058515 403 | Freeing memory... 404 | ==================================================== 405 | ... 406 | ... 407 | ==================================================== 408 | Grid dimension is 363 x 363 409 | Launching 134931456 threads (N=134217728) 410 | Input values: -0.658344 0.499804 -0.807257... 411 | Output values: 0.321214 0.352100 0.288007... 412 | Output values (CPU): 0.321214 0.352100 0.288007... 413 | Timing results for n = 134217728 414 | Calculation time (GPU): 0.187535 415 | Calculation time (CPU): 3.757175 416 | Freeing memory... 417 | ==================================================== 418 | ``` 419 | 420 | So using pinned mapped memory seems to help quite a bit in this case, as the total time with pinned memory is less than the time used for transfer plus calculation in the previous examples.
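To make the pattern concrete, here is a minimal, self-contained sketch of mapped pinned memory with a trivial kernel. This is only an illustration of the API calls involved, not the code in *kernelExample-pinned.cu*, which also does timing and error checking.

```
#include <stdio.h>

__global__ void square(double* x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = x[i] * x[i];
}

int main() {
    int n = 1 << 20;
    double *hostX, *devX;
    cudaSetDeviceFlags(cudaDeviceMapHost);              // enable mapped pinned memory
    cudaHostAlloc((void**) &hostX, n * sizeof(double),
                  cudaHostAllocMapped);                 // pinned CPU memory, visible to the GPU
    for (int i = 0; i < n; i++) hostX[i] = i;
    cudaHostGetDevicePointer((void**) &devX, hostX, 0); // device-side alias; no explicit cudaMemcpy
    square<<<(n + 255) / 256, 256>>>(devX, n);          // kernel reads/writes CPU memory directly
    cudaDeviceSynchronize();
    printf("%f\n", hostX[2]);                           // prints 4.000000
    cudaFreeHost(hostX);
    return 0;
}
```

The tradeoff is that every access from the kernel goes across the PCIe bus, so this pattern pays off mainly when each element is touched only once or twice, as discussed above.
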
### 4.2) Calling CUDA Kernels from R (RCUDA)

When we want to use CUDA from R, the kernel function will remain the same, but the pre- and post-processing is done in R rather than in C. Here's an example, with the same normal density kernel. The CUDA kernel code is saved in a [separate file (calc_loglik.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/calc_loglik.cu) but is identical to that in the full CUDA+C example above (with the exception that we need to wrap the kernel function in `extern "C"`).

Here's the [code (kernelExample.R on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample.R).

In this example we see that we can either transfer data between CPU and GPU manually or have RCUDA do it for us. If we didn't want to overwrite the input, but rather to allocate separate space for the output on the GPU, we could use *cudaMalloc()* (see the example in Section 5.2).

We need to compile the kernel into a ptx object file, either outside of R:

```{r, RCUDAexample-compile, engine='bash', eval=FALSE}
nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu
```

or inside of R:
```{r, RCUDAexample-compile-inR, engine='R', eval=FALSE}
ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', target = 'ptx', '-arch=compute_20', '-code=sm_20,compute_20')
```

Here are some results:
```{r RCUDAexample_output, eval=FALSE, engine='bash'}
Grid size:
[1] 363 363 1
Total number of threads to launch = 134931456
Running CUDA kernel...
Input values: 0.8966972 0.2655087 0.3721239
Output values: 0.2457292 0.2658912 0.2656543
Output values (implicit transfer): 0.2457292 0.2658912 0.2656543
Output values (CPU with R): 0.2457292 0.2658912 0.2656543
Transfer to GPU time: 0.702
Calculation time (GPU): 0.044
Transfer from GPU time: 0.489
Calculation time (CPU): 8.432
Combined calculation/transfer via .cuda time (GPU): 1.203
```

So the transfer time is again substantial in relative terms; without it, the speedup over the CPU calculation would be much larger.

We can avoid explicitly specifying block and grid dimensions by using the *gridBy* argument to *.cuda()*, with syntax as shown in *kernelExample.R*. For some reason that code is not working, though I have gotten it to work in other contexts.


WARNING #1: Be very careful that the types of the R objects passed to the kernel match what the kernel is expecting. Otherwise the code can hang without an informative error message.

WARNING #2: Note the use of the `strict=TRUE` argument when passing values to the GPU. This ensures that numeric values are kept as doubles and not coerced to floats.
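As mentioned above, the only change to the kernel code itself for use with RCUDA is the `extern "C"` wrapper, which keeps the compiler from mangling the kernel's name so that it can be looked up in the ptx module. Schematically (hypothetical kernel signature):

```{r, externC-sketch, engine='c', eval=FALSE}
// sketch: wrapping a kernel in extern "C" so its symbol keeps a plain C name
extern "C"
__global__ void calc_loglik(double *vals, int n, double mu, double sigma) {
    // ... same body as in the CUDA+C version of the kernel ...
}
```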
### 4.3) Calling CUDA Kernels from Python

With PyCUDA the kernel code can be directly embedded in the Python script. Otherwise it's fairly similar to the use of RCUDA. Here's the [code (kernelExample.py on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample.py).

Here are some results:
```{r PyCUDAexample_output, eval=FALSE, engine='bash'}
Generating random normals...
Running GPU code...
Time for calculation (GPU): 1.008687s
Running Scipy CPU code...
Time for calculation (CPU): 12.572273s
Output from GPU: 0.177782 0.224597 0.109604
Output from CPU: 0.177782 0.224597 0.109604
```

WARNING: As was the case with R, be careful that the types of the Python objects passed to the kernel match what the kernel is expecting.


# 5) Random Number Generation (RNG) on the GPU

RNG is done via the CURAND (CUDA Random Number Generation) library. CURAND provides several different generators, including the Mersenne Twister (the default in R).

### 5.1) Seeds and Sequences

From the CUDA documentation:

*For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches to avoid state setup time.*

That's a lot of important information; we'll interpret and implement much of it in the demo below.

Recall that RNG on a computer involves generation of pseudo-random numbers from a deterministic, periodic sequence. The seed determines where one starts generating from within that sequence. The idea of the sequence numbers is to generate from non-overlapping blocks within the sequence, with each thread getting a different block.



### 5.2) Calling CURAND via RCUDA

For RNG, we need a kernel to initialize the RNG on each thread and one to do the sampling (though they could be combined in a single kernel). Note that the time involved in initializing the RNG for each thread is substantial, but this shouldn't be a problem if one is doing a lot of calculations over time. To amortize this one-time expense, I generate multiple random numbers per thread. Here's the [kernel code (random.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/random.cu). The second argument to *curand_init* is the sequence number; by giving the threads contiguous sequence numbers, the starting position in the underlying sequence for a given thread is spaced $2^{67}$ values apart from the starting position for the next thread.

And here's the [R code (RNGexample.R on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/RNGexample.R) to call the kernel, which looks very similar to the RCUDA code we've already seen.
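To make the seed/sequence setup concrete, here's a simplified sketch of what such a pair of kernels looks like (hypothetical code, not the actual *random.cu*): one kernel initializes a `curandState` per thread using a common seed and per-thread sequence numbers, and a second kernel generates several normal draws per thread from its saved state.

```{r, curand-kernel-sketch, engine='c', eval=FALSE}
#include <curand_kernel.h>

// sketch: one RNG state per thread; common seed, per-thread sequence number
extern "C"
__global__ void setup_kernel(curandState *states, int seed, int nthreads) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if (idx < nthreads)
        curand_init(seed, idx, 0, &states[idx]);   // (seed, sequence number, offset, state)
}

// sketch: each thread generates n_per_thread normal draws from its own state
extern "C"
__global__ void rnorm_kernel(curandState *states, double *out, int n,
                             double mu, double sigma, int n_per_thread) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    curandState localState = states[idx];          // work with a local copy of the state
    for (int i = 0; i < n_per_thread; i++) {
        int pos = idx * n_per_thread + i;
        if (pos < n)
            out[pos] = mu + sigma * curand_normal_double(&localState);
    }
    states[idx] = localState;                      // save state for any subsequent kernel launches
}
```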
Here are some results:


```{r RNGexample-output, eval=FALSE, engine='bash'}
RNG initiation time: 0.062
GPU memory allocation time: 0.001
Calculation time (GPU): 0.228
Transfer from GPU time: 0.423
Calculation time (CPU): 7.292
```


We get a decent speedup, which would be more impressive if we could set up the calculations such that we don't need to transfer the whole large vector back to the CPU. Also, the code in *random.cu* uses non-unit strides and could probably be reworked for more efficient global memory access (see Section 7).

Also note the memory cost of the RNG states for the threads, 48 bytes per thread, which could easily exceed GPU memory if one starts up many threads.


At the moment, I'm not sure how to choose the RNG generator from within R.

### 5.3) Calling CURAND from C and from Python

I may flesh this out at some point, but by looking at the RNG example via RCUDA and the examples of calling kernels from C and Python in the previous section, it should be straightforward to do RNG on the GPU controlled by C or Python.

To choose the generator in C (in this case choosing the Mersenne Twister) this should work: `curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)`, where `gen` is a `curandGenerator_t`.

# 6) Using higher-level functionality to do linear algebra and vectorized operations on the GPU

The idea here is to use software that hides the details of the kernel implementation from us, relying on the expertise of others to efficiently code standard computations on the GPU.

We'll start with very high-level use of the GPU by simply calling linear algebra routines that use the GPU.


### 6.1) Using C to Call CUDABLAS and MAGMA

We can do linear algebra (and basic vectorized operations with vectors and matrices) using GPU implementations of BLAS/LAPACK type routines. Both CUDA (through CUDABLAS) and MAGMA provide access to BLAS functionality, but only MAGMA provides LAPACK-like functionality (i.e., matrix factorizations/decompositions).

We'll make CUDABLAS and MAGMA calls directly in C code. The MAGMA library provides a drop-in replacement for the functionality of BLAS and LAPACK that carries out linear algebra on both the CPU and GPU, choosing smartly where to do various aspects of the calculation. We'll now need to directly manage memory allocation on the GPU and transferring data back and forth from CPU to GPU.

#### CUDA and CUDABLAS

The code doesn't look too different than C code or calls to BLAS/LAPACK, but we use some CUDA functions and CUDA types. Here's the [example code (cudaBlasExample.c on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/cudaBlasExample.c).
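To give a flavor of the calls involved, here's a minimal sketch of a matrix multiplication via the cuBLAS v2 API (hypothetical helper function and variable names, not the actual *cudaBlasExample.c*):

```{r, cublas-sketch, engine='c', eval=FALSE}
#include <cuda_runtime.h>
#include <cublas_v2.h>

// sketch: C = A * B for n x n column-major matrices, computed on the GPU
void gpu_matmul(const double *A, const double *B, double *C, int n) {
    cublasHandle_t handle;
    cublasCreate(&handle);

    double *dA, *dB, *dC;
    cudaMalloc((void**) &dA, n * n * sizeof(double));
    cudaMalloc((void**) &dB, n * n * sizeof(double));
    cudaMalloc((void**) &dC, n * n * sizeof(double));

    cublasSetMatrix(n, n, sizeof(double), A, n, dA, n);    // host -> device
    cublasSetMatrix(n, n, sizeof(double), B, n, dB, n);

    double one = 1.0, zero = 0.0;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                &one, dA, n, dB, n, &zero, dC, n);
    cudaDeviceSynchronize();                               // make sure the multiply has finished

    cublasGetMatrix(n, n, sizeof(double), dC, n, C, n);    // device -> host
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
}
```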
Compilation goes as follows. Note that in this case nvcc does not want the file to have a .C or .cu extension.

```{r, cuda-compile, eval=FALSE, engine='bash'}
nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample
```

And here are (some of) the results:
```{r cudaBlas-example-output, eval=FALSE, engine='bash'}
Starting
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.001642
Matrix multiply time: 0.000481
Transfer from GPU time: 0.001550
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000276
Transfer to GPU time: 0.020364
Matrix multiply time: 0.015466
Transfer from GPU time: 0.015035
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000800
Transfer to GPU time: 0.325620
Matrix multiply time: 0.940571
Transfer from GPU time: 0.229997
```

For (rough) comparison, the $n=8192$ multiplication on the CPU (using *openBLAS* as the BLAS, called from R) takes 106 seconds with one core and 18 seconds with 8 cores.

#### MAGMA

Now let's see the use of [MAGMA](http://icl.cs.utk.edu/magma/). MAGMA provides calls analogous to those in CUDA/CUDABLAS for allocating memory, transferring data, and BLAS calls, as well as LAPACK-type calls.

Note that the LAPACK-type calls have a CPU interface and a GPU interface. The GPU interface calls have function names ending in '_gpu' and operate on data objects in GPU memory. The CPU interface calls operate on data objects in CPU memory, handling the transfer to GPU memory as part of the calculation.

Here we'll compare timing for the GPU vs. standard BLAS/LAPACK, as well as the CPU and GPU interfaces for the Cholesky factorization.

Here's the [example code (magmaExample.c on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/magmaExample.c).


Compilation and execution (with and without pinned memory) go as follows. Note we can use gcc and that we need to link in the CPU BLAS and LAPACK since MAGMA uses both CPU and GPU for calculations (plus in this example I directly call BLAS and LAPACK functions).
```{r magma-compile, eval=FALSE, engine='bash'}
gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include \
  -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma \
  -llapack -lblas -lcublas -lcudart -o magmaExample
./magmaExample 1
./magmaExample 0
```

And here are (some of) the results:
```{r magma-example-output, eval=FALSE, engine='bash'}
Starting
Setting use_pinned to 1
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.085331
Matrix multiply time (GPU): 0.000692
Matrix multiply time (BLAS): 0.049665
Cholesky factorization time (GPU w/ GPU interface): 0.023938
Cholesky factorization time (GPU w/ CPU interface): 0.004702
Cholesky factorization time (LAPACK): 0.006958
Transfer from GPU time: 0.000344
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000366
Transfer to GPU time: 0.005706
Matrix multiply time (GPU): 0.027141
Matrix multiply time (BLAS): 0.446544
Cholesky factorization time (GPU w/ GPU interface): 0.047918
Cholesky factorization time (GPU w/ CPU interface): 0.025746
Cholesky factorization time (LAPACK): 0.077203
Transfer from GPU time: 0.005030
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000789
Transfer to GPU time: 0.087303
Matrix multiply time (GPU): 1.766567
Matrix multiply time (BLAS): 23.807952
Cholesky factorization time (GPU w/ GPU interface): 0.230186
Cholesky factorization time (GPU w/ CPU interface): 0.259374
Cholesky factorization time (LAPACK): 4.179541
Transfer from GPU time: 0.079991

Setting use_pinned to 0
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000257
Transfer to GPU time: 0.086421
Matrix multiply time (GPU): 0.000655
Matrix multiply time (BLAS): 0.037689
Cholesky factorization time (GPU w/ GPU interface): 0.016963
Cholesky factorization time (GPU w/ CPU interface): 0.011957
Cholesky factorization time (LAPACK): 0.005600
Transfer from GPU time: 0.001391
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000369
Transfer to GPU time: 0.009003
Matrix multiply time (GPU): 0.027190
Matrix multiply time (BLAS): 0.514402
Cholesky factorization time (GPU w/ GPU interface): 0.039755
Cholesky factorization time (GPU w/ CPU interface): 0.037521
Cholesky factorization time (LAPACK): 0.081121
Transfer from GPU time: 0.013978
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.001062
Transfer to GPU time: 0.136131
Matrix multiply time (GPU): 1.775493
Matrix multiply time (BLAS): 24.222220
Cholesky factorization time (GPU w/ GPU interface): 0.224644
Cholesky factorization time (GPU w/ CPU interface): 0.400515
Cholesky factorization time (LAPACK): 4.183725
Transfer from GPU time: 0.204625
```

So we see decent speed-ups both for the matrix multiplication and the Cholesky factorization; the comparisons are with respect to 8 CPU cores.

Using the CPU interface seems to provide a modest speedup (compared to the manual transfer + calculation time), as does using pinned memory.

### 6.2) Using PyCUDA to do GPU calculations directly in Python

PyCUDA also provides high-level functionality for vectorized calculations on the GPU. Basically you create a vector stored in GPU memory and then operate on it with a variety of mathematical functions. The modules that do this are *gpuarray* and *cumath*.

Here's the [code (gpuArrayExample.py on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/gpuArrayExample.py)

Here are the timing results.
```{r gpuArrayExample-output, eval=FALSE, engine='bash'}
Transfer to GPU time: 0.639403s
Timing vectorized exponentiation:
GPU array calc time (initial): 0.276190s
GPU array calc time: 0.014222s
CPU calc time: 2.704504s
Timing vectorized dot product/sum of squares:
GPU array calc time (initial): 0.229969s
GPU array calc time: 0.007769s
CPU calc time: 0.071532s
```

So we see a good speedup for the vectorized exponentiation. However, there is some compilation that gets done when the code is run the first time that slows down the initial calculation. Also, again, the transfer of data to the GPU takes a chunk of time.

For the dot product, the speedup is not as impressive, probably because the aggregation that is needed to do the sum involves coordination across threads.

### 6.3) Using R packages to do vectorized operations and linear algebra on the GPU

Various R packages hide the details of the GPU implementation and allow you to do vector and matrix operations, including linear algebra, using standard R code. In some cases they overload the usual R functions such that you can simply call a function of the same name as in base R.

Some packages you might investigate include:

* HiPLARM (apparently this uses MAGMA behind the scenes)
* gpuR (uses openCL rather than CUDA)
* gmatrix
* gputools

# 7) An extended example of optimizing GPU kernel code

Here we'll implement a basic, but real computation that is a component of a larger collaboration I am engaged in. The basic context is understanding spatial variation in the species composition of forests in the eastern United States.
The data are multinomial samples of counts of trees of different species at many different spatial locations (i.e., observations). We fit a spatial version of a multicategory probit regression model.

In our coding, I'll compare a basic R implementation as well as a C++ implementation with various GPU implementations designed to improve the speed of the GPU calculation. I'll use R to manage the C++ and CUDA code (via *Rcpp* and *RCUDA*) but there's no reason one couldn't do this via Python or C/C++ on the front-end. Our main focus will be on the different CUDA implementations.

All of the implementations are in the [example directory](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example) in the repository.

### 7.1) Example: Probit regression probabilities

#### Probit regression basics

Consider probit regression, which is similar to logistic regression. The probability of a binary outcome is given as
$p = P(Y = 1) = \Phi(X\beta)$, where $\Phi()$ is the standard normal CDF.

The probit model can be rewritten in a latent variable representation that, in a Bayesian context, can facilitate MCMC computations to fit the model:
$$
Y = I(W > 0)
$$
$$
W \sim N(X\beta , 1)
$$

Suppose we know $\beta$. In order to determine $p$ we could use Monte Carlo simulation to estimate the integral
$P(Y = 1) = P(W > 0) = \int_0^\infty f(w) dw$, where $f$ is the density of $W$.

Now for probit regression, we could just use standard methods to compute this normal integral (i.e., the normal CDF). But for the multinomial extension we discuss next, we need Monte Carlo simulation.

#### Multinomial probit regression

Let $Y$ be a categorical variable, $Y \in \{1,2,\ldots,K\}$. Then a multinomial extension of the latent variable probit model is
$$
Y = {arg\ max}_k {W_k}
$$
$$
W_k \sim N(X\beta_k, 1)
$$

Now to compute $p = (P(Y=1), P(Y=2), \ldots, P(Y=K))$ we can again do Monte Carlo simulation. The basic steps are (see the code sketch at the end of this section):

- iterate m = 1, ..., M
    - for k = 1, ..., K, sample $W_k$ from its corresponding normal distribution
    - determine the arg max of the $W_k$'s
- over the $M$ simulations, count the number of times each category had the largest corresponding $W_k$

The proportion of times the category corresponded to the largest $W_k$ is an estimate of the multinomial proportions of interest.

For our example, we want to do this computation for large $M$ (to reduce Monte Carlo error) and for many observations with different $X$ values. In our code, we will assume that we are given a vector ($\alpha_i = \{X_i\beta_k\}_{k=1,\ldots,K}$) for each observation, $i$, resulting in an $n$ by $K$ matrix.

Finally, note that I can reuse the random numbers I need across the $n$ observations (in fact, this probably reduces Monte Carlo error in certain ways), so I just need an $M$ by $K$ matrix of standard normal random variables. Even for large $M$ this is not so big, and I'll simply generate the values once on the CPU.
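As a preview of the kernels discussed in the next sections, here's a minimal sketch (hypothetical and simplified, not the actual *compute_probs.cu*) of the per-observation Monte Carlo calculation that each GPU thread carries out, given the precomputed $\alpha$ values and the $M \times K$ matrix of standard normals:

```{r, mc-probit-sketch, engine='c', eval=FALSE}
// sketch: Monte Carlo estimate of the multinomial probit probabilities, one
// observation per thread (assumes K <= 21 and that the K values for each
// observation / each Monte Carlo sample are stored contiguously)
__global__ void compute_probs_sketch(double *alphas, double *rands, double *probs,
                                     int n, int K, int M) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // one observation per thread
    if (i >= n) return;

    double w[21];                                    // W values for the current sample
    for (int k = 0; k < K; k++)
        probs[i * K + k] = 0.0;

    for (int m = 0; m < M; m++) {
        int maxind = 0;
        for (int k = 0; k < K; k++) {
            w[k] = alphas[i * K + k] + rands[m * K + k];   // W_k ~ N(alpha_ik, 1)
            if (w[k] > w[maxind]) maxind = k;
        }
        probs[i * K + maxind] += 1.0;                // count the arg max category
    }
    for (int k = 0; k < K; k++)
        probs[i * K + k] /= M;                       // proportions estimate the probabilities
}
```

Note that with this layout consecutive threads access memory locations that are $K$ apart; that is exactly the memory-access issue taken up in Section 7.4.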
### 7.2) R and C baseline implementations

In [example_pureR.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_pureR.R) and [example_Rcpp.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_Rcpp.R) I've implemented the calculation for $n=26280$, $K=21$, and $M=10000$. I tried to write efficient vectorized R code and efficient C++ code (called from R, for convenience). I've also implemented parallel versions for both R and C++.

The pure R version takes about 570 seconds in serial and 140 seconds with eight cores.
The C++ version takes about 47 seconds in serial and 6 seconds with eight cores.

### 7.3) A basic (but thoughtful) implementation

[example_RCUDA.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_RCUDA.R) is the main R script that calls the different kernel variations as I experimented with different strategies for efficiency.

In [compute_probs.cu](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/compute_probs.cu) I make use of the already-computed random numbers and allocate a temporary vector *w* to hold the value of $w$ for the current Monte Carlo sample.

Some features of my code:

- It's generally recommended to have 128-256 threads per block, with the number a multiple of 32 (because threads operate in lock-step in 'warps' of 32 threads). So I'm using 192 threads per block.
- I then determine the number of blocks (of 192 threads each) that I need so I can have one thread for each of my $n$ observations.
- For this algorithm, as mentioned, I can reuse the random numbers across observations, so I don't generate them individually on the GPU.
- I haven't thought about locality of memory access (i.e., strides, row-major vs. column-major) in this version of the code.

Let's execute this:

```{r example-basic, eval=FALSE, engine='bash'}
cd example
Rscript example_RCUDA.R
```

This takes 12.1 seconds.

### 7.4) Accessing memory efficiently

Access to the device memory is slow (memory latency), but GPUs are good at switching between different threads while data is being retrieved from memory. Also, the GPU can access memory from consecutive memory locations efficiently and *coalesce* (combine) the memory accesses of groups of threads in a warp. Finally, threads in a warp execute in lock-step. The implication of all this is that we want the threads in a warp to retrieve contiguous values from the device memory. This means using a 'stride' of one when incrementing through a vector (analogous to moving along rows in a row-major matrix).

In the original code, I was striding through *alphas* and *probs* in strides of $K$. Thinking of the various matrices as having $K$ rows and being column-major, I was accessing values from adjacent columns on contiguous threads when I should have accessed values from adjacent rows.

Let's transpose the matrices sent to the GPU memory and access adjacent rows, i.e., strides of one, across contiguous threads, as shown in [compute_probs_unitStrides.cu](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/compute_probs_unitStrides.cu) (a schematic of the indexing change follows below).

```{r example-unitStrides, eval=FALSE, engine='bash'}
echo "unitStrides <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```

This takes 8.5 seconds, which is a nice speed-up for a simple change, but not earth-shattering.
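Concretely, the change is just in how thread $i$ indexes the matrices. A schematic, using the hypothetical names from the sketch in Section 7.1:

```{r, stride-sketch, engine='c', eval=FALSE}
// original layout: the K values for observation i are adjacent, so consecutive
// threads (observations i and i+1) touch locations K apart -- a stride of K
__device__ double alpha_strideK(const double *alphas, int i, int k, int K) {
    return alphas[i * K + k];
}

// transposed layout: the values for category k are adjacent across observations,
// so consecutive threads touch adjacent locations -- unit stride, which coalesces
__device__ double alpha_unitStride(const double *alphas, int i, int k, int n) {
    return alphas[k * n + i];
}
```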
### 7.5) Using shared memory (within a block)

Next let's consider whether it makes sense to move any data into shared memory, which can be accessed something like 100x as fast as device memory and functions like a programmer-managed cache. Shared memory is shared across all threads in a block. A couple of implications of this are:

* We need to be careful to do the indexing within blocks.
* We need to transfer any results out of shared memory in order to get them back to the CPU.
* We don't need the calculations synchronized across threads because each thread owns the calculations for a single observation; however, in other situations we might need to put a *barrier* in place that ensures all threads are finished with a particular calculation before any proceed to the next steps, using the *__syncthreads()* function.
* We only have 48 KB of shared memory per block (see the results of *deviceQuery*), so we need to make sure the number of threads per block is not so large as to exceed that. In this case, with 192 threads per block and $K=21$ values for each thread, we're over the maximum, so we need to go to 96 threads per block.

Here we notice that *w* and *probs* are accessed in device memory multiple times, and furthermore, *probs* is not even needed as an input, so let's try to manage these values in shared memory, as shown in [compute_probs_unitStrides_sharedMem.cu](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/compute_probs_unitStrides_sharedMem.cu).

```{r example-sharedMem, eval=FALSE, engine='bash'}
echo "unitStrides <- TRUE" > /tmp/tmp.R
echo "sharedMem <- TRUE" >> /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```

This takes 1.5 seconds, so we see a big improvement from using shared memory.

Surprisingly, using shared memory for access to *alphas* actually slowed things down two- to three-fold. I'm not sure why.

Finally, in some cases you can use shared memory to avoid non-unit strides. Here's an example of a [matrix transpose](http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/). Basically any non-unit striding is done only in shared memory. Reading from and writing to device memory is done using unit strides.


### 7.6) Using single precision (floats)

Traditionally GPU calculations have been done in single precision, and this can apparently be much faster than double precision calculations.

Here I get a roughly two- to three-fold speedup using floats rather than doubles, both for the original version of the code with non-unit strides and without shared memory (first example below) and for the optimized version of the code (second example below). As shown in the various "_float" kernel files, all I need to do is change "double" to "float". And when calling from R, there are some housekeeping items shown in [example_RCUDA.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_RCUDA.R).

```{r example-basic-float, eval=FALSE, engine='bash'}
echo "float <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```

```{r example-sharedMem-float, eval=FALSE, engine='bash'}
echo "float <- TRUE" > /tmp/tmp.R
echo "unitStrides <- TRUE" >> /tmp/tmp.R
echo "sharedMem <- TRUE" >> /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```
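To see how mechanical the change is, here's the single-precision version of the (hypothetical) normal density kernel sketched back in Section 4.1; doubles become floats and math calls switch to their float variants:

```{r, float-kernel-sketch, engine='c', eval=FALSE}
// sketch: single-precision version of the earlier dnorm_kernel sketch
__global__ void dnorm_kernel_float(float *vals, int n, float mu, float sigma) {
    int myblock = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = myblock * blockDim.x + threadIdx.x;
    if (idx < n) {
        float std = (vals[idx] - mu) / sigma;
        vals[idx] = expf(-0.5f * std * std) / (sigma * sqrtf(2.0f * 3.141593f));
    }
}
```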
### 7.7) Summary

For this example, here are the speeds, and the speed relative to the eight-core C++ implementation:

| Implementation | Time (sec.) | Speed (relative to C++) |
| ------------- | -------------:| -----:|
| R (8 cores) | 140 | 0.04 |
| C++ (8 cores) | 6.0 | 1.0 |
| basic CUDA | 12.1 | 0.5 |
| unit strides | 8.5 | 0.7 |
| shared memory | 1.5 | 4.0 |
| shared memory + floats | 0.6 | 10.7 |

Interestingly, on Savio the C++ time was 9.8 seconds, while the shared memory time was 0.67 seconds and the shared memory + floats time was 0.31 seconds.

# 8) Final Comments

### 8.1) Some Thoughts on Improving Computational Speed

[Suchard et al (2010; Journal of Computational and Graphical Statistics 19:419)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10016#.U2GTuBUgoWk) and [Lee et al (2010; Journal of Computational and Graphical Statistics 19:769)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10039#.U2GT9BUgoWk) talk about the use of GPUs for statistics. The speedups they see can get as high as 120 times and 500 times the speed of a single CPU core, respectively. Some of the reasons these speedups are so impressive (more so than some of the examples here) include:

* Use of single precision floating point calculations. If single precision doesn't affect your calculation substantively, this is worth trying. Particularly on older GPUs (but perhaps still true), single precision was much faster than double precision.

* Computational tasks that are very arithmetically intensive but with limited memory access (see the Lee et al. paper).

* Ensuring that contiguously-numbered threads access contiguous memory locations.

* Careful use of shared memory (shared amongst the threads in a block) in place of the main GPU memory (see the Suchard et al. paper); in particular this can avoid accessing non-contiguous memory.

* Avoiding conditional statements and synchronization/barriers, since threads operate in lock-step in groups of 32 threads (a 'warp').

So for some tasks, and likely involving additional coding effort, you may see speedups of 100-200 fold compared to a single CPU core.

Finally, rather than bringing a large chunk of data back to the CPU, you might do a reduction/aggregation operation (e.g., summing over values) in GPU memory. Here's a [presentation](http://will-landau.com/gpu/lectures/cudac-atomics/cudac-atomics.pdf) with some useful information on how to do this.

### 8.2) A Comment on Compilation

If you compile CUDA code into an object file, you can link that with other object files (e.g., from C or C++ code) into an executable that can operate on the CPU and GPU. This also means you could compile a shared object (i.e., a library) that you could call from R with .C, .Call, or Rcpp.
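For example, here's a sketch (hypothetical names; the kernel is assumed to be defined in the same file, e.g. the one sketched in Section 4.1) of a C-callable wrapper suitable for R's `.C()` interface:

```{r, shared-object-sketch, engine='c', eval=FALSE}
// sketch: a C-callable wrapper around a kernel launch, for use from R via .C()
// (compile with something like: nvcc -Xcompiler -fPIC --shared wrapper.cu -o wrapper.so)
extern "C" void gpu_dnorm(double *vals, int *n, double *mu, double *sigma) {
    double *dVals;
    int N = *n;
    cudaMalloc((void**) &dVals, N * sizeof(double));
    cudaMemcpy(dVals, vals, N * sizeof(double), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int nBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    dnorm_kernel<<<nBlocks, threadsPerBlock>>>(dVals, N, *mu, *sigma);
    cudaDeviceSynchronize();

    cudaMemcpy(vals, dVals, N * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(dVals);
}
```

From R one would then load the shared object with `dyn.load()` and call the wrapper via `.C('gpu_dnorm', x, as.integer(length(x)), mu, sigma)`.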
### 8.3) Some references

- The book *Parallel Computing for Data Science* by Norman Matloff has some useful introductory material.
- The [NVIDIA developer blog](http://devblogs.nvidia.com/parallelforall/) has a bunch of useful blog posts.
- [Suchard et al (2010; Journal of Computational and Graphical Statistics 19:419)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10016#.U2GTuBUgoWk)
- [Lee et al (2010; Journal of Computational and Graphical Statistics 19:769)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10039#.U2GT9BUgoWk)

Introduction to Computing with GPUs for Data Science

208 | 209 |

Chris Paciorek, Statistical Computing Facility, Department of Statistics and Berkeley Research Computing, UC Berkeley

210 | 211 |

Presented: February 1 and 8, 2016

212 | 213 |

Last Revised: February 1, 2016

214 | 215 |

0) This Tutorial

216 | 217 |

Materials for this tutorial, including the R markdown file that was used to create this document are available on github at https://github.com/berkeley-scf/gpu-workshop-2016. You can download the files by doing a git clone:

218 | 219 |
git clone https://github.com/berkeley-scf/gpu-workshop-2016
 220 | 
221 | 222 |

To create this HTML document, simply compile the corresponding R Markdown file in R:

223 | 224 |
library(knitr)
 225 | knit2html('gpu.Rmd')
 226 | 
227 | 228 |

or from the UNIX command line:

229 | 230 |
Rscript -e "library(knitr); knit2html('gpu.Rmd')"
 231 | 
232 | 233 |

1) Introduction

234 | 235 |

1.1) Overview

236 | 237 |

GPUs (Graphics Processing Units) are processing units originally designed for rendering graphics on a computer quickly. This is done by having a large number of simple processing units for massively parallel calculation. The idea of general purpose GPU (GPGPU) computing is to exploit this capability for general computation.

238 | 239 |

We'll see both high-level and low-level ways to program calculations for implementation on the GPU. The basic context of GPU programming is “data parallelism”, in which the same calculation is done to lots of pieces of data. This could be a mathematical calculation on millions of entries in a vector or a simulation with many independent simulations. Some examples of data parallelism include matrix multiplication (doing the multiplication task on many separate matrix elements) or numerical integration (doing a numerical estimate of the piecewise integral on many intervals/regions), as well as standard statistical calculations such as simulation studies, bootstrapping, random forests, etc. This kind of computation also goes by the name “SIMD” (single instruction, multiple data).

240 | 241 |

1.2) Hardware

242 | 243 |

Two of the main suppliers of GPUs are NVIDIA and AMD. CUDA is a platform for programming on GPUs specifically for NVIDIA GPUs that allows you to send C/C++/Fortran code for execution on the GPU. OpenCL is an alternative that will work with a broader variety of GPUs. However, CUDA is quite popular, and there are a lot of tools designed for working with NVIDIA GPUs and based on CUDA, so we'll focus on CUDA here.

244 | 245 |

GPUs have many processing units but somewhat limited memory. Also, they can only use data in their own memory, not in the CPU's memory, so one must transfer data back and forth between the CPU (the host) and the GPU (the device). This copying can, in some computations, constitute a very large fraction of the overall computation. So it is best to create the data and/or leave the data (for subsequent calculations) on the GPU when possible and to limit transfers.

246 | 247 |

The current generation of NVIDIA GPUs is of the Kepler architecture (3rd generation). The 2nd generation was Fermi and the 1st was Tesla. (However note that Tesla is also used by NVIDIA to refer to different chip types).

248 | 249 |

Originally GPUs supported only single precision (i.e., float calculations) but fortunately they now support double precision operations, and most of the examples here will use doubles to reduce the possibility of potential numerical issues, in particular with linear algebra calculations. But in many contexts, single precision will be fine, and the GPU will do computations more quickly with single precision. We'll explore this a bit later in the tutorial.

250 | 251 |

1.3) Software Tools

252 | 253 |

Here are some of the useful software tools for doing computations on the GPU.

254 | 255 |
    256 |
  • CUDA - an extension of C/C++ for programming on an NVIDIA GPU
  • 257 |
  • CUBLAS - a BLAS implementation for matrix-vector calculations on an NVIDIA GPU
  • 258 |
  • CURANDOM - random number generation on an NVIDIA GPU
  • 259 |
  • PyCUDA - a Python package providing a front-end for CUDA
  • 260 |
  • RCUDA - an R package providing a front-end for CUDA
  • 261 |
  • MAGMA - a package for combined CPU-GPU linear algebra, intended to be analogous to LAPACK + BLAS
  • 262 |
263 | 264 |

Note that RCUDA is still in development and is on Github but not CRAN, but should be high-quality as it is developed by Duncan Temple Lang at UC-Davis.

265 | 266 |

We'll see all of these in action.

267 | 268 |

There are also:

269 | 270 |
    271 |
  • openCL - an alternative to CUDA that can also be used with non-NVIDIA GPUs
  • 272 |
  • CUDA Python (from Anaconda, but free for academic use)
  • 273 |
  • PyOpenCL
  • 274 |
  • R packages: OpenCL, gpuR, gmatrix, gputools
  • 275 |
  • BIDMach - software for fast machine learning with a GPU back end available
  • 276 |
277 | 278 |

Finally, many of the popular machine learning packages focused on neural networks and deep learning can use GPUs behind the scenes; these include Theano, Caffe, Torch, Tensorflow, and mocha.jl, among others.

279 | 280 |

Some of these, such as PyCUDA and RCUDA allow you to easily interface to core CUDA code that you write yourself. Others, such as the other R packages and CUDA Python, allow you to program within R and Python but still use the GPU for some of the computation. Finally tools such as the various machine learning hide the details of the GPU usage from you and allow you to simply program in the environment of the software, with computations done on the GPU behind the scenes if a GPU is available.

281 | 282 |

2) GPU hardware available at Berkeley

283 | 284 |

2.1) Department-specific GPUs

285 | 286 |

Statistics

287 | 288 |

The Statistical Computing Facility has a GPU on our high-priority cluster. We'll use this GPU in the demos here, though it is only available for Statistics affiliates. More details on using the GPU are available here.

289 | 290 |

Biostatistics

291 | 292 |

Biostatistics has a GPU on one of its servers. Talk to Burke for more information.

293 | 294 |

Economics

295 | 296 |

The EML (Economics) has a GPU on one of the EML Linux servers that EML users can access. If this is of interest to you, email consult@econ.berkeley.edu, and I will work to get it set up analogously to the Statistics GPU and the Amazon virtual machine (see below) and to help you get started.

297 | 298 |

2.2) GPUs on the campus Linux cluster, Savio

299 | 300 |

Savio recently purchased some nodes with GPUs. These are not yet available to the general public, but will soon be available to users affiliated with researchers who have purchased nodes on Savio and to users who are affiliated with faculty members using the faculty compute allowance.

301 | 302 |

The general syntax for submitting a GPU-based job to Savio's SLURM based scheduler is as follows.

303 | 304 |
sbatch -A account_name -p savio2_gpu -N 1 -t 60:0 job.sh
 305 | 
306 | 307 |

Alternatively, simply do sbatch job.sh and include the scheduling flags in your job.sh, as demonstrated in savio-job-template.sh.

308 | 309 |

To figure out what to fill in for account_name, you can look up your accounts with

310 | 311 |
sacctmgr -p show associations user=${USER}
 312 | 
313 | 314 |

For an interactive session:

315 | 316 |
srun -A account_name --pty -p savio2_gpu -N1 -t 30:0 /bin/bash
 317 | 
318 | 319 |

Before doing any compilation involving CUDA code you generally want to change your environment modules:

320 | 321 |
module unload intel
 322 | module load cuda
 323 | 
324 | 325 |

2.3) GPUs through Amazon's EC2 service

326 | 327 |

The g2.2xlarge Amazon EC2 instance types have a GPU with 1536 cores and 4 Gb memory, along with 8 CPU cores. There is also a g2.8xlarge that has four GPUs and 32 CPU cores. They can be pretty expensive unless you use spot instances - currently 65 cents per hour for g2.2xlarge and $2.60 per hour for g2.8xlarge in the us-west-2 region. The g2.2xlarge GPUs are pretty old chips, and I found that some of the examples included here ran a lot slower on the EC2 instance than on the Statistics GPU (and likely than Savio, but I haven't checked that as much).

328 | 329 |

I've created an Amazon machine image (an AMI) that is the binary representation of the Linux Ubuntu operating system with support for GPU calculations. The AMI is based off of the BCE virtual machine in use for a variety of projects and classes on campus. BCE provides a common set of software used in various data analysis/data science focused contexts, including Python and R. The BCE GPU AMI inherits this software and adds on various GPU-related software (in particular CUDA). Note also that the AMI is also similar to the SCF and EML Linux machines but with a reduced set of software.

330 | 331 |

Based on the BCE-GPU AMI one can start up a virtual Linux machine that one can login to (see below for instructions) via SSH, just like any SCF/EML Linux server. If you were willing to pay Amazon and have an account, you can start a VM (in the Oregon [us-west-2] region) using the BCE GPU AMI by searching for BCE-2015-fall-gpu under “Public Images” at the EC2 console. Then just launch a VM, selecting g2.2xlarge under the GPU instances tab.

332 | 333 |

If you're interested in how to install CUDA-related software on an Ubuntu Linux machine, see build-bce-gpu.sh for the details of how I built the BCE-2015-fall-gpu image based on the BCE-2015-fall image.

334 | 335 |

3) Some basics of GPU use

336 | 337 |

3.1) Getting information about the GPU

338 | 339 |

First let's see how we get information about the GPU and activity on the GPU.

340 | 341 |

Hardware specifications

342 | 343 |

First, executing the following code as root will create an executable that will show you details on the GPU, including the possible block and grid dimensions (described shortly).

344 | 345 |
cd  /usr/local/cuda/samples/1_Utilities/deviceQuery
 346 | nvcc deviceQuery.cpp -I/usr/local/cuda/include \
 347 |    -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery
 348 | cd -
 349 | 
350 | 351 |

Once the deviceQuery executable is created, you can run it whenever you want.

352 | 353 |

You'll see information such as the following.

354 | 355 |
paciorek@scf-sm20:~> deviceQuery
 356 | deviceQuery Starting...
 357 | 
 358 |  CUDA Device Query (Runtime API) version (CUDART static linking)
 359 | 
 360 | Detected 1 CUDA Capable device(s)
 361 | 
 362 | Device 0: "Tesla K20Xm"
 363 |   CUDA Driver Version / Runtime Version          7.0 / 7.0
 364 |   CUDA Capability Major/Minor version number:    3.5
 365 |   Total amount of global memory:                 5760 MBytes (6039339008 bytes)
 366 |   (14) Multiprocessors, (192) CUDA Cores/MP:     2688 CUDA Cores
 367 |   GPU Max Clock rate:                            732 MHz (0.73 GHz)
 368 |   Memory Clock rate:                             2600 Mhz
 369 |   Memory Bus Width:                              384-bit
 370 |   L2 Cache Size:                                 1572864 bytes
 371 |   Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
 372 |   Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
 373 |   Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
 374 |   Total amount of constant memory:               65536 bytes
 375 |   Total amount of shared memory per block:       49152 bytes
 376 |   Total number of registers available per block: 65536
 377 |   Warp size:                                     32
 378 |   Maximum number of threads per multiprocessor:  2048
 379 |   Maximum number of threads per block:           1024
 380 |   Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
 381 |   Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
 382 |   Maximum memory pitch:                          2147483647 bytes
 383 |   Texture alignment:                             512 bytes
 384 |   Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
 385 |   Run time limit on kernels:                     No
 386 |   Integrated GPU sharing Host Memory:            No
 387 |   Support host page-locked memory mapping:       Yes
 388 |   Alignment requirement for Surfaces:            Yes
 389 |   Device has ECC support:                        Enabled
 390 |   Device supports Unified Addressing (UVA):      Yes
 391 |   Device PCI Domain ID / Bus ID / location ID:   0 / 2 / 0
 392 |   Compute Mode:
 393 |      < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
 394 | 
 395 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 1, Device0 = Tesla K20Xm
 396 | Result = PASS
 397 | 
398 | 399 |

Observing Performance on the GPU

400 | 401 |

The following command will allow you to see some information analogous to top on the CPU.

402 | 403 |
nvidia-smi -q -d UTILIZATION -l 1
 404 | 
405 | 406 |

Here's some example output when the GPU is idle:

407 | 408 |
==============NVSMI LOG==============
 409 | 
 410 | Timestamp                           : Mon Jan 25 17:45:12 2016
 411 | Driver Version                      : 346.46
 412 | 
 413 | Attached GPUs                       : 1
 414 | GPU 0000:02:00.0
 415 |     Utilization
 416 |         Gpu                         : 0 %
 417 |         Memory                      : 0 %
 418 |         Encoder                     : 0 %
 419 |         Decoder                     : 0 %
 420 | 
 421 | 
422 | 423 |

Memory use based on the above does not seem to actually indicate how much of the overall GPU memory is in use for some reason.

424 | 425 |

Instead, to see how much memory is used on the GPU, the following will work:

426 | 427 |
nvidia-smi -q -d MEMORY -l 1
 428 | 
429 | 430 |

Here's some example output when not much memory is in use on the GPU:

431 | 432 |
==============NVSMI LOG==============
 433 | 
 434 | Timestamp                           : Thu Jan 28 12:06:24 2016
 435 | Driver Version                      : 346.46
 436 | 
 437 | Attached GPUs                       : 1
 438 | GPU 0000:02:00.0
 439 |     FB Memory Usage
 440 |         Total                       : 5759 MiB
 441 |         Used                        : 12 MiB
 442 |         Free                        : 5747 MiB
 443 |     BAR1 Memory Usage
 444 |         Total                       : 256 MiB
 445 |         Used                        : 2 MiB
 446 |         Free                        : 254 MiB
 447 | 
448 | 449 |

3.2) Overview of computation on a GPU

450 | 451 |

The basic series of operations to use a GPU when writing your own GPU code is:

452 | 453 |
    454 |
  • allocate memory on the GPU
  • 455 |
  • transfer data from CPU to GPU
  • 456 |
  • launch the CUDA kernel to operate on the threads, with a given block/grid arrangement
  • 457 |
  • (optionally) launch another kernel, which can access data stored on the GPU, including results from the previous kernel
  • 458 |
  • transfer results back to CPU
  • 459 |
460 | 461 |

The key computations are done in the kernel. Kernels are functions that encode the core computational operations that are executed in parallel. The basic mode of operation with a GPU when you are writing your own GPU code is to write a kernel using CUDA code and then call the kernel in parallel via C, R, or Python code.

462 | 463 |

As outlined above, we need to pass any data from the CPU to the GPU and do the same in reverse to get the result. We'll also need to allocate memory on the GPU. However in some cases the transfer and allocation will be done automatically behind the scenes.

464 | 465 |

3.3) Threads, Blocks, and Grids

466 | 467 |

Programming on a GPU (in particular programming for efficiency) requires some understanding of how parallelization works on the GPU. Each individual computation or series of computations on the GPU is done in a thread. Threads are organized into blocks and blocks of threads are organized in a grid. The blocks and grids can be 1-, 2-, or 3-dimensional. E.g., you might have a 1-d block of 256 threads, with a grid of 3 x 3 such blocks, for a total of \(256 \times 9 = 2304\) threads. The choice of the grid/block arrangement can affect efficiency. I'm not an expert at this level of detail but we'll see some about this in the worked example. Note that using more than 1-dimensional grids and blocks is purely for the conceptual convenience of the programmer and doesn't correspond to anything on the hardware. So for the most part we'll use a one-dimensional grid of blocks and a one-dimensional blocks of threads. 468 | In general you'd want each independent calculation done in a separate thread, though as we'll see in Section 5 on simulation, one might want to do a sequence of calculations on each thread. In general, you'll want to pipeline together multiple operations within a computation to avoid copying from CPU to GPU and back. Alternatively, this can be done by keeping the data on the GPU and calling a second kernel.

469 | 470 |

Threads are quick to start, and to get efficiency you want to have thousands of threads to exploit the parallelism of the GPU hardware. In general your calculations will have more threads than GPU cores; the GPU will manage the process of executing all the threads.

471 | 472 |

This can all get quite complicated, with the possibility for communication amongst threads. Threads within a block have some (48Kb) of shared memory (distinct from the main GPU memory) and can synchronize with each other, while threads in different blocks cannot cooperate. We'll see some basic examples of this in our working example later. The Suchard et al. paper referenced in the last Section discusses how to get more efficiency by having threads within a block cooperate and access shared memory, which is much faster than accessing the main GPU (device) memory.

473 | 474 |

If we go back to the deviceQuery output, we'll see information on the number of physical CUDA cores and main GPU memory as well as information about the maximum threads per block and the maximum dimensions of thread blocks and grids.

475 | 476 |

3.4) “Hello, world” using CUDA directly

477 | 478 |

First let's see a 'Hello, World' example that illustrates blocks of threads and grids of blocks.

479 | 480 |

The idea is to have at least as many threads as the number of computations you are doing. Our kernel function contains the core calculation we want to do (in this case printing 'Hello world!') and code that figures out identifying information for each thread as discussed next.

481 | 482 |

When we write a kernel, we will need to have some initial code that determines a unique ID for that thread that allows the thread to access the appropriate part(s) of the data object(s) on the GPU and 'know' what part of the computation it should do. This is done based on information stored in variables that CUDA provides that have information about the thread and block indices and block and grid dimensions.

483 | 484 |

Here's the example code (helloWorld.cu on the github repo).

485 | 486 |

In this case, compilation is as follows. Given the CUDA functionality used in the code (in particular the call to printf within the kernel), we need to specify compilation for a compute capability >= 2.0 (corresponding to the Fermi generation of NVIDIA GPUs) (more below). Note that our query above indicated that the GPU we are using has capability 3.5, so this constraint is fine.

487 | 488 |
nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld
 489 | 
490 | 491 |

The result of this looks like:

492 | 493 |
Launching 20480 threads (N=20000)
 494 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(448,0,0) => thread index=1984
 495 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(449,0,0) => thread index=1985
 496 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(450,0,0) => thread index=1986
 497 | ....
 498 | 
 499 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(220,0,0) => thread index=20188 
 500 | [### this thread would not be used for N=20000 ###]
 501 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(221,0,0) => thread index=20189 
 502 | [### this thread would not be used for N=20000 ###]
 503 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(222,0,0) => thread index=20190 
 504 | [### this thread would not be used for N=20000 ###]
 505 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(223,0,0) => thread index=20191 
 506 | [### this thread would not be used for N=20000 ###]
 507 | kernel launch success!
 508 | That's all!
 509 | 
510 | 511 |

Note that because of some buffering issues, with this many threads, we can't see the output for all of them, hence the if statement in the kernel code. It is possible to retrieve info about the limit and change the limit using cudaDeviceGetLimit() and cudaDeviceSetLimit().

512 | 513 |

3.5) CUDA compute capability

514 | 515 |

The compute capability basically refers to the evolving functionality of the NVIDIA architecture. Higher numbers provide more functionality but will only run on newer GPU hardware.

516 | 517 |

For example, to use doubles rather than floats you need compute capability of at least 1.3. This required compute capability needs to be specified when you are compiling CUDA code.

518 | 519 |

4) Executing kernels

520 | 521 |

A note on the speed comparisons in the remaining section. These compare a fully serial CPU calculation on a single core to calculation on the GPU. On a multicore machine, we could speed up the CPU calculation by writing code to parallelize the calculation (e.g., via threading in C/openMP or various parallelization tools in R or Python).

522 | 523 |

Also, note that in the various examples when I want to assess computational time, I make sure to synchronize all the threads via an appropriate function call. This ensures that all of the threads have finished their kernel calculations before I mark the end of the time interval. In general a function call to do a calculation on the GPU will simply start the calculation and then return, with the calculation continuing on the GPU.

524 | 525 |

In this section, I'll demonstrate calling a kernel that simply computes the normal density function (PDF) on a vector of values in parallel, one value per thread.

526 | 527 |

4.1) Running a kernel from C/CUDA

528 | 529 |

Now let's see our example implemented using CUDA code, including memory allocation on the GPU and transfer between the GPU and CPU.

530 | 531 |

My kernel code allocates memory on the CPU and the device (GPU) memory and the kernel function uses the device memory for the alphas, random numbers, and the output values (the probability estimates).

532 | 533 |

Note that here, I'll use 1024 threads per block and then a grid sufficiently large so that we have at least as many threads as computational chunks.

534 | 535 |

Here's the code (kernelExample.cu on the github repo).

536 | 537 |

Compilation is as follows.

538 | 539 |
nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample
 540 | 
541 | 542 |

Here are some results:

543 | 544 |
====================================================
 545 | Grid dimension is 46 x 46
 546 | Launching 2166784 threads (N=2097152)
 547 | Input values: -0.658344 0.499804 -0.807257...
 548 | Memory Copy from Host to Device successful.
 549 | Memory Copy from Device to Host successful.
 550 | Output values: 0.321214 0.352100 0.288007...
 551 | Output values (CPU): 0.321214 0.352100 0.288007...
 552 | Timing results for n = 2097152
 553 | Transfer to GPU time: 0.009988
 554 | Calculation time (GPU): 0.000366
 555 | Calculation time (CPU): 0.058541
 556 | Transfer from GPU time: 0.001716
 557 | Freeing memory...
 558 | ====================================================
 559 | ...
 560 | ...
 561 | ====================================================
 562 | Grid dimension is 363 x 363
 563 | Launching 134931456 threads (N=134217728)
 564 | Input values: -0.658344 0.499804 -0.807257...
 565 | Memory Copy from Host to Device successful.
 566 | Memory Copy from Device to Host successful.
 567 | Output values: 0.321214 0.352100 0.288007...
 568 | Output values (CPU): 0.321214 0.352100 0.288007...
 569 | Timing results for n = 134217728
 570 | Transfer to GPU time: 0.638223
 571 | Calculation time (GPU): 0.021684
 572 | Calculation time (CPU): 3.470199
 573 | Transfer from GPU time: 0.055798
 574 | Freeing memory...
 575 | ====================================================
 576 | 
577 | 578 |

The speedup in pure computation time is very impressive (175x); surprisingly when I did this same benchmark two years ago with the EC2 g2.x2large instance the speedup was 'only' 40x. However, importantly, we do see that the time for transferring to and from (particularly to) the GPU exceeds the calculation time, reinforcing the idea of keeping data on the GPU when possible.

Using Pinned Memory

Here's some code where we use pinned memory that is 'mapped' to the GPU, such that the GPU directly accesses CPU memory. This can be advantageous if one exceeds the GPU's memory and, according to some sources, is best when you load the data only once. Another approach, using pinned but not mapped memory, allows for more efficient transfer but without the direct access from the GPU, with a hidden transfer done behind the scenes. This may be better if the data is loaded multiple times on the GPU.
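Here's a minimal sketch of the mapped ('zero-copy') approach used in that file: the host buffer is allocated as pinned, mapped memory with cudaHostAlloc, and the kernel is handed a device pointer that aliases it, so no explicit cudaMemcpy calls are needed.

// sketch of pinned, mapped memory; see kernelExample-pinned.cu for the full version
double *cpu_vals, *gpu_vals;
cudaSetDeviceFlags(cudaDeviceMapHost);                                   // enable mapped memory
cudaHostAlloc((void**)&cpu_vals, n*sizeof(double), cudaHostAllocMapped); // pinned host memory
cudaHostGetDevicePointer(&gpu_vals, cpu_vals, 0);                        // device alias of the host buffer
calc_loglik<<<gridSize, blockSize>>>(gpu_vals, n, mu, sigma);            // kernel reads/writes host memory directly
cudaDeviceSynchronize();
cudaFreeHost(cpu_vals);
// for pinned but not mapped memory, allocate with cudaHostAllocDefault and do an explicit cudaMemcpy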

Here's the code (kernelExample-pinned.cu on the github repo).

Here are some results:

====================================================
Grid dimension is 46 x 46
Launching 2166784 threads (N=2097152)
Input values: -0.658344 0.499804 -0.807257...
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 2097152
Calculation time (GPU): 0.003245
Calculation time (CPU): 0.058515
Freeing memory...
====================================================
...
...
====================================================
Grid dimension is 363 x 363
Launching 134931456 threads (N=134217728)
Input values: -0.658344 0.499804 -0.807257...
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 134217728
Calculation time (GPU): 0.187535
Calculation time (CPU): 3.757175
Freeing memory...
====================================================

So using pinned mapped memory seems to help quite a bit in this case, as the total time with pinned memory is less than the time used for transfer plus calculation in the previous examples.

4.2) Calling CUDA Kernels from R (RCUDA)

When we want to use CUDA from R, the kernel function will remain the same, but the pre- and post-processing is done in R rather than in C. Here's an example, with the same normal density kernel. The CUDA kernel code is saved in a separate file (calc_loglik.cu on the github repo) but is identical to that in the full CUDA+C example above (with the exception that we need to wrap the kernel function in extern "C").

Here's the code (kernelExample.R on the github repo)

In this example we see that we can either transfer data between CPU and GPU manually or have RCUDA do it for us. If we didn't want to overwrite the input, but rather to allocate separate space for the output on the GPU, we could use cudaMalloc() (see example in Section 5.2).

We need to compile the kernel into a ptx object file, either outside of R:

nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu

or inside of R:

ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', target = 'ptx', '-arch=compute_20', '-code=sm_20,compute_20')

Here are some results:

Grid size:
[1] 363 363   1
Total number of threads to launch =  134931456
Running CUDA kernel...
Input values:  0.8966972 0.2655087 0.3721239
Output values:  0.2457292 0.2658912 0.2656543
Output values (implicit transfer):  0.2457292 0.2658912 0.2656543
Output values (CPU with R):  0.2457292 0.2658912 0.2656543
Transfer to GPU time:  0.702
Calculation time (GPU):  0.044
Transfer from GPU time:  0.489
Calculation time (CPU):  8.432
Combined calculation/transfer via .cuda time (GPU):  1.203

So the transfer time is again substantial in relative terms. Without that time, the speedup in pure calculation would be large (roughly 190x here: 8.432 vs. 0.044 seconds).

We can avoid explicitly specifying block and grid dimensions by using the gridBy argument to .cuda(), with syntax as shown in kernelExample.R. For some reason that code is not working, though I have gotten it to work in other contexts.

WARNING #1: Be very careful that the types of the R objects passed to the kernel match what the kernel is expecting. Otherwise the code can hang without an informative error message.

WARNING #2: Note the use of the strict=TRUE argument when passing values to the GPU. This ensures that numeric values are kept as doubles and not coerced to floats.

4.3) Calling CUDA Kernels from Python

With PyCUDA, the kernel code can be directly embedded in the Python script. Otherwise it's fairly similar to the use of RCUDA. Here's the code (kernelExample.py on the github repo).

Here are some results:

Generating random normals...
Running GPU code...
Time for calculation (GPU): 1.008687s
Running Scipy CPU code...
Time for calculation (CPU): 12.572273s
Output from GPU: 0.177782 0.224597 0.109604
Output from CPU: 0.177782 0.224597 0.109604

WARNING: As was the case with R, be careful that the types of the Python objects passed to the kernel match what the kernel is expecting.

5) Random Number Generation (RNG) on the GPU

RNG is done via the CURAND (CUDA Random Number Generation) library. CURAND provides several different generators, including the Mersenne Twister (the default generator in R).

5.1) Seeds and Sequences

From the CUDA documentation:

For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches to avoid state setup time.

A lot of important info; we'll interpret and implement much of it in the demo below.

Recall that RNG on a computer involves generation of pseudo-random numbers from a deterministic, periodic sequence. The seed determines where one starts generating from within that sequence. The idea of the sequence numbers is to generate from non-overlapping blocks within the sequence, with each thread getting a different block.
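In device code this maps onto curand_init(seed, sequence, offset, &state): the same seed for every thread in an experiment, a distinct sequence number per thread, and an offset for skipping ahead. A minimal setup kernel, essentially what random.cu does, looks like this:

#include <curand_kernel.h>

// one RNG state per thread: same seed, distinct sequence number (here, the thread index)
__global__ void setup_kernel(curandState *state, int seed, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        curand_init(seed, idx, 0, &state[idx]);  // seed, sequence, offset, state
}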

5.2) Calling CURAND via RCUDA

For RNG, we need a kernel to initialize the RNG on each thread and one to do the sampling (though they could be combined in a single kernel). Note that the time involved in initializing the RNG for each thread is substantial. This shouldn't be a problem if one is doing a lot of calculations over time. To amortize this one-time expense, I generate multiple random numbers per thread. Here's the kernel code (random.cu on the github repo). The second argument to curand_init is the sequence number - by having contiguous sequence numbers for the threads, the position of the initial random number for a given thread is spaced \(2^{67}\) values apart from the position of the initial random number for the next thread.

And here's the R code (RNGexample.R on the github repo) to call the kernel, which looks very similar to the RCUDA code we've already seen.

Here are some results:

RNG initiation time:  0.062
GPU memory allocation time:  0.001
Calculation time (GPU):  0.228
Transfer from GPU time:  0.423
Calculation time (CPU):  7.292

We get a decent speedup, which would be more impressive if we could set up the calculations such that we don't need to transfer the whole large vector back to the CPU. Also, the code in random.cu uses non-unit strides and could probably be reworked for more efficient global memory access (see Section 7).

Also note the memory cost of the RNG states for the threads, 48 bytes per thread, which could easily exceed GPU memory if one starts up many threads.

At the moment, I'm not sure how to choose the RNG generator from within R.

5.3) Calling CURAND from C and from Python

I may flesh this out at some point, but by looking at the RNG example via RCUDA and the examples of calling kernels from C and Python in the previous section, it should be straightforward to do RNG on the GPU controlled by C or Python.

To choose the generator in C, this should work (in this case choosing the MTGP32 Mersenne Twister variant): curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32), where gen is a curandGenerator_t handle from CURAND's host API.
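For a bit more context, here's a minimal sketch using CURAND's host API (a different route from the device API used in random.cu); it assumes n, mu, and sigma are already defined and fills a buffer in GPU memory in one call.

#include <curand.h>

// sketch: generate n N(mu, sigma) draws into device memory with the MTGP32 generator
curandGenerator_t gen;
double *dVals;
cudaMalloc((void**)&dVals, n * sizeof(double));
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);   // choose the generator type here
curandSetPseudoRandomGeneratorSeed(gen, 0ULL);           // set the seed
curandGenerateNormalDouble(gen, dVals, n, mu, sigma);    // fill the device buffer (n should be even)
curandDestroyGenerator(gen);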

6) Using higher-level functionality to do linear algebra and vectorized operations on the GPU

The idea here is to use software that hides the details of the kernel implementation from us, relying on the expertise of others to efficiently code standard computations on the GPU.

We'll start with very high-level use of the GPU by simply calling linear algebra routines that use the GPU.

6.1) Using C to Call CUBLAS and MAGMA

We can do linear algebra (and basic vectorized operations with vectors and matrices) using GPU implementations of BLAS/LAPACK-type routines. Both CUDA (through CUBLAS) and MAGMA provide access to BLAS functionality, but only MAGMA provides LAPACK-like functionality (i.e., matrix factorizations/decompositions).

We'll make CUBLAS and MAGMA calls directly in C code. The MAGMA library provides a drop-in replacement for BLAS and LAPACK functionality that carries out linear algebra on both the CPU and GPU, choosing smartly where to do various aspects of the calculation. We'll now need to directly manage memory allocation on the GPU and the transfer of data back and forth between the CPU and GPU.

CUDA and CUBLAS

The code doesn't look too different from standard C code with calls to BLAS/LAPACK, but we use some CUDA functions and CUDA types. Here's the example code (cudaBlasExample.c on the github repo).
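As a rough sketch of the pattern in that file (allocate on the GPU, transfer, call the BLAS routine, transfer back), here's a matrix multiply with cublasDgemm; A, B, and C are assumed to be n x n host arrays, and error checking is omitted.

#include <cublas_v2.h>

// sketch: n x n matrix multiply C = A * B on the GPU (column-major, as in BLAS)
cublasHandle_t handle;
cublasCreate(&handle);
double *dA, *dB, *dC, one = 1.0, zero = 0.0;
cudaMalloc((void**)&dA, n*n*sizeof(double));
cudaMalloc((void**)&dB, n*n*sizeof(double));
cudaMalloc((void**)&dC, n*n*sizeof(double));
cublasSetMatrix(n, n, sizeof(double), A, n, dA, n);   // transfer inputs to the GPU
cublasSetMatrix(n, n, sizeof(double), B, n, dB, n);
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
            &one, dA, n, dB, n, &zero, dC, n);        // matrix multiply on the GPU
cublasGetMatrix(n, n, sizeof(double), dC, n, C, n);   // transfer the result back
cublasDestroy(handle);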

Compilation goes as follows. Note that in this case nvcc does not want the file to have a .C or .cu extension.

nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample

And here are (some of) the results:

Starting
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.001642
Matrix multiply time: 0.000481
Transfer from GPU time: 0.001550
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000276
Transfer to GPU time: 0.020364
Matrix multiply time: 0.015466
Transfer from GPU time: 0.015035
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000800
Transfer to GPU time: 0.325620
Matrix multiply time: 0.940571
Transfer from GPU time: 0.229997

For (rough) comparison, the \(n=8192\) multiplication on the CPU (using OpenBLAS as the BLAS, called from R) takes 106 seconds with one core and 18 seconds with 8 cores.

MAGMA

Now let's see the use of MAGMA. MAGMA provides calls analogous to those of CUDA/CUBLAS for allocating memory, transferring data, and making BLAS calls, as well as LAPACK-type calls.

Note that the LAPACK-type calls have both a CPU interface and a GPU interface. The GPU interface calls have function names ending in '_gpu' and operate on data objects in GPU memory. The CPU interface calls operate on data objects in CPU memory, handling the transfer to GPU memory as part of the calculation.
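Here's a minimal sketch of the contrast, using the Cholesky calls that appear in magmaExample.c, where dA is the matrix already in GPU memory and A is the same matrix in CPU memory:

int info;
// GPU interface: the matrix is already in GPU memory
magma_dpotrf_gpu(MagmaLower, n, dA, n, &info);

// CPU interface: the matrix lives in CPU memory; MAGMA handles the transfer internally
magma_dpotrf(MagmaLower, n, A, n, &info);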

Here we'll compare timing for the GPU vs. standard BLAS/LAPACK, as well as the CPU and GPU interfaces for the Cholesky factorization.

Here's the example code (magmaExample.c on the github repo).

Compilation and execution (with and without pinned memory) go as follows. Note that we can use gcc and that we need to link in the CPU BLAS and LAPACK, since MAGMA uses both the CPU and GPU for calculations (plus in this example I directly call BLAS and LAPACK functions).

gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include \
    -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma \
    -llapack -lblas -lcublas -lcudart -o magmaExample
./magmaExample 1
./magmaExample 0

And here are (some of) the results:

Starting
Setting use_pinned to 1
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.085331
Matrix multiply time (GPU): 0.000692
Matrix multiply time (BLAS): 0.049665
Cholesky factorization time (GPU w/ GPU interface): 0.023938
Cholesky factorization time (GPU w/ CPU interface): 0.004702
Cholesky factorization time (LAPACK): 0.006958
Transfer from GPU time: 0.000344
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000366
Transfer to GPU time: 0.005706
Matrix multiply time (GPU): 0.027141
Matrix multiply time (BLAS): 0.446544
Cholesky factorization time (GPU w/ GPU interface): 0.047918
Cholesky factorization time (GPU w/ CPU interface): 0.025746
Cholesky factorization time (LAPACK): 0.077203
Transfer from GPU time: 0.005030
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000789
Transfer to GPU time: 0.087303
Matrix multiply time (GPU): 1.766567
Matrix multiply time (BLAS): 23.807952
Cholesky factorization time (GPU w/ GPU interface): 0.230186
Cholesky factorization time (GPU w/ CPU interface): 0.259374
Cholesky factorization time (LAPACK): 4.179541
Transfer from GPU time: 0.079991

Setting use_pinned to 0
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000257
Transfer to GPU time: 0.086421
Matrix multiply time (GPU): 0.000655
Matrix multiply time (BLAS): 0.037689
Cholesky factorization time (GPU w/ GPU interface): 0.016963
Cholesky factorization time (GPU w/ CPU interface): 0.011957
Cholesky factorization time (LAPACK): 0.005600
Transfer from GPU time: 0.001391
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000369
Transfer to GPU time: 0.009003
Matrix multiply time (GPU): 0.027190
Matrix multiply time (BLAS): 0.514402
Cholesky factorization time (GPU w/ GPU interface): 0.039755
Cholesky factorization time (GPU w/ CPU interface): 0.037521
Cholesky factorization time (LAPACK): 0.081121
Transfer from GPU time: 0.013978
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.001062
Transfer to GPU time: 0.136131
Matrix multiply time (GPU): 1.775493
Matrix multiply time (BLAS): 24.222220
Cholesky factorization time (GPU w/ GPU interface): 0.224644
Cholesky factorization time (GPU w/ CPU interface): 0.400515
Cholesky factorization time (LAPACK): 4.183725
Transfer from GPU time: 0.204625

So we see decent speed-ups both for the matrix multiplication and the Cholesky factorization; the comparisons are with respect to 8 CPU cores.

853 | 854 |

Using the CPU interface seems to provide a modest speedup (compared to the manual transfer + calculation time), as does using pinned memory.

855 | 856 |

6.2) Using PyCUDA to do GPU calculations directly in Python

857 | 858 |

PyCUDA also provides high-level functionality for vectorized calculations on the GPU. Basically you create a vector stored in GPU memory and then operate on it with a variety of mathematical functions. The modules that do this are gpuarray and cumath.

859 | 860 |

Here's the code (gpuArrayExample.py on the github repo)

861 | 862 |

Here are the timing results.

863 | 864 |
Transfer to GPU time: 0.639403s
 865 | Timing vectorized exponentiation:
 866 | GPU array calc time (initial): 0.276190s
 867 | GPU array calc time: 0.014222s
 868 | CPU calc time: 2.704504s
 869 | Timing vectorized dot product/sum of squares:
 870 | GPU array calc time (initial): 0.229969s
 871 | GPU array calc time: 0.007769s
 872 | CPU calc time: 0.071532s
 873 | 
874 | 875 |

So we see a good speedup for the vectorized exponentiation. However, there is some compilation that gets done when the code is run the first time that slows down the initial calculation. Also, again, the transfer of data to the GPU takes a chunk of time.

For the dot product, the speedup is not as impressive, probably because the aggregation that is needed to do the sum involves coordination across threads.

6.3) Using R packages to do vectorized operations and linear algebra on the GPU

Various R packages hide the details of the GPU implementation and allow you to do vector and matrix operations, including linear algebra, using standard R code. In some cases they overload the usual R functions such that you can simply call a function of the same name as in base R.

Some packages you might investigate include:

  • HiPLARM (apparently this uses MAGMA behind the scenes)
  • gpuR (uses OpenCL rather than CUDA)
  • gmatrix
  • gputools

7) An extended example of optimizing GPU kernel code

Here we'll implement a basic but real computation that is a component of a larger collaboration I am engaged in. The basic context is understanding spatial variation in the species composition of forests in the eastern United States. The data are multinomial samples of counts of trees of different species at many different spatial locations (i.e., observations). We fit a spatial version of a multicategory probit regression model.

I'll compare a basic R implementation and a C++ implementation with various GPU implementations designed to improve the speed of the GPU calculation. I'll use R to manage the C++ and CUDA code (via Rcpp and RCUDA), but there's no reason one couldn't do this via Python or C/C++ on the front-end. Our main focus will be on the different CUDA implementations.

All of the implementations are in the example directory in the repository.

7.1) Example: Probit regression probabilities

Probit regression basics

Consider probit regression, which is similar to logistic regression. The probability of a binary outcome is given as \(p = P(Y = 1) = \Phi(X\beta)\), where \(\Phi(\cdot)\) is the standard normal CDF.

The probit model can be rewritten in a latent variable representation that, in a Bayesian context, can facilitate MCMC computations to fit the model:
\[
Y = I(W > 0)
\]
\[
W \sim N(X\beta, 1)
\]

Suppose we know \(\beta\). In order to determine \(p\), we could use Monte Carlo simulation to estimate the integral \(P(Y = 1) = P(W > 0) = \int_{0}^{\infty} f(w)\, dw\), where \(f\) is the \(N(X\beta, 1)\) density.

Now for probit regression we could just use standard numerical routines for the normal CDF to compute this. But for the multinomial extension we discuss next, we need Monte Carlo simulation.

Multinomial probit regression

Let \(Y\) be a categorical variable, \(Y \in \{1,2,\ldots,K\}\). Then a multinomial extension of the latent variable probit model is
\[
Y = \arg\max_k W_k
\]
\[
W_k \sim N(X\beta_k, 1)
\]

Now to compute \(p = (P(Y=1), P(Y=2), \ldots, P(Y=K))\) we can again do Monte Carlo simulation. The basic steps are:

  • iterate m = 1, …, M
      • for k = 1, …, K, sample \(W_k\) from its corresponding normal distribution
      • determine the arg max of the \(W_k\)'s
  • over the \(M\) simulations, count the number of times each category had the largest corresponding \(W_k\)

The proportion of times each category had the largest \(W_k\) is an estimate of the corresponding multinomial probability of interest.

For our example, we want to do this computation for large \(M\) (to reduce Monte Carlo error) and for many observations with different \(X\) values. In our code, we will assume that we are given a vector \(\alpha_i = \{X_i\beta_k\}_{k=1,\ldots,K}\) for each observation \(i\), resulting in an \(n\) by \(K\) matrix.

Finally, note that I can reuse the random numbers I need across the \(n\) observations (in fact, this probably reduces Monte Carlo error in certain ways), so I just need an \(M\) by \(K\) matrix of standard normal random variables. Even for large \(M\) this is not so big, and I'll simply generate the values once on the CPU.

7.2) R and C baseline implementations

In example_pureR.R and example_Rcpp.R I've implemented the calculation for \(n=26280\), \(K=21\), and \(M=10000\). I tried to write efficient vectorized R code and efficient C++ code (called from R, for convenience). I've also implemented parallel versions for both R and C++.

The pure R version takes about 570 seconds in serial and 140 seconds with eight cores. The C++ version takes about 47 seconds in serial and 6 seconds with eight cores.

7.3) A basic (but thoughtful) implementation

example_RCUDA.R is the main R script that calls different kernel variations as I experimented with different strategies for efficiency.

In compute_probs.cu I make use of the already-computed random numbers, and allocate a temporary vector w to hold the value of \(w\) for the current Monte Carlo sample.
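To make the structure concrete, here's a stripped-down sketch of this sort of kernel: one thread per observation, a loop over the \(M\) Monte Carlo samples reusing the pre-generated standard normals (an \(M\) by \(K\) matrix z), and a count of how many times each category wins. The variable names and indexing are illustrative rather than copied from compute_probs.cu, and, like the original version, it does not worry about memory-access strides.

// sketch: one thread per observation i; alphas is n x K, z is M x K (both flattened)
__global__ void compute_probs(double* alphas, double* z, double* probs,
                              int n, int K, int M) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    for (int k = 0; k < K; k++) probs[i*K + k] = 0.0;
    for (int m = 0; m < M; m++) {
        int maxk = 0;
        double maxw = alphas[i*K] + z[m*K];            // w_1 for this Monte Carlo sample
        for (int k = 1; k < K; k++) {
            double w = alphas[i*K + k] + z[m*K + k];   // w_k = alpha_ik + z_mk
            if (w > maxw) { maxw = w; maxk = k; }
        }
        probs[i*K + maxk] += 1.0;                      // category maxk wins this sample
    }
    for (int k = 0; k < K; k++) probs[i*K + k] /= M;   // convert counts to proportions
}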

Some features of my code:

  • It's generally recommended to have 128-256 threads per block, with the number a multiple of 32 (because threads operate in lock-step in 'warps' of 32 threads). So I'm using 192 threads per block.
  • I then determine the number of blocks (of 192 threads each) that I need so I can have one thread for each of my \(n\) observations.
  • For this algorithm, as mentioned, I can reuse the random numbers across observations, so I don't generate them individually on the GPU.
  • I haven't thought about locality of memory access (i.e., strides, row-major vs. column-major) in this version of the code.

Let's execute this:

cd example
Rscript example_RCUDA.R

This takes 12.1 seconds.

7.4) Accessing memory efficiently

Access to the device memory is slow (memory latency), but GPUs are good at switching between different threads while data is being retrieved from memory. Also, the GPU can access memory from consecutive memory locations efficiently and coalesce (combine) the memory accesses of groups of threads in a warp. Finally, threads in a warp execute in lock-step. The implication of this is that we want the threads in a warp to retrieve contiguous values from the device memory. This means using a 'stride' of one when incrementing through a vector (analogous to moving along rows in a row-major matrix).
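In kernel terms the difference looks roughly like this (a toy kernel; the names are illustrative): with the first indexing, neighboring threads touch elements \(K\) apart, while with the second, after transposing the matrix, they touch adjacent elements, which the hardware can coalesce into far fewer memory transactions.

// toy illustration of strided vs. unit-stride access; alphas is n x K, alphas_t is its transpose (K x n)
__global__ void strided_vs_unit(const double* alphas, const double* alphas_t,
                                double* out, int n, int K) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;   // one observation per thread
    if (idx >= n) return;
    double sum = 0.0;
    for (int k = 0; k < K; k++) {
        sum += alphas[idx * K + k];    // non-unit stride: threads idx and idx+1 access elements K apart
        sum += alphas_t[k * n + idx];  // unit stride: threads idx and idx+1 access adjacent elements (coalesced)
    }
    out[idx] = sum;
}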

In the original code, I was striding through alphas and probs in strides of \(K\). Thinking of the various matrices as having \(K\) rows and being column-major, I was accessing values from adjacent columns on contiguous threads when I should have accessed values from adjacent rows.

Let's transpose the matrices sent to the GPU memory and access adjacent rows, i.e., strides of one, across contiguous threads, as shown in compute_probs_unitStrides.cu.

echo "unitStrides <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R

This takes 8.5 seconds, which is a nice speed-up for a simple change, but not earth-shattering.

7.5) Using shared memory (within a block)

Next let's consider whether it makes sense to move any data into shared memory, which can be accessed something like 100x as fast as device memory and functions like a programmer-managed cache. Shared memory is shared across all threads in a block. Some implications of this are:

  • We need to be careful to do the indexing within blocks.
  • We need to transfer any results out of shared memory (back into device memory) in order to get them back to the CPU.
  • We don't need the calculations synchronized across threads because each thread owns the calculations for a single observation; however, in other situations we might need to put a barrier in place that ensures all threads are finished with a particular calculation before any proceed to the next steps, using the __syncthreads() function.
  • We only have 48 KB of shared memory per block (see the results of deviceQuery), so we need to make sure the number of threads per block is not so large as to exceed that. In this case, with 192 threads per block and \(K=21\) double-precision values per thread for each of w and probs (192 x 21 x 8 bytes x 2, or about 64 KB), we're over the limit, so we need to go down to 96 threads per block.

Here we notice that w and probs are accessed in device memory multiple times, and furthermore, probs is not even needed as an input, so let's try to manage these values in shared memory, as shown in compute_probs_unitStrides_sharedMem.cu.

echo "unitStrides <- TRUE" > /tmp/tmp.R
echo "sharedMem <- TRUE" >> /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R

This takes 1.5 seconds, so we see a big improvement from using shared memory.
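For reference, here's a minimal sketch of the shared-memory pattern (under assumptions about names and layout, not the literal contents of compute_probs_unitStrides_sharedMem.cu): each thread in a block owns a slice of a __shared__ array, works on it in fast memory, and copies its finished values back out to device memory at the end.

#define K 21
#define THREADS_PER_BLOCK 96

// sketch: one thread per observation, with w and probs kept in shared memory
__global__ void compute_probs_sharedMem(double* alphas, double* z, double* probs,
                                        int n, int M) {
    __shared__ double w_sh[THREADS_PER_BLOCK * K];       // per-thread slice of w
    __shared__ double probs_sh[THREADS_PER_BLOCK * K];   // per-thread slice of probs
    int i = blockIdx.x * blockDim.x + threadIdx.x;       // global observation index
    int t = threadIdx.x;                                 // index within this block
    if (i >= n) return;
    for (int k = 0; k < K; k++) probs_sh[t*K + k] = 0.0;
    for (int m = 0; m < M; m++) {
        int maxk = 0;
        for (int k = 0; k < K; k++) {
            w_sh[t*K + k] = alphas[k*n + i] + z[k*M + m];   // unit strides into device memory
            if (w_sh[t*K + k] > w_sh[t*K + maxk]) maxk = k;
        }
        probs_sh[t*K + maxk] += 1.0;
    }
    // copy results back to device memory so they can be transferred to the CPU
    for (int k = 0; k < K; k++) probs[k*n + i] = probs_sh[t*K + k] / M;
}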

Surprisingly, using shared memory for access to alphas actually slowed things down 2-3-fold. I'm not sure why.

Finally, in some cases you can use shared memory to avoid non-unit strides; the classic example is a matrix transpose. Basically, any non-unit striding is done only in shared memory, while reading from and writing to device memory is done using unit strides.
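Here's a sketch of that idea (the standard tiled-transpose pattern, not code from this repo): each block stages a TILE x TILE tile in shared memory, so the reads from and writes to device memory are both unit-stride, and only the shared-memory accesses are strided.

#define TILE 32

// sketch of a shared-memory matrix transpose: out = t(in), both n x n, launched with TILE x TILE blocks
__global__ void transpose(double* out, const double* in, int n) {
    __shared__ double tile[TILE][TILE + 1];   // +1 padding helps avoid shared-memory bank conflicts
    int x = blockIdx.x * TILE + threadIdx.x;
    int y = blockIdx.y * TILE + threadIdx.y;
    if (x < n && y < n)
        tile[threadIdx.y][threadIdx.x] = in[y * n + x];    // coalesced (unit-stride) read
    __syncthreads();                                       // wait until the whole tile is loaded
    x = blockIdx.y * TILE + threadIdx.x;                   // transposed block position
    y = blockIdx.x * TILE + threadIdx.y;
    if (x < n && y < n)
        out[y * n + x] = tile[threadIdx.x][threadIdx.y];   // coalesced write; strided access stays in shared memory
}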

7.6) Using single precision (floats)

Traditionally, GPU calculations are done in single precision, and this can apparently be much faster than double precision calculations.

Here I get a roughly two- to three-fold speedup using floats rather than doubles, both for the original version of the code with non-unit strides and without shared memory (first example below) and for the optimized version of the code (second example below). As shown in the various “_float” kernel files, all I need to do is change “double” to “float”. And when calling from R, there are some housekeeping items shown in example_RCUDA.R.

echo "float <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
echo "float <- TRUE" > /tmp/tmp.R
1030 | echo "unitStrides <- TRUE" > /tmp/tmp.R
1031 | echo "sharedMem <- TRUE" >> /tmp/tmp.R
1032 | cat example_RCUDA.R >> /tmp/tmp.R
1033 | Rscript /tmp/tmp.R 
1034 | 
1035 | 1036 |

7.7) Summary

For this example, here are the speeds, and the speed relative to the eight-core C++ implementation:

Implementation            Time (sec.)    Speed (relative to C++)
R (8 cores)               140            0.04
C++ (8 cores)             6.0            1.0
basic CUDA                12.1           0.5
unit strides              8.5            0.7
shared memory             1.5            4.0
shared memory + floats    0.6            10.7

Interestingly on Savio, the C++ time was 9.8, while the shared memory time was 0.67 and the shared memory + floats time was 0.31.

8) Final Comments

8.1) Some Thoughts on Improving Computational Speed

Suchard et al. (2010; Journal of Computational and Graphical Statistics 19:419) and Lee et al. (2010; Journal of Computational and Graphical Statistics 19:769) discuss the use of GPUs for statistics. The speedups they see can get as high as 120 times and 500 times the speed of a single CPU core, respectively. Some of the reasons these speedups are so impressive (more so than in some of the examples here) include:

  • Use of single precision floating point calculations. If single precision doesn't affect your calculation substantively, this is worth trying. Particularly on older GPUs (but perhaps still true), single precision was much faster than double precision.
  • Computational tasks that are very arithmetically intensive but with limited memory access (see the Lee et al. paper).
  • Ensuring that contiguously-numbered threads access contiguous memory locations.
  • Careful use of shared memory (shared amongst the threads in a block) in place of the main GPU memory (see the Suchard et al. paper); in particular, this can avoid accessing non-contiguous memory.
  • Avoiding conditional statements and synchronization/barriers, since threads operate in lock-step in groups of 32 threads (a 'warp').

So for some tasks, and likely with additional coding effort, you may see speedups of 100-200x compared to a single CPU core.

Finally, rather than bringing a large chunk of data back to the CPU, you might do a reduction/aggregation operation (e.g., summing over values) on the GPU; presentations and tutorials on parallel reduction in CUDA have useful information on how to do this efficiently.
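For reference, here's a minimal sketch of the standard pattern (not code from this repo): each block sums its chunk of the vector in shared memory via a tree reduction and writes one partial sum back to device memory; the much smaller vector of per-block sums can then be summed on the CPU or with a second kernel launch.

#define THREADS 256

// sketch: each block reduces its THREADS values of x to a single partial sum
__global__ void block_sum(const double* x, double* partial, int n) {
    __shared__ double buf[THREADS];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    buf[threadIdx.x] = (idx < n) ? x[idx] : 0.0;
    __syncthreads();
    // tree reduction within the block
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride)
            buf[threadIdx.x] += buf[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        partial[blockIdx.x] = buf[0];   // one partial sum per block
}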

8.2) A Comment on Compilation

If you compile CUDA code into an object file, you can link that with other object files (e.g., from C or C++ code) into an executable that can operate on CPU and GPU. This also means you could compile a shared object (i.e., a library) that you could call from R with .C, .Call, or Rcpp.
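Here's a sketch of what that can look like; the file name, wrapper name, and exact flags are illustrative assumptions rather than anything in this repo. The pattern is to expose an extern "C" host function that launches the kernel, compile with nvcc, and link into a shared library that R can load with dyn.load() and call via .C().

// sketch (hypothetical file mykernel.cu); compile/link roughly as:
//   nvcc -c -Xcompiler -fPIC mykernel.cu -o mykernel.o
//   g++ -shared mykernel.o -L/usr/local/cuda/lib64 -lcudart -o mykernel.so
// then in R: dyn.load("mykernel.so"); .C("calc_loglik_wrapper", ...)

extern "C" void calc_loglik_wrapper(double* x, int* n, double* mu, double* sigma) {
    double* dx;
    cudaMalloc((void**)&dx, *n * sizeof(double));
    cudaMemcpy(dx, x, *n * sizeof(double), cudaMemcpyHostToDevice);
    int threads = 1024;
    int blocks = (*n + threads - 1) / threads;   // 1-d grid for simplicity; use a 2-d grid for very large n
    calc_loglik<<<blocks, threads>>>(dx, *n, *mu, *sigma);   // kernel defined elsewhere in the .cu file
    cudaMemcpy(x, dx, *n * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(dx);
}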


8.3) Some references:

1104 | 1105 | 1111 | 1112 | 1113 | 1114 | 1115 | -------------------------------------------------------------------------------- /gpuArrayExample.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import pycuda.gpuarray as gpuarray 4 | import pycuda.cumath as cumath 5 | import numpy as np 6 | 7 | n = np.int32(134217728) 8 | 9 | start = drv.Event() 10 | end = drv.Event() 11 | 12 | x = np.random.normal(size = n) 13 | x_short = np.random.normal(size = 8) 14 | 15 | start.record() 16 | dev_x = gpuarray.to_gpu(x) 17 | dev_x_short = gpuarray.to_gpu(x_short) 18 | end.record() 19 | end.synchronize() 20 | print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3) 21 | 22 | 23 | print "Timing vectorized exponentiation:" 24 | 25 | start.record() 26 | dev_expx_short = cumath.exp(dev_x_short) 27 | end.record() 28 | end.synchronize() 29 | print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3) 30 | 31 | start.record() 32 | dev_expx = cumath.exp(dev_x) 33 | end.record() 34 | end.synchronize() 35 | print "GPU array calc time: %fs" %(start.time_till(end)*1e-3) 36 | 37 | start.record() 38 | exp_x = np.exp(x) 39 | end.record() 40 | end.synchronize() 41 | print "CPU calc time: %fs" %(start.time_till(end)*1e-3) 42 | 43 | print "Timing vectorized dot product/sum of squares:" 44 | 45 | start.record() 46 | gpuarray.dot(dev_x_short,dev_x_short) 47 | end.record() 48 | end.synchronize() 49 | print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3) 50 | 51 | start.record() 52 | gpuarray.dot(dev_x,dev_x) 53 | end.record() 54 | end.synchronize() 55 | print "GPU array calc time: %fs" %(start.time_till(end)*1e-3) 56 | 57 | start.record() 58 | np.dot(x, x) 59 | end.record() 60 | end.synchronize() 61 | print "CPU calc time: %fs" %(start.time_till(end)*1e-3) 62 | -------------------------------------------------------------------------------- /helloWorld.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Note: Needs compute capability >= 2.0, so compile with: 6 | // nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld 7 | 8 | // number of computations: 9 | #define N 20000 10 | // constants for grid and block sizes 11 | #define GRID_D1 20 12 | #define GRID_D2 2 13 | #define BLOCK_D1 512 14 | #define BLOCK_D2 1 15 | #define BLOCK_D3 1 16 | 17 | // this is the kernel function called for each thread 18 | // we use the CUDA variables {threadIdx, blockIdx, blockDim, gridDim} to determine a unique ID for each thread 19 | __global__ void hello(void) 20 | { 21 | // id of the block 22 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 23 | // size of each block (within grid of blocks) 24 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 25 | // id of thread in a given block 26 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 27 | // assign overall id/index of the thread 28 | int idx = myblock * blocksize + subthread; 29 | if(idx < 2000 || idx > 19000) { 30 | // print buffer from within the kernel is limited so only print for first and last chunks of threads 31 | if (idx < N){ 32 | printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => \ 33 | thread index=%d\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, threadIdx.x, threadIdx.y, threadIdx.z, idx); 34 | } else { 35 | printf("Hello world! 
My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => \ 36 | thread index=%d [### this thread would not be used for N=%d ###]\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, 37 | threadIdx.x, threadIdx.y, threadIdx.z, idx, N); 38 | } 39 | } 40 | } 41 | 42 | 43 | int main(int argc,char **argv) 44 | { 45 | // objects containing the block and grid info 46 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 47 | const dim3 gridSize(GRID_D1, GRID_D2, 1); 48 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*GRID_D1*GRID_D2; 49 | if (nthreads < N){ 50 | printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N); 51 | } else { 52 | printf("Launching %d threads (N=%d)\n",nthreads,N); 53 | } 54 | 55 | // launch the kernel on the specified grid of thread blocks 56 | hello<<>>(); 57 | 58 | // Need to flush prints, otherwise none of the prints from within the kernel will show up 59 | // as program exit does not flush the print buffer. 60 | cudaError_t cudaerr = cudaDeviceSynchronize(); 61 | if (cudaerr){ 62 | printf("kernel launch failed with error \"%s\".\n", 63 | cudaGetErrorString(cudaerr)); 64 | } else { 65 | printf("kernel launch success!\n"); 66 | } 67 | 68 | printf("That's all!\n"); 69 | 70 | return 0; 71 | } 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /kernelExample-pinned.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define SQRT_TWO_PI 2.506628274631000 8 | #define BLOCK_D1 1024 9 | #define BLOCK_D2 1 10 | #define BLOCK_D3 1 11 | 12 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 13 | // nvcc kernelExample-pinned.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample-pinned 14 | // -use_fast_math 15 | 16 | // CUDA kernel: 17 | __global__ void calc_loglik(double* vals, int n, double mu, double sigma) { 18 | // note that this assumes no third dimension to the grid 19 | // id of the block 20 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 21 | // size of each block (within grid of blocks) 22 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 23 | // id of thread in a given block 24 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 25 | // assign overall id/index of the thread 26 | int idx = myblock * blocksize + subthread; 27 | 28 | if(idx < n) { 29 | double std = (vals[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } 33 | } 34 | 35 | int calc_loglik_cpu(double* vals, int n, double mu, double sigma) { 36 | double std, e; 37 | for(int idx = 0; idx < n; idx++) { 38 | std = (vals[idx] - mu)/sigma; 39 | e = exp( - 0.5 * std * std); 40 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 41 | } 42 | return 0; 43 | } 44 | 45 | 46 | /* --------------------------- host code ------------------------------*/ 47 | void fill( double *p, int n ) { 48 | int i; 49 | srand48(0); 50 | for( i = 0; i < n; i++ ) 51 | p[i] = 2*drand48()-1; 52 | } 53 | 54 | double read_timer() { 55 | struct timeval end; 56 | gettimeofday( &end, NULL ); 57 | return end.tv_sec+1.e-6*end.tv_usec; 58 | } 59 | 60 | int main (int argc, char *argv[]) { 61 | double* cpu_vals; 62 | double* gpu_vals; 63 | int n; 64 | cudaError_t cudaStat; 65 | 66 | printf("====================================================\n"); 67 | for( n = 32768; n <= 134217728; n*=8 ) { 68 | // 
allocated pinned and mapped memory on CPU 69 | cudaSetDeviceFlags(cudaDeviceMapHost); 70 | cudaHostAlloc((void**)&cpu_vals, n*sizeof(double), cudaHostAllocMapped); 71 | 72 | // map the CPU storage to the GPU to the CPU storage 73 | cudaStat = cudaHostGetDevicePointer(&gpu_vals, cpu_vals, 0); 74 | if(cudaStat != cudaSuccess) { 75 | printf ("device memory mapping failed"); 76 | return EXIT_FAILURE; 77 | } 78 | 79 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 80 | 81 | int tmp = ceil(pow(n/BLOCK_D1, 0.5)); 82 | printf("Grid dimension is %i x %i\n", tmp, tmp); 83 | dim3 gridSize(tmp, tmp, 1); 84 | 85 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 86 | if (nthreads < n){ 87 | printf("\n============ NOT ENOUGH THREADS TO COVER n=%d ===============\n\n",n); 88 | } else { 89 | printf("Launching %d threads (n=%d)\n", nthreads, n); 90 | } 91 | 92 | double mu = 0.0; 93 | double sigma = 1.0; 94 | 95 | // simulate 'data' 96 | fill(cpu_vals, n); 97 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 98 | 99 | cudaDeviceSynchronize(); 100 | double tInit = read_timer(); 101 | 102 | // do the calculation 103 | calc_loglik<<>>(gpu_vals, n, mu, sigma); 104 | 105 | cudaDeviceSynchronize(); 106 | double tCalc = read_timer(); 107 | 108 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 109 | 110 | // do calculation on CPU for comparison (unfair as this will only use one core) 111 | fill(cpu_vals, n); 112 | double tInit2 = read_timer(); 113 | calc_loglik_cpu(cpu_vals, n, mu, sigma); 114 | double tCalcCPU = read_timer(); 115 | 116 | printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 117 | 118 | printf("Timing results for n = %d\n", n); 119 | printf("Calculation time (GPU): %f\n", tCalc - tInit); 120 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 121 | 122 | printf("Freeing memory...\n"); 123 | printf("====================================================\n"); 124 | cudaFreeHost(cpu_vals); 125 | 126 | } 127 | printf("\n\nFinished.\n\n"); 128 | return 0; 129 | } 130 | 131 | -------------------------------------------------------------------------------- /kernelExample.R: -------------------------------------------------------------------------------- 1 | # modification of one of the RCUDA examples to use use double precision 2 | 3 | library(RCUDA) 4 | 5 | cat("Setting cuGetContext(TRUE)...\n") 6 | cuGetContext(TRUE) 7 | 8 | # compile the kernel into a form that RCUDA can load 9 | # system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu") 10 | ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', 11 | target = "ptx", "-arch=compute_20", "-code=sm_20,compute_20") 12 | 13 | mod = loadModule(ptx) 14 | calc_loglik = mod$calc_loglik 15 | 16 | n = as.integer(134217728) 17 | 18 | set.seed(0) 19 | x = runif(n) 20 | mu = 0.3 21 | sigma = 1.5 22 | 23 | # setting grid and block dimensions 24 | threads_per_block <- as.integer(1024) 25 | block_dims <- c(threads_per_block, as.integer(1), as.integer(1)) 26 | grid_d <- as.integer(ceiling(sqrt(n/threads_per_block))) 27 | 28 | grid_dims <- c(grid_d, grid_d, as.integer(1)) 29 | 30 | cat("Grid size:\n") 31 | print(grid_dims) 32 | 33 | nthreads <- as.integer(prod(grid_dims)*prod(block_dims)) 34 | cat("Total number of threads to launch = ", nthreads, "\n") 35 | if (nthreads < n){ 36 | stop("Grid is not large enough...!") 37 | } 38 | 39 | cat("Running CUDA kernel...\n") 40 | 41 | # basic usage with manual transfer 42 | 
tTransferToGPU <- system.time({ 43 | dX = copyToDevice(x, strict = TRUE) 44 | cudaDeviceSynchronize() 45 | }) 46 | tCalc <- system.time({ 47 | .cuda(calc_loglik, dX, n, mu, sigma, gridDim = grid_dims, blockDim = block_dims, .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 48 | cudaDeviceSynchronize() 49 | }) 50 | tTransferFromGPU <- system.time({ 51 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "double") 52 | cudaDeviceSynchronize() 53 | }) 54 | 55 | cat("Input values: ", x[1:3], "\n") 56 | cat("Output values: ", out[1:3], "\n") 57 | 58 | # alternative that bundles transfer and computation all in one, with 59 | # implicit transfer done by RCUDA behind the scenes 60 | tFull <- system.time({ 61 | out <- .cuda(calc_loglik, "x"=x, n, mu, sigma, gridDim=grid_dims, blockDim=block_dims, outputs="x", .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 62 | cudaDeviceSynchronize() 63 | }) 64 | 65 | 66 | cat("Output values (implicit transfer): ", out[1:3], "\n") 67 | 68 | # having RCUDA determine gridding - not working for some reason 69 | ## tCalc_gridby <- system.time({ 70 | ## .cuda(calc_loglik, dX, n, mu, sigma, gridBy = nthreads, .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 71 | ## cudaDeviceSynchronize() 72 | ## }) 73 | 74 | 75 | 76 | tCalc_R <- system.time({ 77 | out <- dnorm(x, mu, sigma) 78 | }) 79 | 80 | cat("Output values (CPU with R): ", out[1:3], "\n") 81 | 82 | cat("Transfer to GPU time: ", tTransferToGPU[3], "\n") 83 | cat("Calculation time (GPU): ", tCalc[3], "\n") 84 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 85 | cat("Calculation time (CPU): ", tCalc_R[3], "\n") 86 | cat("Combined calculation/transfer via .cuda time (GPU): ", tFull[3], "\n") 87 | #cat("Calculation time (GPU with gridBy): ", tCalc_gridBy[3], "\n") 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /kernelExample.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define SQRT_TWO_PI 2.506628274631000 8 | #define BLOCK_D1 1024 9 | #define BLOCK_D2 1 10 | #define BLOCK_D3 1 11 | 12 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 13 | // nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 14 | // -use_fast_math doesn't seem to have any effect on speed 15 | 16 | // CUDA kernel: 17 | __global__ void calc_loglik(double* vals, int n, double mu, double sigma) { 18 | // note that this assumes no third dimension to the grid 19 | // id of the block 20 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 21 | // size of each block (within grid of blocks) 22 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 23 | // id of thread in a given block 24 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 25 | // assign overall id/index of the thread 26 | int idx = myblock * blocksize + subthread; 27 | 28 | if(idx < n) { 29 | double std = (vals[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } 33 | } 34 | 35 | // CPU analog for speed comparison 36 | int calc_loglik_cpu(double* vals, int n, double mu, double sigma) { 37 | double std, e; 38 | for(int idx = 0; idx < n; idx++) { 39 | std = (vals[idx] - mu)/sigma; 40 | e = exp( - 0.5 * std * std); 41 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 42 | } 43 | return 0; 44 | } 45 | 46 | 47 | /* 
--------------------------- host code ------------------------------*/ 48 | void fill( double *p, int n ) { 49 | int i; 50 | srand48(0); 51 | for( i = 0; i < n; i++ ) 52 | p[i] = 2*drand48()-1; 53 | } 54 | 55 | double read_timer() { 56 | struct timeval end; 57 | gettimeofday( &end, NULL ); 58 | return end.tv_sec+1.e-6*end.tv_usec; 59 | } 60 | 61 | int main (int argc, char *argv[]) { 62 | double* cpu_vals; 63 | double* gpu_vals; 64 | int n; 65 | cudaError_t cudaStat; 66 | 67 | 68 | printf("====================================================\n"); 69 | for( n = 32768; n <= 134217728; n*=8 ) { 70 | cpu_vals = (double*) malloc( sizeof(double)*n ); 71 | cudaStat = cudaMalloc(&gpu_vals, sizeof(double)*n); 72 | if(cudaStat != cudaSuccess) { 73 | printf ("device memory allocation failed"); 74 | return EXIT_FAILURE; 75 | } 76 | 77 | // fixed block dimensions (1024x1x1 threads) 78 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 79 | 80 | // determine number of blocks we need for a given problem size 81 | int tmp = ceil(pow(n/BLOCK_D1, 0.5)); 82 | printf("Grid dimension is %i x %i\n", tmp, tmp); 83 | dim3 gridSize(tmp, tmp, 1); 84 | 85 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 86 | if (nthreads < n){ 87 | printf("\n============ NOT ENOUGH THREADS TO COVER n=%d ===============\n\n",n); 88 | } else { 89 | printf("Launching %d threads (n=%d)\n", nthreads, n); 90 | } 91 | 92 | double mu = 0.0; 93 | double sigma = 1.0; 94 | 95 | // simulate 'data' 96 | fill(cpu_vals, n); 97 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 98 | 99 | cudaDeviceSynchronize(); 100 | double tInit = read_timer(); 101 | 102 | // copy input data to the GPU 103 | cudaStat = cudaMemcpy(gpu_vals, cpu_vals, n*sizeof(double), cudaMemcpyHostToDevice); 104 | printf("Memory Copy from Host to Device "); 105 | if (cudaStat){ 106 | printf("failed.\n"); 107 | } else { 108 | printf("successful.\n"); 109 | } 110 | cudaDeviceSynchronize(); 111 | double tTransferToGPU = read_timer(); 112 | 113 | // do the calculation 114 | calc_loglik<<>>(gpu_vals, n, mu, sigma); 115 | cudaDeviceSynchronize(); 116 | double tCalc = read_timer(); 117 | 118 | cudaStat = cudaMemcpy(cpu_vals, gpu_vals, n, cudaMemcpyDeviceToHost); 119 | printf("Memory Copy from Device to Host "); 120 | if (cudaStat){ 121 | printf("failed.\n"); 122 | } else { 123 | printf("successful.\n"); 124 | } 125 | cudaDeviceSynchronize(); 126 | double tTransferFromGPU = read_timer(); 127 | 128 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 129 | 130 | // do calculation on CPU for comparison (unfair as this will only use one core) 131 | fill(cpu_vals, n); 132 | double tInit2 = read_timer(); 133 | calc_loglik_cpu(cpu_vals, n, mu, sigma); 134 | double tCalcCPU = read_timer(); 135 | 136 | printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 137 | 138 | printf("Timing results for n = %d\n", n); 139 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit); 140 | printf("Calculation time (GPU): %f\n", tCalc - tTransferToGPU); 141 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 142 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tCalc); 143 | 144 | printf("Freeing memory...\n"); 145 | printf("====================================================\n"); 146 | free(cpu_vals); 147 | cudaFree(gpu_vals); 148 | 149 | } 150 | printf("\n\nFinished.\n\n"); 151 | return 0; 152 | } 153 | 154 | 155 | 
-------------------------------------------------------------------------------- /kernelExample.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | import scipy as sp 5 | from scipy.stats import norm 6 | from pycuda.compiler import SourceModule 7 | import math 8 | 9 | # Here's the kernel, essentially identical to that used in the CUDA and RCUDA examples 10 | 11 | mod = SourceModule(""" 12 | #include 13 | #define SQRT_TWO_PI 2.506628274631000 14 | __global__ void calc_loglik(double *vals, double *x, int n, double mu, double sigma, int dbg) 15 | { 16 | // note that this assumes no third dimension to the grid 17 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 18 | // size of each block (within grid of blocks) 19 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 20 | // id of thread in a given block 21 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 22 | // assign overall id/index of the thread 23 | int idx = myblock * blocksize + subthread; 24 | 25 | if (idx < n) { 26 | if (dbg){ 27 | printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",idx,idx,x[idx],n,mu,sigma); 28 | } 29 | double std = (x[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } else { 33 | if (dbg){ 34 | printf("thread idx: %04d\\t (>=n=%d)\\n",idx,n); 35 | } 36 | } 37 | return; 38 | } 39 | """) 40 | 41 | calc_loglik = mod.get_function("calc_loglik") 42 | 43 | # Arguments must be numpy datatypes i.e., n = 1000 will not work! 44 | 45 | n = np.int32(134217728) 46 | 47 | # Threads per block and number of blocks: 48 | threads_per_block = int(1024) 49 | block_dims = (threads_per_block, 1, 1) 50 | grid_d = int(math.ceil(math.sqrt(n/threads_per_block))) 51 | grid_dims = (grid_d, grid_d, 1) 52 | 53 | 54 | print("Generating random normals...") 55 | x = np.random.normal(size = n) 56 | 57 | # Evaluate at N(0.3, 1.5) 58 | 59 | mu = np.float64(0.3) 60 | sigma = np.float64(1.5) 61 | dbg = False # True 62 | verbose = np.int32(dbg) 63 | 64 | # Allocate storage for the result: 65 | 66 | out = np.zeros_like(x) 67 | 68 | # Create two timers: 69 | start = drv.Event() 70 | end = drv.Event() 71 | 72 | # Launch the kernel 73 | print("Running GPU code...") 74 | start.record() 75 | 76 | calc_loglik(drv.Out(out), drv.In(x), n, mu, sigma, verbose, block= block_dims, grid = grid_dims) 77 | 78 | end.record() # end timing 79 | # calculate the run length 80 | end.synchronize() 81 | 82 | gpu_secs = start.time_till(end)*1e-3 83 | print("Time for calculation (GPU): %fs" % gpu_secs) 84 | 85 | # Scipy version: 86 | print("Running Scipy CPU code...") 87 | start.record() 88 | out2 = norm.pdf(x, loc = mu, scale = sigma) 89 | end.record() # end timing 90 | # calculate the run length 91 | end.synchronize() 92 | cpu_secs = start.time_till(end)*1e-3 93 | print("Time for calculation (CPU): %fs" % cpu_secs) 94 | 95 | print("Output from GPU: %f %f %f" % (out[0], out[1], out[2])) 96 | print("Output from CPU: %f %f %f" % (out2[0], out2[1], out2[2])) 97 | 98 | 99 | -------------------------------------------------------------------------------- /magmaExample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include "cublas_v2.h" 8 | 9 | #include "magma.h" 10 | #include "magma_lapack.h" 11 | 12 | // compile as: 13 | // gcc 
magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma -llapack -lblas -lcublas -lcudart -o magmaExample 14 | 15 | 16 | double read_timer() { 17 | struct timeval end; 18 | gettimeofday( &end, NULL ); 19 | return end.tv_sec+1.e-6*end.tv_usec; 20 | } 21 | 22 | // BLAS/LAPACK functions for matrix multiply and Cholesky 23 | // not needed as these are in magma_dlapack.h 24 | // void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); 25 | // int dpotrf_(char* uplo, int* n, double* a, int* lda, int* info); 26 | 27 | void fillMatrix( double *p, int n ) { 28 | int i; 29 | srand48(0); 30 | for( i = 0; i < n; i++ ) 31 | p[i] = 2*drand48()-1; 32 | } 33 | 34 | 35 | int main( int argc, char **argv ) { 36 | printf("Starting\n"); 37 | int size; 38 | cudaError_t cudaStat; 39 | magma_int_t magmaStat; 40 | cublasStatus_t stat; 41 | cublasHandle_t handle; 42 | int it,i; 43 | 44 | cublasOperation_t N = 'N'; 45 | cublasOperation_t T = 'T'; 46 | char N2 = 'N'; 47 | char T2 = 'T'; 48 | 49 | double one = 1., zero=0.; 50 | char uplo = 'L'; 51 | int info; 52 | 53 | int err; double* A; double* B; 54 | magmaStat = magma_init(); 55 | 56 | int use_pinned; 57 | if(argc > 1) { 58 | use_pinned = atoi(argv[1]); 59 | } else use_pinned = 0; 60 | printf("Setting use_pinned to %d\n", use_pinned); 61 | 62 | for( size = 512; size <= 8192; size*=4 ) { 63 | 64 | if(use_pinned) { 65 | // allocate pinned memory on CPU 66 | err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); 67 | err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); 68 | } else { 69 | // allocate standard memory on CPU 70 | A = (double*) malloc( sizeof(double)*size*size ); 71 | B = (double*) malloc( sizeof(double)*size*size ); 72 | } 73 | 74 | cudaDeviceSynchronize(); 75 | double tInit = read_timer(); 76 | double *dA,*dB; 77 | // allocate memory on GPU 78 | magma_malloc( (void**) &dA, sizeof(double)*size*size ); 79 | magma_malloc( (void**) &dB, sizeof(double)*size*size ); 80 | 81 | cudaDeviceSynchronize(); 82 | double tAlloc = read_timer(); 83 | 84 | fillMatrix(B, size*size); 85 | 86 | 87 | cudaDeviceSynchronize(); 88 | double tInit2 = read_timer(); 89 | 90 | // transfer data to GPU 91 | magma_dsetmatrix( size, size, B, size, dB, size ); 92 | 93 | cudaDeviceSynchronize(); 94 | double tTransferToGPU = read_timer(); 95 | 96 | // matrix multiply 97 | magmablas_dgemm(MagmaNoTrans, MagmaTrans, size, size, size, one, dB, size, dB, size, zero, dA, size ); 98 | // magma_dgemm may be more general in terms of being able to call GPU or MIC 99 | 100 | cudaDeviceSynchronize(); 101 | double tMatMult = read_timer(); 102 | 103 | // Cholesky decomposition on GPU with GPU interface (called with object on GPU) 104 | magma_dpotrf_gpu(MagmaLower, size, dA, size, &info ); 105 | 106 | cudaDeviceSynchronize(); 107 | double tChol = read_timer(); 108 | 109 | // transfer data back to CPU 110 | magma_dgetmatrix( size, size, dA, size, A, size ); 111 | cudaDeviceSynchronize(); 112 | double tTransferFromGPU = read_timer(); 113 | 114 | // standard BLAS matrix multiply on CPU 115 | dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); 116 | 117 | cudaDeviceSynchronize(); 118 | double tMatMultBlas = read_timer(); 119 | 120 | // Cholesky decomposition on GPU with CPU interface (called with object on CPU) 121 | magma_dpotrf(MagmaLower, size, A, size, &info ); 122 | 123 | cudaDeviceSynchronize(); 
124 | double tCholCpuInterface = read_timer(); 125 | 126 | // recreate A = B * B (could just do a save and copy instead....) 127 | dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); 128 | 129 | cudaDeviceSynchronize(); 130 | double tInit3 = read_timer(); 131 | 132 | // standard Lapack Cholesky decomposition on CPU 133 | dpotrf_(&uplo, &size, A, &size, &info); 134 | 135 | cudaDeviceSynchronize(); 136 | double tCholCpu= read_timer(); 137 | 138 | 139 | printf("====================================================\n"); 140 | printf("Timing results for n = %d\n", size); 141 | printf("GPU memory allocation time: %f\n", tAlloc - tInit); 142 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); 143 | printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); 144 | printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); 145 | printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); 146 | printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); 147 | printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); 148 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); 149 | 150 | if(use_pinned) { 151 | magma_free_pinned(A); 152 | magma_free_pinned(B); 153 | } else { 154 | free(A); 155 | free(B); 156 | } 157 | magma_free(dA); 158 | magma_free(dB); 159 | 160 | } 161 | return EXIT_SUCCESS; 162 | } 163 | -------------------------------------------------------------------------------- /random.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | extern "C" 8 | { 9 | 10 | __global__ void setup_kernel(curandState *state, int seed, int n, int verbose) 11 | { 12 | // Usual block/thread indexing... 13 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 14 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 15 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 16 | int idx = myblock * blocksize + subthread; 17 | if (verbose){ 18 | printf("Setting up RNG in thread %d (n=%d)...\n",idx,n); 19 | } 20 | curand_init(seed, idx, 0, &state[idx]); 21 | return; 22 | } 23 | 24 | __global__ void rnorm_basic_kernel(curandState *state, double *vals, int n, double mu, double sigma) 25 | { 26 | // Usual block/thread indexing... 27 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 28 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 29 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 30 | int idx = myblock * blocksize + subthread; 31 | if (idx < n) { 32 | vals[idx] = mu + sigma * curand_normal_double(&state[idx]); 33 | } 34 | return; 35 | } 36 | 37 | 38 | __global__ void rnorm_kernel(curandState *state, double *vals, int n, double mu, double sigma, int numSamples) 39 | { 40 | // Usual block/thread indexing... 
41 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 42 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 43 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 44 | int idx = myblock * blocksize + subthread; 45 | int k; 46 | int startIdx = idx*numSamples; 47 | for(k = 0; k < numSamples; k++) { 48 | if(startIdx + k < n) 49 | vals[startIdx + k] = mu + sigma * curand_normal_double(&state[idx]); 50 | } 51 | return; 52 | } 53 | 54 | } // END extern 55 | 56 | -------------------------------------------------------------------------------- /savio-job-template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=test-gpu 3 | #SBATCH --partition=savio2_gpu 4 | #SBATCH --account=ac_scsguest 5 | #SBATCH --nodes=1 6 | #SBATCH --time=02:30:00 7 | #SBATCH --mail-user=paciorek@stat.berkeley.edu 8 | 9 | module load cuda 10 | module unload intel # do this to avoid compilation issues 11 | 12 | # insert code here to run your computations 13 | -------------------------------------------------------------------------------- /savio.sh: -------------------------------------------------------------------------------- 1 | pledge # use 1x2 2 | ssh paciorek@hpc.brc.berkeley.edu 3 | 4 | 5 | module unload intel 6 | module load cuda 7 | 8 | sacctmgr -p show associations user=paciorek 9 | 10 | srun -A ac_scsguest -p savio2_gpu -N 1 -t 30:0 --pty bash 11 | srun -u -A ac_scsguest -p savio2_gpu -N 1 -t 30:0 bash -i 12 | 13 | alias gtop=\"nvidia-smi -q -d UTILIZATION -l 1\" 14 | alias gmem=\"nvidia-smi -q -d MEMORY -l 1\" 15 | 16 | nvcc ${CUDA_DIR}/samples/1_Utilities/deviceQuery/deviceQuery.cpp -I${CUDA_DIR}/include -I${CUDA_DIR}/samples/common/inc -o deviceQuery 17 | 18 | --------------------------------------------------------------------------------