├── README.md ├── RNGexample.R ├── build-bce-gpu.sh ├── calc_loglik.cu ├── cudaBlasExample.c ├── example ├── alphas.csv ├── compute_probs.cu ├── compute_probs_float.cu ├── compute_probs_unitStrides.cu ├── compute_probs_unitStrides_sharedMem.cu ├── compute_probs_unitStrides_sharedMem_float.cu ├── example_RCUDA.R ├── example_Rcpp.R ├── example_pureR.R └── setup_calc.R ├── gpu.Rmd ├── gpu.html ├── gpuArrayExample.py ├── helloWorld.cu ├── kernelExample-pinned.cu ├── kernelExample.R ├── kernelExample.cu ├── kernelExample.py ├── magmaExample.c ├── random.cu ├── savio-job-template.sh └── savio.sh /README.md: -------------------------------------------------------------------------------- 1 | # gpu-workshop-2016 2 | Materials for workshop on GPU computation for statistics, data science, machine learning applications. Please see gpu.html to be guided through the materials. 3 | 4 | Session 1: Monday, Feb. 1, 4:10 - 5:30 pm in Evans 1011 5 | * Introduction to GPU resources that are available (Savio, Amazon EC2) 6 | * Basics of using GPUs with C, R, and Python 7 | 8 | Session 2: Monday Feb. 8, 4:10 - 5:30 pm in Evans 1011 9 | * Use of packages such as Caffe, TensorFlow, etc. that use GPUs for 10 | back-end computation 11 | * Discussion of use cases by those using GPUs currently 12 | * Optimizing GPU usage 13 | 14 | The workshop will be an introduction to using GPUs and will assume no 15 | previous knowledge of GPUs. I will assume familiarity with either R, 16 | C, or Python and at least modest familiarity with operating in a UNIX 17 | environment. The goal is to get folks up to speed on using GPUs, and 18 | we'll cover basic techniques for using a GPU with R, C, and Python. 19 | 20 | -------------------------------------------------------------------------------- /RNGexample.R: -------------------------------------------------------------------------------- 1 | library(RCUDA) 2 | 3 | cat("Setting cuGetContext(TRUE)...\n") 4 | cuGetContext(TRUE) 5 | 6 | ptx = nvcc("random.cu", out = "random.ptx", target = "ptx", 7 | "-arch=compute_20", "-code=sm_20,compute_20") 8 | 9 | mod = loadModule(ptx) 10 | 11 | setup = mod$setup_kernel 12 | rnorm = mod$rnorm_kernel 13 | 14 | n = as.integer(1e8) # NOTE 'n' is of type integer 15 | n_per_thread = as.integer(1000) 16 | 17 | mu = 0.3 18 | sigma = 1.5 19 | 20 | verbose = FALSE 21 | 22 | # setting grid and block dimensions 23 | threads_per_block <- as.integer(1024) 24 | block_dims <- c(threads_per_block, as.integer(1), as.integer(1)) 25 | grid_d <- as.integer(ceiling(sqrt((n/n_per_thread)/threads_per_block))) 26 | 27 | grid_dims <- c(grid_d, grid_d, as.integer(1)) 28 | 29 | cat("Grid size:\n") 30 | print(grid_dims) 31 | 32 | nthreads <- as.integer(prod(grid_dims)*prod(block_dims)) 33 | cat("Total number of threads to launch = ", nthreads, "\n") 34 | if (nthreads*n_per_thread < n){ 35 | stop("Grid is not large enough...!") 36 | } 37 | 38 | cat("Running CUDA kernel...\n") 39 | 40 | seed = as.integer(0) 41 | 42 | 43 | tRNGinit <- system.time({ 44 | rng_states <- cudaMalloc(numEls=nthreads, sizeof=as.integer(48), elType="curandState") 45 | .cuda(setup, rng_states, seed, nthreads, as.integer(verbose), gridDim=grid_dims, blockDim=block_dims) 46 | cudaDeviceSynchronize() 47 | }) 48 | 49 | tAlloc <- system.time({ 50 | dX = cudaMalloc(n, sizeof = as.integer(8), elType = "double", strict = TRUE) 51 | cudaDeviceSynchronize() 52 | }) 53 | 54 | tCalc <- system.time({ 55 | .cuda(rnorm, rng_states, dX, n, mu, sigma, n_per_thread, gridDim=grid_dims, blockDim=block_dims,.numericAsDouble 
= getOption("CUDA.useDouble", TRUE)) 56 | cudaDeviceSynchronize() 57 | }) 58 | 59 | tTransferFromGPU <- system.time({ 60 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "double") 61 | cudaDeviceSynchronize() 62 | }) 63 | 64 | 65 | tCPU <- system.time({ 66 | out2 <- rnorm(n, mu, sigma) 67 | }) 68 | 69 | 70 | 71 | cat("RNG initiation time: ", tRNGinit[3], "\n") 72 | cat("GPU memory allocation time: ", tAlloc[3], "\n") 73 | cat("Calculation time (GPU): ", tCalc[3], "\n") 74 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 75 | cat("Calculation time (CPU): ", tCPU[3], "\n") 76 | 77 | -------------------------------------------------------------------------------- /build-bce-gpu.sh: -------------------------------------------------------------------------------- 1 | # start BCE-2015-fall from AWS console on a g2.2xlarge 2 | # $0.65/hour; 4 Gb video RAM, 1536 CUDA cores 3 | 4 | # make sure to increase space for home directory by requesting more when start instance, e.g. 30 Gb 5 | 6 | # set variable holding IP address 7 | # export ip=52-32-169-154 8 | 9 | # ssh to the Amazon instance 10 | # ssh -i ~/.ssh/ec2_rsa ubuntu@ec2-${ip}.us-west-2.compute.amazonaws.com 11 | 12 | sudo su 13 | 14 | # install CUDA 15 | wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1504/x86_64/cuda-repo-ubuntu1504_7.5-18_amd64.deb 16 | dpkg -i cuda-repo-ubuntu1504_7.5-18_amd64.deb 17 | 18 | apt-get update 19 | date >> /tmp/date 20 | apt-get install -y cuda # a bit less than 10 mins 21 | date >> /tmp/date 22 | 23 | rm -rf cuda-repo-ubuntu1504_7.5-18_amd64.deb 24 | 25 | 26 | # set up some utilities for monitoring the GPU 27 | echo "" >> ~ubuntu/.bashrc 28 | echo "export PATH=${PATH}:/usr/local/cuda/bin" >> ~ubuntu/.bashrc 29 | echo "" >> ~ubuntu/.bashrc 30 | echo "alias gtop=\"nvidia-smi -q -d UTILIZATION -l 1\"" >> ~ubuntu/.bashrc 31 | echo "" >> ~ubuntu/.bashrc 32 | echo "alias gmem=\"nvidia-smi -q -d MEMORY -l 1\"" >> ~ubuntu/.bashrc 33 | 34 | # set up access to CUDA shared libraries 35 | echo "/usr/local/cuda/lib64" >> /etc/ld.so.conf.d/cuda.conf 36 | ldconfig 37 | 38 | exit # back to ubuntu user 39 | 40 | # reboot the instance 41 | 42 | gtop 43 | 44 | # gtop result without reboot will error: 45 | #modprobe: ERROR: could not insert 'nvidia_352': No such device 46 | #NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running. 47 | 48 | # create deviceQuery executable 49 | sudo /usr/local/cuda/bin/nvcc /usr/local/cuda/samples/1_Utilities/deviceQuery/deviceQuery.cpp -I/usr/local/cuda/include -I/usr/local/cuda/samples/common/inc -o /usr/local/cuda/bin/deviceQuery 50 | 51 | deviceQuery 52 | 53 | # install PyCUDA 54 | pip install pycuda 55 | 56 | # install RCUDA 57 | sudo su 58 | 59 | cd /tmp 60 | git clone https://github.com/duncantl/RCUDA 61 | git clone https://github.com/omegahat/RAutoGenRunTime 62 | 63 | cd RCUDA/src 64 | ln -s ../../RAutoGenRunTime/src/RConverters.c . 65 | ln -s ../../RAutoGenRunTime/inst/include/RConverters.h . 66 | ln -s ../../RAutoGenRunTime/inst/include/RError.h . 67 | 68 | cd ../.. 
69 | 70 | Rscript -e "install.packages('bitops', repos = 'https://cran.cnr.berkeley.edu')" 71 | 72 | R CMD build RCUDA 73 | R CMD build RAutoGenRunTime 74 | R CMD INSTALL RAutoGenRunTime_0.3-0.tar.gz 75 | R CMD INSTALL RCUDA_0.4-0.tar.gz 76 | 77 | # install MAGMA 78 | export PATH=${PATH}:/usr/local/cuda/bin 79 | 80 | MAGMA_VERSION=1.7.0 81 | cd /usr/local 82 | mkdir magma-${MAGMA_VERSION} 83 | ln -s magma-${MAGMA_VERSION} magma 84 | cd /usr/src 85 | mkdir magma-${MAGMA_VERSION} 86 | ln -s magma-${MAGMA_VERSION} magma 87 | cd magma 88 | wget http://icl.cs.utk.edu/projectsfiles/magma/downloads/magma-${MAGMA_VERSION}.tar.gz 89 | tar -xvzf magma-${MAGMA_VERSION}.tar.gz 90 | cd magma-${MAGMA_VERSION} 91 | # note I added -fPIC per the magma README to enable creation of a shared object 92 | cp make.inc.openblas make.inc 93 | sed -i 's/-lopenblas/-llapack -lblas -lstdc++ -lm -lgfortran/' make.inc 94 | sed -i 's/#GPU_TARGET.*/GPU_TARGET = Kepler/' make.inc 95 | sed -i 's/.*(CUDADIR)\/lib64/LIBDIR\t\= -L$(CUDADIR)\/lib64/' make.inc 96 | sed -i 's/.*OPENBLASDIR.*//' make.inc 97 | sed -i 's/.*make.check-openblas.*//' make.inc 98 | # make NVCCFLAGS look like: 99 | # NVCCFLAGS = -O3 -DADD_ -Xcompiler "-fno-strict-aliasing $(FPIC)" 100 | 101 | export CUDADIR=/usr/local/cuda 102 | make shared 2>&1 | tee ../make.shared.log 103 | make test 2>&1 | tee ../make.test.log 104 | make install prefix=/usr/local/magma 2>&1 | tee ../make.install.log 105 | 106 | cd /usr/local/magma 107 | chmod ugo+r include/* 108 | 109 | echo "/usr/local/magma/lib" >> /etc/ld.so.conf.d/SITE-magma.conf 110 | ldconfig 111 | 112 | 113 | #### Create image ########################## 114 | 115 | # 1) now save the image in us-west-2 via point and click on VM page under Actions 116 | # 2) make it public 117 | 118 | -------------------------------------------------------------------------------- /calc_loglik.cu: -------------------------------------------------------------------------------- 1 | #define SQRT_TWO_PI 2.506628274631000 2 | extern "C" 3 | __global__ void calc_loglik(double* vals, int N, double mu, double sigma) { 4 | // note that this assumes no third dimension to the grid 5 | // id of the block 6 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 7 | // size of each block (within grid of blocks) 8 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 9 | // id of thread in a given block 10 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 11 | // assign overall id/index of the thread 12 | int idx = myblock * blocksize + subthread; 13 | 14 | if(idx < N) { 15 | double std = (vals[idx] - mu)/ sigma; 16 | double e = exp( - 0.5 * std * std); 17 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /cudaBlasExample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include "cublas_v2.h" 8 | 9 | // compile as: 10 | // export PATH=$PATH:/usr/local/cuda/bin 11 | // nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample 12 | 13 | 14 | double read_timer() { 15 | struct timeval end; 16 | gettimeofday( &end, NULL ); 17 | return end.tv_sec+1.e-6*end.tv_usec; 18 | } 19 | 20 | void fillMatrix( double *p, int n ) { 21 | int i; 22 | srand48(0); 23 | for( i = 0; i < n; i++ ) 24 | p[i] = 2*drand48()-1; 25 | } 26 | 27 | int main( int argc, char **argv ) { 28 | printf("Starting\n"); 29 | 
int size; 30 | cudaError_t cudaStat; 31 | cublasStatus_t stat; 32 | cublasHandle_t handle; 33 | int it; 34 | 35 | cublasOperation_t N = 'N'; 36 | cublasOperation_t T = 'T'; 37 | double one = 1., zero=0.; 38 | 39 | for( size = 512; size <= 8192; size*=4 ) { 40 | 41 | // allocate memory on host (CPU) 42 | double *A = (double*) malloc( sizeof(double)*size*size ); 43 | double *B = (double*) malloc( sizeof(double)*size*size ); 44 | 45 | cudaDeviceSynchronize(); 46 | double tInit = read_timer(); 47 | 48 | double *dA,*dB; 49 | // allocate memory on device (GPU) 50 | cudaStat = cudaMalloc((void**)&dA, sizeof(double)*size*size); 51 | if(cudaStat != cudaSuccess) { 52 | printf ("device memory allocation failed"); 53 | return EXIT_FAILURE; 54 | } 55 | cudaStat = cudaMalloc((void**)&dB, sizeof(double)*size*size); 56 | if(cudaStat != cudaSuccess) { 57 | printf ("device memory allocation failed"); 58 | return EXIT_FAILURE; 59 | } 60 | 61 | // wait until previous CUDA commands on GPU threads have finished 62 | // this allows us to do the timing correctly 63 | cudaDeviceSynchronize(); 64 | 65 | double tAlloc = read_timer(); 66 | 67 | 68 | // initialization of CUBLAS 69 | stat = cublasCreate(&handle); 70 | if(stat != CUBLAS_STATUS_SUCCESS) { 71 | printf ("CUBLAS initialization failed\n"); 72 | return EXIT_FAILURE; 73 | } 74 | 75 | // create our test matrix on the CPU 76 | fillMatrix(B, size*size); 77 | 78 | cudaDeviceSynchronize(); 79 | double tInit2 = read_timer(); 80 | 81 | 82 | // copy matrix to GPU, with dB the pointer to the object on the GPU 83 | stat = cublasSetMatrix (size, size, sizeof(double), B, size, dB, size); 84 | if(stat != CUBLAS_STATUS_SUCCESS) { 85 | printf ("data download failed"); 86 | cudaFree (dB); 87 | cublasDestroy(handle); 88 | return EXIT_FAILURE; 89 | } 90 | 91 | cudaDeviceSynchronize(); 92 | double tTransferToGPU = read_timer(); 93 | 94 | // call cublas matrix multiply (dA = dB * dB) 95 | cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, size, size, size, &one, dB, size, dB, size, &zero, dA, size ); 96 | 97 | cudaDeviceSynchronize(); 98 | double tMatMult = read_timer(); 99 | 100 | // transfer matrix back to CPU 101 | stat = cublasGetMatrix (size, size, sizeof(double), dA, size, A, size); 102 | if(stat != CUBLAS_STATUS_SUCCESS) { 103 | printf ("data upload failed"); 104 | cudaFree(dA); 105 | cublasDestroy(handle); 106 | return EXIT_FAILURE; 107 | } 108 | 109 | cudaDeviceSynchronize(); 110 | double tTransferFromGPU = read_timer(); 111 | 112 | printf("====================================================\n"); 113 | printf("Timing results for n = %d\n", size); 114 | printf("GPU memory allocation time: %f\n", tAlloc - tInit); 115 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); 116 | printf("Matrix multiply time: %f\n", tMatMult - tTransferToGPU); 117 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tMatMult); 118 | 119 | 120 | // free memory on GPU and CPU 121 | cudaFree(dA); 122 | cudaFree(dB); 123 | cublasDestroy(handle); 124 | free(A); 125 | free(B); 126 | 127 | } 128 | return EXIT_SUCCESS; 129 | } 130 | -------------------------------------------------------------------------------- /example/compute_probs.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(double* alphas, double* rands, double* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | 6 | if(i < n) { 7 | double maxval; 8 | int 
m, k; 9 | int maxind; 10 | double M_d = (double) M; 11 | double* w = new double[K]; 12 | 13 | for(k = 0; k < K; ++k){ // initialize probs (though already done on CPU) 14 | probs[i*K + k] = 0.0; 15 | } 16 | 17 | // core computations 18 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 19 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 20 | w[k] = alphas[i*K + k] + rands[m*K + k]; 21 | } 22 | 23 | // determine which category has max W 24 | maxind = K-1; 25 | maxval = w[K-1]; 26 | for(k = 0; k < (K-1); ++k){ 27 | if(w[k] > maxval){ 28 | maxind = k; 29 | maxval = w[k]; 30 | } 31 | } 32 | probs[i*K + maxind] += 1.0; 33 | } 34 | 35 | // compute final proportions 36 | for(k = 0; k < K; ++k) { 37 | probs[i*K + k] /= M_d; 38 | } 39 | free(w); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /example/compute_probs_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(float* alphas, float* rands, float* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | 6 | if(i < n) { 7 | float maxval; 8 | int m, k; 9 | int maxind; 10 | float M_d = (float) M; 11 | float* w = new float[K]; 12 | 13 | for(k = 0; k < K; ++k){ // initialize probs (though already done on CPU) 14 | probs[i*K + k] = 0.0; 15 | } 16 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 17 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 18 | w[k] = alphas[i*K + k] + rands[m*K + k]; 19 | } 20 | 21 | // determine which category has max W 22 | maxind = K-1; 23 | maxval = w[K-1]; 24 | for(k = 0; k < (K-1); ++k){ 25 | if(w[k] > maxval){ 26 | maxind = k; 27 | maxval = w[k]; 28 | } 29 | } 30 | probs[i*K + maxind] += 1.0; 31 | } 32 | // compute final proportions 33 | for(k = 0; k < K; ++k) { 34 | probs[i*K + k] /= M_d; 35 | } 36 | free(w); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /example/compute_probs_unitStrides.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(double* alphas, double* rands, double* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | 6 | if(i < n) { 7 | double maxval; 8 | int m, k; 9 | int maxind; 10 | double M_d = (double) M; 11 | double* w = new double[K]; 12 | 13 | for(k = 0; k < K; ++k){ // initialize probs (though already done on CPU) 14 | probs[k*n + i] = 0.0; 15 | } 16 | 17 | // core computations 18 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 19 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 20 | // with +i we now have unit strides in inner loop 21 | w[k] = alphas[k*n + i] + rands[k*M + m]; 22 | } 23 | 24 | // determine which category has max W 25 | maxind = K-1; 26 | maxval = w[K-1]; 27 | for(k = 0; k < (K-1); ++k){ 28 | if(w[k] > maxval){ 29 | maxind = k; 30 | maxval = w[k]; 31 | } 32 | } 33 | probs[maxind*n + i] += 1.0; 34 | } 35 | 36 | // compute final proportions 37 | for(k = 0; k < K; ++k) { 38 | // unit strides 39 | probs[k*n + i] /= M_d; 40 | } 41 | free(w); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /example/compute_probs_unitStrides_sharedMem.cu: -------------------------------------------------------------------------------- 1 | extern 
"C" 2 | __global__ void compute_probs(double* alphas, double* rands, double* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | int threads_per_block = blockDim.x; 6 | 7 | // set up shared memory: half for probs and half for w 8 | extern __shared__ double shared[]; 9 | double* probs_shared = shared; 10 | double* w = &shared[K*threads_per_block]; // shared mem is one big block, so need to index into latter portion of it to use for w 11 | 12 | 13 | if(i < n) { 14 | double maxval; 15 | int m, k; 16 | int maxind; 17 | double M_d = (double) M; 18 | 19 | // initialize shared memory probs 20 | for(k = 0; k < K; ++k) { 21 | probs_shared[k*threads_per_block + threadIdx.x] = 0.0; 22 | } 23 | 24 | // core computation 25 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 26 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 27 | w[k*threads_per_block + threadIdx.x] = alphas[k*n + i] + rands[k*M + m]; 28 | } 29 | maxind = K-1; 30 | maxval = w[(K-1)*threads_per_block + threadIdx.x]; 31 | for(k = 0; k < (K-1); ++k){ 32 | if(w[k*threads_per_block + threadIdx.x] > maxval){ 33 | maxind = k; 34 | maxval = w[k*threads_per_block + threadIdx.x]; 35 | } 36 | } 37 | probs_shared[maxind*threads_per_block + threadIdx.x] += 1.0; 38 | } 39 | 40 | for(k = 0; k < K; ++k) { 41 | probs_shared[k*threads_per_block + threadIdx.x] /= M_d; 42 | } 43 | 44 | // copy to device memory so can be returned to CPU 45 | for(k = 0; k < K; ++k) { 46 | probs[k*n + i] = probs_shared[k*threads_per_block + threadIdx.x]; 47 | } 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /example/compute_probs_unitStrides_sharedMem_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void compute_probs(float* alphas, float* rands, float* probs, int n, int K, int M) { 3 | // assign overall id/index of the thread = id of row 4 | int i = blockIdx.x * blockDim.x + threadIdx.x; 5 | int threads_per_block = blockDim.x; 6 | 7 | // set up shared memory: half for probs and half for w 8 | extern __shared__ float shared[]; 9 | float* probs_shared = shared; 10 | float* w = &shared[K*threads_per_block]; // shared mem is one big block, so need to index into latter portion of it to use for w 11 | 12 | 13 | if(i < n) { 14 | float maxval; 15 | int m, k; 16 | int maxind; 17 | float M_d = (float) M; 18 | 19 | // initialize shared memory probs 20 | for(k = 0; k < K; ++k) { 21 | probs_shared[k*threads_per_block + threadIdx.x] = 0.0; 22 | } 23 | 24 | // core computations 25 | for(m = 0; m < M; ++m){ // loop over Monte Carlo iterations 26 | for(k = 0; k < K; ++k){ // generate W ~ N(alpha, 1) 27 | w[k*threads_per_block + threadIdx.x] = alphas[k*n + i] + rands[k*M + m]; 28 | } 29 | maxind = K-1; 30 | maxval = w[(K-1)*threads_per_block + threadIdx.x]; 31 | for(k = 0; k < (K-1); ++k){ 32 | if(w[k*threads_per_block + threadIdx.x] > maxval){ 33 | maxind = k; 34 | maxval = w[k*threads_per_block + threadIdx.x]; 35 | } 36 | } 37 | probs_shared[maxind*threads_per_block + threadIdx.x] += 1.0; 38 | } 39 | 40 | for(k = 0; k < K; ++k) { 41 | probs_shared[k*threads_per_block + threadIdx.x] /= M_d; 42 | } 43 | 44 | // copy to device memory so can be returned to CPU 45 | for(k = 0; k < K; ++k) { 46 | probs[k*n + i] = probs_shared[k*threads_per_block + threadIdx.x]; 47 | } 48 | } 49 | 50 | } 51 | 
-------------------------------------------------------------------------------- /example/example_RCUDA.R: -------------------------------------------------------------------------------- 1 | # modification of one of the RCUDA examples to use use double precision 2 | 3 | library(RCUDA) 4 | 5 | if(!exists('unitStrides') || is.null(unitStrides)) unitStrides <- FALSE 6 | if(!exists('sharedMem') || is.null(sharedMem)) sharedMem <- FALSE 7 | if(!exists('float') || is.null(float)) float <- FALSE 8 | 9 | M <- as.integer(1e4) # important to have as integer! 10 | 11 | # get the alphas and generate the random numbers 12 | source('setup_calc.R') 13 | 14 | cat("Setting cuGetContext(TRUE)...\n") 15 | cuGetContext(TRUE) 16 | 17 | # compile the kernel into a form that RCUDA can load; equivalent to this nvcc call: 18 | # system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o compute_probs.ptx compute_probs.cu") 19 | 20 | fn <- "compute_probs" 21 | if(unitStrides) fn <- paste0(fn, "_unitStrides") 22 | if(sharedMem) fn <- paste0(fn, "_sharedMem") 23 | if(float) fn <- paste0(fn, "_float") 24 | ptx = nvcc(file = paste0(fn, ".cu"), out = 'compute_probs.ptx', 25 | target = "ptx", "-arch=compute_20", "-code=sm_20,compute_20") 26 | 27 | mod = loadModule(ptx) 28 | compute_probs = mod$compute_probs 29 | 30 | # setting grid and block dimensions 31 | threads_per_block <- as.integer(192) 32 | if(sharedMem) threads_per_block <- as.integer(96) # need fewer threads so that have enough room in 48Kb of shared memory 33 | block_dims <- c(threads_per_block, as.integer(1), as.integer(1)) 34 | grid_d <- as.integer(ceiling(n/threads_per_block)) 35 | 36 | grid_dims <- c(grid_d, as.integer(1), as.integer(1)) 37 | 38 | cat("Grid size:\n") 39 | print(grid_dims) 40 | 41 | nthreads <- prod(grid_dims)*prod(block_dims) 42 | cat("Total number of threads to launch = ", nthreads, "\n") 43 | if (nthreads < n){ 44 | stop("Grid is not large enough...!") 45 | } 46 | 47 | cat("Running CUDA kernel...\n") 48 | 49 | if(unitStrides) { 50 | probs <- matrix(0, nrow = n, ncol = K) 51 | tmp <- matrix(0, nrow = n, ncol = K) 52 | rands <- t(rands) 53 | alphas <- t(alphas) 54 | } else { 55 | probs <- matrix(0, nrow = K, ncol = n) 56 | tmp <- matrix(0, nrow = K, ncol = n) 57 | } 58 | 59 | if(!float) { 60 | strict = TRUE # for double 61 | cuCtxSetSharedMemConfig("CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE") 62 | } else strict = FALSE 63 | 64 | sharedMemSize <- as.integer( 65 | ifelse(float, 4, 8)*K*threads_per_block*2 66 | ) 67 | 68 | if(sharedMem && sharedMemSize > 48000) stop("trying to use too much shared memory") 69 | 70 | # basic usage with manual transfer 71 | tTransferToGPU <- system.time({ 72 | devAlphas = copyToDevice(alphas, strict = strict) 73 | devRands = copyToDevice(rands, strict = strict) 74 | devProbs = copyToDevice(probs, strict = strict) 75 | cudaDeviceSynchronize() 76 | }) 77 | tCalc <- system.time({ 78 | if(float) { 79 | .cuda(compute_probs, devAlphas, devRands, devProbs, 80 | n, K, M, gridDim = grid_dims, blockDim = block_dims, sharedMemBytes = ifelse(sharedMem, sharedMemSize, as.integer(0))) 81 | } else 82 | .cuda(compute_probs, devAlphas, devRands, devProbs, 83 | n, K, M, gridDim = grid_dims, blockDim = block_dims, sharedMemBytes = ifelse(sharedMem, sharedMemSize, as.integer(0)), .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 84 | cudaDeviceSynchronize() 85 | }) 86 | tTransferFromGPU <- system.time({ 87 | out = copyFromDevice(obj = devProbs, nels = devProbs@nels, type = "double") 88 | cudaDeviceSynchronize() 89 | }) 90 | 
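# Added note (not part of the original script): copyFromDevice returns a flat vector;
# to inspect the results as a matrix, it can be reshaped to match the layout used on the GPU, e.g.
#   probsGPU <- if (unitStrides) matrix(out, nrow = n, ncol = K) else matrix(out, nrow = K, ncol = n)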
91 | 92 | cat("Input values: ", alphas[1:3], "\n") 93 | cat("Output values: ", out[1:3], "\n") 94 | 95 | cat("Transfer to GPU time: ", tTransferToGPU[3], "\n") 96 | cat("Calculation time (GPU): ", tCalc[3], "\n") 97 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /example/example_Rcpp.R: -------------------------------------------------------------------------------- 1 | M <- 1e4 2 | 3 | source('setup_calc.R') 4 | 5 | require(Rcpp) 6 | require(inline) 7 | 8 | cppFunction(' 9 | NumericMatrix compute_probs_mp(NumericMatrix alpha, NumericMatrix rands, int M, int n, int K, int nProc){ 10 | 11 | NumericMatrix probs(n, K); 12 | int i; 13 | 14 | omp_set_num_threads(nProc); 15 | 16 | #pragma omp parallel for 17 | for(i = 0; i < n; ++i){ 18 | double max; 19 | int m, k; 20 | int maxind; 21 | NumericVector w(K); 22 | 23 | for(k = 0; k < K; ++k){ 24 | probs(i,k) = 0.0; 25 | } 26 | 27 | // core computation 28 | for(m = 0; m < M; ++m){ 29 | for(k = 0; k < K; ++k){ 30 | w(k) = alpha(i, k) + rands(m, k); 31 | } 32 | maxind = K-1; 33 | max = w(K-1); 34 | for(k = 0; k < (K-1); ++k){ 35 | if(w(k) > max){ 36 | maxind = k; 37 | max = w(k); 38 | } 39 | } 40 | probs(i,maxind) += 1.0; 41 | } 42 | 43 | for(k = 0; k < K; ++k) { 44 | probs(i,k) /= M; 45 | } 46 | } 47 | return probs; 48 | } 49 | ', plugins = c("openmp"), includes = c('#include ')) 50 | 51 | alphas <- t(alphas) 52 | rands <- t(rands) 53 | 54 | # 47 sec for 10000 55 | system.time({ 56 | props1 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 1) 57 | }) 58 | # 11.9 sec for 10000 59 | system.time({ 60 | props2 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 4) 61 | }) 62 | # 6.0 sec for 10000 63 | system.time({ 64 | props3 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 8) 65 | }) 66 | 67 | # using transposed alpha,rands doesn't change things much: 68 | 69 | cppFunction(' 70 | NumericMatrix compute_probs_mp2(NumericMatrix alpha, NumericMatrix rands, int M, int n, int K, int nProc){ 71 | 72 | NumericMatrix probs(K, n); 73 | int i; 74 | 75 | omp_set_num_threads(nProc); 76 | 77 | #pragma omp parallel for 78 | for(i = 0; i < n; ++i){ 79 | double max; 80 | int k, m; 81 | int maxind; 82 | NumericVector w(K); 83 | 84 | for(k = 0; k < K; ++k){ 85 | probs(k,i) = 0.0; 86 | } 87 | for(m = 0; m < M; ++m){ 88 | for(k = 0; k < K; ++k){ 89 | w(k) = alpha(k,i) + rands(k, m); 90 | } 91 | maxind = K-1; 92 | max = w(K-1); 93 | for(k = 0; k < (K-1); ++k){ 94 | if(w(k) > max){ 95 | maxind = k; 96 | max = w(k); 97 | } 98 | } 99 | probs(maxind, i) += 1.0; 100 | } 101 | for(k = 0; k < K; ++k) { 102 | probs(k,i) /= M; 103 | } 104 | } 105 | return probs; 106 | } 107 | ', plugins = c("openmp"), includes = c('#include ')) 108 | 109 | alphas <- t(alphas) 110 | rands <- t(rands) 111 | 112 | # 50 sec. 
for 10000 113 | system.time({ 114 | props1 <- compute_probs_mp2(alphas, rands, M, n, K, nProc = 1) 115 | }) 116 | # 12.3 sec for 10000 117 | system.time({ 118 | props2 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 4) 119 | }) 120 | # 6.2 sec for 10000 121 | system.time({ 122 | props3 <- compute_probs_mp(alphas, rands, M, n, K, nProc = 8) 123 | }) 124 | 125 | 126 | -------------------------------------------------------------------------------- /example/example_pureR.R: -------------------------------------------------------------------------------- 1 | M <- 1e3 2 | 3 | source('setup_calc.R') 4 | 5 | props2 <- props3 <- props 6 | 7 | # 2-3 sec per iteration for M = 1e6 8 | # 72 sec for M=1000 9 | system.time({ 10 | for(i in 1:n) { 11 | tmp <- alphas[ , i] + rands 12 | id <- apply(tmp, 2, which.max) 13 | tbl <- table(id) 14 | props[as.integer(names(tbl)) , i] <- tbl / n 15 | if(i %% 1000 == 0) print(c(i, date())) 16 | } 17 | }) 18 | 19 | # 57 sec for M=1000 20 | system.time({ 21 | for(i in 1:n) { 22 | tmp <- t(alphas[ , i] + rands) 23 | id <- rep(1, M) 24 | for(k in 2:K) { 25 | wh <- tmp[, k ] > tmp[ , 1 ] 26 | id[wh] <- k 27 | tmp[wh, 1] <- tmp[wh, k ] 28 | } 29 | tbl <- table(id) 30 | props2[as.integer(names(tbl)) , i] <- tbl / n 31 | if(i %% 1000 == 0) print(c(i, date())) 32 | } 33 | }) 34 | 35 | nProc <- 4 36 | 37 | library(doParallel) 38 | registerDoParallel(nProc) 39 | 40 | # 29 sec for M=1000, 4 cores 41 | # 14 sec for M=1000, 8 cores 42 | system.time({ 43 | props3 <- foreach(i = 1:n, .combine = cbind) %dopar% { 44 | tmp <- t(alphas[ , i] + rands) 45 | id <- rep(1, M) 46 | for(k in 2:K) { 47 | wh <- tmp[, k ] > tmp[ , 1 ] 48 | id[wh] <- k 49 | tmp[wh, 1] <- tmp[wh, k ] 50 | } 51 | tbl <- table(id) 52 | out <- rep(0, K) 53 | out[as.integer(names(tbl))] <- tbl / M 54 | if(i %% 1000 == 0) print(c(i, date())) 55 | out 56 | } 57 | }) 58 | -------------------------------------------------------------------------------- /example/setup_calc.R: -------------------------------------------------------------------------------- 1 | alphas <- t(as.matrix(read.csv('alphas.csv', header = FALSE))) 2 | 3 | n <- ncol(alphas) 4 | K <- nrow(alphas) 5 | 6 | props <- matrix(0, K, n) 7 | 8 | set.seed(0) 9 | 10 | system.time({ 11 | rands <- matrix(rnorm(M*K), nrow = K, ncol = M) 12 | }) 13 | 14 | -------------------------------------------------------------------------------- /gpu.Rmd: -------------------------------------------------------------------------------- 1 | Introduction to Computing with GPUs for Data Science 2 | ================================================================== 3 | Chris Paciorek, Statistical Computing Facility, Department of Statistics and Berkeley Research Computing, UC Berkeley 4 | 5 | Presented: February 1 and 8, 2016 6 | 7 | Last Revised: February 1, 2016 8 | 9 | 10 | ```{r setup, include=FALSE} 11 | opts_chunk$set(cache = TRUE) # because the compilation takes time, let's cache it 12 | ``` 13 | 14 | # 0) This Tutorial 15 | 16 | Materials for this tutorial, including the R markdown file that was used to create this document are available on github at [https://github.com/berkeley-scf/gpu-workshop-2016](https://github.com/berkeley-scf/gpu-workshop-2016). 
You can download the files by doing a git clone: 17 | ```{r clone, eval=FALSE, engine='bash'} 18 | git clone https://github.com/berkeley-scf/gpu-workshop-2016 19 | ``` 20 | 21 | To create this HTML document, simply compile the corresponding R Markdown file in R: 22 | ```{r rmd-compile, eval=FALSE} 23 | library(knitr) 24 | knit2html('gpu.Rmd') 25 | ``` 26 | 27 | or from the UNIX command line: 28 | ```{r rmd-compile-bash, engine='bash', eval=FALSE} 29 | Rscript -e "library(knitr); knit2html('gpu.Rmd')" 30 | ``` 31 | 32 | 33 | # 1) Introduction 34 | 35 | ### 1.1) Overview 36 | 37 | GPUs (Graphics Processing Units) are processing units originally designed for rendering graphics on a computer quickly. This is done by having a large number of simple processing units for massively parallel calculation. The idea of general purpose GPU (GPGPU) computing is to exploit this capability for general computation. 38 | 39 | We'll see both high-level and low-level ways to program calculations for implementation on the GPU. The basic context of GPU programming is "data parallelism", in which the same calculation is done to lots of pieces of data. This could be a mathematical calculation on millions of entries in a vector or a simulation with many independent simulations. Some examples of data parallelism include matrix multiplication (doing the multiplication task on many separate matrix elements) or numerical integration (doing a numerical estimate of the piecewise integral on many intervals/regions), as well as standard statistical calculations such as simulation studies, bootstrapping, random forests, etc. This kind of computation also goes by the name "SIMD" (single instruction, multiple data). 40 | 41 | ### 1.2) Hardware 42 | 43 | Two of the main suppliers of GPUs are NVIDIA and AMD. *CUDA* is a platform for programming on GPUs specifically for NVIDIA GPUs that allows you to send C/C++/Fortran code for execution on the GPU. *OpenCL* is an alternative that will work with a broader variety of GPUs. However, CUDA is quite popular, and there are a lot of tools designed for working with NVIDIA GPUs and based on CUDA, so we'll focus on CUDA here. 44 | 45 | GPUs have many processing units but somewhat limited memory. Also, they can only use data in their own memory, not in the CPU's memory, so one must transfer data back and forth between the CPU (the *host*) and the GPU (the *device*). This copying can, in some computations, constitute a very large fraction of the overall computation. So it is best to create the data and/or leave the data (for subsequent calculations) on the GPU when possible and to limit transfers. 46 | 47 | The current generation of NVIDIA GPUs is of the *Kepler* architecture (3rd generation). The 2nd generation was *Fermi* and the 1st was *Tesla*. (However note that *Tesla* is also used by NVIDIA to refer to different chip types). 48 | 49 | Originally GPUs supported only single precision (i.e., *float* calculations) but fortunately they now support double precision operations, and most of the examples here will use doubles to reduce the possibility of potential numerical issues, in particular with linear algebra calculations. But in many contexts, single precision will be fine, and the GPU will do computations more quickly with single precision. We'll explore this a bit later in the tutorial. 50 | 51 | ### 1.3) Software Tools 52 | 53 | Here are some of the useful software tools for doing computations on the GPU. 
54 | 55 | * CUDA - an extension of C/C++ for programming on an NVIDIA GPU 56 | * CUBLAS - a BLAS implementation for matrix-vector calculations on an NVIDIA GPU 57 | * CURAND - random number generation on an NVIDIA GPU 58 | * PyCUDA - a Python package providing a front-end for CUDA 59 | * RCUDA - an R package providing a front-end for CUDA 60 | * MAGMA - a package for combined CPU-GPU linear algebra, intended to be analogous to LAPACK + BLAS 61 | 62 | Note that RCUDA is still in development and is available on GitHub but not CRAN; it should be high-quality, as it is developed by Duncan Temple Lang at UC Davis. 63 | 64 | We'll see all of these in action. 65 | 66 | There are also: 67 | * OpenCL - an alternative to CUDA that can also be used with non-NVIDIA GPUs 68 | * CUDA Python (from Anaconda, but free for academic use) 69 | * PyOpenCL 70 | * R packages: OpenCL, gpuR, gmatrix, gputools 71 | * BIDMach - software for fast machine learning with a GPU back end available 72 | 73 | Finally, many of the popular machine learning packages focused on neural networks and deep learning can use GPUs behind the scenes; these include Theano, Caffe, Torch, TensorFlow, and mocha.jl, among others. 74 | 75 | Some of these, such as PyCUDA and RCUDA, allow you to easily interface to core CUDA code that you write yourself. Others, such as the other R packages and CUDA Python, allow you to program within R and Python but still use the GPU for some of the computation. Finally, tools such as the various machine learning packages hide the details of GPU usage from you and allow you to simply program in the environment of the software, with computations done on the GPU behind the scenes if a GPU is available. 76 | 77 | # 2) GPU hardware available at Berkeley 78 | 79 | ### 2.1) Department-specific GPUs 80 | 81 | #### Statistics 82 | 83 | The Statistical Computing Facility has a GPU on our high-priority cluster. We'll use this GPU in the demos here, though it is only available for Statistics affiliates. More details on using the GPU are available [here](http://statistics.berkeley.edu/computing/servers/gpu). 84 | 85 | #### Biostatistics 86 | 87 | Biostatistics has a GPU on one of its servers. Talk to Burke for more information. 88 | 89 | #### Economics 90 | 91 | The EML (Economics) has a GPU on one of the EML Linux servers that EML users can access. If this is of interest to you, email consult@econ.berkeley.edu, and I will work to get it set up analogously to the Statistics GPU and the Amazon virtual machine (see below) and to help you get started. 92 | 93 | 94 | ### 2.2) GPUs on the campus Linux cluster, Savio 95 | 96 | Savio recently purchased some nodes with GPUs. These are not yet available to the general public, but will soon be available to users affiliated with researchers who have purchased nodes on Savio and to users who are affiliated with faculty members using the faculty compute allowance. 97 | 98 | The general syntax for submitting a GPU-based job to Savio's SLURM-based scheduler is as follows. 99 | ``` 100 | sbatch -A account_name -p savio2_gpu -N 1 -t 60:0 job.sh 101 | ``` 102 | 103 | Alternatively, simply do `sbatch job.sh` and include the scheduling flags in your *job.sh*, as demonstrated in [savio-job-template.sh](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/savio-job-template.sh).
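For concreteness, a minimal *job.sh* along these lines might look like the sketch below. This is just an illustration, not the contents of *savio-job-template.sh*; substitute your own account name, time limit, and command (here the compiled *kernelExample* from Section 4.1).

```
#!/bin/bash
#SBATCH --job-name=gpu-example
#SBATCH --account=account_name
#SBATCH --partition=savio2_gpu
#SBATCH --nodes=1
#SBATCH --time=60:0
module unload intel
module load cuda
./kernelExample
```
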
104 | 105 | To figure out what to fill in for *account_name*, you can look up your accounts with 106 | ``` 107 | sacctmgr -p show associations user=${USER} 108 | ``` 109 | 110 | For an interactive session: 111 | ``` 112 | srun -A account_name --pty -p savio2_gpu -N1 -t 30:0 /bin/bash 113 | ``` 114 | 115 | Before doing any compilation involving CUDA code you generally want to change your environment modules: 116 | ``` 117 | module unload intel 118 | module load cuda 119 | ``` 120 | 121 | ### 2.3) GPUs through Amazon's EC2 service 122 | 123 | The *g2.2xlarge* Amazon EC2 instance types have a GPU with 1536 cores and 4 Gb memory, along with 8 CPU cores. There is also a *g2.8xlarge* that has four GPUs and 32 CPU cores. They can be pretty expensive unless you use spot instances - currently 65 cents per hour for g2.2xlarge and $2.60 per hour for g2.8xlarge in the us-west-2 region. The g2.2xlarge GPUs are pretty old chips, and I found that some of the examples included here ran a lot slower on the EC2 instance than on the Statistics GPU (and likely than Savio, but I haven't checked that as much). 124 | 125 | I've created an Amazon machine image (an AMI) that is the binary representation of the Linux Ubuntu operating system with support for GPU calculations. The AMI is based off of the [BCE virtual machine](bce.berkeley.edu) in use for a variety of projects and classes on campus. BCE provides a common set of software used in various data analysis/data science focused contexts, including Python and R. The BCE GPU AMI inherits this software and adds on various GPU-related software (in particular CUDA). Note also that the AMI is also similar to the SCF and EML Linux machines but with a reduced set of software. 126 | 127 | Based on the BCE-GPU AMI one can start up a virtual Linux machine that one can login to (see below for instructions) via SSH, just like any SCF/EML Linux server. If you were willing to pay Amazon and have an account, you can start a VM (in the Oregon [us-west-2] region) using the BCE GPU AMI by searching for *BCE-2015-fall-gpu* under "Public Images" at the [EC2 console](https://console.aws.amazon.com/ec2/v2/home?region=us-west-2#Images:). Then just launch a VM, selecting *g2.2xlarge* under the *GPU instances* tab. 128 | 129 | If you're interested in how to install CUDA-related software on an Ubuntu Linux machine, see [build-bce-gpu.sh](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/build-bce-gpu.sh) for the details of how I built the *BCE-2015-fall-gpu* image based on the *BCE-2015-fall* image. 130 | 131 | # 3) Some basics of GPU use 132 | 133 | ### 3.1) Getting information about the GPU 134 | 135 | First let's see how we get information about the GPU and activity on the GPU. 136 | 137 | #### Hardware specifications 138 | 139 | 140 | First, executing the following code as root will create an executable that will show you details on the GPU, including the possible block and grid dimensions (described shortly). 141 | 142 | ```{r deviceQuery, engine='bash', eval=FALSE} 143 | cd /usr/local/cuda/samples/1_Utilities/deviceQuery 144 | nvcc deviceQuery.cpp -I/usr/local/cuda/include \ 145 | -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery 146 | cd - 147 | ``` 148 | 149 | Once the *deviceQuery* executable is created, you can run it whenever you want. 150 | 151 | You'll see information such as the following. 152 | 153 | ``` 154 | paciorek@scf-sm20:~> deviceQuery 155 | deviceQuery Starting... 
156 | 157 | CUDA Device Query (Runtime API) version (CUDART static linking) 158 | 159 | Detected 1 CUDA Capable device(s) 160 | 161 | Device 0: "Tesla K20Xm" 162 | CUDA Driver Version / Runtime Version 7.0 / 7.0 163 | CUDA Capability Major/Minor version number: 3.5 164 | Total amount of global memory: 5760 MBytes (6039339008 bytes) 165 | (14) Multiprocessors, (192) CUDA Cores/MP: 2688 CUDA Cores 166 | GPU Max Clock rate: 732 MHz (0.73 GHz) 167 | Memory Clock rate: 2600 Mhz 168 | Memory Bus Width: 384-bit 169 | L2 Cache Size: 1572864 bytes 170 | Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096) 171 | Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers 172 | Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers 173 | Total amount of constant memory: 65536 bytes 174 | Total amount of shared memory per block: 49152 bytes 175 | Total number of registers available per block: 65536 176 | Warp size: 32 177 | Maximum number of threads per multiprocessor: 2048 178 | Maximum number of threads per block: 1024 179 | Max dimension size of a thread block (x,y,z): (1024, 1024, 64) 180 | Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535) 181 | Maximum memory pitch: 2147483647 bytes 182 | Texture alignment: 512 bytes 183 | Concurrent copy and kernel execution: Yes with 2 copy engine(s) 184 | Run time limit on kernels: No 185 | Integrated GPU sharing Host Memory: No 186 | Support host page-locked memory mapping: Yes 187 | Alignment requirement for Surfaces: Yes 188 | Device has ECC support: Enabled 189 | Device supports Unified Addressing (UVA): Yes 190 | Device PCI Domain ID / Bus ID / location ID: 0 / 2 / 0 191 | Compute Mode: 192 | < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) > 193 | 194 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 1, Device0 = Tesla K20Xm 195 | Result = PASS 196 | ``` 197 | 198 | #### Observing Performance on the GPU 199 | 200 | The following command will allow you to see some information analogous to *top* on the CPU. 201 | 202 | ```{r gtop, engine='bash', eval=FALSE} 203 | nvidia-smi -q -d UTILIZATION -l 1 204 | ``` 205 | 206 | Here's some example output when the GPU is idle: 207 | 208 | ```{r gtop output, engine='bash', eval=FALSE} 209 | ==============NVSMI LOG============== 210 | 211 | Timestamp : Mon Jan 25 17:45:12 2016 212 | Driver Version : 346.46 213 | 214 | Attached GPUs : 1 215 | GPU 0000:02:00.0 216 | Utilization 217 | Gpu : 0 % 218 | Memory : 0 % 219 | Encoder : 0 % 220 | Decoder : 0 % 221 | 222 | ``` 223 | 224 | Memory use based on the above does not seem to actually indicate how much of the overall GPU memory is in use for some reason. 
225 | 226 | Instead, to see how much memory is used on the GPU, the following will work: 227 | 228 | ```{r gmem, engine='bash', eval=FALSE} 229 | nvidia-smi -q -d MEMORY -l 1 230 | ``` 231 | 232 | Here's some example output when not much memory is in use on the GPU: 233 | 234 | ```{r gmem-output, engine='bash', eval=FALSE} 235 | ==============NVSMI LOG============== 236 | 237 | Timestamp : Thu Jan 28 12:06:24 2016 238 | Driver Version : 346.46 239 | 240 | Attached GPUs : 1 241 | GPU 0000:02:00.0 242 | FB Memory Usage 243 | Total : 5759 MiB 244 | Used : 12 MiB 245 | Free : 5747 MiB 246 | BAR1 Memory Usage 247 | Total : 256 MiB 248 | Used : 2 MiB 249 | Free : 254 MiB 250 | ``` 251 | 252 | 253 | ### 3.2) Overview of computation on a GPU 254 | 255 | The basic series of operations to use a GPU when writing your own GPU code is: 256 | * allocate memory on the GPU 257 | * transfer data from CPU to GPU 258 | * launch the CUDA kernel to operate on the threads, with a given block/grid arrangement 259 | * (optionally) launch another kernel, which can access data stored on the GPU, including results from the previous kernel 260 | * transfer results back to CPU 261 | 262 | The key computations are done in the *kernel*. Kernels are functions that encode the core computational operations that are executed in parallel. The basic mode of operation with a GPU when you are writing your own GPU code is to write a kernel using CUDA code and then call the kernel in parallel via C, R, or Python code. 263 | 264 | As outlined above, we need to pass any data from the CPU to the GPU and do the same in reverse to get the result. We'll also need to allocate memory on the GPU. However, in some cases the transfer and allocation will be done automatically behind the scenes. 265 | 266 | ### 3.3) Threads, Blocks, and Grids 267 | 268 | Programming on a GPU (in particular programming for efficiency) requires some understanding of how parallelization works on the GPU. Each individual computation or series of computations on the GPU is done in a thread. Threads are organized into blocks, and blocks of threads are organized in a grid. The blocks and grids can be 1-, 2-, or 3-dimensional. E.g., you might have a 1-d block of 256 threads, with a grid of 3 x 3 such blocks, for a total of $256 \times 9 = 2304$ threads. The choice of the grid/block arrangement can affect efficiency. I'm not an expert at this level of detail, but we'll see a bit about this in the worked example. Note that using more than 1-dimensional grids and blocks is purely for the conceptual convenience of the programmer and doesn't correspond to anything on the hardware. So for the most part we'll use a one-dimensional grid of blocks and one-dimensional blocks of threads. 269 | In general you'd want each independent calculation done in a separate thread, though as we'll see in Section 5 on simulation, one might want to do a sequence of calculations on each thread. In general, you'll want to pipeline together multiple operations within a computation to avoid copying from CPU to GPU and back. Alternatively, this can be done by keeping the data on the GPU and calling a second kernel. 270 | 271 | Threads are quick to start, and to get efficiency you want to have thousands of threads to exploit the parallelism of the GPU hardware. In general your calculations will have more threads than GPU cores; the GPU will manage the process of executing all the threads. 272 | 273 | This can all get quite complicated, with the possibility for communication amongst threads.
Threads within a block have some (48Kb) of shared memory (distinct from the main GPU memory) and can synchronize with each other, while threads in different blocks cannot cooperate. We'll see some basic examples of this in our working example later. The Suchard et al. paper referenced in the last Section discusses how to get more efficiency by having threads within a block cooperate and access shared memory, which is much faster than accessing the main GPU (device) memory. 274 | 275 | If we go back to the *deviceQuery* output, we'll see information on the number of physical CUDA cores and main GPU memory as well as information about the maximum threads per block and the maximum dimensions of thread blocks and grids. 276 | 277 | ### 3.4) "Hello, world" using CUDA directly 278 | 279 | First let's see a 'Hello, World' example that illustrates blocks of threads and grids of blocks. 280 | 281 | The idea is to have at least as many threads as the number of computations you are doing. Our kernel function contains the core calculation we want to do (in this case printing 'Hello world!') and code that figures out identifying information for each thread as discussed next. 282 | 283 | When we write a kernel, we will need to have some initial code that determines a unique ID for that thread that allows the thread to access the appropriate part(s) of the data object(s) on the GPU and 'know' what part of the computation it should do. This is done based on information stored in variables that CUDA provides that have information about the thread and block indices and block and grid dimensions. 284 | 285 | Here's the [example code (helloWorld.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/helloWorld.cu). 286 | 287 | In this case, compilation is as follows. Given the CUDA functionality used in the code (in particular the call to *printf* within the kernel), we need to specify compilation for a *compute capability* >= 2.0 (corresponding to the Fermi generation of NVIDIA GPUs) (more below). Note that our query above indicated that the GPU we are using has capability 3.5, so this constraint is fine. 288 | 289 | ```{r, helloWorld-compile, engine='bash', eval=FALSE} 290 | nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld 291 | ``` 292 | 293 | The result of this looks like: 294 | ```{r, helloWorld-output, eval=FALSE, engine='bash'} 295 | Launching 20480 threads (N=20000) 296 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(448,0,0) => thread index=1984 297 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(449,0,0) => thread index=1985 298 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(450,0,0) => thread index=1986 299 | .... 300 | 301 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(220,0,0) => thread index=20188 302 | [### this thread would not be used for N=20000 ###] 303 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(221,0,0) => thread index=20189 304 | [### this thread would not be used for N=20000 ###] 305 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(222,0,0) => thread index=20190 306 | [### this thread would not be used for N=20000 ###] 307 | Hello world! 
My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(223,0,0) => thread index=20191 308 | [### this thread would not be used for N=20000 ###] 309 | kernel launch success! 310 | That's all! 311 | ``` 312 | 313 | Note that because of some buffering issues, with this many threads, we can't see the output for all of them, hence the *if* statement in the kernel code. It is possible to retrieve info about the limit and change the limit using *cudaDeviceGetLimit()* and *cudaDeviceSetLimit()*. 314 | 315 | ### 3.5) CUDA *compute capability* 316 | 317 | The *compute capability* basically refers to the evolving functionality of the NVIDIA architecture. Higher numbers provide more functionality but will only run on newer GPU hardware. 318 | 319 | For example, to use doubles rather than floats you need compute capability of at least 1.3. This required compute capability needs to be specified when you are compiling CUDA code. 320 | 321 | 322 | # 4) Executing kernels 323 | 324 | A note on the speed comparisons in the remaining section. These compare a fully serial CPU calculation on a single core to calculation on the GPU. On a multicore machine, we could speed up the CPU calculation by writing code to parallelize the calculation (e.g., via threading in C/openMP or various parallelization tools in R or Python). 325 | 326 | Also, note that in the various examples when I want to assess computational time, I make sure to synchronize all the threads via an appropriate function call. This ensures that all of the threads have finished their kernel calculations before I mark the end of the time interval. In general a function call to do a calculation on the GPU will simply start the calculation and then return, with the calculation continuing on the GPU. 327 | 328 | In this section, I'll demonstrate calling a kernel that simply computes the normal density function (PDF) on a vector of values in parallel, one value per thread. 329 | 330 | ### 4.1) Running a kernel from C/CUDA 331 | 332 | Now let's see our example implemented using CUDA code, including memory allocation on the GPU and transfer between the GPU and CPU. 333 | 334 | My kernel code allocates memory on the CPU and the device (GPU) memory and the kernel function uses the device memory for the alphas, random numbers, and the output values (the probability estimates). 335 | 336 | Note that here, I'll use 1024 threads per block and then a grid sufficiently large so that we have at least as many threads as computational chunks. 337 | 338 | Here's the [code (kernelExample.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample.cu). 339 | 340 | Compilation is as follows. 341 | 342 | ```{r, kernelExample-compile, engine='bash', eval=FALSE} 343 | nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 344 | ``` 345 | 346 | Here are some results: 347 | ```{r, kernelExample-output, eval=FALSE, engine='bash'} 348 | ==================================================== 349 | Grid dimension is 46 x 46 350 | Launching 2166784 threads (N=2097152) 351 | Input values: -0.658344 0.499804 -0.807257... 352 | Memory Copy from Host to Device successful. 353 | Memory Copy from Device to Host successful. 354 | Output values: 0.321214 0.352100 0.288007... 355 | Output values (CPU): 0.321214 0.352100 0.288007... 
356 | Timing results for n = 2097152 357 | Transfer to GPU time: 0.009988 358 | Calculation time (GPU): 0.000366 359 | Calculation time (CPU): 0.058541 360 | Transfer from GPU time: 0.001716 361 | Freeing memory... 362 | ==================================================== 363 | ... 364 | ... 365 | ==================================================== 366 | Grid dimension is 363 x 363 367 | Launching 134931456 threads (N=134217728) 368 | Input values: -0.658344 0.499804 -0.807257... 369 | Memory Copy from Host to Device successful. 370 | Memory Copy from Device to Host successful. 371 | Output values: 0.321214 0.352100 0.288007... 372 | Output values (CPU): 0.321214 0.352100 0.288007... 373 | Timing results for n = 134217728 374 | Transfer to GPU time: 0.638223 375 | Calculation time (GPU): 0.021684 376 | Calculation time (CPU): 3.470199 377 | Transfer from GPU time: 0.055798 378 | Freeing memory... 379 | ==================================================== 380 | ``` 381 | 382 | The speedup in pure computation time is very impressive (175x); surprisingly, when I did this same benchmark two years ago with the EC2 g2.2xlarge instance the speedup was 'only' 40x. However, importantly, we do see that the time for transferring to and from (particularly to) the GPU exceeds the calculation time, reinforcing the idea of keeping data on the GPU when possible. 383 | 384 | #### Using Pinned Memory 385 | 386 | Here's some code where we use pinned memory that is 'mapped' to the GPU such that the GPU directly accesses CPU memory. This can be advantageous if one exceeds the GPU's memory and, according to some sources, is best when you load the data only once. Another approach, using pinned but not mapped memory, allows for more efficient transfer but without the direct access from the GPU, with a hidden transfer done behind the scenes. This may be better if the data is loaded multiple times on the GPU. 387 | 388 | Here's the [code (kernelExample-pinned.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample-pinned.cu). 389 | 390 | 391 | Here are some results: 392 | ```{r, kernelExample-pinned-output, eval=FALSE, engine='bash'} 393 | 394 | ==================================================== 395 | Grid dimension is 46 x 46 396 | Launching 2166784 threads (N=2097152) 397 | Input values: -0.658344 0.499804 -0.807257... 398 | Output values: 0.321214 0.352100 0.288007... 399 | Output values (CPU): 0.321214 0.352100 0.288007... 400 | Timing results for n = 2097152 401 | Calculation time (GPU): 0.003245 402 | Calculation time (CPU): 0.058515 403 | Freeing memory... 404 | ==================================================== 405 | ... 406 | ... 407 | ==================================================== 408 | Grid dimension is 363 x 363 409 | Launching 134931456 threads (N=134217728) 410 | Input values: -0.658344 0.499804 -0.807257... 411 | Output values: 0.321214 0.352100 0.288007... 412 | Output values (CPU): 0.321214 0.352100 0.288007... 413 | Timing results for n = 134217728 414 | Calculation time (GPU): 0.187535 415 | Calculation time (CPU): 3.757175 416 | Freeing memory... 417 | ==================================================== 418 | ``` 419 | 420 | So using pinned mapped memory seems to help quite a bit in this case, as the total time with pinned memory is less than the time used for transfer plus calculation in the previous examples.
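To make the pattern concrete, here is a minimal, self-contained sketch of mapped pinned memory with a trivial kernel. This is only an illustration of the API calls involved, not the code in *kernelExample-pinned.cu*, which also does timing and error checking.

```
#include <stdio.h>

__global__ void square(double* x, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] = x[i] * x[i];
}

int main() {
    int n = 1 << 20;
    double *hostX, *devX;
    cudaSetDeviceFlags(cudaDeviceMapHost);              // enable mapped pinned memory
    cudaHostAlloc((void**) &hostX, n * sizeof(double),
                  cudaHostAllocMapped);                 // pinned CPU memory, visible to the GPU
    for (int i = 0; i < n; i++) hostX[i] = i;
    cudaHostGetDevicePointer((void**) &devX, hostX, 0); // device-side alias; no explicit cudaMemcpy
    square<<<(n + 255) / 256, 256>>>(devX, n);          // kernel reads/writes CPU memory directly
    cudaDeviceSynchronize();
    printf("%f\n", hostX[2]);                           // prints 4.000000
    cudaFreeHost(hostX);
    return 0;
}
```

The tradeoff is that every access from the kernel goes across the PCIe bus, so this pattern pays off mainly when each element is touched only once or twice, as discussed above.
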
### 4.2) Calling CUDA Kernels from R (RCUDA)

When we want to use CUDA from R, the kernel function will remain the same, but the pre- and post-processing is done in R rather than in C. Here's an example, with the same normal density kernel. The CUDA kernel code is saved in a [separate file (calc_loglik.cu on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/calc_loglik.cu) but is identical to that in the full CUDA+C example above (with the exception that we need to wrap the kernel function in `extern "C"`).

Here's the [code (kernelExample.R on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample.R).

In this example we see that we can either transfer data between CPU and GPU manually or have RCUDA do it for us. If we didn't want to overwrite the input, but rather to allocate separate space for the output on the GPU, we could use *cudaMalloc()* (see the example in Section 5.2).

We need to compile the kernel into a ptx object file, either outside of R:

```{r, RCUDAexample-compile, engine='bash', eval=FALSE}
nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu
```

or inside of R:
```{r, RCUDAexample-compile-inR, engine='R', eval=FALSE}
ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', target = 'ptx', '-arch=compute_20', '-code=sm_20,compute_20')
```

Here are some results:
```{r RCUDAexample_output, eval=FALSE, engine='bash'}
Grid size:
[1] 363 363 1
Total number of threads to launch = 134931456
Running CUDA kernel...
Input values: 0.8966972 0.2655087 0.3721239
Output values: 0.2457292 0.2658912 0.2656543
Output values (implicit transfer): 0.2457292 0.2658912 0.2656543
Output values (CPU with R): 0.2457292 0.2658912 0.2656543
Transfer to GPU time: 0.702
Calculation time (GPU): 0.044
Transfer from GPU time: 0.489
Calculation time (CPU): 8.432
Combined calculation/transfer via .cuda time (GPU): 1.203
```

So the transfer time is again substantial in relative terms; without it, the speedup over the CPU calculation would be much larger.

We can avoid explicitly specifying block and grid dimensions by using the *gridBy* argument to *.cuda()*, with syntax as shown in *kernelExample.R*. For some reason that code is not working, though I have gotten it to work in other contexts.


WARNING #1: Be very careful that the types of the R objects passed to the kernel match what the kernel is expecting. Otherwise the code can hang without an informative error message.

WARNING #2: Note the use of the `strict=TRUE` argument when passing values to the GPU. This ensures that numeric values are kept as doubles and not coerced to floats.
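As mentioned above, the only change to the kernel code itself for use with RCUDA is the `extern "C"` wrapper, which keeps the compiler from mangling the kernel's name so that it can be looked up in the ptx module. Schematically (hypothetical kernel signature):

```{r, externC-sketch, engine='c', eval=FALSE}
// sketch: wrapping a kernel in extern "C" so its symbol keeps a plain C name
extern "C"
__global__ void calc_loglik(double *vals, int n, double mu, double sigma) {
    // ... same body as in the CUDA+C version of the kernel ...
}
```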
### 4.3) Calling CUDA Kernels from Python

With PyCUDA the kernel code can be directly embedded in the Python script. Otherwise it's fairly similar to the use of RCUDA. Here's the [code (kernelExample.py on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/kernelExample.py).

Here are some results:
```{r PyCUDAexample_output, eval=FALSE, engine='bash'}
Generating random normals...
Running GPU code...
Time for calculation (GPU): 1.008687s
Running Scipy CPU code...
Time for calculation (CPU): 12.572273s
Output from GPU: 0.177782 0.224597 0.109604
Output from CPU: 0.177782 0.224597 0.109604
```

WARNING: As was the case with R, be careful that the types of the Python objects passed to the kernel match what the kernel is expecting.


# 5) Random Number Generation (RNG) on the GPU

RNG is done via the CURAND (CUDA Random Number Generation) library. CURAND provides several different generators, including the Mersenne Twister (the default in R).

### 5.1) Seeds and Sequences

From the CUDA documentation:

*For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches to avoid state setup time.*

That's a lot of important information; we'll interpret and implement much of it in the demo below.

Recall that RNG on a computer involves generation of pseudo-random numbers from a deterministic, periodic sequence. The seed determines where one starts generating from within that sequence. The idea of the sequence numbers is to generate from non-overlapping blocks within the sequence, with each thread getting a different block.



### 5.2) Calling CURAND via RCUDA

For RNG, we need a kernel to initialize the RNG on each thread and one to do the sampling (though they could be combined in a single kernel). Note that the time involved in initializing the RNG for each thread is substantial, but this shouldn't be a problem if one is doing a lot of calculations over time. To amortize this one-time expense, I generate multiple random numbers per thread. Here's the [kernel code (random.cu on the github repo)](https://github.com/berkeley-scf/gpu-workshop-2014/blob/master/random.cu). The second argument to *curand_init* is the sequence number; by giving the threads contiguous sequence numbers, the starting position in the underlying sequence for a given thread is spaced $2^{67}$ values apart from the starting position for the next thread.

And here's the [R code (RNGexample.R on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/RNGexample.R) to call the kernel, which looks very similar to the RCUDA code we've already seen.
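To make the seed/sequence setup concrete, here's a simplified sketch of what such a pair of kernels looks like (hypothetical code, not the actual *random.cu*): one kernel initializes a `curandState` per thread using a common seed and per-thread sequence numbers, and a second kernel generates several normal draws per thread from its saved state.

```{r, curand-kernel-sketch, engine='c', eval=FALSE}
#include <curand_kernel.h>

// sketch: one RNG state per thread; common seed, per-thread sequence number
extern "C"
__global__ void setup_kernel(curandState *states, int seed, int nthreads) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if (idx < nthreads)
        curand_init(seed, idx, 0, &states[idx]);   // (seed, sequence number, offset, state)
}

// sketch: each thread generates n_per_thread normal draws from its own state
extern "C"
__global__ void rnorm_kernel(curandState *states, double *out, int n,
                             double mu, double sigma, int n_per_thread) {
    int idx = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    curandState localState = states[idx];          // work with a local copy of the state
    for (int i = 0; i < n_per_thread; i++) {
        int pos = idx * n_per_thread + i;
        if (pos < n)
            out[pos] = mu + sigma * curand_normal_double(&localState);
    }
    states[idx] = localState;                      // save state for any subsequent kernel launches
}
```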
Here are some results:


```{r RNGexample-output, eval=FALSE, engine='bash'}
RNG initiation time: 0.062
GPU memory allocation time: 0.001
Calculation time (GPU): 0.228
Transfer from GPU time: 0.423
Calculation time (CPU): 7.292
```


We get a decent speedup, which would be more impressive if we could set up the calculations such that we don't need to transfer the whole large vector back to the CPU. Also, the code in *random.cu* uses non-unit strides and could probably be reworked for more efficient global memory access (see Section 7).

Also note the memory cost of the RNG states for the threads, 48 bytes per thread, which could easily exceed GPU memory if one starts up many threads.


At the moment, I'm not sure how to choose the RNG generator from within R.

### 5.3) Calling CURAND from C and from Python

I may flesh this out at some point, but by looking at the RNG example via RCUDA and the examples of calling kernels from C and Python in the previous section, it should be straightforward to do RNG on the GPU controlled by C or Python.

To choose the generator in C (in this case choosing the Mersenne Twister) this should work: `curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32)`, where `gen` is a `curandGenerator_t`.

# 6) Using higher-level functionality to do linear algebra and vectorized operations on the GPU

The idea here is to use software that hides the details of the kernel implementation from us, relying on the expertise of others to efficiently code standard computations on the GPU.

We'll start with very high-level use of the GPU by simply calling linear algebra routines that use the GPU.


### 6.1) Using C to Call CUDABLAS and MAGMA

We can do linear algebra (and basic vectorized operations with vectors and matrices) using GPU implementations of BLAS/LAPACK type routines. Both CUDA (through CUDABLAS) and MAGMA provide access to BLAS functionality, but only MAGMA provides LAPACK-like functionality (i.e., matrix factorizations/decompositions).

We'll make CUDABLAS and MAGMA calls directly in C code. The MAGMA library provides a drop-in replacement for the functionality of BLAS and LAPACK that carries out linear algebra on both the CPU and GPU, choosing smartly where to do various aspects of the calculation. We'll now need to directly manage memory allocation on the GPU and transferring data back and forth from CPU to GPU.

#### CUDA and CUDABLAS

The code doesn't look too different than C code or calls to BLAS/LAPACK, but we use some CUDA functions and CUDA types. Here's the [example code (cudaBlasExample.c on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/cudaBlasExample.c).
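To give a flavor of the calls involved, here's a minimal sketch of a matrix multiplication via the cuBLAS v2 API (hypothetical helper function and variable names, not the actual *cudaBlasExample.c*):

```{r, cublas-sketch, engine='c', eval=FALSE}
#include <cuda_runtime.h>
#include <cublas_v2.h>

// sketch: C = A * B for n x n column-major matrices, computed on the GPU
void gpu_matmul(const double *A, const double *B, double *C, int n) {
    cublasHandle_t handle;
    cublasCreate(&handle);

    double *dA, *dB, *dC;
    cudaMalloc((void**) &dA, n * n * sizeof(double));
    cudaMalloc((void**) &dB, n * n * sizeof(double));
    cudaMalloc((void**) &dC, n * n * sizeof(double));

    cublasSetMatrix(n, n, sizeof(double), A, n, dA, n);    // host -> device
    cublasSetMatrix(n, n, sizeof(double), B, n, dB, n);

    double one = 1.0, zero = 0.0;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                &one, dA, n, dB, n, &zero, dC, n);
    cudaDeviceSynchronize();                               // make sure the multiply has finished

    cublasGetMatrix(n, n, sizeof(double), dC, n, C, n);    // device -> host
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    cublasDestroy(handle);
}
```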
Compilation goes as follows. Note that in this case nvcc does not want the file to have a .C or .cu extension.

```{r, cuda-compile, eval=FALSE, engine='bash'}
nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample
```

And here are (some of) the results:
```{r cudaBlas-example-output, eval=FALSE, engine='bash'}
Starting
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.001642
Matrix multiply time: 0.000481
Transfer from GPU time: 0.001550
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000276
Transfer to GPU time: 0.020364
Matrix multiply time: 0.015466
Transfer from GPU time: 0.015035
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000800
Transfer to GPU time: 0.325620
Matrix multiply time: 0.940571
Transfer from GPU time: 0.229997
```

For (rough) comparison, the $n=8192$ multiplication on the CPU (using *openBLAS* as the BLAS, called from R) takes 106 seconds with one core and 18 seconds with 8 cores.

#### MAGMA

Now let's see the use of [MAGMA](http://icl.cs.utk.edu/magma/). MAGMA provides calls analogous to those in CUDA/CUDABLAS for allocating memory, transferring data, and BLAS calls, as well as LAPACK-type calls.

Note that the LAPACK-type calls have a CPU interface and a GPU interface. The GPU interface calls have function names ending in '_gpu' and operate on data objects in GPU memory. The CPU interface calls operate on data objects in CPU memory, handling the transfer to GPU memory as part of the calculation.

Here we'll compare timing for the GPU vs. standard BLAS/LAPACK, as well as the CPU and GPU interfaces for the Cholesky factorization.

Here's the [example code (magmaExample.c on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/magmaExample.c).


Compilation and execution (with and without pinned memory) go as follows. Note we can use gcc and that we need to link in the CPU BLAS and LAPACK since MAGMA uses both CPU and GPU for calculations (plus in this example I directly call BLAS and LAPACK functions).
```{r magma-compile, eval=FALSE, engine='bash'}
gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include \
  -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma \
  -llapack -lblas -lcublas -lcudart -o magmaExample
./magmaExample 1
./magmaExample 0
```

And here are (some of) the results:
```{r magma-example-output, eval=FALSE, engine='bash'}
Starting
Setting use_pinned to 1
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.085331
Matrix multiply time (GPU): 0.000692
Matrix multiply time (BLAS): 0.049665
Cholesky factorization time (GPU w/ GPU interface): 0.023938
Cholesky factorization time (GPU w/ CPU interface): 0.004702
Cholesky factorization time (LAPACK): 0.006958
Transfer from GPU time: 0.000344
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000366
Transfer to GPU time: 0.005706
Matrix multiply time (GPU): 0.027141
Matrix multiply time (BLAS): 0.446544
Cholesky factorization time (GPU w/ GPU interface): 0.047918
Cholesky factorization time (GPU w/ CPU interface): 0.025746
Cholesky factorization time (LAPACK): 0.077203
Transfer from GPU time: 0.005030
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000789
Transfer to GPU time: 0.087303
Matrix multiply time (GPU): 1.766567
Matrix multiply time (BLAS): 23.807952
Cholesky factorization time (GPU w/ GPU interface): 0.230186
Cholesky factorization time (GPU w/ CPU interface): 0.259374
Cholesky factorization time (LAPACK): 4.179541
Transfer from GPU time: 0.079991

Setting use_pinned to 0
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000257
Transfer to GPU time: 0.086421
Matrix multiply time (GPU): 0.000655
Matrix multiply time (BLAS): 0.037689
Cholesky factorization time (GPU w/ GPU interface): 0.016963
Cholesky factorization time (GPU w/ CPU interface): 0.011957
Cholesky factorization time (LAPACK): 0.005600
Transfer from GPU time: 0.001391
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000369
Transfer to GPU time: 0.009003
Matrix multiply time (GPU): 0.027190
Matrix multiply time (BLAS): 0.514402
Cholesky factorization time (GPU w/ GPU interface): 0.039755
Cholesky factorization time (GPU w/ CPU interface): 0.037521
Cholesky factorization time (LAPACK): 0.081121
Transfer from GPU time: 0.013978
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.001062
Transfer to GPU time: 0.136131
Matrix multiply time (GPU): 1.775493
Matrix multiply time (BLAS): 24.222220
Cholesky factorization time (GPU w/ GPU interface): 0.224644
Cholesky factorization time (GPU w/ CPU interface): 0.400515
Cholesky factorization time (LAPACK): 4.183725
Transfer from GPU time: 0.204625
```

So we see decent speed-ups both for the matrix multiplication and the Cholesky factorization; the comparisons are with respect to 8 CPU cores.

Using the CPU interface seems to provide a modest speedup (compared to the manual transfer + calculation time), as does using pinned memory.

### 6.2) Using PyCUDA to do GPU calculations directly in Python

PyCUDA also provides high-level functionality for vectorized calculations on the GPU. Basically you create a vector stored in GPU memory and then operate on it with a variety of mathematical functions. The modules that do this are *gpuarray* and *cumath*.

Here's the [code (gpuArrayExample.py on the github repo)](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/gpuArrayExample.py)

Here are the timing results.
```{r gpuArrayExample-output, eval=FALSE, engine='bash'}
Transfer to GPU time: 0.639403s
Timing vectorized exponentiation:
GPU array calc time (initial): 0.276190s
GPU array calc time: 0.014222s
CPU calc time: 2.704504s
Timing vectorized dot product/sum of squares:
GPU array calc time (initial): 0.229969s
GPU array calc time: 0.007769s
CPU calc time: 0.071532s
```

So we see a good speedup for the vectorized exponentiation. However, there is some compilation that gets done when the code is run the first time that slows down the initial calculation. Also, again, the transfer of data to the GPU takes a chunk of time.

For the dot product, the speedup is not as impressive, probably because the aggregation that is needed to do the sum involves coordination across threads.

### 6.3) Using R packages to do vectorized operations and linear algebra on the GPU

Various R packages hide the details of the GPU implementation and allow you to do vector and matrix operations, including linear algebra, using standard R code. In some cases they overload the usual R functions such that you can simply call a function of the same name as in base R.

Some packages you might investigate include:

* HiPLARM (apparently this uses MAGMA behind the scenes)
* gpuR (uses openCL rather than CUDA)
* gmatrix
* gputools

# 7) An extended example of optimizing GPU kernel code

Here we'll implement a basic, but real computation that is a component of a larger collaboration I am engaged in. The basic context is understanding spatial variation in the species composition of forests in the eastern United States.
The data are multinomial samples of counts of trees of different species at many different spatial locations (i.e., observations). We fit a spatial version of a multicategory probit regression model.

In our coding, I'll compare a basic R implementation as well as a C++ implementation with various GPU implementations designed to improve the speed of the GPU calculation. I'll use R to manage the C++ and CUDA code (via *Rcpp* and *RCUDA*) but there's no reason one couldn't do this via Python or C/C++ on the front-end. Our main focus will be on the different CUDA implementations.

All of the implementations are in the [example directory](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example) in the repository.

### 7.1) Example: Probit regression probabilities

#### Probit regression basics

Consider probit regression, which is similar to logistic regression. The probability of a binary outcome is given as
$p = P(Y = 1) = \Phi(X\beta)$, where $\Phi()$ is the standard normal CDF.

The probit model can be rewritten in a latent variable representation that, in a Bayesian context, can facilitate MCMC computations to fit the model:
$$
Y = I(W > 0)
$$
$$
W \sim N(X\beta , 1)
$$

Suppose we know $\beta$. In order to determine $p$ we could use Monte Carlo simulation to estimate the integral
$P(Y = 1) = P(W > 0) = \int_0^\infty f(w) dw$, where $f$ is the density of $W$.

Now for probit regression, we could just use standard methods to compute this normal integral (i.e., the normal CDF). But for the multinomial extension we discuss next, we need Monte Carlo simulation.

#### Multinomial probit regression

Let $Y$ be a categorical variable, $Y \in \{1,2,\ldots,K\}$. Then a multinomial extension of the latent variable probit model is
$$
Y = {arg\ max}_k {W_k}
$$
$$
W_k \sim N(X\beta_k, 1)
$$

Now to compute $p = (P(Y=1), P(Y=2), \ldots, P(Y=K))$ we can again do Monte Carlo simulation. The basic steps are (see the code sketch at the end of this section):

- iterate m = 1, ..., M
    - for k = 1, ..., K, sample $W_k$ from its corresponding normal distribution
    - determine the arg max of the $W_k$'s
- over the $M$ simulations, count the number of times each category had the largest corresponding $W_k$

The proportion of times the category corresponded to the largest $W_k$ is an estimate of the multinomial proportions of interest.

For our example, we want to do this computation for large $M$ (to reduce Monte Carlo error) and for many observations with different $X$ values. In our code, we will assume that we are given a vector ($\alpha_i = \{X_i\beta_k\}_{k=1,\ldots,K}$) for each observation, $i$, resulting in an $n$ by $K$ matrix.

Finally, note that I can reuse the random numbers I need across the $n$ observations (in fact, this probably reduces Monte Carlo error in certain ways), so I just need an $M$ by $K$ matrix of standard normal random variables. Even for large $M$ this is not so big, and I'll simply generate the values once on the CPU.
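As a preview of the kernels discussed in the next sections, here's a minimal sketch (hypothetical and simplified, not the actual *compute_probs.cu*) of the per-observation Monte Carlo calculation that each GPU thread carries out, given the precomputed $\alpha$ values and the $M \times K$ matrix of standard normals:

```{r, mc-probit-sketch, engine='c', eval=FALSE}
// sketch: Monte Carlo estimate of the multinomial probit probabilities, one
// observation per thread (assumes K <= 21 and that the K values for each
// observation / each Monte Carlo sample are stored contiguously)
__global__ void compute_probs_sketch(double *alphas, double *rands, double *probs,
                                     int n, int K, int M) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // one observation per thread
    if (i >= n) return;

    double w[21];                                    // W values for the current sample
    for (int k = 0; k < K; k++)
        probs[i * K + k] = 0.0;

    for (int m = 0; m < M; m++) {
        int maxind = 0;
        for (int k = 0; k < K; k++) {
            w[k] = alphas[i * K + k] + rands[m * K + k];   // W_k ~ N(alpha_ik, 1)
            if (w[k] > w[maxind]) maxind = k;
        }
        probs[i * K + maxind] += 1.0;                // count the arg max category
    }
    for (int k = 0; k < K; k++)
        probs[i * K + k] /= M;                       // proportions estimate the probabilities
}
```

Note that with this layout consecutive threads access memory locations that are $K$ apart; that is exactly the memory-access issue taken up in Section 7.4.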
### 7.2) R and C baseline implementations

In [example_pureR.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_pureR.R) and [example_Rcpp.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_Rcpp.R) I've implemented the calculation for $n=26280$, $K=21$, and $M=10000$. I tried to write efficient vectorized R code and efficient C++ code (called from R, for convenience). I've also implemented parallel versions for both R and C++.

The pure R version takes about 570 seconds in serial and 140 seconds with eight cores.
The C++ version takes about 47 seconds in serial and 6 seconds with eight cores.

### 7.3) A basic (but thoughtful) implementation

[example_RCUDA.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_RCUDA.R) is the main R script that calls the different kernel variations as I experimented with different strategies for efficiency.

In [compute_probs.cu](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/compute_probs.cu) I make use of the already-computed random numbers and allocate a temporary vector *w* to hold the value of $w$ for the current Monte Carlo sample.

Some features of my code:

- It's generally recommended to have 128-256 threads per block, with the number a multiple of 32 (because threads operate in lock-step in 'warps' of 32 threads). So I'm using 192 threads per block.
- I then determine the number of blocks (of 192 threads each) that I need so I can have one thread for each of my $n$ observations.
- For this algorithm, as mentioned, I can reuse the random numbers across observations, so I don't generate them individually on the GPU.
- I haven't thought about locality of memory access (i.e., strides, row-major vs. column-major) in this version of the code.

Let's execute this:

```{r example-basic, eval=FALSE, engine='bash'}
cd example
Rscript example_RCUDA.R
```

This takes 12.1 seconds.

### 7.4) Accessing memory efficiently

Access to the device memory is slow (memory latency), but GPUs are good at switching between different threads while data is being retrieved from memory. Also, the GPU can access memory from consecutive memory locations efficiently and *coalesce* (combine) the memory accesses of groups of threads in a warp. Finally, threads in a warp execute in lock-step. The implication of all this is that we want the threads in a warp to retrieve contiguous values from the device memory. This means using a 'stride' of one when incrementing through a vector (analogous to moving along rows in a row-major matrix).

In the original code, I was striding through *alphas* and *probs* in strides of $K$. Thinking of the various matrices as having $K$ rows and being column-major, I was accessing values from adjacent columns on contiguous threads when I should have accessed values from adjacent rows.

Let's transpose the matrices sent to the GPU memory and access adjacent rows, i.e., strides of one, across contiguous threads, as shown in [compute_probs_unitStrides.cu](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/compute_probs_unitStrides.cu) (a schematic of the indexing change follows below).

```{r example-unitStrides, eval=FALSE, engine='bash'}
echo "unitStrides <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```

This takes 8.5 seconds, which is a nice speed-up for a simple change, but not earth-shattering.
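Concretely, the change is just in how thread $i$ indexes the matrices. A schematic, using the hypothetical names from the sketch in Section 7.1:

```{r, stride-sketch, engine='c', eval=FALSE}
// original layout: the K values for observation i are adjacent, so consecutive
// threads (observations i and i+1) touch locations K apart -- a stride of K
__device__ double alpha_strideK(const double *alphas, int i, int k, int K) {
    return alphas[i * K + k];
}

// transposed layout: the values for category k are adjacent across observations,
// so consecutive threads touch adjacent locations -- unit stride, which coalesces
__device__ double alpha_unitStride(const double *alphas, int i, int k, int n) {
    return alphas[k * n + i];
}
```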
### 7.5) Using shared memory (within a block)

Next let's consider whether it makes sense to move any data into shared memory, which can be accessed something like 100x as fast as device memory and functions like a programmer-managed cache. Shared memory is shared across all threads in a block. A couple of implications of this are:

* We need to be careful to do the indexing within blocks.
* We need to transfer any results out of shared memory in order to get them back to the CPU.
* We don't need the calculations synchronized across threads because each thread owns the calculations for a single observation; however, in other situations we might need to put a *barrier* in place that ensures all threads are finished with a particular calculation before any proceed to the next steps, using the *__syncthreads()* function.
* We only have 48 KB of shared memory per block (see the results of *deviceQuery*), so we need to make sure the number of threads per block is not so large as to exceed that. In this case, with 192 threads per block and $K=21$ values for each thread, we're over the maximum, so we need to go to 96 threads per block.

Here we notice that *w* and *probs* are accessed in device memory multiple times, and furthermore, *probs* is not even needed as an input, so let's try to manage these values in shared memory, as shown in [compute_probs_unitStrides_sharedMem.cu](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/compute_probs_unitStrides_sharedMem.cu).

```{r example-sharedMem, eval=FALSE, engine='bash'}
echo "unitStrides <- TRUE" > /tmp/tmp.R
echo "sharedMem <- TRUE" >> /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```

This takes 1.5 seconds, so we see a big improvement from using shared memory.

Surprisingly, using shared memory for access to *alphas* actually slowed things down two- to three-fold. I'm not sure why.

Finally, in some cases you can use shared memory to avoid non-unit strides. Here's an example of a [matrix transpose](http://devblogs.nvidia.com/parallelforall/efficient-matrix-transpose-cuda-cc/). Basically any non-unit striding is done only in shared memory. Reading from and writing to device memory is done using unit strides.


### 7.6) Using single precision (floats)

Traditionally GPU calculations have been done in single precision, and this can apparently be much faster than double precision calculations.

Here I get a roughly two- to three-fold speedup using floats rather than doubles, both for the original version of the code with non-unit strides and without shared memory (first example below) and for the optimized version of the code (second example below). As shown in the various "_float" kernel files, all I need to do is change "double" to "float". And when calling from R, there are some housekeeping items shown in [example_RCUDA.R](https://rawgit.com/berkeley-scf/gpu-workshop-2016/master/example/example_RCUDA.R).

```{r example-basic-float, eval=FALSE, engine='bash'}
echo "float <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```

```{r example-sharedMem-float, eval=FALSE, engine='bash'}
echo "float <- TRUE" > /tmp/tmp.R
echo "unitStrides <- TRUE" >> /tmp/tmp.R
echo "sharedMem <- TRUE" >> /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
```
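To see how mechanical the change is, here's the single-precision version of the (hypothetical) normal density kernel sketched back in Section 4.1; doubles become floats and math calls switch to their float variants:

```{r, float-kernel-sketch, engine='c', eval=FALSE}
// sketch: single-precision version of the earlier dnorm_kernel sketch
__global__ void dnorm_kernel_float(float *vals, int n, float mu, float sigma) {
    int myblock = blockIdx.x + blockIdx.y * gridDim.x;
    int idx = myblock * blockDim.x + threadIdx.x;
    if (idx < n) {
        float std = (vals[idx] - mu) / sigma;
        vals[idx] = expf(-0.5f * std * std) / (sigma * sqrtf(2.0f * 3.141593f));
    }
}
```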
### 7.7) Summary

For this example, here are the speeds, and the speed relative to the eight-core C++ implementation:

| Implementation | Time (sec.) | Speed (relative to C++) |
| ------------- | -------------:| -----:|
| R (8 cores) | 140 | 0.04 |
| C++ (8 cores) | 6.0 | 1.0 |
| basic CUDA | 12.1 | 0.5 |
| unit strides | 8.5 | 0.7 |
| shared memory | 1.5 | 4.0 |
| shared memory + floats | 0.6 | 10.7 |

Interestingly, on Savio the C++ time was 9.8 seconds, while the shared memory time was 0.67 seconds and the shared memory + floats time was 0.31 seconds.

# 8) Final Comments

### 8.1) Some Thoughts on Improving Computational Speed

[Suchard et al (2010; Journal of Computational and Graphical Statistics 19:419)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10016#.U2GTuBUgoWk) and [Lee et al (2010; Journal of Computational and Graphical Statistics 19:769)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10039#.U2GT9BUgoWk) talk about the use of GPUs for statistics. The speedups they see can get as high as 120 times and 500 times the speed of a single CPU core, respectively. Some of the reasons these speedups are so impressive (more so than some of the examples here) include:

* Use of single precision floating point calculations. If single precision doesn't affect your calculation substantively, this is worth trying. Particularly on older GPUs (but perhaps still true), single precision was much faster than double precision.

* Computational tasks that are very arithmetically intensive but with limited memory access (see the Lee et al. paper).

* Ensuring that contiguously-numbered threads access contiguous memory locations.

* Careful use of shared memory (shared amongst the threads in a block) in place of the main GPU memory (see the Suchard et al. paper); in particular this can avoid accessing non-contiguous memory.

* Avoiding conditional statements and synchronization/barriers, since threads operate in lock-step in groups of 32 threads (a 'warp').

So for some tasks, and likely involving additional coding effort, you may see speedups of 100-200 fold compared to a single CPU core.

Finally, rather than bringing a large chunk of data back to the CPU, you might do a reduction/aggregation operation (e.g., summing over values) in GPU memory. Here's a [presentation](http://will-landau.com/gpu/lectures/cudac-atomics/cudac-atomics.pdf) with some useful information on how to do this.

### 8.2) A Comment on Compilation

If you compile CUDA code into an object file, you can link that with other object files (e.g., from C or C++ code) into an executable that can operate on the CPU and GPU. This also means you could compile a shared object (i.e., a library) that you could call from R with .C, .Call, or Rcpp.
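For example, here's a sketch (hypothetical names; the kernel is assumed to be defined in the same file, e.g. the one sketched in Section 4.1) of a C-callable wrapper suitable for R's `.C()` interface:

```{r, shared-object-sketch, engine='c', eval=FALSE}
// sketch: a C-callable wrapper around a kernel launch, for use from R via .C()
// (compile with something like: nvcc -Xcompiler -fPIC --shared wrapper.cu -o wrapper.so)
extern "C" void gpu_dnorm(double *vals, int *n, double *mu, double *sigma) {
    double *dVals;
    int N = *n;
    cudaMalloc((void**) &dVals, N * sizeof(double));
    cudaMemcpy(dVals, vals, N * sizeof(double), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int nBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    dnorm_kernel<<<nBlocks, threadsPerBlock>>>(dVals, N, *mu, *sigma);
    cudaDeviceSynchronize();

    cudaMemcpy(vals, dVals, N * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(dVals);
}
```

From R one would then load the shared object with `dyn.load()` and call the wrapper via `.C('gpu_dnorm', x, as.integer(length(x)), mu, sigma)`.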
### 8.3) Some references

- The book *Parallel Computing for Data Science* by Norman Matloff has some useful introductory material.
- The [NVIDIA developer blog](http://devblogs.nvidia.com/parallelforall/) has a bunch of useful blog posts.
- [Suchard et al (2010; Journal of Computational and Graphical Statistics 19:419)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10016#.U2GTuBUgoWk)
- [Lee et al (2010; Journal of Computational and Graphical Statistics 19:769)](http://www.tandfonline.com/doi/abs/10.1198/jcgs.2010.10039#.U2GT9BUgoWk)

Introduction to Computing with GPUs for Data Science

208 | 209 |

Chris Paciorek, Statistical Computing Facility, Department of Statistics and Berkeley Research Computing, UC Berkeley

210 | 211 |

Presented: February 1 and 8, 2016

212 | 213 |

Last Revised: February 1, 2016

214 | 215 |

0) This Tutorial

216 | 217 |

Materials for this tutorial, including the R markdown file that was used to create this document are available on github at https://github.com/berkeley-scf/gpu-workshop-2016. You can download the files by doing a git clone:

218 | 219 |
git clone https://github.com/berkeley-scf/gpu-workshop-2016
 220 | 
221 | 222 |

To create this HTML document, simply compile the corresponding R Markdown file in R:

223 | 224 |
library(knitr)
 225 | knit2html('gpu.Rmd')
 226 | 
227 | 228 |

or from the UNIX command line:

229 | 230 |
Rscript -e "library(knitr); knit2html('gpu.Rmd')"
 231 | 
232 | 233 |

1) Introduction

234 | 235 |

1.1) Overview

236 | 237 |

GPUs (Graphics Processing Units) are processing units originally designed for rendering graphics on a computer quickly. This is done by having a large number of simple processing units for massively parallel calculation. The idea of general purpose GPU (GPGPU) computing is to exploit this capability for general computation.

238 | 239 |

We'll see both high-level and low-level ways to program calculations for implementation on the GPU. The basic context of GPU programming is “data parallelism”, in which the same calculation is done to lots of pieces of data. This could be a mathematical calculation on millions of entries in a vector or a simulation with many independent simulations. Some examples of data parallelism include matrix multiplication (doing the multiplication task on many separate matrix elements) or numerical integration (doing a numerical estimate of the piecewise integral on many intervals/regions), as well as standard statistical calculations such as simulation studies, bootstrapping, random forests, etc. This kind of computation also goes by the name “SIMD” (single instruction, multiple data).

240 | 241 |

1.2) Hardware

242 | 243 |

Two of the main suppliers of GPUs are NVIDIA and AMD. CUDA is a platform for programming on GPUs specifically for NVIDIA GPUs that allows you to send C/C++/Fortran code for execution on the GPU. OpenCL is an alternative that will work with a broader variety of GPUs. However, CUDA is quite popular, and there are a lot of tools designed for working with NVIDIA GPUs and based on CUDA, so we'll focus on CUDA here.

244 | 245 |

GPUs have many processing units but somewhat limited memory. Also, they can only use data in their own memory, not in the CPU's memory, so one must transfer data back and forth between the CPU (the host) and the GPU (the device). This copying can, in some computations, constitute a very large fraction of the overall computation. So it is best to create the data and/or leave the data (for subsequent calculations) on the GPU when possible and to limit transfers.

246 | 247 |

The current generation of NVIDIA GPUs is of the Kepler architecture (3rd generation). The 2nd generation was Fermi and the 1st was Tesla. (However note that Tesla is also used by NVIDIA to refer to different chip types).

248 | 249 |

Originally GPUs supported only single precision (i.e., float calculations) but fortunately they now support double precision operations, and most of the examples here will use doubles to reduce the possibility of potential numerical issues, in particular with linear algebra calculations. But in many contexts, single precision will be fine, and the GPU will do computations more quickly with single precision. We'll explore this a bit later in the tutorial.

250 | 251 |

1.3) Software Tools

252 | 253 |

Here are some of the useful software tools for doing computations on the GPU.

254 | 255 |
    256 |
  • CUDA - an extension of C/C++ for programming on an NVIDIA GPU
  • 257 |
  • CUBLAS - a BLAS implementation for matrix-vector calculations on an NVIDIA GPU
  • 258 |
  • CURANDOM - random number generation on an NVIDIA GPU
  • 259 |
  • PyCUDA - a Python package providing a front-end for CUDA
  • 260 |
  • RCUDA - an R package providing a front-end for CUDA
  • 261 |
  • MAGMA - a package for combined CPU-GPU linear algebra, intended to be analogous to LAPACK + BLAS
  • 262 |
263 | 264 |

Note that RCUDA is still in development and is on Github but not CRAN, but should be high-quality as it is developed by Duncan Temple Lang at UC-Davis.

265 | 266 |

We'll see all of these in action.

267 | 268 |

There are also:

269 | 270 |
    271 |
  • openCL - an alternative to CUDA that can also be used with non-NVIDIA GPUs
  • 272 |
  • CUDA Python (from Anaconda, but free for academic use)
  • 273 |
  • PyOpenCL
  • 274 |
  • R packages: OpenCL, gpuR, gmatrix, gputools
  • 275 |
  • BIDMach - software for fast machine learning with a GPU back end available
  • 276 |
277 | 278 |

Finally, many of the popular machine learning packages focused on neural networks and deep learning can use GPUs behind the scenes; these include Theano, Caffe, Torch, Tensorflow, and mocha.jl, among others.

279 | 280 |

Some of these, such as PyCUDA and RCUDA allow you to easily interface to core CUDA code that you write yourself. Others, such as the other R packages and CUDA Python, allow you to program within R and Python but still use the GPU for some of the computation. Finally tools such as the various machine learning hide the details of the GPU usage from you and allow you to simply program in the environment of the software, with computations done on the GPU behind the scenes if a GPU is available.

281 | 282 |

2) GPU hardware available at Berkeley

283 | 284 |

2.1) Department-specific GPUs

285 | 286 |

Statistics

287 | 288 |

The Statistical Computing Facility has a GPU on our high-priority cluster. We'll use this GPU in the demos here, though it is only available for Statistics affiliates. More details on using the GPU are available here.

289 | 290 |

Biostatistics

291 | 292 |

Biostatistics has a GPU on one of its servers. Talk to Burke for more information.

293 | 294 |

Economics

295 | 296 |

The EML (Economics) has a GPU on one of the EML Linux servers that EML users can access. If this is of interest to you, email consult@econ.berkeley.edu, and I will work to get it set up analogously to the Statistics GPU and the Amazon virtual machine (see below) and to help you get started.

297 | 298 |

2.2) GPUs on the campus Linux cluster, Savio

299 | 300 |

Savio recently purchased some nodes with GPUs. These are not yet available to the general public, but will soon be available to users affiliated with researchers who have purchased nodes on Savio and to users who are affiliated with faculty members using the faculty compute allowance.

301 | 302 |

The general syntax for submitting a GPU-based job to Savio's SLURM based scheduler is as follows.

303 | 304 |
sbatch -A account_name -p savio2_gpu -N 1 -t 60:0 job.sh
 305 | 
306 | 307 |

Alternatively, simply do sbatch job.sh and include the scheduling flags in your job.sh, as demonstrated in savio-job-template.sh.

308 | 309 |

To figure out what to fill in for account_name, you can look up your accounts with

310 | 311 |
sacctmgr -p show associations user=${USER}
 312 | 
313 | 314 |

For an interactive session:

315 | 316 |
srun -A account_name --pty -p savio2_gpu -N1 -t 30:0 /bin/bash
 317 | 
318 | 319 |

Before doing any compilation involving CUDA code you generally want to change your environment modules:

320 | 321 |
module unload intel
 322 | module load cuda
 323 | 
324 | 325 |

2.3) GPUs through Amazon's EC2 service

326 | 327 |

The g2.2xlarge Amazon EC2 instance types have a GPU with 1536 cores and 4 Gb memory, along with 8 CPU cores. There is also a g2.8xlarge that has four GPUs and 32 CPU cores. They can be pretty expensive unless you use spot instances - currently 65 cents per hour for g2.2xlarge and $2.60 per hour for g2.8xlarge in the us-west-2 region. The g2.2xlarge GPUs are pretty old chips, and I found that some of the examples included here ran a lot slower on the EC2 instance than on the Statistics GPU (and likely than Savio, but I haven't checked that as much).

328 | 329 |

I've created an Amazon machine image (an AMI) that is the binary representation of the Linux Ubuntu operating system with support for GPU calculations. The AMI is based off of the BCE virtual machine in use for a variety of projects and classes on campus. BCE provides a common set of software used in various data analysis/data science focused contexts, including Python and R. The BCE GPU AMI inherits this software and adds on various GPU-related software (in particular CUDA). Note also that the AMI is also similar to the SCF and EML Linux machines but with a reduced set of software.

330 | 331 |

Based on the BCE-GPU AMI one can start up a virtual Linux machine that one can login to (see below for instructions) via SSH, just like any SCF/EML Linux server. If you were willing to pay Amazon and have an account, you can start a VM (in the Oregon [us-west-2] region) using the BCE GPU AMI by searching for BCE-2015-fall-gpu under “Public Images” at the EC2 console. Then just launch a VM, selecting g2.2xlarge under the GPU instances tab.

332 | 333 |

If you're interested in how to install CUDA-related software on an Ubuntu Linux machine, see build-bce-gpu.sh for the details of how I built the BCE-2015-fall-gpu image based on the BCE-2015-fall image.

334 | 335 |

3) Some basics of GPU use

336 | 337 |

3.1) Getting information about the GPU

338 | 339 |

First let's see how we get information about the GPU and activity on the GPU.

340 | 341 |

Hardware specifications

342 | 343 |

First, executing the following code as root will create an executable that will show you details on the GPU, including the possible block and grid dimensions (described shortly).

344 | 345 |
cd  /usr/local/cuda/samples/1_Utilities/deviceQuery
 346 | nvcc deviceQuery.cpp -I/usr/local/cuda/include \
 347 |    -I/usr/local/cuda-5.5/samples/common/inc -o /usr/local/cuda/bin/deviceQuery
 348 | cd -
 349 | 
350 | 351 |

Once the deviceQuery executable is created, you can run it whenever you want.

352 | 353 |

You'll see information such as the following.

354 | 355 |
paciorek@scf-sm20:~> deviceQuery
 356 | deviceQuery Starting...
 357 | 
 358 |  CUDA Device Query (Runtime API) version (CUDART static linking)
 359 | 
 360 | Detected 1 CUDA Capable device(s)
 361 | 
 362 | Device 0: "Tesla K20Xm"
 363 |   CUDA Driver Version / Runtime Version          7.0 / 7.0
 364 |   CUDA Capability Major/Minor version number:    3.5
 365 |   Total amount of global memory:                 5760 MBytes (6039339008 bytes)
 366 |   (14) Multiprocessors, (192) CUDA Cores/MP:     2688 CUDA Cores
 367 |   GPU Max Clock rate:                            732 MHz (0.73 GHz)
 368 |   Memory Clock rate:                             2600 Mhz
 369 |   Memory Bus Width:                              384-bit
 370 |   L2 Cache Size:                                 1572864 bytes
 371 |   Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
 372 |   Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
 373 |   Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
 374 |   Total amount of constant memory:               65536 bytes
 375 |   Total amount of shared memory per block:       49152 bytes
 376 |   Total number of registers available per block: 65536
 377 |   Warp size:                                     32
 378 |   Maximum number of threads per multiprocessor:  2048
 379 |   Maximum number of threads per block:           1024
 380 |   Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
 381 |   Max dimension size of a grid size    (x,y,z): (2147483647, 65535, 65535)
 382 |   Maximum memory pitch:                          2147483647 bytes
 383 |   Texture alignment:                             512 bytes
 384 |   Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
 385 |   Run time limit on kernels:                     No
 386 |   Integrated GPU sharing Host Memory:            No
 387 |   Support host page-locked memory mapping:       Yes
 388 |   Alignment requirement for Surfaces:            Yes
 389 |   Device has ECC support:                        Enabled
 390 |   Device supports Unified Addressing (UVA):      Yes
 391 |   Device PCI Domain ID / Bus ID / location ID:   0 / 2 / 0
 392 |   Compute Mode:
 393 |      < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
 394 | 
 395 | deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 1, Device0 = Tesla K20Xm
 396 | Result = PASS
 397 | 
398 | 399 |

Observing Performance on the GPU

400 | 401 |

The following command will allow you to see some information analogous to top on the CPU.

402 | 403 |
nvidia-smi -q -d UTILIZATION -l 1
 404 | 
405 | 406 |

Here's some example output when the GPU is idle:

407 | 408 |
==============NVSMI LOG==============
 409 | 
 410 | Timestamp                           : Mon Jan 25 17:45:12 2016
 411 | Driver Version                      : 346.46
 412 | 
 413 | Attached GPUs                       : 1
 414 | GPU 0000:02:00.0
 415 |     Utilization
 416 |         Gpu                         : 0 %
 417 |         Memory                      : 0 %
 418 |         Encoder                     : 0 %
 419 |         Decoder                     : 0 %
 420 | 
 421 | 
422 | 423 |

Memory use based on the above does not seem to actually indicate how much of the overall GPU memory is in use for some reason.

424 | 425 |

Instead, to see how much memory is used on the GPU, the following will work:

426 | 427 |
nvidia-smi -q -d MEMORY -l 1
 428 | 
429 | 430 |

Here's some example output when not much memory is in use on the GPU:

431 | 432 |
==============NVSMI LOG==============
 433 | 
 434 | Timestamp                           : Thu Jan 28 12:06:24 2016
 435 | Driver Version                      : 346.46
 436 | 
 437 | Attached GPUs                       : 1
 438 | GPU 0000:02:00.0
 439 |     FB Memory Usage
 440 |         Total                       : 5759 MiB
 441 |         Used                        : 12 MiB
 442 |         Free                        : 5747 MiB
 443 |     BAR1 Memory Usage
 444 |         Total                       : 256 MiB
 445 |         Used                        : 2 MiB
 446 |         Free                        : 254 MiB
 447 | 
448 | 449 |

3.2) Overview of computation on a GPU

450 | 451 |

The basic series of operations to use a GPU when writing your own GPU code is:

452 | 453 |
    454 |
  • allocate memory on the GPU
  • 455 |
  • transfer data from CPU to GPU
  • 456 |
  • launch the CUDA kernel to operate on the threads, with a given block/grid arrangement
  • 457 |
  • (optionally) launch another kernel, which can access data stored on the GPU, including results from the previous kernel
  • 458 |
  • transfer results back to CPU
  • 459 |
460 | 461 |

The key computations are done in the kernel. Kernels are functions that encode the core computational operations that are executed in parallel. The basic mode of operation with a GPU when you are writing your own GPU code is to write a kernel using CUDA code and then call the kernel in parallel via C, R, or Python code.

462 | 463 |

As outlined above, we need to pass any data from the CPU to the GPU and do the same in reverse to get the result. We'll also need to allocate memory on the GPU. However in some cases the transfer and allocation will be done automatically behind the scenes.

464 | 465 |

3.3) Threads, Blocks, and Grids

466 | 467 |

Programming on a GPU (in particular programming for efficiency) requires some understanding of how parallelization works on the GPU. Each individual computation or series of computations on the GPU is done in a thread. Threads are organized into blocks and blocks of threads are organized in a grid. The blocks and grids can be 1-, 2-, or 3-dimensional. E.g., you might have a 1-d block of 256 threads, with a grid of 3 x 3 such blocks, for a total of \(256 \times 9 = 2304\) threads. The choice of the grid/block arrangement can affect efficiency. I'm not an expert at this level of detail but we'll see some about this in the worked example. Note that using more than 1-dimensional grids and blocks is purely for the conceptual convenience of the programmer and doesn't correspond to anything on the hardware. So for the most part we'll use a one-dimensional grid of blocks and a one-dimensional blocks of threads. 468 | In general you'd want each independent calculation done in a separate thread, though as we'll see in Section 5 on simulation, one might want to do a sequence of calculations on each thread. In general, you'll want to pipeline together multiple operations within a computation to avoid copying from CPU to GPU and back. Alternatively, this can be done by keeping the data on the GPU and calling a second kernel.

469 | 470 |

Threads are quick to start, and to get efficiency you want to have thousands of threads to exploit the parallelism of the GPU hardware. In general your calculations will have more threads than GPU cores; the GPU will manage the process of executing all the threads.

471 | 472 |

This can all get quite complicated, with the possibility for communication amongst threads. Threads within a block have some (48Kb) of shared memory (distinct from the main GPU memory) and can synchronize with each other, while threads in different blocks cannot cooperate. We'll see some basic examples of this in our working example later. The Suchard et al. paper referenced in the last Section discusses how to get more efficiency by having threads within a block cooperate and access shared memory, which is much faster than accessing the main GPU (device) memory.

473 | 474 |

If we go back to the deviceQuery output, we'll see information on the number of physical CUDA cores and main GPU memory as well as information about the maximum threads per block and the maximum dimensions of thread blocks and grids.

475 | 476 |

3.4) “Hello, world” using CUDA directly

477 | 478 |

First let's see a 'Hello, World' example that illustrates blocks of threads and grids of blocks.

479 | 480 |

The idea is to have at least as many threads as the number of computations you are doing. Our kernel function contains the core calculation we want to do (in this case printing 'Hello world!') and code that figures out identifying information for each thread as discussed next.

481 | 482 |

When we write a kernel, we will need to have some initial code that determines a unique ID for that thread that allows the thread to access the appropriate part(s) of the data object(s) on the GPU and 'know' what part of the computation it should do. This is done based on information stored in variables that CUDA provides that have information about the thread and block indices and block and grid dimensions.

483 | 484 |

Here's the example code (helloWorld.cu on the github repo).

485 | 486 |

In this case, compilation is as follows. Given the CUDA functionality used in the code (in particular the call to printf within the kernel), we need to specify compilation for a compute capability >= 2.0 (corresponding to the Fermi generation of NVIDIA GPUs) (more below). Note that our query above indicated that the GPU we are using has capability 3.5, so this constraint is fine.

487 | 488 |
nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld
 489 | 
490 | 491 |

The result of this looks like:

492 | 493 |
Launching 20480 threads (N=20000)
 494 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(448,0,0) => thread index=1984
 495 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(449,0,0) => thread index=1985
 496 | Hello world! My block index is (3,0) [Grid dims=(20,2)], 3D-thread index within block=(450,0,0) => thread index=1986
 497 | ....
 498 | 
 499 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(220,0,0) => thread index=20188 
 500 | [### this thread would not be used for N=20000 ###]
 501 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(221,0,0) => thread index=20189 
 502 | [### this thread would not be used for N=20000 ###]
 503 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(222,0,0) => thread index=20190 
 504 | [### this thread would not be used for N=20000 ###]
 505 | Hello world! My block index is (19,1) [Grid dims=(20,2)], 3D-thread index within block=(223,0,0) => thread index=20191 
 506 | [### this thread would not be used for N=20000 ###]
 507 | kernel launch success!
 508 | That's all!
 509 | 
510 | 511 |

Note that because of some buffering issues, with this many threads, we can't see the output for all of them, hence the if statement in the kernel code. It is possible to retrieve info about the limit and change the limit using cudaDeviceGetLimit() and cudaDeviceSetLimit().

512 | 513 |

3.5) CUDA compute capability

514 | 515 |

The compute capability basically refers to the evolving functionality of the NVIDIA architecture. Higher numbers provide more functionality but will only run on newer GPU hardware.

516 | 517 |

For example, to use doubles rather than floats you need compute capability of at least 1.3. This required compute capability needs to be specified when you are compiling CUDA code.

518 | 519 |

4) Executing kernels

520 | 521 |

A note on the speed comparisons in the remaining section. These compare a fully serial CPU calculation on a single core to calculation on the GPU. On a multicore machine, we could speed up the CPU calculation by writing code to parallelize the calculation (e.g., via threading in C/openMP or various parallelization tools in R or Python).

522 | 523 |

Also, note that in the various examples when I want to assess computational time, I make sure to synchronize all the threads via an appropriate function call. This ensures that all of the threads have finished their kernel calculations before I mark the end of the time interval. In general a function call to do a calculation on the GPU will simply start the calculation and then return, with the calculation continuing on the GPU.

524 | 525 |

In this section, I'll demonstrate calling a kernel that simply computes the normal density function (PDF) on a vector of values in parallel, one value per thread.

526 | 527 |

4.1) Running a kernel from C/CUDA

528 | 529 |

Now let's see our example implemented using CUDA code, including memory allocation on the GPU and transfer between the GPU and CPU.

530 | 531 |

My kernel code allocates memory on the CPU and the device (GPU) memory and the kernel function uses the device memory for the alphas, random numbers, and the output values (the probability estimates).

532 | 533 |

Note that here, I'll use 1024 threads per block and then a grid sufficiently large so that we have at least as many threads as computational chunks.

534 | 535 |

Here's the code (kernelExample.cu on the github repo).

536 | 537 |

Compilation is as follows.

538 | 539 |
nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample
 540 | 
541 | 542 |

Here are some results:

543 | 544 |
====================================================
 545 | Grid dimension is 46 x 46
 546 | Launching 2166784 threads (N=2097152)
 547 | Input values: -0.658344 0.499804 -0.807257...
 548 | Memory Copy from Host to Device successful.
 549 | Memory Copy from Device to Host successful.
 550 | Output values: 0.321214 0.352100 0.288007...
 551 | Output values (CPU): 0.321214 0.352100 0.288007...
 552 | Timing results for n = 2097152
 553 | Transfer to GPU time: 0.009988
 554 | Calculation time (GPU): 0.000366
 555 | Calculation time (CPU): 0.058541
 556 | Transfer from GPU time: 0.001716
 557 | Freeing memory...
 558 | ====================================================
 559 | ...
 560 | ...
 561 | ====================================================
 562 | Grid dimension is 363 x 363
 563 | Launching 134931456 threads (N=134217728)
 564 | Input values: -0.658344 0.499804 -0.807257...
 565 | Memory Copy from Host to Device successful.
 566 | Memory Copy from Device to Host successful.
 567 | Output values: 0.321214 0.352100 0.288007...
 568 | Output values (CPU): 0.321214 0.352100 0.288007...
 569 | Timing results for n = 134217728
 570 | Transfer to GPU time: 0.638223
 571 | Calculation time (GPU): 0.021684
 572 | Calculation time (CPU): 3.470199
 573 | Transfer from GPU time: 0.055798
 574 | Freeing memory...
 575 | ====================================================
 576 | 
577 | 578 |

The speedup in pure computation time is very impressive (175x); surprisingly when I did this same benchmark two years ago with the EC2 g2.x2large instance the speedup was 'only' 40x. However, importantly, we do see that the time for transferring to and from (particularly to) the GPU exceeds the calculation time, reinforcing the idea of keeping data on the GPU when possible.

Using Pinned Memory

Here's some code where we use pinned memory that is 'mapped' to the GPU, such that the GPU directly accesses CPU memory. This can be advantageous if one exceeds the GPU's memory and, according to some sources, is best when you load the data only once. Another approach, using pinned but not mapped memory, allows for more efficient transfer but without the direct access from the GPU, with a hidden transfer done behind the scenes. This may be better if the data is loaded multiple times on the GPU.
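Here's a minimal sketch of the mapped ('zero-copy') approach used in that file: the host buffer is allocated as pinned, mapped memory with cudaHostAlloc, and the kernel is handed a device pointer that aliases it, so no explicit cudaMemcpy calls are needed.

// sketch of pinned, mapped memory; see kernelExample-pinned.cu for the full version
double *cpu_vals, *gpu_vals;
cudaSetDeviceFlags(cudaDeviceMapHost);                                   // enable mapped memory
cudaHostAlloc((void**)&cpu_vals, n*sizeof(double), cudaHostAllocMapped); // pinned host memory
cudaHostGetDevicePointer(&gpu_vals, cpu_vals, 0);                        // device alias of the host buffer
calc_loglik<<<gridSize, blockSize>>>(gpu_vals, n, mu, sigma);            // kernel reads/writes host memory directly
cudaDeviceSynchronize();
cudaFreeHost(cpu_vals);
// for pinned but not mapped memory, allocate with cudaHostAllocDefault and do an explicit cudaMemcpy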

Here's the code (kernelExample-pinned.cu on the github repo).

Here are some results:

====================================================
Grid dimension is 46 x 46
Launching 2166784 threads (N=2097152)
Input values: -0.658344 0.499804 -0.807257...
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 2097152
Calculation time (GPU): 0.003245
Calculation time (CPU): 0.058515
Freeing memory...
====================================================
...
...
====================================================
Grid dimension is 363 x 363
Launching 134931456 threads (N=134217728)
Input values: -0.658344 0.499804 -0.807257...
Output values: 0.321214 0.352100 0.288007...
Output values (CPU): 0.321214 0.352100 0.288007...
Timing results for n = 134217728
Calculation time (GPU): 0.187535
Calculation time (CPU): 3.757175
Freeing memory...
====================================================

So using pinned mapped memory seems to help quite a bit in this case, as the total time with pinned memory is less than the time used for transfer plus calculation in the previous examples.

4.2) Calling CUDA Kernels from R (RCUDA)

When we want to use CUDA from R, the kernel function will remain the same, but the pre- and post-processing is done in R rather than in C. Here's an example, with the same normal density kernel. The CUDA kernel code is saved in a separate file (calc_loglik.cu on the github repo) but is identical to that in the full CUDA+C example above (with the exception that we need to wrap the kernel function in extern "C").

Here's the code (kernelExample.R on the github repo)

In this example we see that we can either transfer data between CPU and GPU manually or have RCUDA do it for us. If we didn't want to overwrite the input, but rather to allocate separate space for the output on the GPU, we could use cudaMalloc() (see example in Section 5.2).

We need to compile the kernel into a ptx object file, either outside of R:

nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu

or inside of R:

ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', target = 'ptx', '-arch=compute_20', '-code=sm_20,compute_20')

Here are some results:

Grid size:
[1] 363 363   1
Total number of threads to launch =  134931456
Running CUDA kernel...
Input values:  0.8966972 0.2655087 0.3721239
Output values:  0.2457292 0.2658912 0.2656543
Output values (implicit transfer):  0.2457292 0.2658912 0.2656543
Output values (CPU with R):  0.2457292 0.2658912 0.2656543
Transfer to GPU time:  0.702
Calculation time (GPU):  0.044
Transfer from GPU time:  0.489
Calculation time (CPU):  8.432
Combined calculation/transfer via .cuda time (GPU):  1.203

So the transfer time is again substantial in relative terms. Without that time, the speedup in pure calculation would be large (roughly 190x here: 8.432 vs. 0.044 seconds).

We can avoid explicitly specifying block and grid dimensions by using the gridBy argument to .cuda(), with syntax as shown in kernelExample.R. For some reason that code is not working, though I have gotten it to work in other contexts.

WARNING #1: Be very careful that the types of the R objects passed to the kernel match what the kernel is expecting. Otherwise the code can hang without an informative error message.

WARNING #2: Note the use of the strict=TRUE argument when passing values to the GPU. This ensures that numeric values are kept as doubles and not coerced to floats.

4.3) Calling CUDA Kernels from Python

With PyCUDA, the kernel code can be directly embedded in the Python script. Otherwise it's fairly similar to the use of RCUDA. Here's the code (kernelExample.py on the github repo).

Here are some results:

Generating random normals...
Running GPU code...
Time for calculation (GPU): 1.008687s
Running Scipy CPU code...
Time for calculation (CPU): 12.572273s
Output from GPU: 0.177782 0.224597 0.109604
Output from CPU: 0.177782 0.224597 0.109604

WARNING: As was the case with R, be careful that the types of the Python objects passed to the kernel match what the kernel is expecting.

5) Random Number Generation (RNG) on the GPU

RNG is done via the CURAND (CUDA Random Number Generation) library. CURAND provides several different generators, including the Mersenne Twister (the default generator in R).

5.1) Seeds and Sequences

From the CUDA documentation:

For the highest quality parallel pseudorandom number generation, each experiment should be assigned a unique seed. Within an experiment, each thread of computation should be assigned a unique sequence number. If an experiment spans multiple kernel launches, it is recommended that threads between kernel launches be given the same seed, and sequence numbers be assigned in a monotonically increasing way. If the same configuration of threads is launched, random state can be preserved in global memory between launches to avoid state setup time.

A lot of important info; we'll interpret and implement much of it in the demo below.

Recall that RNG on a computer involves generation of pseudo-random numbers from a deterministic, periodic sequence. The seed determines where one starts generating from within that sequence. The idea of the sequence numbers is to generate from non-overlapping blocks within the sequence, with each thread getting a different block.
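In device code this maps onto curand_init(seed, sequence, offset, &state): the same seed for every thread in an experiment, a distinct sequence number per thread, and an offset for skipping ahead. A minimal setup kernel, essentially what random.cu does, looks like this:

#include <curand_kernel.h>

// one RNG state per thread: same seed, distinct sequence number (here, the thread index)
__global__ void setup_kernel(curandState *state, int seed, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
        curand_init(seed, idx, 0, &state[idx]);  // seed, sequence, offset, state
}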

5.2) Calling CURAND via RCUDA

For RNG, we need a kernel to initialize the RNG on each thread and one to do the sampling (though they could be combined in a single kernel). Note that the time involved in initializing the RNG for each thread is substantial. This shouldn't be a problem if one is doing a lot of calculations over time. To amortize this one-time expense, I generate multiple random numbers per thread. Here's the kernel code (random.cu on the github repo). The second argument to curand_init is the sequence number - by having contiguous sequence numbers for the threads, the position of the initial random number for a given thread is spaced \(2^{67}\) values apart from the position of the initial random number for the next thread.

And here's the R code (RNGexample.R on the github repo) to call the kernel, which looks very similar to the RCUDA code we've already seen.

Here are some results:

RNG initiation time:  0.062
GPU memory allocation time:  0.001
Calculation time (GPU):  0.228
Transfer from GPU time:  0.423
Calculation time (CPU):  7.292

We get a decent speedup, which would be more impressive if we could set up the calculations such that we don't need to transfer the whole large vector back to the CPU. Also, the code in random.cu uses non-unit strides and could probably be reworked for more efficient global memory access (see Section 7).

Also note the memory cost of the RNG states for the threads, 48 bytes per thread, which could easily exceed GPU memory if one starts up many threads.

At the moment, I'm not sure how to choose the RNG generator from within R.

5.3) Calling CURAND from C and from Python

I may flesh this out at some point, but by looking at the RNG example via RCUDA and the examples of calling kernels from C and Python in the previous section, it should be straightforward to do RNG on the GPU controlled by C or Python.

To choose the generator in C, this should work (in this case choosing the MTGP32 Mersenne Twister variant): curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32), where gen is a curandGenerator_t handle from CURAND's host API.
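For a bit more context, here's a minimal sketch using CURAND's host API (a different route from the device API used in random.cu); it assumes n, mu, and sigma are already defined and fills a buffer in GPU memory in one call.

#include <curand.h>

// sketch: generate n N(mu, sigma) draws into device memory with the MTGP32 generator
curandGenerator_t gen;
double *dVals;
cudaMalloc((void**)&dVals, n * sizeof(double));
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_MTGP32);   // choose the generator type here
curandSetPseudoRandomGeneratorSeed(gen, 0ULL);           // set the seed
curandGenerateNormalDouble(gen, dVals, n, mu, sigma);    // fill the device buffer (n should be even)
curandDestroyGenerator(gen);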

6) Using higher-level functionality to do linear algebra and vectorized operations on the GPU

The idea here is to use software that hides the details of the kernel implementation from us, relying on the expertise of others to efficiently code standard computations on the GPU.

We'll start with very high-level use of the GPU by simply calling linear algebra routines that use the GPU.

6.1) Using C to Call CUBLAS and MAGMA

We can do linear algebra (and basic vectorized operations with vectors and matrices) using GPU implementations of BLAS/LAPACK-type routines. Both CUDA (through CUBLAS) and MAGMA provide access to BLAS functionality, but only MAGMA provides LAPACK-like functionality (i.e., matrix factorizations/decompositions).

We'll make CUBLAS and MAGMA calls directly in C code. The MAGMA library provides a drop-in replacement for BLAS and LAPACK functionality that carries out linear algebra on both the CPU and GPU, choosing smartly where to do various aspects of the calculation. We'll now need to directly manage memory allocation on the GPU and the transfer of data back and forth between the CPU and GPU.

CUDA and CUBLAS

The code doesn't look too different from standard C code with calls to BLAS/LAPACK, but we use some CUDA functions and CUDA types. Here's the example code (cudaBlasExample.c on the github repo).
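As a rough sketch of the pattern in that file (allocate on the GPU, transfer, call the BLAS routine, transfer back), here's a matrix multiply with cublasDgemm; A, B, and C are assumed to be n x n host arrays, and error checking is omitted.

#include <cublas_v2.h>

// sketch: n x n matrix multiply C = A * B on the GPU (column-major, as in BLAS)
cublasHandle_t handle;
cublasCreate(&handle);
double *dA, *dB, *dC, one = 1.0, zero = 0.0;
cudaMalloc((void**)&dA, n*n*sizeof(double));
cudaMalloc((void**)&dB, n*n*sizeof(double));
cudaMalloc((void**)&dC, n*n*sizeof(double));
cublasSetMatrix(n, n, sizeof(double), A, n, dA, n);   // transfer inputs to the GPU
cublasSetMatrix(n, n, sizeof(double), B, n, dB, n);
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
            &one, dA, n, dB, n, &zero, dC, n);        // matrix multiply on the GPU
cublasGetMatrix(n, n, sizeof(double), dC, n, C, n);   // transfer the result back
cublasDestroy(handle);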

Compilation goes as follows. Note that in this case nvcc does not want the file to have a .C or .cu extension.

nvcc cudaBlasExample.c -I/usr/local/cuda/include -lcublas -o cudaBlasExample

And here are (some of) the results:

Starting
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.001642
Matrix multiply time: 0.000481
Transfer from GPU time: 0.001550
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000276
Transfer to GPU time: 0.020364
Matrix multiply time: 0.015466
Transfer from GPU time: 0.015035
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000800
Transfer to GPU time: 0.325620
Matrix multiply time: 0.940571
Transfer from GPU time: 0.229997

For (rough) comparison, the \(n=8192\) multiplication on the CPU (using OpenBLAS as the BLAS, called from R) takes 106 seconds with one core and 18 seconds with 8 cores.

MAGMA

Now let's see the use of MAGMA. MAGMA provides calls analogous to those of CUDA/CUBLAS for allocating memory, transferring data, and making BLAS calls, as well as LAPACK-type calls.

Note that the LAPACK-type calls have both a CPU interface and a GPU interface. The GPU interface calls have function names ending in '_gpu' and operate on data objects in GPU memory. The CPU interface calls operate on data objects in CPU memory, handling the transfer to GPU memory as part of the calculation.
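Here's a minimal sketch of the contrast, using the Cholesky calls that appear in magmaExample.c, where dA is the matrix already in GPU memory and A is the same matrix in CPU memory:

int info;
// GPU interface: the matrix is already in GPU memory
magma_dpotrf_gpu(MagmaLower, n, dA, n, &info);

// CPU interface: the matrix lives in CPU memory; MAGMA handles the transfer internally
magma_dpotrf(MagmaLower, n, A, n, &info);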

Here we'll compare timing for the GPU vs. standard BLAS/LAPACK, as well as the CPU and GPU interfaces for the Cholesky factorization.

Here's the example code (magmaExample.c on the github repo).

Compilation and execution (with and without pinned memory) go as follows. Note that we can use gcc and that we need to link in the CPU BLAS and LAPACK, since MAGMA uses both the CPU and GPU for calculations (plus in this example I directly call BLAS and LAPACK functions).

gcc magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include \
    -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma \
    -llapack -lblas -lcublas -lcudart -o magmaExample
./magmaExample 1
./magmaExample 0

And here are (some of) the results:

Starting
Setting use_pinned to 1
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000256
Transfer to GPU time: 0.085331
Matrix multiply time (GPU): 0.000692
Matrix multiply time (BLAS): 0.049665
Cholesky factorization time (GPU w/ GPU interface): 0.023938
Cholesky factorization time (GPU w/ CPU interface): 0.004702
Cholesky factorization time (LAPACK): 0.006958
Transfer from GPU time: 0.000344
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000366
Transfer to GPU time: 0.005706
Matrix multiply time (GPU): 0.027141
Matrix multiply time (BLAS): 0.446544
Cholesky factorization time (GPU w/ GPU interface): 0.047918
Cholesky factorization time (GPU w/ CPU interface): 0.025746
Cholesky factorization time (LAPACK): 0.077203
Transfer from GPU time: 0.005030
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.000789
Transfer to GPU time: 0.087303
Matrix multiply time (GPU): 1.766567
Matrix multiply time (BLAS): 23.807952
Cholesky factorization time (GPU w/ GPU interface): 0.230186
Cholesky factorization time (GPU w/ CPU interface): 0.259374
Cholesky factorization time (LAPACK): 4.179541
Transfer from GPU time: 0.079991

Setting use_pinned to 0
====================================================
Timing results for n = 512
GPU memory allocation time: 0.000257
Transfer to GPU time: 0.086421
Matrix multiply time (GPU): 0.000655
Matrix multiply time (BLAS): 0.037689
Cholesky factorization time (GPU w/ GPU interface): 0.016963
Cholesky factorization time (GPU w/ CPU interface): 0.011957
Cholesky factorization time (LAPACK): 0.005600
Transfer from GPU time: 0.001391
====================================================
Timing results for n = 2048
GPU memory allocation time: 0.000369
Transfer to GPU time: 0.009003
Matrix multiply time (GPU): 0.027190
Matrix multiply time (BLAS): 0.514402
Cholesky factorization time (GPU w/ GPU interface): 0.039755
Cholesky factorization time (GPU w/ CPU interface): 0.037521
Cholesky factorization time (LAPACK): 0.081121
Transfer from GPU time: 0.013978
====================================================
Timing results for n = 8192
GPU memory allocation time: 0.001062
Transfer to GPU time: 0.136131
Matrix multiply time (GPU): 1.775493
Matrix multiply time (BLAS): 24.222220
Cholesky factorization time (GPU w/ GPU interface): 0.224644
Cholesky factorization time (GPU w/ CPU interface): 0.400515
Cholesky factorization time (LAPACK): 4.183725
Transfer from GPU time: 0.204625

So we see decent speed-ups both for the matrix multiplication and the Cholesky factorization; the comparisons are with respect to 8 CPU cores.

853 | 854 |

Using the CPU interface seems to provide a modest speedup (compared to the manual transfer + calculation time), as does using pinned memory.

855 | 856 |

6.2) Using PyCUDA to do GPU calculations directly in Python

857 | 858 |

PyCUDA also provides high-level functionality for vectorized calculations on the GPU. Basically you create a vector stored in GPU memory and then operate on it with a variety of mathematical functions. The modules that do this are gpuarray and cumath.

859 | 860 |

Here's the code (gpuArrayExample.py on the github repo)

861 | 862 |

Here are the timing results.

863 | 864 |
Transfer to GPU time: 0.639403s
 865 | Timing vectorized exponentiation:
 866 | GPU array calc time (initial): 0.276190s
 867 | GPU array calc time: 0.014222s
 868 | CPU calc time: 2.704504s
 869 | Timing vectorized dot product/sum of squares:
 870 | GPU array calc time (initial): 0.229969s
 871 | GPU array calc time: 0.007769s
 872 | CPU calc time: 0.071532s
 873 | 
874 | 875 |

So we see a good speedup for the vectorized exponentiation. However, there is some compilation that gets done when the code is run the first time that slows down the initial calculation. Also, again, the transfer of data to the GPU takes a chunk of time.

For the dot product, the speedup is not as impressive, probably because the aggregation that is needed to do the sum involves coordination across threads.

6.3) Using R packages to do vectorized operations and linear algebra on the GPU

Various R packages hide the details of the GPU implementation and allow you to do vector and matrix operations, including linear algebra, using standard R code. In some cases they overload the usual R functions such that you can simply call a function of the same name as in base R.

Some packages you might investigate include:

  • HiPLARM (apparently this uses MAGMA behind the scenes)
  • gpuR (uses OpenCL rather than CUDA)
  • gmatrix
  • gputools

7) An extended example of optimizing GPU kernel code

Here we'll implement a basic but real computation that is a component of a larger collaboration I am engaged in. The basic context is understanding spatial variation in the species composition of forests in the eastern United States. The data are multinomial samples of counts of trees of different species at many different spatial locations (i.e., observations). We fit a spatial version of a multicategory probit regression model.

I'll compare a basic R implementation and a C++ implementation with various GPU implementations designed to improve the speed of the GPU calculation. I'll use R to manage the C++ and CUDA code (via Rcpp and RCUDA), but there's no reason one couldn't do this via Python or C/C++ on the front-end. Our main focus will be on the different CUDA implementations.

All of the implementations are in the example directory in the repository.

7.1) Example: Probit regression probabilities

Probit regression basics

Consider probit regression, which is similar to logistic regression. The probability of a binary outcome is given as \(p = P(Y = 1) = \Phi(X\beta)\), where \(\Phi(\cdot)\) is the standard normal CDF.

The probit model can be rewritten in a latent variable representation that, in a Bayesian context, can facilitate MCMC computations to fit the model:
\[
Y = I(W > 0)
\]
\[
W \sim N(X\beta, 1)
\]

Suppose we know \(\beta\). In order to determine \(p\), we could use Monte Carlo simulation to estimate the integral \(P(Y = 1) = P(W > 0) = \int_{0}^{\infty} f(w)\, dw\), where \(f\) is the \(N(X\beta, 1)\) density.

Now for probit regression we could just use standard numerical routines for the normal CDF to compute this. But for the multinomial extension we discuss next, we need Monte Carlo simulation.

Multinomial probit regression

Let \(Y\) be a categorical variable, \(Y \in \{1,2,\ldots,K\}\). Then a multinomial extension of the latent variable probit model is
\[
Y = \arg\max_k W_k
\]
\[
W_k \sim N(X\beta_k, 1)
\]

Now to compute \(p = (P(Y=1), P(Y=2), \ldots, P(Y=K))\) we can again do Monte Carlo simulation. The basic steps are:

  • iterate m = 1, …, M
      • for k = 1, …, K, sample \(W_k\) from its corresponding normal distribution
      • determine the arg max of the \(W_k\)'s
  • over the \(M\) simulations, count the number of times each category had the largest corresponding \(W_k\)

The proportion of times each category had the largest \(W_k\) is an estimate of the corresponding multinomial probability of interest.

For our example, we want to do this computation for large \(M\) (to reduce Monte Carlo error) and for many observations with different \(X\) values. In our code, we will assume that we are given a vector \(\alpha_i = \{X_i\beta_k\}_{k=1,\ldots,K}\) for each observation \(i\), resulting in an \(n\) by \(K\) matrix.

Finally, note that I can reuse the random numbers I need across the \(n\) observations (in fact, this probably reduces Monte Carlo error in certain ways), so I just need an \(M\) by \(K\) matrix of standard normal random variables. Even for large \(M\) this is not so big, and I'll simply generate the values once on the CPU.

7.2) R and C baseline implementations

In example_pureR.R and example_Rcpp.R I've implemented the calculation for \(n=26280\), \(K=21\), and \(M=10000\). I tried to write efficient vectorized R code and efficient C++ code (called from R, for convenience). I've also implemented parallel versions for both R and C++.

The pure R version takes about 570 seconds in serial and 140 seconds with eight cores. The C++ version takes about 47 seconds in serial and 6 seconds with eight cores.

7.3) A basic (but thoughtful) implementation

example_RCUDA.R is the main R script that calls different kernel variations as I experimented with different strategies for efficiency.

In compute_probs.cu I make use of the already-computed random numbers, and allocate a temporary vector w to hold the value of \(w\) for the current Monte Carlo sample.
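To make the structure concrete, here's a stripped-down sketch of this sort of kernel: one thread per observation, a loop over the \(M\) Monte Carlo samples reusing the pre-generated standard normals (an \(M\) by \(K\) matrix z), and a count of how many times each category wins. The variable names and indexing are illustrative rather than copied from compute_probs.cu, and, like the original version, it does not worry about memory-access strides.

// sketch: one thread per observation i; alphas is n x K, z is M x K (both flattened)
__global__ void compute_probs(double* alphas, double* z, double* probs,
                              int n, int K, int M) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    for (int k = 0; k < K; k++) probs[i*K + k] = 0.0;
    for (int m = 0; m < M; m++) {
        int maxk = 0;
        double maxw = alphas[i*K] + z[m*K];            // w_1 for this Monte Carlo sample
        for (int k = 1; k < K; k++) {
            double w = alphas[i*K + k] + z[m*K + k];   // w_k = alpha_ik + z_mk
            if (w > maxw) { maxw = w; maxk = k; }
        }
        probs[i*K + maxk] += 1.0;                      // category maxk wins this sample
    }
    for (int k = 0; k < K; k++) probs[i*K + k] /= M;   // convert counts to proportions
}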

Some features of my code:

  • It's generally recommended to have 128-256 threads per block, with the number a multiple of 32 (because threads operate in lock-step in 'warps' of 32 threads). So I'm using 192 threads per block.
  • I then determine the number of blocks (of 192 threads each) that I need so I can have one thread for each of my \(n\) observations.
  • For this algorithm, as mentioned, I can reuse the random numbers across observations, so I don't generate them individually on the GPU.
  • I haven't thought about locality of memory access (i.e., strides, row-major vs. column-major) in this version of the code.

Let's execute this:

cd example
Rscript example_RCUDA.R

This takes 12.1 seconds.

7.4) Accessing memory efficiently

Access to the device memory is slow (memory latency), but GPUs are good at switching between different threads while data is being retrieved from memory. Also, the GPU can access memory from consecutive memory locations efficiently and coalesce (combine) the memory accesses of groups of threads in a warp. Finally, threads in a warp execute in lock-step. The implication of this is that we want the threads in a warp to retrieve contiguous values from the device memory. This means using a 'stride' of one when incrementing through a vector (analogous to moving along rows in a row-major matrix).
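In kernel terms the difference looks roughly like this (a toy kernel; the names are illustrative): with the first indexing, neighboring threads touch elements \(K\) apart, while with the second, after transposing the matrix, they touch adjacent elements, which the hardware can coalesce into far fewer memory transactions.

// toy illustration of strided vs. unit-stride access; alphas is n x K, alphas_t is its transpose (K x n)
__global__ void strided_vs_unit(const double* alphas, const double* alphas_t,
                                double* out, int n, int K) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;   // one observation per thread
    if (idx >= n) return;
    double sum = 0.0;
    for (int k = 0; k < K; k++) {
        sum += alphas[idx * K + k];    // non-unit stride: threads idx and idx+1 access elements K apart
        sum += alphas_t[k * n + idx];  // unit stride: threads idx and idx+1 access adjacent elements (coalesced)
    }
    out[idx] = sum;
}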

In the original code, I was striding through alphas and probs in strides of \(K\). Thinking of the various matrices as having \(K\) rows and being column-major, I was accessing values from adjacent columns on contiguous threads when I should have accessed values from adjacent rows.

Let's transpose the matrices sent to the GPU memory and access adjacent rows, i.e., strides of one, across contiguous threads, as shown in compute_probs_unitStrides.cu.

echo "unitStrides <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R

This takes 8.5 seconds, which is a nice speed-up for a simple change, but not earth-shattering.

7.5) Using shared memory (within a block)

Next let's consider whether it makes sense to move any data into shared memory, which can be accessed something like 100x as fast as device memory and functions like a programmer-managed cache. Shared memory is shared across all threads in a block. Some implications of this are:

  • We need to be careful to do the indexing within blocks.
  • We need to transfer any results out of shared memory (back into device memory) in order to get them back to the CPU.
  • We don't need the calculations synchronized across threads because each thread owns the calculations for a single observation; however, in other situations we might need to put a barrier in place that ensures all threads are finished with a particular calculation before any proceed to the next steps, using the __syncthreads() function.
  • We only have 48 KB of shared memory per block (see the results of deviceQuery), so we need to make sure the number of threads per block is not so large as to exceed that. In this case, with 192 threads per block and \(K=21\) double-precision values per thread for each of w and probs (192 x 21 x 8 bytes x 2, or about 64 KB), we're over the limit, so we need to go down to 96 threads per block.

Here we notice that w and probs are accessed in device memory multiple times, and furthermore, probs is not even needed as an input, so let's try to manage these values in shared memory, as shown in compute_probs_unitStrides_sharedMem.cu.

echo "unitStrides <- TRUE" > /tmp/tmp.R
echo "sharedMem <- TRUE" >> /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R

This takes 1.5 seconds, so we see a big improvement from using shared memory.
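For reference, here's a minimal sketch of the shared-memory pattern (under assumptions about names and layout, not the literal contents of compute_probs_unitStrides_sharedMem.cu): each thread in a block owns a slice of a __shared__ array, works on it in fast memory, and copies its finished values back out to device memory at the end.

#define K 21
#define THREADS_PER_BLOCK 96

// sketch: one thread per observation, with w and probs kept in shared memory
__global__ void compute_probs_sharedMem(double* alphas, double* z, double* probs,
                                        int n, int M) {
    __shared__ double w_sh[THREADS_PER_BLOCK * K];       // per-thread slice of w
    __shared__ double probs_sh[THREADS_PER_BLOCK * K];   // per-thread slice of probs
    int i = blockIdx.x * blockDim.x + threadIdx.x;       // global observation index
    int t = threadIdx.x;                                 // index within this block
    if (i >= n) return;
    for (int k = 0; k < K; k++) probs_sh[t*K + k] = 0.0;
    for (int m = 0; m < M; m++) {
        int maxk = 0;
        for (int k = 0; k < K; k++) {
            w_sh[t*K + k] = alphas[k*n + i] + z[k*M + m];   // unit strides into device memory
            if (w_sh[t*K + k] > w_sh[t*K + maxk]) maxk = k;
        }
        probs_sh[t*K + maxk] += 1.0;
    }
    // copy results back to device memory so they can be transferred to the CPU
    for (int k = 0; k < K; k++) probs[k*n + i] = probs_sh[t*K + k] / M;
}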

Surprisingly, using shared memory for access to alphas actually slowed things down 2-3-fold. I'm not sure why.

Finally, in some cases you can use shared memory to avoid non-unit strides; the classic example is a matrix transpose. Basically, any non-unit striding is done only in shared memory, while reading from and writing to device memory is done using unit strides.
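Here's a sketch of that idea (the standard tiled-transpose pattern, not code from this repo): each block stages a TILE x TILE tile in shared memory, so the reads from and writes to device memory are both unit-stride, and only the shared-memory accesses are strided.

#define TILE 32

// sketch of a shared-memory matrix transpose: out = t(in), both n x n, launched with TILE x TILE blocks
__global__ void transpose(double* out, const double* in, int n) {
    __shared__ double tile[TILE][TILE + 1];   // +1 padding helps avoid shared-memory bank conflicts
    int x = blockIdx.x * TILE + threadIdx.x;
    int y = blockIdx.y * TILE + threadIdx.y;
    if (x < n && y < n)
        tile[threadIdx.y][threadIdx.x] = in[y * n + x];    // coalesced (unit-stride) read
    __syncthreads();                                       // wait until the whole tile is loaded
    x = blockIdx.y * TILE + threadIdx.x;                   // transposed block position
    y = blockIdx.x * TILE + threadIdx.y;
    if (x < n && y < n)
        out[y * n + x] = tile[threadIdx.x][threadIdx.y];   // coalesced write; strided access stays in shared memory
}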

7.6) Using single precision (floats)

Traditionally, GPU calculations are done in single precision, and this can apparently be much faster than double precision calculations.

Here I get a roughly two- to three-fold speedup using floats rather than doubles, both for the original version of the code with non-unit strides and without shared memory (first example below) and for the optimized version of the code (second example below). As shown in the various “_float” kernel files, all I need to do is change “double” to “float”. And when calling from R, there are some housekeeping items shown in example_RCUDA.R.

echo "float <- TRUE" > /tmp/tmp.R
cat example_RCUDA.R >> /tmp/tmp.R
Rscript /tmp/tmp.R
echo "float <- TRUE" > /tmp/tmp.R
1030 | echo "unitStrides <- TRUE" > /tmp/tmp.R
1031 | echo "sharedMem <- TRUE" >> /tmp/tmp.R
1032 | cat example_RCUDA.R >> /tmp/tmp.R
1033 | Rscript /tmp/tmp.R 
1034 | 
1035 | 1036 |

7.7) Summary

For this example, here are the speeds, and the speed relative to the eight-core C++ implementation:

Implementation            Time (sec.)    Speed (relative to C++)
R (8 cores)               140            0.04
C++ (8 cores)             6.0            1.0
basic CUDA                12.1           0.5
unit strides              8.5            0.7
shared memory             1.5            4.0
shared memory + floats    0.6            10.7

Interestingly on Savio, the C++ time was 9.8, while the shared memory time was 0.67 and the shared memory + floats time was 0.31.

8) Final Comments

8.1) Some Thoughts on Improving Computational Speed

Suchard et al. (2010; Journal of Computational and Graphical Statistics 19:419) and Lee et al. (2010; Journal of Computational and Graphical Statistics 19:769) discuss the use of GPUs for statistics. The speedups they see can get as high as 120 times and 500 times the speed of a single CPU core, respectively. Some of the reasons these speedups are so impressive (more so than in some of the examples here) include:

  • Use of single precision floating point calculations. If single precision doesn't affect your calculation substantively, this is worth trying. Particularly on older GPUs (but perhaps still true), single precision was much faster than double precision.
  • Computational tasks that are very arithmetically intensive but with limited memory access (see the Lee et al. paper).
  • Ensuring that contiguously-numbered threads access contiguous memory locations.
  • Careful use of shared memory (shared amongst the threads in a block) in place of the main GPU memory (see the Suchard et al. paper); in particular, this can avoid accessing non-contiguous memory.
  • Avoiding conditional statements and synchronization/barriers, since threads operate in lock-step in groups of 32 threads (a 'warp').

So for some tasks, and likely with additional coding effort, you may see speedups of 100-200x compared to a single CPU core.

Finally, rather than bringing a large chunk of data back to the CPU, you might do a reduction/aggregation operation (e.g., summing over values) on the GPU; presentations and tutorials on parallel reduction in CUDA have useful information on how to do this efficiently.
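For reference, here's a minimal sketch of the standard pattern (not code from this repo): each block sums its chunk of the vector in shared memory via a tree reduction and writes one partial sum back to device memory; the much smaller vector of per-block sums can then be summed on the CPU or with a second kernel launch.

#define THREADS 256

// sketch: each block reduces its THREADS values of x to a single partial sum
__global__ void block_sum(const double* x, double* partial, int n) {
    __shared__ double buf[THREADS];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    buf[threadIdx.x] = (idx < n) ? x[idx] : 0.0;
    __syncthreads();
    // tree reduction within the block
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride)
            buf[threadIdx.x] += buf[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        partial[blockIdx.x] = buf[0];   // one partial sum per block
}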

8.2) A Comment on Compilation

If you compile CUDA code into an object file, you can link that with other object files (e.g., from C or C++ code) into an executable that can operate on CPU and GPU. This also means you could compile a shared object (i.e., a library) that you could call from R with .C, .Call, or Rcpp.
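Here's a sketch of what that can look like; the file name, wrapper name, and exact flags are illustrative assumptions rather than anything in this repo. The pattern is to expose an extern "C" host function that launches the kernel, compile with nvcc, and link into a shared library that R can load with dyn.load() and call via .C().

// sketch (hypothetical file mykernel.cu); compile/link roughly as:
//   nvcc -c -Xcompiler -fPIC mykernel.cu -o mykernel.o
//   g++ -shared mykernel.o -L/usr/local/cuda/lib64 -lcudart -o mykernel.so
// then in R: dyn.load("mykernel.so"); .C("calc_loglik_wrapper", ...)

extern "C" void calc_loglik_wrapper(double* x, int* n, double* mu, double* sigma) {
    double* dx;
    cudaMalloc((void**)&dx, *n * sizeof(double));
    cudaMemcpy(dx, x, *n * sizeof(double), cudaMemcpyHostToDevice);
    int threads = 1024;
    int blocks = (*n + threads - 1) / threads;   // 1-d grid for simplicity; use a 2-d grid for very large n
    calc_loglik<<<blocks, threads>>>(dx, *n, *mu, *sigma);   // kernel defined elsewhere in the .cu file
    cudaMemcpy(x, dx, *n * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(dx);
}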


8.3) Some references:

1104 | 1105 | 1111 | 1112 | 1113 | 1114 | 1115 | -------------------------------------------------------------------------------- /gpuArrayExample.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import pycuda.gpuarray as gpuarray 4 | import pycuda.cumath as cumath 5 | import numpy as np 6 | 7 | n = np.int32(134217728) 8 | 9 | start = drv.Event() 10 | end = drv.Event() 11 | 12 | x = np.random.normal(size = n) 13 | x_short = np.random.normal(size = 8) 14 | 15 | start.record() 16 | dev_x = gpuarray.to_gpu(x) 17 | dev_x_short = gpuarray.to_gpu(x_short) 18 | end.record() 19 | end.synchronize() 20 | print "Transfer to GPU time: %fs" %(start.time_till(end)*1e-3) 21 | 22 | 23 | print "Timing vectorized exponentiation:" 24 | 25 | start.record() 26 | dev_expx_short = cumath.exp(dev_x_short) 27 | end.record() 28 | end.synchronize() 29 | print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3) 30 | 31 | start.record() 32 | dev_expx = cumath.exp(dev_x) 33 | end.record() 34 | end.synchronize() 35 | print "GPU array calc time: %fs" %(start.time_till(end)*1e-3) 36 | 37 | start.record() 38 | exp_x = np.exp(x) 39 | end.record() 40 | end.synchronize() 41 | print "CPU calc time: %fs" %(start.time_till(end)*1e-3) 42 | 43 | print "Timing vectorized dot product/sum of squares:" 44 | 45 | start.record() 46 | gpuarray.dot(dev_x_short,dev_x_short) 47 | end.record() 48 | end.synchronize() 49 | print "GPU array calc time (initial): %fs" %(start.time_till(end)*1e-3) 50 | 51 | start.record() 52 | gpuarray.dot(dev_x,dev_x) 53 | end.record() 54 | end.synchronize() 55 | print "GPU array calc time: %fs" %(start.time_till(end)*1e-3) 56 | 57 | start.record() 58 | np.dot(x, x) 59 | end.record() 60 | end.synchronize() 61 | print "CPU calc time: %fs" %(start.time_till(end)*1e-3) 62 | -------------------------------------------------------------------------------- /helloWorld.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | // Note: Needs compute capability >= 2.0, so compile with: 6 | // nvcc helloWorld.cu -arch=compute_20 -code=sm_20,compute_20 -o helloWorld 7 | 8 | // number of computations: 9 | #define N 20000 10 | // constants for grid and block sizes 11 | #define GRID_D1 20 12 | #define GRID_D2 2 13 | #define BLOCK_D1 512 14 | #define BLOCK_D2 1 15 | #define BLOCK_D3 1 16 | 17 | // this is the kernel function called for each thread 18 | // we use the CUDA variables {threadIdx, blockIdx, blockDim, gridDim} to determine a unique ID for each thread 19 | __global__ void hello(void) 20 | { 21 | // id of the block 22 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 23 | // size of each block (within grid of blocks) 24 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 25 | // id of thread in a given block 26 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 27 | // assign overall id/index of the thread 28 | int idx = myblock * blocksize + subthread; 29 | if(idx < 2000 || idx > 19000) { 30 | // print buffer from within the kernel is limited so only print for first and last chunks of threads 31 | if (idx < N){ 32 | printf("Hello world! My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => \ 33 | thread index=%d\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, threadIdx.x, threadIdx.y, threadIdx.z, idx); 34 | } else { 35 | printf("Hello world! 
My block index is (%d,%d) [Grid dims=(%d,%d)], 3D-thread index within block=(%d,%d,%d) => \ 36 | thread index=%d [### this thread would not be used for N=%d ###]\n", blockIdx.x, blockIdx.y, gridDim.x, gridDim.y, 37 | threadIdx.x, threadIdx.y, threadIdx.z, idx, N); 38 | } 39 | } 40 | } 41 | 42 | 43 | int main(int argc,char **argv) 44 | { 45 | // objects containing the block and grid info 46 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 47 | const dim3 gridSize(GRID_D1, GRID_D2, 1); 48 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*GRID_D1*GRID_D2; 49 | if (nthreads < N){ 50 | printf("\n============ NOT ENOUGH THREADS TO COVER N=%d ===============\n\n",N); 51 | } else { 52 | printf("Launching %d threads (N=%d)\n",nthreads,N); 53 | } 54 | 55 | // launch the kernel on the specified grid of thread blocks 56 | hello<<>>(); 57 | 58 | // Need to flush prints, otherwise none of the prints from within the kernel will show up 59 | // as program exit does not flush the print buffer. 60 | cudaError_t cudaerr = cudaDeviceSynchronize(); 61 | if (cudaerr){ 62 | printf("kernel launch failed with error \"%s\".\n", 63 | cudaGetErrorString(cudaerr)); 64 | } else { 65 | printf("kernel launch success!\n"); 66 | } 67 | 68 | printf("That's all!\n"); 69 | 70 | return 0; 71 | } 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /kernelExample-pinned.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define SQRT_TWO_PI 2.506628274631000 8 | #define BLOCK_D1 1024 9 | #define BLOCK_D2 1 10 | #define BLOCK_D3 1 11 | 12 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 13 | // nvcc kernelExample-pinned.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample-pinned 14 | // -use_fast_math 15 | 16 | // CUDA kernel: 17 | __global__ void calc_loglik(double* vals, int n, double mu, double sigma) { 18 | // note that this assumes no third dimension to the grid 19 | // id of the block 20 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 21 | // size of each block (within grid of blocks) 22 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 23 | // id of thread in a given block 24 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 25 | // assign overall id/index of the thread 26 | int idx = myblock * blocksize + subthread; 27 | 28 | if(idx < n) { 29 | double std = (vals[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } 33 | } 34 | 35 | int calc_loglik_cpu(double* vals, int n, double mu, double sigma) { 36 | double std, e; 37 | for(int idx = 0; idx < n; idx++) { 38 | std = (vals[idx] - mu)/sigma; 39 | e = exp( - 0.5 * std * std); 40 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 41 | } 42 | return 0; 43 | } 44 | 45 | 46 | /* --------------------------- host code ------------------------------*/ 47 | void fill( double *p, int n ) { 48 | int i; 49 | srand48(0); 50 | for( i = 0; i < n; i++ ) 51 | p[i] = 2*drand48()-1; 52 | } 53 | 54 | double read_timer() { 55 | struct timeval end; 56 | gettimeofday( &end, NULL ); 57 | return end.tv_sec+1.e-6*end.tv_usec; 58 | } 59 | 60 | int main (int argc, char *argv[]) { 61 | double* cpu_vals; 62 | double* gpu_vals; 63 | int n; 64 | cudaError_t cudaStat; 65 | 66 | printf("====================================================\n"); 67 | for( n = 32768; n <= 134217728; n*=8 ) { 68 | // 
allocated pinned and mapped memory on CPU 69 | cudaSetDeviceFlags(cudaDeviceMapHost); 70 | cudaHostAlloc((void**)&cpu_vals, n*sizeof(double), cudaHostAllocMapped); 71 | 72 | // map the CPU storage to the GPU to the CPU storage 73 | cudaStat = cudaHostGetDevicePointer(&gpu_vals, cpu_vals, 0); 74 | if(cudaStat != cudaSuccess) { 75 | printf ("device memory mapping failed"); 76 | return EXIT_FAILURE; 77 | } 78 | 79 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 80 | 81 | int tmp = ceil(pow(n/BLOCK_D1, 0.5)); 82 | printf("Grid dimension is %i x %i\n", tmp, tmp); 83 | dim3 gridSize(tmp, tmp, 1); 84 | 85 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 86 | if (nthreads < n){ 87 | printf("\n============ NOT ENOUGH THREADS TO COVER n=%d ===============\n\n",n); 88 | } else { 89 | printf("Launching %d threads (n=%d)\n", nthreads, n); 90 | } 91 | 92 | double mu = 0.0; 93 | double sigma = 1.0; 94 | 95 | // simulate 'data' 96 | fill(cpu_vals, n); 97 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 98 | 99 | cudaDeviceSynchronize(); 100 | double tInit = read_timer(); 101 | 102 | // do the calculation 103 | calc_loglik<<>>(gpu_vals, n, mu, sigma); 104 | 105 | cudaDeviceSynchronize(); 106 | double tCalc = read_timer(); 107 | 108 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 109 | 110 | // do calculation on CPU for comparison (unfair as this will only use one core) 111 | fill(cpu_vals, n); 112 | double tInit2 = read_timer(); 113 | calc_loglik_cpu(cpu_vals, n, mu, sigma); 114 | double tCalcCPU = read_timer(); 115 | 116 | printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 117 | 118 | printf("Timing results for n = %d\n", n); 119 | printf("Calculation time (GPU): %f\n", tCalc - tInit); 120 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 121 | 122 | printf("Freeing memory...\n"); 123 | printf("====================================================\n"); 124 | cudaFreeHost(cpu_vals); 125 | 126 | } 127 | printf("\n\nFinished.\n\n"); 128 | return 0; 129 | } 130 | 131 | -------------------------------------------------------------------------------- /kernelExample.R: -------------------------------------------------------------------------------- 1 | # modification of one of the RCUDA examples to use use double precision 2 | 3 | library(RCUDA) 4 | 5 | cat("Setting cuGetContext(TRUE)...\n") 6 | cuGetContext(TRUE) 7 | 8 | # compile the kernel into a form that RCUDA can load 9 | # system("nvcc --ptx -arch=compute_20 -code=sm_20,compute_20 -o calc_loglik.ptx calc_loglik.cu") 10 | ptx = nvcc(file = 'calc_loglik.cu', out = 'calc_loglik.ptx', 11 | target = "ptx", "-arch=compute_20", "-code=sm_20,compute_20") 12 | 13 | mod = loadModule(ptx) 14 | calc_loglik = mod$calc_loglik 15 | 16 | n = as.integer(134217728) 17 | 18 | set.seed(0) 19 | x = runif(n) 20 | mu = 0.3 21 | sigma = 1.5 22 | 23 | # setting grid and block dimensions 24 | threads_per_block <- as.integer(1024) 25 | block_dims <- c(threads_per_block, as.integer(1), as.integer(1)) 26 | grid_d <- as.integer(ceiling(sqrt(n/threads_per_block))) 27 | 28 | grid_dims <- c(grid_d, grid_d, as.integer(1)) 29 | 30 | cat("Grid size:\n") 31 | print(grid_dims) 32 | 33 | nthreads <- as.integer(prod(grid_dims)*prod(block_dims)) 34 | cat("Total number of threads to launch = ", nthreads, "\n") 35 | if (nthreads < n){ 36 | stop("Grid is not large enough...!") 37 | } 38 | 39 | cat("Running CUDA kernel...\n") 40 | 41 | # basic usage with manual transfer 42 | 
tTransferToGPU <- system.time({ 43 | dX = copyToDevice(x, strict = TRUE) 44 | cudaDeviceSynchronize() 45 | }) 46 | tCalc <- system.time({ 47 | .cuda(calc_loglik, dX, n, mu, sigma, gridDim = grid_dims, blockDim = block_dims, .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 48 | cudaDeviceSynchronize() 49 | }) 50 | tTransferFromGPU <- system.time({ 51 | out = copyFromDevice(obj = dX, nels = dX@nels, type = "double") 52 | cudaDeviceSynchronize() 53 | }) 54 | 55 | cat("Input values: ", x[1:3], "\n") 56 | cat("Output values: ", out[1:3], "\n") 57 | 58 | # alternative that bundles transfer and computation all in one, with 59 | # implicit transfer done by RCUDA behind the scenes 60 | tFull <- system.time({ 61 | out <- .cuda(calc_loglik, "x"=x, n, mu, sigma, gridDim=grid_dims, blockDim=block_dims, outputs="x", .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 62 | cudaDeviceSynchronize() 63 | }) 64 | 65 | 66 | cat("Output values (implicit transfer): ", out[1:3], "\n") 67 | 68 | # having RCUDA determine gridding - not working for some reason 69 | ## tCalc_gridby <- system.time({ 70 | ## .cuda(calc_loglik, dX, n, mu, sigma, gridBy = nthreads, .numericAsDouble = getOption("CUDA.useDouble", TRUE)) 71 | ## cudaDeviceSynchronize() 72 | ## }) 73 | 74 | 75 | 76 | tCalc_R <- system.time({ 77 | out <- dnorm(x, mu, sigma) 78 | }) 79 | 80 | cat("Output values (CPU with R): ", out[1:3], "\n") 81 | 82 | cat("Transfer to GPU time: ", tTransferToGPU[3], "\n") 83 | cat("Calculation time (GPU): ", tCalc[3], "\n") 84 | cat("Transfer from GPU time: ", tTransferFromGPU[3], "\n") 85 | cat("Calculation time (CPU): ", tCalc_R[3], "\n") 86 | cat("Combined calculation/transfer via .cuda time (GPU): ", tFull[3], "\n") 87 | #cat("Calculation time (GPU with gridBy): ", tCalc_gridBy[3], "\n") 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /kernelExample.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define SQRT_TWO_PI 2.506628274631000 8 | #define BLOCK_D1 1024 9 | #define BLOCK_D2 1 10 | #define BLOCK_D3 1 11 | 12 | // Note: Needs compute capability >= 2.0 for calculation with doubles, so compile with: 13 | // nvcc kernelExample.cu -arch=compute_20 -code=sm_20,compute_20 -o kernelExample 14 | // -use_fast_math doesn't seem to have any effect on speed 15 | 16 | // CUDA kernel: 17 | __global__ void calc_loglik(double* vals, int n, double mu, double sigma) { 18 | // note that this assumes no third dimension to the grid 19 | // id of the block 20 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 21 | // size of each block (within grid of blocks) 22 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 23 | // id of thread in a given block 24 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 25 | // assign overall id/index of the thread 26 | int idx = myblock * blocksize + subthread; 27 | 28 | if(idx < n) { 29 | double std = (vals[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } 33 | } 34 | 35 | // CPU analog for speed comparison 36 | int calc_loglik_cpu(double* vals, int n, double mu, double sigma) { 37 | double std, e; 38 | for(int idx = 0; idx < n; idx++) { 39 | std = (vals[idx] - mu)/sigma; 40 | e = exp( - 0.5 * std * std); 41 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 42 | } 43 | return 0; 44 | } 45 | 46 | 47 | /* 
--------------------------- host code ------------------------------*/ 48 | void fill( double *p, int n ) { 49 | int i; 50 | srand48(0); 51 | for( i = 0; i < n; i++ ) 52 | p[i] = 2*drand48()-1; 53 | } 54 | 55 | double read_timer() { 56 | struct timeval end; 57 | gettimeofday( &end, NULL ); 58 | return end.tv_sec+1.e-6*end.tv_usec; 59 | } 60 | 61 | int main (int argc, char *argv[]) { 62 | double* cpu_vals; 63 | double* gpu_vals; 64 | int n; 65 | cudaError_t cudaStat; 66 | 67 | 68 | printf("====================================================\n"); 69 | for( n = 32768; n <= 134217728; n*=8 ) { 70 | cpu_vals = (double*) malloc( sizeof(double)*n ); 71 | cudaStat = cudaMalloc(&gpu_vals, sizeof(double)*n); 72 | if(cudaStat != cudaSuccess) { 73 | printf ("device memory allocation failed"); 74 | return EXIT_FAILURE; 75 | } 76 | 77 | // fixed block dimensions (1024x1x1 threads) 78 | const dim3 blockSize(BLOCK_D1, BLOCK_D2, BLOCK_D3); 79 | 80 | // determine number of blocks we need for a given problem size 81 | int tmp = ceil(pow(n/BLOCK_D1, 0.5)); 82 | printf("Grid dimension is %i x %i\n", tmp, tmp); 83 | dim3 gridSize(tmp, tmp, 1); 84 | 85 | int nthreads = BLOCK_D1*BLOCK_D2*BLOCK_D3*tmp*tmp; 86 | if (nthreads < n){ 87 | printf("\n============ NOT ENOUGH THREADS TO COVER n=%d ===============\n\n",n); 88 | } else { 89 | printf("Launching %d threads (n=%d)\n", nthreads, n); 90 | } 91 | 92 | double mu = 0.0; 93 | double sigma = 1.0; 94 | 95 | // simulate 'data' 96 | fill(cpu_vals, n); 97 | printf("Input values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 98 | 99 | cudaDeviceSynchronize(); 100 | double tInit = read_timer(); 101 | 102 | // copy input data to the GPU 103 | cudaStat = cudaMemcpy(gpu_vals, cpu_vals, n*sizeof(double), cudaMemcpyHostToDevice); 104 | printf("Memory Copy from Host to Device "); 105 | if (cudaStat){ 106 | printf("failed.\n"); 107 | } else { 108 | printf("successful.\n"); 109 | } 110 | cudaDeviceSynchronize(); 111 | double tTransferToGPU = read_timer(); 112 | 113 | // do the calculation 114 | calc_loglik<<>>(gpu_vals, n, mu, sigma); 115 | cudaDeviceSynchronize(); 116 | double tCalc = read_timer(); 117 | 118 | cudaStat = cudaMemcpy(cpu_vals, gpu_vals, n, cudaMemcpyDeviceToHost); 119 | printf("Memory Copy from Device to Host "); 120 | if (cudaStat){ 121 | printf("failed.\n"); 122 | } else { 123 | printf("successful.\n"); 124 | } 125 | cudaDeviceSynchronize(); 126 | double tTransferFromGPU = read_timer(); 127 | 128 | printf("Output values: %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 129 | 130 | // do calculation on CPU for comparison (unfair as this will only use one core) 131 | fill(cpu_vals, n); 132 | double tInit2 = read_timer(); 133 | calc_loglik_cpu(cpu_vals, n, mu, sigma); 134 | double tCalcCPU = read_timer(); 135 | 136 | printf("Output values (CPU): %f %f %f...\n", cpu_vals[0], cpu_vals[1], cpu_vals[2]); 137 | 138 | printf("Timing results for n = %d\n", n); 139 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit); 140 | printf("Calculation time (GPU): %f\n", tCalc - tTransferToGPU); 141 | printf("Calculation time (CPU): %f\n", tCalcCPU - tInit2); 142 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tCalc); 143 | 144 | printf("Freeing memory...\n"); 145 | printf("====================================================\n"); 146 | free(cpu_vals); 147 | cudaFree(gpu_vals); 148 | 149 | } 150 | printf("\n\nFinished.\n\n"); 151 | return 0; 152 | } 153 | 154 | 155 | 
-------------------------------------------------------------------------------- /kernelExample.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | import scipy as sp 5 | from scipy.stats import norm 6 | from pycuda.compiler import SourceModule 7 | import math 8 | 9 | # Here's the kernel, essentially identical to that used in the CUDA and RCUDA examples 10 | 11 | mod = SourceModule(""" 12 | #include 13 | #define SQRT_TWO_PI 2.506628274631000 14 | __global__ void calc_loglik(double *vals, double *x, int n, double mu, double sigma, int dbg) 15 | { 16 | // note that this assumes no third dimension to the grid 17 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 18 | // size of each block (within grid of blocks) 19 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 20 | // id of thread in a given block 21 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 22 | // assign overall id/index of the thread 23 | int idx = myblock * blocksize + subthread; 24 | 25 | if (idx < n) { 26 | if (dbg){ 27 | printf("thread idx: %04d\\t x[%d] = %f\\t (n=%d,mu=%f,sigma=%f)\\n",idx,idx,x[idx],n,mu,sigma); 28 | } 29 | double std = (x[idx] - mu)/sigma; 30 | double e = exp( - 0.5 * std * std); 31 | vals[idx] = e / ( sigma * SQRT_TWO_PI); 32 | } else { 33 | if (dbg){ 34 | printf("thread idx: %04d\\t (>=n=%d)\\n",idx,n); 35 | } 36 | } 37 | return; 38 | } 39 | """) 40 | 41 | calc_loglik = mod.get_function("calc_loglik") 42 | 43 | # Arguments must be numpy datatypes i.e., n = 1000 will not work! 44 | 45 | n = np.int32(134217728) 46 | 47 | # Threads per block and number of blocks: 48 | threads_per_block = int(1024) 49 | block_dims = (threads_per_block, 1, 1) 50 | grid_d = int(math.ceil(math.sqrt(n/threads_per_block))) 51 | grid_dims = (grid_d, grid_d, 1) 52 | 53 | 54 | print("Generating random normals...") 55 | x = np.random.normal(size = n) 56 | 57 | # Evaluate at N(0.3, 1.5) 58 | 59 | mu = np.float64(0.3) 60 | sigma = np.float64(1.5) 61 | dbg = False # True 62 | verbose = np.int32(dbg) 63 | 64 | # Allocate storage for the result: 65 | 66 | out = np.zeros_like(x) 67 | 68 | # Create two timers: 69 | start = drv.Event() 70 | end = drv.Event() 71 | 72 | # Launch the kernel 73 | print("Running GPU code...") 74 | start.record() 75 | 76 | calc_loglik(drv.Out(out), drv.In(x), n, mu, sigma, verbose, block= block_dims, grid = grid_dims) 77 | 78 | end.record() # end timing 79 | # calculate the run length 80 | end.synchronize() 81 | 82 | gpu_secs = start.time_till(end)*1e-3 83 | print("Time for calculation (GPU): %fs" % gpu_secs) 84 | 85 | # Scipy version: 86 | print("Running Scipy CPU code...") 87 | start.record() 88 | out2 = norm.pdf(x, loc = mu, scale = sigma) 89 | end.record() # end timing 90 | # calculate the run length 91 | end.synchronize() 92 | cpu_secs = start.time_till(end)*1e-3 93 | print("Time for calculation (CPU): %fs" % cpu_secs) 94 | 95 | print("Output from GPU: %f %f %f" % (out[0], out[1], out[2])) 96 | print("Output from CPU: %f %f %f" % (out2[0], out2[1], out2[2])) 97 | 98 | 99 | -------------------------------------------------------------------------------- /magmaExample.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include "cublas_v2.h" 8 | 9 | #include "magma.h" 10 | #include "magma_lapack.h" 11 | 12 | // compile as: 13 | // gcc 
magmaExample.c -O3 -DADD_ -fopenmp -DHAVE_CUBLAS -I/usr/local/cuda/include -I/usr/local/magma/include -L/usr/local/cuda/lib64 -L/usr/local/magma/lib -lmagma -llapack -lblas -lcublas -lcudart -o magmaExample 14 | 15 | 16 | double read_timer() { 17 | struct timeval end; 18 | gettimeofday( &end, NULL ); 19 | return end.tv_sec+1.e-6*end.tv_usec; 20 | } 21 | 22 | // BLAS/LAPACK functions for matrix multiply and Cholesky 23 | // not needed as these are in magma_dlapack.h 24 | // void dgemm_( char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int* ); 25 | // int dpotrf_(char* uplo, int* n, double* a, int* lda, int* info); 26 | 27 | void fillMatrix( double *p, int n ) { 28 | int i; 29 | srand48(0); 30 | for( i = 0; i < n; i++ ) 31 | p[i] = 2*drand48()-1; 32 | } 33 | 34 | 35 | int main( int argc, char **argv ) { 36 | printf("Starting\n"); 37 | int size; 38 | cudaError_t cudaStat; 39 | magma_int_t magmaStat; 40 | cublasStatus_t stat; 41 | cublasHandle_t handle; 42 | int it,i; 43 | 44 | cublasOperation_t N = 'N'; 45 | cublasOperation_t T = 'T'; 46 | char N2 = 'N'; 47 | char T2 = 'T'; 48 | 49 | double one = 1., zero=0.; 50 | char uplo = 'L'; 51 | int info; 52 | 53 | int err; double* A; double* B; 54 | magmaStat = magma_init(); 55 | 56 | int use_pinned; 57 | if(argc > 1) { 58 | use_pinned = atoi(argv[1]); 59 | } else use_pinned = 0; 60 | printf("Setting use_pinned to %d\n", use_pinned); 61 | 62 | for( size = 512; size <= 8192; size*=4 ) { 63 | 64 | if(use_pinned) { 65 | // allocate pinned memory on CPU 66 | err = magma_dmalloc_pinned( &A, size*size ); assert( err == 0 ); 67 | err = magma_dmalloc_pinned( &B, size*size ); assert( err == 0 ); 68 | } else { 69 | // allocate standard memory on CPU 70 | A = (double*) malloc( sizeof(double)*size*size ); 71 | B = (double*) malloc( sizeof(double)*size*size ); 72 | } 73 | 74 | cudaDeviceSynchronize(); 75 | double tInit = read_timer(); 76 | double *dA,*dB; 77 | // allocate memory on GPU 78 | magma_malloc( (void**) &dA, sizeof(double)*size*size ); 79 | magma_malloc( (void**) &dB, sizeof(double)*size*size ); 80 | 81 | cudaDeviceSynchronize(); 82 | double tAlloc = read_timer(); 83 | 84 | fillMatrix(B, size*size); 85 | 86 | 87 | cudaDeviceSynchronize(); 88 | double tInit2 = read_timer(); 89 | 90 | // transfer data to GPU 91 | magma_dsetmatrix( size, size, B, size, dB, size ); 92 | 93 | cudaDeviceSynchronize(); 94 | double tTransferToGPU = read_timer(); 95 | 96 | // matrix multiply 97 | magmablas_dgemm(MagmaNoTrans, MagmaTrans, size, size, size, one, dB, size, dB, size, zero, dA, size ); 98 | // magma_dgemm may be more general in terms of being able to call GPU or MIC 99 | 100 | cudaDeviceSynchronize(); 101 | double tMatMult = read_timer(); 102 | 103 | // Cholesky decomposition on GPU with GPU interface (called with object on GPU) 104 | magma_dpotrf_gpu(MagmaLower, size, dA, size, &info ); 105 | 106 | cudaDeviceSynchronize(); 107 | double tChol = read_timer(); 108 | 109 | // transfer data back to CPU 110 | magma_dgetmatrix( size, size, dA, size, A, size ); 111 | cudaDeviceSynchronize(); 112 | double tTransferFromGPU = read_timer(); 113 | 114 | // standard BLAS matrix multiply on CPU 115 | dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); 116 | 117 | cudaDeviceSynchronize(); 118 | double tMatMultBlas = read_timer(); 119 | 120 | // Cholesky decomposition on GPU with CPU interface (called with object on CPU) 121 | magma_dpotrf(MagmaLower, size, A, size, &info ); 122 | 123 | cudaDeviceSynchronize(); 
124 | double tCholCpuInterface = read_timer(); 125 | 126 | // recreate A = B * B (could just do a save and copy instead....) 127 | dgemm_( &N2, &T2, &size, &size, &size, &one, B, &size, B, &size, &zero, A, &size ); 128 | 129 | cudaDeviceSynchronize(); 130 | double tInit3 = read_timer(); 131 | 132 | // standard Lapack Cholesky decomposition on CPU 133 | dpotrf_(&uplo, &size, A, &size, &info); 134 | 135 | cudaDeviceSynchronize(); 136 | double tCholCpu= read_timer(); 137 | 138 | 139 | printf("====================================================\n"); 140 | printf("Timing results for n = %d\n", size); 141 | printf("GPU memory allocation time: %f\n", tAlloc - tInit); 142 | printf("Transfer to GPU time: %f\n", tTransferToGPU - tInit2); 143 | printf("Matrix multiply time (GPU): %f\n", tMatMult - tTransferToGPU); 144 | printf("Matrix multiply time (BLAS): %f\n", tMatMultBlas - tTransferToGPU); 145 | printf("Cholesky factorization time (GPU w/ GPU interface): %f\n", tChol - tMatMult); 146 | printf("Cholesky factorization time (GPU w/ CPU interface): %f\n", tCholCpuInterface - tMatMultBlas); 147 | printf("Cholesky factorization time (LAPACK): %f\n", tCholCpu - tInit3); 148 | printf("Transfer from GPU time: %f\n", tTransferFromGPU - tChol); 149 | 150 | if(use_pinned) { 151 | magma_free_pinned(A); 152 | magma_free_pinned(B); 153 | } else { 154 | free(A); 155 | free(B); 156 | } 157 | magma_free(dA); 158 | magma_free(dB); 159 | 160 | } 161 | return EXIT_SUCCESS; 162 | } 163 | -------------------------------------------------------------------------------- /random.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | extern "C" 8 | { 9 | 10 | __global__ void setup_kernel(curandState *state, int seed, int n, int verbose) 11 | { 12 | // Usual block/thread indexing... 13 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 14 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 15 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 16 | int idx = myblock * blocksize + subthread; 17 | if (verbose){ 18 | printf("Setting up RNG in thread %d (n=%d)...\n",idx,n); 19 | } 20 | curand_init(seed, idx, 0, &state[idx]); 21 | return; 22 | } 23 | 24 | __global__ void rnorm_basic_kernel(curandState *state, double *vals, int n, double mu, double sigma) 25 | { 26 | // Usual block/thread indexing... 27 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 28 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 29 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 30 | int idx = myblock * blocksize + subthread; 31 | if (idx < n) { 32 | vals[idx] = mu + sigma * curand_normal_double(&state[idx]); 33 | } 34 | return; 35 | } 36 | 37 | 38 | __global__ void rnorm_kernel(curandState *state, double *vals, int n, double mu, double sigma, int numSamples) 39 | { 40 | // Usual block/thread indexing... 
41 | int myblock = blockIdx.x + blockIdx.y * gridDim.x; 42 | int blocksize = blockDim.x * blockDim.y * blockDim.z; 43 | int subthread = threadIdx.z*(blockDim.x * blockDim.y) + threadIdx.y*blockDim.x + threadIdx.x; 44 | int idx = myblock * blocksize + subthread; 45 | int k; 46 | int startIdx = idx*numSamples; 47 | for(k = 0; k < numSamples; k++) { 48 | if(startIdx + k < n) 49 | vals[startIdx + k] = mu + sigma * curand_normal_double(&state[idx]); 50 | } 51 | return; 52 | } 53 | 54 | } // END extern 55 | 56 | -------------------------------------------------------------------------------- /savio-job-template.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=test-gpu 3 | #SBATCH --partition=savio2_gpu 4 | #SBATCH --account=ac_scsguest 5 | #SBATCH --nodes=1 6 | #SBATCH --time=02:30:00 7 | #SBATCH --mail-user=paciorek@stat.berkeley.edu 8 | 9 | module load cuda 10 | module unload intel # do this to avoid compilation issues 11 | 12 | # insert code here to run your computations 13 | -------------------------------------------------------------------------------- /savio.sh: -------------------------------------------------------------------------------- 1 | pledge # use 1x2 2 | ssh paciorek@hpc.brc.berkeley.edu 3 | 4 | 5 | module unload intel 6 | module load cuda 7 | 8 | sacctmgr -p show associations user=paciorek 9 | 10 | srun -A ac_scsguest -p savio2_gpu -N 1 -t 30:0 --pty bash 11 | srun -u -A ac_scsguest -p savio2_gpu -N 1 -t 30:0 bash -i 12 | 13 | alias gtop=\"nvidia-smi -q -d UTILIZATION -l 1\" 14 | alias gmem=\"nvidia-smi -q -d MEMORY -l 1\" 15 | 16 | nvcc ${CUDA_DIR}/samples/1_Utilities/deviceQuery/deviceQuery.cpp -I${CUDA_DIR}/include -I${CUDA_DIR}/samples/common/inc -o deviceQuery 17 | 18 | --------------------------------------------------------------------------------