├── Chapter06 ├── 6.1. Understanding How CUDA-C-C++ Works with A Simple Example │ ├── README.md │ ├── cpu_multiply.cpp │ ├── gpu_multiply.cu │ └── gpu_multiply_revised.cu ├── 6.4. How Computing in PyCUDA Works on Python │ ├── README.md │ ├── conventional_python_code.py │ ├── pycuda_based_python_code.py │ └── testing_pycuda_without_any_cuda_code.py └── 6.6. Writing your first PyCUDA Programs to Compute a General Purpose Solution │ ├── README.md │ ├── hello_world_from_nvidia_gpu.py │ └── mathematical_formula.py ├── Chapter07 ├── 7.1. Understanding How ROCm-C-C++ Works with HIPify, HIP and OpenCL │ ├── README.md │ ├── gpu_multiply_kernel.cl │ ├── gpu_multiply_ported_to_hip.cpp │ ├── gpu_multiply_revised_partially_ported_to_hip.cpp │ └── main.cpp ├── 7.4. How Computing in PyOpenCL Works on Python │ ├── README.md │ ├── pyopencl_based_python_code.py │ ├── pyopencl_version_of_pycuda_example.py │ └── testing_pyopencl_without_opencl_code.py └── 7.6. Writing your first PyOpenCL Programs to Compute a General Purpose Solution │ ├── README.md │ ├── pyopencl_shortened_kernel_with_python.py │ ├── pyopencl_simplified_opencl_with_python.py │ └── simultaneous_division.py ├── Chapter08 ├── 8.7. Writing your first CuPy and Numba enabled 'Accelerate'd Programs to Compute GPGPU Solutions │ ├── CuPy │ │ ├── README.md │ │ ├── raw_cuda_kernel_on_cupy.py │ │ └── replacing_numpy_code_with_cupy.py │ ├── Numba │ │ ├── README.md │ │ ├── numba_cuda_kernel_with_numpy.py │ │ ├── using_jit_with_numba.py │ │ └── using_numba_with_vectorized_cuda_target.py │ └── README.md └── 8.8. Interoperability between CuPy and Numba within a Single Python Program │ ├── README.md │ └── numba_cuda_kernel_with_cupy_interoperability.py ├── Chapter10 └── 10.10. Writing your first GPU accelerated machine learning programs │ ├── README.md │ ├── pytorch_cifar-10.py │ └── tensorflow_fashion-mnist.py ├── Chapter11 └── 11.4. Testing an Example from the Deepchem Repository │ ├── DeepChem_BACE.ipynb │ ├── README.md │ └── deepchem_predicting_a_ki_ligand_for_a_protein.py ├── LICENSE └── README.md /Chapter06/6.1. Understanding How CUDA-C-C++ Works with A Simple Example/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter06/6.1. 
Understanding How CUDA-C-C++ Works with A Simple Example/cpu_multiply.cpp: -------------------------------------------------------------------------------- 1 | #include //Defining standard input/output stream objects 2 | #include //For using predefined math functions 3 | #define N 500000000 //500 Million elements 4 | 5 | clock_t begin, end; 6 | float cpu_time_used; 7 | 8 | // This is a function to multiply two array elements and also update the results on the second array 9 | void multiply(int n, double *p, double *q) 10 | { 11 | for (int i = 0; i < n; i++) 12 | q[i] = p[i] * q[i]; 13 | } 14 | 15 | int main(void) 16 | { 17 | double *p = new double[N]; 18 | double *q = new double[N]; 19 | 20 | // initialize arrays p and q on the host 21 | for (int i = 0; i < N; i++) { 22 | p[i] = 24.0; 23 | q[i] = 12.0; 24 | } 25 | 26 | // Run function on 500 Million elements on the CPU 27 | begin = clock(); 28 | multiply(N, p, q); 29 | end = clock(); 30 | cpu_time_used = ((double) (end - begin)) / CLOCKS_PER_SEC; 31 | 32 | // Verifying all values to be 288.0 33 | // fabs(q[i]-288) (absolute value) should be 0 34 | double maxError = 0.0; 35 | for (int i = 0; i < N; i++){ 36 | maxError = fmax(maxError, fabs(q[i]-288.0)); 37 | } 38 | std::cout << "Multiply function CPU execution time: " << cpu_time_used << " second(s)" << std::endl; 39 | std::cout << "Max error: " << maxError << std::endl; 40 | 41 | // Free memory 42 | delete [] p; 43 | delete [] q; 44 | 45 | return 0; 46 | } 47 | 48 | -------------------------------------------------------------------------------- /Chapter06/6.1. Understanding How CUDA-C-C++ Works with A Simple Example/gpu_multiply.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define N 500000000 //500 Million Elements 4 | #define THREADS_PER_BLOCK 1024 5 | 6 | // GPU kernel function to multiply two array elements and also update the results on the second array 7 | __global__ void multiply(double *p, double *q, unsigned long n){ 8 | int index = threadIdx.x + blockIdx.x * blockDim.x; 9 | if (index < n) 10 | q[index] = p[index] * q[index]; 11 | } 12 | 13 | 14 | int main(void) { 15 | 16 | double *p, *q; // host copies of p, q 17 | double *gpu_p, *gpu_q; // device copies of p, q 18 | unsigned long size = N * sizeof(unsigned long); // we need space for N unsigned long integers 19 | unsigned long i; 20 | // Allocate GPU/device copies of gpu_p, gpu_q 21 | cudaMalloc((void**)&gpu_p, size); 22 | cudaMalloc((void**)&gpu_q, size); 23 | 24 | 25 | // Allocate CPU/host copies of p, q 26 | p = (double *)malloc(size); 27 | q = (double *)malloc(size); 28 | 29 | 30 | // Setup input values 31 | for (i = 0; i < N - 1; ++i) 32 | { 33 | p[i] = 24.0; 34 | q[i] = 12.0; 35 | } 36 | 37 | // Copy inputs to device 38 | cudaMemcpy(gpu_p, p, size, cudaMemcpyHostToDevice); 39 | cudaMemcpy(gpu_q, q, size, cudaMemcpyHostToDevice); 40 | 41 | //INITIALIZE CUDA EVENTS 42 | cudaEvent_t start, stop; 43 | float elapsedTime; 44 | 45 | //CREATING EVENTS 46 | cudaEventCreate(&start); 47 | cudaEventCreate(&stop); 48 | cudaEventRecord(start, 0); 49 | 50 | //CUDA KERNEL STUFF HERE... 
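// A brief aside on the launch configuration that follows (informal, using only the constants
// already defined in this file): the grid size is computed with ceil-division so that every one
// of the N elements gets its own thread. As a worked example,
// (500000000 + 1024 - 1) / 1024 = 488282 blocks, and 488282 * 1024 = 500000768 threads in total,
// so 768 surplus threads are launched; that surplus is exactly why the kernel guards with
// if (index < n). Separately, the bare #include at the top of this listing appears to have lost
// its header name; with both std::cout and printf in use, <iostream> and <cstdio> would be the
// usual choices.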
51 | // Launch multiply() kernel on GPU with N threads 52 | multiply << <(N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >> >(gpu_p, gpu_q, N); 53 | 54 | //FINISH RECORDING 55 | cudaEventRecord(stop, 0); 56 | cudaEventSynchronize(stop); 57 | 58 | //CALCULATE ELAPSED TIME 59 | cudaEventElapsedTime(&elapsedTime, start, stop); 60 | 61 | //DISPLAY COMPUTATION TIME 62 | 63 | cudaDeviceProp prop; 64 | int count; 65 | 66 | cudaGetDeviceCount(&count); 67 | 68 | for (int igtx = 0; igtx < count; igtx++) { 69 | cudaGetDeviceProperties(&prop, igtx); 70 | printf("\nGPU Device used for computation: %s\n", prop.name); 71 | printf("\nMultiplication on GPU computed in: %f milliseconds", elapsedTime); 72 | } 73 | 74 | // Copy device result back to host copy of q 75 | cudaMemcpy(q, gpu_q, size, cudaMemcpyDeviceToHost); 76 | 77 | // Verifying all values to be 288.0 78 | // fabs(q[i]-288) (absolute value) should be 0 79 | double maxError = 0.0; 80 | for (int i = 0; i < N-1; ++i){ 81 | maxError = fmax(maxError, fabs(q[i]-288.0)); 82 | } 83 | std::cout << "\nMax error: " << maxError << std::endl; 84 | 85 | // Clean CPU memory allocations 86 | free(p); free(q); 87 | 88 | // Clean GPU memory allocations 89 | cudaFree(gpu_p); 90 | cudaFree(gpu_q); 91 | 92 | return 0; 93 | } 94 | -------------------------------------------------------------------------------- /Chapter06/6.1. Understanding How CUDA-C-C++ Works with A Simple Example/gpu_multiply_revised.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define N 500000000 //500 Million Elements 4 | #define THREADS_PER_BLOCK 1024 5 | 6 | // GPU kernel function to multiply two array elements and also update the results on the second array 7 | __global__ void multiply(double *p, double *q, unsigned long n){ 8 | int index = threadIdx.x + blockIdx.x * blockDim.x; 9 | if (index < n) 10 | q[index] = p[index] * q[index]; 11 | } 12 | 13 | 14 | int main(void) { 15 | 16 | double *p, *q; // host copies of p, q 17 | //double *gpu_p, *gpu_q; // device copies of p, q 18 | unsigned long size = N * sizeof(unsigned long); // we need space for N unsigned long integers 19 | unsigned long i; 20 | 21 | /* 22 | // Allocate GPU/device copies of gpu_p, gpu_q 23 | cudaMalloc((void**)&gpu_p, size); 24 | cudaMalloc((void**)&gpu_q, size); 25 | 26 | 27 | // Allocate CPU/host copies of p, q 28 | p = (double *)malloc(size); 29 | q = (double *)malloc(size); 30 | */ 31 | 32 | //Unified Memory Allocation for CPU and GPU 33 | cudaMallocManaged((void**)&p, size); 34 | cudaMallocManaged((void**)&q, size); 35 | 36 | 37 | // Setup input values 38 | for (i = 0; i < N - 1; ++i) 39 | { 40 | p[i] = 24.0; 41 | q[i] = 12.0; 42 | } 43 | 44 | /* 45 | // Copy inputs to device 46 | cudaMemcpy(gpu_p, p, size, cudaMemcpyHostToDevice); 47 | cudaMemcpy(gpu_q, q, size, cudaMemcpyHostToDevice); 48 | */ 49 | 50 | //INITIALIZE CUDA EVENTS 51 | cudaEvent_t start, stop; 52 | float elapsedTime; 53 | 54 | //CREATING EVENTS 55 | cudaEventCreate(&start); 56 | cudaEventCreate(&stop); 57 | cudaEventRecord(start, 0); 58 | cudaMemPrefetchAsync(p, N * sizeof(double), 0); 59 | cudaMemPrefetchAsync(q, N * sizeof(double), 0); 60 | cudaDeviceSynchronize(); 61 | 62 | //CUDA KERNEL STUFF HERE... 
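// An informal note on this unified-memory variant (assuming a Pascal-or-newer GPU where
// managed-memory prefetching is supported): cudaMallocManaged() returns pointers that are valid
// on both host and device, which is why the explicit cudaMemcpy() calls from the first version
// are commented out. The cudaMemPrefetchAsync(ptr, N * sizeof(double), 0) calls above are only
// a hint to migrate the managed pages to device 0 before the launch, so the timed region below
// does not also pay for on-demand page migration.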
63 | // Launch multiply() kernel on GPU with N threads 64 | multiply <<<(N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>(p, q, N); 65 | 66 | //FINISH RECORDING 67 | cudaEventRecord(stop, 0); 68 | cudaDeviceSynchronize(); 69 | cudaEventSynchronize(stop); 70 | 71 | //CALCULATE ELAPSED TIME 72 | cudaEventElapsedTime(&elapsedTime, start, stop); 73 | 74 | //DISPLAY COMPUTATION TIME 75 | 76 | cudaDeviceProp prop; 77 | int count; 78 | 79 | cudaGetDeviceCount(&count); 80 | 81 | for (int igtx = 0; igtx < count; igtx++) { 82 | cudaGetDeviceProperties(&prop, igtx); 83 | printf("\nGPU Device used for computation: %s\n", prop.name); 84 | printf("\nMultiplication on GPU computed in: %f milliseconds", elapsedTime); 85 | } 86 | 87 | 88 | /* 89 | // Copy device result back to host copy of q 90 | cudaMemcpy(q, gpu_q, size, cudaMemcpyDeviceToHost); 91 | */ 92 | 93 | // Verifying all values to be 288.0 94 | // fabs(q[i]-288) (absolute value) should be 0 95 | double maxError = 0.0; 96 | for (int i = 0; i < N-1; ++i){ 97 | maxError = fmax(maxError, fabs(q[i]-288.0)); 98 | } 99 | std::cout << "\nMax error: " << maxError << std::endl; 100 | 101 | // Clean unified memory allocations 102 | cudaFree(p); 103 | cudaFree(q); 104 | 105 | return 0; 106 | } 107 | -------------------------------------------------------------------------------- /Chapter06/6.4. How Computing in PyCUDA Works on Python/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter06/6.4. How Computing in PyCUDA Works on Python/conventional_python_code.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from timeit import default_timer as timer 3 | 4 | N = 500000000 #500 Million Elements 5 | 6 | # CPU Function to multiply two array elements and also update the results on the second array 7 | def multiply(p_cpu, q_cpu): 8 | for i in range(N): 9 | q_cpu[i] = p_cpu[i] * q_cpu[i] 10 | 11 | def main(): 12 | #Initialize the two arrays of double data type all with 0.0 values upto N 13 | p = np.zeros(N, dtype=np.double) 14 | q = np.zeros(N, dtype=np.double) 15 | #Update all the elements in the two arrays with 23.0 and 12.0 respectively 16 | p.fill(23.0) 17 | q.fill(12.0) 18 | 19 | #Time the CPU Function 20 | begin = timer() 21 | multiply(p, q) 22 | numpy_cpu_time = timer() - begin 23 | 24 | #Report CPU Computation Time 25 | print("CPU function took %f seconds." % numpy_cpu_time) 26 | 27 | #Choose a random integer index value between 0 to N 28 | random = np.random.randint(0, N) 29 | #Verify all values to be 276.0 for second array by random selection 30 | print("New value of second array element with random index", random, "is", q[random]) 31 | 32 | if __name__ == "__main__": 33 | main() -------------------------------------------------------------------------------- /Chapter06/6.4. 
How Computing in PyCUDA Works on Python/pycuda_based_python_code.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as cudadrv 3 | import numpy 4 | import pycuda.gpuarray as gpuarray 5 | from pycuda.elementwise import ElementwiseKernel 6 | 7 | #Here, we multiply the two values and update all the elements of the second array with the new product 8 | #Note that we pass C syntax into the ElementwiseKernel 9 | multiply = ElementwiseKernel( 10 | "double *a_gpu, double *b_gpu", 11 | "b_gpu[i] = a_gpu[i] * b_gpu[i]", 12 | "multiply") 13 | 14 | N = 500000000 #500 Million Elements 15 | 16 | a_gpu = gpuarray.to_gpu(numpy.zeros(N).astype(numpy.double)) 17 | b_gpu = gpuarray.to_gpu(numpy.zeros(N).astype(numpy.double)) 18 | 19 | a_gpu.fill(23.0) 20 | b_gpu.fill(12.0) 21 | 22 | begin = cudadrv.Event() 23 | end = cudadrv.Event() 24 | 25 | # Time the GPU function 26 | begin.record() 27 | multiply(a_gpu, b_gpu) 28 | end.record() 29 | end.synchronize() 30 | gpu_multiply_time = begin.time_till(end) 31 | 32 | random = numpy.random.randint(0,N) 33 | 34 | #Randomly choose index from second array to confirm changes to second array 35 | print("New value of second array element with random index", random, "is", b_gpu[random]) 36 | 37 | # Report GPU Function time 38 | print("GPU function took %f milliseconds." % gpu_multiply_time) -------------------------------------------------------------------------------- /Chapter06/6.4. How Computing in PyCUDA Works on Python/testing_pycuda_without_any_cuda_code.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.gpuarray as gpuarray 3 | import numpy as np 4 | import pycuda.driver as cudrv 5 | 6 | N = 500000000 7 | 8 | begin = cudrv.Event() 9 | end = cudrv.Event() 10 | 11 | begin.record() 12 | a_gpu = gpuarray.to_gpu(np.zeros(N).astype(np.double)) 13 | a_gpu.fill(23.0) 14 | b_gpu=a_gpu*12.0 15 | end.record() 16 | end.synchronize() 17 | pycuda_gpu_time = begin.time_till(end) 18 | 19 | random = np.random.randint(0,N) 20 | 21 | print("Choosing second array element with index", random, "at random:", b_gpu[random]) 22 | 23 | print("\nGPU took %f milliseconds." % pycuda_gpu_time) 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /Chapter06/6.6. Writing your first PyCUDA Programs to Compute a General Purpose Solution/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter06/6.6. Writing your first PyCUDA Programs to Compute a General Purpose Solution/hello_world_from_nvidia_gpu.py: -------------------------------------------------------------------------------- 1 | #Auto initialization for CUDA 2 | import pycuda.autoinit 3 | #Importing SourceModule from the PyCUDA Compiler module 4 | from pycuda.compiler import SourceModule 5 | 6 | #For a single thread on a single block 7 | N=1 8 | #Setting threads per block. Here, it is set to 1 9 | Threads_per_block = (int(1)) 10 | #Setting blocks per grid, also calculated as 1. 11 | Blocks_per_grid = (int((N + Threads_per_block - 1) / Threads_per_block)) 12 | 13 | #Printing from the GPU device itself! 
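# A brief hedged aside: SourceModule hands the CUDA C string below to nvcc at run time, so a
# working CUDA toolkit must be on the PATH, and mod.get_function() returns a callable wrapper
# around the compiled __global__ kernel. Output from the device-side printf() is buffered on the
# GPU and typically only shows up once the context is synchronized or torn down, which
# pycuda.autoinit arranges at interpreter exit.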
14 | mod = SourceModule(""" 15 | __global__ void hello_from_nvidia_gpu() 16 | { 17 | printf("Hello World from NVIDIA GPU!"); 18 | } 19 | """) 20 | #Return the Function name in the get_function module 21 | hello_from_nvidia_gpu = mod.get_function("hello_from_nvidia_gpu") 22 | 23 | #Invoking the NVIDIA GPU Kernel 24 | hello_from_nvidia_gpu(block=(Threads_per_block, 1, 1), grid=(Blocks_per_grid, 1)) -------------------------------------------------------------------------------- /Chapter06/6.6. Writing your first PyCUDA Programs to Compute a General Purpose Solution/mathematical_formula.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import numpy 3 | import pycuda.gpuarray as gpuarray 4 | import pycuda.driver as drv 5 | from pycuda.elementwise import ElementwiseKernel 6 | 7 | #Here, we compute a mathematical formula 8 | #Note that we pass C/C++ syntax into the ElementwiseKernel 9 | compute = ElementwiseKernel( 10 | "double *a_gpu, double *b_gpu, double *c_gpu", 11 | "c_gpu[i] = (cos(a_gpu[i])*cos(a_gpu[i])) * (b_gpu[i]* b_gpu[i])+sin(a_gpu[i]*b_gpu[i])", 12 | "compute") 13 | 14 | N = 500000000 15 | #Threads_per_block = (int(1024)) 16 | #Blocks_per_grid = (int((N + Threads_per_block - 1) / Threads_per_block)) 17 | 18 | a_gpu = gpuarray.to_gpu(numpy.zeros(N).astype(numpy.double)) 19 | b_gpu = gpuarray.to_gpu(numpy.zeros(N).astype(numpy.double)) 20 | c_gpu = gpuarray.to_gpu(numpy.zeros(N).astype(numpy.double)) 21 | 22 | a_gpu.fill(24.0) 23 | b_gpu.fill(12.0) 24 | 25 | start = drv.Event() 26 | end = drv.Event() 27 | 28 | # Time the GPU function 29 | start.record() 30 | compute(a_gpu, b_gpu, c_gpu) 31 | end.record() 32 | end.synchronize() 33 | gpu_compute_time = start.time_till(end) 34 | 35 | random = numpy.random.randint(0,N) 36 | 37 | #Randomly choose index from second array to confirm changes to second array 38 | print("New value of second array element with random index ", random, "is ", c_gpu[random]) 39 | 40 | # Report times 41 | print("GPU function took %f milliseconds." % gpu_compute_time) -------------------------------------------------------------------------------- /Chapter07/7.1. Understanding How ROCm-C-C++ Works with HIPify, HIP and OpenCL/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter07/7.1. Understanding How ROCm-C-C++ Works with HIPify, HIP and OpenCL/gpu_multiply_kernel.cl: -------------------------------------------------------------------------------- 1 | // GPU kernel function to multiply two array elements and also update the results on a third array 2 | __kernel void multiply(__global double *p, __global double *q, __global double *r) { 3 | 4 | // Indexing the current element to process - equivalent to int index = threadIdx.x + blockIdx.x * blockDim.x in CUDA 5 | int index = get_global_id(0); 6 | 7 | // Simultaneous multiplication within this OpenCL kernel 8 | r[index] = p[index] * q[index]; 9 | } 10 | -------------------------------------------------------------------------------- /Chapter07/7.1. 
Understanding How ROCm-C-C++ Works with HIPify, HIP and OpenCL/gpu_multiply_ported_to_hip.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #define N 500000000 //500 Million Elements 5 | #define THREADS_PER_BLOCK 1024 6 | 7 | // GPU kernel function to multiply two array elements and also update the results on the second array 8 | __global__ void multiply(double *p, double *q, unsigned long n){ 9 | int index = threadIdx.x + blockIdx.x * blockDim.x; 10 | //int index = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; 11 | if (index < n) 12 | q[index] = p[index] * q[index]; 13 | } 14 | 15 | 16 | int main(void) { 17 | 18 | double *p, *q; // host copies of p, q 19 | double *gpu_p, *gpu_q; // device copies of p, q 20 | unsigned long size = N * sizeof(unsigned long); // we need space for N unsigned long integers 21 | unsigned long i; 22 | // Allocate GPU/device copies of gpu_p, gpu_q 23 | hipMalloc((void**)&gpu_p, size); 24 | hipMalloc((void**)&gpu_q, size); 25 | 26 | 27 | // Allocate CPU/host copies of p, q 28 | p = (double *)malloc(size); 29 | q = (double *)malloc(size); 30 | 31 | 32 | // Setup input values 33 | for (i = 0; i < N - 1; ++i) 34 | { 35 | p[i] = 24.0; 36 | q[i] = 12.0; 37 | } 38 | 39 | // Copy inputs to device 40 | hipMemcpy(gpu_p, p, size, hipMemcpyHostToDevice); 41 | hipMemcpy(gpu_q, q, size, hipMemcpyHostToDevice); 42 | 43 | //INITIALIZE HIP EVENTS 44 | hipEvent_t start, stop; 45 | float elapsedTime; 46 | 47 | //CREATING EVENTS 48 | hipEventCreate(&start); 49 | hipEventCreate(&stop); 50 | hipEventRecord(start, 0); 51 | 52 | //HIP KERNEL STUFF HERE... 53 | // Launch multiply() kernel on GPU with N threads 54 | hipLaunchKernelGGL(multiply, dim3((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK), dim3(THREADS_PER_BLOCK), 0, 0, gpu_p, gpu_q, N); 55 | 56 | //FINISH RECORDING 57 | hipEventRecord(stop, 0); 58 | hipEventSynchronize(stop); 59 | 60 | //CALCULATE ELAPSED TIME 61 | hipEventElapsedTime(&elapsedTime, start, stop); 62 | 63 | //DISPLAY COMPUTATION TIME 64 | 65 | hipDeviceProp_t prop; 66 | int count; 67 | 68 | hipGetDeviceCount(&count); 69 | 70 | for (int igtx = 0; igtx < count; igtx++) { 71 | hipGetDeviceProperties(&prop, igtx); 72 | printf("\nGPU Device used for computation: %s\n", prop.name); 73 | printf("\nMultiplication on GPU computed in: %f milliseconds", elapsedTime); 74 | } 75 | 76 | // Copy device result back to host copy of q 77 | hipMemcpy(q, gpu_q, size, hipMemcpyDeviceToHost); 78 | 79 | // Verifying all values to be 288.0 80 | // fabs(q[i]-288) (absolute value) should be 0 81 | double maxError = 0.0; 82 | for (int i = 0; i < N-1; ++i){ 83 | maxError = fmax(maxError, fabs(q[i]-288.0)); 84 | } 85 | std::cout << "\nMax error: " << maxError << std::endl; 86 | 87 | // Clean CPU memory allocations 88 | free(p); free(q); 89 | 90 | // Clean GPU memory allocations 91 | hipFree(gpu_p); 92 | hipFree(gpu_q); 93 | 94 | return 0; 95 | } 96 | -------------------------------------------------------------------------------- /Chapter07/7.1. 
Understanding How ROCm-C-C++ Works with HIPify, HIP and OpenCL/gpu_multiply_revised_partially_ported_to_hip.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #define N 500000000 //500 Million Elements 6 | #define THREADS_PER_BLOCK 1024 7 | 8 | // GPU kernel function to multiply two array elements and also update the results on the second array 9 | __global__ void multiply(double *p, double *q, unsigned long n){ 10 | int index = threadIdx.x + blockIdx.x * blockDim.x; 11 | if (index < n) 12 | q[index] = p[index] * q[index]; 13 | } 14 | 15 | 16 | int main(void) { 17 | 18 | double *p, *q; // host copies of p, q 19 | //double *gpu_p, *gpu_q; // device copies of p, q 20 | unsigned long size = N * sizeof(unsigned long); // we need space for N unsigned long integers 21 | unsigned long i; 22 | 23 | /* 24 | // Allocate GPU/device copies of gpu_p, gpu_q 25 | cudaMalloc((void**)&gpu_p, size); 26 | cudaMalloc((void**)&gpu_q, size); 27 | 28 | 29 | // Allocate CPU/host copies of p, q 30 | p = (double *)malloc(size); 31 | q = (double *)malloc(size); 32 | */ 33 | 34 | //Unified Memory Allocation for CPU and GPU 35 | cudaMallocManaged((void**)&p, size); 36 | cudaMallocManaged((void**)&q, size); 37 | 38 | 39 | // Setup input values 40 | for (i = 0; i < N - 1; ++i) 41 | { 42 | p[i] = 24.0; 43 | q[i] = 12.0; 44 | } 45 | 46 | /* 47 | // Copy inputs to device 48 | cudaMemcpy(gpu_p, p, size, cudaMemcpyHostToDevice); 49 | cudaMemcpy(gpu_q, q, size, cudaMemcpyHostToDevice); 50 | */ 51 | 52 | //INITIALIZE CUDA EVENTS 53 | hipEvent_t start, stop; 54 | float elapsedTime; 55 | 56 | //CREATING EVENTS 57 | hipEventCreate(&start); 58 | hipEventCreate(&stop); 59 | hipEventRecord(start, 0); 60 | cudaMemPrefetchAsync(p, N * sizeof(double), 0); 61 | cudaMemPrefetchAsync(q, N * sizeof(double), 0); 62 | hipDeviceSynchronize(); 63 | 64 | //CUDA KERNEL STUFF HERE... 65 | // Launch multiply() kernel on GPU with N threads 66 | hipLaunchKernelGGL(multiply, dim3((N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK), dim3(THREADS_PER_BLOCK), 0, 0, p, q, N); 67 | 68 | //FINISH RECORDING 69 | hipEventRecord(stop, 0); 70 | hipDeviceSynchronize(); 71 | hipEventSynchronize(stop); 72 | 73 | //CALCULATE ELAPSED TIME 74 | hipEventElapsedTime(&elapsedTime, start, stop); 75 | 76 | //DISPLAY COMPUTATION TIME 77 | 78 | hipDeviceProp_t prop; 79 | int count; 80 | 81 | hipGetDeviceCount(&count); 82 | 83 | for (int igtx = 0; igtx < count; igtx++) { 84 | hipGetDeviceProperties(&prop, igtx); 85 | printf("\nGPU Device used for computation: %s\n", prop.name); 86 | printf("\nMultiplication on GPU computed in: %f milliseconds", elapsedTime); 87 | } 88 | 89 | 90 | /* 91 | // Copy device result back to host copy of q 92 | cudaMemcpy(q, gpu_q, size, cudaMemcpyDeviceToHost); 93 | */ 94 | 95 | // Verifying all values to be 288.0 96 | // fabs(q[i]-288) (absolute value) should be 0 97 | double maxError = 0.0; 98 | for (int i = 0; i < N-1; ++i){ 99 | maxError = fmax(maxError, fabs(q[i]-288.0)); 100 | } 101 | std::cout << "\nMax error: " << maxError << std::endl; 102 | 103 | // Clean unified memory allocations 104 | hipFree(p); 105 | hipFree(q); 106 | 107 | return 0; 108 | } 109 | -------------------------------------------------------------------------------- /Chapter07/7.1. 
Understanding How ROCm-C-C++ Works with HIPify, HIP and OpenCL/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | using namespace std; 7 | 8 | #define N (0x15000000) //15 Million Elements 9 | 10 | int main(void) { 11 | // Creating two input arrays with two values on each 12 | double *p = (double*)malloc(sizeof(double)*N); 13 | double *q = (double*)malloc(sizeof(double)*N); 14 | for(int i = 0; i < N-1; ++i) { 15 | p[i] = 23; 16 | q[i] = 12; 17 | } 18 | 19 | // Loading source code from .cl file into the array cl_source 20 | FILE *opencl_file; 21 | char *cl_source; 22 | size_t source_size; 23 | 24 | opencl_file = fopen("gpu_multiply_kernel.cl", "r"); 25 | if (!opencl_file) { 26 | fprintf(stderr, "Failed to load opencl_kernel.\n"); 27 | exit(1); 28 | } 29 | cl_source = (char*)malloc(N); 30 | source_size = fread( cl_source, 1, N, opencl_file); 31 | fclose( opencl_file ); 32 | 33 | // Fetching platform and device information 34 | cl_platform_id platform_id = NULL; 35 | cl_device_id device_id = NULL; 36 | cl_uint get_num_devices; 37 | cl_uint get_num_platforms; 38 | cl_int for_kernel = clGetPlatformIDs(1, &platform_id, &get_num_platforms); 39 | for_kernel = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &get_num_devices); 40 | 41 | // Creating an OpenCL context 42 | cl_context opencl_context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &for_kernel); 43 | 44 | // Creating a command queue and enabling profiling to find computation time 45 | 46 | cl_command_queue_properties profiling_on[] {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0}; 47 | cl_command_queue command_queue = clCreateCommandQueueWithProperties(opencl_context, device_id, profiling_on, &for_kernel); 48 | 49 | // Creating memory buffers on the device for each array - similar to cudaMalloc on CUDA 50 | cl_mem p_gpu = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY, N * sizeof(double), NULL, &for_kernel); 51 | cl_mem q_gpu = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY, N * sizeof(double), NULL, &for_kernel); 52 | cl_mem r_gpu = clCreateBuffer(opencl_context, CL_MEM_WRITE_ONLY, N * sizeof(double), NULL, &for_kernel); 53 | 54 | // Transferring p and q to their respective memory buffers on the device for multiplication - similar to cudaMemcpyHostToDevice on CUDA 55 | for_kernel = clEnqueueWriteBuffer(command_queue, p_gpu, CL_TRUE, 0, N * sizeof(double), p, 0, NULL, NULL); 56 | for_kernel = clEnqueueWriteBuffer(command_queue, q_gpu, CL_TRUE, 0, N * sizeof(double), q, 0, NULL, NULL); 57 | 58 | // Creating an OpenCL program from the opencl_kernel source 59 | cl_program opencl_program = clCreateProgramWithSource(opencl_context, 1, (const char **)&cl_source, (const size_t *)&source_size, &for_kernel); 60 | 61 | // Building the OpenCL program 62 | for_kernel = clBuildProgram(opencl_program, 1, &device_id, NULL, NULL, NULL); 63 | 64 | // Creating the OpenCL kernel 65 | cl_kernel opencl_kernel = clCreateKernel(opencl_program, "multiply", &for_kernel); 66 | 67 | // Arguments of the OpenCL kernel for the device 68 | for_kernel = clSetKernelArg(opencl_kernel, 0, sizeof(cl_mem), (void *)&p_gpu); 69 | for_kernel = clSetKernelArg(opencl_kernel, 1, sizeof(cl_mem), (void *)&q_gpu); 70 | for_kernel = clSetKernelArg(opencl_kernel, 2, sizeof(cl_mem), (void *)&r_gpu); 71 | 72 | // Allocation of work items and groups - work items are similar to threads and groups are similar to blocks as on CUDA. 
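// An informal note on the mapping sketched above: global_item_size plays the role of the total
// thread count and local_item_size that of THREADS_PER_BLOCK in the CUDA versions, so the
// NDRange launch runs N / 1024 work-groups. N is 0x15000000, i.e. 352,321,536 elements in
// decimal, which is an exact multiple of 1024, so the .cl kernel needs no index guard here.
// Also, the bare #include directives at the top of this listing have lost their header names;
// <stdio.h>, <stdlib.h>, <iostream>, <math.h> and <CL/cl.h> would be the usual choices for
// this program.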
73 | size_t global_item_size = N; // Setting the global item size - similar to maximum number of threads' usage 74 | size_t local_item_size = 1024; // Dividing work items into groups of 1024 75 | 76 | // C++ and OpenCL allocations for displaying device name 77 | int pf_index, dev_index; 78 | char* device_name; 79 | size_t nameSize; 80 | cl_uint platform_count; 81 | cl_platform_id* platforms; 82 | cl_uint device_count; 83 | cl_device_id* devices; 84 | 85 | // Fetching all platforms to display device name 86 | clGetPlatformIDs(0, NULL, &platform_count); 87 | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platform_count); 88 | clGetPlatformIDs(platform_count, platforms, NULL); 89 | 90 | for (pf_index = 0; pf_index < platform_count; pf_index++) { 91 | 92 | // Fetching all OpenCL supported devices in the system 93 | clGetDeviceIDs(platforms[pf_index], CL_DEVICE_TYPE_ALL, 0, NULL, &device_count); 94 | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * device_count); 95 | clGetDeviceIDs(platforms[pf_index], CL_DEVICE_TYPE_ALL, device_count, devices, NULL); 96 | 97 | // Display critical attributes for each device (just one here in our case) 98 | for (dev_index = 0; dev_index < device_count; dev_index++) { 99 | 100 | // Display the device name 101 | clGetDeviceInfo(devices[dev_index], CL_DEVICE_NAME, 0, NULL, &nameSize); 102 | device_name = (char*) malloc(nameSize); 103 | clGetDeviceInfo(devices[dev_index], CL_DEVICE_NAME, nameSize, device_name, NULL); 104 | printf("Device used for computation: %s\n", device_name); 105 | free(device_name); 106 | 107 | } 108 | 109 | free(devices); 110 | 111 | } 112 | 113 | free(platforms); 114 | 115 | cl_event event; // Creating an event variable for timing 116 | 117 | for_kernel = clEnqueueNDRangeKernel(command_queue, opencl_kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &event); 118 | 119 | clWaitForEvents (1, &event); // Waiting for the event 120 | 121 | clFinish(command_queue); //Waiting until all commands have completed 122 | 123 | // Obtaining the start and end time for the event 124 | cl_ulong begin; 125 | cl_ulong end; 126 | 127 | clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(begin), &begin, NULL); 128 | clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); 129 | 130 | double duration = end - begin; 131 | 132 | // Printing the device computation time - note that on OpenCL, the default unit is nanoseconds in contrast to milliseconds on CUDA. 133 | printf("Multiplication on device computed in: %lf nanoseconds = %lf milliseconds\n", duration, duration/1000000); 134 | 135 | 136 | // Transferring r_gpu to its respective memory buffer on the host - similar to cudaMemcpyDeviceToHost on CUDA 137 | double *r = (double*)malloc(sizeof(double)*N); 138 | for_kernel = clEnqueueReadBuffer(command_queue, r_gpu, CL_TRUE, 0, N * sizeof(double), r, 0, NULL, NULL); 139 | 140 | // Verifying all values to be 276.0 141 | // fabs(q[i]-288) (absolute value) should be 0 142 | double maxError = 0.0; 143 | for (int i = 0; i < N-1; ++i){ 144 | maxError = fmax(maxError, fabs(r[i]-276.0)); 145 | } 146 | std::cout<<"\nMax error: "<Hands-On GPU Computing with Python 7 | 8 | This is the code repository for [Hands-On GPU Computing with Python](https://www.packtpub.com/in/big-data-and-business-intelligence/hands-gpu-computing-python?utm_source=github&utm_medium=repository&utm_campaign=), published by Packt. 
**Explore the capabilities of GPUs for solving high performance computational problems**

## What is this book about?
GPUs are proving to be excellent general-purpose parallel computing solutions for high performance tasks such as deep learning and scientific computing.

This book will be your guide to getting started with GPU computing. It starts by introducing GPU computing and explaining the architecture and programming models for GPUs. You will learn, by example, how to perform GPU programming with Python, and you'll look at using integrations such as PyCUDA, PyOpenCL, CuPy, Numba, TensorFlow, Keras and PyTorch with Anaconda for various tasks such as machine learning, data mining and scientific computing. Going further, you will get to grips with GPU workflows, management, and deployment using modern containerization solutions. Toward the end of the book, you will become familiar with the principles of distributed computing for training machine learning models and enhancing efficiency and performance.

This book covers the following exciting features:
* Utilize Python libraries and frameworks for GPU acceleration
* Set up a GPU-enabled programmable machine learning environment on your system with Anaconda
* Deploy your machine learning system on cloud containers with illustrated examples
* Explore PyCUDA and PyOpenCL and compare them with platforms such as CUDA, OpenCL and ROCm
* Perform data mining tasks with machine learning models on GPUs
* Extend your knowledge of GPU computing in scientific applications

If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789341078) today!

https://www.packtpub.com/

## Instructions and Navigations
All of the code is organized into folders. For example, Chapter02.

The code will look like the following:
```
// Run function on 500 Million elements on the CPU
begin = clock();
multiply(N, p, q);
end = clock();
cpu_time_used = ((double) (end - begin)) / CLOCKS_PER_SEC;
```

**Following is what you need for this book:**
Data scientists, machine learning enthusiasts, and professionals who want to get started with GPU computing and perform complex tasks with low latency. Intermediate knowledge of Python programming is assumed.

With the following software list you can run all of the code files present in the book (Chapters 1-11).
### Software List
| Chapter | Software required | OS required |
| -------- | ------------------------------------ | ----------------------------------- |
| 2 - 11 | PyCharm Community Edition, PyCharm Educational Edition, PyCharm for Anaconda Community Edition, PyCharm Professional Edition, PyCharm for Anaconda Professional Edition, PyDev, Jupyter Notebook, Jupyter Lab, Eric, CUDA, ROCm, Anaconda, CuPy, Numba, Google Colaboratory, TensorFlow, PyTorch, DeepChem | Linux (preferably Ubuntu) |

We also provide a PDF file that has color images of the screenshots/diagrams used in this book. [Click here to download it](https://www.packtpub.com/sites/default/files/downloads/9781789341072_ColorImages.pdf).
### Related products
* Hands-On GPU-Accelerated Computer Vision with OpenCV and CUDA [[Packt]](https://www.packtpub.com/application-development/hands-gpu-accelerated-computer-vision-opencv-and-cuda?utm_source=github&utm_medium=repository&utm_campaign=) [[Amazon]](https://www.amazon.com/dp/1789348293)

* Learn OpenCV 4 By Building Projects - Second Edition [[Packt]](https://www.packtpub.com/application-development/learn-opencv-4-building-projects-second-edition?utm_source=github&utm_medium=repository&utm_campaign=) [[Amazon]](https://www.amazon.com/dp/1789341221)

## Get to Know the Author
**Avimanyu Bandyopadhyay**
is currently pursuing a PhD in Bioinformatics at Heritage Institute of Technology, Kolkata, India, where he applies GPU computing to Computational Biology. He has had a keen interest in GPU computing since 2014 and used CUDA for his master's thesis. He also has experience as a systems administrator, particularly on the Linux platform.
Avimanyu is also a scientific writer, technology communicator, and passionate gamer. He has published technical writing on open source computing and has actively participated in NVIDIA's GPU computing conferences since 2016. A big-time Linux fan, he strongly believes in the significance of Linux and an open source approach in scientific research. Deep learning with GPUs is his new passion!

### Suggestions and Feedback
[Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.

### Download a free PDF

If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.

https://packt.link/free-ebook/9781789341072
--------------------------------------------------------------------------------