├── Chapter01 └── mandelbrot0.py ├── Chapter02 └── launch-python-cuda-environment.bat ├── Chapter03 ├── deviceQuery.py ├── gpu_mandelbrot0.py ├── nbody.py ├── simple_element_kernel_example0.py ├── simple_scankernel0.py ├── simple_scankernel1.py └── time_calc0.py ├── Chapter04 ├── conway_gpu.py ├── conway_gpu_syncthreads.py ├── conway_gpu_syncthreads_shared.py ├── naive_prefix.py ├── simple_scalar_multiply_kernel.py └── work-efficient_prefix.py ├── Chapter05 ├── conway_gpu_streams.py ├── gpu_mandelbrot_context_sync.py ├── multi-kernel.py ├── multi-kernel_events.py ├── multi-kernel_multi-thread.py ├── multi-kernel_streams.py ├── simple_context_create.py ├── simple_event_example.py └── single_thread_example.py ├── Chapter06 ├── broken_matrix_ker.py ├── divergence_test.cu ├── hello-world_gpu.py ├── matrix_ker.cu └── matrix_ker.py ├── Chapter07 ├── conv_2d.py ├── cublas_gemm_flops.py └── latte.jpg ├── Chapter08 ├── monte_carlo_integrator.py ├── monte_carlo_pi.py └── thrust_dot_product.cu ├── LICENSE └── README.md /Chapter01/mandelbrot0.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import matplotlib 3 | #this will prevent the figure from popping up 4 | matplotlib.use('Agg') 5 | 6 | from matplotlib import pyplot as plt 7 | import numpy as np 8 | 9 | 10 | def simple_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound): 11 | 12 | real_vals = np.linspace(real_low, real_high, width) 13 | imag_vals = np.linspace(imag_low, imag_high, height) 14 | 15 | # we will represent members as 1, non-members as 0. 16 | 17 | mandelbrot_graph = np.ones((height,width), dtype=np.float32) 18 | 19 | for x in range(width): 20 | 21 | for y in range(height): 22 | 23 | c = np.complex64( real_vals[x] + imag_vals[y] * 1j ) 24 | z = np.complex64(0) 25 | 26 | for i in range(max_iters): 27 | 28 | z = z**2 + c 29 | 30 | if(np.abs(z) > upper_bound): 31 | mandelbrot_graph[y,x] = 0 32 | break 33 | 34 | return mandelbrot_graph 35 | 36 | 37 | if __name__ == '__main__': 38 | 39 | t1 = time() 40 | mandel = simple_mandelbrot(512,512,-2,2,-2,2,256, 2.5) 41 | t2 = time() 42 | mandel_time = t2 - t1 43 | 44 | t1 = time() 45 | fig = plt.figure(1) 46 | plt.imshow(mandel, extent=(-2, 2, -2, 2)) 47 | plt.savefig('mandelbrot.png', dpi=fig.dpi) 48 | t2 = time() 49 | 50 | dump_time = t2 - t1 51 | 52 | print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time)) 53 | print('It took {} seconds to dump the image.'.format(dump_time)) 54 | -------------------------------------------------------------------------------- /Chapter02/launch-python-cuda-environment.bat: -------------------------------------------------------------------------------- 1 | REM This batch script will set up an appropriate Python environment for CUDA GPU programming under Windows. 2 | REM The last line launches a CMD prompt. This can be any environment however. 3 | REM If you wish to use an IDE such as Spyder or Jupyter Notebook, just change the last line to "spyder" 4 | REM or "jupyter-notebook". 
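REM Note: the vcvarsall.bat call below puts the 64-bit MSVC host compiler on the PATH
REM (PyCUDA's nvcc-based compilation needs it on Windows), and activate.bat switches into
REM the Anaconda Python distribution; adjust both paths if Visual Studio or Anaconda are
REM installed elsewhere on your system.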
5 | 
6 | call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
7 | call "C:\Users\%username%\Anaconda3\Scripts\activate.bat" C:\Users\%username%\Anaconda3
8 | cmd
9 | 
--------------------------------------------------------------------------------
/Chapter03/deviceQuery.py:
--------------------------------------------------------------------------------
1 | import pycuda
2 | import pycuda.driver as drv
3 | drv.init()
4 | 
5 | print ('CUDA device query (PyCUDA version) \n')
6 | 
7 | print ('Detected {} CUDA Capable device(s) \n'.format(drv.Device.count()))
8 | 
9 | for i in range(drv.Device.count()):
10 | 
11 |     gpu_device = drv.Device(i)
12 |     print ('Device {}: {}'.format( i, gpu_device.name() ) )
13 |     compute_capability = float( '%d.%d' % gpu_device.compute_capability() )
14 |     print ('\t Compute Capability: {}'.format(compute_capability))
15 |     print ('\t Total Memory: {} megabytes'.format(gpu_device.total_memory()//(1024**2)))
16 | 
17 |     # The following will give us all remaining device attributes as seen
18 |     # in the original deviceQuery.
19 |     # We set up a dictionary as such so that we can easily index
20 |     # the values using a string descriptor.
21 | 
22 |     device_attributes_tuples = gpu_device.get_attributes().items()
23 |     device_attributes = {}
24 | 
25 |     for k, v in device_attributes_tuples:
26 |         device_attributes[str(k)] = v
27 | 
28 |     num_mp = device_attributes['MULTIPROCESSOR_COUNT']
29 | 
30 |     # Cores per multiprocessor is not reported by the GPU!
31 |     # We must use a lookup table based on compute capability.
32 |     # See the following:
33 |     # http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
34 | 
35 |     cuda_cores_per_mp = { 3.0 : 192, 3.2 : 192, 3.5 : 192, 3.7 : 192, 5.0 : 128, 5.1 : 128, 5.2 : 128, 5.3 : 128, 6.0 : 64, 6.1 : 128,\
36 |     6.2 : 128, 7.0 : 64, 7.1 : 64, 7.2 : 64, 7.3 : 64, 7.4 : 64, 7.5 : 64}[compute_capability]
37 | 
38 |     print ('\t ({}) Multiprocessors, ({}) CUDA Cores / Multiprocessor: {} CUDA Cores'.format(num_mp, cuda_cores_per_mp, num_mp*cuda_cores_per_mp))
39 | 
40 |     device_attributes.pop('MULTIPROCESSOR_COUNT')
41 | 
42 |     for k in device_attributes.keys():
43 |         print ('\t {}: {}'.format(k, device_attributes[k]))
44 | 
--------------------------------------------------------------------------------
/Chapter03/gpu_mandelbrot0.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import matplotlib
3 | #this will prevent the figure from popping up
4 | matplotlib.use('Agg')
5 | from matplotlib import pyplot as plt
6 | import numpy as np
7 | import pycuda.autoinit
8 | from pycuda import gpuarray
9 | from pycuda.elementwise import ElementwiseKernel
10 | 
11 | mandel_ker = ElementwiseKernel(
12 | "pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
13 | """
14 | mandelbrot_graph[i] = 1;
15 | 
16 | pycuda::complex<float> c = lattice[i];
17 | pycuda::complex<float> z(0,0);
18 | 
19 | for (int j = 0; j < max_iters; j++)
20 | {
21 | 
22 |     z = z*z + c;
23 | 
24 |     if(abs(z) > upper_bound)
25 |     {
26 |         mandelbrot_graph[i] = 0;
27 |         break;
28 |     }
29 | 
30 | }
31 | 
32 | """,
33 | "mandel_ker")
34 | 
35 | def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
36 | 
37 |     # we set up our complex lattice as such
38 |     real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
39 |     imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j
40 |     mandelbrot_lattice = 
np.array(real_vals + imag_vals.transpose(), dtype=np.complex64) 41 | 42 | # copy complex lattice to the GPU 43 | mandelbrot_lattice_gpu = gpuarray.to_gpu(mandelbrot_lattice) 44 | 45 | # allocate an empty array on the GPU 46 | mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32) 47 | 48 | mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound)) 49 | 50 | mandelbrot_graph = mandelbrot_graph_gpu.get() 51 | 52 | return mandelbrot_graph 53 | 54 | 55 | if __name__ == '__main__': 56 | 57 | t1 = time() 58 | mandel = gpu_mandelbrot(512,512,-2,2,-2,2,256, 2) 59 | t2 = time() 60 | 61 | mandel_time = t2 - t1 62 | 63 | t1 = time() 64 | fig = plt.figure(1) 65 | plt.imshow(mandel, extent=(-2, 2, -2, 2)) 66 | plt.savefig('mandelbrot.png', dpi=fig.dpi) 67 | t2 = time() 68 | 69 | dump_time = t2 - t1 70 | 71 | print ('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time)) 72 | print ('It took {} seconds to dump the image.'.format(dump_time)) 73 | -------------------------------------------------------------------------------- /Chapter03/nbody.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from time import time 5 | from pycuda.elementwise import ElementwiseKernel 6 | import matplotlib.pyplot as plt 7 | import matplotlib.animation as animation 8 | 9 | 10 | nbody_ker = ElementwiseKernel( 11 | "float *in_x, float *in_y, float *in_v_x, float *in_v_y, \ 12 | float *out_x, float *out_y, float *out_v_x, float *out_v_y, \ 13 | float *m, float t, int num_bodies", 14 | """ 15 | #define G 6.67408313e-11 16 | 17 | float net_force_x = 0.0f; 18 | float net_force_y = 0.0f; 19 | 20 | for(int n=0; n < num_bodies; ++n) { 21 | if (n == i) 22 | continue; 23 | 24 | float r2 = powf(in_x[i] - in_x[n], 2.0f) + powf(in_y[i] - in_y[n], 2.0f); 25 | float r = sqrtf(r2); 26 | 27 | float force = G*m[i]*m[n] / r2; 28 | 29 | float force_x = force * ( in_x[n] - in_x[i] ) / r; 30 | float force_y = force * ( in_y[n] - in_y[i] ) / r; 31 | 32 | net_force_x += force_x; 33 | net_force_y += force_y; 34 | 35 | } 36 | 37 | float a_x = net_force_x / m[i]; 38 | float a_y = net_force_y / m[i]; 39 | 40 | out_x[i] = in_x[i] + in_v_x[i]*t + 0.5f * a_x * powf(t,2.0f); 41 | out_y[i] = in_y[i] + in_v_y[i]*t + 0.5f * a_y * powf(t,2.0f); 42 | 43 | out_v_x[i] = in_v_x[i] + a_x*t; 44 | out_v_y[i] = in_v_y[i] + a_y*t; 45 | """, 46 | "nbody_ker") 47 | 48 | REZ = 128 49 | NUM_BODIES=np.int32(4000) 50 | t=np.float32(0.005) 51 | 52 | in_x = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) + .5 )) 53 | in_y = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) + .5)) 54 | in_v_x = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) - .5)) 55 | in_v_y = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) - .5)) 56 | 57 | out_x = gpuarray.empty_like(in_x) 58 | out_y = gpuarray.empty_like(in_y) 59 | out_v_x = gpuarray.empty_like(in_v_x) 60 | out_v_y = gpuarray.empty_like(in_v_y) 61 | 62 | masses = np.float32(np.random.random(NUM_BODIES)*10000) 63 | m = gpuarray.to_gpu(masses) 64 | 65 | 66 | def xy_to_img(x_coords, y_coords, masses): 67 | 68 | img_out = np.zeros((2*REZ,2*REZ), dtype=np.int32) 69 | 70 | for x, y, mass in zip(x_coords, y_coords, masses): 71 | if (x < 0 or y < 0 or not np.isfinite(x) or not np.isfinite(y)): 72 | continue 73 | int_x = int(np.round(x * REZ)) 74 | int_y = int(np.round(y * REZ)) 75 | 76 | if (int_x < 
2*REZ and int_y < 2*REZ): 77 | img_out[int_x, int_y] += int(mass) 78 | 79 | return img_out 80 | 81 | def update_gpu(frameNum, img, in_x, in_y, in_v_x, in_v_y, out_x, out_y, out_v_x, out_v_y,t, NUM_BODIES, masses): 82 | 83 | if frameNum % 2 == 0: 84 | nbody_ker(in_x,in_y,in_v_x,in_v_y,out_x,out_y,out_v_x,out_v_y,m,t,NUM_BODIES) 85 | img.set_data(xy_to_img(out_x.get(), out_y.get(), masses)) 86 | else: 87 | nbody_ker(out_x,out_y,out_v_x,out_v_y,in_x,in_y,in_v_x,in_v_y,m,t,NUM_BODIES) 88 | img.set_data(xy_to_img(in_x.get(), in_y.get(), masses)) 89 | 90 | return img 91 | 92 | 93 | if __name__ == '__main__': 94 | 95 | fig, ax = plt.subplots() 96 | img = ax.imshow( xy_to_img(in_x.get(), in_y.get(), masses) , interpolation='nearest') 97 | ani = animation.FuncAnimation(fig, update_gpu, fargs=(img, in_x, in_y, in_v_x, in_v_y, out_x, out_y, out_v_x, out_v_y, t, NUM_BODIES, masses) , interval=0, frames=100, save_count=100) 98 | 99 | plt.show() 100 | -------------------------------------------------------------------------------- /Chapter03/simple_element_kernel_example0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from time import time 5 | from pycuda.elementwise import ElementwiseKernel 6 | 7 | host_data = np.float32( np.random.random(50000000) ) 8 | 9 | gpu_2x_ker = ElementwiseKernel( 10 | "float *in, float *out", 11 | "out[i] = 2*in[i];", 12 | "gpu_2x_ker") 13 | 14 | def speedcomparison(): 15 | t1 = time() 16 | host_data_2x = host_data * np.float32(2) 17 | t2 = time() 18 | print('total time to compute on CPU: %f' % (t2 - t1)) 19 | device_data = gpuarray.to_gpu(host_data) 20 | # allocate memory for output 21 | device_data_2x = gpuarray.empty_like(device_data) 22 | t1 = time() 23 | gpu_2x_ker(device_data, device_data_2x) 24 | t2 = time() 25 | from_device = device_data_2x.get() 26 | print('total time to compute on GPU: %f' % (t2 - t1)) 27 | print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) )) 28 | 29 | 30 | if __name__ == '__main__': 31 | speedcomparison() 32 | -------------------------------------------------------------------------------- /Chapter03/simple_scankernel0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from pycuda.scan import InclusiveScanKernel 5 | 6 | seq = np.array([1,2,3,4],dtype=np.int32) 7 | seq_gpu = gpuarray.to_gpu(seq) 8 | sum_gpu = InclusiveScanKernel(np.int32, "a+b") 9 | print(sum_gpu(seq_gpu).get()) 10 | print(np.cumsum(seq)) 11 | -------------------------------------------------------------------------------- /Chapter03/simple_scankernel1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from pycuda.scan import InclusiveScanKernel 5 | 6 | seq = np.array([1,100,-3,-10000, 4, 10000, 66, 14, 21],dtype=np.int32) 7 | seq_gpu = gpuarray.to_gpu(seq) 8 | max_gpu = InclusiveScanKernel(np.int32, "a > b ? 
a : b") 9 | print(max_gpu(seq_gpu).get()[-1]) 10 | print(np.max(seq)) 11 | -------------------------------------------------------------------------------- /Chapter03/time_calc0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from time import time 5 | 6 | 7 | host_data = np.float32( np.random.random(50000000) ) 8 | 9 | t1 = time() 10 | host_data_2x = host_data * np.float32(2) 11 | t2 = time() 12 | 13 | print('total time to compute on CPU: %f' % (t2 - t1)) 14 | 15 | 16 | device_data = gpuarray.to_gpu(host_data) 17 | 18 | t1 = time() 19 | device_data_2x = device_data * np.float32( 2 ) 20 | t2 = time() 21 | 22 | from_device = device_data_2x.get() 23 | 24 | 25 | print('total time to compute on GPU: %f' % (t2 - t1)) 26 | print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) )) 27 | -------------------------------------------------------------------------------- /Chapter04/conway_gpu.py: -------------------------------------------------------------------------------- 1 | # Conway's game of life in Python / CUDA C 2 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 3 | 4 | import pycuda.autoinit 5 | import pycuda.driver as drv 6 | from pycuda import gpuarray 7 | from pycuda.compiler import SourceModule 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import matplotlib.animation as animation 11 | 12 | ker = SourceModule(""" 13 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 14 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 15 | 16 | #define _WIDTH ( blockDim.x * gridDim.x ) 17 | #define _HEIGHT ( blockDim.y * gridDim.y ) 18 | 19 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 20 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 21 | 22 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 23 | 24 | // return the number of living neighbors for a given cell 25 | __device__ int nbrs(int x, int y, int * in) 26 | { 27 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 28 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 29 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 30 | } 31 | 32 | __global__ void conway_ker(int * lattice_out, int * lattice ) 33 | { 34 | // x, y are the appropriate values for the cell covered by this thread 35 | int x = _X, y = _Y; 36 | 37 | // count the number of neighbors around the current cell 38 | int n = nbrs(x, y, lattice); 39 | 40 | 41 | // if the current cell is alive, then determine if it lives or dies for the next generation. 42 | if ( lattice[_INDEX(x,y)] == 1) 43 | switch(n) 44 | { 45 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 46 | case 2: 47 | case 3: lattice_out[_INDEX(x,y)] = 1; 48 | break; 49 | default: lattice_out[_INDEX(x,y)] = 0; 50 | } 51 | else if( lattice[_INDEX(x,y)] == 0 ) 52 | switch(n) 53 | { 54 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
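// (recall that the _XM and _YM macros wrap x and y around the lattice edges, so the
// neighbor counts feeding this switch treat the lattice as a torus)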
55 | case 3: lattice_out[_INDEX(x,y)] = 1; 56 | break; 57 | default: lattice_out[_INDEX(x,y)] = 0; 58 | } 59 | 60 | } 61 | """) 62 | 63 | 64 | conway_ker = ker.get_function("conway_ker") 65 | 66 | 67 | def update_gpu(frameNum, img, newLattice_gpu, lattice_gpu, N): 68 | 69 | conway_ker( newLattice_gpu, lattice_gpu, grid=(N//32,N//32,1), block=(32,32,1) ) 70 | 71 | img.set_data(newLattice_gpu.get() ) 72 | 73 | 74 | lattice_gpu[:] = newLattice_gpu[:] 75 | 76 | return img 77 | 78 | 79 | if __name__ == '__main__': 80 | # set lattice size 81 | N = 512 82 | 83 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 84 | lattice_gpu = gpuarray.to_gpu(lattice) 85 | 86 | newLattice_gpu = gpuarray.empty_like(lattice_gpu) 87 | 88 | fig, ax = plt.subplots() 89 | img = ax.imshow(lattice_gpu.get(), interpolation='nearest') 90 | ani = animation.FuncAnimation(fig, update_gpu, fargs=(img, newLattice_gpu, lattice_gpu, N, ) , interval=0, frames=1000, save_count=1000) 91 | 92 | plt.show() 93 | -------------------------------------------------------------------------------- /Chapter04/conway_gpu_syncthreads.py: -------------------------------------------------------------------------------- 1 | # Iterative version of Conway's game of life in Python / CUDA C 2 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 3 | 4 | import pycuda.autoinit 5 | import pycuda.driver as drv 6 | from pycuda import gpuarray 7 | from pycuda.compiler import SourceModule 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | ker = SourceModule(""" 13 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 14 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 15 | 16 | #define _WIDTH ( blockDim.x * gridDim.x ) 17 | #define _HEIGHT ( blockDim.y * gridDim.y ) 18 | 19 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 20 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 21 | 22 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 23 | 24 | // return the number of living neighbors for a given cell 25 | __device__ int nbrs(int x, int y, int * in) 26 | { 27 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 28 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 29 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 30 | } 31 | 32 | __global__ void conway_ker(int * lattice, int iters) 33 | { 34 | // x, y are the appropriate values for the cell covered by this thread 35 | int x = _X, y = _Y; 36 | 37 | for (int i = 0; i < iters; i++) 38 | { 39 | 40 | // count the number of neighbors around the current cell 41 | int n = nbrs(x, y, lattice); 42 | 43 | int cell_value; 44 | 45 | 46 | // if the current cell is alive, then determine if it lives or dies for the next generation. 47 | if ( lattice[_INDEX(x,y)] == 1) 48 | switch(n) 49 | { 50 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 51 | case 2: 52 | case 3: cell_value = 1; 53 | break; 54 | default: cell_value = 0; 55 | } 56 | else if( lattice[_INDEX(x,y)] == 0 ) 57 | switch(n) 58 | { 59 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
60 | case 3: cell_value = 1; 61 | break; 62 | default: cell_value = 0; 63 | } 64 | 65 | __syncthreads(); 66 | lattice[_INDEX(x,y)] = cell_value; 67 | __syncthreads(); 68 | } 69 | 70 | } 71 | """) 72 | 73 | 74 | conway_ker = ker.get_function("conway_ker") 75 | 76 | 77 | 78 | 79 | if __name__ == '__main__': 80 | # set lattice size 81 | N = 32 82 | 83 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 84 | lattice_gpu = gpuarray.to_gpu(lattice) 85 | conway_ker(lattice_gpu, np.int32(100000), grid=(1,1,1), block=(32,32,1)) 86 | fig = plt.figure(1) 87 | plt.imshow(lattice_gpu.get()) 88 | plt.show() 89 | 90 | -------------------------------------------------------------------------------- /Chapter04/conway_gpu_syncthreads_shared.py: -------------------------------------------------------------------------------- 1 | # Iterative Conway's game of life in Python / CUDA C 2 | # this version is meant to illustrate the use of shared kernel memory in CUDA. 3 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 4 | 5 | import pycuda.autoinit 6 | import pycuda.driver as drv 7 | from pycuda import gpuarray 8 | from pycuda.compiler import SourceModule 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from time import time 12 | 13 | shared_ker = SourceModule(""" 14 | #define _iters 1000000 15 | 16 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 17 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 18 | 19 | #define _WIDTH ( blockDim.x * gridDim.x ) 20 | #define _HEIGHT ( blockDim.y * gridDim.y ) 21 | 22 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 23 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 24 | 25 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 26 | 27 | // return the number of living neighbors for a given cell 28 | __device__ int nbrs(int x, int y, int * in) 29 | { 30 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 31 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 32 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 33 | } 34 | 35 | __global__ void conway_ker_shared(int * p_lattice, int iters) 36 | { 37 | // x, y are the appropriate values for the cell covered by this thread 38 | int x = _X, y = _Y; 39 | __shared__ int lattice[32*32]; 40 | 41 | 42 | lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)]; 43 | __syncthreads(); 44 | 45 | for (int i = 0; i < iters; i++) 46 | { 47 | 48 | // count the number of neighbors around the current cell 49 | int n = nbrs(x, y, lattice); 50 | 51 | int cell_value; 52 | 53 | 54 | // if the current cell is alive, then determine if it lives or dies for the next generation. 55 | if ( lattice[_INDEX(x,y)] == 1) 56 | switch(n) 57 | { 58 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 59 | case 2: 60 | case 3: cell_value = 1; 61 | break; 62 | default: cell_value = 0; 63 | } 64 | else if( lattice[_INDEX(x,y)] == 0 ) 65 | switch(n) 66 | { 67 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
68 | case 3: cell_value = 1; 69 | break; 70 | default: cell_value = 0; 71 | } 72 | 73 | __syncthreads(); 74 | lattice[_INDEX(x,y)] = cell_value; 75 | __syncthreads(); 76 | 77 | } 78 | 79 | __syncthreads(); 80 | p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)]; 81 | __syncthreads(); 82 | 83 | } 84 | """) 85 | 86 | 87 | conway_ker_shared = shared_ker.get_function("conway_ker_shared") 88 | 89 | 90 | if __name__ == '__main__': 91 | # set lattice size 92 | N = 32 93 | 94 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 95 | lattice_gpu = gpuarray.to_gpu(lattice) 96 | 97 | conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1,1,1), block=(32,32,1)) 98 | 99 | fig = plt.figure(1) 100 | plt.imshow(lattice_gpu.get()) 101 | plt.show() 102 | 103 | 104 | -------------------------------------------------------------------------------- /Chapter04/naive_prefix.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | from pycuda import gpuarray 5 | from pycuda.compiler import SourceModule 6 | from time import time 7 | # this is a naive parallel prefix-sum kernel that uses shared memory 8 | naive_ker = SourceModule(""" 9 | __global__ void naive_prefix(double *vec, double *out) 10 | { 11 | __shared__ double sum_buf[1024]; 12 | int tid = threadIdx.x; 13 | sum_buf[tid] = vec[tid]; 14 | 15 | // begin parallel prefix sum algorithm 16 | 17 | int iter = 1; 18 | for (int i=0; i < 10; i++) 19 | { 20 | __syncthreads(); 21 | if (tid >= iter ) 22 | { 23 | sum_buf[tid] = sum_buf[tid] + sum_buf[tid - iter]; 24 | } 25 | 26 | iter *= 2; 27 | } 28 | 29 | __syncthreads(); 30 | out[tid] = sum_buf[tid]; 31 | __syncthreads(); 32 | 33 | } 34 | """) 35 | naive_gpu = naive_ker.get_function("naive_prefix") 36 | 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | 42 | testvec = np.random.randn(1024).astype(np.float64) 43 | testvec_gpu = gpuarray.to_gpu(testvec) 44 | 45 | outvec_gpu = gpuarray.empty_like(testvec_gpu) 46 | 47 | naive_gpu( testvec_gpu , outvec_gpu, block=(1024,1,1), grid=(1,1,1)) 48 | 49 | total_sum = sum( testvec) 50 | total_sum_gpu = outvec_gpu[-1].get() 51 | 52 | print("Does our kernel work correctly? : {}".format(np.allclose(total_sum_gpu , total_sum) )) 53 | -------------------------------------------------------------------------------- /Chapter04/simple_scalar_multiply_kernel.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | from pycuda import gpuarray 5 | from pycuda.compiler import SourceModule 6 | 7 | ker = SourceModule(""" 8 | __global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec) 9 | { 10 | int i = threadIdx.x; 11 | outvec[i] = scalar*vec[i]; 12 | } 13 | """) 14 | 15 | scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel") 16 | 17 | testvec = np.random.randn(512).astype(np.float32) 18 | testvec_gpu = gpuarray.to_gpu(testvec) 19 | outvec_gpu = gpuarray.empty_like(testvec_gpu) 20 | 21 | scalar_multiply_gpu( outvec_gpu, np.float32(2), testvec_gpu, block=(512,1,1), grid=(1,1,1)) 22 | 23 | print("Does our kernel work correctly? 
: {}".format(np.allclose(outvec_gpu.get() , 2*testvec) )) 24 | -------------------------------------------------------------------------------- /Chapter04/work-efficient_prefix.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pycuda.autoinit 3 | import pycuda.driver as drv 4 | import numpy as np 5 | from pycuda import gpuarray 6 | from pycuda.compiler import SourceModule 7 | from time import time 8 | 9 | # this is a work-efficent parallel prefix-sum algorithm. 10 | # written by Brian Tuomanen for "Hands On GPU Programming with Python and CUDA" 11 | 12 | # kernel for up-sweep phase 13 | up_ker = SourceModule(""" 14 | __global__ void up_ker(double *x, double *x_old, int k ) 15 | { 16 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 17 | 18 | int _2k = 1 << k; 19 | int _2k1 = 1 << (k+1); 20 | 21 | int j = tid* _2k1; 22 | 23 | x[j + _2k1 - 1] = x_old[j + _2k -1 ] + x_old[j + _2k1 - 1]; 24 | 25 | } 26 | """) 27 | 28 | up_gpu = up_ker.get_function("up_ker") 29 | 30 | # implementation of up-sweep phase 31 | def up_sweep(x): 32 | # let's typecast to be safe. 33 | x = np.float64(x) 34 | x_gpu = gpuarray.to_gpu(np.float64(x) ) 35 | x_old_gpu = x_gpu.copy() 36 | for k in range( int(np.log2(x.size) ) ) : 37 | num_threads = int(np.ceil( x.size / 2**(k+1))) 38 | grid_size = int(np.ceil(num_threads / 32)) 39 | 40 | if grid_size > 1: 41 | block_size = 32 42 | else: 43 | block_size = num_threads 44 | 45 | up_gpu(x_gpu, x_old_gpu, np.int32(k) , block=(block_size,1,1), grid=(grid_size,1,1)) 46 | x_old_gpu[:] = x_gpu[:] 47 | 48 | x_out = x_gpu.get() 49 | return(x_out) 50 | 51 | # kernel for down-sweep phase 52 | down_ker = SourceModule(""" 53 | __global__ void down_ker(double *y, double *y_old, int k) 54 | { 55 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 56 | 57 | int _2k = 1 << k; 58 | int _2k1 = 1 << (k+1); 59 | 60 | int j = tid*_2k1; 61 | 62 | y[j + _2k - 1 ] = y_old[j + _2k1 - 1]; 63 | y[j + _2k1 - 1] = y_old[j + _2k1 - 1] + y_old[j + _2k - 1]; 64 | } 65 | """) 66 | 67 | down_gpu = down_ker.get_function("down_ker") 68 | 69 | 70 | # implementation of down-sweep phase 71 | def down_sweep(y): 72 | y = np.float64(y) 73 | y[-1] = 0 74 | y_gpu = gpuarray.to_gpu(y) 75 | y_old_gpu = y_gpu.copy() 76 | for k in reversed(range(int(np.log2(y.size)))): 77 | num_threads = int(np.ceil( y.size / 2**(k+1))) 78 | grid_size = int(np.ceil(num_threads / 32)) 79 | 80 | if grid_size > 1: 81 | block_size = 32 82 | else: 83 | block_size = num_threads 84 | 85 | down_gpu(y_gpu, y_old_gpu, np.int32(k), block=(block_size,1,1), grid=(grid_size,1,1)) 86 | y_old_gpu[:] = y_gpu[:] 87 | y_out = y_gpu.get() 88 | return(y_out) 89 | 90 | 91 | # full implementation of work-efficient parallel prefix sum 92 | def efficient_prefix(x): 93 | return(down_sweep(up_sweep(x))) 94 | 95 | 96 | 97 | if __name__ == '__main__': 98 | 99 | 100 | testvec = np.random.randn(32*1024).astype(np.float64) 101 | testvec_gpu = gpuarray.to_gpu(testvec) 102 | 103 | outvec_gpu = gpuarray.empty_like(testvec_gpu) 104 | 105 | prefix_sum = np.roll(np.cumsum(testvec), 1) 106 | prefix_sum[0] = 0 107 | 108 | prefix_sum_gpu = efficient_prefix(testvec) 109 | 110 | print("Does our work-efficient prefix work? 
{}".format(np.allclose(prefix_sum_gpu, prefix_sum))) 111 | 112 | 113 | -------------------------------------------------------------------------------- /Chapter05/conway_gpu_streams.py: -------------------------------------------------------------------------------- 1 | # CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C 2 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 3 | 4 | import pycuda.autoinit 5 | import pycuda.driver as drv 6 | from pycuda import gpuarray 7 | from pycuda.compiler import SourceModule 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import matplotlib.animation as animation 11 | 12 | ker = SourceModule(""" 13 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 14 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 15 | 16 | #define _WIDTH ( blockDim.x * gridDim.x ) 17 | #define _HEIGHT ( blockDim.y * gridDim.y ) 18 | 19 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 20 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 21 | 22 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 23 | 24 | // return the number of living neighbors for a given cell 25 | __device__ int nbrs(int x, int y, int * in) 26 | { 27 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 28 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 29 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 30 | } 31 | 32 | __global__ void conway_ker(int * lattice_out, int * lattice ) 33 | { 34 | // x, y are the appropriate values for the cell covered by this thread 35 | int x = _X, y = _Y; 36 | 37 | // count the number of neighbors around the current cell 38 | int n = nbrs(x, y, lattice); 39 | 40 | 41 | // if the current cell is alive, then determine if it lives or dies for the next generation. 42 | if ( lattice[_INDEX(x,y)] == 1) 43 | switch(n) 44 | { 45 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 46 | case 2: 47 | case 3: lattice_out[_INDEX(x,y)] = 1; 48 | break; 49 | default: lattice_out[_INDEX(x,y)] = 0; 50 | } 51 | else if( lattice[_INDEX(x,y)] == 0 ) 52 | switch(n) 53 | { 54 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
55 | case 3: lattice_out[_INDEX(x,y)] = 1; 56 | break; 57 | default: lattice_out[_INDEX(x,y)] = 0; 58 | } 59 | 60 | } 61 | """) 62 | 63 | 64 | conway_ker = ker.get_function("conway_ker") 65 | 66 | 67 | def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent): 68 | 69 | for k in range(num_concurrent): 70 | conway_ker( newLattices_gpu[k], lattices_gpu[k], grid=(N//32,N//32,1), block=(32,32,1), stream=streams[k] ) 71 | 72 | imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]) ) 73 | 74 | lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k]) 75 | 76 | 77 | return imgs 78 | 79 | 80 | if __name__ == '__main__': 81 | # set lattice size 82 | N = 128 83 | 84 | num_concurrent = 4 85 | 86 | streams = [] 87 | lattices_gpu = [] 88 | newLattices_gpu = [] 89 | 90 | for k in range(num_concurrent): 91 | streams.append(drv.Stream()) 92 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 93 | lattices_gpu.append(gpuarray.to_gpu(lattice)) 94 | newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k])) 95 | 96 | fig, ax = plt.subplots(nrows=1, ncols=num_concurrent) 97 | imgs = [] 98 | 99 | for k in range(num_concurrent): 100 | imgs.append( ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest') ) 101 | 102 | ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent) , interval=0, frames=1000, save_count=1000) 103 | 104 | plt.show() 105 | -------------------------------------------------------------------------------- /Chapter05/gpu_mandelbrot_context_sync.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import matplotlib 3 | #this will prevent the figure from popping up 4 | matplotlib.use('Agg') 5 | from matplotlib import pyplot as plt 6 | import numpy as np 7 | import pycuda.autoinit 8 | from pycuda import gpuarray 9 | from pycuda.elementwise import ElementwiseKernel 10 | 11 | mandel_ker = ElementwiseKernel( 12 | "pycuda::complex *lattice, float *mandelbrot_graph, int max_iters, float upper_bound", 13 | """ 14 | mandelbrot_graph[i] = 1; 15 | 16 | pycuda::complex c = lattice[i]; 17 | pycuda::complex z(0,0); 18 | 19 | for (int j = 0; j < max_iters; j++) 20 | { 21 | 22 | z = z*z + c; 23 | 24 | if(abs(z) > upper_bound) 25 | { 26 | mandelbrot_graph[i] = 0; 27 | break; 28 | } 29 | 30 | } 31 | 32 | """, 33 | "mandel_ker") 34 | 35 | def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound): 36 | 37 | # we set up our complex lattice as such 38 | real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64) 39 | imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j 40 | mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64) 41 | 42 | # copy complex lattice to the GPU 43 | mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice) 44 | 45 | # synchronize in current context 46 | pycuda.autoinit.context.synchronize() 47 | 48 | # allocate an empty array on the GPU 49 | mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32) 50 | 51 | mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound)) 52 | 53 | pycuda.autoinit.context.synchronize() 54 | 55 | mandelbrot_graph = mandelbrot_graph_gpu.get_async() 56 | 57 | pycuda.autoinit.context.synchronize() 58 | 59 | return 
mandelbrot_graph
60 | 
61 | 
62 | if __name__ == '__main__':
63 | 
64 |     t1 = time()
65 |     mandel = gpu_mandelbrot(512,512,-2,2,-2,2,256, 2)
66 |     t2 = time()
67 | 
68 |     mandel_time = t2 - t1
69 | 
70 |     t1 = time()
71 |     fig = plt.figure(1)
72 |     plt.imshow(mandel, extent=(-2, 2, -2, 2))
73 |     plt.savefig('mandelbrot.png', dpi=fig.dpi)
74 |     t2 = time()
75 | 
76 |     dump_time = t2 - t1
77 | 
78 |     print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
79 |     print('It took {} seconds to dump the image.'.format(dump_time))
80 | 
--------------------------------------------------------------------------------
/Chapter05/multi-kernel.py:
--------------------------------------------------------------------------------
1 | import pycuda.autoinit
2 | import pycuda.driver as drv
3 | from pycuda import gpuarray
4 | from pycuda.compiler import SourceModule
5 | import numpy as np
6 | from time import time
7 | 
8 | num_arrays = 200
9 | array_len = 1024**2
10 | 
11 | ker = SourceModule("""
12 | __global__ void mult_ker(float * array, int array_len)
13 | {
14 |     int thd = blockIdx.x*blockDim.x + threadIdx.x;
15 |     int num_iters = array_len / blockDim.x;
16 | 
17 |     for(int j=0; j < num_iters; j++)
18 |     {
19 |         int i = j * blockDim.x + thd;
20 | 
21 |         for(int k = 0; k < 50; k++)
22 |         {
23 |             array[i] *= 2.0;
24 |             array[i] /= 2.0;
25 |         }
26 |     }
27 | 
28 | }
29 | """)
30 | 
31 | mult_ker = ker.get_function('mult_ker')
32 | 
33 | data = []
34 | data_gpu = []
35 | gpu_out = []
36 | 
37 | # generate random arrays.
38 | for _ in range(num_arrays):
39 |     data.append(np.random.randn(array_len).astype('float32'))
40 | 
41 | t_start = time()
42 | 
43 | # copy arrays to GPU.
44 | for k in range(num_arrays):
45 |     data_gpu.append(gpuarray.to_gpu(data[k]))
46 | 
47 | # process arrays.
48 | for k in range(num_arrays):
49 |     mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))
50 | 
51 | # copy arrays from GPU.
52 | for k in range(num_arrays):
53 |     gpu_out.append(data_gpu[k].get())
54 | 
55 | t_end = time()
56 | 
57 | for k in range(num_arrays):
58 |     assert (np.allclose(gpu_out[k], data[k]))
59 | 
60 | print('Total time: %f' % (t_end - t_start))
61 | 
--------------------------------------------------------------------------------
/Chapter05/multi-kernel_events.py:
--------------------------------------------------------------------------------
1 | import pycuda.autoinit
2 | import pycuda.driver as drv
3 | from pycuda import gpuarray
4 | from pycuda.compiler import SourceModule
5 | import numpy as np
6 | from time import time
7 | 
8 | num_arrays = 200
9 | array_len = 1024**2
10 | 
11 | ker = SourceModule("""
12 | __global__ void mult_ker(float * array, int array_len)
13 | {
14 |     int thd = blockIdx.x*blockDim.x + threadIdx.x;
15 |     int num_iters = array_len / blockDim.x;
16 |     for(int j=0; j < num_iters; j++)
17 |     {
18 |         int i = j * blockDim.x + thd;
19 |         for(int k = 0; k < 50; k++)
20 |         {
21 |             array[i] *= 2.0;
22 |             array[i] /= 2.0;
23 |         }
24 |     }
25 | }
26 | """)
27 | 
28 | mult_ker = ker.get_function('mult_ker')
29 | 
30 | data = []
31 | data_gpu = []
32 | gpu_out = []
33 | streams = []
34 | start_events = []
35 | end_events = []
36 | 
37 | for _ in range(num_arrays):
38 |     streams.append(drv.Stream())
39 |     start_events.append(drv.Event())
40 |     end_events.append(drv.Event())
41 | 
42 | # generate random arrays.
43 | for _ in range(num_arrays):
44 |     data.append(np.random.randn(array_len).astype('float32'))
45 | 
46 | t_start = time()
47 | 
48 | # copy arrays to GPU.
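# Each array below is copied on its own stream with to_gpu_async, so copies, kernel
# launches, and downloads issued for different arrays are free to overlap; the start
# and end events recorded on each stream then time just that stream's kernel.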
49 | for k in range(num_arrays): 50 | data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k])) 51 | 52 | # process arrays. 53 | for k in range(num_arrays): 54 | start_events[k].record(streams[k]) 55 | mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k]) 56 | for k in range(num_arrays): 57 | end_events[k].record(streams[k]) 58 | 59 | # copy arrays from GPU. 60 | for k in range(num_arrays): 61 | gpu_out.append(data_gpu[k].get_async(stream=streams[k])) 62 | 63 | t_end = time() 64 | 65 | for k in range(num_arrays): 66 | assert (np.allclose(gpu_out[k], data[k])) 67 | 68 | kernel_times = [] 69 | 70 | for k in range(num_arrays): 71 | kernel_times.append(start_events[k].time_till(end_events[k])) 72 | 73 | print('Total time: %f' % (t_end - t_start)) 74 | print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times)) 75 | print('Mean kernel standard deviation (milliseconds): %f' % np.std(kernel_times)) 76 | -------------------------------------------------------------------------------- /Chapter05/multi-kernel_multi-thread.py: -------------------------------------------------------------------------------- 1 | import pycuda 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from time import time 7 | import threading 8 | 9 | 10 | num_arrays = 10 11 | array_len = 1024**2 12 | 13 | kernel_code = """ 14 | __global__ void mult_ker(float * array, int array_len) 15 | { 16 | int thd = blockIdx.x*blockDim.x + threadIdx.x; 17 | int num_iters = array_len / blockDim.x; 18 | 19 | for(int j=0; j < num_iters; j++) 20 | { 21 | int i = j * blockDim.x + thd; 22 | 23 | for(int k = 0; k < 50; k++) 24 | { 25 | array[i] *= 2.0; 26 | array[i] /= 2.0; 27 | } 28 | } 29 | 30 | } 31 | """ 32 | 33 | class KernelLauncherThread(threading.Thread): 34 | def __init__(self, input_array): 35 | threading.Thread.__init__(self) 36 | self.input_array = input_array 37 | self.output_array = None 38 | 39 | def run(self): 40 | self.dev = drv.Device(0) 41 | self.context = self.dev.make_context() 42 | 43 | self.ker = SourceModule(kernel_code) 44 | 45 | self.mult_ker = self.ker.get_function('mult_ker') 46 | 47 | self.array_gpu = gpuarray.to_gpu(self.input_array) 48 | 49 | self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1)) 50 | 51 | self.output_array = self.array_gpu.get() 52 | 53 | self.context.pop() 54 | 55 | def join(self): 56 | threading.Thread.join(self) 57 | return self.output_array 58 | 59 | drv.init() 60 | 61 | 62 | data = [] 63 | gpu_out = [] 64 | threads = [] 65 | 66 | # generate random arrays and thread objects. 67 | for _ in range(num_arrays): 68 | data.append(np.random.randn(array_len).astype('float32')) 69 | 70 | for k in range(num_arrays): 71 | # create a thread that uses data we just generated 72 | threads.append(KernelLauncherThread(data[k])) 73 | 74 | # launch threads to process arrays. 75 | for k in range(num_arrays): 76 | threads[k].start() 77 | 78 | # get data from launched threads. 
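# join() here is the overridden method of KernelLauncherThread, so besides waiting for
# the thread to finish it also returns that thread's output array. Each thread created
# and popped its own CUDA context inside run(), which is why this script calls
# drv.init() directly instead of importing pycuda.autoinit.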
79 | for k in range(num_arrays): 80 | gpu_out.append(threads[k].join()) 81 | 82 | for k in range(num_arrays): 83 | assert (np.allclose(gpu_out[k], data[k])) 84 | 85 | -------------------------------------------------------------------------------- /Chapter05/multi-kernel_streams.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from time import time 7 | 8 | num_arrays = 200 9 | array_len = 1024**2 10 | 11 | ker = SourceModule(""" 12 | __global__ void mult_ker(float * array, int array_len) 13 | { 14 | int thd = blockIdx.x*blockDim.x + threadIdx.x; 15 | int num_iters = array_len / blockDim.x; 16 | 17 | for(int j=0; j < num_iters; j++) 18 | { 19 | int i = j * blockDim.x + thd; 20 | 21 | for(int k = 0; k < 50; k++) 22 | { 23 | array[i] *= 2.0; 24 | array[i] /= 2.0; 25 | } 26 | } 27 | 28 | } 29 | """) 30 | 31 | mult_ker = ker.get_function('mult_ker') 32 | 33 | data = [] 34 | data_gpu = [] 35 | gpu_out = [] 36 | streams = [] 37 | 38 | for _ in range(num_arrays): 39 | streams.append(drv.Stream()) 40 | 41 | # generate random arrays. 42 | for _ in range(num_arrays): 43 | data.append(np.random.randn(array_len).astype('float32')) 44 | 45 | t_start = time() 46 | 47 | # copy arrays to GPU. 48 | for k in range(num_arrays): 49 | data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k])) 50 | 51 | # process arrays. 52 | for k in range(num_arrays): 53 | mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k]) 54 | 55 | # copy arrays from GPU. 56 | for k in range(num_arrays): 57 | gpu_out.append(data_gpu[k].get_async(stream=streams[k])) 58 | 59 | t_end = time() 60 | 61 | for k in range(num_arrays): 62 | assert (np.allclose(gpu_out[k], data[k])) 63 | 64 | print('Total time: %f' % (t_end - t_start)) 65 | -------------------------------------------------------------------------------- /Chapter05/simple_context_create.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pycuda import gpuarray 3 | import pycuda.driver as drv 4 | 5 | drv.init() 6 | dev = drv.Device(0) 7 | ctx = dev.make_context() 8 | 9 | x = gpuarray.to_gpu(np.float32([1,2,3])) 10 | print(x.get()) 11 | 12 | ctx.pop() 13 | -------------------------------------------------------------------------------- /Chapter05/simple_event_example.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from time import time 7 | 8 | ker = SourceModule(""" 9 | __global__ void mult_ker(float * array, int array_len) 10 | { 11 | int thd = blockIdx.x*blockDim.x + threadIdx.x; 12 | int num_iters = array_len / blockDim.x; 13 | 14 | for(int j=0; j < num_iters; j++) 15 | { 16 | int i = j * blockDim.x + thd; 17 | 18 | for(int k = 0; k < 50; k++) 19 | { 20 | array[i] *= 2.0; 21 | array[i] /= 2.0; 22 | } 23 | } 24 | } 25 | """) 26 | 27 | mult_ker = ker.get_function('mult_ker') 28 | 29 | array_len = 100*1024**2 30 | 31 | data = np.random.randn(array_len).astype('float32') 32 | data_gpu = gpuarray.to_gpu(data) 33 | 34 | start_event = drv.Event() 35 | end_event = drv.Event() 36 | 37 | start_event.record() 38 | mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1)) 39 | 
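# the kernel launch above returns immediately; record() enqueues each event onto the
# default stream, so end_event only completes once the kernel itself has finished,
# which is what time_till() measures below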
end_event.record() 40 | 41 | end_event.synchronize() 42 | 43 | print('Has the kernel started yet? {}'.format(start_event.query())) 44 | print('Has the kernel ended yet? {}'.format(end_event.query())) 45 | 46 | print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event)) 47 | 48 | -------------------------------------------------------------------------------- /Chapter05/single_thread_example.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | class PointlessExampleThread(threading.Thread): 4 | def __init__(self): 5 | threading.Thread.__init__(self) 6 | self.return_value = None 7 | 8 | def run(self): 9 | print('Hello from the thread you just spawned!') 10 | self.return_value = 123 11 | 12 | def join(self): 13 | threading.Thread.join(self) 14 | return self.return_value 15 | 16 | 17 | NewThread = PointlessExampleThread() 18 | NewThread.start() 19 | thread_output = NewThread.join() 20 | print('The thread completed and returned this value: %s' % thread_output) 21 | -------------------------------------------------------------------------------- /Chapter06/broken_matrix_ker.py: -------------------------------------------------------------------------------- 1 | # Note: this code is intentionally broken!!! 2 | # (This is intended to show a case study of how to debug CUDA code 3 | # using printf.) 4 | 5 | import pycuda.autoinit 6 | import pycuda.driver as drv 7 | from pycuda import gpuarray 8 | from pycuda.compiler import SourceModule 9 | import numpy as np 10 | 11 | 12 | ker = SourceModule(''' 13 | // row-column dot-product for matrix multiplication 14 | __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) 15 | { 16 | 17 | //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N); 18 | float val = 0; 19 | 20 | for (int k=0; k < N; k++) 21 | { 22 | 23 | // broken version 24 | val += matrix_a[ row + k*N ] * matrix_b[ col*N + k]; 25 | 26 | //if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) 27 | // printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]); 28 | 29 | // fixed version 30 | //val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; 31 | } 32 | 33 | return(val); 34 | 35 | } 36 | 37 | // matrix multiplication kernel that is parallelized over row/column tuples. 
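// (in the "broken" version below the blockDim factor is missing from the index
// arithmetic, so threads in different blocks land on overlapping row/column pairs and
// part of the output matrix is never written; uncommenting the printf calls makes
// this visible when the script is run)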
38 | __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
39 | {
40 | 
41 |     // broken version
42 |     int row = blockIdx.x + threadIdx.x;
43 |     int col = blockIdx.y + threadIdx.y;
44 | 
45 |     // fixed version
46 |     //int row = blockIdx.x*blockDim.x + threadIdx.x;
47 |     //int col = blockIdx.y*blockDim.y + threadIdx.y;
48 | 
49 |     //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col);
50 | 
51 | 
52 |     // broken version
53 |     output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, col, row, N);
54 | 
55 |     // fixed version
56 |     //output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
57 | 
58 | 
59 | }
60 | ''')
61 | 
62 | matrix_ker = ker.get_function('matrix_mult_ker')
63 | 
64 | test_a = np.float32( [range(1,5)] * 4 )
65 | test_b = np.float32([range(14,10, -1)]*4 )
66 | 
67 | output_mat = np.matmul(test_a, test_b)
68 | 
69 | test_a_gpu = gpuarray.to_gpu(test_a)
70 | test_b_gpu = gpuarray.to_gpu(test_b)
71 | output_mat_gpu = gpuarray.empty_like(test_a_gpu)
72 | 
73 | matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))
74 | 
75 | assert( np.allclose(output_mat_gpu.get(), output_mat) )
76 | 
77 | 
78 | 
79 | 
80 | 
--------------------------------------------------------------------------------
/Chapter06/divergence_test.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | 
4 | __global__ void divergence_test_ker()
5 | {
6 |     if( threadIdx.x % 2 == 0)
7 |         printf("threadIdx.x %d : This is an even thread.\n", threadIdx.x);
8 |     else
9 |         printf("threadIdx.x %d : This is an odd thread.\n", threadIdx.x);
10 | }
11 | 
12 | __host__ int main()
13 | {
14 |     cudaSetDevice(0);
15 |     divergence_test_ker<<<1, 32>>>();
16 |     cudaDeviceSynchronize();
17 |     cudaDeviceReset();
18 | }
19 | 
--------------------------------------------------------------------------------
/Chapter06/hello-world_gpu.py:
--------------------------------------------------------------------------------
1 | import pycuda.autoinit
2 | import pycuda.driver as drv
3 | from pycuda import gpuarray
4 | from pycuda.compiler import SourceModule
5 | 
6 | ker = SourceModule('''
7 | __global__ void hello_world_ker()
8 | {
9 |     printf("Hello world from thread %d, in block %d!\\n", threadIdx.x, blockIdx.x);
10 | 
11 |     __syncthreads();
12 | 
13 |     if(threadIdx.x == 0 && blockIdx.x == 0)
14 |     {
15 |         printf("-------------------------------------\\n");
16 |         printf("This kernel was launched over a grid consisting of %d blocks,\\n", gridDim.x);
17 |         printf("where each block has %d threads.\\n", blockDim.x);
18 |     }
19 | }
20 | ''')
21 | 
22 | hello_ker = ker.get_function("hello_world_ker")
23 | hello_ker( block=(5,1,1), grid=(2,1,1) )
24 | 
--------------------------------------------------------------------------------
/Chapter06/matrix_ker.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #define _EPSILON 0.001
5 | #define _ABS(x) ( x > 0.0f ? 
x : -x ) 6 | 7 | __host__ int allclose(float *A, float *B, int len) 8 | { 9 | 10 | int returnval = 0; 11 | 12 | for (int i = 0; i < len; i++) 13 | { 14 | if ( _ABS(A[i] - B[i]) > _EPSILON ) 15 | { 16 | returnval = -1; 17 | break; 18 | } 19 | } 20 | 21 | return(returnval); 22 | } 23 | 24 | 25 | // row-column dot-product for matrix multiplication 26 | __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) 27 | { 28 | float val = 0; 29 | 30 | for (int k=0; k < N; k++) 31 | { 32 | val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; 33 | } 34 | 35 | return(val); 36 | } 37 | 38 | // matrix multiplication kernel that is parallelized over row/column tuples. 39 | __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N) 40 | { 41 | 42 | int row = blockIdx.x*blockDim.x + threadIdx.x; 43 | int col = blockIdx.y*blockDim.y + threadIdx.y; 44 | 45 | output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N); 46 | } 47 | 48 | 49 | __host__ int main() 50 | { 51 | 52 | // Initialize to use first GPU. 53 | cudaSetDevice(0); 54 | 55 | // this indicates the width/height of the matrices 56 | int N = 4; 57 | 58 | // this will indicate how many bytes to allocate to store a test or output matrix 59 | int num_bytes = sizeof(float)*N*N; 60 | 61 | // input test matrix A 62 | float h_A[] = { 1.0, 2.0, 3.0, 4.0, \ 63 | 1.0, 2.0, 3.0, 4.0, \ 64 | 1.0, 2.0, 3.0, 4.0, \ 65 | 1.0, 2.0, 3.0, 4.0 }; 66 | 67 | // input test matrix B 68 | float h_B[] = { 14.0, 13.0, 12.0, 11.0, \ 69 | 14.0, 13.0, 12.0, 11.0, \ 70 | 14.0, 13.0, 12.0, 11.0, \ 71 | 14.0, 13.0, 12.0, 11.0 }; 72 | 73 | // expected output of A times B 74 | float h_AxB[] = { 140.0, 130.0, 120.0, 110.0, \ 75 | 140.0, 130.0, 120.0, 110.0, \ 76 | 140.0, 130.0, 120.0, 110.0, \ 77 | 140.0, 130.0, 120.0, 110.0 }; 78 | 79 | 80 | // these pointers will be used for the GPU. 81 | // (notice how we use normal float pointers) 82 | float * d_A; 83 | float * d_B; 84 | float * d_output; 85 | 86 | // allocate memory for the test matrices on the GPU 87 | cudaMalloc((float **) &d_A, num_bytes); 88 | cudaMalloc((float **) &d_B, num_bytes); 89 | 90 | // copy the test matrices to the GPU 91 | cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice); 92 | cudaMemcpy(d_B, h_B, num_bytes, cudaMemcpyHostToDevice); 93 | 94 | // allocate memory for output on GPU 95 | cudaMalloc((float **) &d_output, num_bytes); 96 | 97 | // this will store the output from the GPU 98 | float * h_output; 99 | h_output = (float *) malloc(num_bytes); 100 | 101 | // setup our block and grid launch parameters with the dim3 class. 102 | dim3 block(2,2,1); 103 | dim3 grid(2,2,1); 104 | 105 | // launch our kernel 106 | matrix_mult_ker <<< grid, block >>> (d_A, d_B, d_output, N); 107 | 108 | // synchronize on the host, to ensure our kernel has finished executing. 109 | cudaDeviceSynchronize(); 110 | 111 | // copy output from device to host. 112 | cudaMemcpy(h_output, d_output, num_bytes, cudaMemcpyDeviceToHost); 113 | 114 | // synchronize again. 115 | cudaDeviceSynchronize(); 116 | 117 | // free arrays on device. 118 | cudaFree(d_A); 119 | cudaFree(d_B); 120 | cudaFree(d_output); 121 | 122 | // reset the GPU. 123 | cudaDeviceReset(); 124 | 125 | 126 | // Check to see if we got the expected output. 127 | // in both cases, remember to de-allocate h_output before returning. 128 | 129 | if (allclose(h_AxB, h_output, N*N) < 0) 130 | { 131 | printf("Error! 
Output of kernel does not match expected output.\n"); 132 | free(h_output); 133 | return(-1); 134 | } 135 | else 136 | { 137 | printf("Success! Output of kernel matches expected output.\n"); 138 | free(h_output); 139 | return(0); 140 | } 141 | 142 | 143 | } 144 | -------------------------------------------------------------------------------- /Chapter06/matrix_ker.py: -------------------------------------------------------------------------------- 1 | # This program is the "fixed" version of broken_matrix_ker.py 2 | 3 | # This is to be used for an exercise where this code is translated to 4 | # a pure CUDA-C version. 5 | 6 | import pycuda.autoinit 7 | import pycuda.driver as drv 8 | from pycuda import gpuarray 9 | from pycuda.compiler import SourceModule 10 | import numpy as np 11 | 12 | 13 | ker = SourceModule(''' 14 | // row-column dot-product for matrix multiplication 15 | __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) 16 | { 17 | float val = 0; 18 | 19 | for (int k=0; k < N; k++) 20 | { 21 | val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; 22 | } 23 | 24 | return(val); 25 | 26 | } 27 | 28 | // matrix multiplication kernel that is parallelized over row/column tuples. 29 | __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N) 30 | { 31 | 32 | int row = blockIdx.x*blockDim.x + threadIdx.x; 33 | int col = blockIdx.y*blockDim.y + threadIdx.y; 34 | 35 | output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N); 36 | 37 | } 38 | ''') 39 | 40 | matrix_ker = ker.get_function('matrix_mult_ker') 41 | 42 | test_a = np.float32([range(1,5)] * 4) 43 | test_b = np.float32([range(14,10, -1)]*4 ) 44 | 45 | output_mat = np.matmul(test_a, test_b) 46 | 47 | test_a_gpu = gpuarray.to_gpu(test_a) 48 | test_b_gpu = gpuarray.to_gpu(test_b) 49 | output_mat_gpu = gpuarray.empty_like(test_a_gpu) 50 | 51 | matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1)) 52 | 53 | assert(np.allclose(output_mat_gpu.get(), output_mat) ) 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /Chapter07/conv_2d.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | import numpy as np 5 | from skcuda import fft 6 | from skcuda import linalg 7 | from matplotlib import pyplot as plt 8 | 9 | 10 | def cufft_conv(x , y): 11 | 12 | x = x.astype(np.complex64) 13 | y = y.astype(np.complex64) 14 | 15 | if (x.shape != y.shape): 16 | return -1 17 | 18 | plan = fft.Plan(x.shape, np.complex64, np.complex64) 19 | inverse_plan = fft.Plan(x.shape, np.complex64, np.complex64) 20 | 21 | x_gpu = gpuarray.to_gpu(x) 22 | y_gpu = gpuarray.to_gpu(y) 23 | 24 | x_fft = gpuarray.empty_like(x_gpu, dtype=np.complex64) 25 | y_fft = gpuarray.empty_like(y_gpu, dtype=np.complex64) 26 | out_gpu = gpuarray.empty_like(x_gpu, dtype=np.complex64) 27 | 28 | fft.fft(x_gpu, x_fft, plan) 29 | fft.fft(y_gpu, y_fft, plan) 30 | 31 | 32 | linalg.multiply(x_fft, y_fft, overwrite=True) 33 | 34 | fft.ifft(y_fft, out_gpu, inverse_plan, scale=True) 35 | 36 | conv_out = out_gpu.get() 37 | 38 | return conv_out 39 | 40 | 41 | def conv_2d(ker, img): 42 | 43 | padded_ker = np.zeros( (img.shape[0] + 2*ker.shape[0], img.shape[1] + 2*ker.shape[1] )).astype(np.float32) 44 | 45 | padded_ker[:ker.shape[0], :ker.shape[1]] = ker 46 | 47 | padded_ker = np.roll(padded_ker, 
shift=-ker.shape[0]//2, axis=0) 48 | padded_ker = np.roll(padded_ker, shift=-ker.shape[1]//2, axis=1) 49 | 50 | padded_img = np.zeros_like(padded_ker).astype(np.float32) 51 | 52 | padded_img[ker.shape[0]:-ker.shape[0], ker.shape[1]:-ker.shape[1]] = img 53 | 54 | out_ = cufft_conv(padded_ker, padded_img) 55 | 56 | output = out_[ker.shape[0]:-ker.shape[0], ker.shape[1]:-ker.shape[1]] 57 | 58 | return output 59 | 60 | gaussian_filter = lambda x, y, sigma : (1 / np.sqrt(2*np.pi*(sigma**2)) )*np.exp( -(x**2 + y**2) / (2 * (sigma**2) )) 61 | 62 | def gaussian_ker(sigma): 63 | ker_ = np.zeros((2*sigma+1, 2*sigma+1)) 64 | 65 | for i in range(2*sigma + 1): 66 | for j in range(2*sigma + 1): 67 | ker_[i,j] = gaussian_filter(i - sigma, j - sigma, sigma) 68 | 69 | total_ = np.sum(ker_.ravel()) 70 | 71 | ker_ = ker_ / total_ 72 | 73 | return ker_ 74 | 75 | 76 | if __name__ == '__main__': 77 | 78 | latte = np.float32(plt.imread('latte.jpg')) / 255 79 | latte_blurred = np.zeros_like(latte) 80 | ker = gaussian_ker(30) 81 | 82 | for k in range(3): 83 | latte_blurred[:,:,k] = conv_2d(ker, latte[:,:,k]) 84 | 85 | 86 | fig, (ax0, ax1) = plt.subplots(1,2) 87 | fig.suptitle('Gaussian Filtering', fontsize=20) 88 | ax0.set_title('Before') 89 | ax0.axis('off') 90 | ax0.imshow(latte) 91 | ax1.set_title('After') 92 | ax1.axis('off') 93 | ax1.imshow(latte_blurred) 94 | plt.tight_layout() 95 | plt.subplots_adjust(top=.85) 96 | plt.show() 97 | -------------------------------------------------------------------------------- /Chapter07/cublas_gemm_flops.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | from pycuda import gpuarray 3 | import numpy as np 4 | from skcuda import cublas 5 | from time import time 6 | 7 | m = 5000 8 | n = 10000 9 | k = 10000 10 | 11 | 12 | def compute_gflops(precision='S'): 13 | 14 | 15 | if precision=='S': 16 | float_type = 'float32' 17 | elif precision=='D': 18 | float_type = 'float64' 19 | else: 20 | return -1 21 | 22 | 23 | A = np.random.randn(m, k).astype(float_type) 24 | B = np.random.randn(k, n).astype(float_type) 25 | C = np.random.randn(m, n).astype(float_type) 26 | 27 | A_cm = A.T.copy() 28 | B_cm = B.T.copy() 29 | C_cm = C.T.copy() 30 | 31 | A_gpu = gpuarray.to_gpu(A_cm) 32 | B_gpu = gpuarray.to_gpu(B_cm) 33 | C_gpu = gpuarray.to_gpu(C_cm) 34 | 35 | alpha = np.random.randn() 36 | beta = np.random.randn() 37 | 38 | transa = cublas._CUBLAS_OP['N'] 39 | transb = cublas._CUBLAS_OP['N'] 40 | 41 | lda = m 42 | ldb = k 43 | ldc = m 44 | 45 | t = time() 46 | handle = cublas.cublasCreate() 47 | 48 | 49 | exec('cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, \ 50 | B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision) 51 | 52 | cublas.cublasDestroy(handle) 53 | t = time() - t 54 | 55 | gflops = 2*m*n*(k+1)*(10**-9) / t 56 | 57 | return gflops 58 | 59 | if __name__ == '__main__': 60 | print('Single-precision performance: %s GFLOPS' % compute_gflops('S')) 61 | print('Double-precision performance: %s GFLOPS' % compute_gflops('D')) 62 | -------------------------------------------------------------------------------- /Chapter07/latte.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition/36ddc06599d8fea62db05a6dc35de06274c4c0d2/Chapter07/latte.jpg -------------------------------------------------------------------------------- 
/Chapter08/monte_carlo_integrator.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | 7 | # https://docs.nvidia.com/cuda/cuda-math-api/index.html 8 | 9 | MonteCarloKernelTemplate = ''' 10 | #include <curand_kernel.h> 11 | 12 | #define ULL unsigned long long 13 | #define _R(z) ( 1.0f / (z) ) 14 | #define _P2(z) ( (z) * (z) ) 15 | 16 | // p stands for "precision" (single or double) 17 | __device__ inline %(p)s f(%(p)s x) 18 | { 19 | %(p)s y; 20 | 21 | %(math_function)s; 22 | 23 | return y; 24 | } 25 | 26 | 27 | extern "C" { 28 | __global__ void monte_carlo(int iters, %(p)s lo, %(p)s hi, %(p)s * ys_out) 29 | { 30 | curandState cr_state; 31 | 32 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 33 | 34 | int num_threads = blockDim.x * gridDim.x; 35 | 36 | %(p)s t_width = (hi - lo) / ( %(p)s ) num_threads; 37 | 38 | %(p)s density = ( ( %(p)s ) iters ) / t_width; 39 | 40 | %(p)s t_lo = t_width*tid + lo; 41 | %(p)s t_hi = t_lo + t_width; 42 | 43 | 44 | curand_init( (ULL) clock() + (ULL) tid, (ULL) 0, (ULL) 0, &cr_state); 45 | 46 | %(p)s y, y_sum = 0.0f; 47 | 48 | 49 | %(p)s rand_val, x; 50 | for (int i=0; i < iters; i++) 51 | { 52 | rand_val = curand_uniform%(p_curand)s(&cr_state); 53 | 54 | x = t_lo + t_width * rand_val; 55 | 56 | y_sum += f(x); 57 | } 58 | 59 | y = y_sum / density; 60 | 61 | ys_out[tid] = y; 62 | } 63 | 64 | } 65 | ''' 66 | 67 | 68 | class MonteCarloIntegrator: 69 | 70 | def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi, samples_per_thread=10**5, num_blocks=100): 71 | 72 | self.math_function = math_function 73 | 74 | if precision in [None, 's', 'S', 'single', np.float32]: 75 | self.precision = 'float' 76 | self.numpy_precision = np.float32 77 | self.p_curand = '' 78 | elif precision in ['d','D', 'double', np.float64]: 79 | self.precision = 'double' 80 | self.numpy_precision = np.float64 81 | self.p_curand = '_double' 82 | else: 83 | raise Exception('precision is invalid datatype!') 84 | 85 | if (hi - lo <= 0): 86 | raise Exception('hi - lo <= 0!') 87 | else: 88 | self.hi = hi 89 | self.lo = lo 90 | 91 | MonteCarloDict = {'p' : self.precision, 'p_curand' : self.p_curand, 'math_function' : self.math_function} 92 | 93 | self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict 94 | 95 | self.ker = SourceModule(no_extern_c=True , options=['-w'], source=self.MonteCarloCode) 96 | 97 | self.f = self.ker.get_function('monte_carlo') 98 | 99 | self.num_blocks = num_blocks 100 | 101 | self.samples_per_thread = samples_per_thread 102 | 103 | 104 | def definite_integral(self, lo=None, hi=None, samples_per_thread=None, num_blocks=None): 105 | 106 | if lo is None or hi is None: 107 | lo = self.lo 108 | hi = self.hi 109 | 110 | if samples_per_thread is None: 111 | samples_per_thread = self.samples_per_thread 112 | 113 | if num_blocks is None: 114 | num_blocks = self.num_blocks 115 | grid = (num_blocks,1,1) 116 | else: 117 | grid = (num_blocks,1,1) 118 | 119 | block = (32,1,1) 120 | 121 | num_threads = 32*num_blocks 122 | 123 | self.ys = gpuarray.empty((num_threads,) , dtype=self.numpy_precision) 124 | 125 | self.f(np.int32(samples_per_thread), self.numpy_precision(lo), self.numpy_precision(hi), self.ys, block=block, grid=grid) 126 | 127 | self.nintegral = np.sum(self.ys.get() ) 128 | 129 | return np.sum(self.nintegral) 130 | 131 | 132 | 133 | if __name__ == '__main__': 134 | 135 | 
integral_tests = [('y =log(x)*_P2(sin(x))', 11.733 , 18.472, 8.9999), ('y = _R( 1 + sinh(2*x)*_P2(log(x)) )', .9, 4, .584977), ('y = (cosh(x)*sin(x))/ sqrt( pow(x,3) + _P2(sin(x)))', 1.85, 4.81, -3.34553) ] 136 | 137 | 138 | for f, lo, hi, expected in integral_tests: 139 | mci = MonteCarloIntegrator(math_function=f, precision='d', lo=lo, hi=hi) 140 | print('The Monte Carlo numerical integration of the function\n \t f: x -> %s \n \t from x = %s to x = %s is : %s ' % (f, lo, hi, mci.definite_integral())) 141 | print('where the expected value is : %s\n' % expected) 142 | -------------------------------------------------------------------------------- /Chapter08/monte_carlo_pi.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from sympy import Rational 7 | 8 | ker = SourceModule(no_extern_c=True ,source=''' 9 | #include <curand_kernel.h> 10 | #define _PYTHAG(a,b) (a*a + b*b) 11 | #define ULL unsigned long long 12 | 13 | extern "C" { 14 | 15 | __global__ void estimate_pi(ULL iters, ULL * hits) 16 | { 17 | 18 | curandState cr_state; 19 | 20 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 21 | 22 | curand_init( (ULL) clock() + (ULL) tid, (ULL) 0, \ 23 | (ULL) 0, &cr_state); 24 | 25 | float x, y; 26 | 27 | for(ULL i=0; i < iters; i++) 28 | { 29 | 30 | x = curand_uniform(&cr_state); 31 | y = curand_uniform(&cr_state); 32 | 33 | 34 | if(_PYTHAG(x,y) <= 1.0f) 35 | hits[tid]++; 36 | } 37 | 38 | return; 39 | 40 | } 41 | 42 | }// (End of 'extern "C"' here) 43 | ''') 44 | 45 | 46 | 47 | pi_ker = ker.get_function("estimate_pi") 48 | 49 | threads_per_block = 32 50 | blocks_per_grid = 512 51 | 52 | total_threads = threads_per_block * blocks_per_grid 53 | 54 | hits_d = gpuarray.zeros((total_threads,),dtype=np.uint64) 55 | 56 | iters = 2**24 57 | 58 | pi_ker(np.uint64(iters), hits_d, grid=(blocks_per_grid,1,1), block=(threads_per_block,1,1)) 59 | 60 | total_hits = np.sum( hits_d.get() ) 61 | total = np.uint64(total_threads) * np.uint64(iters) 62 | 63 | est_pi_symbolic = Rational(4)*Rational(int(total_hits), int(total) ) 64 | 65 | est_pi = float(est_pi_symbolic.evalf()) 66 | 67 | print("Our Monte Carlo estimate of Pi is : %s" % est_pi) 68 | print("NumPy's Pi constant is: %s " % np.pi) 69 | 70 | print("Our estimate passes NumPy's 'allclose' : %s" % np.allclose(est_pi, np.pi)) 71 | -------------------------------------------------------------------------------- /Chapter08/thrust_dot_product.cu: -------------------------------------------------------------------------------- 1 | #include <thrust/host_vector.h> 2 | #include <thrust/device_vector.h> 3 | #include <iostream> 4 | 5 | using namespace std; 6 | 7 | struct multiply_functor { 8 | 9 | float w; 10 | 11 | multiply_functor(float _w = 1) : w(_w) {} 12 | 13 | __device__ float operator() (const float & x, const float & y) { 14 | return w * x * y; 15 | } 16 | }; 17 | 18 | float dot_product(thrust::device_vector<float> &v, thrust::device_vector<float> &w ) //, thrust::device_vector<float> &z) 19 | { 20 | thrust::device_vector<float> z(v.size()); 21 | 22 | thrust::transform(v.begin(), v.end(), w.begin(), z.begin(), multiply_functor()); 23 | 24 | return thrust::reduce(z.begin(), z.end()); 25 | } 26 | 27 | int main(void) 28 | { 29 | 30 | thrust::device_vector<float> v; 31 | 32 | v.push_back(1.0f); 33 | v.push_back(2.0f); 34 | v.push_back(3.0f); 35 | 36 | thrust::device_vector<float> w(3); 37 | 38 | thrust::fill(w.begin(), w.end(), 1.0f); 39 | 40 | for (int i = 0; i < v.size(); i++) 41 
| cout << "v[" << i << "] == " << v[i] << endl; 42 | 43 | for (int i = 0; i < w.size(); i++) 44 | cout << "w[" << i << "] == " << w[i] << endl; 45 | 46 | cout << "dot_product(v , w) == " << dot_product(v,w) << endl; 47 | 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition 2 | Hands-On GPU Programming with CUDA C and Python 3.x, Second Edition, published by Packt 3 | --------------------------------------------------------------------------------