├── Chapter01 └── mandelbrot0.py ├── Chapter02 └── launch-python-cuda-environment.bat ├── Chapter03 ├── deviceQuery.py ├── gpu_mandelbrot0.py ├── nbody.py ├── simple_element_kernel_example0.py ├── simple_scankernel0.py ├── simple_scankernel1.py └── time_calc0.py ├── Chapter04 ├── conway_gpu.py ├── conway_gpu_syncthreads.py ├── conway_gpu_syncthreads_shared.py ├── naive_prefix.py ├── simple_scalar_multiply_kernel.py └── work-efficient_prefix.py ├── Chapter05 ├── conway_gpu_streams.py ├── gpu_mandelbrot_context_sync.py ├── multi-kernel.py ├── multi-kernel_events.py ├── multi-kernel_multi-thread.py ├── multi-kernel_streams.py ├── simple_context_create.py ├── simple_event_example.py └── single_thread_example.py ├── Chapter06 ├── broken_matrix_ker.py ├── divergence_test.cu ├── hello-world_gpu.py ├── matrix_ker.cu └── matrix_ker.py ├── Chapter07 ├── conv_2d.py ├── cublas_gemm_flops.py └── latte.jpg ├── Chapter08 ├── monte_carlo_integrator.py ├── monte_carlo_pi.py └── thrust_dot_product.cu ├── LICENSE └── README.md /Chapter01/mandelbrot0.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import matplotlib 3 | #this will prevent the figure from popping up 4 | matplotlib.use('Agg') 5 | 6 | from matplotlib import pyplot as plt 7 | import numpy as np 8 | 9 | 10 | def simple_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound): 11 | 12 | real_vals = np.linspace(real_low, real_high, width) 13 | imag_vals = np.linspace(imag_low, imag_high, height) 14 | 15 | # we will represent members as 1, non-members as 0. 16 | 17 | mandelbrot_graph = np.ones((height,width), dtype=np.float32) 18 | 19 | for x in range(width): 20 | 21 | for y in range(height): 22 | 23 | c = np.complex64( real_vals[x] + imag_vals[y] * 1j ) 24 | z = np.complex64(0) 25 | 26 | for i in range(max_iters): 27 | 28 | z = z**2 + c 29 | 30 | if(np.abs(z) > upper_bound): 31 | mandelbrot_graph[y,x] = 0 32 | break 33 | 34 | return mandelbrot_graph 35 | 36 | 37 | if __name__ == '__main__': 38 | 39 | t1 = time() 40 | mandel = simple_mandelbrot(512,512,-2,2,-2,2,256, 2.5) 41 | t2 = time() 42 | mandel_time = t2 - t1 43 | 44 | t1 = time() 45 | fig = plt.figure(1) 46 | plt.imshow(mandel, extent=(-2, 2, -2, 2)) 47 | plt.savefig('mandelbrot.png', dpi=fig.dpi) 48 | t2 = time() 49 | 50 | dump_time = t2 - t1 51 | 52 | print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time)) 53 | print('It took {} seconds to dump the image.'.format(dump_time)) 54 | -------------------------------------------------------------------------------- /Chapter02/launch-python-cuda-environment.bat: -------------------------------------------------------------------------------- 1 | REM This batch script will set up an appropriate Python environment for CUDA GPU programming under Windows. 2 | REM The last line launches a CMD prompt. This can be any environment however. 3 | REM If you wish to use an IDE such as Spyder or Jupyter Notebook, just change the last line to "spyder" 4 | REM or "jupyter-notebook". 
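REM Note: the vcvarsall.bat call below puts the 64-bit MSVC host compiler on the PATH
REM (PyCUDA's nvcc-based compilation needs it on Windows), and activate.bat switches into
REM the Anaconda Python distribution; adjust both paths if Visual Studio or Anaconda are
REM installed elsewhere on your system.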
5 | 
6 | call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
7 | call "C:\Users\%username%\Anaconda3\Scripts\activate.bat" C:\Users\%username%\Anaconda3
8 | cmd
9 | 
--------------------------------------------------------------------------------
/Chapter03/deviceQuery.py:
--------------------------------------------------------------------------------
1 | import pycuda
2 | import pycuda.driver as drv
3 | drv.init()
4 | 
5 | print ('CUDA device query (PyCUDA version) \n')
6 | 
7 | print ('Detected {} CUDA Capable device(s) \n'.format(drv.Device.count()))
8 | 
9 | for i in range(drv.Device.count()):
10 | 
11 |     gpu_device = drv.Device(i)
12 |     print ('Device {}: {}'.format( i, gpu_device.name() ) )
13 |     compute_capability = float( '%d.%d' % gpu_device.compute_capability() )
14 |     print ('\t Compute Capability: {}'.format(compute_capability))
15 |     print ('\t Total Memory: {} megabytes'.format(gpu_device.total_memory()//(1024**2)))
16 | 
17 |     # The following will give us all remaining device attributes as seen
18 |     # in the original deviceQuery.
19 |     # We set up a dictionary as such so that we can easily index
20 |     # the values using a string descriptor.
21 | 
22 |     device_attributes_tuples = gpu_device.get_attributes().items()
23 |     device_attributes = {}
24 | 
25 |     for k, v in device_attributes_tuples:
26 |         device_attributes[str(k)] = v
27 | 
28 |     num_mp = device_attributes['MULTIPROCESSOR_COUNT']
29 | 
30 |     # Cores per multiprocessor is not reported by the GPU!
31 |     # We must use a lookup table based on compute capability.
32 |     # See the following:
33 |     # http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
34 | 
35 |     cuda_cores_per_mp = { 3.0 : 192, 3.2 : 192, 3.5 : 192, 3.7 : 192, 5.0 : 128, 5.1 : 128, 5.2 : 128, 5.3 : 128, 6.0 : 64, 6.1 : 128,\
36 |     6.2 : 128, 7.0 : 64, 7.1 : 64, 7.2 : 64, 7.3 : 64, 7.4 : 64, 7.5 : 64}[compute_capability]
37 | 
38 |     print ('\t ({}) Multiprocessors, ({}) CUDA Cores / Multiprocessor: {} CUDA Cores'.format(num_mp, cuda_cores_per_mp, num_mp*cuda_cores_per_mp))
39 | 
40 |     device_attributes.pop('MULTIPROCESSOR_COUNT')
41 | 
42 |     for k in device_attributes.keys():
43 |         print ('\t {}: {}'.format(k, device_attributes[k]))
44 | 
--------------------------------------------------------------------------------
/Chapter03/gpu_mandelbrot0.py:
--------------------------------------------------------------------------------
1 | from time import time
2 | import matplotlib
3 | #this will prevent the figure from popping up
4 | matplotlib.use('Agg')
5 | from matplotlib import pyplot as plt
6 | import numpy as np
7 | import pycuda.autoinit
8 | from pycuda import gpuarray
9 | from pycuda.elementwise import ElementwiseKernel
10 | 
11 | mandel_ker = ElementwiseKernel(
12 | "pycuda::complex<float> *lattice, float *mandelbrot_graph, int max_iters, float upper_bound",
13 | """
14 | mandelbrot_graph[i] = 1;
15 | 
16 | pycuda::complex<float> c = lattice[i];
17 | pycuda::complex<float> z(0,0);
18 | 
19 | for (int j = 0; j < max_iters; j++)
20 | {
21 | 
22 |     z = z*z + c;
23 | 
24 |     if(abs(z) > upper_bound)
25 |     {
26 |         mandelbrot_graph[i] = 0;
27 |         break;
28 |     }
29 | 
30 | }
31 | 
32 | """,
33 | "mandel_ker")
34 | 
35 | def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound):
36 | 
37 |     # we set up our complex lattice as such
38 |     real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64)
39 |     imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j
40 |     mandelbrot_lattice = 
np.array(real_vals + imag_vals.transpose(), dtype=np.complex64) 41 | 42 | # copy complex lattice to the GPU 43 | mandelbrot_lattice_gpu = gpuarray.to_gpu(mandelbrot_lattice) 44 | 45 | # allocate an empty array on the GPU 46 | mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32) 47 | 48 | mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound)) 49 | 50 | mandelbrot_graph = mandelbrot_graph_gpu.get() 51 | 52 | return mandelbrot_graph 53 | 54 | 55 | if __name__ == '__main__': 56 | 57 | t1 = time() 58 | mandel = gpu_mandelbrot(512,512,-2,2,-2,2,256, 2) 59 | t2 = time() 60 | 61 | mandel_time = t2 - t1 62 | 63 | t1 = time() 64 | fig = plt.figure(1) 65 | plt.imshow(mandel, extent=(-2, 2, -2, 2)) 66 | plt.savefig('mandelbrot.png', dpi=fig.dpi) 67 | t2 = time() 68 | 69 | dump_time = t2 - t1 70 | 71 | print ('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time)) 72 | print ('It took {} seconds to dump the image.'.format(dump_time)) 73 | -------------------------------------------------------------------------------- /Chapter03/nbody.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from time import time 5 | from pycuda.elementwise import ElementwiseKernel 6 | import matplotlib.pyplot as plt 7 | import matplotlib.animation as animation 8 | 9 | 10 | nbody_ker = ElementwiseKernel( 11 | "float *in_x, float *in_y, float *in_v_x, float *in_v_y, \ 12 | float *out_x, float *out_y, float *out_v_x, float *out_v_y, \ 13 | float *m, float t, int num_bodies", 14 | """ 15 | #define G 6.67408313e-11 16 | 17 | float net_force_x = 0.0f; 18 | float net_force_y = 0.0f; 19 | 20 | for(int n=0; n < num_bodies; ++n) { 21 | if (n == i) 22 | continue; 23 | 24 | float r2 = powf(in_x[i] - in_x[n], 2.0f) + powf(in_y[i] - in_y[n], 2.0f); 25 | float r = sqrtf(r2); 26 | 27 | float force = G*m[i]*m[n] / r2; 28 | 29 | float force_x = force * ( in_x[n] - in_x[i] ) / r; 30 | float force_y = force * ( in_y[n] - in_y[i] ) / r; 31 | 32 | net_force_x += force_x; 33 | net_force_y += force_y; 34 | 35 | } 36 | 37 | float a_x = net_force_x / m[i]; 38 | float a_y = net_force_y / m[i]; 39 | 40 | out_x[i] = in_x[i] + in_v_x[i]*t + 0.5f * a_x * powf(t,2.0f); 41 | out_y[i] = in_y[i] + in_v_y[i]*t + 0.5f * a_y * powf(t,2.0f); 42 | 43 | out_v_x[i] = in_v_x[i] + a_x*t; 44 | out_v_y[i] = in_v_y[i] + a_y*t; 45 | """, 46 | "nbody_ker") 47 | 48 | REZ = 128 49 | NUM_BODIES=np.int32(4000) 50 | t=np.float32(0.005) 51 | 52 | in_x = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) + .5 )) 53 | in_y = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) + .5)) 54 | in_v_x = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) - .5)) 55 | in_v_y = gpuarray.to_gpu(np.float32(np.random.random(NUM_BODIES) - .5)) 56 | 57 | out_x = gpuarray.empty_like(in_x) 58 | out_y = gpuarray.empty_like(in_y) 59 | out_v_x = gpuarray.empty_like(in_v_x) 60 | out_v_y = gpuarray.empty_like(in_v_y) 61 | 62 | masses = np.float32(np.random.random(NUM_BODIES)*10000) 63 | m = gpuarray.to_gpu(masses) 64 | 65 | 66 | def xy_to_img(x_coords, y_coords, masses): 67 | 68 | img_out = np.zeros((2*REZ,2*REZ), dtype=np.int32) 69 | 70 | for x, y, mass in zip(x_coords, y_coords, masses): 71 | if (x < 0 or y < 0 or not np.isfinite(x) or not np.isfinite(y)): 72 | continue 73 | int_x = int(np.round(x * REZ)) 74 | int_y = int(np.round(y * REZ)) 75 | 76 | if (int_x < 
2*REZ and int_y < 2*REZ): 77 | img_out[int_x, int_y] += int(mass) 78 | 79 | return img_out 80 | 81 | def update_gpu(frameNum, img, in_x, in_y, in_v_x, in_v_y, out_x, out_y, out_v_x, out_v_y,t, NUM_BODIES, masses): 82 | 83 | if frameNum % 2 == 0: 84 | nbody_ker(in_x,in_y,in_v_x,in_v_y,out_x,out_y,out_v_x,out_v_y,m,t,NUM_BODIES) 85 | img.set_data(xy_to_img(out_x.get(), out_y.get(), masses)) 86 | else: 87 | nbody_ker(out_x,out_y,out_v_x,out_v_y,in_x,in_y,in_v_x,in_v_y,m,t,NUM_BODIES) 88 | img.set_data(xy_to_img(in_x.get(), in_y.get(), masses)) 89 | 90 | return img 91 | 92 | 93 | if __name__ == '__main__': 94 | 95 | fig, ax = plt.subplots() 96 | img = ax.imshow( xy_to_img(in_x.get(), in_y.get(), masses) , interpolation='nearest') 97 | ani = animation.FuncAnimation(fig, update_gpu, fargs=(img, in_x, in_y, in_v_x, in_v_y, out_x, out_y, out_v_x, out_v_y, t, NUM_BODIES, masses) , interval=0, frames=100, save_count=100) 98 | 99 | plt.show() 100 | -------------------------------------------------------------------------------- /Chapter03/simple_element_kernel_example0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from time import time 5 | from pycuda.elementwise import ElementwiseKernel 6 | 7 | host_data = np.float32( np.random.random(50000000) ) 8 | 9 | gpu_2x_ker = ElementwiseKernel( 10 | "float *in, float *out", 11 | "out[i] = 2*in[i];", 12 | "gpu_2x_ker") 13 | 14 | def speedcomparison(): 15 | t1 = time() 16 | host_data_2x = host_data * np.float32(2) 17 | t2 = time() 18 | print('total time to compute on CPU: %f' % (t2 - t1)) 19 | device_data = gpuarray.to_gpu(host_data) 20 | # allocate memory for output 21 | device_data_2x = gpuarray.empty_like(device_data) 22 | t1 = time() 23 | gpu_2x_ker(device_data, device_data_2x) 24 | t2 = time() 25 | from_device = device_data_2x.get() 26 | print('total time to compute on GPU: %f' % (t2 - t1)) 27 | print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) )) 28 | 29 | 30 | if __name__ == '__main__': 31 | speedcomparison() 32 | -------------------------------------------------------------------------------- /Chapter03/simple_scankernel0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from pycuda.scan import InclusiveScanKernel 5 | 6 | seq = np.array([1,2,3,4],dtype=np.int32) 7 | seq_gpu = gpuarray.to_gpu(seq) 8 | sum_gpu = InclusiveScanKernel(np.int32, "a+b") 9 | print(sum_gpu(seq_gpu).get()) 10 | print(np.cumsum(seq)) 11 | -------------------------------------------------------------------------------- /Chapter03/simple_scankernel1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from pycuda.scan import InclusiveScanKernel 5 | 6 | seq = np.array([1,100,-3,-10000, 4, 10000, 66, 14, 21],dtype=np.int32) 7 | seq_gpu = gpuarray.to_gpu(seq) 8 | max_gpu = InclusiveScanKernel(np.int32, "a > b ? 
a : b") 9 | print(max_gpu(seq_gpu).get()[-1]) 10 | print(np.max(seq)) 11 | -------------------------------------------------------------------------------- /Chapter03/time_calc0.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | from time import time 5 | 6 | 7 | host_data = np.float32( np.random.random(50000000) ) 8 | 9 | t1 = time() 10 | host_data_2x = host_data * np.float32(2) 11 | t2 = time() 12 | 13 | print('total time to compute on CPU: %f' % (t2 - t1)) 14 | 15 | 16 | device_data = gpuarray.to_gpu(host_data) 17 | 18 | t1 = time() 19 | device_data_2x = device_data * np.float32( 2 ) 20 | t2 = time() 21 | 22 | from_device = device_data_2x.get() 23 | 24 | 25 | print('total time to compute on GPU: %f' % (t2 - t1)) 26 | print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) )) 27 | -------------------------------------------------------------------------------- /Chapter04/conway_gpu.py: -------------------------------------------------------------------------------- 1 | # Conway's game of life in Python / CUDA C 2 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 3 | 4 | import pycuda.autoinit 5 | import pycuda.driver as drv 6 | from pycuda import gpuarray 7 | from pycuda.compiler import SourceModule 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import matplotlib.animation as animation 11 | 12 | ker = SourceModule(""" 13 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 14 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 15 | 16 | #define _WIDTH ( blockDim.x * gridDim.x ) 17 | #define _HEIGHT ( blockDim.y * gridDim.y ) 18 | 19 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 20 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 21 | 22 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 23 | 24 | // return the number of living neighbors for a given cell 25 | __device__ int nbrs(int x, int y, int * in) 26 | { 27 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 28 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 29 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 30 | } 31 | 32 | __global__ void conway_ker(int * lattice_out, int * lattice ) 33 | { 34 | // x, y are the appropriate values for the cell covered by this thread 35 | int x = _X, y = _Y; 36 | 37 | // count the number of neighbors around the current cell 38 | int n = nbrs(x, y, lattice); 39 | 40 | 41 | // if the current cell is alive, then determine if it lives or dies for the next generation. 42 | if ( lattice[_INDEX(x,y)] == 1) 43 | switch(n) 44 | { 45 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 46 | case 2: 47 | case 3: lattice_out[_INDEX(x,y)] = 1; 48 | break; 49 | default: lattice_out[_INDEX(x,y)] = 0; 50 | } 51 | else if( lattice[_INDEX(x,y)] == 0 ) 52 | switch(n) 53 | { 54 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
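// (recall that the _XM and _YM macros wrap x and y around the lattice edges, so the
// neighbor counts feeding this switch treat the lattice as a torus)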
55 | case 3: lattice_out[_INDEX(x,y)] = 1; 56 | break; 57 | default: lattice_out[_INDEX(x,y)] = 0; 58 | } 59 | 60 | } 61 | """) 62 | 63 | 64 | conway_ker = ker.get_function("conway_ker") 65 | 66 | 67 | def update_gpu(frameNum, img, newLattice_gpu, lattice_gpu, N): 68 | 69 | conway_ker( newLattice_gpu, lattice_gpu, grid=(N//32,N//32,1), block=(32,32,1) ) 70 | 71 | img.set_data(newLattice_gpu.get() ) 72 | 73 | 74 | lattice_gpu[:] = newLattice_gpu[:] 75 | 76 | return img 77 | 78 | 79 | if __name__ == '__main__': 80 | # set lattice size 81 | N = 512 82 | 83 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 84 | lattice_gpu = gpuarray.to_gpu(lattice) 85 | 86 | newLattice_gpu = gpuarray.empty_like(lattice_gpu) 87 | 88 | fig, ax = plt.subplots() 89 | img = ax.imshow(lattice_gpu.get(), interpolation='nearest') 90 | ani = animation.FuncAnimation(fig, update_gpu, fargs=(img, newLattice_gpu, lattice_gpu, N, ) , interval=0, frames=1000, save_count=1000) 91 | 92 | plt.show() 93 | -------------------------------------------------------------------------------- /Chapter04/conway_gpu_syncthreads.py: -------------------------------------------------------------------------------- 1 | # Iterative version of Conway's game of life in Python / CUDA C 2 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 3 | 4 | import pycuda.autoinit 5 | import pycuda.driver as drv 6 | from pycuda import gpuarray 7 | from pycuda.compiler import SourceModule 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | ker = SourceModule(""" 13 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 14 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 15 | 16 | #define _WIDTH ( blockDim.x * gridDim.x ) 17 | #define _HEIGHT ( blockDim.y * gridDim.y ) 18 | 19 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 20 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 21 | 22 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 23 | 24 | // return the number of living neighbors for a given cell 25 | __device__ int nbrs(int x, int y, int * in) 26 | { 27 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 28 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 29 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 30 | } 31 | 32 | __global__ void conway_ker(int * lattice, int iters) 33 | { 34 | // x, y are the appropriate values for the cell covered by this thread 35 | int x = _X, y = _Y; 36 | 37 | for (int i = 0; i < iters; i++) 38 | { 39 | 40 | // count the number of neighbors around the current cell 41 | int n = nbrs(x, y, lattice); 42 | 43 | int cell_value; 44 | 45 | 46 | // if the current cell is alive, then determine if it lives or dies for the next generation. 47 | if ( lattice[_INDEX(x,y)] == 1) 48 | switch(n) 49 | { 50 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 51 | case 2: 52 | case 3: cell_value = 1; 53 | break; 54 | default: cell_value = 0; 55 | } 56 | else if( lattice[_INDEX(x,y)] == 0 ) 57 | switch(n) 58 | { 59 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
60 | case 3: cell_value = 1; 61 | break; 62 | default: cell_value = 0; 63 | } 64 | 65 | __syncthreads(); 66 | lattice[_INDEX(x,y)] = cell_value; 67 | __syncthreads(); 68 | } 69 | 70 | } 71 | """) 72 | 73 | 74 | conway_ker = ker.get_function("conway_ker") 75 | 76 | 77 | 78 | 79 | if __name__ == '__main__': 80 | # set lattice size 81 | N = 32 82 | 83 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 84 | lattice_gpu = gpuarray.to_gpu(lattice) 85 | conway_ker(lattice_gpu, np.int32(100000), grid=(1,1,1), block=(32,32,1)) 86 | fig = plt.figure(1) 87 | plt.imshow(lattice_gpu.get()) 88 | plt.show() 89 | 90 | -------------------------------------------------------------------------------- /Chapter04/conway_gpu_syncthreads_shared.py: -------------------------------------------------------------------------------- 1 | # Iterative Conway's game of life in Python / CUDA C 2 | # this version is meant to illustrate the use of shared kernel memory in CUDA. 3 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 4 | 5 | import pycuda.autoinit 6 | import pycuda.driver as drv 7 | from pycuda import gpuarray 8 | from pycuda.compiler import SourceModule 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from time import time 12 | 13 | shared_ker = SourceModule(""" 14 | #define _iters 1000000 15 | 16 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 17 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 18 | 19 | #define _WIDTH ( blockDim.x * gridDim.x ) 20 | #define _HEIGHT ( blockDim.y * gridDim.y ) 21 | 22 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 23 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 24 | 25 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 26 | 27 | // return the number of living neighbors for a given cell 28 | __device__ int nbrs(int x, int y, int * in) 29 | { 30 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 31 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 32 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 33 | } 34 | 35 | __global__ void conway_ker_shared(int * p_lattice, int iters) 36 | { 37 | // x, y are the appropriate values for the cell covered by this thread 38 | int x = _X, y = _Y; 39 | __shared__ int lattice[32*32]; 40 | 41 | 42 | lattice[_INDEX(x,y)] = p_lattice[_INDEX(x,y)]; 43 | __syncthreads(); 44 | 45 | for (int i = 0; i < iters; i++) 46 | { 47 | 48 | // count the number of neighbors around the current cell 49 | int n = nbrs(x, y, lattice); 50 | 51 | int cell_value; 52 | 53 | 54 | // if the current cell is alive, then determine if it lives or dies for the next generation. 55 | if ( lattice[_INDEX(x,y)] == 1) 56 | switch(n) 57 | { 58 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 59 | case 2: 60 | case 3: cell_value = 1; 61 | break; 62 | default: cell_value = 0; 63 | } 64 | else if( lattice[_INDEX(x,y)] == 0 ) 65 | switch(n) 66 | { 67 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
68 | case 3: cell_value = 1; 69 | break; 70 | default: cell_value = 0; 71 | } 72 | 73 | __syncthreads(); 74 | lattice[_INDEX(x,y)] = cell_value; 75 | __syncthreads(); 76 | 77 | } 78 | 79 | __syncthreads(); 80 | p_lattice[_INDEX(x,y)] = lattice[_INDEX(x,y)]; 81 | __syncthreads(); 82 | 83 | } 84 | """) 85 | 86 | 87 | conway_ker_shared = shared_ker.get_function("conway_ker_shared") 88 | 89 | 90 | if __name__ == '__main__': 91 | # set lattice size 92 | N = 32 93 | 94 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 95 | lattice_gpu = gpuarray.to_gpu(lattice) 96 | 97 | conway_ker_shared(lattice_gpu, np.int32(1000000), grid=(1,1,1), block=(32,32,1)) 98 | 99 | fig = plt.figure(1) 100 | plt.imshow(lattice_gpu.get()) 101 | plt.show() 102 | 103 | 104 | -------------------------------------------------------------------------------- /Chapter04/naive_prefix.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | from pycuda import gpuarray 5 | from pycuda.compiler import SourceModule 6 | from time import time 7 | # this is a naive parallel prefix-sum kernel that uses shared memory 8 | naive_ker = SourceModule(""" 9 | __global__ void naive_prefix(double *vec, double *out) 10 | { 11 | __shared__ double sum_buf[1024]; 12 | int tid = threadIdx.x; 13 | sum_buf[tid] = vec[tid]; 14 | 15 | // begin parallel prefix sum algorithm 16 | 17 | int iter = 1; 18 | for (int i=0; i < 10; i++) 19 | { 20 | __syncthreads(); 21 | if (tid >= iter ) 22 | { 23 | sum_buf[tid] = sum_buf[tid] + sum_buf[tid - iter]; 24 | } 25 | 26 | iter *= 2; 27 | } 28 | 29 | __syncthreads(); 30 | out[tid] = sum_buf[tid]; 31 | __syncthreads(); 32 | 33 | } 34 | """) 35 | naive_gpu = naive_ker.get_function("naive_prefix") 36 | 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | 42 | testvec = np.random.randn(1024).astype(np.float64) 43 | testvec_gpu = gpuarray.to_gpu(testvec) 44 | 45 | outvec_gpu = gpuarray.empty_like(testvec_gpu) 46 | 47 | naive_gpu( testvec_gpu , outvec_gpu, block=(1024,1,1), grid=(1,1,1)) 48 | 49 | total_sum = sum( testvec) 50 | total_sum_gpu = outvec_gpu[-1].get() 51 | 52 | print("Does our kernel work correctly? : {}".format(np.allclose(total_sum_gpu , total_sum) )) 53 | -------------------------------------------------------------------------------- /Chapter04/simple_scalar_multiply_kernel.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | import numpy as np 4 | from pycuda import gpuarray 5 | from pycuda.compiler import SourceModule 6 | 7 | ker = SourceModule(""" 8 | __global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec) 9 | { 10 | int i = threadIdx.x; 11 | outvec[i] = scalar*vec[i]; 12 | } 13 | """) 14 | 15 | scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel") 16 | 17 | testvec = np.random.randn(512).astype(np.float32) 18 | testvec_gpu = gpuarray.to_gpu(testvec) 19 | outvec_gpu = gpuarray.empty_like(testvec_gpu) 20 | 21 | scalar_multiply_gpu( outvec_gpu, np.float32(2), testvec_gpu, block=(512,1,1), grid=(1,1,1)) 22 | 23 | print("Does our kernel work correctly? 
: {}".format(np.allclose(outvec_gpu.get() , 2*testvec) )) 24 | -------------------------------------------------------------------------------- /Chapter04/work-efficient_prefix.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pycuda.autoinit 3 | import pycuda.driver as drv 4 | import numpy as np 5 | from pycuda import gpuarray 6 | from pycuda.compiler import SourceModule 7 | from time import time 8 | 9 | # this is a work-efficent parallel prefix-sum algorithm. 10 | # written by Brian Tuomanen for "Hands On GPU Programming with Python and CUDA" 11 | 12 | # kernel for up-sweep phase 13 | up_ker = SourceModule(""" 14 | __global__ void up_ker(double *x, double *x_old, int k ) 15 | { 16 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 17 | 18 | int _2k = 1 << k; 19 | int _2k1 = 1 << (k+1); 20 | 21 | int j = tid* _2k1; 22 | 23 | x[j + _2k1 - 1] = x_old[j + _2k -1 ] + x_old[j + _2k1 - 1]; 24 | 25 | } 26 | """) 27 | 28 | up_gpu = up_ker.get_function("up_ker") 29 | 30 | # implementation of up-sweep phase 31 | def up_sweep(x): 32 | # let's typecast to be safe. 33 | x = np.float64(x) 34 | x_gpu = gpuarray.to_gpu(np.float64(x) ) 35 | x_old_gpu = x_gpu.copy() 36 | for k in range( int(np.log2(x.size) ) ) : 37 | num_threads = int(np.ceil( x.size / 2**(k+1))) 38 | grid_size = int(np.ceil(num_threads / 32)) 39 | 40 | if grid_size > 1: 41 | block_size = 32 42 | else: 43 | block_size = num_threads 44 | 45 | up_gpu(x_gpu, x_old_gpu, np.int32(k) , block=(block_size,1,1), grid=(grid_size,1,1)) 46 | x_old_gpu[:] = x_gpu[:] 47 | 48 | x_out = x_gpu.get() 49 | return(x_out) 50 | 51 | # kernel for down-sweep phase 52 | down_ker = SourceModule(""" 53 | __global__ void down_ker(double *y, double *y_old, int k) 54 | { 55 | int tid = blockIdx.x*blockDim.x + threadIdx.x; 56 | 57 | int _2k = 1 << k; 58 | int _2k1 = 1 << (k+1); 59 | 60 | int j = tid*_2k1; 61 | 62 | y[j + _2k - 1 ] = y_old[j + _2k1 - 1]; 63 | y[j + _2k1 - 1] = y_old[j + _2k1 - 1] + y_old[j + _2k - 1]; 64 | } 65 | """) 66 | 67 | down_gpu = down_ker.get_function("down_ker") 68 | 69 | 70 | # implementation of down-sweep phase 71 | def down_sweep(y): 72 | y = np.float64(y) 73 | y[-1] = 0 74 | y_gpu = gpuarray.to_gpu(y) 75 | y_old_gpu = y_gpu.copy() 76 | for k in reversed(range(int(np.log2(y.size)))): 77 | num_threads = int(np.ceil( y.size / 2**(k+1))) 78 | grid_size = int(np.ceil(num_threads / 32)) 79 | 80 | if grid_size > 1: 81 | block_size = 32 82 | else: 83 | block_size = num_threads 84 | 85 | down_gpu(y_gpu, y_old_gpu, np.int32(k), block=(block_size,1,1), grid=(grid_size,1,1)) 86 | y_old_gpu[:] = y_gpu[:] 87 | y_out = y_gpu.get() 88 | return(y_out) 89 | 90 | 91 | # full implementation of work-efficient parallel prefix sum 92 | def efficient_prefix(x): 93 | return(down_sweep(up_sweep(x))) 94 | 95 | 96 | 97 | if __name__ == '__main__': 98 | 99 | 100 | testvec = np.random.randn(32*1024).astype(np.float64) 101 | testvec_gpu = gpuarray.to_gpu(testvec) 102 | 103 | outvec_gpu = gpuarray.empty_like(testvec_gpu) 104 | 105 | prefix_sum = np.roll(np.cumsum(testvec), 1) 106 | prefix_sum[0] = 0 107 | 108 | prefix_sum_gpu = efficient_prefix(testvec) 109 | 110 | print("Does our work-efficient prefix work? 
{}".format(np.allclose(prefix_sum_gpu, prefix_sum))) 111 | 112 | 113 | -------------------------------------------------------------------------------- /Chapter05/conway_gpu_streams.py: -------------------------------------------------------------------------------- 1 | # CUDA Stream-based Concurrent Conway's game of life in Python / CUDA C 2 | # written by Brian Tuomanen for "Hands on GPU Programming with Python and CUDA" 3 | 4 | import pycuda.autoinit 5 | import pycuda.driver as drv 6 | from pycuda import gpuarray 7 | from pycuda.compiler import SourceModule 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import matplotlib.animation as animation 11 | 12 | ker = SourceModule(""" 13 | #define _X ( threadIdx.x + blockIdx.x * blockDim.x ) 14 | #define _Y ( threadIdx.y + blockIdx.y * blockDim.y ) 15 | 16 | #define _WIDTH ( blockDim.x * gridDim.x ) 17 | #define _HEIGHT ( blockDim.y * gridDim.y ) 18 | 19 | #define _XM(x) ( (x + _WIDTH) % _WIDTH ) 20 | #define _YM(y) ( (y + _HEIGHT) % _HEIGHT ) 21 | 22 | #define _INDEX(x,y) ( _XM(x) + _YM(y) * _WIDTH ) 23 | 24 | // return the number of living neighbors for a given cell 25 | __device__ int nbrs(int x, int y, int * in) 26 | { 27 | return ( in[ _INDEX(x -1, y+1) ] + in[ _INDEX(x-1, y) ] + in[ _INDEX(x-1, y-1) ] \ 28 | + in[ _INDEX(x, y+1)] + in[_INDEX(x, y - 1)] \ 29 | + in[ _INDEX(x+1, y+1) ] + in[ _INDEX(x+1, y) ] + in[ _INDEX(x+1, y-1) ] ); 30 | } 31 | 32 | __global__ void conway_ker(int * lattice_out, int * lattice ) 33 | { 34 | // x, y are the appropriate values for the cell covered by this thread 35 | int x = _X, y = _Y; 36 | 37 | // count the number of neighbors around the current cell 38 | int n = nbrs(x, y, lattice); 39 | 40 | 41 | // if the current cell is alive, then determine if it lives or dies for the next generation. 42 | if ( lattice[_INDEX(x,y)] == 1) 43 | switch(n) 44 | { 45 | // if the cell is alive: it remains alive only if it has 2 or 3 neighbors. 46 | case 2: 47 | case 3: lattice_out[_INDEX(x,y)] = 1; 48 | break; 49 | default: lattice_out[_INDEX(x,y)] = 0; 50 | } 51 | else if( lattice[_INDEX(x,y)] == 0 ) 52 | switch(n) 53 | { 54 | // a dead cell comes to life only if it has 3 neighbors that are alive. 
55 | case 3: lattice_out[_INDEX(x,y)] = 1; 56 | break; 57 | default: lattice_out[_INDEX(x,y)] = 0; 58 | } 59 | 60 | } 61 | """) 62 | 63 | 64 | conway_ker = ker.get_function("conway_ker") 65 | 66 | 67 | def update_gpu(frameNum, imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent): 68 | 69 | for k in range(num_concurrent): 70 | conway_ker( newLattices_gpu[k], lattices_gpu[k], grid=(N//32,N//32,1), block=(32,32,1), stream=streams[k] ) 71 | 72 | imgs[k].set_data(newLattices_gpu[k].get_async(stream=streams[k]) ) 73 | 74 | lattices_gpu[k].set_async(newLattices_gpu[k], stream=streams[k]) 75 | 76 | 77 | return imgs 78 | 79 | 80 | if __name__ == '__main__': 81 | # set lattice size 82 | N = 128 83 | 84 | num_concurrent = 4 85 | 86 | streams = [] 87 | lattices_gpu = [] 88 | newLattices_gpu = [] 89 | 90 | for k in range(num_concurrent): 91 | streams.append(drv.Stream()) 92 | lattice = np.int32( np.random.choice([1,0], N*N, p=[0.25, 0.75]).reshape(N, N) ) 93 | lattices_gpu.append(gpuarray.to_gpu(lattice)) 94 | newLattices_gpu.append(gpuarray.empty_like(lattices_gpu[k])) 95 | 96 | fig, ax = plt.subplots(nrows=1, ncols=num_concurrent) 97 | imgs = [] 98 | 99 | for k in range(num_concurrent): 100 | imgs.append( ax[k].imshow(lattices_gpu[k].get_async(stream=streams[k]), interpolation='nearest') ) 101 | 102 | ani = animation.FuncAnimation(fig, update_gpu, fargs=(imgs, newLattices_gpu, lattices_gpu, N, streams, num_concurrent) , interval=0, frames=1000, save_count=1000) 103 | 104 | plt.show() 105 | -------------------------------------------------------------------------------- /Chapter05/gpu_mandelbrot_context_sync.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import matplotlib 3 | #this will prevent the figure from popping up 4 | matplotlib.use('Agg') 5 | from matplotlib import pyplot as plt 6 | import numpy as np 7 | import pycuda.autoinit 8 | from pycuda import gpuarray 9 | from pycuda.elementwise import ElementwiseKernel 10 | 11 | mandel_ker = ElementwiseKernel( 12 | "pycuda::complex *lattice, float *mandelbrot_graph, int max_iters, float upper_bound", 13 | """ 14 | mandelbrot_graph[i] = 1; 15 | 16 | pycuda::complex c = lattice[i]; 17 | pycuda::complex z(0,0); 18 | 19 | for (int j = 0; j < max_iters; j++) 20 | { 21 | 22 | z = z*z + c; 23 | 24 | if(abs(z) > upper_bound) 25 | { 26 | mandelbrot_graph[i] = 0; 27 | break; 28 | } 29 | 30 | } 31 | 32 | """, 33 | "mandel_ker") 34 | 35 | def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound): 36 | 37 | # we set up our complex lattice as such 38 | real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64) 39 | imag_vals = np.matrix(np.linspace( imag_high, imag_low, height), dtype=np.complex64) * 1j 40 | mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64) 41 | 42 | # copy complex lattice to the GPU 43 | mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice) 44 | 45 | # synchronize in current context 46 | pycuda.autoinit.context.synchronize() 47 | 48 | # allocate an empty array on the GPU 49 | mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32) 50 | 51 | mandel_ker( mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound)) 52 | 53 | pycuda.autoinit.context.synchronize() 54 | 55 | mandelbrot_graph = mandelbrot_graph_gpu.get_async() 56 | 57 | pycuda.autoinit.context.synchronize() 58 | 59 | return 
mandelbrot_graph
60 | 
61 | 
62 | if __name__ == '__main__':
63 | 
64 |     t1 = time()
65 |     mandel = gpu_mandelbrot(512,512,-2,2,-2,2,256, 2)
66 |     t2 = time()
67 | 
68 |     mandel_time = t2 - t1
69 | 
70 |     t1 = time()
71 |     fig = plt.figure(1)
72 |     plt.imshow(mandel, extent=(-2, 2, -2, 2))
73 |     plt.savefig('mandelbrot.png', dpi=fig.dpi)
74 |     t2 = time()
75 | 
76 |     dump_time = t2 - t1
77 | 
78 |     print('It took {} seconds to calculate the Mandelbrot graph.'.format(mandel_time))
79 |     print('It took {} seconds to dump the image.'.format(dump_time))
80 | 
--------------------------------------------------------------------------------
/Chapter05/multi-kernel.py:
--------------------------------------------------------------------------------
1 | import pycuda.autoinit
2 | import pycuda.driver as drv
3 | from pycuda import gpuarray
4 | from pycuda.compiler import SourceModule
5 | import numpy as np
6 | from time import time
7 | 
8 | num_arrays = 200
9 | array_len = 1024**2
10 | 
11 | ker = SourceModule("""
12 | __global__ void mult_ker(float * array, int array_len)
13 | {
14 |     int thd = blockIdx.x*blockDim.x + threadIdx.x;
15 |     int num_iters = array_len / blockDim.x;
16 | 
17 |     for(int j=0; j < num_iters; j++)
18 |     {
19 |         int i = j * blockDim.x + thd;
20 | 
21 |         for(int k = 0; k < 50; k++)
22 |         {
23 |             array[i] *= 2.0;
24 |             array[i] /= 2.0;
25 |         }
26 |     }
27 | 
28 | }
29 | """)
30 | 
31 | mult_ker = ker.get_function('mult_ker')
32 | 
33 | data = []
34 | data_gpu = []
35 | gpu_out = []
36 | 
37 | # generate random arrays.
38 | for _ in range(num_arrays):
39 |     data.append(np.random.randn(array_len).astype('float32'))
40 | 
41 | t_start = time()
42 | 
43 | # copy arrays to GPU.
44 | for k in range(num_arrays):
45 |     data_gpu.append(gpuarray.to_gpu(data[k]))
46 | 
47 | # process arrays.
48 | for k in range(num_arrays):
49 |     mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1))
50 | 
51 | # copy arrays from GPU.
52 | for k in range(num_arrays):
53 |     gpu_out.append(data_gpu[k].get())
54 | 
55 | t_end = time()
56 | 
57 | for k in range(num_arrays):
58 |     assert (np.allclose(gpu_out[k], data[k]))
59 | 
60 | print('Total time: %f' % (t_end - t_start))
61 | 
--------------------------------------------------------------------------------
/Chapter05/multi-kernel_events.py:
--------------------------------------------------------------------------------
1 | import pycuda.autoinit
2 | import pycuda.driver as drv
3 | from pycuda import gpuarray
4 | from pycuda.compiler import SourceModule
5 | import numpy as np
6 | from time import time
7 | 
8 | num_arrays = 200
9 | array_len = 1024**2
10 | 
11 | ker = SourceModule("""
12 | __global__ void mult_ker(float * array, int array_len)
13 | {
14 |     int thd = blockIdx.x*blockDim.x + threadIdx.x;
15 |     int num_iters = array_len / blockDim.x;
16 |     for(int j=0; j < num_iters; j++)
17 |     {
18 |         int i = j * blockDim.x + thd;
19 |         for(int k = 0; k < 50; k++)
20 |         {
21 |             array[i] *= 2.0;
22 |             array[i] /= 2.0;
23 |         }
24 |     }
25 | }
26 | """)
27 | 
28 | mult_ker = ker.get_function('mult_ker')
29 | 
30 | data = []
31 | data_gpu = []
32 | gpu_out = []
33 | streams = []
34 | start_events = []
35 | end_events = []
36 | 
37 | for _ in range(num_arrays):
38 |     streams.append(drv.Stream())
39 |     start_events.append(drv.Event())
40 |     end_events.append(drv.Event())
41 | 
42 | # generate random arrays.
43 | for _ in range(num_arrays):
44 |     data.append(np.random.randn(array_len).astype('float32'))
45 | 
46 | t_start = time()
47 | 
48 | # copy arrays to GPU.
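# Each array below is copied on its own stream with to_gpu_async, so copies, kernel
# launches, and downloads issued for different arrays are free to overlap; the start
# and end events recorded on each stream then time just that stream's kernel.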
49 | for k in range(num_arrays): 50 | data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k])) 51 | 52 | # process arrays. 53 | for k in range(num_arrays): 54 | start_events[k].record(streams[k]) 55 | mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k]) 56 | for k in range(num_arrays): 57 | end_events[k].record(streams[k]) 58 | 59 | # copy arrays from GPU. 60 | for k in range(num_arrays): 61 | gpu_out.append(data_gpu[k].get_async(stream=streams[k])) 62 | 63 | t_end = time() 64 | 65 | for k in range(num_arrays): 66 | assert (np.allclose(gpu_out[k], data[k])) 67 | 68 | kernel_times = [] 69 | 70 | for k in range(num_arrays): 71 | kernel_times.append(start_events[k].time_till(end_events[k])) 72 | 73 | print('Total time: %f' % (t_end - t_start)) 74 | print('Mean kernel duration (milliseconds): %f' % np.mean(kernel_times)) 75 | print('Mean kernel standard deviation (milliseconds): %f' % np.std(kernel_times)) 76 | -------------------------------------------------------------------------------- /Chapter05/multi-kernel_multi-thread.py: -------------------------------------------------------------------------------- 1 | import pycuda 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from time import time 7 | import threading 8 | 9 | 10 | num_arrays = 10 11 | array_len = 1024**2 12 | 13 | kernel_code = """ 14 | __global__ void mult_ker(float * array, int array_len) 15 | { 16 | int thd = blockIdx.x*blockDim.x + threadIdx.x; 17 | int num_iters = array_len / blockDim.x; 18 | 19 | for(int j=0; j < num_iters; j++) 20 | { 21 | int i = j * blockDim.x + thd; 22 | 23 | for(int k = 0; k < 50; k++) 24 | { 25 | array[i] *= 2.0; 26 | array[i] /= 2.0; 27 | } 28 | } 29 | 30 | } 31 | """ 32 | 33 | class KernelLauncherThread(threading.Thread): 34 | def __init__(self, input_array): 35 | threading.Thread.__init__(self) 36 | self.input_array = input_array 37 | self.output_array = None 38 | 39 | def run(self): 40 | self.dev = drv.Device(0) 41 | self.context = self.dev.make_context() 42 | 43 | self.ker = SourceModule(kernel_code) 44 | 45 | self.mult_ker = self.ker.get_function('mult_ker') 46 | 47 | self.array_gpu = gpuarray.to_gpu(self.input_array) 48 | 49 | self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1)) 50 | 51 | self.output_array = self.array_gpu.get() 52 | 53 | self.context.pop() 54 | 55 | def join(self): 56 | threading.Thread.join(self) 57 | return self.output_array 58 | 59 | drv.init() 60 | 61 | 62 | data = [] 63 | gpu_out = [] 64 | threads = [] 65 | 66 | # generate random arrays and thread objects. 67 | for _ in range(num_arrays): 68 | data.append(np.random.randn(array_len).astype('float32')) 69 | 70 | for k in range(num_arrays): 71 | # create a thread that uses data we just generated 72 | threads.append(KernelLauncherThread(data[k])) 73 | 74 | # launch threads to process arrays. 75 | for k in range(num_arrays): 76 | threads[k].start() 77 | 78 | # get data from launched threads. 
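# join() here is the overridden method of KernelLauncherThread, so besides waiting for
# the thread to finish it also returns that thread's output array. Each thread created
# and popped its own CUDA context inside run(), which is why this script calls
# drv.init() directly instead of importing pycuda.autoinit.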
79 | for k in range(num_arrays): 80 | gpu_out.append(threads[k].join()) 81 | 82 | for k in range(num_arrays): 83 | assert (np.allclose(gpu_out[k], data[k])) 84 | 85 | -------------------------------------------------------------------------------- /Chapter05/multi-kernel_streams.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from time import time 7 | 8 | num_arrays = 200 9 | array_len = 1024**2 10 | 11 | ker = SourceModule(""" 12 | __global__ void mult_ker(float * array, int array_len) 13 | { 14 | int thd = blockIdx.x*blockDim.x + threadIdx.x; 15 | int num_iters = array_len / blockDim.x; 16 | 17 | for(int j=0; j < num_iters; j++) 18 | { 19 | int i = j * blockDim.x + thd; 20 | 21 | for(int k = 0; k < 50; k++) 22 | { 23 | array[i] *= 2.0; 24 | array[i] /= 2.0; 25 | } 26 | } 27 | 28 | } 29 | """) 30 | 31 | mult_ker = ker.get_function('mult_ker') 32 | 33 | data = [] 34 | data_gpu = [] 35 | gpu_out = [] 36 | streams = [] 37 | 38 | for _ in range(num_arrays): 39 | streams.append(drv.Stream()) 40 | 41 | # generate random arrays. 42 | for _ in range(num_arrays): 43 | data.append(np.random.randn(array_len).astype('float32')) 44 | 45 | t_start = time() 46 | 47 | # copy arrays to GPU. 48 | for k in range(num_arrays): 49 | data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k])) 50 | 51 | # process arrays. 52 | for k in range(num_arrays): 53 | mult_ker(data_gpu[k], np.int32(array_len), block=(64,1,1), grid=(1,1,1), stream=streams[k]) 54 | 55 | # copy arrays from GPU. 56 | for k in range(num_arrays): 57 | gpu_out.append(data_gpu[k].get_async(stream=streams[k])) 58 | 59 | t_end = time() 60 | 61 | for k in range(num_arrays): 62 | assert (np.allclose(gpu_out[k], data[k])) 63 | 64 | print('Total time: %f' % (t_end - t_start)) 65 | -------------------------------------------------------------------------------- /Chapter05/simple_context_create.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pycuda import gpuarray 3 | import pycuda.driver as drv 4 | 5 | drv.init() 6 | dev = drv.Device(0) 7 | ctx = dev.make_context() 8 | 9 | x = gpuarray.to_gpu(np.float32([1,2,3])) 10 | print(x.get()) 11 | 12 | ctx.pop() 13 | -------------------------------------------------------------------------------- /Chapter05/simple_event_example.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from time import time 7 | 8 | ker = SourceModule(""" 9 | __global__ void mult_ker(float * array, int array_len) 10 | { 11 | int thd = blockIdx.x*blockDim.x + threadIdx.x; 12 | int num_iters = array_len / blockDim.x; 13 | 14 | for(int j=0; j < num_iters; j++) 15 | { 16 | int i = j * blockDim.x + thd; 17 | 18 | for(int k = 0; k < 50; k++) 19 | { 20 | array[i] *= 2.0; 21 | array[i] /= 2.0; 22 | } 23 | } 24 | } 25 | """) 26 | 27 | mult_ker = ker.get_function('mult_ker') 28 | 29 | array_len = 100*1024**2 30 | 31 | data = np.random.randn(array_len).astype('float32') 32 | data_gpu = gpuarray.to_gpu(data) 33 | 34 | start_event = drv.Event() 35 | end_event = drv.Event() 36 | 37 | start_event.record() 38 | mult_ker(data_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1)) 39 | 
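# the kernel launch above returns immediately; record() enqueues each event onto the
# default stream, so end_event only completes once the kernel itself has finished,
# which is what time_till() measures below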
end_event.record() 40 | 41 | end_event.synchronize() 42 | 43 | print('Has the kernel started yet? {}'.format(start_event.query())) 44 | print('Has the kernel ended yet? {}'.format(end_event.query())) 45 | 46 | print('Kernel execution time in milliseconds: %f ' % start_event.time_till(end_event)) 47 | 48 | -------------------------------------------------------------------------------- /Chapter05/single_thread_example.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | class PointlessExampleThread(threading.Thread): 4 | def __init__(self): 5 | threading.Thread.__init__(self) 6 | self.return_value = None 7 | 8 | def run(self): 9 | print('Hello from the thread you just spawned!') 10 | self.return_value = 123 11 | 12 | def join(self): 13 | threading.Thread.join(self) 14 | return self.return_value 15 | 16 | 17 | NewThread = PointlessExampleThread() 18 | NewThread.start() 19 | thread_output = NewThread.join() 20 | print('The thread completed and returned this value: %s' % thread_output) 21 | -------------------------------------------------------------------------------- /Chapter06/broken_matrix_ker.py: -------------------------------------------------------------------------------- 1 | # Note: this code is intentionally broken!!! 2 | # (This is intended to show a case study of how to debug CUDA code 3 | # using printf.) 4 | 5 | import pycuda.autoinit 6 | import pycuda.driver as drv 7 | from pycuda import gpuarray 8 | from pycuda.compiler import SourceModule 9 | import numpy as np 10 | 11 | 12 | ker = SourceModule(''' 13 | // row-column dot-product for matrix multiplication 14 | __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) 15 | { 16 | 17 | //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N); 18 | float val = 0; 19 | 20 | for (int k=0; k < N; k++) 21 | { 22 | 23 | // broken version 24 | val += matrix_a[ row + k*N ] * matrix_b[ col*N + k]; 25 | 26 | //if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) 27 | // printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]); 28 | 29 | // fixed version 30 | //val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; 31 | } 32 | 33 | return(val); 34 | 35 | } 36 | 37 | // matrix multiplication kernel that is parallelized over row/column tuples. 
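// (in the "broken" version below the blockDim factor is missing from the index
// arithmetic, so threads in different blocks land on overlapping row/column pairs and
// part of the output matrix is never written; uncommenting the printf calls makes
// this visible when the script is run)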
38 | __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
39 | {
40 | 
41 |     // broken version
42 |     int row = blockIdx.x + threadIdx.x;
43 |     int col = blockIdx.y + threadIdx.y;
44 | 
45 |     // fixed version
46 |     //int row = blockIdx.x*blockDim.x + threadIdx.x;
47 |     //int col = blockIdx.y*blockDim.y + threadIdx.y;
48 | 
49 |     //printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col);
50 | 
51 | 
52 |     // broken version
53 |     output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, col, row, N);
54 | 
55 |     // fixed version
56 |     //output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);
57 | 
58 | 
59 | }
60 | ''')
61 | 
62 | matrix_ker = ker.get_function('matrix_mult_ker')
63 | 
64 | test_a = np.float32( [range(1,5)] * 4 )
65 | test_b = np.float32([range(14,10, -1)]*4 )
66 | 
67 | output_mat = np.matmul(test_a, test_b)
68 | 
69 | test_a_gpu = gpuarray.to_gpu(test_a)
70 | test_b_gpu = gpuarray.to_gpu(test_b)
71 | output_mat_gpu = gpuarray.empty_like(test_a_gpu)
72 | 
73 | matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1))
74 | 
75 | assert( np.allclose(output_mat_gpu.get(), output_mat) )
76 | 
77 | 
78 | 
79 | 
80 | 
--------------------------------------------------------------------------------
/Chapter06/divergence_test.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | 
4 | __global__ void divergence_test_ker()
5 | {
6 |     if( threadIdx.x % 2 == 0)
7 |         printf("threadIdx.x %d : This is an even thread.\n", threadIdx.x);
8 |     else
9 |         printf("threadIdx.x %d : This is an odd thread.\n", threadIdx.x);
10 | }
11 | 
12 | __host__ int main()
13 | {
14 |     cudaSetDevice(0);
15 |     divergence_test_ker<<<1, 32>>>();
16 |     cudaDeviceSynchronize();
17 |     cudaDeviceReset();
18 | }
19 | 
--------------------------------------------------------------------------------
/Chapter06/hello-world_gpu.py:
--------------------------------------------------------------------------------
1 | import pycuda.autoinit
2 | import pycuda.driver as drv
3 | from pycuda import gpuarray
4 | from pycuda.compiler import SourceModule
5 | 
6 | ker = SourceModule('''
7 | __global__ void hello_world_ker()
8 | {
9 |     printf("Hello world from thread %d, in block %d!\\n", threadIdx.x, blockIdx.x);
10 | 
11 |     __syncthreads();
12 | 
13 |     if(threadIdx.x == 0 && blockIdx.x == 0)
14 |     {
15 |         printf("-------------------------------------\\n");
16 |         printf("This kernel was launched over a grid consisting of %d blocks,\\n", gridDim.x);
17 |         printf("where each block has %d threads.\\n", blockDim.x);
18 |     }
19 | }
20 | ''')
21 | 
22 | hello_ker = ker.get_function("hello_world_ker")
23 | hello_ker( block=(5,1,1), grid=(2,1,1) )
24 | 
--------------------------------------------------------------------------------
/Chapter06/matrix_ker.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 | #define _EPSILON 0.001
5 | #define _ABS(x) ( x > 0.0f ? 
x : -x ) 6 | 7 | __host__ int allclose(float *A, float *B, int len) 8 | { 9 | 10 | int returnval = 0; 11 | 12 | for (int i = 0; i < len; i++) 13 | { 14 | if ( _ABS(A[i] - B[i]) > _EPSILON ) 15 | { 16 | returnval = -1; 17 | break; 18 | } 19 | } 20 | 21 | return(returnval); 22 | } 23 | 24 | 25 | // row-column dot-product for matrix multiplication 26 | __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) 27 | { 28 | float val = 0; 29 | 30 | for (int k=0; k < N; k++) 31 | { 32 | val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; 33 | } 34 | 35 | return(val); 36 | } 37 | 38 | // matrix multiplication kernel that is parallelized over row/column tuples. 39 | __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N) 40 | { 41 | 42 | int row = blockIdx.x*blockDim.x + threadIdx.x; 43 | int col = blockIdx.y*blockDim.y + threadIdx.y; 44 | 45 | output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N); 46 | } 47 | 48 | 49 | __host__ int main() 50 | { 51 | 52 | // Initialize to use first GPU. 53 | cudaSetDevice(0); 54 | 55 | // this indicates the width/height of the matrices 56 | int N = 4; 57 | 58 | // this will indicate how many bytes to allocate to store a test or output matrix 59 | int num_bytes = sizeof(float)*N*N; 60 | 61 | // input test matrix A 62 | float h_A[] = { 1.0, 2.0, 3.0, 4.0, \ 63 | 1.0, 2.0, 3.0, 4.0, \ 64 | 1.0, 2.0, 3.0, 4.0, \ 65 | 1.0, 2.0, 3.0, 4.0 }; 66 | 67 | // input test matrix B 68 | float h_B[] = { 14.0, 13.0, 12.0, 11.0, \ 69 | 14.0, 13.0, 12.0, 11.0, \ 70 | 14.0, 13.0, 12.0, 11.0, \ 71 | 14.0, 13.0, 12.0, 11.0 }; 72 | 73 | // expected output of A times B 74 | float h_AxB[] = { 140.0, 130.0, 120.0, 110.0, \ 75 | 140.0, 130.0, 120.0, 110.0, \ 76 | 140.0, 130.0, 120.0, 110.0, \ 77 | 140.0, 130.0, 120.0, 110.0 }; 78 | 79 | 80 | // these pointers will be used for the GPU. 81 | // (notice how we use normal float pointers) 82 | float * d_A; 83 | float * d_B; 84 | float * d_output; 85 | 86 | // allocate memory for the test matrices on the GPU 87 | cudaMalloc((float **) &d_A, num_bytes); 88 | cudaMalloc((float **) &d_B, num_bytes); 89 | 90 | // copy the test matrices to the GPU 91 | cudaMemcpy(d_A, h_A, num_bytes, cudaMemcpyHostToDevice); 92 | cudaMemcpy(d_B, h_B, num_bytes, cudaMemcpyHostToDevice); 93 | 94 | // allocate memory for output on GPU 95 | cudaMalloc((float **) &d_output, num_bytes); 96 | 97 | // this will store the output from the GPU 98 | float * h_output; 99 | h_output = (float *) malloc(num_bytes); 100 | 101 | // setup our block and grid launch parameters with the dim3 class. 102 | dim3 block(2,2,1); 103 | dim3 grid(2,2,1); 104 | 105 | // launch our kernel 106 | matrix_mult_ker <<< grid, block >>> (d_A, d_B, d_output, N); 107 | 108 | // synchronize on the host, to ensure our kernel has finished executing. 109 | cudaDeviceSynchronize(); 110 | 111 | // copy output from device to host. 112 | cudaMemcpy(h_output, d_output, num_bytes, cudaMemcpyDeviceToHost); 113 | 114 | // synchronize again. 115 | cudaDeviceSynchronize(); 116 | 117 | // free arrays on device. 118 | cudaFree(d_A); 119 | cudaFree(d_B); 120 | cudaFree(d_output); 121 | 122 | // reset the GPU. 123 | cudaDeviceReset(); 124 | 125 | 126 | // Check to see if we got the expected output. 127 | // in both cases, remember to de-allocate h_output before returning. 128 | 129 | if (allclose(h_AxB, h_output, N*N) < 0) 130 | { 131 | printf("Error! 
Output of kernel does not match expected output.\n"); 132 | free(h_output); 133 | return(-1); 134 | } 135 | else 136 | { 137 | printf("Success! Output of kernel matches expected output.\n"); 138 | free(h_output); 139 | return(0); 140 | } 141 | 142 | 143 | } 144 | -------------------------------------------------------------------------------- /Chapter06/matrix_ker.py: -------------------------------------------------------------------------------- 1 | # This program is the "fixed" version of broken_matrix_ker.py 2 | 3 | # This is to be used for an exercise where this code is translated to 4 | # a pure CUDA-C version. 5 | 6 | import pycuda.autoinit 7 | import pycuda.driver as drv 8 | from pycuda import gpuarray 9 | from pycuda.compiler import SourceModule 10 | import numpy as np 11 | 12 | 13 | ker = SourceModule(''' 14 | // row-column dot-product for matrix multiplication 15 | __device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N) 16 | { 17 | float val = 0; 18 | 19 | for (int k=0; k < N; k++) 20 | { 21 | val += matrix_a[ row*N + k ] * matrix_b[ col + k*N]; 22 | } 23 | 24 | return(val); 25 | 26 | } 27 | 28 | // matrix multiplication kernel that is parallelized over row/column tuples. 29 | __global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N) 30 | { 31 | 32 | int row = blockIdx.x*blockDim.x + threadIdx.x; 33 | int col = blockIdx.y*blockDim.y + threadIdx.y; 34 | 35 | output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N); 36 | 37 | } 38 | ''') 39 | 40 | matrix_ker = ker.get_function('matrix_mult_ker') 41 | 42 | test_a = np.float32([range(1,5)] * 4) 43 | test_b = np.float32([range(14,10, -1)]*4 ) 44 | 45 | output_mat = np.matmul(test_a, test_b) 46 | 47 | test_a_gpu = gpuarray.to_gpu(test_a) 48 | test_b_gpu = gpuarray.to_gpu(test_b) 49 | output_mat_gpu = gpuarray.empty_like(test_a_gpu) 50 | 51 | matrix_ker(test_a_gpu, test_b_gpu, output_mat_gpu, np.int32(4), block=(2,2,1), grid=(2,2,1)) 52 | 53 | assert(np.allclose(output_mat_gpu.get(), output_mat) ) 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /Chapter07/conv_2d.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pycuda.autoinit 3 | from pycuda import gpuarray 4 | import numpy as np 5 | from skcuda import fft 6 | from skcuda import linalg 7 | from matplotlib import pyplot as plt 8 | 9 | 10 | def cufft_conv(x , y): 11 | 12 | x = x.astype(np.complex64) 13 | y = y.astype(np.complex64) 14 | 15 | if (x.shape != y.shape): 16 | return -1 17 | 18 | plan = fft.Plan(x.shape, np.complex64, np.complex64) 19 | inverse_plan = fft.Plan(x.shape, np.complex64, np.complex64) 20 | 21 | x_gpu = gpuarray.to_gpu(x) 22 | y_gpu = gpuarray.to_gpu(y) 23 | 24 | x_fft = gpuarray.empty_like(x_gpu, dtype=np.complex64) 25 | y_fft = gpuarray.empty_like(y_gpu, dtype=np.complex64) 26 | out_gpu = gpuarray.empty_like(x_gpu, dtype=np.complex64) 27 | 28 | fft.fft(x_gpu, x_fft, plan) 29 | fft.fft(y_gpu, y_fft, plan) 30 | 31 | 32 | linalg.multiply(x_fft, y_fft, overwrite=True) 33 | 34 | fft.ifft(y_fft, out_gpu, inverse_plan, scale=True) 35 | 36 | conv_out = out_gpu.get() 37 | 38 | return conv_out 39 | 40 | 41 | def conv_2d(ker, img): 42 | 43 | padded_ker = np.zeros( (img.shape[0] + 2*ker.shape[0], img.shape[1] + 2*ker.shape[1] )).astype(np.float32) 44 | 45 | padded_ker[:ker.shape[0], :ker.shape[1]] = ker 46 | 47 | padded_ker = np.roll(padded_ker, 
shift=-ker.shape[0]//2, axis=0) 48 | padded_ker = np.roll(padded_ker, shift=-ker.shape[1]//2, axis=1) 49 | 50 | padded_img = np.zeros_like(padded_ker).astype(np.float32) 51 | 52 | padded_img[ker.shape[0]:-ker.shape[0], ker.shape[1]:-ker.shape[1]] = img 53 | 54 | out_ = cufft_conv(padded_ker, padded_img) 55 | 56 | output = out_[ker.shape[0]:-ker.shape[0], ker.shape[1]:-ker.shape[1]] 57 | 58 | return output 59 | 60 | gaussian_filter = lambda x, y, sigma : (1 / np.sqrt(2*np.pi*(sigma**2)) )*np.exp( -(x**2 + y**2) / (2 * (sigma**2) )) 61 | 62 | def gaussian_ker(sigma): 63 | ker_ = np.zeros((2*sigma+1, 2*sigma+1)) 64 | 65 | for i in range(2*sigma + 1): 66 | for j in range(2*sigma + 1): 67 | ker_[i,j] = gaussian_filter(i - sigma, j - sigma, sigma) 68 | 69 | total_ = np.sum(ker_.ravel()) 70 | 71 | ker_ = ker_ / total_ 72 | 73 | return ker_ 74 | 75 | 76 | if __name__ == '__main__': 77 | 78 | latte = np.float32(plt.imread('latte.jpg')) / 255 79 | latte_blurred = np.zeros_like(latte) 80 | ker = gaussian_ker(30) 81 | 82 | for k in range(3): 83 | latte_blurred[:,:,k] = conv_2d(ker, latte[:,:,k]) 84 | 85 | 86 | fig, (ax0, ax1) = plt.subplots(1,2) 87 | fig.suptitle('Gaussian Filtering', fontsize=20) 88 | ax0.set_title('Before') 89 | ax0.axis('off') 90 | ax0.imshow(latte) 91 | ax1.set_title('After') 92 | ax1.axis('off') 93 | ax1.imshow(latte_blurred) 94 | plt.tight_layout() 95 | plt.subplots_adjust(top=.85) 96 | plt.show() 97 | -------------------------------------------------------------------------------- /Chapter07/cublas_gemm_flops.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | from pycuda import gpuarray 3 | import numpy as np 4 | from skcuda import cublas 5 | from time import time 6 | 7 | m = 5000 8 | n = 10000 9 | k = 10000 10 | 11 | 12 | def compute_gflops(precision='S'): 13 | 14 | 15 | if precision=='S': 16 | float_type = 'float32' 17 | elif precision=='D': 18 | float_type = 'float64' 19 | else: 20 | return -1 21 | 22 | 23 | A = np.random.randn(m, k).astype(float_type) 24 | B = np.random.randn(k, n).astype(float_type) 25 | C = np.random.randn(m, n).astype(float_type) 26 | 27 | A_cm = A.T.copy() 28 | B_cm = B.T.copy() 29 | C_cm = C.T.copy() 30 | 31 | A_gpu = gpuarray.to_gpu(A_cm) 32 | B_gpu = gpuarray.to_gpu(B_cm) 33 | C_gpu = gpuarray.to_gpu(C_cm) 34 | 35 | alpha = np.random.randn() 36 | beta = np.random.randn() 37 | 38 | transa = cublas._CUBLAS_OP['N'] 39 | transb = cublas._CUBLAS_OP['N'] 40 | 41 | lda = m 42 | ldb = k 43 | ldc = m 44 | 45 | t = time() 46 | handle = cublas.cublasCreate() 47 | 48 | 49 | exec('cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, \ 50 | B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision) 51 | 52 | cublas.cublasDestroy(handle) 53 | t = time() - t 54 | 55 | gflops = 2*m*n*(k+1)*(10**-9) / t 56 | 57 | return gflops 58 | 59 | if __name__ == '__main__': 60 | print('Single-precision performance: %s GFLOPS' % compute_gflops('S')) 61 | print('Double-precision performance: %s GFLOPS' % compute_gflops('D')) 62 | -------------------------------------------------------------------------------- /Chapter07/latte.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition/36ddc06599d8fea62db05a6dc35de06274c4c0d2/Chapter07/latte.jpg -------------------------------------------------------------------------------- 
/Chapter08/monte_carlo_integrator.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | 7 | # https://docs.nvidia.com/cuda/cuda-math-api/index.html 8 | 9 | MonteCarloKernelTemplate = ''' 10 | #include <curand_kernel.h> 11 | 12 | #define ULL unsigned long long 13 | #define _R(z) ( 1.0f / (z) ) 14 | #define _P2(z) ( (z) * (z) ) 15 | 16 | // p stands for "precision" (single or double) 17 | __device__ inline %(p)s f(%(p)s x) 18 | { 19 | %(p)s y; 20 | 21 | %(math_function)s; 22 | 23 | return y; 24 | } 25 | 26 | 27 | extern "C" { 28 | __global__ void monte_carlo(int iters, %(p)s lo, %(p)s hi, %(p)s * ys_out) 29 | { 30 | curandState cr_state; 31 | 32 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 33 | 34 | int num_threads = blockDim.x * gridDim.x; 35 | 36 | %(p)s t_width = (hi - lo) / ( %(p)s ) num_threads; 37 | 38 | %(p)s density = ( ( %(p)s ) iters ) / t_width; 39 | 40 | %(p)s t_lo = t_width*tid + lo; 41 | %(p)s t_hi = t_lo + t_width; 42 | 43 | 44 | curand_init( (ULL) clock() + (ULL) tid, (ULL) 0, (ULL) 0, &cr_state); 45 | 46 | %(p)s y, y_sum = 0.0f; 47 | 48 | 49 | %(p)s rand_val, x; 50 | for (int i=0; i < iters; i++) 51 | { 52 | rand_val = curand_uniform%(p_curand)s(&cr_state); 53 | 54 | x = t_lo + t_width * rand_val; 55 | 56 | y_sum += f(x); 57 | } 58 | 59 | y = y_sum / density; 60 | 61 | ys_out[tid] = y; 62 | } 63 | 64 | } 65 | ''' 66 | 67 | 68 | class MonteCarloIntegrator: 69 | 70 | def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi, samples_per_thread=10**5, num_blocks=100): 71 | 72 | self.math_function = math_function 73 | 74 | if precision in [None, 's', 'S', 'single', np.float32]: 75 | self.precision = 'float' 76 | self.numpy_precision = np.float32 77 | self.p_curand = '' 78 | elif precision in ['d','D', 'double', np.float64]: 79 | self.precision = 'double' 80 | self.numpy_precision = np.float64 81 | self.p_curand = '_double' 82 | else: 83 | raise Exception('precision is invalid datatype!') 84 | 85 | if (hi - lo <= 0): 86 | raise Exception('hi - lo <= 0!') 87 | else: 88 | self.hi = hi 89 | self.lo = lo 90 | 91 | MonteCarloDict = {'p' : self.precision, 'p_curand' : self.p_curand, 'math_function' : self.math_function} 92 | 93 | self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict 94 | 95 | self.ker = SourceModule(no_extern_c=True , options=['-w'], source=self.MonteCarloCode) 96 | 97 | self.f = self.ker.get_function('monte_carlo') 98 | 99 | self.num_blocks = num_blocks 100 | 101 | self.samples_per_thread = samples_per_thread 102 | 103 | 104 | def definite_integral(self, lo=None, hi=None, samples_per_thread=None, num_blocks=None): 105 | 106 | if lo is None or hi is None: 107 | lo = self.lo 108 | hi = self.hi 109 | 110 | if samples_per_thread is None: 111 | samples_per_thread = self.samples_per_thread 112 | 113 | if num_blocks is None: 114 | num_blocks = self.num_blocks 115 | grid = (num_blocks,1,1) 116 | else: 117 | grid = (num_blocks,1,1) 118 | 119 | block = (32,1,1) 120 | 121 | num_threads = 32*num_blocks 122 | 123 | self.ys = gpuarray.empty((num_threads,) , dtype=self.numpy_precision) 124 | 125 | self.f(np.int32(samples_per_thread), self.numpy_precision(lo), self.numpy_precision(hi), self.ys, block=block, grid=grid) 126 | 127 | self.nintegral = np.sum(self.ys.get() ) 128 | 129 | return np.sum(self.nintegral) 130 | 131 | 132 | 133 | if __name__ == '__main__': 134 | 135 | 
integral_tests = [('y =log(x)*_P2(sin(x))', 11.733 , 18.472, 8.9999), ('y = _R( 1 + sinh(2*x)*_P2(log(x)) )', .9, 4, .584977), ('y = (cosh(x)*sin(x))/ sqrt( pow(x,3) + _P2(sin(x)))', 1.85, 4.81, -3.34553) ] 136 | 137 | 138 | for f, lo, hi, expected in integral_tests: 139 | mci = MonteCarloIntegrator(math_function=f, precision='d', lo=lo, hi=hi) 140 | print('The Monte Carlo numerical integration of the function\n \t f: x -> %s \n \t from x = %s to x = %s is : %s ' % (f, lo, hi, mci.definite_integral())) 141 | print('where the expected value is : %s\n' % expected) 142 | -------------------------------------------------------------------------------- /Chapter08/monte_carlo_pi.py: -------------------------------------------------------------------------------- 1 | import pycuda.autoinit 2 | import pycuda.driver as drv 3 | from pycuda import gpuarray 4 | from pycuda.compiler import SourceModule 5 | import numpy as np 6 | from sympy import Rational 7 | 8 | ker = SourceModule(no_extern_c=True ,source=''' 9 | #include <curand_kernel.h> 10 | #define _PYTHAG(a,b) (a*a + b*b) 11 | #define ULL unsigned long long 12 | 13 | extern "C" { 14 | 15 | __global__ void estimate_pi(ULL iters, ULL * hits) 16 | { 17 | 18 | curandState cr_state; 19 | 20 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 21 | 22 | curand_init( (ULL) clock() + (ULL) tid, (ULL) 0, \ 23 | (ULL) 0, &cr_state); 24 | 25 | float x, y; 26 | 27 | for(ULL i=0; i < iters; i++) 28 | { 29 | 30 | x = curand_uniform(&cr_state); 31 | y = curand_uniform(&cr_state); 32 | 33 | 34 | if(_PYTHAG(x,y) <= 1.0f) 35 | hits[tid]++; 36 | } 37 | 38 | return; 39 | 40 | } 41 | 42 | }// (End of 'extern "C"' here) 43 | ''') 44 | 45 | 46 | 47 | pi_ker = ker.get_function("estimate_pi") 48 | 49 | threads_per_block = 32 50 | blocks_per_grid = 512 51 | 52 | total_threads = threads_per_block * blocks_per_grid 53 | 54 | hits_d = gpuarray.zeros((total_threads,),dtype=np.uint64) 55 | 56 | iters = 2**24 57 | 58 | pi_ker(np.uint64(iters), hits_d, grid=(blocks_per_grid,1,1), block=(threads_per_block,1,1)) 59 | 60 | total_hits = np.sum( hits_d.get() ) 61 | total = np.uint64(total_threads) * np.uint64(iters) 62 | 63 | est_pi_symbolic = Rational(4)*Rational(int(total_hits), int(total) ) 64 | 65 | est_pi = float(est_pi_symbolic.evalf()) 66 | 67 | print("Our Monte Carlo estimate of Pi is : %s" % est_pi) 68 | print("NumPy's Pi constant is: %s " % np.pi) 69 | 70 | print("Our estimate passes NumPy's 'allclose' : %s" % np.allclose(est_pi, np.pi)) 71 | -------------------------------------------------------------------------------- /Chapter08/thrust_dot_product.cu: -------------------------------------------------------------------------------- 1 | #include <thrust/host_vector.h> 2 | #include <thrust/device_vector.h> 3 | #include <iostream> 4 | 5 | using namespace std; 6 | 7 | struct multiply_functor { 8 | 9 | float w; 10 | 11 | multiply_functor(float _w = 1) : w(_w) {} 12 | 13 | __device__ float operator() (const float & x, const float & y) { 14 | return w * x * y; 15 | } 16 | }; 17 | 18 | float dot_product(thrust::device_vector<float> &v, thrust::device_vector<float> &w ) //, thrust::device_vector<float> &z) 19 | { 20 | thrust::device_vector<float> z(v.size()); 21 | 22 | thrust::transform(v.begin(), v.end(), w.begin(), z.begin(), multiply_functor()); 23 | 24 | return thrust::reduce(z.begin(), z.end()); 25 | } 26 | 27 | int main(void) 28 | { 29 | 30 | thrust::device_vector<float> v; 31 | 32 | v.push_back(1.0f); 33 | v.push_back(2.0f); 34 | v.push_back(3.0f); 35 | 36 | thrust::device_vector<float> w(3); 37 | 38 | thrust::fill(w.begin(), w.end(), 1.0f); 39 | 40 | for (int i = 0; i < v.size(); i++) 41 
| cout << "v[" << i << "] == " << v[i] << endl; 42 | 43 | for (int i = 0; i < w.size(); i++) 44 | cout << "w[" << i << "] == " << w[i] << endl; 45 | 46 | cout << "dot_product(v , w) == " << dot_product(v,w) << endl; 47 | 48 | return 0; 49 | } 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hands-On-GPU-Programming-with-CUDA-C-and-Python-3.x-Second-Edition 2 | Hands-On GPU Programming with CUDA C and Python 3.x, Second Edition, published by Packt 3 | --------------------------------------------------------------------------------