├── README.md ├── 020_array_sum.py ├── 040_elementwise.py ├── 010_introspection.py ├── 021_array_sum.py └── 030_timing.py /README.md: -------------------------------------------------------------------------------- 1 | ## PyOpenCL Inline Comments Tutorial (In Progress) 2 | 3 | This tutorial is an introduction to parallel programming with Python and OpenCL. The lessons in the tutorial are numbered PyOpenCL scripts with inline comments. 4 | 5 | ### About The Tutorial 6 | 7 | PyOpenCL is a tool that is worth learning. Python allows exceptional clarity-of-expression while OpenCL provides access to all the power modern hardware can deliver. Together they are a great combination. 8 | 9 | # Completed Lessons 10 | 11 | - 010 Introspection - Find out about your computer's OpenCL situation 12 | - 020 Array Sum - Use OpenCL To Add Two Large Random Arrays - Hiding Details 13 | - 021 Array Sum - Use OpenCL To Add Two Large Random Arrays - Showing Details 14 | -------------------------------------------------------------------------------- /020_array_sum.py: -------------------------------------------------------------------------------- 1 | # Use OpenCL To Add Two Random Arrays (This Way Hides Details) 2 | 3 | import pyopencl as cl # Import the OpenCL GPU computing API 4 | import pyopencl.array as pycl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object) 5 | import numpy as np # Import Numpy number tools 6 | 7 | context = cl.create_some_context() # Initialize the Context 8 | queue = cl.CommandQueue(context) # Instantiate a Queue 9 | 10 | a = pycl_array.to_device(queue, np.random.rand(50000).astype(np.float32)) 11 | b = pycl_array.to_device(queue, np.random.rand(50000).astype(np.float32)) 12 | # Create two random pyopencl arrays 13 | c = pycl_array.empty_like(a) # Create an empty pyopencl destination array 14 | 15 | program = cl.Program(context, """ 16 | __kernel void sum(__global const float *a, __global const float *b, __global float *c) 17 | { 18 | int i 
= get_global_id(0); 19 | c[i] = a[i] + b[i]; 20 | }""").build() # Create the OpenCL program 21 | 22 | program.sum(queue, a.shape, None, a.data, b.data, c.data) # Enqueue the program for execution and store the result in c 23 | 24 | print("a: {}".format(a)) 25 | print("b: {}".format(b)) 26 | print("c: {}".format(c)) 27 | # Print all three arrays, to show sum() worked -------------------------------------------------------------------------------- /040_elementwise.py: -------------------------------------------------------------------------------- 1 | # Use OpenCL To Add Two Random Arrays (Using PyOpenCL Arrays and Elementwise) 2 | 3 | import pyopencl as cl # Import the OpenCL GPU computing API 4 | import pyopencl.array as cl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object) 5 | import numpy # Import Numpy number tools 6 | 7 | context = cl.create_some_context() # Initialize the Context 8 | queue = cl.CommandQueue(context) # Instantiate a Queue 9 | 10 | a = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32)) # Create a random pyopencl array 11 | b = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32)) # Create a random pyopencl array 12 | c = cl_array.empty_like(a) # Create an empty pyopencl destination array 13 | 14 | sum = cl.elementwise.ElementwiseKernel(context, "float *a, float *b, float *c", "c[i] = a[i] + b[i]", "sum") 15 | # Create an elementwise kernel object 16 | # - Arguments: a string formatted as a C argument list 17 | # - Operation: a snippet of C that carries out the desired map operation 18 | # - Name: the function name as which the kernel is compiled 19 | 20 | sum(a, b, c) # Call the elementwise kernel 21 | 22 | print("a: {}".format(a)) 23 | print("b: {}".format(b)) 24 | print("c: {}".format(c)) 25 | # Print all three arrays, to show sum() worked -------------------------------------------------------------------------------- /010_introspection.py:
-------------------------------------------------------------------------------- 1 | # Find out about your computer's OpenCL situation 2 | 3 | # Import the OpenCL GPU computing API 4 | import pyopencl as cl 5 | 6 | print('\n' + '=' * 60 + '\nOpenCL Platforms and Devices') 7 | # Print each platform on this computer 8 | for platform in cl.get_platforms(): 9 | print('=' * 60) 10 | print('Platform - Name: ' + platform.name) 11 | print('Platform - Vendor: ' + platform.vendor) 12 | print('Platform - Version: ' + platform.version) 13 | print('Platform - Profile: ' + platform.profile) 14 | # Print each device per-platform 15 | for device in platform.get_devices(): 16 | print(' ' + '-' * 56) 17 | print(' Device - Name: ' + device.name) 18 | print(' Device - Type: ' + cl.device_type.to_string(device.type)) 19 | print(' Device - Max Clock Speed: {0} Mhz'.format(device.max_clock_frequency)) 20 | print(' Device - Compute Units: {0}'.format(device.max_compute_units)) 21 | print(' Device - Local Memory: {0:.0f} KB'.format(device.local_mem_size/1024.0)) 22 | print(' Device - Constant Memory: {0:.0f} KB'.format(device.max_constant_buffer_size/1024.0)) 23 | print(' Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size/1073741824.0)) 24 | print(' Device - Max Buffer/Image Size: {0:.0f} MB'.format(device.max_mem_alloc_size/1048576.0)) 25 | print(' Device - Max Work Group Size: {0:.0f}'.format(device.max_work_group_size)) 26 | print('\n') -------------------------------------------------------------------------------- /021_array_sum.py: -------------------------------------------------------------------------------- 1 | # Use OpenCL To Add Two Random Arrays (This Way Shows Details) 2 | 3 | import pyopencl as cl # Import the OpenCL GPU computing API 4 | import numpy as np # Import Np number tools 5 | 6 | platform = cl.get_platforms()[0] # Select the first platform [0] 7 | device = platform.get_devices()[0] # Select the first device on this platform [0] 8 | context = 
cl.Context([device]) # Create a context with your device 9 | queue = cl.CommandQueue(context) # Create a command queue with your context 10 | 11 | np_a = np.random.rand(50000).astype(np.float32) # Create a random np array 12 | np_b = np.random.rand(50000).astype(np.float32) # Create a random np array 13 | np_c = np.empty_like(np_a) # Create an empty destination array 14 | 15 | cl_a = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR, hostbuf=np_a) 16 | cl_b = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR, hostbuf=np_b) 17 | cl_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, np_c.nbytes) 18 | # Create three buffers (plans for areas of memory on the device) 19 | 20 | kernel = """__kernel void sum(__global float* a, __global float* b, __global float* c) 21 | { 22 | int i = get_global_id(0); 23 | c[i] = a[i] + b[i]; 24 | }""" # Create a kernel (a string containing C-like OpenCL device code) 25 | 26 | program = cl.Program(context, kernel).build() 27 | # Compile the kernel code into an executable OpenCL program 28 | 29 | program.sum(queue, np_a.shape, None, cl_a, cl_b, cl_c) 30 | # Enqueue the program for execution, causing data to be copied to the device 31 | # - queue: the command queue the program will be sent to 32 | # - np_a.shape: a tuple of the arrays' dimensions 33 | # - cl_a, cl_b, cl_c: the memory spaces this program deals with 34 | 35 | np_arrays = [np_a, np_b, np_c] 36 | cl_arrays = [cl_a, cl_b, cl_c] 37 | 38 | for x in range(3): 39 | cl.enqueue_copy(queue, np_arrays[x], cl_arrays[x]) 40 | queue.finish() 41 | # Copy the data for each array back to the host; enqueue_copy's destination comes 42 | # first, so the host (numpy) array must be the first argument — otherwise the host 43 | # data would be copied *to* the device and np_c would never receive the result 44 | 45 | for x in np_arrays: 46 | print(x) 47 | # Print all three host arrays, to show sum() worked -------------------------------------------------------------------------------- /030_timing.py: -------------------------------------------------------------------------------- 1 | # Test the speed of your PyOpenCL program 2 | from time import time # Import time tools 3 | 4 | import pyopencl as cl # Import the
OpenCL GPU computing API 5 | import numpy as np # Import number tools 6 | 7 | a = np.random.rand(1000).astype(np.float32) # Create a random array to add 8 | b = np.random.rand(1000).astype(np.float32) # Create a random array to add 9 | 10 | def cpu_array_sum(a, b): # Sum two arrays on the CPU 11 | c_cpu = np.empty_like(a) # Create the destination array 12 | cpu_start_time = time() # Get the CPU start time 13 | for i in range(1000): 14 | for j in range(1000): # 1000 times add each number and store it 15 | c_cpu[i] = a[i] + b[i] # This add operation happens 1,000,000 times XXX 16 | cpu_end_time = time() # Get the CPU end time 17 | print("CPU Time: {0} s".format(cpu_end_time - cpu_start_time)) # Print how long the CPU took 18 | return c_cpu # Return the sum of the arrays 19 | 20 | def gpu_array_sum(a, b): 21 | context = cl.create_some_context() # Initialize the Context 22 | queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE) # Instantiate a Queue with profiling (timing) enabled 23 | a_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a) 24 | b_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b) 25 | c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, b.nbytes) # Create three buffers (plans for areas of memory on the device) 26 | program = cl.Program(context, """ 27 | __kernel void sum(__global const float *a, __global const float *b, __global float *c) 28 | { 29 | int i = get_global_id(0); 30 | int j; 31 | for(j = 0; j < 1000; j++) 32 | { 33 | c[i] = a[i] + b[i]; 34 | } 35 | }""").build() # Compile the device program 36 | gpu_start_time = time() # Get the GPU start time 37 | event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer) # Enqueue the GPU sum program XXX 38 | event.wait() # Wait until the event finishes XXX 39 | elapsed = 1e-9*(event.profile.end - event.profile.start) # Calculate the time it took to execute the kernel 40 | 
print("GPU Kernel Time: {0} s".format(elapsed)) # Print the time it took to execute the kernel 41 | c_gpu = np.empty_like(a) # Create an empty array the same size as array a 42 | cl.enqueue_copy(queue, c_gpu, c_buffer).wait() # Read back the data from GPU memory into array c_gpu (destination first; enqueue_read_buffer is deprecated and removed in modern PyOpenCL) 43 | gpu_end_time = time() # Get the GPU end time 44 | print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time)) # Print the time the GPU program took, including both memory copies 45 | return c_gpu # Return the sum of the two arrays 46 | 47 | cpu_array_sum(a, b) # Call the function that sums two arrays on the CPU 48 | gpu_array_sum(a, b) # Call the function that sums two arrays on the GPU --------------------------------------------------------------------------------