├── README.md ├── 020_array_sum.py ├── 040_elementwise.py ├── 010_introspection.py ├── 021_array_sum.py └── 030_timing.py /README.md: -------------------------------------------------------------------------------- 1 | ## PyOpenCL Inline Comments Tutorial (In Progress) 2 | 3 | This tutorial is an introduction to parallel programming with Python and OpenCL. The lessons in the tutorial are numbered PyOpenCL scripts with inline comments. 4 | 5 | ### About The Tutorial 6 | 7 | PyOpenCL is a tool that is worth learning. Python allows exceptional clarity-of-expression while OpenCL provides access to all the power modern hardware can deliver. Together they are a great combination. 8 | 9 | # Completed Lessons 10 | 11 | - 010 Introspection - Find out about your computer's OpenCL situation 12 | - 020 Array Sum - Use OpenCL To Add Two Large Random Arrays - Hiding Details 13 | - 021 Array Sum - Use OpenCL To Add Two Large Random Arrays - Showing Details 14 | -------------------------------------------------------------------------------- /020_array_sum.py: -------------------------------------------------------------------------------- 1 | # Use OpenCL To Add Two Random Arrays (This Way Hides Details) 2 | 3 | import pyopencl as cl # Import the OpenCL GPU computing API 4 | import pyopencl.array as pycl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object) 5 | import numpy as np # Import Numpy number tools 6 | 7 | context = cl.create_some_context() # Initialize the Context 8 | queue = cl.CommandQueue(context) # Instantiate a Queue 9 | 10 | a = pycl_array.to_device(queue, np.random.rand(50000).astype(np.float32)) 11 | b = pycl_array.to_device(queue, np.random.rand(50000).astype(np.float32)) 12 | # Create two random pyopencl arrays 13 | c = pycl_array.empty_like(a) # Create an empty pyopencl destination array 14 | 15 | program = cl.Program(context, """ 16 | __kernel void sum(__global const float *a, __global const float *b, __global float *c) 17 | { 18 | int i 
= get_global_id(0); 19 | c[i] = a[i] + b[i]; 20 | }""").build() # Create the OpenCL program 21 | 22 | program.sum(queue, a.shape, None, a.data, b.data, c.data) # Enqueue the program for execution and store the result in c 23 | 24 | print("a: {}".format(a)) 25 | print("b: {}".format(b)) 26 | print("c: {}".format(c)) 27 | # Print all three arrays, to show sum() worked -------------------------------------------------------------------------------- /040_elementwise.py: -------------------------------------------------------------------------------- 1 | # Use OpenCL To Add Two Random Arrays (Using PyOpenCL Arrays and Elementwise) 2 | 3 | import pyopencl as cl # Import the OpenCL GPU computing API 4 | import pyopencl.array as cl_array # Import PyOpenCL Array (a Numpy array plus an OpenCL buffer object) 5 | import numpy # Import Numpy number tools 6 | 7 | context = cl.create_some_context() # Initialize the Context 8 | queue = cl.CommandQueue(context) # Instantiate a Queue 9 | 10 | a = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32)) # Create a random pyopencl array 11 | b = cl_array.to_device(queue, numpy.random.randn(10).astype(numpy.float32)) # Create a random pyopencl array 12 | c = cl_array.empty_like(a) # Create an empty pyopencl destination array 13 | 14 | sum = cl.elementwise.ElementwiseKernel(context, "float *a, float *b, float *c", "c[i] = a[i] + b[i]", "sum") 15 | # Create an elementwise kernel object 16 | # - Arguments: a string formatted as a C argument list 17 | # - Operation: a snippet of C that carries out the desired map operation 18 | # - Name: the function name as which the kernel is compiled 19 | 20 | sum(a, b, c) # Call the elementwise kernel 21 | 22 | print("a: {}".format(a)) 23 | print("b: {}".format(b)) 24 | print("c: {}".format(c)) 25 | # Print all three arrays, to show sum() worked -------------------------------------------------------------------------------- /010_introspection.py:
-------------------------------------------------------------------------------- 1 | # Find out about your computer's OpenCL situation 2 | 3 | # Import the OpenCL GPU computing API 4 | import pyopencl as cl 5 | 6 | print('\n' + '=' * 60 + '\nOpenCL Platforms and Devices') 7 | # Print each platform on this computer 8 | for platform in cl.get_platforms(): 9 | print('=' * 60) 10 | print('Platform - Name: ' + platform.name) 11 | print('Platform - Vendor: ' + platform.vendor) 12 | print('Platform - Version: ' + platform.version) 13 | print('Platform - Profile: ' + platform.profile) 14 | # Print each device per-platform 15 | for device in platform.get_devices(): 16 | print(' ' + '-' * 56) 17 | print(' Device - Name: ' + device.name) 18 | print(' Device - Type: ' + cl.device_type.to_string(device.type)) 19 | print(' Device - Max Clock Speed: {0} Mhz'.format(device.max_clock_frequency)) 20 | print(' Device - Compute Units: {0}'.format(device.max_compute_units)) 21 | print(' Device - Local Memory: {0:.0f} KB'.format(device.local_mem_size/1024.0)) 22 | print(' Device - Constant Memory: {0:.0f} KB'.format(device.max_constant_buffer_size/1024.0)) 23 | print(' Device - Global Memory: {0:.0f} GB'.format(device.global_mem_size/1073741824.0)) 24 | print(' Device - Max Buffer/Image Size: {0:.0f} MB'.format(device.max_mem_alloc_size/1048576.0)) 25 | print(' Device - Max Work Group Size: {0:.0f}'.format(device.max_work_group_size)) 26 | print('\n') -------------------------------------------------------------------------------- /021_array_sum.py: -------------------------------------------------------------------------------- 1 | # Use OpenCL To Add Two Random Arrays (This Way Shows Details) 2 | 3 | import pyopencl as cl # Import the OpenCL GPU computing API 4 | import numpy as np # Import Np number tools 5 | 6 | platform = cl.get_platforms()[0] # Select the first platform [0] 7 | device = platform.get_devices()[0] # Select the first device on this platform [0] 8 | context = 
cl.Context([device]) # Create a context with your device 9 | queue = cl.CommandQueue(context) # Create a command queue with your context 10 | 11 | np_a = np.random.rand(50000).astype(np.float32) # Create a random np array 12 | np_b = np.random.rand(50000).astype(np.float32) # Create a random np array 13 | np_c = np.empty_like(np_a) # Create an empty destination array 14 | 15 | cl_a = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR, hostbuf=np_a) 16 | cl_b = cl.Buffer(context, cl.mem_flags.COPY_HOST_PTR, hostbuf=np_b) 17 | cl_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, np_c.nbytes) 18 | # Create three buffers (plans for areas of memory on the device) 19 | 20 | kernel = """__kernel void sum(__global float* a, __global float* b, __global float* c) 21 | { 22 | int i = get_global_id(0); 23 | c[i] = a[i] + b[i]; 24 | }""" # Create a kernel (a string containing C-like OpenCL device code) 25 | 26 | program = cl.Program(context, kernel).build() 27 | # Compile the kernel code into an executable OpenCL program 28 | 29 | program.sum(queue, np_a.shape, None, cl_a, cl_b, cl_c) 30 | # Enqueue the program for execution, causing data to be copied to the device 31 | # - queue: the command queue the program will be sent to 32 | # - np_a.shape: a tuple of the arrays' dimensions 33 | # - cl_a, cl_b, cl_c: the memory spaces this program deals with 34 | 35 | np_arrays = [np_a, np_b, np_c] 36 | cl_arrays = [cl_a, cl_b, cl_c] 37 | 38 | for x in range(3): 39 | cl.enqueue_copy(queue, np_arrays[x], cl_arrays[x]) 40 | queue.finish() 41 | # Copy the data for each array back to the host; enqueue_copy's destination comes 42 | # first, so the host (numpy) array must be the first argument — otherwise the host 43 | # data would be copied *to* the device and np_c would never receive the result 44 | 45 | for x in np_arrays: 46 | print(x) 47 | # Print all three host arrays, to show sum() worked -------------------------------------------------------------------------------- /030_timing.py: -------------------------------------------------------------------------------- 1 | # Test the speed of your PyOpenCL program 2 | from time import time # Import time tools 3 | 4 | import pyopencl as cl # Import the
OpenCL GPU computing API 5 | import numpy as np # Import number tools 6 | 7 | a = np.random.rand(1000).astype(np.float32) # Create a random array to add 8 | b = np.random.rand(1000).astype(np.float32) # Create a random array to add 9 | 10 | def cpu_array_sum(a, b): # Sum two arrays on the CPU 11 | c_cpu = np.empty_like(a) # Create the destination array 12 | cpu_start_time = time() # Get the CPU start time 13 | for i in range(1000): 14 | for j in range(1000): # 1000 times add each number and store it 15 | c_cpu[i] = a[i] + b[i] # This add operation happens 1,000,000 times XXX 16 | cpu_end_time = time() # Get the CPU end time 17 | print("CPU Time: {0} s".format(cpu_end_time - cpu_start_time)) # Print how long the CPU took 18 | return c_cpu # Return the sum of the arrays 19 | 20 | def gpu_array_sum(a, b): 21 | context = cl.create_some_context() # Initialize the Context 22 | queue = cl.CommandQueue(context, properties=cl.command_queue_properties.PROFILING_ENABLE) # Instantiate a Queue with profiling (timing) enabled 23 | a_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a) 24 | b_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b) 25 | c_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, b.nbytes) # Create three buffers (plans for areas of memory on the device) 26 | program = cl.Program(context, """ 27 | __kernel void sum(__global const float *a, __global const float *b, __global float *c) 28 | { 29 | int i = get_global_id(0); 30 | int j; 31 | for(j = 0; j < 1000; j++) 32 | { 33 | c[i] = a[i] + b[i]; 34 | } 35 | }""").build() # Compile the device program 36 | gpu_start_time = time() # Get the GPU start time 37 | event = program.sum(queue, a.shape, None, a_buffer, b_buffer, c_buffer) # Enqueue the GPU sum program XXX 38 | event.wait() # Wait until the event finishes XXX 39 | elapsed = 1e-9*(event.profile.end - event.profile.start) # Calculate the time it took to execute the kernel 40 | 
print("GPU Kernel Time: {0} s".format(elapsed)) # Print the time it took to execute the kernel 41 | c_gpu = np.empty_like(a) # Create an empty array the same size as array a 42 | cl.enqueue_copy(queue, c_gpu, c_buffer).wait() # Read back the data from GPU memory into array c_gpu (destination first; enqueue_read_buffer is deprecated and removed in modern PyOpenCL) 43 | gpu_end_time = time() # Get the GPU end time 44 | print("GPU Time: {0} s".format(gpu_end_time - gpu_start_time)) # Print the time the GPU program took, including both memory copies 45 | return c_gpu # Return the sum of the two arrays 46 | 47 | cpu_array_sum(a, b) # Call the function that sums two arrays on the CPU 48 | gpu_array_sum(a, b) # Call the function that sums two arrays on the GPU --------------------------------------------------------------------------------