├── .gitignore ├── Exercises ├── C_common │ ├── device_info.c │ ├── device_picker.h │ ├── err_code.h │ └── wtime.c ├── Cpp_common │ ├── cl.hpp │ ├── device_picker.hpp │ ├── err_code.h │ ├── util.hpp │ └── wtime.c ├── Exercise01 │ ├── C │ │ ├── DeviceInfo.c │ │ └── Makefile │ ├── Cpp │ │ ├── DeviceInfo.cpp │ │ └── Makefile │ ├── Python │ │ └── DeviceInfo.py │ └── README.md ├── Exercise02 │ ├── C │ │ ├── Makefile │ │ └── vadd_c.c │ └── README.md ├── Exercise03 │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd.cl │ │ └── vadd.cpp │ ├── Python │ │ ├── deviceinfo.py │ │ └── vadd.py │ └── README.md ├── Exercise04 │ ├── C │ │ ├── Makefile │ │ └── vadd_c.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd.cl │ │ └── vadd.cpp │ ├── Python │ │ ├── deviceinfo.py │ │ └── vadd.py │ └── README.md ├── Exercise05 │ ├── C │ │ ├── Makefile │ │ └── vadd_c.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd.cl │ │ └── vadd.cpp │ ├── Python │ │ ├── deviceinfo.py │ │ └── vadd.py │ └── README.md ├── Exercise06 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ └── matrix_lib.h │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise07 │ └── README.md ├── Exercise08 │ └── README.md ├── Exercise09 │ ├── C │ │ ├── Makefile │ │ └── pi.c │ ├── Cpp │ │ ├── Makefile │ │ └── pi.cpp │ ├── Python │ │ └── pi.py │ └── README.md ├── Exercise10 │ └── README.md ├── Exercise11 │ └── README.md ├── Exercise12 │ └── README.md ├── Exercise13 │ ├── C │ │ ├── Makefile │ │ └── gameoflife.c │ ├── CUDA-VADD │ │ ├── Makefile │ │ └── vadd.cu │ ├── CUDA │ │ ├── Makefile │ │ └── gameoflife.cu │ ├── Displayer │ │ ├── Makefile │ │ └── displayer.c │ ├── Examples │ │ ├── Acorn │ │ │ ├── acorn.dat │ │ │ ├── final_state.dat │ │ │ └── input.params │ │ ├── Max │ │ │ ├── final_state.dat │ │ │ ├── input.params │ │ │ └── max.dat │ │ ├── Pulsar │ │ │ ├── final_state.dat │ │ │ ├── input.params │ │ │ └── pulsar.dat │ │ └── QueenBee │ │ │ ├── final_state.dat │ │ │ ├── input.params │ │ │ └── queenbee.dat │ └── README.md ├── ExerciseA │ └── README.md └── Makefile ├── License ├── README.md ├── Solutions ├── C_common │ ├── device_info.c │ ├── device_picker.h │ ├── err_code.h │ └── wtime.c ├── Cpp_common │ ├── cl.hpp │ ├── device_picker.hpp │ ├── err_code.h │ ├── util.hpp │ └── wtime.c ├── Exercise04 │ ├── C │ │ ├── Makefile │ │ └── vadd_chain.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd_chain.cl │ │ └── vadd_chain.cpp │ ├── Python │ │ └── vadd_chain.py │ └── README.md ├── Exercise05 │ ├── C │ │ ├── Makefile │ │ └── vadd_abc.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd_abc.cl │ │ └── vadd_abc.cpp │ ├── Python │ │ └── vadd_abc.py │ └── README.md ├── Exercise06 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ └── matrix_lib.h │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise07 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ └── matrix_lib.h │ ├── C_elem.cl │ ├── C_row.cl │ ├── C_row_priv.cl │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise08 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ 
└── matrix_lib.h │ ├── C_block_form.cl │ ├── C_elem.cl │ ├── C_row.cl │ ├── C_row_priv.cl │ ├── C_row_priv_bloc.cl │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise09 │ ├── C │ │ ├── Makefile │ │ └── pi_ocl.c │ ├── Cpp │ │ ├── Makefile │ │ └── pi_ocl.cpp │ ├── Python │ │ └── pi_ocl.py │ ├── README.md │ └── pi_ocl.cl ├── Exercise13 │ ├── C │ │ ├── Makefile │ │ └── gameoflife.c │ ├── Cpp │ │ ├── Makefile │ │ └── gameoflife.cpp │ ├── Python │ │ └── gameoflife.py │ ├── README.md │ └── gameoflife.cl ├── ExerciseA │ ├── C │ │ ├── Makefile │ │ └── pi_vocl.c │ ├── Cpp │ │ ├── Makefile │ │ └── pi_vocl.cpp │ ├── Python │ │ └── pi_vocl.py │ ├── README.md │ └── pi_vocl.cl └── Makefile └── Tools ├── .DS_Store ├── genErrCode.py └── stringify_opencl /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | 3 | *.pyc 4 | 5 | # C 6 | *.o 7 | 8 | # Produced binarys 9 | Exercises/Exercise01/C/DeviceInfo 10 | Exercises/Exercise02/C/vadd 11 | Exercises/Exercise04/C/vadd 12 | Exercises/Exercise05/C/vadd 13 | Exercises/Exercise06/C/mult 14 | Exercises/Exercise09/C/pi 15 | Exercises/Exercise13/C/gameoflife 16 | Exercises/Exercise13/CUDA/gameoflife 17 | Exercises/Exercise13/CUDA-VADD/vadd 18 | Exercises/Exercise13/Displayer/displayer 19 | 20 | Exercises/Exercise01/Cpp/DeviceInfo 21 | Exercises/Exercise03/Cpp/vadd 22 | Exercises/Exercise04/Cpp/vadd 23 | Exercises/Exercise05/Cpp/vadd 24 | Exercises/Exercise06/Cpp/mult 25 | Exercises/Exercise09/Cpp/pi 26 | 27 | Solutions/Exercise04/C/vadd_chain 28 | Solutions/Exercise05/C/vadd_abc 29 | Solutions/Exercise06/C/mult 30 | Solutions/Exercise07/C/mult 31 | Solutions/Exercise08/C/mult 32 | Solutions/Exercise09/C/pi_ocl 33 | Solutions/Exercise13/C/gameoflife 34 | Solutions/ExerciseA/C/pi_vocl 35 | 36 | Solutions/Exercise04/Cpp/vadd_chain 37 | Solutions/Exercise05/Cpp/vadd_abc 38 | Solutions/Exercise06/Cpp/mult 39 | Solutions/Exercise07/Cpp/mult 40 | Solutions/Exercise08/Cpp/mult 41 | Solutions/Exercise09/Cpp/pi_ocl 42 | Solutions/Exercise13/Cpp/gameoflife 43 | Solutions/ExerciseA/Cpp/pi_vocl 44 | 45 | 46 | *.plist 47 | .DS_Store 48 | 49 | Exercises/Exercise01/C/DeviceInfo.dSYM/Contents/Resources/DWARF/DeviceInfo 50 | 51 | 52 | Solutions/Exercise06/Cpp/Makefile.tmp 53 | 54 | Solutions/ExerciseA/C/pi_vocl.dSYM/Contents/Resources/DWARF/pi_vocl 55 | -------------------------------------------------------------------------------- /Exercises/C_common/device_picker.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.h 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #define MAX_PLATFORMS 8 20 | #define MAX_DEVICES 16 21 | #define MAX_INFO_STRING 256 22 | 23 | 24 | unsigned getDeviceList(cl_device_id devices[MAX_DEVICES]) 25 | { 26 | cl_int err; 27 | 28 | // Get list of platforms 29 | cl_uint numPlatforms = 0; 30 | cl_platform_id platforms[MAX_PLATFORMS]; 31 | err = 
clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms); 32 | checkError(err, "getting platforms"); 33 | 34 | // Enumerate devices 35 | unsigned numDevices = 0; 36 | for (int i = 0; i < numPlatforms; i++) 37 | { 38 | cl_uint num = 0; 39 | err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 40 | MAX_DEVICES-numDevices, devices+numDevices, &num); 41 | checkError(err, "getting devices"); 42 | numDevices += num; 43 | } 44 | 45 | return numDevices; 46 | } 47 | 48 | void getDeviceName(cl_device_id device, char name[MAX_INFO_STRING]) 49 | { 50 | cl_device_info info = CL_DEVICE_NAME; 51 | 52 | // Special case for AMD 53 | #ifdef CL_DEVICE_BOARD_NAME_AMD 54 | clGetDeviceInfo(device, CL_DEVICE_VENDOR, MAX_INFO_STRING, name, NULL); 55 | if (strstr(name, "Advanced Micro Devices")) 56 | info = CL_DEVICE_BOARD_NAME_AMD; 57 | #endif 58 | 59 | clGetDeviceInfo(device, info, MAX_INFO_STRING, name, NULL); 60 | } 61 | 62 | 63 | int parseUInt(const char *str, cl_uint *output) 64 | { 65 | char *next; 66 | *output = strtoul(str, &next, 10); 67 | return !strlen(next); 68 | } 69 | 70 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 71 | { 72 | for (int i = 1; i < argc; i++) 73 | { 74 | if (!strcmp(argv[i], "--list")) 75 | { 76 | // Get list of devices 77 | cl_device_id devices[MAX_DEVICES]; 78 | unsigned numDevices = getDeviceList(devices); 79 | 80 | // Print device names 81 | if (numDevices == 0) 82 | { 83 | printf("No devices found.\n"); 84 | } 85 | else 86 | { 87 | printf("\n"); 88 | printf("Devices:\n"); 89 | for (int i = 0; i < numDevices; i++) 90 | { 91 | char name[MAX_INFO_STRING]; 92 | getDeviceName(devices[i], name); 93 | printf("%2d: %s\n", i, name); 94 | } 95 | printf("\n"); 96 | } 97 | exit(0); 98 | } 99 | else if (!strcmp(argv[i], "--device")) 100 | { 101 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 102 | { 103 | printf("Invalid device index\n"); 104 | exit(1); 105 | } 106 | } 107 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 108 | { 109 | printf("\n"); 110 | printf("Usage: ./program [OPTIONS]\n\n"); 111 | printf("Options:\n"); 112 | printf(" -h --help Print this message\n"); 113 | printf(" --list List available devices\n"); 114 | printf(" --device INDEX Select device at INDEX\n"); 115 | printf("\n"); 116 | exit(0); 117 | } 118 | } 119 | } 120 | 121 | -------------------------------------------------------------------------------- /Exercises/C_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Exercises/Cpp_common/device_picker.hpp: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.hpp 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, 
October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #define MAX_INFO_STRING 256 21 | 22 | 23 | unsigned getDeviceList(std::vector& devices) 24 | { 25 | cl_int err; 26 | 27 | // Get list of platforms 28 | std::vector platforms; 29 | cl::Platform::get(&platforms); 30 | 31 | // Enumerate devices 32 | for (int i = 0; i < platforms.size(); i++) 33 | { 34 | cl_uint num = 0; 35 | std::vector plat_devices; 36 | platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); 37 | devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); 38 | } 39 | 40 | return devices.size(); 41 | } 42 | 43 | void getDeviceName(cl::Device& device, std::string& name) 44 | { 45 | cl_device_info info = CL_DEVICE_NAME; 46 | 47 | // Special case for AMD 48 | #ifdef CL_DEVICE_BOARD_NAME_AMD 49 | device.getInfo(CL_DEVICE_VENDOR, &name); 50 | if (strstr(name.c_str(), "Advanced Micro Devices")) 51 | info = CL_DEVICE_BOARD_NAME_AMD; 52 | #endif 53 | 54 | device.getInfo(info, &name); 55 | } 56 | 57 | 58 | int parseUInt(const char *str, cl_uint *output) 59 | { 60 | char *next; 61 | *output = strtoul(str, &next, 10); 62 | return !strlen(next); 63 | } 64 | 65 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 66 | { 67 | for (int i = 1; i < argc; i++) 68 | { 69 | if (!strcmp(argv[i], "--list")) 70 | { 71 | // Get list of devices 72 | std::vector devices; 73 | unsigned numDevices = getDeviceList(devices); 74 | 75 | // Print device names 76 | if (numDevices == 0) 77 | { 78 | std::cout << "No devices found.\n"; 79 | } 80 | else 81 | { 82 | std::cout << "\nDevices:\n"; 83 | for (int i = 0; i < numDevices; i++) 84 | { 85 | std::string name; 86 | getDeviceName(devices[i], name); 87 | std::cout << i << ": " << name << "\n"; 88 | } 89 | std::cout << "\n"; 90 | } 91 | exit(0); 92 | } 93 | else if (!strcmp(argv[i], "--device")) 94 | { 95 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 96 | { 97 | std::cout << "Invalid device index\n"; 98 | exit(1); 99 | } 100 | } 101 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 102 | { 103 | std::cout << "\n"; 104 | std::cout << "Usage: ./program [OPTIONS]\n\n"; 105 | std::cout << "Options:\n"; 106 | std::cout << " -h --help Print the message\n"; 107 | std::cout << " --list List available devices\n"; 108 | std::cout << " --device INDEX Select device at INDEX\n"; 109 | std::cout << "\n"; 110 | exit(0); 111 | } 112 | } 113 | } 114 | 115 | -------------------------------------------------------------------------------- /Exercises/Cpp_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Exercises/Exercise01/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-std=c99 7 | 8 | LIBS = -lOpenCL 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Check our platform and make sure we define 
the APPLE variable 13 | # and set up the right compiler flags and libraries 14 | PLATFORM = $(shell uname -s) 15 | ifeq ($(PLATFORM), Darwin) 16 | LIBS = -framework OpenCL 17 | endif 18 | 19 | DeviceInfo: DeviceInfo.c 20 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 21 | 22 | 23 | clean: 24 | rm -f DeviceInfo 25 | -------------------------------------------------------------------------------- /Exercises/Exercise01/Cpp/DeviceInfo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Display Device Information 3 | * 4 | * Script to print out some information about the OpenCL devices 5 | * and platforms available on your system 6 | * 7 | * History: C++ version written by Tom Deakin, 2012 8 | * Updated by Tom Deakin, August 2013 9 | */ 10 | 11 | #define __CL_ENABLE_EXCEPTIONS 12 | 13 | #include "cl.hpp" 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | int main(void) 20 | { 21 | 22 | try 23 | { 24 | // Discover number of platforms 25 | std::vector platforms; 26 | cl::Platform::get(&platforms); 27 | std::cout << "\nNumber of OpenCL platforms: " << platforms.size() << std::endl; 28 | 29 | // Investigate each platform 30 | std::cout << "\n-------------------------" << std::endl; 31 | for (std::vector::iterator plat = platforms.begin(); plat != platforms.end(); plat++) 32 | { 33 | std::string s; 34 | plat->getInfo(CL_PLATFORM_NAME, &s); 35 | std::cout << "Platform: " << s << std::endl; 36 | 37 | plat->getInfo(CL_PLATFORM_VENDOR, &s); 38 | std::cout << "\tVendor: " << s << std::endl; 39 | 40 | plat->getInfo(CL_PLATFORM_VERSION, &s); 41 | std::cout << "\tVersion: " << s << std::endl; 42 | 43 | // Discover number of devices 44 | std::vector devices; 45 | plat->getDevices(CL_DEVICE_TYPE_ALL, &devices); 46 | std::cout << "\n\tNumber of devices: " << devices.size() << std::endl; 47 | 48 | // Investigate each device 49 | for (std::vector::iterator dev = devices.begin(); dev != devices.end(); dev++ ) 50 | { 51 | std::cout << "\t-------------------------" << std::endl; 52 | 53 | dev->getInfo(CL_DEVICE_NAME, &s); 54 | std::cout << "\t\tName: " << s << std::endl; 55 | 56 | dev->getInfo(CL_DEVICE_OPENCL_C_VERSION, &s); 57 | std::cout << "\t\tVersion: " << s << std::endl; 58 | 59 | int i; 60 | dev->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &i); 61 | std::cout << "\t\tMax. 
Compute Units: " << i << std::endl; 62 | 63 | size_t size; 64 | dev->getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &size); 65 | std::cout << "\t\tLocal Memory Size: " << size/1024 << " KB" << std::endl; 66 | 67 | dev->getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &size); 68 | std::cout << "\t\tGlobal Memory Size: " << size/(1024*1024) << " MB" << std::endl; 69 | 70 | dev->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); 71 | std::cout << "\t\tMax Alloc Size: " << size/(1024*1024) << " MB" << std::endl; 72 | 73 | dev->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); 74 | std::cout << "\t\tMax Work-group Total Size: " << size << std::endl; 75 | 76 | std::vector d; 77 | dev->getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &d); 78 | std::cout << "\t\tMax Work-group Dims: ("; 79 | for (std::vector::iterator st = d.begin(); st != d.end(); st++) 80 | std::cout << *st << " "; 81 | std::cout << "\x08)" << std::endl; 82 | 83 | std::cout << "\t-------------------------" << std::endl; 84 | 85 | } 86 | 87 | std::cout << "\n-------------------------\n"; 88 | } 89 | 90 | } 91 | catch (cl::Error err) 92 | { 93 | std::cout << "OpenCL Error: " << err.what() << " returned " << err_code(err.err()) << std::endl; 94 | std::cout << "Check cl.h for error codes." << std::endl; 95 | exit(-1); 96 | } 97 | 98 | return 0; 99 | 100 | } 101 | -------------------------------------------------------------------------------- /Exercises/Exercise01/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL 13 | 14 | # Check our platform and make sure we define the APPLE variable 15 | # and set up the right compiler flags and libraries 16 | PLATFORM = $(shell uname -s) 17 | ifeq ($(PLATFORM), Darwin) 18 | CPPC = clang++ 19 | LIBS = -framework OpenCL 20 | endif 21 | 22 | DeviceInfo: DeviceInfo.cpp 23 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 24 | 25 | 26 | clean: 27 | rm -f DeviceInfo 28 | -------------------------------------------------------------------------------- /Exercises/Exercise01/Python/DeviceInfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Display Device Information 3 | # 4 | # Script to print out some information about the OpenCL devices 5 | # and platforms available on your system 6 | # 7 | # History: C++ version written by Tom Deakin, 2012 8 | # Ported to Python by Tom Deakin, July 2013 9 | # 10 | 11 | # Import the Python OpenCL API 12 | import pyopencl as cl 13 | 14 | # Create a list of all the platform IDs 15 | platforms = cl.get_platforms() 16 | 17 | print "\nNumber of OpenCL platforms:", len(platforms) 18 | 19 | print "\n-------------------------" 20 | 21 | # Investigate each platform 22 | for p in platforms: 23 | # Print out some information about the platforms 24 | print "Platform:", p.name 25 | print "Vendor:", p.vendor 26 | print "Version:", p.version 27 | 28 | # Discover all devices 29 | devices = p.get_devices() 30 | print "Number of devices:", len(devices) 31 | 32 | # Investigate each device 33 | for d in devices: 34 | print "\t-------------------------" 35 | # Print out some information about the devices 36 | print "\t\tName:", d.name 37 | print "\t\tVersion:", d.opencl_c_version 38 | print "\t\tMax. 
Compute Units:", d.max_compute_units 39 | print "\t\tLocal Memory Size:", d.local_mem_size/1024, "KB" 40 | print "\t\tGlobal Memory Size:", d.global_mem_size/(1024*1024), "MB" 41 | print "\t\tMax Alloc Size:", d.max_mem_alloc_size/(1024*1024), "MB" 42 | print "\t\tMax Work-group Total Size:", d.max_work_group_size 43 | 44 | # Find the maximum dimensions of the work-groups 45 | dim = d.max_work_item_sizes 46 | print "\t\tMax Work-group Dims:(", dim[0], " ".join(map(str, dim[1:])), ")" 47 | 48 | print "\t-------------------------" 49 | 50 | print "\n-------------------------" 51 | -------------------------------------------------------------------------------- /Exercises/Exercise01/README.md: -------------------------------------------------------------------------------- 1 | Exercise 1 - Platform Information 2 | ================================= 3 | 4 | Goal 5 | ---- 6 | * Verify that you can run the OpenCL environment you'll be using in this tutorial. 7 | Specifically, can you run a simple OpenCL program. 8 | 9 | Procedure 10 | --------- 11 | * Take the program we provide (`DeviceInfo`), inspect it in the editor of your choice, build the program and run it. 12 | 13 | Expected output 14 | --------------- 15 | * Information about the installed OpenCL platforms and devices visible to them. 16 | 17 | Extension 18 | --------- 19 | * Run the command `clinfo` which comes with the AMD implementation. 20 | This outputs all the information the OpenCL runtime can find out about devices and platforms. 21 | 22 | -------------------------------------------------------------------------------- /Exercises/Exercise02/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd: vadd_c.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd 34 | -------------------------------------------------------------------------------- /Exercises/Exercise02/README.md: -------------------------------------------------------------------------------- 1 | Exercise 2 - Running the Vadd kernel 2 | ==================================== 3 | 4 | Goal 5 | ---- 6 | * To inspect and verify that you can run an OpenCL kernel. 7 | 8 | Procedure 9 | --------- 10 | * Take the C Vadd program we provide you. 11 | It will run a simple kernel to add two vectors together. 12 | * Look at the host code and identify the API calls in the host code. 13 | Compare them against the API descriptions on the OpenCL reference card. 14 | * There are some helper files which time the execution, output device information neatly and check errors. 15 | 16 | Expected output 17 | --------------- 18 | * A message verifying that the vector addition completed successfully. 
19 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS=-std=c++11 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd: vadd.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd 38 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Cpp/vadd.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Exercises/Exercise03/Cpp/vadd.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd_cpp.cpp 4 | // 5 | // Purpose: Elementwise addition of two vectors (c = a + b) 6 | // 7 | // c = a + b 8 | // 9 | // HISTORY: Written by Tim Mattson, June 2011 10 | // Ported to C++ Wrapper API by Benedict Gaster, September 2011 11 | // Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 12 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 13 | // 14 | //------------------------------------------------------------------------------ 15 | 16 | #define __CL_ENABLE_EXCEPTIONS 17 | 18 | #include "cl.hpp" 19 | 20 | #include "util.hpp" // utility library 21 | 22 | #include "err_code.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | // pick up device type from compiler command line or from the default type 33 | #ifndef DEVICE 34 | #define DEVICE CL_DEVICE_TYPE_DEFAULT 35 | #endif 36 | 37 | //------------------------------------------------------------------------------ 38 | 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define LENGTH (1024) // length of vectors a, b, and c 41 | 42 | int main(void) 43 | { 44 | std::vector h_a(LENGTH); // a vector 45 | std::vector h_b(LENGTH); // b vector 46 | std::vector h_c (LENGTH, 0xdeadbeef); // c = a + b, from compute device 47 | 48 | cl::Buffer d_a; // device memory used for the input a vector 49 | cl::Buffer d_b; // device memory used for the input b vector 50 | cl::Buffer 
d_c; // device memory used for the output c vector 51 | 52 | // Fill vectors a and b with random float values 53 | int count = LENGTH; 54 | for(int i = 0; i < count; i++) 55 | { 56 | h_a[i] = rand() / (float)RAND_MAX; 57 | h_b[i] = rand() / (float)RAND_MAX; 58 | } 59 | 60 | try 61 | { 62 | // Create a context 63 | cl::Context context(DEVICE); 64 | 65 | // Load in kernel source, creating a program object for the context 66 | 67 | cl::Program program(context, util::loadProgram("vadd.cl"), true); 68 | 69 | // Get the command queue 70 | cl::CommandQueue queue(context); 71 | 72 | // Create the kernel functor 73 | 74 | auto vadd = cl::make_kernel(program, "vadd"); 75 | 76 | d_a = cl::Buffer(context, begin(h_a), end(h_a), true); 77 | d_b = cl::Buffer(context, begin(h_b), end(h_b), true); 78 | 79 | d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); 80 | 81 | util::Timer timer; 82 | 83 | vadd( 84 | cl::EnqueueArgs( 85 | queue, 86 | cl::NDRange(count)), 87 | d_a, 88 | d_b, 89 | d_c, 90 | count); 91 | 92 | queue.finish(); 93 | 94 | double rtime = static_cast(timer.getTimeMilliseconds()) / 1000.0; 95 | printf("\nThe kernels ran in %lf seconds\n", rtime); 96 | 97 | cl::copy(queue, d_c, begin(h_c), end(h_c)); 98 | 99 | // Test the results 100 | int correct = 0; 101 | float tmp; 102 | for(int i = 0; i < count; i++) { 103 | tmp = h_a[i] + h_b[i]; // expected value for d_c[i] 104 | tmp -= h_c[i]; // compute errors 105 | if(tmp*tmp < TOL*TOL) { // correct if square deviation is less 106 | correct++; // than tolerance squared 107 | } 108 | else { 109 | 110 | printf( 111 | " tmp %f h_a %f h_b %f h_c %f \n", 112 | tmp, 113 | h_a[i], 114 | h_b[i], 115 | h_c[i]); 116 | } 117 | } 118 | 119 | // summarize results 120 | printf( 121 | "vector add to find C = A+B: %d out of %d results were correct.\n", 122 | correct, 123 | count); 124 | } 125 | catch (cl::Error err) { 126 | std::cout << "Exception\n"; 127 | std::cerr 128 | << "ERROR: " 129 | << err.what() 130 | << "(" 131 | << err_code(err.err()) 132 | << ")" 133 | << std::endl; 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Python/deviceinfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Device Info 3 | # 4 | # Function to output key parameters about the input OpenCL device 5 | # 6 | # History: C version written by Tim Mattson, June 2010 7 | # Ported to Python by Tom Deakin, July 2013 8 | # 9 | 10 | import pyopencl as cl 11 | import sys 12 | 13 | def output_device_info(device_id): 14 | sys.stdout.write("Device is ") 15 | sys.stdout.write(device_id.name) 16 | if device_id.type == cl.device_type.GPU: 17 | sys.stdout.write("GPU from ") 18 | elif device_id.type == cl.device_type.CPU: 19 | sys.stdout.write("CPU from ") 20 | else: 21 | sys.stdout.write("non CPU of GPU processor from ") 22 | sys.stdout.write(device_id.vendor) 23 | sys.stdout.write(" with a max of ") 24 | sys.stdout.write(str(device_id.max_compute_units)) 25 | sys.stdout.write(" compute units\n") 26 | sys.stdout.flush() 27 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Python/vadd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors (c = a + b) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, December 2009 8 | # C version Updated by Tom Deakin and Simon 
McIntosh-Smith, October 2012 9 | # Ported to Python by Tom Deakin, July 2013 10 | # 11 | 12 | # Import the Python OpenCL API 13 | import pyopencl as cl 14 | # Import the Python Maths Library (for vectors) 15 | import numpy 16 | 17 | # Import a library to print out the device information 18 | import deviceinfo 19 | 20 | # Import Standard Library to time the execution 21 | from time import time 22 | #------------------------------------------------------------------------------ 23 | 24 | # tolerance used in floating point comparisons 25 | TOL = 0.001 26 | # length of vectors a, b and c 27 | LENGTH = 1024 28 | 29 | #------------------------------------------------------------------------------ 30 | # 31 | # Kernel: vadd 32 | # 33 | # To compute the elementwise sum c = a + b 34 | # 35 | # Input: a and b float vectors of length count 36 | # Output c float vector of length count holding the sum a + b 37 | 38 | kernelsource = """ 39 | __kernel void vadd( 40 | __global float* a, 41 | __global float* b, 42 | __global float* c, 43 | const unsigned int count) 44 | { 45 | int i = get_global_id(0); 46 | if (i < count) 47 | c[i] = a[i] + b[i]; 48 | } 49 | """ 50 | 51 | #------------------------------------------------------------------------------ 52 | 53 | # Main procedure 54 | 55 | # Create a compute context 56 | # Ask the user to select a platform/device on the CLI 57 | context = cl.create_some_context() 58 | 59 | # Print out device info 60 | deviceinfo.output_device_info(context.devices[0]) 61 | 62 | # Create a command queue 63 | queue = cl.CommandQueue(context) 64 | 65 | # Create the compute program from the source buffer 66 | # and build it 67 | program = cl.Program(context, kernelsource).build() 68 | 69 | # Create a and b vectors and fill with random float values 70 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 72 | # Create an empty c vector (a+b) to be returned from the compute device 73 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 74 | 75 | # Create the input (a, b) arrays in device memory and copy data from host 76 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 77 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 78 | # Create the output (c) array in device memory 79 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 80 | 81 | # Start the timer 82 | rtime = time() 83 | 84 | # Execute the kernel over the entire range of our 1d input 85 | # allowing OpenCL runtime to select the work group items for the device 86 | vadd = program.vadd 87 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 88 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 89 | 90 | # Wait for the commands to finish before reading back 91 | queue.finish() 92 | rtime = time() - rtime 93 | print "The kernel ran in", rtime, "seconds" 94 | 95 | # Read back the results from the compute device 96 | cl.enqueue_copy(queue, h_c, d_c) 97 | 98 | # Test the results 99 | correct = 0; 100 | for a, b, c in zip(h_a, h_b, h_c): 101 | # assign element i of a+b to tmp 102 | tmp = a + b 103 | # compute the deviation of expected and output result 104 | tmp -= c 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c 110 | 111 | # Summarize results 112 | print "C = A+B:", correct, "out of", LENGTH, "results were correct." 
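# ------------------------------------------------------------------------------
# Aside (illustrative only, not part of the original exercise): pyopencl.array
# can express the same element-wise addition far more compactly, reusing the
# context and queue created above. It is left as a comment so it does not
# change what this script does when run.
#
#   import pyopencl.array as cl_array
#
#   d_a = cl_array.to_device(queue, h_a)   # copy host vectors to the device
#   d_b = cl_array.to_device(queue, h_b)
#   d_c = d_a + d_b                        # element-wise add runs on the device
#   h_c = d_c.get()                        # copy the result back to the host
# ------------------------------------------------------------------------------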
113 | -------------------------------------------------------------------------------- /Exercises/Exercise03/README.md: -------------------------------------------------------------------------------- 1 | Exercise 3 - Running the Vadd kernel (C++/Python) 2 | ================================================= 3 | 4 | Goal 5 | ---- 6 | * To learn the C++ and/or Python interface to OpenCL's API 7 | 8 | Procedure 9 | --------- 10 | * Examine the C++ or Python program we provide you. 11 | It will run a simple kernel to add two vectors together. 12 | * Look at the host code and identify the API calls in the host code. 13 | Note how some of the API calls in OpenCL map onto C++/Python constructs. 14 | * Compare the original C version with the C++/Python versions 15 | * Look at the simplicity of the common API calls 16 | 17 | Expected output 18 | --------------- 19 | * A message verifying that the vector addition completed successfully. 20 | -------------------------------------------------------------------------------- /Exercises/Exercise04/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd: vadd_c.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd 34 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 
17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd: vadd.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd 38 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Cpp/vadd.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Exercises/Exercise04/Cpp/vadd.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd_cpp.cpp 4 | // 5 | // Purpose: Elementwise addition of two vectors (c = a + b) 6 | // 7 | // c = a + b 8 | // 9 | // HISTORY: Written by Tim Mattson, June 2011 10 | // Ported to C++ Wrapper API by Benedict Gaster, September 2011 11 | // Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 12 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 13 | // 14 | //------------------------------------------------------------------------------ 15 | 16 | #define __CL_ENABLE_EXCEPTIONS 17 | 18 | #include "cl.hpp" 19 | 20 | #include "util.hpp" // utility library 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | // pick up device type from compiler command line or from the default type 31 | #ifndef DEVICE 32 | #define DEVICE CL_DEVICE_TYPE_DEFAULT 33 | #endif 34 | 35 | #include 36 | 37 | //------------------------------------------------------------------------------ 38 | 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define LENGTH (1024) // length of vectors a, b, and c 41 | 42 | int main(void) 43 | { 44 | std::vector h_a(LENGTH); // a vector 45 | std::vector h_b(LENGTH); // b vector 46 | std::vector h_c(LENGTH, 0xdeadbeef); // c = a + b, from compute device 47 | 48 | cl::Buffer d_a; // device memory used for the input a vector 49 | cl::Buffer d_b; // device memory used for the input b vector 50 | cl::Buffer d_c; // device memory used for the output c vector 51 | 52 | // Fill vectors a and b with random float values 53 | int count = LENGTH; 54 | for(int i = 0; i < count; i++) 55 | { 56 | h_a[i] = rand() / (float)RAND_MAX; 57 | h_b[i] = rand() / (float)RAND_MAX; 58 | } 59 | 60 | try 61 | { 62 | // Create a context 63 | cl::Context context(DEVICE); 64 | 65 | // Load in kernel source, creating a program object for the context 66 | 67 | cl::Program program(context, util::loadProgram("vadd.cl"), true); 68 | 69 | // Get the command queue 70 | 
cl::CommandQueue queue(context); 71 | 72 | // Create the kernel functor 73 | 74 | cl::make_kernel vadd(program, "vadd"); 75 | 76 | d_a = cl::Buffer(context, h_a.begin(), h_a.end(), true); 77 | d_b = cl::Buffer(context, h_b.begin(), h_b.end(), true); 78 | 79 | d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); 80 | 81 | util::Timer timer; 82 | 83 | vadd( 84 | cl::EnqueueArgs( 85 | queue, 86 | cl::NDRange(count)), 87 | d_a, 88 | d_b, 89 | d_c, 90 | count); 91 | 92 | queue.finish(); 93 | 94 | double rtime = static_cast(timer.getTimeMilliseconds()) / 1000.0; 95 | printf("\nThe kernels ran in %lf seconds\n", rtime); 96 | 97 | cl::copy(queue, d_c, h_c.begin(), h_c.end()); 98 | 99 | // Test the results 100 | int correct = 0; 101 | float tmp; 102 | for(int i = 0; i < count; i++) { 103 | tmp = h_a[i] + h_b[i]; // expected value for d_c[i] 104 | tmp -= h_c[i]; // compute errors 105 | if(tmp*tmp < TOL*TOL) { // correct if square deviation is less 106 | correct++; // than tolerance squared 107 | } 108 | else { 109 | 110 | printf( 111 | " tmp %f h_a %f h_b %f h_c %f \n", 112 | tmp, 113 | h_a[i], 114 | h_b[i], 115 | h_c[i]); 116 | } 117 | } 118 | 119 | // summarize results 120 | printf( 121 | "vector add to find C = A+B: %d out of %d results were correct.\n", 122 | correct, 123 | count); 124 | } 125 | catch (cl::Error err) { 126 | std::cout << "Exception\n"; 127 | std::cerr 128 | << "ERROR: " 129 | << err.what() 130 | << "(" 131 | << err_code(err.err()) 132 | << ")" 133 | << std::endl; 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Python/deviceinfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Device Info 3 | # 4 | # Function to output key parameters about the input OpenCL device 5 | # 6 | # History: C version written by Tim Mattson, June 2010 7 | # Ported to Python by Tom Deakin, July 2013 8 | # 9 | 10 | import pyopencl as cl 11 | import sys 12 | 13 | def output_device_info(device_id): 14 | sys.stdout.write("Device is ") 15 | sys.stdout.write(device_id.name) 16 | if device_id.type == cl.device_type.GPU: 17 | sys.stdout.write("GPU from ") 18 | elif device_id.type == cl.device_type.CPU: 19 | sys.stdout.write("CPU from ") 20 | else: 21 | sys.stdout.write("non CPU of GPU processor from ") 22 | sys.stdout.write(device_id.vendor) 23 | sys.stdout.write(" with a max of ") 24 | sys.stdout.write(str(device_id.max_compute_units)) 25 | sys.stdout.write(" compute units\n") 26 | sys.stdout.flush() 27 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Python/vadd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors (c = a + b) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, December 2009 8 | # C version Updated by Tom Deakin and Simon McIntosh-Smith, October 2012 9 | # Ported to Python by Tom Deakin, July 2013 10 | # 11 | 12 | # Import the Python OpenCL API 13 | import pyopencl as cl 14 | # Import the Python Maths Library (for vectors) 15 | import numpy 16 | 17 | # Import a library to print out the device information 18 | import deviceinfo 19 | 20 | # Import Standard Library to time the execution 21 | from time import time 22 | #------------------------------------------------------------------------------ 23 | 24 | # tolerance used in floating point comparisons 25 
| TOL = 0.001 26 | # length of vectors a, b and c 27 | LENGTH = 1024 28 | 29 | #------------------------------------------------------------------------------ 30 | # 31 | # Kernel: vadd 32 | # 33 | # To compute the elementwise sum c = a + b 34 | # 35 | # Input: a and b float vectors of length count 36 | # Output c float vector of length count holding the sum a + b 37 | 38 | kernelsource = """ 39 | __kernel void vadd( 40 | __global float* a, 41 | __global float* b, 42 | __global float* c, 43 | const unsigned int count) 44 | { 45 | int i = get_global_id(0); 46 | if (i < count) 47 | c[i] = a[i] + b[i]; 48 | } 49 | """ 50 | 51 | #------------------------------------------------------------------------------ 52 | 53 | # Main procedure 54 | 55 | # Create a compute context 56 | # Ask the user to select a platform/device on the CLI 57 | context = cl.create_some_context() 58 | 59 | # Print out device info 60 | deviceinfo.output_device_info(context.devices[0]) 61 | 62 | # Create a command queue 63 | queue = cl.CommandQueue(context) 64 | 65 | # Create the compute program from the source buffer 66 | # and build it 67 | program = cl.Program(context, kernelsource).build() 68 | 69 | # Create a and b vectors and fill with random float values 70 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 72 | # Create an empty c vector (a+b) to be returned from the compute device 73 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 74 | 75 | # Create the input (a, b) arrays in device memory and copy data from host 76 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 77 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 78 | # Create the output (c) array in device memory 79 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 80 | 81 | # Start the timer 82 | rtime = time() 83 | 84 | # Execute the kernel over the entire range of our 1d input 85 | # allowing OpenCL runtime to select the work group items for the device 86 | vadd = program.vadd 87 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 88 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 89 | 90 | # Wait for the commands to finish before reading back 91 | queue.finish() 92 | rtime = time() - rtime 93 | print "The kernel ran in", rtime, "seconds" 94 | 95 | # Read back the results from the compute device 96 | cl.enqueue_copy(queue, h_c, d_c) 97 | 98 | # Test the results 99 | correct = 0; 100 | for a, b, c in zip(h_a, h_b, h_c): 101 | # assign element i of a+b to tmp 102 | tmp = a + b 103 | # compute the deviation of expected and output result 104 | tmp -= c 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c 110 | 111 | # Summarize results 112 | print "C = A+B:", correct, "out of", LENGTH, "results were correct." 113 | -------------------------------------------------------------------------------- /Exercises/Exercise04/README.md: -------------------------------------------------------------------------------- 1 | Exercise 4 - Chaining vector add kernels (C++/Python) 2 | ===================================================== 3 | 4 | Goal 5 | ---- 6 | * To verify that you understand manipulating kernel invocations and buffers in OpenCL. 7 | 8 | Procedure 9 | --------- 10 | * Start with your VADD program in C++ or Python. 
11 | * Add additional buffer objects and assign them to vectors defined on the host 12 | (see the provided vadd programs for examples of how to do this). 13 | * Chain vadds ... e.g. C=A+B; D=C+E; F=D+G. 14 | * Read back the final result and verify that this is correct. 15 | * Compare the complexity of your host code to C. 16 | 17 | Expected output 18 | --------------- 19 | * A message to standard output verifying that the chain of vector additions produced the correct result. 20 | 21 | Note 22 | ---- 23 | 24 | Sample solution is for C = A + B; D = C + E; F = D + G; return F 25 | -------------------------------------------------------------------------------- /Exercises/Exercise05/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd: vadd_c.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd 34 | -------------------------------------------------------------------------------- /Exercises/Exercise05/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS=-std=c++11 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 
17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd: vadd.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd 38 | -------------------------------------------------------------------------------- /Exercises/Exercise05/Cpp/vadd.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Exercises/Exercise05/Python/deviceinfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Device Info 3 | # 4 | # Function to output key parameters about the input OpenCL device 5 | # 6 | # History: C version written by Tim Mattson, June 2010 7 | # Ported to Python by Tom Deakin, July 2013 8 | # 9 | 10 | import pyopencl as cl 11 | import sys 12 | 13 | def output_device_info(device_id): 14 | sys.stdout.write("Device is ") 15 | sys.stdout.write(device_id.name) 16 | if device_id.type == cl.device_type.GPU: 17 | sys.stdout.write("GPU from ") 18 | elif device_id.type == cl.device_type.CPU: 19 | sys.stdout.write("CPU from ") 20 | else: 21 | sys.stdout.write("non CPU of GPU processor from ") 22 | sys.stdout.write(device_id.vendor) 23 | sys.stdout.write(" with a max of ") 24 | sys.stdout.write(str(device_id.max_compute_units)) 25 | sys.stdout.write(" compute units\n") 26 | sys.stdout.flush() 27 | -------------------------------------------------------------------------------- /Exercises/Exercise05/Python/vadd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors (c = a + b) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, December 2009 8 | # C version Updated by Tom Deakin and Simon McIntosh-Smith, October 2012 9 | # Ported to Python by Tom Deakin, July 2013 10 | # 11 | 12 | # Import the Python OpenCL API 13 | import pyopencl as cl 14 | # Import the Python Maths Library (for vectors) 15 | import numpy 16 | 17 | # Import a library to print out the device information 18 | import deviceinfo 19 | 20 | # Import Standard Library to time the execution 21 | from time import time 22 | #------------------------------------------------------------------------------ 23 | 24 | # tolerance used in floating point comparisons 25 | TOL = 0.001 26 | # length of vectors a, b and c 27 | LENGTH = 1024 28 | 29 | #------------------------------------------------------------------------------ 30 | # 31 | # Kernel: vadd 32 | # 33 | # To compute the elementwise sum c = a + b 34 | # 35 | # Input: a and b float vectors 
of length count 36 | # Output c float vector of length count holding the sum a + b 37 | 38 | kernelsource = """ 39 | __kernel void vadd( 40 | __global float* a, 41 | __global float* b, 42 | __global float* c, 43 | const unsigned int count) 44 | { 45 | int i = get_global_id(0); 46 | if (i < count) 47 | c[i] = a[i] + b[i]; 48 | } 49 | """ 50 | 51 | #------------------------------------------------------------------------------ 52 | 53 | # Main procedure 54 | 55 | # Create a compute context 56 | # Ask the user to select a platform/device on the CLI 57 | context = cl.create_some_context() 58 | 59 | # Print out device info 60 | deviceinfo.output_device_info(context.devices[0]) 61 | 62 | # Create a command queue 63 | queue = cl.CommandQueue(context) 64 | 65 | # Create the compute program from the source buffer 66 | # and build it 67 | program = cl.Program(context, kernelsource).build() 68 | 69 | # Create a and b vectors and fill with random float values 70 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 72 | # Create an empty c vector (a+b) to be returned from the compute device 73 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 74 | 75 | # Create the input (a, b) arrays in device memory and copy data from host 76 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 77 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 78 | # Create the output (c) array in device memory 79 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 80 | 81 | # Start the timer 82 | rtime = time() 83 | 84 | # Execute the kernel over the entire range of our 1d input 85 | # allowing OpenCL runtime to select the work group items for the device 86 | vadd = program.vadd 87 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 88 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 89 | 90 | # Wait for the commands to finish before reading back 91 | queue.finish() 92 | rtime = time() - rtime 93 | print "The kernel ran in", rtime, "seconds" 94 | 95 | # Read back the results from the compute device 96 | cl.enqueue_copy(queue, h_c, d_c) 97 | 98 | # Test the results 99 | correct = 0; 100 | for a, b, c in zip(h_a, h_b, h_c): 101 | # assign element i of a+b to tmp 102 | tmp = a + b 103 | # compute the deviation of expected and output result 104 | tmp -= c 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c 110 | 111 | # Summarize results 112 | print "C = A+B:", correct, "out of", LENGTH, "results were correct." 113 | -------------------------------------------------------------------------------- /Exercises/Exercise05/README.md: -------------------------------------------------------------------------------- 1 | Exercise 5 - The D = A + B + C problem 2 | ====================================== 3 | 4 | Goal 5 | ---- 6 | * To verify that you understand how to control the argument definitions for a *kernel*. 7 | * To verify that you understand the host/kernel interface. 8 | 9 | Procedure 10 | --------- 11 | * Start with your VADD program. 12 | * Modify the kernel so it adds three vectors together. 13 | * Modify the host code to define three vectors and associate them with relevant kernel arguments. 14 | * Read back the final result and verify that it is correct. 
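For reference, a minimal sketch of how the modified kernel could look is shown below (argument names are illustrative; a complete version, with the matching host-side changes, is provided as `vadd_abc` under `Solutions/Exercise05`):

```c
__kernel void vadd(
    __global float* a,
    __global float* b,
    __global float* c,
    __global float* d,          // output vector: d = a + b + c
    const unsigned int count)
{
    int i = get_global_id(0);
    if (i < count)
        d[i] = a[i] + b[i] + c[i];
}
```

The host code then needs to create the extra buffer and set four buffer arguments plus the count when invoking the kernel.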
15 | 16 | Expected output 17 | --------------- 18 | * Test your result and verify that it is correct. 19 | Print a message to that effect on the screen. 20 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # Modified by Tom Deakin, October 2014 8 | # 9 | 10 | ifndef CC 11 | CC = gcc 12 | endif 13 | 14 | CCFLAGS=-O3 -std=c99 -ffast-math 15 | 16 | LIBS = -lm -lOpenCL -fopenmp 17 | 18 | COMMON_DIR = ../../C_common 19 | 20 | MMUL_OBJS = wtime.o 21 | EXEC = mult 22 | 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | LIBS = -lm -framework OpenCL 29 | endif 30 | 31 | 32 | all: $(EXEC) 33 | 34 | mult: $(MMUL_OBJS) matmul.c matrix_lib.c 35 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC) 36 | 37 | wtime.o: $(COMMON_DIR)/wtime.c 38 | $(CC) -c $^ $(CCFLAGS) -o $@ 39 | 40 | .c.o: 41 | $(CC) -c $< $(CCFLAGS) -o $@ 42 | 43 | 44 | clean: 45 | rm -f $(MMUL_OBJS) $(EXEC) 46 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/matmul.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported to C by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern double wtime(); // returns time since some fixed past point (wtime.c) 32 | 33 | //------------------------------------------------------------------------------ 34 | // Constants 35 | //------------------------------------------------------------------------------ 36 | #define ORDER 1024 // Order of the square matrices A, B, and C 37 | #define AVAL 3.0 // A elements are constant and equal to AVAL 38 | #define BVAL 5.0 // B elements are constant and equal to BVAL 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define DIM 2 // Max dim for NDRange 41 | #define COUNT 1 // number of times to do each multiplication 42 | #define SUCCESS 1 43 | #define FAILURE 0 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/matrix_lib.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the 
multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err = C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float 
mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # 8 | 9 | ifndef CPPC 10 | CPPC=g++ 11 | endif 12 | 13 | CCFLAGS=-O3 -ffast-math 14 | 15 | LIBS = -lm -lOpenCL -fopenmp 16 | 17 | COMMON_DIR = ../../Cpp_common 18 | 19 | INC = -I $(COMMON_DIR) 20 | 21 | MMUL_OBJS = matmul.o 
matrix_lib.o wtime.o 22 | EXEC = mult 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | CPPC = clang++ 29 | CCFLAGS += -stdlib=libc++ 30 | LIBS = -lm -framework OpenCL 31 | endif 32 | 33 | all: $(EXEC) 34 | 35 | mult: $(MMUL_OBJS) 36 | $(CPPC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC) 37 | 38 | wtime.o: $(COMMON_DIR)/wtime.c 39 | $(CPPC) -c $^ $(CCFLAGS) -o $@ 40 | 41 | 42 | .c.o: 43 | $(CPPC) -c $< $(CCFLAGS) -o $@ 44 | 45 | .cpp.o: 46 | $(CPPC) -c $< $(CCFLAGS) $(INC) -o $@ 47 | 48 | matmul.o: matmul.hpp matrix_lib.hpp 49 | 50 | matrix_lib.o: matmul.hpp 51 | 52 | clean: 53 | rm -f $(MMUL_OBJS) $(EXEC) 54 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/matmul.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | 26 | #include "matrix_lib.hpp" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern double wtime(); // returns time since some fixed past point (wtime.c) 32 | 33 | //------------------------------------------------------------------------------ 34 | // Constants 35 | //------------------------------------------------------------------------------ 36 | #define ORDER 1024 // Order of the square matrices A, B, and C 37 | #define AVAL 3.0 // A elements are constant and equal to AVAL 38 | #define BVAL 5.0 // B elements are constant and equal to BVAL 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define DIM 2 // Max dim for NDRange 41 | #define COUNT 1 // number of times to do each multiplication 42 | #define SUCCESS 1 43 | #define FAILURE 0 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/matrix_lib.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 
10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 15 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 16 | // 17 | //------------------------------------------------------------------------------ 18 | 19 | #include "matmul.hpp" 20 | 21 | //------------------------------------------------------------------------------ 22 | // 23 | // Function to compute the matrix product (sequential algorithm, dot prod) 24 | // 25 | //------------------------------------------------------------------------------ 26 | 27 | void seq_mat_mul_sdot(int N, std::vector& A, std::vector& B, std::vector& C) 28 | { 29 | int i, j, k; 30 | float tmp; 31 | 32 | for (i = 0; i < N; i++) { 33 | for (j = 0; j < N; j++) { 34 | tmp = 0.0f; 35 | for (k = 0; k < N; k++) { 36 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 37 | tmp += A[i*N+k] * B[k*N+j]; 38 | } 39 | C[i*N+j] = tmp; 40 | } 41 | } 42 | } 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to initialize the input matrices A and B 47 | // 48 | //------------------------------------------------------------------------------ 49 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C) 50 | { 51 | int i, j; 52 | 53 | /* Initialize matrices */ 54 | 55 | for (i = 0; i < N; i++) 56 | for (j = 0; j < N; j++) 57 | A[i*N+j] = AVAL; 58 | 59 | for (i = 0; i < N; i++) 60 | for (j = 0; j < N; j++) 61 | B[i*N+j] = BVAL; 62 | 63 | for (i = 0; i < N; i++) 64 | for (j = 0; j < N; j++) 65 | C[i*N+j] = 0.0f; 66 | } 67 | 68 | //------------------------------------------------------------------------------ 69 | // 70 | // Function to set a matrix to zero 71 | // 72 | //------------------------------------------------------------------------------ 73 | void zero_mat (int N, std::vector& C) 74 | { 75 | int i, j; 76 | 77 | for (i = 0; i < N; i++) 78 | for (j = 0; j < N; j++) 79 | C[i*N+j] = 0.0f; 80 | } 81 | 82 | //------------------------------------------------------------------------------ 83 | // 84 | // Function to fill Btrans(N,N) with transpose of B(N,N) 85 | // 86 | //------------------------------------------------------------------------------ 87 | void trans(int N, std::vector& B, std::vector& Btrans) 88 | { 89 | int i, j; 90 | 91 | for (i = 0; i < N; i++) 92 | for (j = 0; j < N; j++) 93 | Btrans[j*N+i] = B[i*N+j]; 94 | } 95 | 96 | //------------------------------------------------------------------------------ 97 | // 98 | // Function to compute errors of the product matrix 99 | // 100 | //------------------------------------------------------------------------------ 101 | float error(int N, std::vector& C) 102 | { 103 | int i,j; 104 | float cval, errsq, err; 105 | cval = (float) N * AVAL * BVAL; 106 | errsq = 0.0f; 107 | 108 | for (i = 0; i < N; i++) { 109 | for (j = 0; j < N; j++) { 110 | err = C[i*N+j] - cval; 111 | errsq += err * err; 112 | } 113 | } 114 | return errsq; 115 | } 116 | 117 | //------------------------------------------------------------------------------ 118 | // 119 | // Function to analyze and output results 120 | // 121 | //------------------------------------------------------------------------------ 122 | void results(int N, std::vector& C, double run_time) 123 | { 124 | 125 | float mflops; 126 | float errsq; 127 | 128 | mflops = 2.0 * N * N * N/(1000000.0f 
* run_time); 129 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 130 | errsq = error(N, C); 131 | if (std::isnan(errsq) || errsq > TOL) 132 | printf("\n Errors in multiplication: %f\n",errsq); 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, std::vector &C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, std::vector& B, std::vector& Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, std::vector& C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, std::vector& C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each multiplication 18 | COUNT = 1 19 | 
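# Note: with these values every element of the product C = A*B should equal
# ORDER * AVAL * BVAL = 1024 * 3.0 * 5.0 = 15360.0, which is the reference
# value used by the error check in helper.py. A single multiplication performs
# 2 * ORDER**3 (roughly 2.1e9) floating point operations, which is the basis
# of the MFLOPS figure printed by results().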
-------------------------------------------------------------------------------- /Exercises/Exercise06/Python/helper.py: -------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | import numpy 4 | 5 | # Function to compute the matrix product (sequential algorithm, dot prod) 6 | def seq_mat_mul_sdot(N, A, B, C): 7 | for i in range(N): 8 | for j in range(N): 9 | tmp = 0.0 10 | for k in range(N): 11 | tmp += A[i*N+k] * B[k*N+j] 12 | C[i*N+j] = tmp 13 | 14 | # Function to compute errors of the product matrix 15 | def error(N, C): 16 | cval = float(N) * AVAL * BVAL 17 | errsq = 0.0 18 | for i in range(N): 19 | for j in range(N): 20 | err = C[i*N+j] - cval 21 | errsq += err * err 22 | return errsq; 23 | 24 | 25 | # Function to analyze and output results 26 | def results(N, C, run_time): 27 | mflops = 2.0 * N * N * N/(1000000.0* run_time) 28 | print run_time, "seconds at", mflops, "MFLOPS" 29 | errsq = error(N, C) 30 | if numpy.isnan(errsq) or errsq > TOL: 31 | print "Errors in multiplication:", errsq 32 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Python/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiplication Driver 3 | # 4 | # This is a driver program to test various ways of computing 5 | # the product: 6 | # C = A * B 7 | # 8 | # A and B are constant matrices, square and the order is 9 | # set as a constant, ORDER (see definitions.py). This is so 10 | # we can make a quick test of the multiplication result. 11 | # 12 | # History: C++ version written by Tim Mattson, August 2010 13 | # Modified by Simon McIntosh-Smith, September 2011 14 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 15 | # Ported to Python by Tom Deakin, July 2013 16 | # Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 17 | # 18 | 19 | from helper import * 20 | from definitions import * 21 | 22 | import pyopencl as cl 23 | import numpy 24 | from time import time 25 | 26 | C_elem_KernelSource = ''' 27 | __kernel void mmul( 28 | const int N, 29 | __global float* A, 30 | __global float* B, 31 | __global float* C) 32 | { 33 | } 34 | ''' 35 | 36 | # A[N][N], B[N][N], C[N][N] 37 | N = ORDER; 38 | 39 | # Number of elements in the matrix 40 | size = N * N 41 | 42 | 43 | # A matrix 44 | h_A = numpy.empty(size).astype(numpy.float32) 45 | h_A.fill(AVAL) 46 | 47 | # B matrix 48 | h_B = numpy.empty(size).astype(numpy.float32) 49 | h_B.fill(BVAL) 50 | 51 | # C matrix 52 | h_C = numpy.empty(size).astype(numpy.float32) 53 | 54 | print "\n===== Sequential, matrix mult (dot prod), order", ORDER, "on host CPU ======\n" 55 | 56 | for i in range(COUNT): 57 | h_C.fill(0.0) 58 | start_time = time() 59 | 60 | print "Skipping as this takes a long time to run!" 
61 | #seq_mat_mul_sdot(N, h_A, h_B, h_C) 62 | 63 | run_time = time() - start_time 64 | #results(N, h_C, run_time) 65 | 66 | 67 | # Set up OpenCL 68 | context = cl.create_some_context() 69 | queue = cl.CommandQueue(context) 70 | 71 | # Reset host buffers - just to play it safe 72 | h_A = numpy.empty(size).astype(numpy.float32) 73 | h_A.fill(AVAL) 74 | h_B = numpy.empty(size).astype(numpy.float32) 75 | h_B.fill(BVAL) 76 | h_C = numpy.empty(size).astype(numpy.float32) 77 | 78 | # Create OpenCL buffers 79 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_A) 80 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_B) 81 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes) 82 | 83 | program = cl.Program(context, C_elem_KernelSource).build() 84 | mmul = program.mmul 85 | mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None]) 86 | 87 | print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", N, "======\n" 88 | 89 | # Do the multiplication COUNT times 90 | for i in range(COUNT): 91 | h_C.fill(0.0) 92 | start_time = time() 93 | 94 | globalrange = (N, N) 95 | localrange = None 96 | 97 | mmul(queue, globalrange, localrange, N, d_a, d_b, d_c) 98 | queue.finish() 99 | 100 | run_time = time() - start_time 101 | 102 | cl.enqueue_copy(queue, h_C, d_c) 103 | results(N, h_C, run_time) 104 | -------------------------------------------------------------------------------- /Exercises/Exercise06/README.md: -------------------------------------------------------------------------------- 1 | Exercise 6 - Matrix Multiplication 2 | ================================== 3 | 4 | Goal 5 | ---- 6 | * To write your first complete OpenCL kernel **from scratch** 7 | * To multiply a pair of matrices 8 | 9 | Procedure 10 | --------- 11 | * Start with the serial matrix multiplication program, including the functions to generate the matrices and test the results (C/C++ only). 12 | * Create a kernel to do the multiplication. 13 | * Modify the provided OpenCL host program to use your kernel. 14 | * Verify the results. 15 | 16 | Expected output 17 | --------------- 18 | * A message to standard output verifying that the matrix multiplication produced the correct results. 19 | * Report the runtime and the MFLOPS. 20 | -------------------------------------------------------------------------------- /Exercises/Exercise07/README.md: -------------------------------------------------------------------------------- 1 | Exercise 7 - using private memory 2 | ================================= 3 | 4 | Goal 5 | ---- 6 | * Use private memory to minimize memory movement costs and optimize performance of your matrix multiplication program. 7 | 8 | Procedure 9 | --------- 10 | * Start with your matrix multiplication program. 11 | * Modify the kernel so that each work-item copies its own row of A into private memory (see the Hint below). 12 | * Optimize step by step, saving the intermediate versions and tracking performance improvements. 13 | 14 | Expected output 15 | --------------- 16 | * A message to standard output verifying that the matrix multiplication program is generating the correct results. 17 | * Report the runtime and the MFLOPS.
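
Hint
----
One possible shape for the kernel (a sketch only, not the provided solution): keep the `mmul` signature used by the Exercise 6 host programs, give each work-item one row of C, and stage that work-item's row of A in a private array. The fixed array size assumes N is no larger than ORDER (1024).

    __kernel void mmul(
        const int N,
        __global float* A,
        __global float* B,
        __global float* C)
    {
        int i = get_global_id(0);   // this work-item computes row i of C
        float Awrk[1024];           // private copy of row i of A (assumes N <= 1024)

        for (int k = 0; k < N; k++)
            Awrk[k] = A[i*N+k];

        for (int j = 0; j < N; j++) {
            float tmp = 0.0f;
            for (int k = 0; k < N; k++)
                tmp += Awrk[k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }

With a kernel like this the host should enqueue a 1D NDRange of N work-items (one per row of C) instead of the N x N range used for the element-wise version.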
18 | -------------------------------------------------------------------------------- /Exercises/Exercise08/README.md: -------------------------------------------------------------------------------- 1 | Exercise 8 - using local memory 2 | =============================== 3 | 4 | Goal 5 | ---- 6 | * Use local memory to minimize memory movement costs and optimize performance of your matrix multiplication program. 7 | 8 | Procedure 9 | --------- 10 | * Start with your matrix multiplication program that already uses private memory from Exercise 7. 11 | * Modify the kernel so that each work-group collaboratively copies its own column of B into local memory. 12 | * Optimize step by step, saving the intermediate versions and tracking performance improvements. 13 | 14 | Expected output 15 | --------------- 16 | * A message to standard output verifying that the matrix multiplication program is generating the correct results. 17 | * Report the runtime and the MFLOPS. 18 | 19 | Extra 20 | ----- 21 | * Look at the fast, blocked implementation from the NVIDIA OpenCL SDK example. 22 | Try running it and compare to yours. 23 | 24 | -------------------------------------------------------------------------------- /Exercises/Exercise09/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-lm -O3 7 | 8 | LIBS = -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Check our platform and make sure we define the APPLE variable 13 | # and set up the right compiler flags and libraries 14 | PLATFORM = $(shell uname -s) 15 | ifeq ($(PLATFORM), Darwin) 16 | LIBS = 17 | endif 18 | 19 | pi: pi.c $(COMMON_DIR)/wtime.c 20 | $(CC) $^ $(CCFLAGS) -o $@ 21 | 22 | clean: 23 | rm -f pi 24 | -------------------------------------------------------------------------------- /Exercises/Exercise09/C/pi.c: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This program will numerically compute the integral of 4 | 5 | 4/(1+x*x) 6 | 7 | from 0 to 1. The value of this integral is pi -- which 8 | is great since it gives us an easy way to check the answer. 9 | 10 | The is the original sequential program. It uses the timer 11 | from the OpenMP runtime library 12 | 13 | History: Written by Tim Mattson, 11/99. 
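   The sum in main() applies the midpoint rule: iteration i adds
   4.0/(1.0+x*x) evaluated at the midpoint x = (i-0.5)*step, and the
   accumulated sum is scaled by step after the loop, giving an
   approximation to the integral (and hence to pi).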
14 | 15 | */ 16 | 17 | #include 18 | static long num_steps = 100000000; 19 | double step; 20 | extern double wtime(); // returns time since some fixed past point (wtime.c) 21 | 22 | 23 | int main () 24 | { 25 | int i; 26 | double x, pi, sum = 0.0; 27 | double start_time, run_time; 28 | 29 | step = 1.0/(double) num_steps; 30 | 31 | start_time =wtime(); 32 | 33 | for (i=1;i<= num_steps; i++){ 34 | x = (i-0.5)*step; 35 | sum = sum + 4.0/(1.0+x*x); 36 | } 37 | 38 | pi = step * sum; 39 | run_time = wtime() - start_time; 40 | printf("\n pi with %ld steps is %lf in %lf seconds\n", num_steps, pi, run_time); 41 | } 42 | 43 | -------------------------------------------------------------------------------- /Exercises/Exercise09/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC = g++ 4 | endif 5 | 6 | CCFLAGS = -O3 -lrt 7 | 8 | LIBS = -lm 9 | 10 | CPP_COMMON = ../../Cpp_common 11 | 12 | INC = -I $(CPP_COMMON) 13 | 14 | # Check our platform and make sure we define the APPLE variable 15 | # and set up the right compiler flags and libraries 16 | PLATFORM = $(shell uname -s) 17 | ifeq ($(PLATFORM), Darwin) 18 | CPPC = clang++ 19 | CCFLAGS = -O3 -std=c++11 -stdlib=libc++ 20 | endif 21 | 22 | pi: pi.cpp 23 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 24 | 25 | clean: 26 | rm -f pi 27 | -------------------------------------------------------------------------------- /Exercises/Exercise09/Cpp/pi.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This program will numerically compute the integral of 4 | 5 | 4/(1+x*x) 6 | 7 | from 0 to 1. The value of this integral is pi -- which 8 | is great since it gives us an easy way to check the answer. 9 | 10 | The is the original sequential program. It uses the timer 11 | from the OpenMP runtime library 12 | 13 | History: Written by Tim Mattson, 11/99. 14 | Ported to C++ by Tom Deakin, August 2013 15 | 16 | */ 17 | 18 | #include "util.hpp" 19 | 20 | #include 21 | static long num_steps = 100000000; 22 | double step; 23 | extern double wtime(); // returns time since some fixed past point (wtime.c) 24 | 25 | 26 | int main () 27 | { 28 | int i; 29 | double x, pi, sum = 0.0; 30 | 31 | 32 | step = 1.0/(double) num_steps; 33 | 34 | util::Timer timer; 35 | 36 | for (i=1;i<= num_steps; i++){ 37 | x = (i-0.5)*step; 38 | sum = sum + 4.0/(1.0+x*x); 39 | } 40 | 41 | pi = step * sum; 42 | double run_time = static_cast(timer.getTimeMilliseconds()) / 1000.0; 43 | printf("\n pi with %ld steps is %lf in %lf seconds\n", num_steps, pi, run_time); 44 | } 45 | 46 | -------------------------------------------------------------------------------- /Exercises/Exercise09/Python/pi.py: -------------------------------------------------------------------------------- 1 | # 2 | # This program will numerically compute the integral of 3 | # 4 | # 4/(1+x*x) 5 | # 6 | # from 0 to 1. The value of this integral is pi -- which 7 | # is great since it gives us an easy way to check the answer. 8 | # 9 | # This the original sequential program. 10 | # 11 | # History: Written in C by Tim Mattson, 11/99 12 | # Ported to Python by Tom Deakin, July 2013 13 | # 14 | 15 | from time import time 16 | 17 | num_steps = 100000000 18 | 19 | print "\nNote: Wanted to do", num_steps, "steps, but this is very slow in Python." 20 | 21 | num_steps = 1000000 22 | 23 | print "Doing", num_steps, "steps instead." 
24 | 25 | integral_sum = 0.0 26 | 27 | step = 1.0/num_steps 28 | 29 | start_time = time() 30 | 31 | for i in range(1,num_steps): 32 | x = (i-0.5)*step 33 | integral_sum += 4.0/(1.0+x*x) 34 | 35 | pi = step * integral_sum 36 | 37 | run_time = time() - start_time; 38 | 39 | print "\npi with", num_steps, "steps is", pi, "in", run_time, "seconds\n" 40 | 41 | -------------------------------------------------------------------------------- /Exercises/Exercise09/README.md: -------------------------------------------------------------------------------- 1 | Exercise 9 - The Pi program 2 | =========================== 3 | 4 | Goal 5 | ---- 6 | * To understand synchronization between work-items in the OpenCL C kernel programming language. 7 | 8 | Procedure 9 | --------- 10 | * Start with the provided serial program to estimate Pi through numerical integration. 11 | * Write a kernel and host program to compute the numerical integral using OpenCL. 12 | * Note: you will need to implement a reduction. 13 | 14 | Expected output 15 | --------------- 16 | * Output result plus an estimate of the error in the result. 17 | * Report the runtime. 18 | 19 | Hint 20 | ---- 21 | You will want each work-item to do many iterations of the loop, i.e. don't create one work-item per loop iteration. 22 | To do so would make the reduction so costly that performance would be terrible. 23 | -------------------------------------------------------------------------------- /Exercises/Exercise10/README.md: -------------------------------------------------------------------------------- 1 | Exercise 10 - Heterogeneous Computing 2 | ==================================== 3 | 4 | Goal 5 | ---- 6 | * To experiment with running kernels on multiple devices. 7 | 8 | Procedure 9 | --------- 10 | * Take one of your OpenCL programs. 11 | * Investigate the Context constructors and include more than once device. 12 | * Modify the program to run a kernel on multiple devices, each with different input data. 13 | * Split your problem across multiple devices if you have time. 14 | * Use the examples in the SDK to help you. 15 | 16 | Expected output 17 | --------------- 18 | * Output the results from both devices and see which runs faster. 19 | -------------------------------------------------------------------------------- /Exercises/Exercise11/README.md: -------------------------------------------------------------------------------- 1 | Exercise 11 - Optimize matrix multiplication 2 | ============================================ 3 | 4 | Goal 5 | ---- 6 | * To understand portable performance in OpenCL. 7 | 8 | Procedure 9 | --------- 10 | * Optimize step by step ... saving intermediate versions and tracking performance improvements. 11 | * After you've tried to optimize the program on your own, study the blocked solution optimized for NVIDIA GPU. 12 | Apply these techniques to your own code to further optimize performance. 13 | * As a final step, go back and make a single program that is adaptive so it delivers good results on both a CPU and a GPU. 14 | 15 | Expected output 16 | --------------- 17 | * A message confirming that the matrix multiplication is correct. 18 | * Report the runtime and the MFLOPS. -------------------------------------------------------------------------------- /Exercises/Exercise12/README.md: -------------------------------------------------------------------------------- 1 | Exercise 12 - Profiling OpenCL programs 2 | ======================================= 3 | 4 | Goal 5 | ---- 6 | * To experiment with profiling tools. 
7 | 8 | Procedure 9 | --------- 10 | * Take one of your OpenCL programs, such as matrix multiply. 11 | * Run the program in the profiler and explore the results. 12 | * Modify the program to improve the performance. 13 | * Repeat with other programs if you have time. 14 | 15 | Expected output 16 | --------------- 17 | * Timing from the timer and profiling interfaces should roughly match. -------------------------------------------------------------------------------- /Exercises/Exercise13/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS = -O3 -std=c99 7 | 8 | LIBS = -fopenmp 9 | 10 | # Check our platform and make sure we define the APPLE variable 11 | # and set up the right compiler flags and libraries 12 | PLATFORM = $(shell uname -s) 13 | ifeq ($(PLATFORM), Darwin) 14 | LIBS = 15 | endif 16 | 17 | gameoflife: gameoflife.c 18 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 19 | 20 | clean: 21 | rm -f gameoflife 22 | -------------------------------------------------------------------------------- /Exercises/Exercise13/CUDA-VADD/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC = nvcc 3 | 4 | CCFLAGS = -O3 5 | 6 | LIBS = 7 | 8 | vadd: vadd.cu 9 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 10 | 11 | clean: 12 | rm -f vadd 13 | -------------------------------------------------------------------------------- /Exercises/Exercise13/CUDA-VADD/vadd.cu: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd.cu 4 | // 5 | // Purpose: CUDA implementation of VADD 6 | // 7 | // HISTORY: Written by Tom Deakin and Simon McIntosh-Smith, August 2013 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | #include 12 | #include 13 | 14 | #define TOL (0.001) // tolerance used in floating point comparisons 15 | #define LENGTH (1024) // length of vectors a, b, and c 16 | 17 | /************************************************************************************* 18 | * CUDA kernel 19 | ************************************************************************************/ 20 | 21 | __global__ void vadd(const float* a, 22 | const float* b, 23 | float* c, 24 | const unsigned int count) 25 | { 26 | int i = blockDim.x * blockIdx.x + threadIdx.x; 27 | if (i < count) { 28 | c[i] = a[i] + b[i]; 29 | } 30 | } 31 | 32 | /************************************************************************************* 33 | * Main function 34 | ************************************************************************************/ 35 | 36 | int main(void) 37 | { 38 | float h_a[LENGTH]; // a vector 39 | float h_b[LENGTH]; // b vector 40 | float h_c[LENGTH]; // c vector (a+b) returned from the compute device 41 | float *d_a, *d_b, *d_c; // CUDA memory 42 | unsigned int correct; // number of correct results 43 | 44 | // Fill vectors a and b with random float values 45 | int i = 0; 46 | int count = LENGTH; 47 | for(i = 0; i < count; i++){ 48 | h_a[i] = rand() / (float)RAND_MAX; 49 | h_b[i] = rand() / (float)RAND_MAX; 50 | } 51 | 52 | // Allocate CUDA memory 53 | cudaMalloc(&d_a, sizeof(float) * LENGTH); 54 | cudaMalloc(&d_b, sizeof(float) * LENGTH); 55 | cudaMalloc(&d_c, sizeof(float) * LENGTH); 56 | 57 | // Write buffers a and b to GPU memory 58 | cudaMemcpy(d_a, h_a, sizeof(float) * LENGTH, cudaMemcpyHostToDevice); 59 | 
cudaMemcpy(d_b, h_b, sizeof(float) * LENGTH, cudaMemcpyHostToDevice); 60 | 61 | dim3 numBlocks(LENGTH); 62 | dim3 numThreads(1); 63 | vadd<<>>(d_a, d_b, d_c, LENGTH); 64 | 65 | // Copy result array back to host memory 66 | cudaMemcpy(h_c, d_c, sizeof(float) * LENGTH, cudaMemcpyDeviceToHost); 67 | 68 | // Test the results 69 | correct = 0; 70 | float tmp; 71 | 72 | for(i = 0; i < count; i++) 73 | { 74 | tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp 75 | tmp -= h_c[i]; // compute deviation of expected and output result 76 | if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared 77 | correct++; 78 | else { 79 | printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], h_c[i]); 80 | } 81 | } 82 | 83 | // summarize results 84 | printf("C = A+B: %d out of %d results were correct.\n", correct, count); 85 | 86 | return EXIT_SUCCESS; 87 | } 88 | -------------------------------------------------------------------------------- /Exercises/Exercise13/CUDA/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC = nvcc 3 | 4 | CCFLAGS = -O3 5 | 6 | LIBS = 7 | 8 | gameoflife: gameoflife.cu 9 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 10 | 11 | clean: 12 | rm -f gameoflife 13 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Displayer/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC=gcc 3 | 4 | CCFLAGS=-std=c99 -O3 5 | 6 | LIBS = -lGL -lglut 7 | 8 | # Check our platform and make sure we define the APPLE variable 9 | # and set up the right compiler flags and libraries 10 | PLATFORM = $(shell uname -s) 11 | ifeq ($(PLATFORM), Darwin) 12 | LIBS = -framework OpenGL -framework GLUT 13 | endif 14 | 15 | displayer: displayer.c 16 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 17 | 18 | .PHONY: clean 19 | clean: 20 | rm -f displayer 21 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Acorn/acorn.dat: -------------------------------------------------------------------------------- 1 | 48 49 1 2 | 50 50 1 3 | 47 51 1 4 | 48 51 1 5 | 51 51 1 6 | 52 51 1 7 | 53 51 1 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Acorn/final_state.dat: -------------------------------------------------------------------------------- 1 | 33 33 1 2 | 32 34 1 3 | 34 34 1 4 | 15 35 1 5 | 16 35 1 6 | 32 35 1 7 | 34 35 1 8 | 15 36 1 9 | 18 36 1 10 | 33 36 1 11 | 16 37 1 12 | 17 37 1 13 | 18 37 1 14 | 56 37 1 15 | 57 37 1 16 | 15 38 1 17 | 56 38 1 18 | 57 38 1 19 | 10 39 1 20 | 11 39 1 21 | 12 39 1 22 | 13 39 1 23 | 14 39 1 24 | 15 39 1 25 | 17 39 1 26 | 18 39 1 27 | 8 41 1 28 | 14 41 1 29 | 15 41 1 30 | 6 42 1 31 | 7 42 1 32 | 13 42 1 33 | 19 42 1 34 | 6 43 1 35 | 7 43 1 36 | 11 43 1 37 | 12 43 1 38 | 15 43 1 39 | 6 44 1 40 | 7 44 1 41 | 16 44 1 42 | 20 44 1 43 | 21 44 1 44 | 22 44 1 45 | 6 45 1 46 | 7 45 1 47 | 8 45 1 48 | 19 45 1 49 | 22 45 1 50 | 24 45 1 51 | 20 46 1 52 | 24 46 1 53 | 6 47 1 54 | 8 47 1 55 | 21 47 1 56 | 22 47 1 57 | 23 47 1 58 | 33 47 1 59 | 6 48 1 60 | 21 48 1 61 | 22 48 1 62 | 23 48 1 63 | 30 48 1 64 | 34 48 1 65 | 10 49 1 66 | 30 49 1 67 | 34 49 1 68 | 6 50 1 69 | 10 50 1 70 | 14 50 1 71 | 15 50 1 72 | 16 50 1 73 | 30 50 1 74 | 34 50 1 75 | 10 51 1 76 | 6 52 1 77 | 31 52 1 78 | 33 52 1 79 | 6 53 1 80 | 8 53 1 81 | 28 53 1 82 | 30 53 1 83 | 31 53 1 84 | 32 53 1 85 | 24 54 1 86 | 26 54 1 87 | 27 54 1 88 | 28 54 1 89 
| 42 54 1 90 | 43 54 1 91 | 44 54 1 92 | 6 55 1 93 | 7 55 1 94 | 8 55 1 95 | 26 55 1 96 | 27 55 1 97 | 28 55 1 98 | 60 55 1 99 | 61 55 1 100 | 6 56 1 101 | 7 56 1 102 | 26 56 1 103 | 31 56 1 104 | 33 56 1 105 | 60 56 1 106 | 61 56 1 107 | 6 57 1 108 | 7 57 1 109 | 11 57 1 110 | 12 57 1 111 | 16 57 1 112 | 17 57 1 113 | 18 57 1 114 | 19 57 1 115 | 20 57 1 116 | 25 57 1 117 | 31 57 1 118 | 34 57 1 119 | 6 58 1 120 | 7 58 1 121 | 13 58 1 122 | 16 58 1 123 | 17 58 1 124 | 19 58 1 125 | 20 58 1 126 | 31 58 1 127 | 33 58 1 128 | 8 59 1 129 | 13 59 1 130 | 17 59 1 131 | 23 59 1 132 | 18 60 1 133 | 21 60 1 134 | 22 60 1 135 | 10 61 1 136 | 11 61 1 137 | 12 61 1 138 | 17 61 1 139 | 17 62 1 140 | 18 62 1 141 | 19 62 1 142 | 16 63 1 143 | 19 63 1 144 | 15 64 1 145 | 18 64 1 146 | 15 65 1 147 | 16 65 1 148 | 17 65 1 149 | 16 66 1 150 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Acorn/input.params: -------------------------------------------------------------------------------- 1 | 100 2 | 100 3 | 150 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Max/input.params: -------------------------------------------------------------------------------- 1 | 100 2 | 100 3 | 50 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Max/max.dat: -------------------------------------------------------------------------------- 1 | 56 38 1 2 | 55 39 1 3 | 56 39 1 4 | 57 39 1 5 | 50 40 1 6 | 51 40 1 7 | 52 40 1 8 | 57 40 1 9 | 58 40 1 10 | 49 41 1 11 | 52 41 1 12 | 53 41 1 13 | 54 41 1 14 | 57 41 1 15 | 59 41 1 16 | 60 41 1 17 | 48 42 1 18 | 52 42 1 19 | 54 42 1 20 | 57 42 1 21 | 59 42 1 22 | 48 43 1 23 | 53 43 1 24 | 55 43 1 25 | 57 43 1 26 | 59 43 1 27 | 61 43 1 28 | 62 43 1 29 | 50 44 1 30 | 55 44 1 31 | 57 44 1 32 | 61 44 1 33 | 62 44 1 34 | 38 45 1 35 | 39 45 1 36 | 40 45 1 37 | 41 45 1 38 | 47 45 1 39 | 49 45 1 40 | 54 45 1 41 | 58 45 1 42 | 60 45 1 43 | 61 45 1 44 | 62 45 1 45 | 38 46 1 46 | 42 46 1 47 | 43 46 1 48 | 45 46 1 49 | 47 46 1 50 | 48 46 1 51 | 49 46 1 52 | 51 46 1 53 | 52 46 1 54 | 62 46 1 55 | 63 46 1 56 | 38 47 1 57 | 44 47 1 58 | 45 47 1 59 | 51 47 1 60 | 39 48 1 61 | 42 48 1 62 | 43 48 1 63 | 45 48 1 64 | 48 48 1 65 | 51 48 1 66 | 53 48 1 67 | 54 48 1 68 | 45 49 1 69 | 47 49 1 70 | 49 49 1 71 | 51 49 1 72 | 53 49 1 73 | 55 49 1 74 | 61 49 1 75 | 62 49 1 76 | 63 49 1 77 | 64 49 1 78 | 39 50 1 79 | 42 50 1 80 | 43 50 1 81 | 45 50 1 82 | 48 50 1 83 | 51 50 1 84 | 54 50 1 85 | 55 50 1 86 | 57 50 1 87 | 59 50 1 88 | 60 50 1 89 | 64 50 1 90 | 38 51 1 91 | 44 51 1 92 | 45 51 1 93 | 49 51 1 94 | 51 51 1 95 | 53 51 1 96 | 57 51 1 97 | 58 51 1 98 | 64 51 1 99 | 38 52 1 100 | 42 52 1 101 | 43 52 1 102 | 45 52 1 103 | 47 52 1 104 | 48 52 1 105 | 51 52 1 106 | 54 52 1 107 | 57 52 1 108 | 59 52 1 109 | 60 52 1 110 | 63 52 1 111 | 38 53 1 112 | 39 53 1 113 | 40 53 1 114 | 41 53 1 115 | 47 53 1 116 | 49 53 1 117 | 51 53 1 118 | 53 53 1 119 | 55 53 1 120 | 57 53 1 121 | 48 54 1 122 | 49 54 1 123 | 51 54 1 124 | 54 54 1 125 | 57 54 1 126 | 59 54 1 127 | 60 54 1 128 | 63 54 1 129 | 51 55 1 130 | 57 55 1 131 | 58 55 1 132 | 64 55 1 133 | 39 56 1 134 | 40 56 1 135 | 50 56 1 136 | 51 56 1 137 | 53 56 1 138 | 54 56 1 139 | 55 56 1 140 | 57 56 1 141 | 59 56 1 142 | 60 56 1 143 | 64 56 1 144 | 40 57 1 145 | 41 57 1 146 | 42 57 1 147 | 44 57 1 148 | 48 57 1 149 | 53 57 1 150 | 55 57 1 151 | 61 57 1 152 | 62 57 1 153 | 63 57 1 154 | 
64 57 1 155 | 40 58 1 156 | 41 58 1 157 | 45 58 1 158 | 47 58 1 159 | 52 58 1 160 | 40 59 1 161 | 41 59 1 162 | 43 59 1 163 | 45 59 1 164 | 47 59 1 165 | 49 59 1 166 | 54 59 1 167 | 43 60 1 168 | 45 60 1 169 | 48 60 1 170 | 50 60 1 171 | 54 60 1 172 | 42 61 1 173 | 43 61 1 174 | 45 61 1 175 | 48 61 1 176 | 49 61 1 177 | 50 61 1 178 | 53 61 1 179 | 44 62 1 180 | 45 62 1 181 | 50 62 1 182 | 51 62 1 183 | 52 62 1 184 | 45 63 1 185 | 46 63 1 186 | 47 63 1 187 | 46 64 1 188 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Pulsar/final_state.dat: -------------------------------------------------------------------------------- 1 | 4 2 1 2 | 5 2 1 3 | 6 2 1 4 | 10 2 1 5 | 11 2 1 6 | 12 2 1 7 | 2 4 1 8 | 7 4 1 9 | 9 4 1 10 | 14 4 1 11 | 2 5 1 12 | 7 5 1 13 | 9 5 1 14 | 14 5 1 15 | 2 6 1 16 | 7 6 1 17 | 9 6 1 18 | 14 6 1 19 | 4 7 1 20 | 5 7 1 21 | 6 7 1 22 | 10 7 1 23 | 11 7 1 24 | 12 7 1 25 | 4 9 1 26 | 5 9 1 27 | 6 9 1 28 | 10 9 1 29 | 11 9 1 30 | 12 9 1 31 | 2 10 1 32 | 7 10 1 33 | 9 10 1 34 | 14 10 1 35 | 2 11 1 36 | 7 11 1 37 | 9 11 1 38 | 14 11 1 39 | 2 12 1 40 | 7 12 1 41 | 9 12 1 42 | 14 12 1 43 | 4 14 1 44 | 5 14 1 45 | 6 14 1 46 | 10 14 1 47 | 11 14 1 48 | 12 14 1 49 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Pulsar/input.params: -------------------------------------------------------------------------------- 1 | 18 2 | 18 3 | 300000 4 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Pulsar/pulsar.dat: -------------------------------------------------------------------------------- 1 | 4 2 1 2 | 5 2 1 3 | 6 2 1 4 | 10 2 1 5 | 11 2 1 6 | 12 2 1 7 | 2 4 1 8 | 7 4 1 9 | 9 4 1 10 | 14 4 1 11 | 2 5 1 12 | 7 5 1 13 | 9 5 1 14 | 14 5 1 15 | 2 6 1 16 | 7 6 1 17 | 9 6 1 18 | 14 6 1 19 | 4 7 1 20 | 5 7 1 21 | 6 7 1 22 | 10 7 1 23 | 11 7 1 24 | 12 7 1 25 | 4 9 1 26 | 5 9 1 27 | 6 9 1 28 | 10 9 1 29 | 11 9 1 30 | 12 9 1 31 | 2 10 1 32 | 7 10 1 33 | 9 10 1 34 | 14 10 1 35 | 2 11 1 36 | 7 11 1 37 | 9 11 1 38 | 14 11 1 39 | 2 12 1 40 | 7 12 1 41 | 9 12 1 42 | 14 12 1 43 | 4 14 1 44 | 5 14 1 45 | 6 14 1 46 | 10 14 1 47 | 11 14 1 48 | 12 14 1 49 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/QueenBee/final_state.dat: -------------------------------------------------------------------------------- 1 | 11 2 1 2 | 9 3 1 3 | 11 3 1 4 | 8 4 1 5 | 10 4 1 6 | 2 5 1 7 | 3 5 1 8 | 7 5 1 9 | 10 5 1 10 | 22 5 1 11 | 23 5 1 12 | 2 6 1 13 | 3 6 1 14 | 8 6 1 15 | 10 6 1 16 | 22 6 1 17 | 23 6 1 18 | 9 7 1 19 | 11 7 1 20 | 11 8 1 21 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/QueenBee/input.params: -------------------------------------------------------------------------------- 1 | 35 2 | 11 3 | 30 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/QueenBee/queenbee.dat: -------------------------------------------------------------------------------- 1 | 11 2 1 2 | 9 3 1 3 | 11 3 1 4 | 8 4 1 5 | 10 4 1 6 | 2 5 1 7 | 3 5 1 8 | 7 5 1 9 | 10 5 1 10 | 22 5 1 11 | 23 5 1 12 | 2 6 1 13 | 3 6 1 14 | 8 6 1 15 | 10 6 1 16 | 22 6 1 17 | 23 6 1 18 | 9 7 1 19 | 11 7 1 20 | 11 8 1 -------------------------------------------------------------------------------- /Exercises/Exercise13/README.md: 
-------------------------------------------------------------------------------- 1 | Exercise 13 - Porting CUDA to OpenCL 2 | ==================================== 3 | 4 | Goal 5 | ---- 6 | * To port the CUDA/serial C program to OpenCL 7 | 8 | Procedure 9 | --------- 10 | * Examine the CUDA kernel and identify which parts need changing 11 | * Change them to the OpenCL equivalents 12 | * Examine the Host code and port the commands to the OpenCL equivalents 13 | 14 | Expected output 15 | --------------- 16 | * The OpenCL and CUDA programs should produce the same output - check this! 17 | 18 | Examples 19 | -------- 20 | Some example input is provided in the Examples/ directory. 21 | The `.dat` files list the co-ordinates of each live cell in the grid, followed by a 1 (to signify alive). 22 | The `input.params` file lists the size of the grid (X then Y) and the number of iterations. 23 | 24 | Notes 25 | ----- 26 | 27 | See the Exercises/Exercise13/Examples directory for some sample input .dat, input.params files 28 | along with the expected final_state.dat for four different Game of Life patterns. 29 | -------------------------------------------------------------------------------- /Exercises/ExerciseA/README.md: -------------------------------------------------------------------------------- 1 | Exercise A - The vectorized Pi program 2 | ====================================== 3 | 4 | Goal 5 | ---- 6 | * To understand the vector instructions in the kernel programming language. 7 | 8 | Procedure 9 | --------- 10 | * Start with your best Pi program. 11 | * Unroll the loop 4 times. 12 | Verify that the program still works. 13 | * Use vector instructions in the body of the loop. 14 | 15 | Expected output 16 | --------------- 17 | * Output result plus an estimate of the error in the result. 18 | * Report the runtime and compare the vectorized and scalar versions of the program. 19 | * You could try running this on the CPU as well as the GPU... 20 | -------------------------------------------------------------------------------- /Exercises/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # This makefile will produce all the C binaries 3 | # in their respective directories 4 | 5 | CEXES = Exercise01/C/DeviceInfo Exercise02/C/vadd \ 6 | Exercise04/C/vadd Exercise05/C/vadd \ 7 | Exercise06/C/mult Exercise09/C/pi \ 8 | Exercise13/C/gameoflife 9 | 10 | CPPEXES = Exercise01/Cpp/DeviceInfo Exercise03/Cpp/vadd \ 11 | Exercise04/Cpp/vadd Exercise05/Cpp/vadd \ 12 | Exercise06/Cpp/mult Exercise09/Cpp/pi 13 | 14 | # Change this variable to specify the device type in all 15 | # the Makefiles to the OpenCL device type of choice 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | export DEVICE 18 | 19 | # In case you need to rename the C++ compiler, you can 20 | # do it in bulk here 21 | CPPC = g++ 22 | export CPPC 23 | 24 | ifndef CC 25 | CC = gcc 26 | endif 27 | export CC 28 | 29 | .PHONY : $(CEXES) $(CPPEXES) 30 | 31 | all: $(CEXES) $(CPPEXES) 32 | 33 | $(CEXES): 34 | $(MAKE) -C `dirname $@` 35 | 36 | $(CPPEXES): 37 | $(MAKE) -C `dirname $@` 38 | 39 | .PHONY : clean 40 | clean: 41 | for e in $(CEXES) $(CPPEXES); do $(MAKE) -C `dirname $$e` clean; done 42 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | This work is licensed under the Creative Commons Attribution 3.0 Unported License.
2 | 3 | To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/ 4 | or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA. 5 | -------------------------------------------------------------------------------- /Solutions/C_common/device_picker.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.h 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #define MAX_PLATFORMS 8 20 | #define MAX_DEVICES 16 21 | #define MAX_INFO_STRING 256 22 | 23 | 24 | unsigned getDeviceList(cl_device_id devices[MAX_DEVICES]) 25 | { 26 | cl_int err; 27 | 28 | // Get list of platforms 29 | cl_uint numPlatforms = 0; 30 | cl_platform_id platforms[MAX_PLATFORMS]; 31 | err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms); 32 | checkError(err, "getting platforms"); 33 | 34 | // Enumerate devices 35 | unsigned numDevices = 0; 36 | for (int i = 0; i < numPlatforms; i++) 37 | { 38 | cl_uint num = 0; 39 | err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 40 | MAX_DEVICES-numDevices, devices+numDevices, &num); 41 | checkError(err, "getting deviceS"); 42 | numDevices += num; 43 | } 44 | 45 | return numDevices; 46 | } 47 | 48 | void getDeviceName(cl_device_id device, char name[MAX_INFO_STRING]) 49 | { 50 | cl_device_info info = CL_DEVICE_NAME; 51 | 52 | // Special case for AMD 53 | #ifdef CL_DEVICE_BOARD_NAME_AMD 54 | clGetDeviceInfo(device, CL_DEVICE_VENDOR, MAX_INFO_STRING, name, NULL); 55 | if (strstr(name, "Advanced Micro Devices")) 56 | info = CL_DEVICE_BOARD_NAME_AMD; 57 | #endif 58 | 59 | clGetDeviceInfo(device, info, MAX_INFO_STRING, name, NULL); 60 | } 61 | 62 | 63 | int parseUInt(const char *str, cl_uint *output) 64 | { 65 | char *next; 66 | *output = strtoul(str, &next, 10); 67 | return !strlen(next); 68 | } 69 | 70 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 71 | { 72 | for (int i = 1; i < argc; i++) 73 | { 74 | if (!strcmp(argv[i], "--list")) 75 | { 76 | // Get list of devices 77 | cl_device_id devices[MAX_DEVICES]; 78 | unsigned numDevices = getDeviceList(devices); 79 | 80 | // Print device names 81 | if (numDevices == 0) 82 | { 83 | printf("No devices found.\n"); 84 | } 85 | else 86 | { 87 | printf("\n"); 88 | printf("Devices:\n"); 89 | for (int i = 0; i < numDevices; i++) 90 | { 91 | char name[MAX_INFO_STRING]; 92 | getDeviceName(devices[i], name); 93 | printf("%2d: %s\n", i, name); 94 | } 95 | printf("\n"); 96 | } 97 | exit(0); 98 | } 99 | else if (!strcmp(argv[i], "--device")) 100 | { 101 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 102 | { 103 | printf("Invalid device index\n"); 104 | exit(1); 105 | } 106 | } 107 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 108 | { 109 | printf("\n"); 110 | printf("Usage: ./program [OPTIONS]\n\n"); 111 | printf("Options:\n"); 112 | printf(" -h --help Print the message\n"); 113 | printf(" --list List available devices\n"); 114 | printf(" --device INDEX Select device at INDEX\n"); 115 | printf("\n"); 116 | exit(0); 117 | } 118 | } 119 
| } 120 | 121 | -------------------------------------------------------------------------------- /Solutions/C_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Solutions/Cpp_common/device_picker.hpp: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.h 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #define MAX_INFO_STRING 256 21 | 22 | 23 | unsigned getDeviceList(std::vector& devices) 24 | { 25 | cl_int err; 26 | 27 | // Get list of platforms 28 | std::vector platforms; 29 | cl::Platform::get(&platforms); 30 | 31 | // Enumerate devices 32 | for (int i = 0; i < platforms.size(); i++) 33 | { 34 | cl_uint num = 0; 35 | std::vector plat_devices; 36 | platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); 37 | devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); 38 | } 39 | 40 | return devices.size(); 41 | } 42 | 43 | void getDeviceName(cl::Device& device, std::string& name) 44 | { 45 | cl_device_info info = CL_DEVICE_NAME; 46 | 47 | // Special case for AMD 48 | #ifdef CL_DEVICE_BOARD_NAME_AMD 49 | device.getInfo(CL_DEVICE_VENDOR, &name); 50 | if (strstr(name.c_str(), "Advanced Micro Devices")) 51 | info = CL_DEVICE_BOARD_NAME_AMD; 52 | #endif 53 | 54 | device.getInfo(info, &name); 55 | } 56 | 57 | 58 | int parseUInt(const char *str, cl_uint *output) 59 | { 60 | char *next; 61 | *output = strtoul(str, &next, 10); 62 | return !strlen(next); 63 | } 64 | 65 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 66 | { 67 | for (int i = 1; i < argc; i++) 68 | { 69 | if (!strcmp(argv[i], "--list")) 70 | { 71 | // Get list of devices 72 | std::vector devices; 73 | unsigned numDevices = getDeviceList(devices); 74 | 75 | // Print device names 76 | if (numDevices == 0) 77 | { 78 | std::cout << "No devices found.\n"; 79 | } 80 | else 81 | { 82 | std::cout << "\nDevices:\n"; 83 | for (int i = 0; i < numDevices; i++) 84 | { 85 | std::string name; 86 | getDeviceName(devices[i], name); 87 | std::cout << i << ": " << name << "\n"; 88 | } 89 | std::cout << "\n"; 90 | } 91 | exit(0); 92 | } 93 | else if (!strcmp(argv[i], "--device")) 94 | { 95 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 96 | { 97 | std::cout << "Invalid device index\n"; 98 | exit(1); 99 | } 100 | } 101 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 102 | { 103 | std::cout << "\n"; 104 | std::cout << "Usage: ./program [OPTIONS]\n\n"; 105 | std::cout << "Options:\n"; 106 | std::cout << " -h --help 
Print the message\n"; 107 | std::cout << " --list List available devices\n"; 108 | std::cout << " --device INDEX Select device at INDEX\n"; 109 | std::cout << "\n"; 110 | exit(0); 111 | } 112 | } 113 | } 114 | 115 | -------------------------------------------------------------------------------- /Solutions/Cpp_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Solutions/Exercise04/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd_chain: vadd_chain.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd_chain 34 | -------------------------------------------------------------------------------- /Solutions/Exercise04/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 
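The device_picker.hpp helpers above are the runtime device-selection path that the header's own note points at ("See one of the Matrix Multiply exercises for usage"). A minimal, hypothetical host program showing how the pieces fit together; the file is not part of this repo, and the exact context/queue setup is illustrative, based on the pattern those solutions follow:

    #define __CL_ENABLE_EXCEPTIONS
    #include "cl.hpp"
    #include "device_picker.hpp"   // must come after the OpenCL header, per its Note

    #include <cstdlib>
    #include <iostream>
    #include <string>
    #include <vector>

    int main(int argc, char *argv[])
    {
        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);   // handles --list, --device N, --help

        std::vector<cl::Device> devices;            // every device on every platform
        unsigned numDevices = getDeviceList(devices);
        if (deviceIndex >= numDevices)
        {
            std::cout << "Invalid device index (use --list to see available devices)\n";
            return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];
        std::string name;
        getDeviceName(device, name);
        std::cout << "Using OpenCL device: " << name << "\n";

        // Build a context and queue containing just the chosen device
        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

        return EXIT_SUCCESS;
    }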
17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd_chain: vadd_chain.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd_chain 38 | -------------------------------------------------------------------------------- /Solutions/Exercise04/Cpp/vadd_chain.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Solutions/Exercise04/Python/vadd_chain.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors at a time in a chain (C=A+B; D=C+E; F=D+G) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: Initial version based on vadd.c, written by Tim Mattson, June 2011 8 | # Ported to C++ Wrapper API by Benedict Gaster, September 2011 9 | # Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 10 | # Ported to Python by Tom Deakin, July 2013 11 | # 12 | 13 | # Import the Python OpenCL API 14 | import pyopencl as cl 15 | # Import the Python Maths Library (for vectors) 16 | import numpy 17 | 18 | #------------------------------------------------------------------------------ 19 | 20 | # tolerance used in floating point comparisons 21 | TOL = 0.001 22 | # length of vectors a, b and c 23 | LENGTH = 1024 24 | 25 | #------------------------------------------------------------------------------ 26 | # 27 | # Kernel: vadd 28 | # 29 | # To compute the elementwise sum c = a + b 30 | # 31 | # Input: a and b float vectors of length count 32 | # Output c float vector of length count holding the sum a + b 33 | 34 | kernelsource = """ 35 | __kernel void vadd( 36 | __global float* a, 37 | __global float* b, 38 | __global float* c, 39 | const unsigned int count) 40 | { 41 | int i = get_global_id(0); 42 | if (i < count) 43 | c[i] = a[i] + b[i]; 44 | } 45 | """ 46 | 47 | #------------------------------------------------------------------------------ 48 | 49 | # Main procedure 50 | 51 | # Create a compute context 52 | # Ask the user to select a platform/device on the CLI 53 | context = cl.create_some_context() 54 | 55 | # Create a command queue 56 | queue = cl.CommandQueue(context) 57 | 58 | # Create the compute program from the source buffer 59 | # and build it 60 | program = cl.Program(context, kernelsource).build() 61 | 62 | # Create a, b, e and g vectors and fill with random float values 63 | # Create empty vectors for c, d and f 64 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 65 | h_b = 
numpy.random.rand(LENGTH).astype(numpy.float32) 66 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 67 | h_d = numpy.empty(LENGTH).astype(numpy.float32) 68 | h_e = numpy.random.rand(LENGTH).astype(numpy.float32) 69 | h_f = numpy.empty(LENGTH).astype(numpy.float32) 70 | h_g = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | 72 | # Create the input (a, b, e, g) arrays in device memory and copy data from host 73 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 74 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 75 | d_e = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_e) 76 | d_g = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_g) 77 | # Create the output (c, d, f) array in device memory 78 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 79 | d_d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_d.nbytes) 80 | d_f = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_f.nbytes) 81 | 82 | vadd = program.vadd 83 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 84 | 85 | # Execute the kernel over the entire range of our 1d input 86 | # allowing OpenCL runtime to select the work group items for the device 87 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 88 | 89 | # Enqueue the kernel again, but with different arguments 90 | vadd(queue, h_e.shape, None, d_e, d_c, d_d, LENGTH) 91 | 92 | # Enqueue the kernel a third time, again with different arguments 93 | vadd(queue, h_g.shape, None, d_g, d_d, d_f, LENGTH) 94 | 95 | 96 | # Read back the results from the compute device 97 | cl.enqueue_copy(queue, h_f, d_f) 98 | 99 | # Test the results 100 | correct = 0; 101 | for a, b, e, f, g in zip(h_a, h_b, h_e, h_f, h_g): 102 | tmp = a + b + e + g 103 | # compute the deviation of expected and output result 104 | tmp -= f 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_e", e, "h_g", g, "h_f", f 110 | 111 | # Summarize results 112 | print "3 vector adds to find F = A+B+E+G:", correct, "out of", LENGTH, "results were correct." 113 | -------------------------------------------------------------------------------- /Solutions/Exercise04/README.md: -------------------------------------------------------------------------------- 1 | Exercise 4 - Chaining vector add kernels (C++/Python) 2 | ===================================================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise05/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 
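As the Makefile comment above says (and the ifndef DEVICE block that follows), the DEVICE make variable selects an OpenCL device type at build time: the Makefile forwards it with CCFLAGS += -D DEVICE=$(DEVICE), and the host source falls back to CL_DEVICE_TYPE_DEFAULT when the macro is absent. A small sketch of that round trip; the make invocation shown is illustrative:

    // Illustrative build:  make DEVICE=CL_DEVICE_TYPE_GPU
    // which compiles the host code with -D DEVICE=CL_DEVICE_TYPE_GPU.

    #define __CL_ENABLE_EXCEPTIONS
    #include "cl.hpp"

    #ifndef DEVICE                          // same fallback the solutions use
    #define DEVICE CL_DEVICE_TYPE_DEFAULT
    #endif

    int main()
    {
        // DEVICE expands to a cl_device_type constant, so the context is created
        // directly on the requested class of device (CPU, GPU, default, ...)
        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);
        return 0;
    }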
15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd_abc: vadd_abc.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd_abc 34 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS=-std=c++11 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -DAPPLE -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd_abc: vadd_abc.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -I $(CPP_COMMON) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd_abc 38 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Cpp/vadd_abc.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum d = a+b+c 6 | // 7 | // input: a, b and c float vectors of length count 8 | // 9 | // output: d float vector of length count holding the sum a + b + c 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | __global float* d, 17 | const unsigned int count) 18 | { 19 | int i = get_global_id(0); 20 | if(i < count) { 21 | d[i] = a[i] + b[i] + c[i]; 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Cpp/vadd_abc.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd_three.cpp 4 | // 5 | // Purpose: Elementwise addition of three vectors (d = a + b + c) 6 | // 7 | // d = a + b + c 8 | // 9 | // HISTORY: Written by Tim Mattson, June 2011 10 | // Ported to C++ Wrapper API by Benedict Gaster, September 2011 11 | // Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 12 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 13 | // Updated by Tom Deakin, October 2014 14 | // 15 | //------------------------------------------------------------------------------ 16 | 17 | #define __CL_ENABLE_EXCEPTIONS 18 | 19 | #include "cl.hpp" 20 | 21 | #include "util.hpp" // utility library 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | // pick up device type from compiler command line or from the default type 32 | #ifndef DEVICE 33 | 
#define DEVICE CL_DEVICE_TYPE_DEFAULT 34 | #endif 35 | 36 | #include "err_code.h" 37 | 38 | //------------------------------------------------------------------------------ 39 | 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define LENGTH (1024) // length of vectors a, b, and c 42 | 43 | int main(void) 44 | { 45 | std::vector<float> h_a(LENGTH); // a vector 46 | std::vector<float> h_b(LENGTH); // b vector 47 | std::vector<float> h_c(LENGTH); // c vector 48 | std::vector<float> h_d (LENGTH, 0xdeadbeef); // d vector (result) 49 | 50 | cl::Buffer d_a; // device memory used for the input a vector 51 | cl::Buffer d_b; // device memory used for the input b vector 52 | cl::Buffer d_c; // device memory used for the input c vector 53 | cl::Buffer d_d; // device memory used for the output d vector 54 | 55 | // Fill vectors a, b and c with random float values 56 | int count = LENGTH; 57 | for(int i = 0; i < count; i++) 58 | { 59 | h_a[i] = rand() / (float)RAND_MAX; 60 | h_b[i] = rand() / (float)RAND_MAX; 61 | h_c[i] = rand() / (float)RAND_MAX; 62 | } 63 | 64 | try 65 | { 66 | // Create a context 67 | cl::Context context(DEVICE); 68 | 69 | // Load in kernel source, creating a program object for the context 70 | 71 | cl::Program program(context, util::loadProgram("vadd_abc.cl"), true); 72 | 73 | // Get the command queue 74 | cl::CommandQueue queue(context); 75 | 76 | // Create the kernel functor 77 | 78 | cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd"); 79 | 80 | d_a = cl::Buffer(context, h_a.begin(), h_a.end(), true); 81 | d_b = cl::Buffer(context, h_b.begin(), h_b.end(), true); 82 | d_c = cl::Buffer(context, h_c.begin(), h_c.end(), true); 83 | 84 | d_d = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); 85 | 86 | vadd( 87 | cl::EnqueueArgs( 88 | queue, 89 | cl::NDRange(count)), 90 | d_a, 91 | d_b, 92 | d_c, 93 | d_d, 94 | count); 95 | 96 | cl::copy(queue, d_d, h_d.begin(), h_d.end()); 97 | 98 | // Test the results 99 | int correct = 0; 100 | float tmp; 101 | for(int i = 0; i < count; i++) 102 | { 103 | tmp = h_a[i] + h_b[i] + h_c[i]; // assign element i of a+b+c to tmp 104 | tmp -= h_d[i]; // compute deviation of expected and output result 105 | if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared 106 | correct++; 107 | else { 108 | printf(" tmp %f h_a %f h_b %f h_c %f h_d %f\n",tmp, h_a[i], h_b[i], h_c[i], h_d[i]); 109 | } 110 | } 111 | 112 | // summarize results 113 | printf("D = A+B+C: %d out of %d results were correct.\n", correct, count); 114 | 115 | } 116 | catch (cl::Error err) { 117 | std::cout << "Exception\n"; 118 | std::cerr 119 | << "ERROR: " 120 | << err.what() 121 | << "(" 122 | << err_code(err.err()) 123 | << ")" 124 | << std::endl; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Python/vadd_abc.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of three vectors at a time (R=A+B+C) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: Initial version based on vadd.c, written by Tim Mattson, June 2011 8 | # Ported to C++ Wrapper API by Benedict Gaster, September 2011 9 | # Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 10 | # Ported to Python by Tom Deakin, July 2013 11 | # 12 | 13 | # Import the Python OpenCL API 14 | import pyopencl as cl 15 | # Import the Python Maths Library (for vectors) 16 | import numpy 17 | 18 |
#------------------------------------------------------------------------------ 19 | 20 | # tolerance used in floating point comparisons 21 | TOL = 0.001 22 | # length of vectors a, b and c 23 | LENGTH = 1024 24 | 25 | #------------------------------------------------------------------------------ 26 | # 27 | # Kernel: vadd 28 | # 29 | # To compute the elementwise sum r = a + b + c 30 | # 31 | # Input: a, b and c float vectors of length count 32 | # Output r float vector of length count holding the sum a + b + cs 33 | 34 | kernelsource = """ 35 | __kernel void vadd( 36 | __global float* a, 37 | __global float* b, 38 | __global float* c, 39 | __global float* r, 40 | const unsigned int count) 41 | { 42 | int i = get_global_id(0); 43 | if (i < count) 44 | r[i] = a[i] + b[i] + c[i]; 45 | } 46 | """ 47 | 48 | #------------------------------------------------------------------------------ 49 | 50 | # Main procedure 51 | 52 | # Create a compute context 53 | # Ask the user to select a platform/device on the CLI 54 | context = cl.create_some_context() 55 | 56 | # Create a command queue 57 | queue = cl.CommandQueue(context) 58 | 59 | # Create the compute program from the source buffer 60 | # and build it 61 | program = cl.Program(context, kernelsource).build() 62 | 63 | # Create a, b and c vectors and fill with random float values 64 | # Create empty vectors for r 65 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 66 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 67 | h_c = numpy.random.rand(LENGTH).astype(numpy.float32) 68 | h_r = numpy.empty(LENGTH).astype(numpy.float32) 69 | 70 | # Create the input (a, b, c) arrays in device memory and copy data from host 71 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 72 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 73 | d_c = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_c) 74 | # Create the output (r) array in device memory 75 | d_r = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_r.nbytes) 76 | 77 | # Execute the kernel over the entire range of our 1d input 78 | # allowing OpenCL runtime to select the work group items for the device 79 | vadd = program.vadd 80 | vadd.set_scalar_arg_dtypes([None, None, None, None, numpy.uint32]) 81 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, d_r, LENGTH) 82 | 83 | # Read back the results from the compute device 84 | cl.enqueue_copy(queue, h_r, d_r) 85 | 86 | # Test the results 87 | correct = 0; 88 | for a, b, c, r in zip(h_a, h_b, h_c, h_r): 89 | tmp = a + b + c 90 | # compute the deviation of expected and output result 91 | tmp -= r 92 | # correct if square deviation is less than tolerance squared 93 | if tmp*tmp < TOL*TOL: 94 | correct += 1 95 | else: 96 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c, "h_r", r 97 | 98 | # Summarize results 99 | print "1 vector adds to find R = A+B+C:", correct, "out of", LENGTH, "results were correct." 
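The vadd solutions above print only a correctness summary, but the wtime() helper dumped earlier (Solutions/C_common/wtime.c and Solutions/Cpp_common/wtime.c) is the timer the matrix-multiply drivers wrap around their kernel launches. A rough, self-contained sketch of using it; the placeholder loop stands in for an enqueue plus queue.finish(), and it assumes wtime.c is compiled and linked alongside with the same compiler, as the Makefiles in these solutions do:

    #include <cstdio>

    // Provided by C_common/wtime.c / Cpp_common/wtime.c (see above); the Cpp
    // Makefiles compile that file with the C++ compiler, hence no extern "C".
    extern double wtime();

    int main()
    {
        double start = wtime();

        // Placeholder work: in the real drivers this is a kernel launch followed
        // by queue.finish() so the elapsed time covers the device execution.
        volatile double sink = 0.0;
        for (int i = 0; i < 10000000; i++)
            sink += i * 1.0e-9;

        double elapsed = wtime() - start;
        std::printf("Elapsed time: %f seconds\n", elapsed);
        return 0;
    }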
100 | -------------------------------------------------------------------------------- /Solutions/Exercise05/README.md: -------------------------------------------------------------------------------- 1 | Exercise 5 - The D = A + B + C problem 2 | ====================================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # Modified by Tom Deakin, October 2014 8 | # 9 | 10 | ifndef CC 11 | CC = gcc 12 | endif 13 | 14 | CCFLAGS=-O3 -std=c99 -ffast-math 15 | 16 | LIBS = -lm -lOpenCL -fopenmp 17 | 18 | COMMON_DIR = ../../C_common 19 | 20 | MMUL_OBJS = wtime.o 21 | EXEC = mult 22 | 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | LIBS = -lm -framework OpenCL 29 | endif 30 | 31 | 32 | all: $(EXEC) 33 | 34 | mult: $(MMUL_OBJS) matmul.c matrix_lib.c 35 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC) 36 | 37 | wtime.o: $(COMMON_DIR)/wtime.c 38 | $(CC) -c $^ $(CCFLAGS) -o $@ 39 | 40 | .c.o: 41 | $(CC) -c $< $(CCFLAGS) -o $@ 42 | 43 | 44 | clean: 45 | rm -f $(MMUL_OBJS) $(EXEC) 46 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/matmul.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported to C by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern int output_device_info(cl_device_id ); 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | #define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/matrix_lib.c: 
-------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err = C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // 
Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # 8 | 9 | ifndef CPPC 10 
| CPPC=g++ 11 | endif 12 | 13 | CCFLAGS=-O3 -ffast-math 14 | 15 | LIBS = -lm -lOpenCL -fopenmp 16 | 17 | COMMON_DIR = ../../Cpp_common 18 | 19 | INC = -I $(COMMON_DIR) 20 | 21 | MMUL_OBJS = matmul.o matrix_lib.o wtime.o 22 | EXEC = mult 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | CPPC = clang++ 29 | CCFLAGS += -stdlib=libc++ 30 | LIBS = -lm -framework OpenCL 31 | endif 32 | 33 | all: $(EXEC) 34 | 35 | mult: $(MMUL_OBJS) 36 | $(CPPC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC) 37 | 38 | wtime.o: $(COMMON_DIR)/wtime.c 39 | $(CPPC) -c $^ $(CCFLAGS) -o $@ 40 | 41 | .c.o: 42 | $(CPPC) -c $< $(CCFLAGS) -o $@ 43 | 44 | .cpp.o: 45 | $(CPPC) -c $< $(CCFLAGS) $(INC) -o $@ 46 | 47 | matmul.o: matmul.hpp matrix_lib.hpp 48 | 49 | matrix_lib.o: matmul.hpp 50 | 51 | clean: 52 | rm -f $(MMUL_OBJS) $(EXEC) 53 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/matmul.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | 26 | #include "matrix_lib.hpp" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern double wtime(); // returns time since some fixed past point (wtime.c) 32 | 33 | //------------------------------------------------------------------------------ 34 | // Constants 35 | //------------------------------------------------------------------------------ 36 | #define ORDER 1024 // Order of the square matrices A, B, and C 37 | #define AVAL 3.0 // A elements are constant and equal to AVAL 38 | #define BVAL 5.0 // B elements are constant and equal to BVAL 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define DIM 2 // Max dim for NDRange 41 | #define COUNT 1 // number of times to do each multiplication 42 | #define SUCCESS 1 43 | #define FAILURE 0 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/matrix_lib.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 
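Two numbers worth keeping in mind for these matrix libraries: initmat() fills A with AVAL and B with BVAL, so every element of C = A*B should equal ORDER * AVAL * BVAL = 1024 * 3 * 5 = 15360, which is exactly the cval that error() subtracts; and results() charges 2*N*N*N flops per multiplication (one multiply and one add per inner-loop iteration), so ORDER = 1024 corresponds to roughly 2147 MFLOP of work. A quick check of both values:

    #include <cstdio>

    int main()
    {
        const int    N    = 1024;          // ORDER
        const double AVAL = 3.0, BVAL = 5.0;

        // Every element of C is an N-term dot product of constant values
        double cval  = N * AVAL * BVAL;            // 15360
        // Work counted by results(): 2*N^3 floating point operations
        double mflop = 2.0 * N * N * N / 1.0e6;    // about 2147.48

        std::printf("expected C element: %.1f, work per multiply: %.2f MFLOP\n",
                    cval, mflop);
        return 0;
    }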
10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 15 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 16 | // 17 | //------------------------------------------------------------------------------ 18 | 19 | #include "matmul.hpp" 20 | 21 | //------------------------------------------------------------------------------ 22 | // 23 | // Function to compute the matrix product (sequential algorithm, dot prod) 24 | // 25 | //------------------------------------------------------------------------------ 26 | 27 | void seq_mat_mul_sdot(int N, std::vector& A, std::vector& B, std::vector& C) 28 | { 29 | int i, j, k; 30 | float tmp; 31 | 32 | for (i = 0; i < N; i++) { 33 | for (j = 0; j < N; j++) { 34 | tmp = 0.0f; 35 | for (k = 0; k < N; k++) { 36 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 37 | tmp += A[i*N+k] * B[k*N+j]; 38 | } 39 | C[i*N+j] = tmp; 40 | } 41 | } 42 | } 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to initialize the input matrices A and B 47 | // 48 | //------------------------------------------------------------------------------ 49 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C) 50 | { 51 | int i, j; 52 | 53 | /* Initialize matrices */ 54 | 55 | for (i = 0; i < N; i++) 56 | for (j = 0; j < N; j++) 57 | A[i*N+j] = AVAL; 58 | 59 | for (i = 0; i < N; i++) 60 | for (j = 0; j < N; j++) 61 | B[i*N+j] = BVAL; 62 | 63 | for (i = 0; i < N; i++) 64 | for (j = 0; j < N; j++) 65 | C[i*N+j] = 0.0f; 66 | } 67 | 68 | //------------------------------------------------------------------------------ 69 | // 70 | // Function to set a matrix to zero 71 | // 72 | //------------------------------------------------------------------------------ 73 | void zero_mat (int N, std::vector& C) 74 | { 75 | int i, j; 76 | 77 | for (i = 0; i < N; i++) 78 | for (j = 0; j < N; j++) 79 | C[i*N+j] = 0.0f; 80 | } 81 | 82 | //------------------------------------------------------------------------------ 83 | // 84 | // Function to fill Btrans(N,N) with transpose of B(N,N) 85 | // 86 | //------------------------------------------------------------------------------ 87 | void trans(int N, std::vector& B, std::vector& Btrans) 88 | { 89 | int i, j; 90 | 91 | for (i = 0; i < N; i++) 92 | for (j = 0; j < N; j++) 93 | Btrans[j*N+i] = B[i*N+j]; 94 | } 95 | 96 | //------------------------------------------------------------------------------ 97 | // 98 | // Function to compute errors of the product matrix 99 | // 100 | //------------------------------------------------------------------------------ 101 | float error(int N, std::vector& C) 102 | { 103 | int i,j; 104 | float cval, errsq, err; 105 | cval = (float) N * AVAL * BVAL; 106 | errsq = 0.0f; 107 | 108 | for (i = 0; i < N; i++) { 109 | for (j = 0; j < N; j++) { 110 | err = C[i*N+j] - cval; 111 | errsq += err * err; 112 | } 113 | } 114 | return errsq; 115 | } 116 | 117 | //------------------------------------------------------------------------------ 118 | // 119 | // Function to analyze and output results 120 | // 121 | //------------------------------------------------------------------------------ 122 | void results(int N, std::vector& C, double run_time) 123 | { 124 | 125 | float mflops; 126 | float errsq; 127 | 128 | mflops = 2.0 * N * N * N/(1000000.0f 
* run_time); 129 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 130 | errsq = error(N, C); 131 | if (std::isnan(errsq) || errsq > TOL) 132 | printf("\n Errors in multiplication: %f\n",errsq); 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 10 | // 11 | //------------------------------------------------------------------------------ 12 | 13 | #ifndef __MATRIX_LIB_HDR 14 | #define __MATRIX_LIB_HDR 15 | 16 | 17 | //------------------------------------------------------------------------------ 18 | // 19 | // Function to compute the matrix product (sequential algorithm, dot producdt) 20 | // 21 | //------------------------------------------------------------------------------ 22 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 23 | 24 | //------------------------------------------------------------------------------ 25 | // 26 | // Function to initialize the input matrices A and B 27 | // 28 | //------------------------------------------------------------------------------ 29 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 30 | 31 | //------------------------------------------------------------------------------ 32 | // 33 | // Function to set a matrix to zero 34 | // 35 | //------------------------------------------------------------------------------ 36 | void zero_mat (int N, std::vector &C); 37 | 38 | //------------------------------------------------------------------------------ 39 | // 40 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 41 | // 42 | //------------------------------------------------------------------------------ 43 | void trans(int N, std::vector& B, std::vector& Btrans); 44 | 45 | //------------------------------------------------------------------------------ 46 | // 47 | // Function to compute errors of the product matrix 48 | // 49 | //------------------------------------------------------------------------------ 50 | float error(int N, std::vector& C); 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // 55 | // Function to analyze and output results 56 | // 57 | //------------------------------------------------------------------------------ 58 | void results(int N, std::vector& C, double run_time); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each 
multiplication 18 | COUNT = 1 19 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Python/helper.py: -------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | import numpy 4 | 5 | # Function to compute the matrix product (sequential algorithm, dot prod) 6 | def seq_mat_mul_sdot( Ndim, A, B, C): 7 | for i in range(Ndim): 8 | for j in range(Ndim): 9 | tmp = 0.0 10 | for k in range(Ndim): 11 | tmp += A[i*Ndim+k] * B[k*Ndim+j] 12 | C[i*Ndim+j] = tmp 13 | 14 | # Function to compute errors of the product matrix 15 | def error( Ndim, C): 16 | cval = float(Ndim) * AVAL * BVAL 17 | errsq = 0.0 18 | for i in range(Ndim): 19 | for j in range(Ndim): 20 | err = C[i*Ndim+j] - cval 21 | errsq += err * err 22 | return errsq; 23 | 24 | 25 | # Function to analyze and output results 26 | def results( Ndim, C, run_time): 27 | mflops = ( 2.0 * (Ndim**(3)) )/(1000000.0* run_time) 28 | print run_time, "seconds at", mflops, "MFLOPS" 29 | errsq = error( Ndim, C) 30 | if numpy.isnan(errsq) or errsq > TOL: 31 | print "Errors in multiplication:", errsq 32 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Python/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiplication Driver 3 | # 4 | # This is a driver program to test various ways of computing 5 | # the product: 6 | # C = A * B 7 | # 8 | # A and B are constant matrices, square and the order is 9 | # set as a constant, ORDER (see definitions.py). This is so 10 | # we can make a quick test of the multiplication result. 11 | # 12 | # History: C++ version written by Tim Mattson, August 2010 13 | # Modified by Simon McIntosh-Smith, September 2011 14 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 15 | # Ported to Python by Tom Deakin, July 2013 16 | # Modified to assume square matrices by Ben Elgar, November 2014 17 | # 18 | 19 | from helper import * 20 | from definitions import * 21 | 22 | import pyopencl as cl 23 | import numpy 24 | from time import time 25 | 26 | C_elem_KernelSource = ''' 27 | __kernel void mmul( 28 | const int N, 29 | __global float* A, 30 | __global float* B, 31 | __global float* C) 32 | { 33 | int k; 34 | int i = get_global_id(0); 35 | int j = get_global_id(1); 36 | float tmp = 0; 37 | if ((i < N) && (j < N)) 38 | { 39 | tmp = 0.0f; 40 | for (k=0; k 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | 29 | //------------------------------------------------------------------------------ 30 | // functions from ../Common 31 | //------------------------------------------------------------------------------ 32 | extern int output_device_info(cl_device_id ); 33 | extern double wtime(); // returns time since some fixed past point (wtime.c) 34 | 35 | //------------------------------------------------------------------------------ 36 | // Constants 37 | //------------------------------------------------------------------------------ 38 | #define ORDER 1024 // Order of the square matrices A, B, and C 39 | #define AVAL 3.0 // A elements are constant and equal to AVAL 40 | #define BVAL 5.0 // B elements are constant and equal to BVAL 41 | #define TOL (0.001) // tolerance used in floating point comparisons 42 | #define DIM 2 // Max dim for NDRange 43 | #define COUNT 1 // number of times to do each 
multiplication 44 | #define SUCCESS 1 45 | #define FAILURE 0 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C/matrix_lib.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err 
= C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C_elem.cl: -------------------------------------------------------------------------------- 1 | 2 | __kernel void mmul( 3 | const int 
N, 4 | __global float* A, 5 | __global float* B, 6 | __global float* C) 7 | { 8 | int k; 9 | int i = get_global_id(0); 10 | int j = get_global_id(1); 11 | float tmp; 12 | if ((i < N) && (j < N)) 13 | { 14 | tmp = 0.0f; 15 | for (k = 0; k < N; k++) 16 | tmp += A[i*N+k] * B[k*N+j]; 17 | C[i*N+j] = tmp; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C_row.cl: -------------------------------------------------------------------------------- 1 | 2 | __kernel void mmul( 3 | const int N, 4 | __global float* A, 5 | __global float* B, 6 | __global float* C) 7 | { 8 | int k, j; 9 | int i = get_global_id(0); 10 | float tmp; 11 | if (i < N) { 12 | for (j = 0; j < N; j++) { 13 | tmp = 0.0f; 14 | for (k = 0; k < N; k++) 15 | tmp += A[i*N+k] * B[k*N+j]; 16 | C[i*N+j] = tmp; 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C_row_priv.cl: -------------------------------------------------------------------------------- 1 | 2 | __kernel void mmul( 3 | const int N, 4 | __global float* A, 5 | __global float* B, 6 | __global float* C) 7 | { 8 | int k, j; 9 | int i = get_global_id(0); 10 | float Awrk[1024]; 11 | float tmp; 12 | if (i < N) { 13 | for (k = 0; k < N; k++) 14 | Awrk[k] = A[i*N+k]; 15 | 16 | for (j = 0; j < N; j++) { 17 | tmp = 0.0f; 18 | for (k = 0; k < N; k++) 19 | tmp += Awrk[k] * B[k*N+j]; 20 | C[i*N+j] = tmp; 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # 8 | 9 | ifndef CPPC 10 | CPPC=g++ 11 | endif 12 | 13 | CCFLAGS =-O3 -ffast-math 14 | 15 | LIBS = -lm -lOpenCL -fopenmp 16 | 17 | COMMON_DIR = ../../Cpp_common 18 | 19 | INC = -I $(COMMON_DIR) 20 | 21 | MMUL_OBJS = matmul.o matrix_lib.o wtime.o 22 | EXEC = mult 23 | 24 | 25 | # Check our platform and make sure we define the APPLE variable 26 | # and set up the right compiler flags and libraries 27 | PLATFORM = $(shell uname -s) 28 | ifeq ($(PLATFORM), Darwin) 29 | CPPC = clang++ 30 | CCFLAGS += -stdlib=libc++ 31 | LIBS = -lm -framework OpenCL 32 | endif 33 | 34 | all: $(EXEC) 35 | 36 | mult: $(MMUL_OBJS) 37 | $(CPPC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC) 38 | 39 | wtime.o: $(COMMON_DIR)/wtime.c 40 | $(CPPC) -c $^ $(CCFLAGS) -o $@ 41 | 42 | .c.o: 43 | $(CPPC) -c $< $(CCFLAGS) -o $@ 44 | 45 | .cpp.o: 46 | $(CPPC) -c $< $(CCFLAGS) $(INC) -o $@ 47 | 48 | matmul.o: matmul.hpp matrix_lib.hpp 49 | 50 | matrix_lib.o: matmul.hpp 51 | 52 | clean: 53 | rm -f $(MMUL_OBJS) $(EXEC) 54 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Cpp/matmul.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | 
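C_elem.cl, C_row.cl and C_row_priv.cl above are Exercise 7's progression: one work-item per element of C (it reads both get_global_id(0) and get_global_id(1)), one work-item per row of C, and one work-item per row that first copies its row of A into the private array Awrk so that row is fetched from global memory once rather than N times. A sketch of the corresponding launch shapes in the cl.hpp functor style used elsewhere in these solutions; in the real driver each variant is built from its own .cl file, and the work-group size shown for the private-memory version is illustrative, not a tuned value:

    #define __CL_ENABLE_EXCEPTIONS
    #include "cl.hpp"

    // Sketch only: 'program' is assumed to be built from one of the kernels above,
    // and d_a, d_b, d_c are N*N float buffers already initialised by the caller.
    void launch_mmul(cl::Program& program, cl::CommandQueue& queue,
                     int N, cl::Buffer& d_a, cl::Buffer& d_b, cl::Buffer& d_c)
    {
        // All three variants share the signature (const int N, A, B, C).
        // In the real drivers each .cl file gets its own program; the three calls
        // below just show the different NDRange shapes.
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> mmul(program, "mmul");

        // C_elem.cl: one work-item per element of C, 2D global range N x N
        mmul(cl::EnqueueArgs(queue, cl::NDRange(N, N)), N, d_a, d_b, d_c);

        // C_row.cl: one work-item per row of C, 1D global range N
        mmul(cl::EnqueueArgs(queue, cl::NDRange(N)), N, d_a, d_b, d_c);

        // C_row_priv.cl: same 1D range, with an explicit work-group size
        // (64 here is purely illustrative)
        mmul(cl::EnqueueArgs(queue, cl::NDRange(N), cl::NDRange(64)), N, d_a, d_b, d_c);

        queue.finish();
    }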
//------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | #include "util.hpp" 26 | 27 | #include "matrix_lib.hpp" 28 | 29 | //------------------------------------------------------------------------------ 30 | // functions from ../Common 31 | //------------------------------------------------------------------------------ 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | #define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 10 | // 11 | //------------------------------------------------------------------------------ 12 | 13 | #ifndef __MATRIX_LIB_HDR 14 | #define __MATRIX_LIB_HDR 15 | 16 | 17 | //------------------------------------------------------------------------------ 18 | // 19 | // Function to compute the matrix product (sequential algorithm, dot producdt) 20 | // 21 | //------------------------------------------------------------------------------ 22 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 23 | 24 | //------------------------------------------------------------------------------ 25 | // 26 | // Function to initialize the input matrices A and B 27 | // 28 | //------------------------------------------------------------------------------ 29 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 30 | 31 | //------------------------------------------------------------------------------ 32 | // 33 | // Function to set a matrix to zero 34 | // 35 | //------------------------------------------------------------------------------ 36 | void zero_mat (int N, std::vector &C); 37 | 38 | //------------------------------------------------------------------------------ 39 | // 40 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 41 | // 42 | //------------------------------------------------------------------------------ 43 | void trans(int N, std::vector& B, std::vector& Btrans); 44 | 45 | //------------------------------------------------------------------------------ 46 | // 47 | // Function to compute 
errors of the product matrix 48 | // 49 | //------------------------------------------------------------------------------ 50 | float error(int N, std::vector& C); 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // 55 | // Function to analyze and output results 56 | // 57 | //------------------------------------------------------------------------------ 58 | void results(int N, std::vector& C, double run_time); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each multiplication 18 | COUNT = 1 19 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Python/helper.py: -------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | 4 | # Function to compute the matrix product (sequential algorithm, dot prod) 5 | def seq_mat_mul_sdot(N, A, B, C): 6 | for i in range(N): 7 | for j in range(N): 8 | tmp = 0.0 9 | for k in range(N): 10 | tmp += A[i*N+k] * B[k*N+j] 11 | C[i*N+j] = tmp 12 | 13 | # Function to compute errors of the product matrix 14 | def error(N, C): 15 | cval = float(N) * AVAL * BVAL 16 | errsq = 0.0 17 | for i in range(N): 18 | for j in range(N): 19 | err = C[i*N+j] - cval 20 | errsq += err * err 21 | return errsq; 22 | 23 | 24 | # Function to analyze and output results 25 | def results(N, C, run_time): 26 | mflops = 2.0 * N * N * N/(1000000.0* run_time) 27 | print run_time, "seconds at", mflops, "MFLOPS" 28 | errsq = error(N, C) 29 | if (errsq > TOL): 30 | print "Errors in multiplication:", errsq 31 | -------------------------------------------------------------------------------- /Solutions/Exercise07/README.md: -------------------------------------------------------------------------------- 1 | Exercise 7 - using private memory 2 | ================================= 3 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # Modified by Tom Deakin, October 2014 8 | # 9 | 10 | ifndef CC 11 | CC = gcc 12 | endif 13 | 14 | CCFLAGS=-O3 -std=c99 -ffast-math 15 | 16 | LIBS = -lm -lOpenCL -fopenmp 17 | 18 | COMMON_DIR = ../../C_common 19 | 20 | MMUL_OBJS = wtime.o 21 | EXEC = mult 22 | 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | LIBS = -lm -framework OpenCL 29 | endif 30 | 31 | all: $(EXEC) 32 | 33 | mult: $(MMUL_OBJS) matmul.c matrix_lib.c 34 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC) 35 | 36 | wtime.o: $(COMMON_DIR)/wtime.c 37 | $(CC) -c $^ $(CCFLAGS) -o $@ 38 | 39 | .c.o: 40 | $(CC) 
-c $< $(CCFLAGS) -o $@ 41 | 42 | 43 | clean: 44 | rm -f $(MMUL_OBJS) $(EXEC) 45 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/matmul.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported to C by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern int output_device_info(cl_device_id ); 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | #define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/matrix_lib.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 
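//           The matrices are stored as flat 1-D arrays in row-major
//           order, so element (i,j) of an N x N matrix is at
//           index i*N+j.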
10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err = C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in 
multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot product) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(N,N) with transpose of B(N,N) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C_block_form.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------- 2 | // 3 | // PROGRAM: Blocked Matrix Multiplication kernel 4 | // 5 | // PURPOSE: Computes an element of the product matrix 6 | // 7 | // C = A * B 8 | // 9 | // Using the well known blocked algorithm. 10 | // 11 | // To derive this algorithm, start with the naive 12 | // triply nested loop algorithm with a dot product 13 | // for each element of C. Decompose each loop 14 | // into blocks of size blksz. This gives you 6 15 | // nested loops with three loops over blocks 16 | // and three loops over indices inside the blocks.
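//
//                  As an illustration (serial pseudo-code only, not part
//                  of this kernel), the fully decomposed form is:
//
//                    for (Iblk = 0; Iblk < N/blksz; Iblk++)
//                     for (Jblk = 0; Jblk < N/blksz; Jblk++)
//                      for (Kblk = 0; Kblk < N/blksz; Kblk++)
//                       for (iloc = 0; iloc < blksz; iloc++)
//                        for (jloc = 0; jloc < blksz; jloc++)
//                         for (kloc = 0; kloc < blksz; kloc++)
//                           C[(Iblk*blksz+iloc)*N + (Jblk*blksz+jloc)] +=
//                               A[(Iblk*blksz+iloc)*N + (Kblk*blksz+kloc)] *
//                               B[(Kblk*blksz+kloc)*N + (Jblk*blksz+jloc)];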
17 | // 18 | // Rearrange the loops to put the 3 loops over blocks 19 | // at the outermost loops of the loop nest. You'll 20 | // see that the three "inner" loops are just the 21 | // regular matrix product between blocks. 22 | // 23 | // The algorithms is simple. Keeping all the indices 24 | // straight is not. We will use the following 25 | // conventions: 26 | // 27 | // i,j,k ... indices of full, global matrices 28 | // Iblk, Jblk, Kblk ... indices of matrix blocks 29 | // iloc, jloc, kloc ... indices inside blocks 30 | // 31 | // HISTORY: Written by Tim Mattson, November 2013 32 | // Updated by Simon McIntosh-Smith, August 2014 33 | // 34 | // LICENSE: This work is licensed under the Creative Commons 35 | // Attribution 4.0 International License. 36 | // To view a copy of this license, visit 37 | // http://creativecommons.org/licenses/by/4.0/ 38 | // or send a letter to: 39 | // Creative Commons, 40 | // 444 Castro Street, Suite 900, 41 | // Mountain View, California, 94041, USA. 42 | // 43 | //------------------------------------------------------------- 44 | 45 | // It turns out that the compiler generates much better code if 46 | // we "hardwire" this block size. 16 works well for an NVIDIA 47 | // GPU, 32 works well for a CPU 48 | #define blksz 16 49 | 50 | __kernel void mmul( 51 | const unsigned int N, 52 | __global const float* restrict A, 53 | __global const float* restrict B, 54 | __global float* restrict C, 55 | __local float* restrict Awrk, 56 | __local float* restrict Bwrk) 57 | { 58 | int kloc, Kblk; 59 | float Ctmp=0.0f; 60 | 61 | // This work-item will compute element C(i,j) 62 | const int i = get_global_id(0); 63 | const int j = get_global_id(1); 64 | 65 | // Element C(i,j) is in block C(Iblk,Jblk) 66 | const int Iblk = get_group_id(0); 67 | const int Jblk = get_group_id(1); 68 | 69 | // C(i,j) is element C(iloc, jloc) of block C(Iblk, Jblk) 70 | const int iloc = get_local_id(0); 71 | const int jloc = get_local_id(1); 72 | 73 | // The number of blocks are the same in each dimension 74 | const int Num_BLK = N/blksz; 75 | 76 | // Setup the upper-left-corner (base address) for the A and 77 | // B blocks plus the increments to advance base addresses as 78 | // we loop over blocks 79 | int Abase = Jblk*N*blksz; 80 | const int Ainc = blksz; 81 | 82 | int Bbase = Iblk*blksz; 83 | const int Binc = blksz*N; 84 | 85 | 86 | // C(Iblk,Jblk) = (sum over Kblk) A(Iblk,Kblk)*B(Kblk,Jblk) 87 | for (Kblk = 0; Kblk 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | #include "util.hpp" 26 | 27 | #include "matrix_lib.hpp" 28 | 29 | //------------------------------------------------------------------------------ 30 | // functions from ../Common 31 | //------------------------------------------------------------------------------ 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | 
#define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise08/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 10 | // 11 | //------------------------------------------------------------------------------ 12 | 13 | #ifndef __MATRIX_LIB_HDR 14 | #define __MATRIX_LIB_HDR 15 | 16 | 17 | //------------------------------------------------------------------------------ 18 | // 19 | // Function to compute the matrix product (sequential algorithm, dot producdt) 20 | // 21 | //------------------------------------------------------------------------------ 22 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 23 | 24 | //------------------------------------------------------------------------------ 25 | // 26 | // Function to initialize the input matrices A and B 27 | // 28 | //------------------------------------------------------------------------------ 29 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 30 | 31 | //------------------------------------------------------------------------------ 32 | // 33 | // Function to set a matrix to zero 34 | // 35 | //------------------------------------------------------------------------------ 36 | void zero_mat (int N, std::vector &C); 37 | 38 | //------------------------------------------------------------------------------ 39 | // 40 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 41 | // 42 | //------------------------------------------------------------------------------ 43 | void trans(int N, std::vector& B, std::vector& Btrans); 44 | 45 | //------------------------------------------------------------------------------ 46 | // 47 | // Function to compute errors of the product matrix 48 | // 49 | //------------------------------------------------------------------------------ 50 | float error(int N, std::vector& C); 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // 55 | // Function to analyze and output results 56 | // 57 | //------------------------------------------------------------------------------ 58 | void results(int N, std::vector& C, double run_time); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /Solutions/Exercise08/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each multiplication 18 | COUNT = 1 19 | -------------------------------------------------------------------------------- /Solutions/Exercise08/Python/helper.py: 
-------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | 4 | # Function to compute the matrix product (sequential algorithm, dot prod) 5 | def seq_mat_mul_sdot(N, A, B, C): 6 | for i in range(N): 7 | for j in range(N): 8 | tmp = 0.0 9 | for k in range(N): 10 | tmp += A[i*N+k] * B[k*N+j] 11 | C[i*N+j] = tmp 12 | 13 | # Function to compute errors of the product matrix 14 | def error(N, C): 15 | cval = float(N) * AVAL * BVAL 16 | errsq = 0.0 17 | for i in range(N): 18 | for j in range(N): 19 | err = C[i*N+j] - cval 20 | errsq += err * err 21 | return errsq; 22 | 23 | 24 | # Function to analyze and output results 25 | def results(N, C, run_time): 26 | mflops = 2.0 * N * N * N/(1000000.0* run_time) 27 | print run_time, "seconds at", mflops, "MFLOPS" 28 | errsq = error(N, C) 29 | if (errsq > TOL): 30 | print "Errors in multiplication:", errsq 31 | -------------------------------------------------------------------------------- /Solutions/Exercise08/README.md: -------------------------------------------------------------------------------- 1 | Exercise 8 - using local memory 2 | =============================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise09/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -std=c99 7 | 8 | LIBS = -lOpenCL -fopenmp -lm 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Check our platform and make sure we define the APPLE variable 13 | # and set up the right compiler flags and libraries 14 | PLATFORM = $(shell uname -s) 15 | ifeq ($(PLATFORM), Darwin) 16 | LIBS = -framework OpenCL -lm 17 | endif 18 | 19 | 20 | pi_ocl: pi_ocl.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 21 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 22 | 23 | 24 | clean: 25 | rm -f pi_ocl 26 | -------------------------------------------------------------------------------- /Solutions/Exercise09/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | 15 | # Check our platform and make sure we define the APPLE variable 16 | # and set up the right compiler flags and libraries 17 | PLATFORM = $(shell uname -s) 18 | ifeq ($(PLATFORM), Darwin) 19 | CPPC = clang++ 20 | CCFLAGS += -stdlib=libc++ 21 | LIBS = -framework OpenCL 22 | endif 23 | 24 | pi_ocl: pi_ocl.cpp 25 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 26 | 27 | 28 | clean: 29 | rm -f pi_ocl 30 | -------------------------------------------------------------------------------- /Solutions/Exercise09/Python/pi_ocl.py: -------------------------------------------------------------------------------- 1 | # 2 | # Pi reduction 3 | # 4 | # Numeric integration to estimate pi 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, May 2010 8 | # Ported to the C++ Wrapper API by Benedict R. 
Gaster, September 2011 9 | # C++ version Updated by Tom Deakin and Simon McIntosh-Smith, October 2012 10 | # Ported to Python by Tom Deakin, July 2013 11 | # 12 | 13 | 14 | import pyopencl as cl 15 | import numpy 16 | from time import time 17 | 18 | # Some constant values 19 | INSTEPS = 512*512*512 20 | ITERS = 262144 21 | 22 | # Set some default values: 23 | # Default number of steps (updated later to device prefereable) 24 | in_nsteps = INSTEPS 25 | # Default number of iterations 26 | niters = ITERS 27 | 28 | # Create context, queue and build program 29 | context = cl.create_some_context() 30 | queue = cl.CommandQueue(context) 31 | kernelsource = open("../pi_ocl.cl").read() 32 | program = cl.Program(context, kernelsource).build() 33 | pi = program.pi 34 | pi.set_scalar_arg_dtypes([numpy.int32, numpy.float32, None, None]) 35 | 36 | # Get the max work group size for the kernel pi on our device 37 | device = context.devices[0] 38 | work_group_size = program.pi.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, device) 39 | 40 | 41 | # Now that we know the size of the work_groups, we can set the number of work 42 | # groups, the actual number of steps, and the step size 43 | nwork_groups = in_nsteps/(work_group_size*niters) 44 | 45 | if nwork_groups < 1: 46 | nwork_groups = device.max_compute_units 47 | work_group_size = in_nsteps/(nwork_groups*niters) 48 | 49 | nsteps = work_group_size * niters * nwork_groups 50 | step_size = 1.0 / float(nsteps) 51 | 52 | # vector to hold partial sum 53 | h_psum = numpy.empty(nwork_groups).astype(numpy.float32) 54 | 55 | print nwork_groups, "work groups of size", work_group_size, ".", 56 | print nsteps, "Integration steps" 57 | 58 | d_partial_sums = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_psum.nbytes) 59 | 60 | # Start the timer 61 | rtime = time() 62 | 63 | # Execute the kernel over the entire range of our 1d input data et 64 | # using the maximum number of work group items for this device 65 | # Set the global and local size as tuples 66 | global_size = ((nwork_groups * work_group_size),) 67 | local_size = ((work_group_size),) 68 | localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size) 69 | 70 | pi(queue, global_size, local_size, 71 | niters, step_size, 72 | localmem, d_partial_sums) 73 | 74 | cl.enqueue_copy(queue, h_psum, d_partial_sums) 75 | 76 | # complete the sum and compute the final integral value 77 | pi_res = h_psum.sum() * step_size 78 | 79 | # Stop the timer 80 | rtime = time() - rtime 81 | print "The calculation ran in", rtime, "seconds" 82 | print "pi =", pi_res, "for", nsteps, "steps" 83 | 84 | -------------------------------------------------------------------------------- /Solutions/Exercise09/README.md: -------------------------------------------------------------------------------- 1 | Exercise 9 - The Pi program 2 | =========================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise09/pi_ocl.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: pi 4 | // 5 | // Purpose: accumulate partial sums of pi comp 6 | // 7 | // input: float step_size 8 | // int niters per work item 9 | // local float* an array to hold sums from each work item 10 | // 11 | // output: partial_sums float vector of partial sums 12 | // 13 | 14 | 15 | void reduce( 16 | __local float*, 17 | __global float*); 18 | 19 | 20 | 
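// Each work-item accumulates its own partial sum over 'niters'
// consecutive integration steps (istart to iend below); reduce()
// then combines the per-work-item values held in local_sums into
// one entry of partial_sums per work-group.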
__kernel void pi( 21 | const int niters, 22 | const float step_size, 23 | __local float* local_sums, 24 | __global float* partial_sums) 25 | { 26 | int num_wrk_items = get_local_size(0); 27 | int local_id = get_local_id(0); 28 | int group_id = get_group_id(0); 29 | 30 | float x, accum = 0.0f; 31 | int i,istart,iend; 32 | 33 | istart = (group_id * num_wrk_items + local_id) * niters; 34 | iend = istart+niters; 35 | 36 | for(i= istart; i work_group_size: 80 | work_group_size = max_size 81 | nwork_groups = in_nsteps/(work_group_size*niters) 82 | 83 | 84 | if nwork_groups < 1: 85 | nwork_groups = device.max_compute_units 86 | work_group_size = in_nsteps/(nwork_groups*niters) 87 | 88 | nsteps = work_group_size * niters * nwork_groups 89 | step_size = 1.0 / float(nsteps) 90 | 91 | # vector to hold partial sum 92 | h_psum = numpy.empty(nwork_groups).astype(numpy.float32) 93 | 94 | print nwork_groups, "work groups of size", work_group_size, ".", 95 | print nsteps, "Integration steps" 96 | 97 | d_partial_sums = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_psum.nbytes) 98 | 99 | # Start the timer 100 | rtime = time() 101 | 102 | # Execute the kernel over the entire range of our 1d input data et 103 | # using the maximum number of work group items for this device 104 | # Set the global and local size as tuples 105 | global_size = ((nwork_groups * work_group_size),) 106 | local_size = ((work_group_size),) 107 | 108 | localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size) 109 | 110 | pi(queue, global_size, local_size, 111 | niters, 112 | step_size, 113 | localmem, 114 | d_partial_sums) 115 | 116 | cl.enqueue_copy(queue, h_psum, d_partial_sums) 117 | 118 | # complete the sum and compute the final integral value 119 | pi_res = h_psum.sum() * step_size 120 | 121 | # Stop the timer 122 | rtime = time() - rtime 123 | print "The calculation ran in", rtime, "seconds" 124 | print "pi =", pi_res, "for", nsteps, "steps" 125 | 126 | -------------------------------------------------------------------------------- /Solutions/ExerciseA/README.md: -------------------------------------------------------------------------------- 1 | Exercise A - The vectorized Pi program 2 | ====================================== 3 | -------------------------------------------------------------------------------- /Solutions/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # This makefile will produce all the C binaries 3 | # in their respective directories 4 | 5 | CEXES = Exercise04/C/vadd_chain Exercise05/C/vadd_abc \ 6 | Exercise06/C/mult Exercise07/C/mult \ 7 | Exercise08/C/mult Exercise09/C/pi_ocl \ 8 | Exercise13/C/gameoflife ExerciseA/C/pi_vocl 9 | 10 | CPPEXES = Exercise04/Cpp/vadd_chain Exercise05/Cpp/vadd_abc \ 11 | Exercise06/Cpp/mult Exercise07/Cpp/mult \ 12 | Exercise08/Cpp/mult Exercise08/Cpp/pi_ocl \ 13 | Exercise13/Cpp/gameoflife ExerciseA/Cpp/pi_vocl 14 | 15 | # Change this variable to specify the device type in all 16 | # the Makefile to the OpenCL device type of choice 17 | DEVICE = CL_DEVICE_TYPE_DEFAULT 18 | export DEVICE 19 | 20 | # Incase you need to rename the C++ compiler, you can 21 | # do it in bulk here 22 | CPPC = g++ 23 | export CPPC 24 | 25 | ifndef CC 26 | CC = gcc 27 | endif 28 | export CC 29 | 30 | .PHONY : $(CEXES) $(CPEXES) 31 | 32 | all: $(CEXES) $(CPPEXES) 33 | 34 | $(CEXES): 35 | $(MAKE) -C `dirname $@` 36 | 37 | $(CPPEXES): 38 | $(MAKE) -C `dirname $@` 39 | 40 | .PHONY : clean 41 | clean: 42 | for e in $(CEXES) $(CPPEXES); 
do $(MAKE) -C `dirname $$e` clean; done 43 | -------------------------------------------------------------------------------- /Tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HandsOnOpenCL/Exercises-Solutions/be2fb26d7c478627598ffba369014a4adb57b6f4/Tools/.DS_Store -------------------------------------------------------------------------------- /Tools/genErrCode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Usage: ./genErrCode.py /path/to/cl.h > err_code.h 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | if len(sys.argv) != 2: 9 | print("Usage: python genErrCode.py /path/to/cl.h", file = sys.stderr) 10 | sys.exit(1) 11 | 12 | hfile = open(sys.argv[1], "r") 13 | 14 | # Find the start of the error code list 15 | for l in hfile: 16 | if l == "/* Error Codes */\n": 17 | # Found the error code comment 18 | break 19 | 20 | errors = [] 21 | # Loop through the errors and construct the list of errors 22 | for l in hfile: 23 | # Skip if a blank line 24 | if l == "\n": 25 | continue 26 | 27 | tokens = l.split() 28 | # We expect the line to be of the form: 29 | # #define CL_... int 30 | # OpenCL error numbers are 0 or negative 31 | if len(tokens) != 3 or int(tokens[2]) > 0: 32 | # We are done or some error 33 | break 34 | else: 35 | errors.append(tokens[1]) 36 | 37 | # Print out the C file 38 | print(''' 39 | #pragma once 40 | /*---------------------------------------------------------------------------- 41 | * 42 | * Name: err_code() 43 | * 44 | * Purpose: Function to output descriptions of errors for an input error code 45 | * and quit a program on an error with a user message 46 | * 47 | * 48 | * RETURN: echoes the input error code / echos user message and exits 49 | * 50 | * HISTORY: Written by Tim Mattson, June 2010 51 | * This version automatically produced by genErrCode.py 52 | * script written by Tom Deakin, August 2013 53 | * Modified by Bruce Merry, March 2014 54 | * Updated by Tom Deakin, October 2014 55 | * Included the checkError function written by 56 | * James Price and Simon McIntosh-Smith 57 | * 58 | *---------------------------------------------------------------------------- 59 | */ 60 | #if defined(__APPLE__) || defined(__MACOSX) 61 | #include 62 | #else 63 | #include 64 | #endif 65 | 66 | #ifdef __cplusplus 67 | #include 68 | #endif 69 | 70 | const char *err_code (cl_int err_in) 71 | { 72 | switch (err_in) {''') 73 | for err in errors: 74 | print(' case ' + err + ':') 75 | print(' return (char*)"' + err.strip() + '";') 76 | 77 | print(''' 78 | default: 79 | return (char*)"UNKNOWN ERROR"; 80 | } 81 | } 82 | ''') 83 | 84 | # Check error funtion 85 | print(''' 86 | void check_error(cl_int err, const char *operation, char *filename, int line) 87 | { 88 | if (err != CL_SUCCESS) 89 | { 90 | fprintf(stderr, "Error during operation '%s', ", operation); 91 | fprintf(stderr, "in '%s' on line %d\\n", filename, line); 92 | fprintf(stderr, "Error code was \\"%s\\" (%d)\\n", err_code(err), err); 93 | exit(EXIT_FAILURE); 94 | } 95 | } 96 | ''') 97 | 98 | # Macro version of checkError without need for file and line 99 | print(''' 100 | #define checkError(E, S) check_error(E,S,__FILE__,__LINE__) 101 | ''') 102 | 103 | -------------------------------------------------------------------------------- /Tools/stringify_opencl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 
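# Turn an OpenCL kernel source file into a C string constant so the
# kernel can be embedded directly in a host binary.  For example,
#
#   ./stringify_opencl pi_ocl.cl pi_ocl.h
#
# writes a header declaring:  const char *pi_ocl_ocl = "...";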
3 | IN=$1 4 | OUT=$2 5 | NAME=${IN%.cl} 6 | NAME=${NAME##*/} 7 | 8 | echo "const char *"$NAME"_ocl =" >$OUT 9 | sed -e 's/\\/\\\\/g;s/"/\\"/g;s/^/"/;s/$/\\n"/' $IN >>$OUT 10 | echo ";" >>$OUT 11 | --------------------------------------------------------------------------------