├── .gitignore ├── Exercises ├── C_common │ ├── device_info.c │ ├── device_picker.h │ ├── err_code.h │ └── wtime.c ├── Cpp_common │ ├── cl.hpp │ ├── device_picker.hpp │ ├── err_code.h │ ├── util.hpp │ └── wtime.c ├── Exercise01 │ ├── C │ │ ├── DeviceInfo.c │ │ └── Makefile │ ├── Cpp │ │ ├── DeviceInfo.cpp │ │ └── Makefile │ ├── Python │ │ └── DeviceInfo.py │ └── README.md ├── Exercise02 │ ├── C │ │ ├── Makefile │ │ └── vadd_c.c │ └── README.md ├── Exercise03 │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd.cl │ │ └── vadd.cpp │ ├── Python │ │ ├── deviceinfo.py │ │ └── vadd.py │ └── README.md ├── Exercise04 │ ├── C │ │ ├── Makefile │ │ └── vadd_c.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd.cl │ │ └── vadd.cpp │ ├── Python │ │ ├── deviceinfo.py │ │ └── vadd.py │ └── README.md ├── Exercise05 │ ├── C │ │ ├── Makefile │ │ └── vadd_c.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd.cl │ │ └── vadd.cpp │ ├── Python │ │ ├── deviceinfo.py │ │ └── vadd.py │ └── README.md ├── Exercise06 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ └── matrix_lib.h │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise07 │ └── README.md ├── Exercise08 │ └── README.md ├── Exercise09 │ ├── C │ │ ├── Makefile │ │ └── pi.c │ ├── Cpp │ │ ├── Makefile │ │ └── pi.cpp │ ├── Python │ │ └── pi.py │ └── README.md ├── Exercise10 │ └── README.md ├── Exercise11 │ └── README.md ├── Exercise12 │ └── README.md ├── Exercise13 │ ├── C │ │ ├── Makefile │ │ └── gameoflife.c │ ├── CUDA-VADD │ │ ├── Makefile │ │ └── vadd.cu │ ├── CUDA │ │ ├── Makefile │ │ └── gameoflife.cu │ ├── Displayer │ │ ├── Makefile │ │ └── displayer.c │ ├── Examples │ │ ├── Acorn │ │ │ ├── acorn.dat │ │ │ ├── final_state.dat │ │ │ └── input.params │ │ ├── Max │ │ │ ├── final_state.dat │ │ │ ├── input.params │ │ │ └── max.dat │ │ ├── Pulsar │ │ │ ├── final_state.dat │ │ │ ├── input.params │ │ │ └── pulsar.dat │ │ └── QueenBee │ │ │ ├── final_state.dat │ │ │ ├── input.params │ │ │ └── queenbee.dat │ └── README.md ├── ExerciseA │ └── README.md └── Makefile ├── License ├── README.md ├── Solutions ├── C_common │ ├── device_info.c │ ├── device_picker.h │ ├── err_code.h │ └── wtime.c ├── Cpp_common │ ├── cl.hpp │ ├── device_picker.hpp │ ├── err_code.h │ ├── util.hpp │ └── wtime.c ├── Exercise04 │ ├── C │ │ ├── Makefile │ │ └── vadd_chain.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd_chain.cl │ │ └── vadd_chain.cpp │ ├── Python │ │ └── vadd_chain.py │ └── README.md ├── Exercise05 │ ├── C │ │ ├── Makefile │ │ └── vadd_abc.c │ ├── Cpp │ │ ├── Makefile │ │ ├── vadd_abc.cl │ │ └── vadd_abc.cpp │ ├── Python │ │ └── vadd_abc.py │ └── README.md ├── Exercise06 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ └── matrix_lib.h │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise07 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ └── matrix_lib.h │ ├── C_elem.cl │ ├── C_row.cl │ ├── C_row_priv.cl │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise08 │ ├── C │ │ ├── Makefile │ │ ├── matmul.c │ │ ├── matmul.h │ │ ├── matrix_lib.c │ │ 
└── matrix_lib.h │ ├── C_block_form.cl │ ├── C_elem.cl │ ├── C_row.cl │ ├── C_row_priv.cl │ ├── C_row_priv_bloc.cl │ ├── Cpp │ │ ├── Makefile │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matrix_lib.cpp │ │ └── matrix_lib.hpp │ ├── Python │ │ ├── definitions.py │ │ ├── helper.py │ │ └── matmul.py │ └── README.md ├── Exercise09 │ ├── C │ │ ├── Makefile │ │ └── pi_ocl.c │ ├── Cpp │ │ ├── Makefile │ │ └── pi_ocl.cpp │ ├── Python │ │ └── pi_ocl.py │ ├── README.md │ └── pi_ocl.cl ├── Exercise13 │ ├── C │ │ ├── Makefile │ │ └── gameoflife.c │ ├── Cpp │ │ ├── Makefile │ │ └── gameoflife.cpp │ ├── Python │ │ └── gameoflife.py │ ├── README.md │ └── gameoflife.cl ├── ExerciseA │ ├── C │ │ ├── Makefile │ │ └── pi_vocl.c │ ├── Cpp │ │ ├── Makefile │ │ └── pi_vocl.cpp │ ├── Python │ │ └── pi_vocl.py │ ├── README.md │ └── pi_vocl.cl └── Makefile └── Tools ├── .DS_Store ├── genErrCode.py └── stringify_opencl /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | 3 | *.pyc 4 | 5 | # C 6 | *.o 7 | 8 | # Produced binarys 9 | Exercises/Exercise01/C/DeviceInfo 10 | Exercises/Exercise02/C/vadd 11 | Exercises/Exercise04/C/vadd 12 | Exercises/Exercise05/C/vadd 13 | Exercises/Exercise06/C/mult 14 | Exercises/Exercise09/C/pi 15 | Exercises/Exercise13/C/gameoflife 16 | Exercises/Exercise13/CUDA/gameoflife 17 | Exercises/Exercise13/CUDA-VADD/vadd 18 | Exercises/Exercise13/Displayer/displayer 19 | 20 | Exercises/Exercise01/Cpp/DeviceInfo 21 | Exercises/Exercise03/Cpp/vadd 22 | Exercises/Exercise04/Cpp/vadd 23 | Exercises/Exercise05/Cpp/vadd 24 | Exercises/Exercise06/Cpp/mult 25 | Exercises/Exercise09/Cpp/pi 26 | 27 | Solutions/Exercise04/C/vadd_chain 28 | Solutions/Exercise05/C/vadd_abc 29 | Solutions/Exercise06/C/mult 30 | Solutions/Exercise07/C/mult 31 | Solutions/Exercise08/C/mult 32 | Solutions/Exercise09/C/pi_ocl 33 | Solutions/Exercise13/C/gameoflife 34 | Solutions/ExerciseA/C/pi_vocl 35 | 36 | Solutions/Exercise04/Cpp/vadd_chain 37 | Solutions/Exercise05/Cpp/vadd_abc 38 | Solutions/Exercise06/Cpp/mult 39 | Solutions/Exercise07/Cpp/mult 40 | Solutions/Exercise08/Cpp/mult 41 | Solutions/Exercise09/Cpp/pi_ocl 42 | Solutions/Exercise13/Cpp/gameoflife 43 | Solutions/ExerciseA/Cpp/pi_vocl 44 | 45 | 46 | *.plist 47 | .DS_Store 48 | 49 | Exercises/Exercise01/C/DeviceInfo.dSYM/Contents/Resources/DWARF/DeviceInfo 50 | 51 | 52 | Solutions/Exercise06/Cpp/Makefile.tmp 53 | 54 | Solutions/ExerciseA/C/pi_vocl.dSYM/Contents/Resources/DWARF/pi_vocl 55 | -------------------------------------------------------------------------------- /Exercises/C_common/device_picker.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.h 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #define MAX_PLATFORMS 8 20 | #define MAX_DEVICES 16 21 | #define MAX_INFO_STRING 256 22 | 23 | 24 | unsigned getDeviceList(cl_device_id devices[MAX_DEVICES]) 25 | { 26 | cl_int err; 27 | 28 | // Get list of platforms 29 | cl_uint numPlatforms = 0; 30 | cl_platform_id platforms[MAX_PLATFORMS]; 31 | err = 
clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms); 32 | checkError(err, "getting platforms"); 33 | 34 | // Enumerate devices 35 | unsigned numDevices = 0; 36 | for (int i = 0; i < numPlatforms; i++) 37 | { 38 | cl_uint num = 0; 39 | err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 40 | MAX_DEVICES-numDevices, devices+numDevices, &num); 41 | checkError(err, "getting devices"); 42 | numDevices += num; 43 | } 44 | 45 | return numDevices; 46 | } 47 | 48 | void getDeviceName(cl_device_id device, char name[MAX_INFO_STRING]) 49 | { 50 | cl_device_info info = CL_DEVICE_NAME; 51 | 52 | // Special case for AMD 53 | #ifdef CL_DEVICE_BOARD_NAME_AMD 54 | clGetDeviceInfo(device, CL_DEVICE_VENDOR, MAX_INFO_STRING, name, NULL); 55 | if (strstr(name, "Advanced Micro Devices")) 56 | info = CL_DEVICE_BOARD_NAME_AMD; 57 | #endif 58 | 59 | clGetDeviceInfo(device, info, MAX_INFO_STRING, name, NULL); 60 | } 61 | 62 | 63 | int parseUInt(const char *str, cl_uint *output) 64 | { 65 | char *next; 66 | *output = strtoul(str, &next, 10); 67 | return !strlen(next); 68 | } 69 | 70 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 71 | { 72 | for (int i = 1; i < argc; i++) 73 | { 74 | if (!strcmp(argv[i], "--list")) 75 | { 76 | // Get list of devices 77 | cl_device_id devices[MAX_DEVICES]; 78 | unsigned numDevices = getDeviceList(devices); 79 | 80 | // Print device names 81 | if (numDevices == 0) 82 | { 83 | printf("No devices found.\n"); 84 | } 85 | else 86 | { 87 | printf("\n"); 88 | printf("Devices:\n"); 89 | for (int i = 0; i < numDevices; i++) 90 | { 91 | char name[MAX_INFO_STRING]; 92 | getDeviceName(devices[i], name); 93 | printf("%2d: %s\n", i, name); 94 | } 95 | printf("\n"); 96 | } 97 | exit(0); 98 | } 99 | else if (!strcmp(argv[i], "--device")) 100 | { 101 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 102 | { 103 | printf("Invalid device index\n"); 104 | exit(1); 105 | } 106 | } 107 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 108 | { 109 | printf("\n"); 110 | printf("Usage: ./program [OPTIONS]\n\n"); 111 | printf("Options:\n"); 112 | printf(" -h --help Print this message\n"); 113 | printf(" --list List available devices\n"); 114 | printf(" --device INDEX Select device at INDEX\n"); 115 | printf("\n"); 116 | exit(0); 117 | } 118 | } 119 | } 120 | 121 | -------------------------------------------------------------------------------- /Exercises/C_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Exercises/Cpp_common/device_picker.hpp: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.hpp 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, 
October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #define MAX_INFO_STRING 256 21 | 22 | 23 | unsigned getDeviceList(std::vector& devices) 24 | { 25 | cl_int err; 26 | 27 | // Get list of platforms 28 | std::vector platforms; 29 | cl::Platform::get(&platforms); 30 | 31 | // Enumerate devices 32 | for (int i = 0; i < platforms.size(); i++) 33 | { 34 | cl_uint num = 0; 35 | std::vector plat_devices; 36 | platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); 37 | devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); 38 | } 39 | 40 | return devices.size(); 41 | } 42 | 43 | void getDeviceName(cl::Device& device, std::string& name) 44 | { 45 | cl_device_info info = CL_DEVICE_NAME; 46 | 47 | // Special case for AMD 48 | #ifdef CL_DEVICE_BOARD_NAME_AMD 49 | device.getInfo(CL_DEVICE_VENDOR, &name); 50 | if (strstr(name.c_str(), "Advanced Micro Devices")) 51 | info = CL_DEVICE_BOARD_NAME_AMD; 52 | #endif 53 | 54 | device.getInfo(info, &name); 55 | } 56 | 57 | 58 | int parseUInt(const char *str, cl_uint *output) 59 | { 60 | char *next; 61 | *output = strtoul(str, &next, 10); 62 | return !strlen(next); 63 | } 64 | 65 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 66 | { 67 | for (int i = 1; i < argc; i++) 68 | { 69 | if (!strcmp(argv[i], "--list")) 70 | { 71 | // Get list of devices 72 | std::vector devices; 73 | unsigned numDevices = getDeviceList(devices); 74 | 75 | // Print device names 76 | if (numDevices == 0) 77 | { 78 | std::cout << "No devices found.\n"; 79 | } 80 | else 81 | { 82 | std::cout << "\nDevices:\n"; 83 | for (int i = 0; i < numDevices; i++) 84 | { 85 | std::string name; 86 | getDeviceName(devices[i], name); 87 | std::cout << i << ": " << name << "\n"; 88 | } 89 | std::cout << "\n"; 90 | } 91 | exit(0); 92 | } 93 | else if (!strcmp(argv[i], "--device")) 94 | { 95 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 96 | { 97 | std::cout << "Invalid device index\n"; 98 | exit(1); 99 | } 100 | } 101 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 102 | { 103 | std::cout << "\n"; 104 | std::cout << "Usage: ./program [OPTIONS]\n\n"; 105 | std::cout << "Options:\n"; 106 | std::cout << " -h --help Print the message\n"; 107 | std::cout << " --list List available devices\n"; 108 | std::cout << " --device INDEX Select device at INDEX\n"; 109 | std::cout << "\n"; 110 | exit(0); 111 | } 112 | } 113 | } 114 | 115 | -------------------------------------------------------------------------------- /Exercises/Cpp_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Exercises/Exercise01/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-std=c99 7 | 8 | LIBS = -lOpenCL 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Check our platform and make sure we define 
the APPLE variable 13 | # and set up the right compiler flags and libraries 14 | PLATFORM = $(shell uname -s) 15 | ifeq ($(PLATFORM), Darwin) 16 | LIBS = -framework OpenCL 17 | endif 18 | 19 | DeviceInfo: DeviceInfo.c 20 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 21 | 22 | 23 | clean: 24 | rm -f DeviceInfo 25 | -------------------------------------------------------------------------------- /Exercises/Exercise01/Cpp/DeviceInfo.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Display Device Information 3 | * 4 | * Script to print out some information about the OpenCL devices 5 | * and platforms available on your system 6 | * 7 | * History: C++ version written by Tom Deakin, 2012 8 | * Updated by Tom Deakin, August 2013 9 | */ 10 | 11 | #define __CL_ENABLE_EXCEPTIONS 12 | 13 | #include "cl.hpp" 14 | #include 15 | #include 16 | 17 | #include 18 | 19 | int main(void) 20 | { 21 | 22 | try 23 | { 24 | // Discover number of platforms 25 | std::vector platforms; 26 | cl::Platform::get(&platforms); 27 | std::cout << "\nNumber of OpenCL platforms: " << platforms.size() << std::endl; 28 | 29 | // Investigate each platform 30 | std::cout << "\n-------------------------" << std::endl; 31 | for (std::vector::iterator plat = platforms.begin(); plat != platforms.end(); plat++) 32 | { 33 | std::string s; 34 | plat->getInfo(CL_PLATFORM_NAME, &s); 35 | std::cout << "Platform: " << s << std::endl; 36 | 37 | plat->getInfo(CL_PLATFORM_VENDOR, &s); 38 | std::cout << "\tVendor: " << s << std::endl; 39 | 40 | plat->getInfo(CL_PLATFORM_VERSION, &s); 41 | std::cout << "\tVersion: " << s << std::endl; 42 | 43 | // Discover number of devices 44 | std::vector devices; 45 | plat->getDevices(CL_DEVICE_TYPE_ALL, &devices); 46 | std::cout << "\n\tNumber of devices: " << devices.size() << std::endl; 47 | 48 | // Investigate each device 49 | for (std::vector::iterator dev = devices.begin(); dev != devices.end(); dev++ ) 50 | { 51 | std::cout << "\t-------------------------" << std::endl; 52 | 53 | dev->getInfo(CL_DEVICE_NAME, &s); 54 | std::cout << "\t\tName: " << s << std::endl; 55 | 56 | dev->getInfo(CL_DEVICE_OPENCL_C_VERSION, &s); 57 | std::cout << "\t\tVersion: " << s << std::endl; 58 | 59 | int i; 60 | dev->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &i); 61 | std::cout << "\t\tMax. 
Compute Units: " << i << std::endl; 62 | 63 | size_t size; 64 | dev->getInfo(CL_DEVICE_LOCAL_MEM_SIZE, &size); 65 | std::cout << "\t\tLocal Memory Size: " << size/1024 << " KB" << std::endl; 66 | 67 | dev->getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &size); 68 | std::cout << "\t\tGlobal Memory Size: " << size/(1024*1024) << " MB" << std::endl; 69 | 70 | dev->getInfo(CL_DEVICE_MAX_MEM_ALLOC_SIZE, &size); 71 | std::cout << "\t\tMax Alloc Size: " << size/(1024*1024) << " MB" << std::endl; 72 | 73 | dev->getInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE, &size); 74 | std::cout << "\t\tMax Work-group Total Size: " << size << std::endl; 75 | 76 | std::vector d; 77 | dev->getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &d); 78 | std::cout << "\t\tMax Work-group Dims: ("; 79 | for (std::vector::iterator st = d.begin(); st != d.end(); st++) 80 | std::cout << *st << " "; 81 | std::cout << "\x08)" << std::endl; 82 | 83 | std::cout << "\t-------------------------" << std::endl; 84 | 85 | } 86 | 87 | std::cout << "\n-------------------------\n"; 88 | } 89 | 90 | } 91 | catch (cl::Error err) 92 | { 93 | std::cout << "OpenCL Error: " << err.what() << " returned " << err_code(err.err()) << std::endl; 94 | std::cout << "Check cl.h for error codes." << std::endl; 95 | exit(-1); 96 | } 97 | 98 | return 0; 99 | 100 | } 101 | -------------------------------------------------------------------------------- /Exercises/Exercise01/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL 13 | 14 | # Check our platform and make sure we define the APPLE variable 15 | # and set up the right compiler flags and libraries 16 | PLATFORM = $(shell uname -s) 17 | ifeq ($(PLATFORM), Darwin) 18 | CPPC = clang++ 19 | LIBS = -framework OpenCL 20 | endif 21 | 22 | DeviceInfo: DeviceInfo.cpp 23 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 24 | 25 | 26 | clean: 27 | rm -f DeviceInfo 28 | -------------------------------------------------------------------------------- /Exercises/Exercise01/Python/DeviceInfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Display Device Information 3 | # 4 | # Script to print out some information about the OpenCL devices 5 | # and platforms available on your system 6 | # 7 | # History: C++ version written by Tom Deakin, 2012 8 | # Ported to Python by Tom Deakin, July 2013 9 | # 10 | 11 | # Import the Python OpenCL API 12 | import pyopencl as cl 13 | 14 | # Create a list of all the platform IDs 15 | platforms = cl.get_platforms() 16 | 17 | print "\nNumber of OpenCL platforms:", len(platforms) 18 | 19 | print "\n-------------------------" 20 | 21 | # Investigate each platform 22 | for p in platforms: 23 | # Print out some information about the platforms 24 | print "Platform:", p.name 25 | print "Vendor:", p.vendor 26 | print "Version:", p.version 27 | 28 | # Discover all devices 29 | devices = p.get_devices() 30 | print "Number of devices:", len(devices) 31 | 32 | # Investigate each device 33 | for d in devices: 34 | print "\t-------------------------" 35 | # Print out some information about the devices 36 | print "\t\tName:", d.name 37 | print "\t\tVersion:", d.opencl_c_version 38 | print "\t\tMax. 
Compute Units:", d.max_compute_units 39 | print "\t\tLocal Memory Size:", d.local_mem_size/1024, "KB" 40 | print "\t\tGlobal Memory Size:", d.global_mem_size/(1024*1024), "MB" 41 | print "\t\tMax Alloc Size:", d.max_mem_alloc_size/(1024*1024), "MB" 42 | print "\t\tMax Work-group Total Size:", d.max_work_group_size 43 | 44 | # Find the maximum dimensions of the work-groups 45 | dim = d.max_work_item_sizes 46 | print "\t\tMax Work-group Dims:(", dim[0], " ".join(map(str, dim[1:])), ")" 47 | 48 | print "\t-------------------------" 49 | 50 | print "\n-------------------------" 51 | -------------------------------------------------------------------------------- /Exercises/Exercise01/README.md: -------------------------------------------------------------------------------- 1 | Exercise 1 - Platform Information 2 | ================================= 3 | 4 | Goal 5 | ---- 6 | * Verify that you can run the OpenCL environment you'll be using in this tutorial. 7 | Specifically, can you run a simple OpenCL program. 8 | 9 | Procedure 10 | --------- 11 | * Take the program we provide (`DeviceInfo`), inspect it in the editor of your choice, build the program and run it. 12 | 13 | Expected output 14 | --------------- 15 | * Information about the installed OpenCL platforms and devices visible to them. 16 | 17 | Extension 18 | --------- 19 | * Run the command `clinfo` which comes with the AMD implementation. 20 | This outputs all the information the OpenCL runtime can find out about devices and platforms. 21 | 22 | -------------------------------------------------------------------------------- /Exercises/Exercise02/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd: vadd_c.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd 34 | -------------------------------------------------------------------------------- /Exercises/Exercise02/README.md: -------------------------------------------------------------------------------- 1 | Exercise 2 - Running the Vadd kernel 2 | ==================================== 3 | 4 | Goal 5 | ---- 6 | * To inspect and verify that you can run an OpenCL kernel. 7 | 8 | Procedure 9 | --------- 10 | * Take the C Vadd program we provide you. 11 | It will run a simple kernel to add two vectors together. 12 | * Look at the host code and identify the API calls in the host code. 13 | Compare them against the API descriptions on the OpenCL reference card. 14 | * There are some helper files which time the execution, output device information neatly and check errors. 15 | 16 | Expected output 17 | --------------- 18 | * A message verifying that the vector addition completed successfully. 
19 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS=-std=c++11 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd: vadd.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd 38 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Cpp/vadd.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Exercises/Exercise03/Cpp/vadd.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd_cpp.cpp 4 | // 5 | // Purpose: Elementwise addition of two vectors (c = a + b) 6 | // 7 | // c = a + b 8 | // 9 | // HISTORY: Written by Tim Mattson, June 2011 10 | // Ported to C++ Wrapper API by Benedict Gaster, September 2011 11 | // Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 12 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 13 | // 14 | //------------------------------------------------------------------------------ 15 | 16 | #define __CL_ENABLE_EXCEPTIONS 17 | 18 | #include "cl.hpp" 19 | 20 | #include "util.hpp" // utility library 21 | 22 | #include "err_code.h" 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include 30 | #include 31 | 32 | // pick up device type from compiler command line or from the default type 33 | #ifndef DEVICE 34 | #define DEVICE CL_DEVICE_TYPE_DEFAULT 35 | #endif 36 | 37 | //------------------------------------------------------------------------------ 38 | 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define LENGTH (1024) // length of vectors a, b, and c 41 | 42 | int main(void) 43 | { 44 | std::vector h_a(LENGTH); // a vector 45 | std::vector h_b(LENGTH); // b vector 46 | std::vector h_c (LENGTH, 0xdeadbeef); // c = a + b, from compute device 47 | 48 | cl::Buffer d_a; // device memory used for the input a vector 49 | cl::Buffer d_b; // device memory used for the input b vector 50 | cl::Buffer 
d_c; // device memory used for the output c vector 51 | 52 | // Fill vectors a and b with random float values 53 | int count = LENGTH; 54 | for(int i = 0; i < count; i++) 55 | { 56 | h_a[i] = rand() / (float)RAND_MAX; 57 | h_b[i] = rand() / (float)RAND_MAX; 58 | } 59 | 60 | try 61 | { 62 | // Create a context 63 | cl::Context context(DEVICE); 64 | 65 | // Load in kernel source, creating a program object for the context 66 | 67 | cl::Program program(context, util::loadProgram("vadd.cl"), true); 68 | 69 | // Get the command queue 70 | cl::CommandQueue queue(context); 71 | 72 | // Create the kernel functor 73 | 74 | auto vadd = cl::make_kernel(program, "vadd"); 75 | 76 | d_a = cl::Buffer(context, begin(h_a), end(h_a), true); 77 | d_b = cl::Buffer(context, begin(h_b), end(h_b), true); 78 | 79 | d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); 80 | 81 | util::Timer timer; 82 | 83 | vadd( 84 | cl::EnqueueArgs( 85 | queue, 86 | cl::NDRange(count)), 87 | d_a, 88 | d_b, 89 | d_c, 90 | count); 91 | 92 | queue.finish(); 93 | 94 | double rtime = static_cast(timer.getTimeMilliseconds()) / 1000.0; 95 | printf("\nThe kernels ran in %lf seconds\n", rtime); 96 | 97 | cl::copy(queue, d_c, begin(h_c), end(h_c)); 98 | 99 | // Test the results 100 | int correct = 0; 101 | float tmp; 102 | for(int i = 0; i < count; i++) { 103 | tmp = h_a[i] + h_b[i]; // expected value for d_c[i] 104 | tmp -= h_c[i]; // compute errors 105 | if(tmp*tmp < TOL*TOL) { // correct if square deviation is less 106 | correct++; // than tolerance squared 107 | } 108 | else { 109 | 110 | printf( 111 | " tmp %f h_a %f h_b %f h_c %f \n", 112 | tmp, 113 | h_a[i], 114 | h_b[i], 115 | h_c[i]); 116 | } 117 | } 118 | 119 | // summarize results 120 | printf( 121 | "vector add to find C = A+B: %d out of %d results were correct.\n", 122 | correct, 123 | count); 124 | } 125 | catch (cl::Error err) { 126 | std::cout << "Exception\n"; 127 | std::cerr 128 | << "ERROR: " 129 | << err.what() 130 | << "(" 131 | << err_code(err.err()) 132 | << ")" 133 | << std::endl; 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Python/deviceinfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Device Info 3 | # 4 | # Function to output key parameters about the input OpenCL device 5 | # 6 | # History: C version written by Tim Mattson, June 2010 7 | # Ported to Python by Tom Deakin, July 2013 8 | # 9 | 10 | import pyopencl as cl 11 | import sys 12 | 13 | def output_device_info(device_id): 14 | sys.stdout.write("Device is ") 15 | sys.stdout.write(device_id.name) 16 | if device_id.type == cl.device_type.GPU: 17 | sys.stdout.write("GPU from ") 18 | elif device_id.type == cl.device_type.CPU: 19 | sys.stdout.write("CPU from ") 20 | else: 21 | sys.stdout.write("non CPU of GPU processor from ") 22 | sys.stdout.write(device_id.vendor) 23 | sys.stdout.write(" with a max of ") 24 | sys.stdout.write(str(device_id.max_compute_units)) 25 | sys.stdout.write(" compute units\n") 26 | sys.stdout.flush() 27 | -------------------------------------------------------------------------------- /Exercises/Exercise03/Python/vadd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors (c = a + b) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, December 2009 8 | # C version Updated by Tom Deakin and Simon 
McIntosh-Smith, October 2012 9 | # Ported to Python by Tom Deakin, July 2013 10 | # 11 | 12 | # Import the Python OpenCL API 13 | import pyopencl as cl 14 | # Import the Python Maths Library (for vectors) 15 | import numpy 16 | 17 | # Import a library to print out the device information 18 | import deviceinfo 19 | 20 | # Import Standard Library to time the execution 21 | from time import time 22 | #------------------------------------------------------------------------------ 23 | 24 | # tolerance used in floating point comparisons 25 | TOL = 0.001 26 | # length of vectors a, b and c 27 | LENGTH = 1024 28 | 29 | #------------------------------------------------------------------------------ 30 | # 31 | # Kernel: vadd 32 | # 33 | # To compute the elementwise sum c = a + b 34 | # 35 | # Input: a and b float vectors of length count 36 | # Output c float vector of length count holding the sum a + b 37 | 38 | kernelsource = """ 39 | __kernel void vadd( 40 | __global float* a, 41 | __global float* b, 42 | __global float* c, 43 | const unsigned int count) 44 | { 45 | int i = get_global_id(0); 46 | if (i < count) 47 | c[i] = a[i] + b[i]; 48 | } 49 | """ 50 | 51 | #------------------------------------------------------------------------------ 52 | 53 | # Main procedure 54 | 55 | # Create a compute context 56 | # Ask the user to select a platform/device on the CLI 57 | context = cl.create_some_context() 58 | 59 | # Print out device info 60 | deviceinfo.output_device_info(context.devices[0]) 61 | 62 | # Create a command queue 63 | queue = cl.CommandQueue(context) 64 | 65 | # Create the compute program from the source buffer 66 | # and build it 67 | program = cl.Program(context, kernelsource).build() 68 | 69 | # Create a and b vectors and fill with random float values 70 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 72 | # Create an empty c vector (a+b) to be returned from the compute device 73 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 74 | 75 | # Create the input (a, b) arrays in device memory and copy data from host 76 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 77 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 78 | # Create the output (c) array in device memory 79 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 80 | 81 | # Start the timer 82 | rtime = time() 83 | 84 | # Execute the kernel over the entire range of our 1d input 85 | # allowing OpenCL runtime to select the work group items for the device 86 | vadd = program.vadd 87 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 88 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 89 | 90 | # Wait for the commands to finish before reading back 91 | queue.finish() 92 | rtime = time() - rtime 93 | print "The kernel ran in", rtime, "seconds" 94 | 95 | # Read back the results from the compute device 96 | cl.enqueue_copy(queue, h_c, d_c) 97 | 98 | # Test the results 99 | correct = 0; 100 | for a, b, c in zip(h_a, h_b, h_c): 101 | # assign element i of a+b to tmp 102 | tmp = a + b 103 | # compute the deviation of expected and output result 104 | tmp -= c 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c 110 | 111 | # Summarize results 112 | print "C = A+B:", correct, "out of", LENGTH, "results were correct." 
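# ------------------------------------------------------------------------------
# Aside (illustrative only, not part of the original exercise): pyopencl.array
# can express the same element-wise addition far more compactly, reusing the
# context and queue created above. It is left as a comment so it does not
# change what this script does when run.
#
#   import pyopencl.array as cl_array
#
#   d_a = cl_array.to_device(queue, h_a)   # copy host vectors to the device
#   d_b = cl_array.to_device(queue, h_b)
#   d_c = d_a + d_b                        # element-wise add runs on the device
#   h_c = d_c.get()                        # copy the result back to the host
# ------------------------------------------------------------------------------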
113 | -------------------------------------------------------------------------------- /Exercises/Exercise03/README.md: -------------------------------------------------------------------------------- 1 | Exercise 3 - Running the Vadd kernel (C++/Python) 2 | ================================================= 3 | 4 | Goal 5 | ---- 6 | * To learn the C++ and/or Python interface to OpenCL's API 7 | 8 | Procedure 9 | --------- 10 | * Examine the C++ or Python program we provide you. 11 | It will run a simple kernel to add two vectors together. 12 | * Look at the host code and identify the API calls in the host code. 13 | Note how some of the API calls in OpenCL map onto C++/Python constructs. 14 | * Compare the original C version with the C++/Python versions 15 | * Look at the simplicity of the common API calls 16 | 17 | Expected output 18 | --------------- 19 | * A message verifying that the vector addition completed successfully. 20 | -------------------------------------------------------------------------------- /Exercises/Exercise04/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd: vadd_c.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd 34 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 
17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd: vadd.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd 38 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Cpp/vadd.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Exercises/Exercise04/Cpp/vadd.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd_cpp.cpp 4 | // 5 | // Purpose: Elementwise addition of two vectors (c = a + b) 6 | // 7 | // c = a + b 8 | // 9 | // HISTORY: Written by Tim Mattson, June 2011 10 | // Ported to C++ Wrapper API by Benedict Gaster, September 2011 11 | // Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 12 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 13 | // 14 | //------------------------------------------------------------------------------ 15 | 16 | #define __CL_ENABLE_EXCEPTIONS 17 | 18 | #include "cl.hpp" 19 | 20 | #include "util.hpp" // utility library 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | // pick up device type from compiler command line or from the default type 31 | #ifndef DEVICE 32 | #define DEVICE CL_DEVICE_TYPE_DEFAULT 33 | #endif 34 | 35 | #include 36 | 37 | //------------------------------------------------------------------------------ 38 | 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define LENGTH (1024) // length of vectors a, b, and c 41 | 42 | int main(void) 43 | { 44 | std::vector h_a(LENGTH); // a vector 45 | std::vector h_b(LENGTH); // b vector 46 | std::vector h_c(LENGTH, 0xdeadbeef); // c = a + b, from compute device 47 | 48 | cl::Buffer d_a; // device memory used for the input a vector 49 | cl::Buffer d_b; // device memory used for the input b vector 50 | cl::Buffer d_c; // device memory used for the output c vector 51 | 52 | // Fill vectors a and b with random float values 53 | int count = LENGTH; 54 | for(int i = 0; i < count; i++) 55 | { 56 | h_a[i] = rand() / (float)RAND_MAX; 57 | h_b[i] = rand() / (float)RAND_MAX; 58 | } 59 | 60 | try 61 | { 62 | // Create a context 63 | cl::Context context(DEVICE); 64 | 65 | // Load in kernel source, creating a program object for the context 66 | 67 | cl::Program program(context, util::loadProgram("vadd.cl"), true); 68 | 69 | // Get the command queue 70 | 
cl::CommandQueue queue(context); 71 | 72 | // Create the kernel functor 73 | 74 | cl::make_kernel vadd(program, "vadd"); 75 | 76 | d_a = cl::Buffer(context, h_a.begin(), h_a.end(), true); 77 | d_b = cl::Buffer(context, h_b.begin(), h_b.end(), true); 78 | 79 | d_c = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); 80 | 81 | util::Timer timer; 82 | 83 | vadd( 84 | cl::EnqueueArgs( 85 | queue, 86 | cl::NDRange(count)), 87 | d_a, 88 | d_b, 89 | d_c, 90 | count); 91 | 92 | queue.finish(); 93 | 94 | double rtime = static_cast(timer.getTimeMilliseconds()) / 1000.0; 95 | printf("\nThe kernels ran in %lf seconds\n", rtime); 96 | 97 | cl::copy(queue, d_c, h_c.begin(), h_c.end()); 98 | 99 | // Test the results 100 | int correct = 0; 101 | float tmp; 102 | for(int i = 0; i < count; i++) { 103 | tmp = h_a[i] + h_b[i]; // expected value for d_c[i] 104 | tmp -= h_c[i]; // compute errors 105 | if(tmp*tmp < TOL*TOL) { // correct if square deviation is less 106 | correct++; // than tolerance squared 107 | } 108 | else { 109 | 110 | printf( 111 | " tmp %f h_a %f h_b %f h_c %f \n", 112 | tmp, 113 | h_a[i], 114 | h_b[i], 115 | h_c[i]); 116 | } 117 | } 118 | 119 | // summarize results 120 | printf( 121 | "vector add to find C = A+B: %d out of %d results were correct.\n", 122 | correct, 123 | count); 124 | } 125 | catch (cl::Error err) { 126 | std::cout << "Exception\n"; 127 | std::cerr 128 | << "ERROR: " 129 | << err.what() 130 | << "(" 131 | << err_code(err.err()) 132 | << ")" 133 | << std::endl; 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Python/deviceinfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Device Info 3 | # 4 | # Function to output key parameters about the input OpenCL device 5 | # 6 | # History: C version written by Tim Mattson, June 2010 7 | # Ported to Python by Tom Deakin, July 2013 8 | # 9 | 10 | import pyopencl as cl 11 | import sys 12 | 13 | def output_device_info(device_id): 14 | sys.stdout.write("Device is ") 15 | sys.stdout.write(device_id.name) 16 | if device_id.type == cl.device_type.GPU: 17 | sys.stdout.write("GPU from ") 18 | elif device_id.type == cl.device_type.CPU: 19 | sys.stdout.write("CPU from ") 20 | else: 21 | sys.stdout.write("non CPU of GPU processor from ") 22 | sys.stdout.write(device_id.vendor) 23 | sys.stdout.write(" with a max of ") 24 | sys.stdout.write(str(device_id.max_compute_units)) 25 | sys.stdout.write(" compute units\n") 26 | sys.stdout.flush() 27 | -------------------------------------------------------------------------------- /Exercises/Exercise04/Python/vadd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors (c = a + b) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, December 2009 8 | # C version Updated by Tom Deakin and Simon McIntosh-Smith, October 2012 9 | # Ported to Python by Tom Deakin, July 2013 10 | # 11 | 12 | # Import the Python OpenCL API 13 | import pyopencl as cl 14 | # Import the Python Maths Library (for vectors) 15 | import numpy 16 | 17 | # Import a library to print out the device information 18 | import deviceinfo 19 | 20 | # Import Standard Library to time the execution 21 | from time import time 22 | #------------------------------------------------------------------------------ 23 | 24 | # tolerance used in floating point comparisons 25 
| TOL = 0.001 26 | # length of vectors a, b and c 27 | LENGTH = 1024 28 | 29 | #------------------------------------------------------------------------------ 30 | # 31 | # Kernel: vadd 32 | # 33 | # To compute the elementwise sum c = a + b 34 | # 35 | # Input: a and b float vectors of length count 36 | # Output c float vector of length count holding the sum a + b 37 | 38 | kernelsource = """ 39 | __kernel void vadd( 40 | __global float* a, 41 | __global float* b, 42 | __global float* c, 43 | const unsigned int count) 44 | { 45 | int i = get_global_id(0); 46 | if (i < count) 47 | c[i] = a[i] + b[i]; 48 | } 49 | """ 50 | 51 | #------------------------------------------------------------------------------ 52 | 53 | # Main procedure 54 | 55 | # Create a compute context 56 | # Ask the user to select a platform/device on the CLI 57 | context = cl.create_some_context() 58 | 59 | # Print out device info 60 | deviceinfo.output_device_info(context.devices[0]) 61 | 62 | # Create a command queue 63 | queue = cl.CommandQueue(context) 64 | 65 | # Create the compute program from the source buffer 66 | # and build it 67 | program = cl.Program(context, kernelsource).build() 68 | 69 | # Create a and b vectors and fill with random float values 70 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 72 | # Create an empty c vector (a+b) to be returned from the compute device 73 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 74 | 75 | # Create the input (a, b) arrays in device memory and copy data from host 76 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 77 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 78 | # Create the output (c) array in device memory 79 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 80 | 81 | # Start the timer 82 | rtime = time() 83 | 84 | # Execute the kernel over the entire range of our 1d input 85 | # allowing OpenCL runtime to select the work group items for the device 86 | vadd = program.vadd 87 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 88 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 89 | 90 | # Wait for the commands to finish before reading back 91 | queue.finish() 92 | rtime = time() - rtime 93 | print "The kernel ran in", rtime, "seconds" 94 | 95 | # Read back the results from the compute device 96 | cl.enqueue_copy(queue, h_c, d_c) 97 | 98 | # Test the results 99 | correct = 0; 100 | for a, b, c in zip(h_a, h_b, h_c): 101 | # assign element i of a+b to tmp 102 | tmp = a + b 103 | # compute the deviation of expected and output result 104 | tmp -= c 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c 110 | 111 | # Summarize results 112 | print "C = A+B:", correct, "out of", LENGTH, "results were correct." 113 | -------------------------------------------------------------------------------- /Exercises/Exercise04/README.md: -------------------------------------------------------------------------------- 1 | Exercise 4 - Chaining vector add kernels (C++/Python) 2 | ===================================================== 3 | 4 | Goal 5 | ---- 6 | * To verify that you understand manipulating kernel invocations and buffers in OpenCL. 7 | 8 | Procedure 9 | --------- 10 | * Start with your VADD program in C++ or Python. 
11 | * Add additional buffer objects and assign them to vectors defined on the host 12 | (see the provided vadd programs for examples of how to do this). 13 | * Chain vadds ... e.g. C=A+B; D=C+E; F=D+G. 14 | * Read back the final result and verify that this is correct. 15 | * Compare the complexity of your host code to C. 16 | 17 | Expected output 18 | --------------- 19 | * A message to standard output verifying that the chain of vector additions produced the correct result. 20 | 21 | Note 22 | ---- 23 | 24 | Sample solution is for C = A + B; D = C + E; F = D + G; return F 25 | -------------------------------------------------------------------------------- /Exercises/Exercise05/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd: vadd_c.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd 34 | -------------------------------------------------------------------------------- /Exercises/Exercise05/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS=-std=c++11 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 
17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd: vadd.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd 38 | -------------------------------------------------------------------------------- /Exercises/Exercise05/Cpp/vadd.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Exercises/Exercise05/Python/deviceinfo.py: -------------------------------------------------------------------------------- 1 | # 2 | # Device Info 3 | # 4 | # Function to output key parameters about the input OpenCL device 5 | # 6 | # History: C version written by Tim Mattson, June 2010 7 | # Ported to Python by Tom Deakin, July 2013 8 | # 9 | 10 | import pyopencl as cl 11 | import sys 12 | 13 | def output_device_info(device_id): 14 | sys.stdout.write("Device is ") 15 | sys.stdout.write(device_id.name) 16 | if device_id.type == cl.device_type.GPU: 17 | sys.stdout.write("GPU from ") 18 | elif device_id.type == cl.device_type.CPU: 19 | sys.stdout.write("CPU from ") 20 | else: 21 | sys.stdout.write("non CPU of GPU processor from ") 22 | sys.stdout.write(device_id.vendor) 23 | sys.stdout.write(" with a max of ") 24 | sys.stdout.write(str(device_id.max_compute_units)) 25 | sys.stdout.write(" compute units\n") 26 | sys.stdout.flush() 27 | -------------------------------------------------------------------------------- /Exercises/Exercise05/Python/vadd.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors (c = a + b) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, December 2009 8 | # C version Updated by Tom Deakin and Simon McIntosh-Smith, October 2012 9 | # Ported to Python by Tom Deakin, July 2013 10 | # 11 | 12 | # Import the Python OpenCL API 13 | import pyopencl as cl 14 | # Import the Python Maths Library (for vectors) 15 | import numpy 16 | 17 | # Import a library to print out the device information 18 | import deviceinfo 19 | 20 | # Import Standard Library to time the execution 21 | from time import time 22 | #------------------------------------------------------------------------------ 23 | 24 | # tolerance used in floating point comparisons 25 | TOL = 0.001 26 | # length of vectors a, b and c 27 | LENGTH = 1024 28 | 29 | #------------------------------------------------------------------------------ 30 | # 31 | # Kernel: vadd 32 | # 33 | # To compute the elementwise sum c = a + b 34 | # 35 | # Input: a and b float vectors 
of length count 36 | # Output c float vector of length count holding the sum a + b 37 | 38 | kernelsource = """ 39 | __kernel void vadd( 40 | __global float* a, 41 | __global float* b, 42 | __global float* c, 43 | const unsigned int count) 44 | { 45 | int i = get_global_id(0); 46 | if (i < count) 47 | c[i] = a[i] + b[i]; 48 | } 49 | """ 50 | 51 | #------------------------------------------------------------------------------ 52 | 53 | # Main procedure 54 | 55 | # Create a compute context 56 | # Ask the user to select a platform/device on the CLI 57 | context = cl.create_some_context() 58 | 59 | # Print out device info 60 | deviceinfo.output_device_info(context.devices[0]) 61 | 62 | # Create a command queue 63 | queue = cl.CommandQueue(context) 64 | 65 | # Create the compute program from the source buffer 66 | # and build it 67 | program = cl.Program(context, kernelsource).build() 68 | 69 | # Create a and b vectors and fill with random float values 70 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 72 | # Create an empty c vector (a+b) to be returned from the compute device 73 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 74 | 75 | # Create the input (a, b) arrays in device memory and copy data from host 76 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 77 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 78 | # Create the output (c) array in device memory 79 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 80 | 81 | # Start the timer 82 | rtime = time() 83 | 84 | # Execute the kernel over the entire range of our 1d input 85 | # allowing OpenCL runtime to select the work group items for the device 86 | vadd = program.vadd 87 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 88 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 89 | 90 | # Wait for the commands to finish before reading back 91 | queue.finish() 92 | rtime = time() - rtime 93 | print "The kernel ran in", rtime, "seconds" 94 | 95 | # Read back the results from the compute device 96 | cl.enqueue_copy(queue, h_c, d_c) 97 | 98 | # Test the results 99 | correct = 0; 100 | for a, b, c in zip(h_a, h_b, h_c): 101 | # assign element i of a+b to tmp 102 | tmp = a + b 103 | # compute the deviation of expected and output result 104 | tmp -= c 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c 110 | 111 | # Summarize results 112 | print "C = A+B:", correct, "out of", LENGTH, "results were correct." 113 | -------------------------------------------------------------------------------- /Exercises/Exercise05/README.md: -------------------------------------------------------------------------------- 1 | Exercise 5 - The D = A + B + C problem 2 | ====================================== 3 | 4 | Goal 5 | ---- 6 | * To verify that you understand how to control the argument definitions for a *kernel*. 7 | * To verify that you understand the host/kernel interface. 8 | 9 | Procedure 10 | --------- 11 | * Start with your VADD program. 12 | * Modify the kernel so it adds three vectors together. 13 | * Modify the host code to define three vectors and associate them with relevant kernel arguments. 14 | * Read back the final result and verify that it is correct. 
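For reference, a minimal sketch of how the modified kernel could look is shown below (argument names are illustrative; a complete version, with the matching host-side changes, is provided as `vadd_abc` under `Solutions/Exercise05`):

```c
__kernel void vadd(
    __global float* a,
    __global float* b,
    __global float* c,
    __global float* d,          // output vector: d = a + b + c
    const unsigned int count)
{
    int i = get_global_id(0);
    if (i < count)
        d[i] = a[i] + b[i] + c[i];
}
```

The host code then needs to create the extra buffer and set four buffer arguments plus the count when invoking the kernel.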
15 | 16 | Expected output 17 | --------------- 18 | * Test your result and verify that it is correct. 19 | Print a message to that effect on the screen. 20 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # Modified by Tom Deakin, October 2014 8 | # 9 | 10 | ifndef CC 11 | CC = gcc 12 | endif 13 | 14 | CCFLAGS=-O3 -std=c99 -ffast-math 15 | 16 | LIBS = -lm -lOpenCL -fopenmp 17 | 18 | COMMON_DIR = ../../C_common 19 | 20 | MMUL_OBJS = wtime.o 21 | EXEC = mult 22 | 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | LIBS = -lm -framework OpenCL 29 | endif 30 | 31 | 32 | all: $(EXEC) 33 | 34 | mult: $(MMUL_OBJS) matmul.c matrix_lib.c 35 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC) 36 | 37 | wtime.o: $(COMMON_DIR)/wtime.c 38 | $(CC) -c $^ $(CCFLAGS) -o $@ 39 | 40 | .c.o: 41 | $(CC) -c $< $(CCFLAGS) -o $@ 42 | 43 | 44 | clean: 45 | rm -f $(MMUL_OBJS) $(EXEC) 46 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/matmul.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported to C by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern double wtime(); // returns time since some fixed past point (wtime.c) 32 | 33 | //------------------------------------------------------------------------------ 34 | // Constants 35 | //------------------------------------------------------------------------------ 36 | #define ORDER 1024 // Order of the square matrices A, B, and C 37 | #define AVAL 3.0 // A elements are constant and equal to AVAL 38 | #define BVAL 5.0 // B elements are constant and equal to BVAL 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define DIM 2 // Max dim for NDRange 41 | #define COUNT 1 // number of times to do each multiplication 42 | #define SUCCESS 1 43 | #define FAILURE 0 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/matrix_lib.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the 
multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err = C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float 
mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Exercises/Exercise06/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # 8 | 9 | ifndef CPPC 10 | CPPC=g++ 11 | endif 12 | 13 | CCFLAGS=-O3 -ffast-math 14 | 15 | LIBS = -lm -lOpenCL -fopenmp 16 | 17 | COMMON_DIR = ../../Cpp_common 18 | 19 | INC = -I $(COMMON_DIR) 20 | 21 | MMUL_OBJS = matmul.o 
matrix_lib.o wtime.o 22 | EXEC = mult 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | CPPC = clang++ 29 | CCFLAGS += -stdlib=libc++ 30 | LIBS = -lm -framework OpenCL 31 | endif 32 | 33 | all: $(EXEC) 34 | 35 | mult: $(MMUL_OBJS) 36 | $(CPPC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC) 37 | 38 | wtime.o: $(COMMON_DIR)/wtime.c 39 | $(CPPC) -c $^ $(CCFLAGS) -o $@ 40 | 41 | 42 | .c.o: 43 | $(CPPC) -c $< $(CCFLAGS) -o $@ 44 | 45 | .cpp.o: 46 | $(CPPC) -c $< $(CCFLAGS) $(INC) -o $@ 47 | 48 | matmul.o: matmul.hpp matrix_lib.hpp 49 | 50 | matrix_lib.o: matmul.hpp 51 | 52 | clean: 53 | rm -f $(MMUL_OBJS) $(EXEC) 54 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/matmul.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | 26 | #include "matrix_lib.hpp" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern double wtime(); // returns time since some fixed past point (wtime.c) 32 | 33 | //------------------------------------------------------------------------------ 34 | // Constants 35 | //------------------------------------------------------------------------------ 36 | #define ORDER 1024 // Order of the square matrices A, B, and C 37 | #define AVAL 3.0 // A elements are constant and equal to AVAL 38 | #define BVAL 5.0 // B elements are constant and equal to BVAL 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define DIM 2 // Max dim for NDRange 41 | #define COUNT 1 // number of times to do each multiplication 42 | #define SUCCESS 1 43 | #define FAILURE 0 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/matrix_lib.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 
10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 15 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 16 | // 17 | //------------------------------------------------------------------------------ 18 | 19 | #include "matmul.hpp" 20 | 21 | //------------------------------------------------------------------------------ 22 | // 23 | // Function to compute the matrix product (sequential algorithm, dot prod) 24 | // 25 | //------------------------------------------------------------------------------ 26 | 27 | void seq_mat_mul_sdot(int N, std::vector& A, std::vector& B, std::vector& C) 28 | { 29 | int i, j, k; 30 | float tmp; 31 | 32 | for (i = 0; i < N; i++) { 33 | for (j = 0; j < N; j++) { 34 | tmp = 0.0f; 35 | for (k = 0; k < N; k++) { 36 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 37 | tmp += A[i*N+k] * B[k*N+j]; 38 | } 39 | C[i*N+j] = tmp; 40 | } 41 | } 42 | } 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to initialize the input matrices A and B 47 | // 48 | //------------------------------------------------------------------------------ 49 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C) 50 | { 51 | int i, j; 52 | 53 | /* Initialize matrices */ 54 | 55 | for (i = 0; i < N; i++) 56 | for (j = 0; j < N; j++) 57 | A[i*N+j] = AVAL; 58 | 59 | for (i = 0; i < N; i++) 60 | for (j = 0; j < N; j++) 61 | B[i*N+j] = BVAL; 62 | 63 | for (i = 0; i < N; i++) 64 | for (j = 0; j < N; j++) 65 | C[i*N+j] = 0.0f; 66 | } 67 | 68 | //------------------------------------------------------------------------------ 69 | // 70 | // Function to set a matrix to zero 71 | // 72 | //------------------------------------------------------------------------------ 73 | void zero_mat (int N, std::vector& C) 74 | { 75 | int i, j; 76 | 77 | for (i = 0; i < N; i++) 78 | for (j = 0; j < N; j++) 79 | C[i*N+j] = 0.0f; 80 | } 81 | 82 | //------------------------------------------------------------------------------ 83 | // 84 | // Function to fill Btrans(N,N) with transpose of B(N,N) 85 | // 86 | //------------------------------------------------------------------------------ 87 | void trans(int N, std::vector& B, std::vector& Btrans) 88 | { 89 | int i, j; 90 | 91 | for (i = 0; i < N; i++) 92 | for (j = 0; j < N; j++) 93 | Btrans[j*N+i] = B[i*N+j]; 94 | } 95 | 96 | //------------------------------------------------------------------------------ 97 | // 98 | // Function to compute errors of the product matrix 99 | // 100 | //------------------------------------------------------------------------------ 101 | float error(int N, std::vector& C) 102 | { 103 | int i,j; 104 | float cval, errsq, err; 105 | cval = (float) N * AVAL * BVAL; 106 | errsq = 0.0f; 107 | 108 | for (i = 0; i < N; i++) { 109 | for (j = 0; j < N; j++) { 110 | err = C[i*N+j] - cval; 111 | errsq += err * err; 112 | } 113 | } 114 | return errsq; 115 | } 116 | 117 | //------------------------------------------------------------------------------ 118 | // 119 | // Function to analyze and output results 120 | // 121 | //------------------------------------------------------------------------------ 122 | void results(int N, std::vector& C, double run_time) 123 | { 124 | 125 | float mflops; 126 | float errsq; 127 | 128 | mflops = 2.0 * N * N * N/(1000000.0f 
* run_time); 129 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 130 | errsq = error(N, C); 131 | if (std::isnan(errsq) || errsq > TOL) 132 | printf("\n Errors in multiplication: %f\n",errsq); 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, std::vector &C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, std::vector& B, std::vector& Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, std::vector& C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, std::vector& C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each multiplication 18 | COUNT = 1 19 | 
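# Note: with these values every element of the product C = A*B should equal
# ORDER * AVAL * BVAL = 1024 * 3.0 * 5.0 = 15360.0, which is the reference
# value used by the error check in helper.py. A single multiplication performs
# 2 * ORDER**3 (roughly 2.1e9) floating point operations, which is the basis
# of the MFLOPS figure printed by results().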
-------------------------------------------------------------------------------- /Exercises/Exercise06/Python/helper.py: -------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | import numpy 4 | 5 | # Function to compute the matrix product (sequential algorithm, dot prod) 6 | def seq_mat_mul_sdot(N, A, B, C): 7 | for i in range(N): 8 | for j in range(N): 9 | tmp = 0.0 10 | for k in range(N): 11 | tmp += A[i*N+k] * B[k*N+j] 12 | C[i*N+j] = tmp 13 | 14 | # Function to compute errors of the product matrix 15 | def error(N, C): 16 | cval = float(N) * AVAL * BVAL 17 | errsq = 0.0 18 | for i in range(N): 19 | for j in range(N): 20 | err = C[i*N+j] - cval 21 | errsq += err * err 22 | return errsq; 23 | 24 | 25 | # Function to analyze and output results 26 | def results(N, C, run_time): 27 | mflops = 2.0 * N * N * N/(1000000.0* run_time) 28 | print run_time, "seconds at", mflops, "MFLOPS" 29 | errsq = error(N, C) 30 | if numpy.isnan(errsq) or errsq > TOL: 31 | print "Errors in multiplication:", errsq 32 | -------------------------------------------------------------------------------- /Exercises/Exercise06/Python/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiplication Driver 3 | # 4 | # This is a driver program to test various ways of computing 5 | # the product: 6 | # C = A * B 7 | # 8 | # A and B are constant matrices, square and the order is 9 | # set as a constant, ORDER (see definitions.py). This is so 10 | # we can make a quick test of the multiplication result. 11 | # 12 | # History: C++ version written by Tim Mattson, August 2010 13 | # Modified by Simon McIntosh-Smith, September 2011 14 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 15 | # Ported to Python by Tom Deakin, July 2013 16 | # Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 17 | # 18 | 19 | from helper import * 20 | from definitions import * 21 | 22 | import pyopencl as cl 23 | import numpy 24 | from time import time 25 | 26 | C_elem_KernelSource = ''' 27 | __kernel void mmul( 28 | const int N, 29 | __global float* A, 30 | __global float* B, 31 | __global float* C) 32 | { 33 | } 34 | ''' 35 | 36 | # A[N][N], B[N][N], C[N][N] 37 | N = ORDER; 38 | 39 | # Number of elements in the matrix 40 | size = N * N 41 | 42 | 43 | # A matrix 44 | h_A = numpy.empty(size).astype(numpy.float32) 45 | h_A.fill(AVAL) 46 | 47 | # B matrix 48 | h_B = numpy.empty(size).astype(numpy.float32) 49 | h_B.fill(BVAL) 50 | 51 | # C matrix 52 | h_C = numpy.empty(size).astype(numpy.float32) 53 | 54 | print "\n===== Sequential, matrix mult (dot prod), order", ORDER, "on host CPU ======\n" 55 | 56 | for i in range(COUNT): 57 | h_C.fill(0.0) 58 | start_time = time() 59 | 60 | print "Skipping as this takes a long time to run!" 
61 | #seq_mat_mul_sdot(N, h_A, h_B, h_C) 62 | 63 | run_time = time() - start_time 64 | #results(N, h_C, run_time) 65 | 66 | 67 | # Set up OpenCL 68 | context = cl.create_some_context() 69 | queue = cl.CommandQueue(context) 70 | 71 | # Reset host buffers - just to play it safe 72 | h_A = numpy.empty(size).astype(numpy.float32) 73 | h_A.fill(AVAL) 74 | h_B = numpy.empty(size).astype(numpy.float32) 75 | h_B.fill(BVAL) 76 | h_C = numpy.empty(size).astype(numpy.float32) 77 | 78 | # Create OpenCL buffers 79 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_A) 80 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_B) 81 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes) 82 | 83 | program = cl.Program(context, C_elem_KernelSource).build() 84 | mmul = program.mmul 85 | mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None]) 86 | 87 | print "\n===== OpenCL, matrix mult, C(i,j) per work item, order", N, "======\n" 88 | 89 | # Do the multiplication COUNT times 90 | for i in range(COUNT): 91 | h_C.fill(0.0) 92 | start_time = time() 93 | 94 | globalrange = (N, N) 95 | localrange = None 96 | 97 | mmul(queue, globalrange, localrange, N, d_a, d_b, d_c) 98 | queue.finish() 99 | 100 | run_time = time() - start_time 101 | 102 | cl.enqueue_copy(queue, h_C, d_c) 103 | results(N, h_C, run_time) 104 | -------------------------------------------------------------------------------- /Exercises/Exercise06/README.md: -------------------------------------------------------------------------------- 1 | Exercise 6 - Matrix Multiplication 2 | ================================== 3 | 4 | Goal 5 | ---- 6 | * To write your first complete OpenCL kernel **from scratch** 7 | * To multiply a pair of matrices 8 | 9 | Procedure 10 | --------- 11 | * Start with the serial matrix multiplication program, including the functions to generate the matrices and test the results (C/C++ only). 12 | * Create a kernel to do the multiplication. 13 | * Modify the provided OpenCL host program to use your kernel. 14 | * Verify the results. 15 | 16 | Expected output 17 | --------------- 18 | * A message to standard output verifying that the matrix multiplication produced the correct results. 19 | * Report the runtime and the MFLOPS. 20 | -------------------------------------------------------------------------------- /Exercises/Exercise07/README.md: -------------------------------------------------------------------------------- 1 | Exercise 7 - using private memory 2 | ================================= 3 | 4 | Goal 5 | ---- 6 | * Use private memory to minimize memory movement costs and optimize performance of your matrix multiplication program. 7 | 8 | Procedure 9 | --------- 10 | * Start with your matrix multiplication program. 11 | * Modify the kernel so that each work-item copies its own row of A into private memory (see the Hint below). 12 | * Optimize step by step, saving the intermediate versions and tracking performance improvements. 13 | 14 | Expected output 15 | --------------- 16 | * A message to standard output verifying that the matrix multiplication program is generating the correct results. 17 | * Report the runtime and the MFLOPS.
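
Hint
----
One possible shape for the kernel (a sketch only, not the provided solution): keep the `mmul` signature used by the Exercise 6 host programs, give each work-item one row of C, and stage that work-item's row of A in a private array. The fixed array size assumes N is no larger than ORDER (1024).

    __kernel void mmul(
        const int N,
        __global float* A,
        __global float* B,
        __global float* C)
    {
        int i = get_global_id(0);   // this work-item computes row i of C
        float Awrk[1024];           // private copy of row i of A (assumes N <= 1024)

        for (int k = 0; k < N; k++)
            Awrk[k] = A[i*N+k];

        for (int j = 0; j < N; j++) {
            float tmp = 0.0f;
            for (int k = 0; k < N; k++)
                tmp += Awrk[k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }

With a kernel like this the host should enqueue a 1D NDRange of N work-items (one per row of C) instead of the N x N range used for the element-wise version.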
18 | -------------------------------------------------------------------------------- /Exercises/Exercise08/README.md: -------------------------------------------------------------------------------- 1 | Exercise 8 - using local memory 2 | =============================== 3 | 4 | Goal 5 | ---- 6 | * Use local memory to minimize memory movement costs and optimize performance of your matrix multiplication program. 7 | 8 | Procedure 9 | --------- 10 | * Start with your matrix multiplication program that already uses private memory from Exercise 7. 11 | * Modify the kernel so that each work-group collaboratively copies its own column of B into local memory. 12 | * Optimize step by step, saving the intermediate versions and tracking performance improvements. 13 | 14 | Expected output 15 | --------------- 16 | * A message to standard output verifying that the matrix multiplication program is generating the correct results. 17 | * Report the runtime and the MFLOPS. 18 | 19 | Extra 20 | ----- 21 | * Look at the fast, blocked implementation from the NVIDIA OpenCL SDK example. 22 | Try running it and compare to yours. 23 | 24 | -------------------------------------------------------------------------------- /Exercises/Exercise09/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-lm -O3 7 | 8 | LIBS = -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Check our platform and make sure we define the APPLE variable 13 | # and set up the right compiler flags and libraries 14 | PLATFORM = $(shell uname -s) 15 | ifeq ($(PLATFORM), Darwin) 16 | LIBS = 17 | endif 18 | 19 | pi: pi.c $(COMMON_DIR)/wtime.c 20 | $(CC) $^ $(CCFLAGS) -o $@ 21 | 22 | clean: 23 | rm -f pi 24 | -------------------------------------------------------------------------------- /Exercises/Exercise09/C/pi.c: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This program will numerically compute the integral of 4 | 5 | 4/(1+x*x) 6 | 7 | from 0 to 1. The value of this integral is pi -- which 8 | is great since it gives us an easy way to check the answer. 9 | 10 | The is the original sequential program. It uses the timer 11 | from the OpenMP runtime library 12 | 13 | History: Written by Tim Mattson, 11/99. 
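   The sum in main() applies the midpoint rule: iteration i adds
   4.0/(1.0+x*x) evaluated at the midpoint x = (i-0.5)*step, and the
   accumulated sum is scaled by step after the loop, giving an
   approximation to the integral (and hence to pi).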
14 | 15 | */ 16 | 17 | #include 18 | static long num_steps = 100000000; 19 | double step; 20 | extern double wtime(); // returns time since some fixed past point (wtime.c) 21 | 22 | 23 | int main () 24 | { 25 | int i; 26 | double x, pi, sum = 0.0; 27 | double start_time, run_time; 28 | 29 | step = 1.0/(double) num_steps; 30 | 31 | start_time =wtime(); 32 | 33 | for (i=1;i<= num_steps; i++){ 34 | x = (i-0.5)*step; 35 | sum = sum + 4.0/(1.0+x*x); 36 | } 37 | 38 | pi = step * sum; 39 | run_time = wtime() - start_time; 40 | printf("\n pi with %ld steps is %lf in %lf seconds\n", num_steps, pi, run_time); 41 | } 42 | 43 | -------------------------------------------------------------------------------- /Exercises/Exercise09/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC = g++ 4 | endif 5 | 6 | CCFLAGS = -O3 -lrt 7 | 8 | LIBS = -lm 9 | 10 | CPP_COMMON = ../../Cpp_common 11 | 12 | INC = -I $(CPP_COMMON) 13 | 14 | # Check our platform and make sure we define the APPLE variable 15 | # and set up the right compiler flags and libraries 16 | PLATFORM = $(shell uname -s) 17 | ifeq ($(PLATFORM), Darwin) 18 | CPPC = clang++ 19 | CCFLAGS = -O3 -std=c++11 -stdlib=libc++ 20 | endif 21 | 22 | pi: pi.cpp 23 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 24 | 25 | clean: 26 | rm -f pi 27 | -------------------------------------------------------------------------------- /Exercises/Exercise09/Cpp/pi.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This program will numerically compute the integral of 4 | 5 | 4/(1+x*x) 6 | 7 | from 0 to 1. The value of this integral is pi -- which 8 | is great since it gives us an easy way to check the answer. 9 | 10 | The is the original sequential program. It uses the timer 11 | from the OpenMP runtime library 12 | 13 | History: Written by Tim Mattson, 11/99. 14 | Ported to C++ by Tom Deakin, August 2013 15 | 16 | */ 17 | 18 | #include "util.hpp" 19 | 20 | #include 21 | static long num_steps = 100000000; 22 | double step; 23 | extern double wtime(); // returns time since some fixed past point (wtime.c) 24 | 25 | 26 | int main () 27 | { 28 | int i; 29 | double x, pi, sum = 0.0; 30 | 31 | 32 | step = 1.0/(double) num_steps; 33 | 34 | util::Timer timer; 35 | 36 | for (i=1;i<= num_steps; i++){ 37 | x = (i-0.5)*step; 38 | sum = sum + 4.0/(1.0+x*x); 39 | } 40 | 41 | pi = step * sum; 42 | double run_time = static_cast(timer.getTimeMilliseconds()) / 1000.0; 43 | printf("\n pi with %ld steps is %lf in %lf seconds\n", num_steps, pi, run_time); 44 | } 45 | 46 | -------------------------------------------------------------------------------- /Exercises/Exercise09/Python/pi.py: -------------------------------------------------------------------------------- 1 | # 2 | # This program will numerically compute the integral of 3 | # 4 | # 4/(1+x*x) 5 | # 6 | # from 0 to 1. The value of this integral is pi -- which 7 | # is great since it gives us an easy way to check the answer. 8 | # 9 | # This the original sequential program. 10 | # 11 | # History: Written in C by Tim Mattson, 11/99 12 | # Ported to Python by Tom Deakin, July 2013 13 | # 14 | 15 | from time import time 16 | 17 | num_steps = 100000000 18 | 19 | print "\nNote: Wanted to do", num_steps, "steps, but this is very slow in Python." 20 | 21 | num_steps = 1000000 22 | 23 | print "Doing", num_steps, "steps instead." 
24 | 25 | integral_sum = 0.0 26 | 27 | step = 1.0/num_steps 28 | 29 | start_time = time() 30 | 31 | for i in range(1,num_steps): 32 | x = (i-0.5)*step 33 | integral_sum += 4.0/(1.0+x*x) 34 | 35 | pi = step * integral_sum 36 | 37 | run_time = time() - start_time; 38 | 39 | print "\npi with", num_steps, "steps is", pi, "in", run_time, "seconds\n" 40 | 41 | -------------------------------------------------------------------------------- /Exercises/Exercise09/README.md: -------------------------------------------------------------------------------- 1 | Exercise 9 - The Pi program 2 | =========================== 3 | 4 | Goal 5 | ---- 6 | * To understand synchronization between work-items in the OpenCL C kernel programming language. 7 | 8 | Procedure 9 | --------- 10 | * Start with the provided serial program to estimate Pi through numerical integration. 11 | * Write a kernel and host program to compute the numerical integral using OpenCL. 12 | * Note: you will need to implement a reduction. 13 | 14 | Expected output 15 | --------------- 16 | * Output result plus an estimate of the error in the result. 17 | * Report the runtime. 18 | 19 | Hint 20 | ---- 21 | You will want each work-item to do many iterations of the loop, i.e. don't create one work-item per loop iteration. 22 | To do so would make the reduction so costly that performance would be terrible. 23 | -------------------------------------------------------------------------------- /Exercises/Exercise10/README.md: -------------------------------------------------------------------------------- 1 | Exercise 10 - Heterogeneous Computing 2 | ==================================== 3 | 4 | Goal 5 | ---- 6 | * To experiment with running kernels on multiple devices. 7 | 8 | Procedure 9 | --------- 10 | * Take one of your OpenCL programs. 11 | * Investigate the Context constructors and include more than once device. 12 | * Modify the program to run a kernel on multiple devices, each with different input data. 13 | * Split your problem across multiple devices if you have time. 14 | * Use the examples in the SDK to help you. 15 | 16 | Expected output 17 | --------------- 18 | * Output the results from both devices and see which runs faster. 19 | -------------------------------------------------------------------------------- /Exercises/Exercise11/README.md: -------------------------------------------------------------------------------- 1 | Exercise 11 - Optimize matrix multiplication 2 | ============================================ 3 | 4 | Goal 5 | ---- 6 | * To understand portable performance in OpenCL. 7 | 8 | Procedure 9 | --------- 10 | * Optimize step by step ... saving intermediate versions and tracking performance improvements. 11 | * After you've tried to optimize the program on your own, study the blocked solution optimized for NVIDIA GPU. 12 | Apply these techniques to your own code to further optimize performance. 13 | * As a final step, go back and make a single program that is adaptive so it delivers good results on both a CPU and a GPU. 14 | 15 | Expected output 16 | --------------- 17 | * A message confirming that the matrix multiplication is correct. 18 | * Report the runtime and the MFLOPS. -------------------------------------------------------------------------------- /Exercises/Exercise12/README.md: -------------------------------------------------------------------------------- 1 | Exercise 12 - Profiling OpenCL programs 2 | ======================================= 3 | 4 | Goal 5 | ---- 6 | * To experiment with profiling tools. 
7 | 8 | Procedure 9 | --------- 10 | * Take one of your OpenCL programs, such as matrix multiply. 11 | * Run the program in the profiler and explore the results. 12 | * Modify the program to improve the performance. 13 | * Repeat with other programs if you have time. 14 | 15 | Expected output 16 | --------------- 17 | * Timing from the timer and profiling interfaces should roughly match. -------------------------------------------------------------------------------- /Exercises/Exercise13/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS = -O3 -std=c99 7 | 8 | LIBS = -fopenmp 9 | 10 | # Check our platform and make sure we define the APPLE variable 11 | # and set up the right compiler flags and libraries 12 | PLATFORM = $(shell uname -s) 13 | ifeq ($(PLATFORM), Darwin) 14 | LIBS = 15 | endif 16 | 17 | gameoflife: gameoflife.c 18 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 19 | 20 | clean: 21 | rm -f gameoflife 22 | -------------------------------------------------------------------------------- /Exercises/Exercise13/CUDA-VADD/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC = nvcc 3 | 4 | CCFLAGS = -O3 5 | 6 | LIBS = 7 | 8 | vadd: vadd.cu 9 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 10 | 11 | clean: 12 | rm -f vadd 13 | -------------------------------------------------------------------------------- /Exercises/Exercise13/CUDA-VADD/vadd.cu: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd.cu 4 | // 5 | // Purpose: CUDA implementation of VADD 6 | // 7 | // HISTORY: Written by Tom Deakin and Simon McIntosh-Smith, August 2013 8 | // 9 | //------------------------------------------------------------------------------ 10 | 11 | #include 12 | #include 13 | 14 | #define TOL (0.001) // tolerance used in floating point comparisons 15 | #define LENGTH (1024) // length of vectors a, b, and c 16 | 17 | /************************************************************************************* 18 | * CUDA kernel 19 | ************************************************************************************/ 20 | 21 | __global__ void vadd(const float* a, 22 | const float* b, 23 | float* c, 24 | const unsigned int count) 25 | { 26 | int i = blockDim.x * blockIdx.x + threadIdx.x; 27 | if (i < count) { 28 | c[i] = a[i] + b[i]; 29 | } 30 | } 31 | 32 | /************************************************************************************* 33 | * Main function 34 | ************************************************************************************/ 35 | 36 | int main(void) 37 | { 38 | float h_a[LENGTH]; // a vector 39 | float h_b[LENGTH]; // b vector 40 | float h_c[LENGTH]; // c vector (a+b) returned from the compute device 41 | float *d_a, *d_b, *d_c; // CUDA memory 42 | unsigned int correct; // number of correct results 43 | 44 | // Fill vectors a and b with random float values 45 | int i = 0; 46 | int count = LENGTH; 47 | for(i = 0; i < count; i++){ 48 | h_a[i] = rand() / (float)RAND_MAX; 49 | h_b[i] = rand() / (float)RAND_MAX; 50 | } 51 | 52 | // Allocate CUDA memory 53 | cudaMalloc(&d_a, sizeof(float) * LENGTH); 54 | cudaMalloc(&d_b, sizeof(float) * LENGTH); 55 | cudaMalloc(&d_c, sizeof(float) * LENGTH); 56 | 57 | // Write buffers a and b to GPU memory 58 | cudaMemcpy(d_a, h_a, sizeof(float) * LENGTH, cudaMemcpyHostToDevice); 59 | 
cudaMemcpy(d_b, h_b, sizeof(float) * LENGTH, cudaMemcpyHostToDevice); 60 | 61 | dim3 numBlocks(LENGTH); 62 | dim3 numThreads(1); 63 | vadd<<>>(d_a, d_b, d_c, LENGTH); 64 | 65 | // Copy result array back to host memory 66 | cudaMemcpy(h_c, d_c, sizeof(float) * LENGTH, cudaMemcpyDeviceToHost); 67 | 68 | // Test the results 69 | correct = 0; 70 | float tmp; 71 | 72 | for(i = 0; i < count; i++) 73 | { 74 | tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp 75 | tmp -= h_c[i]; // compute deviation of expected and output result 76 | if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared 77 | correct++; 78 | else { 79 | printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], h_c[i]); 80 | } 81 | } 82 | 83 | // summarize results 84 | printf("C = A+B: %d out of %d results were correct.\n", correct, count); 85 | 86 | return EXIT_SUCCESS; 87 | } 88 | -------------------------------------------------------------------------------- /Exercises/Exercise13/CUDA/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC = nvcc 3 | 4 | CCFLAGS = -O3 5 | 6 | LIBS = 7 | 8 | gameoflife: gameoflife.cu 9 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 10 | 11 | clean: 12 | rm -f gameoflife 13 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Displayer/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CC=gcc 3 | 4 | CCFLAGS=-std=c99 -O3 5 | 6 | LIBS = -lGL -lglut 7 | 8 | # Check our platform and make sure we define the APPLE variable 9 | # and set up the right compiler flags and libraries 10 | PLATFORM = $(shell uname -s) 11 | ifeq ($(PLATFORM), Darwin) 12 | LIBS = -framework OpenGL -framework GLUT 13 | endif 14 | 15 | displayer: displayer.c 16 | $(CC) $^ $(CCFLAGS) $(LIBS) -o $@ 17 | 18 | .PHONY: clean 19 | clean: 20 | rm -f displayer 21 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Acorn/acorn.dat: -------------------------------------------------------------------------------- 1 | 48 49 1 2 | 50 50 1 3 | 47 51 1 4 | 48 51 1 5 | 51 51 1 6 | 52 51 1 7 | 53 51 1 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Acorn/final_state.dat: -------------------------------------------------------------------------------- 1 | 33 33 1 2 | 32 34 1 3 | 34 34 1 4 | 15 35 1 5 | 16 35 1 6 | 32 35 1 7 | 34 35 1 8 | 15 36 1 9 | 18 36 1 10 | 33 36 1 11 | 16 37 1 12 | 17 37 1 13 | 18 37 1 14 | 56 37 1 15 | 57 37 1 16 | 15 38 1 17 | 56 38 1 18 | 57 38 1 19 | 10 39 1 20 | 11 39 1 21 | 12 39 1 22 | 13 39 1 23 | 14 39 1 24 | 15 39 1 25 | 17 39 1 26 | 18 39 1 27 | 8 41 1 28 | 14 41 1 29 | 15 41 1 30 | 6 42 1 31 | 7 42 1 32 | 13 42 1 33 | 19 42 1 34 | 6 43 1 35 | 7 43 1 36 | 11 43 1 37 | 12 43 1 38 | 15 43 1 39 | 6 44 1 40 | 7 44 1 41 | 16 44 1 42 | 20 44 1 43 | 21 44 1 44 | 22 44 1 45 | 6 45 1 46 | 7 45 1 47 | 8 45 1 48 | 19 45 1 49 | 22 45 1 50 | 24 45 1 51 | 20 46 1 52 | 24 46 1 53 | 6 47 1 54 | 8 47 1 55 | 21 47 1 56 | 22 47 1 57 | 23 47 1 58 | 33 47 1 59 | 6 48 1 60 | 21 48 1 61 | 22 48 1 62 | 23 48 1 63 | 30 48 1 64 | 34 48 1 65 | 10 49 1 66 | 30 49 1 67 | 34 49 1 68 | 6 50 1 69 | 10 50 1 70 | 14 50 1 71 | 15 50 1 72 | 16 50 1 73 | 30 50 1 74 | 34 50 1 75 | 10 51 1 76 | 6 52 1 77 | 31 52 1 78 | 33 52 1 79 | 6 53 1 80 | 8 53 1 81 | 28 53 1 82 | 30 53 1 83 | 31 53 1 84 | 32 53 1 85 | 24 54 1 86 | 26 54 1 87 | 27 54 1 88 | 28 54 1 89 
| 42 54 1 90 | 43 54 1 91 | 44 54 1 92 | 6 55 1 93 | 7 55 1 94 | 8 55 1 95 | 26 55 1 96 | 27 55 1 97 | 28 55 1 98 | 60 55 1 99 | 61 55 1 100 | 6 56 1 101 | 7 56 1 102 | 26 56 1 103 | 31 56 1 104 | 33 56 1 105 | 60 56 1 106 | 61 56 1 107 | 6 57 1 108 | 7 57 1 109 | 11 57 1 110 | 12 57 1 111 | 16 57 1 112 | 17 57 1 113 | 18 57 1 114 | 19 57 1 115 | 20 57 1 116 | 25 57 1 117 | 31 57 1 118 | 34 57 1 119 | 6 58 1 120 | 7 58 1 121 | 13 58 1 122 | 16 58 1 123 | 17 58 1 124 | 19 58 1 125 | 20 58 1 126 | 31 58 1 127 | 33 58 1 128 | 8 59 1 129 | 13 59 1 130 | 17 59 1 131 | 23 59 1 132 | 18 60 1 133 | 21 60 1 134 | 22 60 1 135 | 10 61 1 136 | 11 61 1 137 | 12 61 1 138 | 17 61 1 139 | 17 62 1 140 | 18 62 1 141 | 19 62 1 142 | 16 63 1 143 | 19 63 1 144 | 15 64 1 145 | 18 64 1 146 | 15 65 1 147 | 16 65 1 148 | 17 65 1 149 | 16 66 1 150 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Acorn/input.params: -------------------------------------------------------------------------------- 1 | 100 2 | 100 3 | 150 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Max/input.params: -------------------------------------------------------------------------------- 1 | 100 2 | 100 3 | 50 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Max/max.dat: -------------------------------------------------------------------------------- 1 | 56 38 1 2 | 55 39 1 3 | 56 39 1 4 | 57 39 1 5 | 50 40 1 6 | 51 40 1 7 | 52 40 1 8 | 57 40 1 9 | 58 40 1 10 | 49 41 1 11 | 52 41 1 12 | 53 41 1 13 | 54 41 1 14 | 57 41 1 15 | 59 41 1 16 | 60 41 1 17 | 48 42 1 18 | 52 42 1 19 | 54 42 1 20 | 57 42 1 21 | 59 42 1 22 | 48 43 1 23 | 53 43 1 24 | 55 43 1 25 | 57 43 1 26 | 59 43 1 27 | 61 43 1 28 | 62 43 1 29 | 50 44 1 30 | 55 44 1 31 | 57 44 1 32 | 61 44 1 33 | 62 44 1 34 | 38 45 1 35 | 39 45 1 36 | 40 45 1 37 | 41 45 1 38 | 47 45 1 39 | 49 45 1 40 | 54 45 1 41 | 58 45 1 42 | 60 45 1 43 | 61 45 1 44 | 62 45 1 45 | 38 46 1 46 | 42 46 1 47 | 43 46 1 48 | 45 46 1 49 | 47 46 1 50 | 48 46 1 51 | 49 46 1 52 | 51 46 1 53 | 52 46 1 54 | 62 46 1 55 | 63 46 1 56 | 38 47 1 57 | 44 47 1 58 | 45 47 1 59 | 51 47 1 60 | 39 48 1 61 | 42 48 1 62 | 43 48 1 63 | 45 48 1 64 | 48 48 1 65 | 51 48 1 66 | 53 48 1 67 | 54 48 1 68 | 45 49 1 69 | 47 49 1 70 | 49 49 1 71 | 51 49 1 72 | 53 49 1 73 | 55 49 1 74 | 61 49 1 75 | 62 49 1 76 | 63 49 1 77 | 64 49 1 78 | 39 50 1 79 | 42 50 1 80 | 43 50 1 81 | 45 50 1 82 | 48 50 1 83 | 51 50 1 84 | 54 50 1 85 | 55 50 1 86 | 57 50 1 87 | 59 50 1 88 | 60 50 1 89 | 64 50 1 90 | 38 51 1 91 | 44 51 1 92 | 45 51 1 93 | 49 51 1 94 | 51 51 1 95 | 53 51 1 96 | 57 51 1 97 | 58 51 1 98 | 64 51 1 99 | 38 52 1 100 | 42 52 1 101 | 43 52 1 102 | 45 52 1 103 | 47 52 1 104 | 48 52 1 105 | 51 52 1 106 | 54 52 1 107 | 57 52 1 108 | 59 52 1 109 | 60 52 1 110 | 63 52 1 111 | 38 53 1 112 | 39 53 1 113 | 40 53 1 114 | 41 53 1 115 | 47 53 1 116 | 49 53 1 117 | 51 53 1 118 | 53 53 1 119 | 55 53 1 120 | 57 53 1 121 | 48 54 1 122 | 49 54 1 123 | 51 54 1 124 | 54 54 1 125 | 57 54 1 126 | 59 54 1 127 | 60 54 1 128 | 63 54 1 129 | 51 55 1 130 | 57 55 1 131 | 58 55 1 132 | 64 55 1 133 | 39 56 1 134 | 40 56 1 135 | 50 56 1 136 | 51 56 1 137 | 53 56 1 138 | 54 56 1 139 | 55 56 1 140 | 57 56 1 141 | 59 56 1 142 | 60 56 1 143 | 64 56 1 144 | 40 57 1 145 | 41 57 1 146 | 42 57 1 147 | 44 57 1 148 | 48 57 1 149 | 53 57 1 150 | 55 57 1 151 | 61 57 1 152 | 62 57 1 153 | 63 57 1 154 | 
64 57 1 155 | 40 58 1 156 | 41 58 1 157 | 45 58 1 158 | 47 58 1 159 | 52 58 1 160 | 40 59 1 161 | 41 59 1 162 | 43 59 1 163 | 45 59 1 164 | 47 59 1 165 | 49 59 1 166 | 54 59 1 167 | 43 60 1 168 | 45 60 1 169 | 48 60 1 170 | 50 60 1 171 | 54 60 1 172 | 42 61 1 173 | 43 61 1 174 | 45 61 1 175 | 48 61 1 176 | 49 61 1 177 | 50 61 1 178 | 53 61 1 179 | 44 62 1 180 | 45 62 1 181 | 50 62 1 182 | 51 62 1 183 | 52 62 1 184 | 45 63 1 185 | 46 63 1 186 | 47 63 1 187 | 46 64 1 188 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Pulsar/final_state.dat: -------------------------------------------------------------------------------- 1 | 4 2 1 2 | 5 2 1 3 | 6 2 1 4 | 10 2 1 5 | 11 2 1 6 | 12 2 1 7 | 2 4 1 8 | 7 4 1 9 | 9 4 1 10 | 14 4 1 11 | 2 5 1 12 | 7 5 1 13 | 9 5 1 14 | 14 5 1 15 | 2 6 1 16 | 7 6 1 17 | 9 6 1 18 | 14 6 1 19 | 4 7 1 20 | 5 7 1 21 | 6 7 1 22 | 10 7 1 23 | 11 7 1 24 | 12 7 1 25 | 4 9 1 26 | 5 9 1 27 | 6 9 1 28 | 10 9 1 29 | 11 9 1 30 | 12 9 1 31 | 2 10 1 32 | 7 10 1 33 | 9 10 1 34 | 14 10 1 35 | 2 11 1 36 | 7 11 1 37 | 9 11 1 38 | 14 11 1 39 | 2 12 1 40 | 7 12 1 41 | 9 12 1 42 | 14 12 1 43 | 4 14 1 44 | 5 14 1 45 | 6 14 1 46 | 10 14 1 47 | 11 14 1 48 | 12 14 1 49 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Pulsar/input.params: -------------------------------------------------------------------------------- 1 | 18 2 | 18 3 | 300000 4 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/Pulsar/pulsar.dat: -------------------------------------------------------------------------------- 1 | 4 2 1 2 | 5 2 1 3 | 6 2 1 4 | 10 2 1 5 | 11 2 1 6 | 12 2 1 7 | 2 4 1 8 | 7 4 1 9 | 9 4 1 10 | 14 4 1 11 | 2 5 1 12 | 7 5 1 13 | 9 5 1 14 | 14 5 1 15 | 2 6 1 16 | 7 6 1 17 | 9 6 1 18 | 14 6 1 19 | 4 7 1 20 | 5 7 1 21 | 6 7 1 22 | 10 7 1 23 | 11 7 1 24 | 12 7 1 25 | 4 9 1 26 | 5 9 1 27 | 6 9 1 28 | 10 9 1 29 | 11 9 1 30 | 12 9 1 31 | 2 10 1 32 | 7 10 1 33 | 9 10 1 34 | 14 10 1 35 | 2 11 1 36 | 7 11 1 37 | 9 11 1 38 | 14 11 1 39 | 2 12 1 40 | 7 12 1 41 | 9 12 1 42 | 14 12 1 43 | 4 14 1 44 | 5 14 1 45 | 6 14 1 46 | 10 14 1 47 | 11 14 1 48 | 12 14 1 49 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/QueenBee/final_state.dat: -------------------------------------------------------------------------------- 1 | 11 2 1 2 | 9 3 1 3 | 11 3 1 4 | 8 4 1 5 | 10 4 1 6 | 2 5 1 7 | 3 5 1 8 | 7 5 1 9 | 10 5 1 10 | 22 5 1 11 | 23 5 1 12 | 2 6 1 13 | 3 6 1 14 | 8 6 1 15 | 10 6 1 16 | 22 6 1 17 | 23 6 1 18 | 9 7 1 19 | 11 7 1 20 | 11 8 1 21 | -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/QueenBee/input.params: -------------------------------------------------------------------------------- 1 | 35 2 | 11 3 | 30 -------------------------------------------------------------------------------- /Exercises/Exercise13/Examples/QueenBee/queenbee.dat: -------------------------------------------------------------------------------- 1 | 11 2 1 2 | 9 3 1 3 | 11 3 1 4 | 8 4 1 5 | 10 4 1 6 | 2 5 1 7 | 3 5 1 8 | 7 5 1 9 | 10 5 1 10 | 22 5 1 11 | 23 5 1 12 | 2 6 1 13 | 3 6 1 14 | 8 6 1 15 | 10 6 1 16 | 22 6 1 17 | 23 6 1 18 | 9 7 1 19 | 11 7 1 20 | 11 8 1 -------------------------------------------------------------------------------- /Exercises/Exercise13/README.md: 
-------------------------------------------------------------------------------- 1 | Exercise 13 - Porting CUDA to OpenCL 2 | ==================================== 3 | 4 | Goal 5 | ---- 6 | * To port the CUDA/serial C program to OpenCL 7 | 8 | Procedure 9 | --------- 10 | * Examine the CUDA kernel and identify which parts need changing 11 | * Change them to the OpenCL equivalents 12 | * Examine the Host code and port the commands to the OpenCL equivalents 13 | 14 | Expected output 15 | --------------- 16 | * The OpenCL and CUDA programs should produce the same output - check this! 17 | 18 | Examples 19 | -------- 20 | Some example input is provided in the Examples/ directory. 21 | The `.dat` files list the co-ordinates of each live cell in the grid, followed by a 1 (to signify alive). 22 | The `input.params` file lists the size of the grid (X then Y) and the number of iterations. 23 | 24 | Notes 25 | ----- 26 | 27 | See the Exercises/Exercise13/Examples directory for some sample input .dat, input.params files 28 | along with the expected final_state.dat for four different Game of Life patterns. 29 | -------------------------------------------------------------------------------- /Exercises/ExerciseA/README.md: -------------------------------------------------------------------------------- 1 | Exercise A - The vectorized Pi program 2 | ====================================== 3 | 4 | Goal 5 | ---- 6 | * To understand the vector instructions in the kernel programming language. 7 | 8 | Procedure 9 | --------- 10 | * Start with your best Pi program. 11 | * Unroll the loop 4 times. 12 | Verify that the program still works. 13 | * Use vector instructions in the body of the loop. 14 | 15 | Expected output 16 | --------------- 17 | * Output result plus an estimate of the error in the result. 18 | * Report the runtime and compare the vectorized and scalar versions of the program. 19 | * You could try running this on the CPU as well as the GPU... 20 | -------------------------------------------------------------------------------- /Exercises/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # This makefile will produce all the C binaries 3 | # in their respective directories 4 | 5 | CEXES = Exercise01/C/DeviceInfo Exercise02/C/vadd \ 6 | Exercise04/C/vadd Exercise05/C/vadd \ 7 | Exercise06/C/mult Exercise09/C/pi \ 8 | Exercise13/C/gameoflife 9 | 10 | CPPEXES = Exercise01/Cpp/DeviceInfo Exercise03/Cpp/vadd \ 11 | Exercise04/Cpp/vadd Exercise05/Cpp/vadd \ 12 | Exercise06/Cpp/mult Exercise09/Cpp/pi 13 | 14 | # Change this variable to specify the device type in all 15 | # the Makefiles to the OpenCL device type of choice 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | export DEVICE 18 | 19 | # In case you need to rename the C++ compiler, you can 20 | # do it in bulk here 21 | CPPC = g++ 22 | export CPPC 23 | 24 | ifndef CC 25 | CC = gcc 26 | endif 27 | export CC 28 | 29 | .PHONY : $(CEXES) $(CPPEXES) 30 | 31 | all: $(CEXES) $(CPPEXES) 32 | 33 | $(CEXES): 34 | $(MAKE) -C `dirname $@` 35 | 36 | $(CPPEXES): 37 | $(MAKE) -C `dirname $@` 38 | 39 | .PHONY : clean 40 | clean: 41 | for e in $(CEXES) $(CPPEXES); do $(MAKE) -C `dirname $$e` clean; done 42 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | This work is licensed under the Creative Commons Attribution 3.0 Unported License.
2 | 3 | To view a copy of this license, visit http://creativecommons.org/licenses/by/3.0/ 4 | or send a letter to Creative Commons, 444 Castro Street, Suite 900, Mountain View, California, 94041, USA. 5 | -------------------------------------------------------------------------------- /Solutions/C_common/device_picker.h: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.h 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | 19 | #define MAX_PLATFORMS 8 20 | #define MAX_DEVICES 16 21 | #define MAX_INFO_STRING 256 22 | 23 | 24 | unsigned getDeviceList(cl_device_id devices[MAX_DEVICES]) 25 | { 26 | cl_int err; 27 | 28 | // Get list of platforms 29 | cl_uint numPlatforms = 0; 30 | cl_platform_id platforms[MAX_PLATFORMS]; 31 | err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms); 32 | checkError(err, "getting platforms"); 33 | 34 | // Enumerate devices 35 | unsigned numDevices = 0; 36 | for (int i = 0; i < numPlatforms; i++) 37 | { 38 | cl_uint num = 0; 39 | err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 40 | MAX_DEVICES-numDevices, devices+numDevices, &num); 41 | checkError(err, "getting deviceS"); 42 | numDevices += num; 43 | } 44 | 45 | return numDevices; 46 | } 47 | 48 | void getDeviceName(cl_device_id device, char name[MAX_INFO_STRING]) 49 | { 50 | cl_device_info info = CL_DEVICE_NAME; 51 | 52 | // Special case for AMD 53 | #ifdef CL_DEVICE_BOARD_NAME_AMD 54 | clGetDeviceInfo(device, CL_DEVICE_VENDOR, MAX_INFO_STRING, name, NULL); 55 | if (strstr(name, "Advanced Micro Devices")) 56 | info = CL_DEVICE_BOARD_NAME_AMD; 57 | #endif 58 | 59 | clGetDeviceInfo(device, info, MAX_INFO_STRING, name, NULL); 60 | } 61 | 62 | 63 | int parseUInt(const char *str, cl_uint *output) 64 | { 65 | char *next; 66 | *output = strtoul(str, &next, 10); 67 | return !strlen(next); 68 | } 69 | 70 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 71 | { 72 | for (int i = 1; i < argc; i++) 73 | { 74 | if (!strcmp(argv[i], "--list")) 75 | { 76 | // Get list of devices 77 | cl_device_id devices[MAX_DEVICES]; 78 | unsigned numDevices = getDeviceList(devices); 79 | 80 | // Print device names 81 | if (numDevices == 0) 82 | { 83 | printf("No devices found.\n"); 84 | } 85 | else 86 | { 87 | printf("\n"); 88 | printf("Devices:\n"); 89 | for (int i = 0; i < numDevices; i++) 90 | { 91 | char name[MAX_INFO_STRING]; 92 | getDeviceName(devices[i], name); 93 | printf("%2d: %s\n", i, name); 94 | } 95 | printf("\n"); 96 | } 97 | exit(0); 98 | } 99 | else if (!strcmp(argv[i], "--device")) 100 | { 101 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 102 | { 103 | printf("Invalid device index\n"); 104 | exit(1); 105 | } 106 | } 107 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 108 | { 109 | printf("\n"); 110 | printf("Usage: ./program [OPTIONS]\n\n"); 111 | printf("Options:\n"); 112 | printf(" -h --help Print the message\n"); 113 | printf(" --list List available devices\n"); 114 | printf(" --device INDEX Select device at INDEX\n"); 115 | printf("\n"); 116 | exit(0); 117 | } 118 | } 119 
| } 120 | 121 | -------------------------------------------------------------------------------- /Solutions/C_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Solutions/Cpp_common/device_picker.hpp: -------------------------------------------------------------------------------- 1 | /*------------------------------------------------------------------------------ 2 | * 3 | * Name: device_picker.h 4 | * 5 | * Purpose: Provide a simple CLI to specify an OpenCL device at runtime 6 | * 7 | * Note: Must be included AFTER the relevant OpenCL header 8 | * See one of the Matrix Multiply exercises for usage 9 | * 10 | * HISTORY: Method written by James Price, October 2014 11 | * Extracted to a common header by Tom Deakin, November 2014 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #define MAX_INFO_STRING 256 21 | 22 | 23 | unsigned getDeviceList(std::vector& devices) 24 | { 25 | cl_int err; 26 | 27 | // Get list of platforms 28 | std::vector platforms; 29 | cl::Platform::get(&platforms); 30 | 31 | // Enumerate devices 32 | for (int i = 0; i < platforms.size(); i++) 33 | { 34 | cl_uint num = 0; 35 | std::vector plat_devices; 36 | platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &plat_devices); 37 | devices.insert(devices.end(), plat_devices.begin(), plat_devices.end()); 38 | } 39 | 40 | return devices.size(); 41 | } 42 | 43 | void getDeviceName(cl::Device& device, std::string& name) 44 | { 45 | cl_device_info info = CL_DEVICE_NAME; 46 | 47 | // Special case for AMD 48 | #ifdef CL_DEVICE_BOARD_NAME_AMD 49 | device.getInfo(CL_DEVICE_VENDOR, &name); 50 | if (strstr(name.c_str(), "Advanced Micro Devices")) 51 | info = CL_DEVICE_BOARD_NAME_AMD; 52 | #endif 53 | 54 | device.getInfo(info, &name); 55 | } 56 | 57 | 58 | int parseUInt(const char *str, cl_uint *output) 59 | { 60 | char *next; 61 | *output = strtoul(str, &next, 10); 62 | return !strlen(next); 63 | } 64 | 65 | void parseArguments(int argc, char *argv[], cl_uint *deviceIndex) 66 | { 67 | for (int i = 1; i < argc; i++) 68 | { 69 | if (!strcmp(argv[i], "--list")) 70 | { 71 | // Get list of devices 72 | std::vector devices; 73 | unsigned numDevices = getDeviceList(devices); 74 | 75 | // Print device names 76 | if (numDevices == 0) 77 | { 78 | std::cout << "No devices found.\n"; 79 | } 80 | else 81 | { 82 | std::cout << "\nDevices:\n"; 83 | for (int i = 0; i < numDevices; i++) 84 | { 85 | std::string name; 86 | getDeviceName(devices[i], name); 87 | std::cout << i << ": " << name << "\n"; 88 | } 89 | std::cout << "\n"; 90 | } 91 | exit(0); 92 | } 93 | else if (!strcmp(argv[i], "--device")) 94 | { 95 | if (++i >= argc || !parseUInt(argv[i], deviceIndex)) 96 | { 97 | std::cout << "Invalid device index\n"; 98 | exit(1); 99 | } 100 | } 101 | else if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) 102 | { 103 | std::cout << "\n"; 104 | std::cout << "Usage: ./program [OPTIONS]\n\n"; 105 | std::cout << "Options:\n"; 106 | std::cout << " -h --help 
Print the message\n"; 107 | std::cout << " --list List available devices\n"; 108 | std::cout << " --device INDEX Select device at INDEX\n"; 109 | std::cout << "\n"; 110 | exit(0); 111 | } 112 | } 113 | } 114 | 115 | -------------------------------------------------------------------------------- /Solutions/Cpp_common/wtime.c: -------------------------------------------------------------------------------- 1 | 2 | #ifdef _OPENMP 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | 10 | double wtime() 11 | { 12 | #ifdef _OPENMP 13 | /* Use omp_get_wtime() if we can */ 14 | return omp_get_wtime(); 15 | #else 16 | /* Use a generic timer */ 17 | static int sec = -1; 18 | struct timeval tv; 19 | gettimeofday(&tv, NULL); 20 | if (sec < 0) sec = tv.tv_sec; 21 | return (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 22 | #endif 23 | } 24 | 25 | 26 | -------------------------------------------------------------------------------- /Solutions/Exercise04/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd_chain: vadd_chain.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd_chain 34 | -------------------------------------------------------------------------------- /Solutions/Exercise04/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 
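The device_picker.hpp helpers above are the runtime device-selection path that the header's own note points at ("See one of the Matrix Multiply exercises for usage"). A minimal, hypothetical host program showing how the pieces fit together; the file is not part of this repo, and the exact context/queue setup is illustrative, based on the pattern those solutions follow:

    #define __CL_ENABLE_EXCEPTIONS
    #include "cl.hpp"
    #include "device_picker.hpp"   // must come after the OpenCL header, per its Note

    #include <cstdlib>
    #include <iostream>
    #include <string>
    #include <vector>

    int main(int argc, char *argv[])
    {
        cl_uint deviceIndex = 0;
        parseArguments(argc, argv, &deviceIndex);   // handles --list, --device N, --help

        std::vector<cl::Device> devices;            // every device on every platform
        unsigned numDevices = getDeviceList(devices);
        if (deviceIndex >= numDevices)
        {
            std::cout << "Invalid device index (use --list to see available devices)\n";
            return EXIT_FAILURE;
        }

        cl::Device device = devices[deviceIndex];
        std::string name;
        getDeviceName(device, name);
        std::cout << "Using OpenCL device: " << name << "\n";

        // Build a context and queue containing just the chosen device
        std::vector<cl::Device> chosen_device;
        chosen_device.push_back(device);
        cl::Context context(chosen_device);
        cl::CommandQueue queue(context, device);

        return EXIT_SUCCESS;
    }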
17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd_chain: vadd_chain.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd_chain 38 | -------------------------------------------------------------------------------- /Solutions/Exercise04/Cpp/vadd_chain.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum c = a+b 6 | // 7 | // input: a and b float vectors of length count 8 | // 9 | // output: c float vector of length count holding the sum a + b 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | const unsigned int count) 17 | { 18 | int i = get_global_id(0); 19 | if(i < count) { 20 | c[i] = a[i] + b[i]; 21 | } 22 | } -------------------------------------------------------------------------------- /Solutions/Exercise04/Python/vadd_chain.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of two vectors at a time in a chain (C=A+B; D=C+E; F=D+G) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: Initial version based on vadd.c, written by Tim Mattson, June 2011 8 | # Ported to C++ Wrapper API by Benedict Gaster, September 2011 9 | # Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 10 | # Ported to Python by Tom Deakin, July 2013 11 | # 12 | 13 | # Import the Python OpenCL API 14 | import pyopencl as cl 15 | # Import the Python Maths Library (for vectors) 16 | import numpy 17 | 18 | #------------------------------------------------------------------------------ 19 | 20 | # tolerance used in floating point comparisons 21 | TOL = 0.001 22 | # length of vectors a, b and c 23 | LENGTH = 1024 24 | 25 | #------------------------------------------------------------------------------ 26 | # 27 | # Kernel: vadd 28 | # 29 | # To compute the elementwise sum c = a + b 30 | # 31 | # Input: a and b float vectors of length count 32 | # Output c float vector of length count holding the sum a + b 33 | 34 | kernelsource = """ 35 | __kernel void vadd( 36 | __global float* a, 37 | __global float* b, 38 | __global float* c, 39 | const unsigned int count) 40 | { 41 | int i = get_global_id(0); 42 | if (i < count) 43 | c[i] = a[i] + b[i]; 44 | } 45 | """ 46 | 47 | #------------------------------------------------------------------------------ 48 | 49 | # Main procedure 50 | 51 | # Create a compute context 52 | # Ask the user to select a platform/device on the CLI 53 | context = cl.create_some_context() 54 | 55 | # Create a command queue 56 | queue = cl.CommandQueue(context) 57 | 58 | # Create the compute program from the source buffer 59 | # and build it 60 | program = cl.Program(context, kernelsource).build() 61 | 62 | # Create a, b, e and g vectors and fill with random float values 63 | # Create empty vectors for c, d and f 64 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 65 | h_b = 
numpy.random.rand(LENGTH).astype(numpy.float32) 66 | h_c = numpy.empty(LENGTH).astype(numpy.float32) 67 | h_d = numpy.empty(LENGTH).astype(numpy.float32) 68 | h_e = numpy.random.rand(LENGTH).astype(numpy.float32) 69 | h_f = numpy.empty(LENGTH).astype(numpy.float32) 70 | h_g = numpy.random.rand(LENGTH).astype(numpy.float32) 71 | 72 | # Create the input (a, b, e, g) arrays in device memory and copy data from host 73 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 74 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 75 | d_e = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_e) 76 | d_g = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_g) 77 | # Create the output (c, d, f) array in device memory 78 | d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_c.nbytes) 79 | d_d = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_d.nbytes) 80 | d_f = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_f.nbytes) 81 | 82 | vadd = program.vadd 83 | vadd.set_scalar_arg_dtypes([None, None, None, numpy.uint32]) 84 | 85 | # Execute the kernel over the entire range of our 1d input 86 | # allowing OpenCL runtime to select the work group items for the device 87 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, LENGTH) 88 | 89 | # Enqueue the kernel again, but with different arguments 90 | vadd(queue, h_e.shape, None, d_e, d_c, d_d, LENGTH) 91 | 92 | # Enqueue the kernel a third time, again with different arguments 93 | vadd(queue, h_g.shape, None, d_g, d_d, d_f, LENGTH) 94 | 95 | 96 | # Read back the results from the compute device 97 | cl.enqueue_copy(queue, h_f, d_f) 98 | 99 | # Test the results 100 | correct = 0; 101 | for a, b, e, f, g in zip(h_a, h_b, h_e, h_f, h_g): 102 | tmp = a + b + e + g 103 | # compute the deviation of expected and output result 104 | tmp -= f 105 | # correct if square deviation is less than tolerance squared 106 | if tmp*tmp < TOL*TOL: 107 | correct += 1 108 | else: 109 | print "tmp", tmp, "h_a", a, "h_b", b, "h_e", e, "h_g", g, "h_f", f 110 | 111 | # Summarize results 112 | print "3 vector adds to find F = A+B+E+G:", correct, "out of", LENGTH, "results were correct." 113 | -------------------------------------------------------------------------------- /Solutions/Exercise04/README.md: -------------------------------------------------------------------------------- 1 | Exercise 4 - Chaining vector add kernels (C++/Python) 2 | ===================================================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise05/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -lm 7 | 8 | LIBS = -lOpenCL -fopenmp 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Change this variable to specify the device type 13 | # to the OpenCL device type of choice. You can also 14 | # edit the variable in the source. 
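As the Makefile comment above says (and the ifndef DEVICE block that follows), the DEVICE make variable selects an OpenCL device type at build time: the Makefile forwards it with CCFLAGS += -D DEVICE=$(DEVICE), and the host source falls back to CL_DEVICE_TYPE_DEFAULT when the macro is absent. A small sketch of that round trip; the make invocation shown is illustrative:

    // Illustrative build:  make DEVICE=CL_DEVICE_TYPE_GPU
    // which compiles the host code with -D DEVICE=CL_DEVICE_TYPE_GPU.

    #define __CL_ENABLE_EXCEPTIONS
    #include "cl.hpp"

    #ifndef DEVICE                          // same fallback the solutions use
    #define DEVICE CL_DEVICE_TYPE_DEFAULT
    #endif

    int main()
    {
        // DEVICE expands to a cl_device_type constant, so the context is created
        // directly on the requested class of device (CPU, GPU, default, ...)
        cl::Context context(DEVICE);
        cl::CommandQueue queue(context);
        return 0;
    }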
15 | ifndef DEVICE 16 | DEVICE = CL_DEVICE_TYPE_DEFAULT 17 | endif 18 | 19 | # Check our platform and make sure we define the APPLE variable 20 | # and set up the right compiler flags and libraries 21 | PLATFORM = $(shell uname -s) 22 | ifeq ($(PLATFORM), Darwin) 23 | LIBS = -framework OpenCL 24 | endif 25 | 26 | CCFLAGS += -D DEVICE=$(DEVICE) 27 | 28 | vadd_abc: vadd_abc.c $(COMMON_DIR)/device_info.c 29 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 30 | 31 | 32 | clean: 33 | rm -f vadd_abc 34 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS=-std=c++11 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | # Change this variable to specify the device type 15 | # to the OpenCL device type of choice. You can also 16 | # edit the variable in the source. 17 | ifndef DEVICE 18 | DEVICE = CL_DEVICE_TYPE_DEFAULT 19 | endif 20 | 21 | # Check our platform and make sure we define the APPLE variable 22 | # and set up the right compiler flags and libraries 23 | PLATFORM = $(shell uname -s) 24 | ifeq ($(PLATFORM), Darwin) 25 | CPPC = clang++ 26 | CCFLAGS += -DAPPLE -stdlib=libc++ 27 | LIBS = -framework OpenCL 28 | endif 29 | 30 | CCFLAGS += -D DEVICE=$(DEVICE) 31 | 32 | vadd_abc: vadd_abc.cpp 33 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -I $(CPP_COMMON) -o $@ 34 | 35 | 36 | clean: 37 | rm -f vadd_abc 38 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Cpp/vadd_abc.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: vadd 4 | // 5 | // Purpose: Compute the elementwise sum d = a+b+c 6 | // 7 | // input: a, b and c float vectors of length count 8 | // 9 | // output: d float vector of length count holding the sum a + b + c 10 | // 11 | 12 | __kernel void vadd( 13 | __global float* a, 14 | __global float* b, 15 | __global float* c, 16 | __global float* d, 17 | const unsigned int count) 18 | { 19 | int i = get_global_id(0); 20 | if(i < count) { 21 | d[i] = a[i] + b[i] + c[i]; 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Cpp/vadd_abc.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Name: vadd_three.cpp 4 | // 5 | // Purpose: Elementwise addition of three vectors (d = a + b + c) 6 | // 7 | // d = a + b + c 8 | // 9 | // HISTORY: Written by Tim Mattson, June 2011 10 | // Ported to C++ Wrapper API by Benedict Gaster, September 2011 11 | // Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 12 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 13 | // Updated by Tom Deakin, October 2014 14 | // 15 | //------------------------------------------------------------------------------ 16 | 17 | #define __CL_ENABLE_EXCEPTIONS 18 | 19 | #include "cl.hpp" 20 | 21 | #include "util.hpp" // utility library 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | // pick up device type from compiler command line or from the default type 32 | #ifndef DEVICE 33 | 
#define DEVICE CL_DEVICE_TYPE_DEFAULT 34 | #endif 35 | 36 | #include "err_code.h" 37 | 38 | //------------------------------------------------------------------------------ 39 | 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define LENGTH (1024) // length of vectors a, b, and c 42 | 43 | int main(void) 44 | { 45 | std::vector<float> h_a(LENGTH); // a vector 46 | std::vector<float> h_b(LENGTH); // b vector 47 | std::vector<float> h_c(LENGTH); // c vector 48 | std::vector<float> h_d (LENGTH, 0xdeadbeef); // d vector (result) 49 | 50 | cl::Buffer d_a; // device memory used for the input a vector 51 | cl::Buffer d_b; // device memory used for the input b vector 52 | cl::Buffer d_c; // device memory used for the input c vector 53 | cl::Buffer d_d; // device memory used for the output d vector 54 | 55 | // Fill vectors a, b and c with random float values 56 | int count = LENGTH; 57 | for(int i = 0; i < count; i++) 58 | { 59 | h_a[i] = rand() / (float)RAND_MAX; 60 | h_b[i] = rand() / (float)RAND_MAX; 61 | h_c[i] = rand() / (float)RAND_MAX; 62 | } 63 | 64 | try 65 | { 66 | // Create a context 67 | cl::Context context(DEVICE); 68 | 69 | // Load in kernel source, creating a program object for the context 70 | 71 | cl::Program program(context, util::loadProgram("vadd_abc.cl"), true); 72 | 73 | // Get the command queue 74 | cl::CommandQueue queue(context); 75 | 76 | // Create the kernel functor 77 | 78 | cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd"); 79 | 80 | d_a = cl::Buffer(context, h_a.begin(), h_a.end(), true); 81 | d_b = cl::Buffer(context, h_b.begin(), h_b.end(), true); 82 | d_c = cl::Buffer(context, h_c.begin(), h_c.end(), true); 83 | 84 | d_d = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * LENGTH); 85 | 86 | vadd( 87 | cl::EnqueueArgs( 88 | queue, 89 | cl::NDRange(count)), 90 | d_a, 91 | d_b, 92 | d_c, 93 | d_d, 94 | count); 95 | 96 | cl::copy(queue, d_d, h_d.begin(), h_d.end()); 97 | 98 | // Test the results 99 | int correct = 0; 100 | float tmp; 101 | for(int i = 0; i < count; i++) 102 | { 103 | tmp = h_a[i] + h_b[i] + h_c[i]; // assign element i of a+b+c to tmp 104 | tmp -= h_d[i]; // compute deviation of expected and output result 105 | if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared 106 | correct++; 107 | else { 108 | printf(" tmp %f h_a %f h_b %f h_c %f h_d %f\n",tmp, h_a[i], h_b[i], h_c[i], h_d[i]); 109 | } 110 | } 111 | 112 | // summarize results 113 | printf("D = A+B+C: %d out of %d results were correct.\n", correct, count); 114 | 115 | } 116 | catch (cl::Error err) { 117 | std::cout << "Exception\n"; 118 | std::cerr 119 | << "ERROR: " 120 | << err.what() 121 | << "(" 122 | << err_code(err.err()) 123 | << ")" 124 | << std::endl; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /Solutions/Exercise05/Python/vadd_abc.py: -------------------------------------------------------------------------------- 1 | # 2 | # Vadd 3 | # 4 | # Element wise addition of three vectors at a time (R=A+B+C) 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: Initial version based on vadd.c, written by Tim Mattson, June 2011 8 | # Ported to C++ Wrapper API by Benedict Gaster, September 2011 9 | # Updated to C++ Wrapper API v1.2 by Tom Deakin and Simon McIntosh-Smith, October 2012 10 | # Ported to Python by Tom Deakin, July 2013 11 | # 12 | 13 | # Import the Python OpenCL API 14 | import pyopencl as cl 15 | # Import the Python Maths Library (for vectors) 16 | import numpy 17 | 18 |
#------------------------------------------------------------------------------ 19 | 20 | # tolerance used in floating point comparisons 21 | TOL = 0.001 22 | # length of vectors a, b and c 23 | LENGTH = 1024 24 | 25 | #------------------------------------------------------------------------------ 26 | # 27 | # Kernel: vadd 28 | # 29 | # To compute the elementwise sum r = a + b + c 30 | # 31 | # Input: a, b and c float vectors of length count 32 | # Output r float vector of length count holding the sum a + b + cs 33 | 34 | kernelsource = """ 35 | __kernel void vadd( 36 | __global float* a, 37 | __global float* b, 38 | __global float* c, 39 | __global float* r, 40 | const unsigned int count) 41 | { 42 | int i = get_global_id(0); 43 | if (i < count) 44 | r[i] = a[i] + b[i] + c[i]; 45 | } 46 | """ 47 | 48 | #------------------------------------------------------------------------------ 49 | 50 | # Main procedure 51 | 52 | # Create a compute context 53 | # Ask the user to select a platform/device on the CLI 54 | context = cl.create_some_context() 55 | 56 | # Create a command queue 57 | queue = cl.CommandQueue(context) 58 | 59 | # Create the compute program from the source buffer 60 | # and build it 61 | program = cl.Program(context, kernelsource).build() 62 | 63 | # Create a, b and c vectors and fill with random float values 64 | # Create empty vectors for r 65 | h_a = numpy.random.rand(LENGTH).astype(numpy.float32) 66 | h_b = numpy.random.rand(LENGTH).astype(numpy.float32) 67 | h_c = numpy.random.rand(LENGTH).astype(numpy.float32) 68 | h_r = numpy.empty(LENGTH).astype(numpy.float32) 69 | 70 | # Create the input (a, b, c) arrays in device memory and copy data from host 71 | d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_a) 72 | d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_b) 73 | d_c = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=h_c) 74 | # Create the output (r) array in device memory 75 | d_r = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_r.nbytes) 76 | 77 | # Execute the kernel over the entire range of our 1d input 78 | # allowing OpenCL runtime to select the work group items for the device 79 | vadd = program.vadd 80 | vadd.set_scalar_arg_dtypes([None, None, None, None, numpy.uint32]) 81 | vadd(queue, h_a.shape, None, d_a, d_b, d_c, d_r, LENGTH) 82 | 83 | # Read back the results from the compute device 84 | cl.enqueue_copy(queue, h_r, d_r) 85 | 86 | # Test the results 87 | correct = 0; 88 | for a, b, c, r in zip(h_a, h_b, h_c, h_r): 89 | tmp = a + b + c 90 | # compute the deviation of expected and output result 91 | tmp -= r 92 | # correct if square deviation is less than tolerance squared 93 | if tmp*tmp < TOL*TOL: 94 | correct += 1 95 | else: 96 | print "tmp", tmp, "h_a", a, "h_b", b, "h_c", c, "h_r", r 97 | 98 | # Summarize results 99 | print "1 vector adds to find R = A+B+C:", correct, "out of", LENGTH, "results were correct." 
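The vadd solutions above print only a correctness summary, but the wtime() helper dumped earlier (Solutions/C_common/wtime.c and Solutions/Cpp_common/wtime.c) is the timer the matrix-multiply drivers wrap around their kernel launches. A rough, self-contained sketch of using it; the placeholder loop stands in for an enqueue plus queue.finish(), and it assumes wtime.c is compiled and linked alongside with the same compiler, as the Makefiles in these solutions do:

    #include <cstdio>

    // Provided by C_common/wtime.c / Cpp_common/wtime.c (see above); the Cpp
    // Makefiles compile that file with the C++ compiler, hence no extern "C".
    extern double wtime();

    int main()
    {
        double start = wtime();

        // Placeholder work: in the real drivers this is a kernel launch followed
        // by queue.finish() so the elapsed time covers the device execution.
        volatile double sink = 0.0;
        for (int i = 0; i < 10000000; i++)
            sink += i * 1.0e-9;

        double elapsed = wtime() - start;
        std::printf("Elapsed time: %f seconds\n", elapsed);
        return 0;
    }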
100 | -------------------------------------------------------------------------------- /Solutions/Exercise05/README.md: -------------------------------------------------------------------------------- 1 | Exercise 5 - The D = A + B + C problem 2 | ====================================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # Modified by Tom Deakin, October 2014 8 | # 9 | 10 | ifndef CC 11 | CC = gcc 12 | endif 13 | 14 | CCFLAGS=-O3 -std=c99 -ffast-math 15 | 16 | LIBS = -lm -lOpenCL -fopenmp 17 | 18 | COMMON_DIR = ../../C_common 19 | 20 | MMUL_OBJS = wtime.o 21 | EXEC = mult 22 | 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | LIBS = -lm -framework OpenCL 29 | endif 30 | 31 | 32 | all: $(EXEC) 33 | 34 | mult: $(MMUL_OBJS) matmul.c matrix_lib.c 35 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC) 36 | 37 | wtime.o: $(COMMON_DIR)/wtime.c 38 | $(CC) -c $^ $(CCFLAGS) -o $@ 39 | 40 | .c.o: 41 | $(CC) -c $< $(CCFLAGS) -o $@ 42 | 43 | 44 | clean: 45 | rm -f $(MMUL_OBJS) $(EXEC) 46 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/matmul.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported to C by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern int output_device_info(cl_device_id ); 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | #define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/matrix_lib.c: 
-------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err = C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // 
Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise06/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # 8 | 9 | ifndef CPPC 10 
| CPPC=g++ 11 | endif 12 | 13 | CCFLAGS=-O3 -ffast-math 14 | 15 | LIBS = -lm -lOpenCL -fopenmp 16 | 17 | COMMON_DIR = ../../Cpp_common 18 | 19 | INC = -I $(COMMON_DIR) 20 | 21 | MMUL_OBJS = matmul.o matrix_lib.o wtime.o 22 | EXEC = mult 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | CPPC = clang++ 29 | CCFLAGS += -stdlib=libc++ 30 | LIBS = -lm -framework OpenCL 31 | endif 32 | 33 | all: $(EXEC) 34 | 35 | mult: $(MMUL_OBJS) 36 | $(CPPC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC) 37 | 38 | wtime.o: $(COMMON_DIR)/wtime.c 39 | $(CPPC) -c $^ $(CCFLAGS) -o $@ 40 | 41 | .c.o: 42 | $(CPPC) -c $< $(CCFLAGS) -o $@ 43 | 44 | .cpp.o: 45 | $(CPPC) -c $< $(CCFLAGS) $(INC) -o $@ 46 | 47 | matmul.o: matmul.hpp matrix_lib.hpp 48 | 49 | matrix_lib.o: matmul.hpp 50 | 51 | clean: 52 | rm -f $(MMUL_OBJS) $(EXEC) 53 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/matmul.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | 26 | #include "matrix_lib.hpp" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern double wtime(); // returns time since some fixed past point (wtime.c) 32 | 33 | //------------------------------------------------------------------------------ 34 | // Constants 35 | //------------------------------------------------------------------------------ 36 | #define ORDER 1024 // Order of the square matrices A, B, and C 37 | #define AVAL 3.0 // A elements are constant and equal to AVAL 38 | #define BVAL 5.0 // B elements are constant and equal to BVAL 39 | #define TOL (0.001) // tolerance used in floating point comparisons 40 | #define DIM 2 // Max dim for NDRange 41 | #define COUNT 1 // number of times to do each multiplication 42 | #define SUCCESS 1 43 | #define FAILURE 0 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/matrix_lib.cpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 
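Two numbers worth keeping in mind for these matrix libraries: initmat() fills A with AVAL and B with BVAL, so every element of C = A*B should equal ORDER * AVAL * BVAL = 1024 * 3 * 5 = 15360, which is exactly the cval that error() subtracts; and results() charges 2*N*N*N flops per multiplication (one multiply and one add per inner-loop iteration), so ORDER = 1024 corresponds to roughly 2147 MFLOP of work. A quick check of both values:

    #include <cstdio>

    int main()
    {
        const int    N    = 1024;          // ORDER
        const double AVAL = 3.0, BVAL = 5.0;

        // Every element of C is an N-term dot product of constant values
        double cval  = N * AVAL * BVAL;            // 15360
        // Work counted by results(): 2*N^3 floating point operations
        double mflop = 2.0 * N * N * N / 1.0e6;    // about 2147.48

        std::printf("expected C element: %.1f, work per multiply: %.2f MFLOP\n",
                    cval, mflop);
        return 0;
    }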
10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 15 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 16 | // 17 | //------------------------------------------------------------------------------ 18 | 19 | #include "matmul.hpp" 20 | 21 | //------------------------------------------------------------------------------ 22 | // 23 | // Function to compute the matrix product (sequential algorithm, dot prod) 24 | // 25 | //------------------------------------------------------------------------------ 26 | 27 | void seq_mat_mul_sdot(int N, std::vector& A, std::vector& B, std::vector& C) 28 | { 29 | int i, j, k; 30 | float tmp; 31 | 32 | for (i = 0; i < N; i++) { 33 | for (j = 0; j < N; j++) { 34 | tmp = 0.0f; 35 | for (k = 0; k < N; k++) { 36 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 37 | tmp += A[i*N+k] * B[k*N+j]; 38 | } 39 | C[i*N+j] = tmp; 40 | } 41 | } 42 | } 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to initialize the input matrices A and B 47 | // 48 | //------------------------------------------------------------------------------ 49 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C) 50 | { 51 | int i, j; 52 | 53 | /* Initialize matrices */ 54 | 55 | for (i = 0; i < N; i++) 56 | for (j = 0; j < N; j++) 57 | A[i*N+j] = AVAL; 58 | 59 | for (i = 0; i < N; i++) 60 | for (j = 0; j < N; j++) 61 | B[i*N+j] = BVAL; 62 | 63 | for (i = 0; i < N; i++) 64 | for (j = 0; j < N; j++) 65 | C[i*N+j] = 0.0f; 66 | } 67 | 68 | //------------------------------------------------------------------------------ 69 | // 70 | // Function to set a matrix to zero 71 | // 72 | //------------------------------------------------------------------------------ 73 | void zero_mat (int N, std::vector& C) 74 | { 75 | int i, j; 76 | 77 | for (i = 0; i < N; i++) 78 | for (j = 0; j < N; j++) 79 | C[i*N+j] = 0.0f; 80 | } 81 | 82 | //------------------------------------------------------------------------------ 83 | // 84 | // Function to fill Btrans(N,N) with transpose of B(N,N) 85 | // 86 | //------------------------------------------------------------------------------ 87 | void trans(int N, std::vector& B, std::vector& Btrans) 88 | { 89 | int i, j; 90 | 91 | for (i = 0; i < N; i++) 92 | for (j = 0; j < N; j++) 93 | Btrans[j*N+i] = B[i*N+j]; 94 | } 95 | 96 | //------------------------------------------------------------------------------ 97 | // 98 | // Function to compute errors of the product matrix 99 | // 100 | //------------------------------------------------------------------------------ 101 | float error(int N, std::vector& C) 102 | { 103 | int i,j; 104 | float cval, errsq, err; 105 | cval = (float) N * AVAL * BVAL; 106 | errsq = 0.0f; 107 | 108 | for (i = 0; i < N; i++) { 109 | for (j = 0; j < N; j++) { 110 | err = C[i*N+j] - cval; 111 | errsq += err * err; 112 | } 113 | } 114 | return errsq; 115 | } 116 | 117 | //------------------------------------------------------------------------------ 118 | // 119 | // Function to analyze and output results 120 | // 121 | //------------------------------------------------------------------------------ 122 | void results(int N, std::vector& C, double run_time) 123 | { 124 | 125 | float mflops; 126 | float errsq; 127 | 128 | mflops = 2.0 * N * N * N/(1000000.0f 
* run_time); 129 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 130 | errsq = error(N, C); 131 | if (std::isnan(errsq) || errsq > TOL) 132 | printf("\n Errors in multiplication: %f\n",errsq); 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 10 | // 11 | //------------------------------------------------------------------------------ 12 | 13 | #ifndef __MATRIX_LIB_HDR 14 | #define __MATRIX_LIB_HDR 15 | 16 | 17 | //------------------------------------------------------------------------------ 18 | // 19 | // Function to compute the matrix product (sequential algorithm, dot producdt) 20 | // 21 | //------------------------------------------------------------------------------ 22 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 23 | 24 | //------------------------------------------------------------------------------ 25 | // 26 | // Function to initialize the input matrices A and B 27 | // 28 | //------------------------------------------------------------------------------ 29 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 30 | 31 | //------------------------------------------------------------------------------ 32 | // 33 | // Function to set a matrix to zero 34 | // 35 | //------------------------------------------------------------------------------ 36 | void zero_mat (int N, std::vector &C); 37 | 38 | //------------------------------------------------------------------------------ 39 | // 40 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 41 | // 42 | //------------------------------------------------------------------------------ 43 | void trans(int N, std::vector& B, std::vector& Btrans); 44 | 45 | //------------------------------------------------------------------------------ 46 | // 47 | // Function to compute errors of the product matrix 48 | // 49 | //------------------------------------------------------------------------------ 50 | float error(int N, std::vector& C); 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // 55 | // Function to analyze and output results 56 | // 57 | //------------------------------------------------------------------------------ 58 | void results(int N, std::vector& C, double run_time); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each 
multiplication 18 | COUNT = 1 19 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Python/helper.py: -------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | import numpy 4 | 5 | # Function to compute the matrix product (sequential algorithm, dot prod) 6 | def seq_mat_mul_sdot( Ndim, A, B, C): 7 | for i in range(Ndim): 8 | for j in range(Ndim): 9 | tmp = 0.0 10 | for k in range(Ndim): 11 | tmp += A[i*Ndim+k] * B[k*Ndim+j] 12 | C[i*Ndim+j] = tmp 13 | 14 | # Function to compute errors of the product matrix 15 | def error( Ndim, C): 16 | cval = float(Ndim) * AVAL * BVAL 17 | errsq = 0.0 18 | for i in range(Ndim): 19 | for j in range(Ndim): 20 | err = C[i*Ndim+j] - cval 21 | errsq += err * err 22 | return errsq; 23 | 24 | 25 | # Function to analyze and output results 26 | def results( Ndim, C, run_time): 27 | mflops = ( 2.0 * (Ndim**(3)) )/(1000000.0* run_time) 28 | print run_time, "seconds at", mflops, "MFLOPS" 29 | errsq = error( Ndim, C) 30 | if numpy.isnan(errsq) or errsq > TOL: 31 | print "Errors in multiplication:", errsq 32 | -------------------------------------------------------------------------------- /Solutions/Exercise06/Python/matmul.py: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiplication Driver 3 | # 4 | # This is a driver program to test various ways of computing 5 | # the product: 6 | # C = A * B 7 | # 8 | # A and B are constant matrices, square and the order is 9 | # set as a constant, ORDER (see definitions.py). This is so 10 | # we can make a quick test of the multiplication result. 11 | # 12 | # History: C++ version written by Tim Mattson, August 2010 13 | # Modified by Simon McIntosh-Smith, September 2011 14 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 15 | # Ported to Python by Tom Deakin, July 2013 16 | # Modified to assume square matrices by Ben Elgar, November 2014 17 | # 18 | 19 | from helper import * 20 | from definitions import * 21 | 22 | import pyopencl as cl 23 | import numpy 24 | from time import time 25 | 26 | C_elem_KernelSource = ''' 27 | __kernel void mmul( 28 | const int N, 29 | __global float* A, 30 | __global float* B, 31 | __global float* C) 32 | { 33 | int k; 34 | int i = get_global_id(0); 35 | int j = get_global_id(1); 36 | float tmp = 0; 37 | if ((i < N) && (j < N)) 38 | { 39 | tmp = 0.0f; 40 | for (k=0; k 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | 29 | //------------------------------------------------------------------------------ 30 | // functions from ../Common 31 | //------------------------------------------------------------------------------ 32 | extern int output_device_info(cl_device_id ); 33 | extern double wtime(); // returns time since some fixed past point (wtime.c) 34 | 35 | //------------------------------------------------------------------------------ 36 | // Constants 37 | //------------------------------------------------------------------------------ 38 | #define ORDER 1024 // Order of the square matrices A, B, and C 39 | #define AVAL 3.0 // A elements are constant and equal to AVAL 40 | #define BVAL 5.0 // B elements are constant and equal to BVAL 41 | #define TOL (0.001) // tolerance used in floating point comparisons 42 | #define DIM 2 // Max dim for NDRange 43 | #define COUNT 1 // number of times to do each 
multiplication 44 | #define SUCCESS 1 45 | #define FAILURE 0 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C/matrix_lib.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err 
= C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot producdt) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C_elem.cl: -------------------------------------------------------------------------------- 1 | 2 | __kernel void mmul( 3 | const int 
N, 4 | __global float* A, 5 | __global float* B, 6 | __global float* C) 7 | { 8 | int k; 9 | int i = get_global_id(0); 10 | int j = get_global_id(1); 11 | float tmp; 12 | if ((i < N) && (j < N)) 13 | { 14 | tmp = 0.0f; 15 | for (k = 0; k < N; k++) 16 | tmp += A[i*N+k] * B[k*N+j]; 17 | C[i*N+j] = tmp; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C_row.cl: -------------------------------------------------------------------------------- 1 | 2 | __kernel void mmul( 3 | const int N, 4 | __global float* A, 5 | __global float* B, 6 | __global float* C) 7 | { 8 | int k, j; 9 | int i = get_global_id(0); 10 | float tmp; 11 | if (i < N) { 12 | for (j = 0; j < N; j++) { 13 | tmp = 0.0f; 14 | for (k = 0; k < N; k++) 15 | tmp += A[i*N+k] * B[k*N+j]; 16 | C[i*N+j] = tmp; 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Solutions/Exercise07/C_row_priv.cl: -------------------------------------------------------------------------------- 1 | 2 | __kernel void mmul( 3 | const int N, 4 | __global float* A, 5 | __global float* B, 6 | __global float* C) 7 | { 8 | int k, j; 9 | int i = get_global_id(0); 10 | float Awrk[1024]; 11 | float tmp; 12 | if (i < N) { 13 | for (k = 0; k < N; k++) 14 | Awrk[k] = A[i*N+k]; 15 | 16 | for (j = 0; j < N; j++) { 17 | tmp = 0.0f; 18 | for (k = 0; k < N; k++) 19 | tmp += Awrk[k] * B[k*N+j]; 20 | C[i*N+j] = tmp; 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # 8 | 9 | ifndef CPPC 10 | CPPC=g++ 11 | endif 12 | 13 | CCFLAGS =-O3 -ffast-math 14 | 15 | LIBS = -lm -lOpenCL -fopenmp 16 | 17 | COMMON_DIR = ../../Cpp_common 18 | 19 | INC = -I $(COMMON_DIR) 20 | 21 | MMUL_OBJS = matmul.o matrix_lib.o wtime.o 22 | EXEC = mult 23 | 24 | 25 | # Check our platform and make sure we define the APPLE variable 26 | # and set up the right compiler flags and libraries 27 | PLATFORM = $(shell uname -s) 28 | ifeq ($(PLATFORM), Darwin) 29 | CPPC = clang++ 30 | CCFLAGS += -stdlib=libc++ 31 | LIBS = -lm -framework OpenCL 32 | endif 33 | 34 | all: $(EXEC) 35 | 36 | mult: $(MMUL_OBJS) 37 | $(CPPC) $(MMUL_OBJS) $(CCFLAGS) $(LIBS) -o $(EXEC) 38 | 39 | wtime.o: $(COMMON_DIR)/wtime.c 40 | $(CPPC) -c $^ $(CCFLAGS) -o $@ 41 | 42 | .c.o: 43 | $(CPPC) -c $< $(CCFLAGS) -o $@ 44 | 45 | .cpp.o: 46 | $(CPPC) -c $< $(CCFLAGS) $(INC) -o $@ 47 | 48 | matmul.o: matmul.hpp matrix_lib.hpp 49 | 50 | matrix_lib.o: matmul.hpp 51 | 52 | clean: 53 | rm -f $(MMUL_OBJS) $(EXEC) 54 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Cpp/matmul.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // 10 | 
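C_elem.cl, C_row.cl and C_row_priv.cl above are Exercise 7's progression: one work-item per element of C (it reads both get_global_id(0) and get_global_id(1)), one work-item per row of C, and one work-item per row that first copies its row of A into the private array Awrk so that row is fetched from global memory once rather than N times. A sketch of the corresponding launch shapes in the cl.hpp functor style used elsewhere in these solutions; in the real driver each variant is built from its own .cl file, and the work-group size shown for the private-memory version is illustrative, not a tuned value:

    #define __CL_ENABLE_EXCEPTIONS
    #include "cl.hpp"

    // Sketch only: 'program' is assumed to be built from one of the kernels above,
    // and d_a, d_b, d_c are N*N float buffers already initialised by the caller.
    void launch_mmul(cl::Program& program, cl::CommandQueue& queue,
                     int N, cl::Buffer& d_a, cl::Buffer& d_b, cl::Buffer& d_c)
    {
        // All three variants share the signature (const int N, A, B, C).
        // In the real drivers each .cl file gets its own program; the three calls
        // below just show the different NDRange shapes.
        cl::make_kernel<int, cl::Buffer, cl::Buffer, cl::Buffer> mmul(program, "mmul");

        // C_elem.cl: one work-item per element of C, 2D global range N x N
        mmul(cl::EnqueueArgs(queue, cl::NDRange(N, N)), N, d_a, d_b, d_c);

        // C_row.cl: one work-item per row of C, 1D global range N
        mmul(cl::EnqueueArgs(queue, cl::NDRange(N)), N, d_a, d_b, d_c);

        // C_row_priv.cl: same 1D range, with an explicit work-group size
        // (64 here is purely illustrative)
        mmul(cl::EnqueueArgs(queue, cl::NDRange(N), cl::NDRange(64)), N, d_a, d_b, d_c);

        queue.finish();
    }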
//------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | #include "util.hpp" 26 | 27 | #include "matrix_lib.hpp" 28 | 29 | //------------------------------------------------------------------------------ 30 | // functions from ../Common 31 | //------------------------------------------------------------------------------ 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | #define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 10 | // 11 | //------------------------------------------------------------------------------ 12 | 13 | #ifndef __MATRIX_LIB_HDR 14 | #define __MATRIX_LIB_HDR 15 | 16 | 17 | //------------------------------------------------------------------------------ 18 | // 19 | // Function to compute the matrix product (sequential algorithm, dot producdt) 20 | // 21 | //------------------------------------------------------------------------------ 22 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 23 | 24 | //------------------------------------------------------------------------------ 25 | // 26 | // Function to initialize the input matrices A and B 27 | // 28 | //------------------------------------------------------------------------------ 29 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 30 | 31 | //------------------------------------------------------------------------------ 32 | // 33 | // Function to set a matrix to zero 34 | // 35 | //------------------------------------------------------------------------------ 36 | void zero_mat (int N, std::vector &C); 37 | 38 | //------------------------------------------------------------------------------ 39 | // 40 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 41 | // 42 | //------------------------------------------------------------------------------ 43 | void trans(int N, std::vector& B, std::vector& Btrans); 44 | 45 | //------------------------------------------------------------------------------ 46 | // 47 | // Function to compute 
errors of the product matrix 48 | // 49 | //------------------------------------------------------------------------------ 50 | float error(int N, std::vector& C); 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // 55 | // Function to analyze and output results 56 | // 57 | //------------------------------------------------------------------------------ 58 | void results(int N, std::vector& C, double run_time); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each multiplication 18 | COUNT = 1 19 | -------------------------------------------------------------------------------- /Solutions/Exercise07/Python/helper.py: -------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | 4 | # Function to compute the matrix product (sequential algorithm, dot prod) 5 | def seq_mat_mul_sdot(N, A, B, C): 6 | for i in range(N): 7 | for j in range(N): 8 | tmp = 0.0 9 | for k in range(N): 10 | tmp += A[i*N+k] * B[k*N+j] 11 | C[i*N+j] = tmp 12 | 13 | # Function to compute errors of the product matrix 14 | def error(N, C): 15 | cval = float(N) * AVAL * BVAL 16 | errsq = 0.0 17 | for i in range(N): 18 | for j in range(N): 19 | err = C[i*N+j] - cval 20 | errsq += err * err 21 | return errsq; 22 | 23 | 24 | # Function to analyze and output results 25 | def results(N, C, run_time): 26 | mflops = 2.0 * N * N * N/(1000000.0* run_time) 27 | print run_time, "seconds at", mflops, "MFLOPS" 28 | errsq = error(N, C) 29 | if (errsq > TOL): 30 | print "Errors in multiplication:", errsq 31 | -------------------------------------------------------------------------------- /Solutions/Exercise07/README.md: -------------------------------------------------------------------------------- 1 | Exercise 7 - using private memory 2 | ================================= 3 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Matrix Multiply example makefile 3 | # 4 | # History: Written by Tim mattson, August 2010 5 | # Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 6 | # Modified by Tom Deakin, July 2013 7 | # Modified by Tom Deakin, October 2014 8 | # 9 | 10 | ifndef CC 11 | CC = gcc 12 | endif 13 | 14 | CCFLAGS=-O3 -std=c99 -ffast-math 15 | 16 | LIBS = -lm -lOpenCL -fopenmp 17 | 18 | COMMON_DIR = ../../C_common 19 | 20 | MMUL_OBJS = wtime.o 21 | EXEC = mult 22 | 23 | 24 | # Check our platform and make sure we define the APPLE variable 25 | # and set up the right compiler flags and libraries 26 | PLATFORM = $(shell uname -s) 27 | ifeq ($(PLATFORM), Darwin) 28 | LIBS = -lm -framework OpenCL 29 | endif 30 | 31 | all: $(EXEC) 32 | 33 | mult: $(MMUL_OBJS) matmul.c matrix_lib.c 34 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $(EXEC) 35 | 36 | wtime.o: $(COMMON_DIR)/wtime.c 37 | $(CC) -c $^ $(CCFLAGS) -o $@ 38 | 39 | .c.o: 40 | $(CC) 
-c $< $(CCFLAGS) -o $@ 41 | 42 | 43 | clean: 44 | rm -f $(MMUL_OBJS) $(EXEC) 45 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/matmul.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // Include fle for the Matrix Multiply test harness 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported to C by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MULT_HDR 13 | #define __MULT_HDR 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #ifdef __APPLE__ 20 | #include 21 | #include 22 | #else 23 | #include 24 | #endif 25 | 26 | #include "matrix_lib.h" 27 | 28 | //------------------------------------------------------------------------------ 29 | // functions from ../Common 30 | //------------------------------------------------------------------------------ 31 | extern int output_device_info(cl_device_id ); 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | #define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/matrix_lib.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library for the multiplication driver 4 | // 5 | // PURPOSE: This is a simple set of functions to manipulate 6 | // matrices used with the multiplcation driver. 7 | // 8 | // USAGE: The matrices are square and the order is 9 | // set as a defined constant, ORDER. 
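//           The matrices are stored as flat 1-D arrays in row-major
//           order, so element (i,j) of an N x N matrix is at
//           index i*N+j.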
10 | // 11 | // HISTORY: Written by Tim Mattson, August 2010 12 | // Modified by Simon McIntosh-Smith, September 2011 13 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 14 | // Ported to C by Tom Deakin, 2013 15 | // 16 | //------------------------------------------------------------------------------ 17 | 18 | #include "matmul.h" 19 | 20 | //------------------------------------------------------------------------------ 21 | // 22 | // Function to compute the matrix product (sequential algorithm, dot prod) 23 | // 24 | //------------------------------------------------------------------------------ 25 | 26 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C) 27 | { 28 | int i, j, k; 29 | float tmp; 30 | 31 | for (i = 0; i < N; i++) { 32 | for (j = 0; j < N; j++) { 33 | tmp = 0.0f; 34 | for (k = 0; k < N; k++) { 35 | /* C(i,j) = sum(over k) A(i,k) * B(k,j) */ 36 | tmp += A[i*N+k] * B[k*N+j]; 37 | } 38 | C[i*N+j] = tmp; 39 | } 40 | } 41 | } 42 | 43 | //------------------------------------------------------------------------------ 44 | // 45 | // Function to initialize the input matrices A and B 46 | // 47 | //------------------------------------------------------------------------------ 48 | void initmat(int N, float *A, float *B, float *C) 49 | { 50 | int i, j; 51 | 52 | /* Initialize matrices */ 53 | 54 | for (i = 0; i < N; i++) 55 | for (j = 0; j < N; j++) 56 | A[i*N+j] = AVAL; 57 | 58 | for (i = 0; i < N; i++) 59 | for (j = 0; j < N; j++) 60 | B[i*N+j] = BVAL; 61 | 62 | for (i = 0; i < N; i++) 63 | for (j = 0; j < N; j++) 64 | C[i*N+j] = 0.0f; 65 | } 66 | 67 | //------------------------------------------------------------------------------ 68 | // 69 | // Function to set a matrix to zero 70 | // 71 | //------------------------------------------------------------------------------ 72 | void zero_mat (int N, float *C) 73 | { 74 | int i, j; 75 | 76 | for (i = 0; i < N; i++) 77 | for (j = 0; j < N; j++) 78 | C[i*N+j] = 0.0f; 79 | } 80 | 81 | //------------------------------------------------------------------------------ 82 | // 83 | // Function to fill Btrans(N,N) with transpose of B(N,N) 84 | // 85 | //------------------------------------------------------------------------------ 86 | void trans(int N, float *B, float *Btrans) 87 | { 88 | int i, j; 89 | 90 | for (i = 0; i < N; i++) 91 | for (j = 0; j < N; j++) 92 | Btrans[j*N+i] = B[i*N+j]; 93 | } 94 | 95 | //------------------------------------------------------------------------------ 96 | // 97 | // Function to compute errors of the product matrix 98 | // 99 | //------------------------------------------------------------------------------ 100 | float error(int N, float *C) 101 | { 102 | int i,j; 103 | float cval, errsq, err; 104 | cval = (float) N * AVAL * BVAL; 105 | errsq = 0.0f; 106 | 107 | for (i = 0; i < N; i++) { 108 | for (j = 0; j < N; j++) { 109 | err = C[i*N+j] - cval; 110 | errsq += err * err; 111 | } 112 | } 113 | return errsq; 114 | } 115 | 116 | //------------------------------------------------------------------------------ 117 | // 118 | // Function to analyze and output results 119 | // 120 | //------------------------------------------------------------------------------ 121 | void results(int N, float *C, double run_time) 122 | { 123 | float mflops; 124 | float errsq; 125 | 126 | mflops = 2.0 * N * N * N/(1000000.0f * run_time); 127 | printf(" %.2f seconds at %.1f MFLOPS \n", run_time,mflops); 128 | errsq = error(N, C); 129 | if (isnan(errsq) || errsq > TOL) { 130 | printf("\n Errors in 
multiplication: %f\n",errsq); 131 | exit(1); 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C/matrix_lib.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Ported by Tom Deakin, July 2013 9 | // 10 | //------------------------------------------------------------------------------ 11 | 12 | #ifndef __MATRIX_LIB_HDR 13 | #define __MATRIX_LIB_HDR 14 | 15 | 16 | //------------------------------------------------------------------------------ 17 | // 18 | // Function to compute the matrix product (sequential algorithm, dot product) 19 | // 20 | //------------------------------------------------------------------------------ 21 | void seq_mat_mul_sdot(int N, float *A, float *B, float *C); 22 | 23 | //------------------------------------------------------------------------------ 24 | // 25 | // Function to initialize the input matrices A and B 26 | // 27 | //------------------------------------------------------------------------------ 28 | void initmat(int N, float *A, float *B, float *C); 29 | 30 | //------------------------------------------------------------------------------ 31 | // 32 | // Function to set a matrix to zero 33 | // 34 | //------------------------------------------------------------------------------ 35 | void zero_mat (int N, float *C); 36 | 37 | //------------------------------------------------------------------------------ 38 | // 39 | // Function to fill Btrans(N,N) with transpose of B(N,N) 40 | // 41 | //------------------------------------------------------------------------------ 42 | void trans(int N, float *B, float *Btrans); 43 | 44 | //------------------------------------------------------------------------------ 45 | // 46 | // Function to compute errors of the product matrix 47 | // 48 | //------------------------------------------------------------------------------ 49 | float error(int N, float *C); 50 | 51 | 52 | //------------------------------------------------------------------------------ 53 | // 54 | // Function to analyze and output results 55 | // 56 | //------------------------------------------------------------------------------ 57 | void results(int N, float *C, double run_time); 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /Solutions/Exercise08/C_block_form.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------- 2 | // 3 | // PROGRAM: Blocked Matrix Multiplication kernel 4 | // 5 | // PURPOSE: Computes an element of the product matrix 6 | // 7 | // C = A * B 8 | // 9 | // Using the well known blocked algorithm. 10 | // 11 | // To derive this algorithm, start with the naive 12 | // triply nested loop algorithm with a dot product 13 | // for each element of C. Decompose each loop 14 | // into blocks of size blksz. This gives you 6 15 | // nested loops with three loops over blocks 16 | // and three loops over indices inside the blocks.
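//
//                  As an illustration (serial pseudo-code only, not part
//                  of this kernel), the fully decomposed form is:
//
//                    for (Iblk = 0; Iblk < N/blksz; Iblk++)
//                     for (Jblk = 0; Jblk < N/blksz; Jblk++)
//                      for (Kblk = 0; Kblk < N/blksz; Kblk++)
//                       for (iloc = 0; iloc < blksz; iloc++)
//                        for (jloc = 0; jloc < blksz; jloc++)
//                         for (kloc = 0; kloc < blksz; kloc++)
//                           C[(Iblk*blksz+iloc)*N + (Jblk*blksz+jloc)] +=
//                               A[(Iblk*blksz+iloc)*N + (Kblk*blksz+kloc)] *
//                               B[(Kblk*blksz+kloc)*N + (Jblk*blksz+jloc)];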
17 | // 18 | // Rearrange the loops to put the 3 loops over blocks 19 | // at the outermost loops of the loop nest. You'll 20 | // see that the three "inner" loops are just the 21 | // regular matrix product between blocks. 22 | // 23 | // The algorithms is simple. Keeping all the indices 24 | // straight is not. We will use the following 25 | // conventions: 26 | // 27 | // i,j,k ... indices of full, global matrices 28 | // Iblk, Jblk, Kblk ... indices of matrix blocks 29 | // iloc, jloc, kloc ... indices inside blocks 30 | // 31 | // HISTORY: Written by Tim Mattson, November 2013 32 | // Updated by Simon McIntosh-Smith, August 2014 33 | // 34 | // LICENSE: This work is licensed under the Creative Commons 35 | // Attribution 4.0 International License. 36 | // To view a copy of this license, visit 37 | // http://creativecommons.org/licenses/by/4.0/ 38 | // or send a letter to: 39 | // Creative Commons, 40 | // 444 Castro Street, Suite 900, 41 | // Mountain View, California, 94041, USA. 42 | // 43 | //------------------------------------------------------------- 44 | 45 | // It turns out that the compiler generates much better code if 46 | // we "hardwire" this block size. 16 works well for an NVIDIA 47 | // GPU, 32 works well for a CPU 48 | #define blksz 16 49 | 50 | __kernel void mmul( 51 | const unsigned int N, 52 | __global const float* restrict A, 53 | __global const float* restrict B, 54 | __global float* restrict C, 55 | __local float* restrict Awrk, 56 | __local float* restrict Bwrk) 57 | { 58 | int kloc, Kblk; 59 | float Ctmp=0.0f; 60 | 61 | // This work-item will compute element C(i,j) 62 | const int i = get_global_id(0); 63 | const int j = get_global_id(1); 64 | 65 | // Element C(i,j) is in block C(Iblk,Jblk) 66 | const int Iblk = get_group_id(0); 67 | const int Jblk = get_group_id(1); 68 | 69 | // C(i,j) is element C(iloc, jloc) of block C(Iblk, Jblk) 70 | const int iloc = get_local_id(0); 71 | const int jloc = get_local_id(1); 72 | 73 | // The number of blocks are the same in each dimension 74 | const int Num_BLK = N/blksz; 75 | 76 | // Setup the upper-left-corner (base address) for the A and 77 | // B blocks plus the increments to advance base addresses as 78 | // we loop over blocks 79 | int Abase = Jblk*N*blksz; 80 | const int Ainc = blksz; 81 | 82 | int Bbase = Iblk*blksz; 83 | const int Binc = blksz*N; 84 | 85 | 86 | // C(Iblk,Jblk) = (sum over Kblk) A(Iblk,Kblk)*B(Kblk,Jblk) 87 | for (Kblk = 0; Kblk 16 | #include 17 | #include 18 | #include 19 | 20 | #include 21 | 22 | #define __CL_ENABLE_EXCEPTIONS 23 | #include "cl.hpp" 24 | 25 | #include "util.hpp" 26 | 27 | #include "matrix_lib.hpp" 28 | 29 | //------------------------------------------------------------------------------ 30 | // functions from ../Common 31 | //------------------------------------------------------------------------------ 32 | extern double wtime(); // returns time since some fixed past point (wtime.c) 33 | 34 | //------------------------------------------------------------------------------ 35 | // Constants 36 | //------------------------------------------------------------------------------ 37 | #define ORDER 1024 // Order of the square matrices A, B, and C 38 | #define AVAL 3.0 // A elements are constant and equal to AVAL 39 | #define BVAL 5.0 // B elements are constant and equal to BVAL 40 | #define TOL (0.001) // tolerance used in floating point comparisons 41 | #define DIM 2 // Max dim for NDRange 42 | #define COUNT 1 // number of times to do each multiplication 43 | #define SUCCESS 1 44 | 
#define FAILURE 0 45 | 46 | #endif 47 | -------------------------------------------------------------------------------- /Solutions/Exercise08/Cpp/matrix_lib.hpp: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // PROGRAM: Matrix library include file (function prototypes) 4 | // 5 | // HISTORY: Written by Tim Mattson, August 2010 6 | // Modified by Simon McIntosh-Smith, September 2011 7 | // Modified by Tom Deakin and Simon McIntosh-Smith, October 2012 8 | // Updated to C++ Wrapper v1.2.6 by Tom Deakin, August 2013 9 | // Modified to assume square matrices by Simon McIntosh-Smith, Sep 2014 10 | // 11 | //------------------------------------------------------------------------------ 12 | 13 | #ifndef __MATRIX_LIB_HDR 14 | #define __MATRIX_LIB_HDR 15 | 16 | 17 | //------------------------------------------------------------------------------ 18 | // 19 | // Function to compute the matrix product (sequential algorithm, dot producdt) 20 | // 21 | //------------------------------------------------------------------------------ 22 | void seq_mat_mul_sdot(int N, std::vector &A, std::vector &B, std::vector &C); 23 | 24 | //------------------------------------------------------------------------------ 25 | // 26 | // Function to initialize the input matrices A and B 27 | // 28 | //------------------------------------------------------------------------------ 29 | void initmat(int N, std::vector& A, std::vector& B, std::vector& C); 30 | 31 | //------------------------------------------------------------------------------ 32 | // 33 | // Function to set a matrix to zero 34 | // 35 | //------------------------------------------------------------------------------ 36 | void zero_mat (int N, std::vector &C); 37 | 38 | //------------------------------------------------------------------------------ 39 | // 40 | // Function to fill Btrans(Mdim,Pdim) with transpose of B(Pdim,Mdim) 41 | // 42 | //------------------------------------------------------------------------------ 43 | void trans(int N, std::vector& B, std::vector& Btrans); 44 | 45 | //------------------------------------------------------------------------------ 46 | // 47 | // Function to compute errors of the product matrix 48 | // 49 | //------------------------------------------------------------------------------ 50 | float error(int N, std::vector& C); 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // 55 | // Function to analyze and output results 56 | // 57 | //------------------------------------------------------------------------------ 58 | void results(int N, std::vector& C, double run_time); 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /Solutions/Exercise08/Python/definitions.py: -------------------------------------------------------------------------------- 1 | 2 | # Order of the square matrices A, B and C 3 | ORDER = 1024 4 | 5 | # A elemetns are constant and equal to AVAL 6 | AVAL = 3.0 7 | 8 | # B elemetns are constant and equal to BVAL 9 | BVAL = 5.0 10 | 11 | # tolerance used in floating point comparisons 12 | TOL = 0.001 13 | 14 | # Max dim for NDRange 15 | DIM = 2 16 | 17 | # number of times to do each multiplication 18 | COUNT = 1 19 | -------------------------------------------------------------------------------- /Solutions/Exercise08/Python/helper.py: 
-------------------------------------------------------------------------------- 1 | 2 | from definitions import * 3 | 4 | # Function to compute the matrix product (sequential algorithm, dot prod) 5 | def seq_mat_mul_sdot(N, A, B, C): 6 | for i in range(N): 7 | for j in range(N): 8 | tmp = 0.0 9 | for k in range(N): 10 | tmp += A[i*N+k] * B[k*N+j] 11 | C[i*N+j] = tmp 12 | 13 | # Function to compute errors of the product matrix 14 | def error(N, C): 15 | cval = float(N) * AVAL * BVAL 16 | errsq = 0.0 17 | for i in range(N): 18 | for j in range(N): 19 | err = C[i*N+j] - cval 20 | errsq += err * err 21 | return errsq; 22 | 23 | 24 | # Function to analyze and output results 25 | def results(N, C, run_time): 26 | mflops = 2.0 * N * N * N/(1000000.0* run_time) 27 | print run_time, "seconds at", mflops, "MFLOPS" 28 | errsq = error(N, C) 29 | if (errsq > TOL): 30 | print "Errors in multiplication:", errsq 31 | -------------------------------------------------------------------------------- /Solutions/Exercise08/README.md: -------------------------------------------------------------------------------- 1 | Exercise 8 - using local memory 2 | =============================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise09/C/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CC 3 | CC = gcc 4 | endif 5 | 6 | CCFLAGS=-O3 -std=c99 7 | 8 | LIBS = -lOpenCL -fopenmp -lm 9 | 10 | COMMON_DIR = ../../C_common 11 | 12 | # Check our platform and make sure we define the APPLE variable 13 | # and set up the right compiler flags and libraries 14 | PLATFORM = $(shell uname -s) 15 | ifeq ($(PLATFORM), Darwin) 16 | LIBS = -framework OpenCL -lm 17 | endif 18 | 19 | 20 | pi_ocl: pi_ocl.c $(COMMON_DIR)/wtime.c $(COMMON_DIR)/device_info.c 21 | $(CC) $^ $(CCFLAGS) $(LIBS) -I $(COMMON_DIR) -o $@ 22 | 23 | 24 | clean: 25 | rm -f pi_ocl 26 | -------------------------------------------------------------------------------- /Solutions/Exercise09/Cpp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ifndef CPPC 3 | CPPC=g++ 4 | endif 5 | 6 | CPP_COMMON = ../../Cpp_common 7 | 8 | CCFLAGS= 9 | 10 | INC = -I $(CPP_COMMON) 11 | 12 | LIBS = -lOpenCL -lrt 13 | 14 | 15 | # Check our platform and make sure we define the APPLE variable 16 | # and set up the right compiler flags and libraries 17 | PLATFORM = $(shell uname -s) 18 | ifeq ($(PLATFORM), Darwin) 19 | CPPC = clang++ 20 | CCFLAGS += -stdlib=libc++ 21 | LIBS = -framework OpenCL 22 | endif 23 | 24 | pi_ocl: pi_ocl.cpp 25 | $(CPPC) $^ $(INC) $(CCFLAGS) $(LIBS) -o $@ 26 | 27 | 28 | clean: 29 | rm -f pi_ocl 30 | -------------------------------------------------------------------------------- /Solutions/Exercise09/Python/pi_ocl.py: -------------------------------------------------------------------------------- 1 | # 2 | # Pi reduction 3 | # 4 | # Numeric integration to estimate pi 5 | # Asks the user to select a device at runtime 6 | # 7 | # History: C version written by Tim Mattson, May 2010 8 | # Ported to the C++ Wrapper API by Benedict R. 
Gaster, September 2011 9 | # C++ version Updated by Tom Deakin and Simon McIntosh-Smith, October 2012 10 | # Ported to Python by Tom Deakin, July 2013 11 | # 12 | 13 | 14 | import pyopencl as cl 15 | import numpy 16 | from time import time 17 | 18 | # Some constant values 19 | INSTEPS = 512*512*512 20 | ITERS = 262144 21 | 22 | # Set some default values: 23 | # Default number of steps (updated later to device prefereable) 24 | in_nsteps = INSTEPS 25 | # Default number of iterations 26 | niters = ITERS 27 | 28 | # Create context, queue and build program 29 | context = cl.create_some_context() 30 | queue = cl.CommandQueue(context) 31 | kernelsource = open("../pi_ocl.cl").read() 32 | program = cl.Program(context, kernelsource).build() 33 | pi = program.pi 34 | pi.set_scalar_arg_dtypes([numpy.int32, numpy.float32, None, None]) 35 | 36 | # Get the max work group size for the kernel pi on our device 37 | device = context.devices[0] 38 | work_group_size = program.pi.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, device) 39 | 40 | 41 | # Now that we know the size of the work_groups, we can set the number of work 42 | # groups, the actual number of steps, and the step size 43 | nwork_groups = in_nsteps/(work_group_size*niters) 44 | 45 | if nwork_groups < 1: 46 | nwork_groups = device.max_compute_units 47 | work_group_size = in_nsteps/(nwork_groups*niters) 48 | 49 | nsteps = work_group_size * niters * nwork_groups 50 | step_size = 1.0 / float(nsteps) 51 | 52 | # vector to hold partial sum 53 | h_psum = numpy.empty(nwork_groups).astype(numpy.float32) 54 | 55 | print nwork_groups, "work groups of size", work_group_size, ".", 56 | print nsteps, "Integration steps" 57 | 58 | d_partial_sums = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_psum.nbytes) 59 | 60 | # Start the timer 61 | rtime = time() 62 | 63 | # Execute the kernel over the entire range of our 1d input data et 64 | # using the maximum number of work group items for this device 65 | # Set the global and local size as tuples 66 | global_size = ((nwork_groups * work_group_size),) 67 | local_size = ((work_group_size),) 68 | localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size) 69 | 70 | pi(queue, global_size, local_size, 71 | niters, step_size, 72 | localmem, d_partial_sums) 73 | 74 | cl.enqueue_copy(queue, h_psum, d_partial_sums) 75 | 76 | # complete the sum and compute the final integral value 77 | pi_res = h_psum.sum() * step_size 78 | 79 | # Stop the timer 80 | rtime = time() - rtime 81 | print "The calculation ran in", rtime, "seconds" 82 | print "pi =", pi_res, "for", nsteps, "steps" 83 | 84 | -------------------------------------------------------------------------------- /Solutions/Exercise09/README.md: -------------------------------------------------------------------------------- 1 | Exercise 9 - The Pi program 2 | =========================== 3 | -------------------------------------------------------------------------------- /Solutions/Exercise09/pi_ocl.cl: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // 3 | // kernel: pi 4 | // 5 | // Purpose: accumulate partial sums of pi comp 6 | // 7 | // input: float step_size 8 | // int niters per work item 9 | // local float* an array to hold sums from each work item 10 | // 11 | // output: partial_sums float vector of partial sums 12 | // 13 | 14 | 15 | void reduce( 16 | __local float*, 17 | __global float*); 18 | 19 | 20 | 
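// Each work-item accumulates its own partial sum over 'niters'
// consecutive integration steps (istart to iend below); reduce()
// then combines the per-work-item values held in local_sums into
// one entry of partial_sums per work-group.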
__kernel void pi( 21 | const int niters, 22 | const float step_size, 23 | __local float* local_sums, 24 | __global float* partial_sums) 25 | { 26 | int num_wrk_items = get_local_size(0); 27 | int local_id = get_local_id(0); 28 | int group_id = get_group_id(0); 29 | 30 | float x, accum = 0.0f; 31 | int i,istart,iend; 32 | 33 | istart = (group_id * num_wrk_items + local_id) * niters; 34 | iend = istart+niters; 35 | 36 | for(i= istart; i work_group_size: 80 | work_group_size = max_size 81 | nwork_groups = in_nsteps/(work_group_size*niters) 82 | 83 | 84 | if nwork_groups < 1: 85 | nwork_groups = device.max_compute_units 86 | work_group_size = in_nsteps/(nwork_groups*niters) 87 | 88 | nsteps = work_group_size * niters * nwork_groups 89 | step_size = 1.0 / float(nsteps) 90 | 91 | # vector to hold partial sum 92 | h_psum = numpy.empty(nwork_groups).astype(numpy.float32) 93 | 94 | print nwork_groups, "work groups of size", work_group_size, ".", 95 | print nsteps, "Integration steps" 96 | 97 | d_partial_sums = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_psum.nbytes) 98 | 99 | # Start the timer 100 | rtime = time() 101 | 102 | # Execute the kernel over the entire range of our 1d input data et 103 | # using the maximum number of work group items for this device 104 | # Set the global and local size as tuples 105 | global_size = ((nwork_groups * work_group_size),) 106 | local_size = ((work_group_size),) 107 | 108 | localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * work_group_size) 109 | 110 | pi(queue, global_size, local_size, 111 | niters, 112 | step_size, 113 | localmem, 114 | d_partial_sums) 115 | 116 | cl.enqueue_copy(queue, h_psum, d_partial_sums) 117 | 118 | # complete the sum and compute the final integral value 119 | pi_res = h_psum.sum() * step_size 120 | 121 | # Stop the timer 122 | rtime = time() - rtime 123 | print "The calculation ran in", rtime, "seconds" 124 | print "pi =", pi_res, "for", nsteps, "steps" 125 | 126 | -------------------------------------------------------------------------------- /Solutions/ExerciseA/README.md: -------------------------------------------------------------------------------- 1 | Exercise A - The vectorized Pi program 2 | ====================================== 3 | -------------------------------------------------------------------------------- /Solutions/Makefile: -------------------------------------------------------------------------------- 1 | 2 | # This makefile will produce all the C binaries 3 | # in their respective directories 4 | 5 | CEXES = Exercise04/C/vadd_chain Exercise05/C/vadd_abc \ 6 | Exercise06/C/mult Exercise07/C/mult \ 7 | Exercise08/C/mult Exercise09/C/pi_ocl \ 8 | Exercise13/C/gameoflife ExerciseA/C/pi_vocl 9 | 10 | CPPEXES = Exercise04/Cpp/vadd_chain Exercise05/Cpp/vadd_abc \ 11 | Exercise06/Cpp/mult Exercise07/Cpp/mult \ 12 | Exercise08/Cpp/mult Exercise08/Cpp/pi_ocl \ 13 | Exercise13/Cpp/gameoflife ExerciseA/Cpp/pi_vocl 14 | 15 | # Change this variable to specify the device type in all 16 | # the Makefile to the OpenCL device type of choice 17 | DEVICE = CL_DEVICE_TYPE_DEFAULT 18 | export DEVICE 19 | 20 | # Incase you need to rename the C++ compiler, you can 21 | # do it in bulk here 22 | CPPC = g++ 23 | export CPPC 24 | 25 | ifndef CC 26 | CC = gcc 27 | endif 28 | export CC 29 | 30 | .PHONY : $(CEXES) $(CPEXES) 31 | 32 | all: $(CEXES) $(CPPEXES) 33 | 34 | $(CEXES): 35 | $(MAKE) -C `dirname $@` 36 | 37 | $(CPPEXES): 38 | $(MAKE) -C `dirname $@` 39 | 40 | .PHONY : clean 41 | clean: 42 | for e in $(CEXES) $(CPPEXES); 
do $(MAKE) -C `dirname $$e` clean; done 43 | -------------------------------------------------------------------------------- /Tools/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HandsOnOpenCL/Exercises-Solutions/be2fb26d7c478627598ffba369014a4adb57b6f4/Tools/.DS_Store -------------------------------------------------------------------------------- /Tools/genErrCode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Usage: ./genErrCode.py /path/to/cl.h > err_code.h 4 | 5 | from __future__ import print_function 6 | import sys 7 | 8 | if len(sys.argv) != 2: 9 | print("Usage: python genErrCode.py /path/to/cl.h", file = sys.stderr) 10 | sys.exit(1) 11 | 12 | hfile = open(sys.argv[1], "r") 13 | 14 | # Find the start of the error code list 15 | for l in hfile: 16 | if l == "/* Error Codes */\n": 17 | # Found the error code comment 18 | break 19 | 20 | errors = [] 21 | # Loop through the errors and construct the list of errors 22 | for l in hfile: 23 | # Skip if a blank line 24 | if l == "\n": 25 | continue 26 | 27 | tokens = l.split() 28 | # We expect the line to be of the form: 29 | # #define CL_... int 30 | # OpenCL error numbers are 0 or negative 31 | if len(tokens) != 3 or int(tokens[2]) > 0: 32 | # We are done or some error 33 | break 34 | else: 35 | errors.append(tokens[1]) 36 | 37 | # Print out the C file 38 | print(''' 39 | #pragma once 40 | /*---------------------------------------------------------------------------- 41 | * 42 | * Name: err_code() 43 | * 44 | * Purpose: Function to output descriptions of errors for an input error code 45 | * and quit a program on an error with a user message 46 | * 47 | * 48 | * RETURN: echoes the input error code / echos user message and exits 49 | * 50 | * HISTORY: Written by Tim Mattson, June 2010 51 | * This version automatically produced by genErrCode.py 52 | * script written by Tom Deakin, August 2013 53 | * Modified by Bruce Merry, March 2014 54 | * Updated by Tom Deakin, October 2014 55 | * Included the checkError function written by 56 | * James Price and Simon McIntosh-Smith 57 | * 58 | *---------------------------------------------------------------------------- 59 | */ 60 | #if defined(__APPLE__) || defined(__MACOSX) 61 | #include 62 | #else 63 | #include 64 | #endif 65 | 66 | #ifdef __cplusplus 67 | #include 68 | #endif 69 | 70 | const char *err_code (cl_int err_in) 71 | { 72 | switch (err_in) {''') 73 | for err in errors: 74 | print(' case ' + err + ':') 75 | print(' return (char*)"' + err.strip() + '";') 76 | 77 | print(''' 78 | default: 79 | return (char*)"UNKNOWN ERROR"; 80 | } 81 | } 82 | ''') 83 | 84 | # Check error funtion 85 | print(''' 86 | void check_error(cl_int err, const char *operation, char *filename, int line) 87 | { 88 | if (err != CL_SUCCESS) 89 | { 90 | fprintf(stderr, "Error during operation '%s', ", operation); 91 | fprintf(stderr, "in '%s' on line %d\\n", filename, line); 92 | fprintf(stderr, "Error code was \\"%s\\" (%d)\\n", err_code(err), err); 93 | exit(EXIT_FAILURE); 94 | } 95 | } 96 | ''') 97 | 98 | # Macro version of checkError without need for file and line 99 | print(''' 100 | #define checkError(E, S) check_error(E,S,__FILE__,__LINE__) 101 | ''') 102 | 103 | -------------------------------------------------------------------------------- /Tools/stringify_opencl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 
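# Turn an OpenCL kernel source file into a C string constant so the
# kernel can be embedded directly in a host binary.  For example,
#
#   ./stringify_opencl pi_ocl.cl pi_ocl.h
#
# writes a header declaring:  const char *pi_ocl_ocl = "...";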
3 | IN=$1 4 | OUT=$2 5 | NAME=${IN%.cl} 6 | NAME=${NAME##*/} 7 | 8 | echo "const char *"$NAME"_ocl =" >$OUT 9 | sed -e 's/\\/\\\\/g;s/"/\\"/g;s/^/"/;s/$/\\n"/' $IN >>$OUT 10 | echo ";" >>$OUT 11 | --------------------------------------------------------------------------------