├── mixbench-opencl ├── check-half2-def.cpp ├── mix_kernels_ocl.h ├── README.md ├── CMakeLists.txt ├── mix_kernels.cl ├── main-ocl.cpp ├── loclutil.h └── mix_kernels_ocl.cpp ├── mixbench-cuda ├── mix_kernels_cuda.h ├── CMakeLists.txt ├── main-cuda.cpp ├── README.md ├── lcutil.h └── mix_kernels_cuda.cu ├── mixbench-hip ├── mix_kernels_hip.h ├── main-hip.cpp ├── CMakeLists.txt ├── lhiputil.h ├── README.md └── mix_kernels_hip.cpp ├── .gitattributes ├── mixbench-cpu ├── mix_kernels_cpu.h ├── Dockerfile ├── README.md ├── CMakeLists.txt ├── main.cpp └── mix_kernels_cpu.cpp ├── mixbench-sycl ├── mix_kernels_sycl.h ├── CMakeLists.txt ├── lsyclutil.h ├── main-sycl.cpp ├── README.md └── mix_kernels_sycl.cpp ├── include ├── common.h └── timestamp.h ├── .gitignore ├── .clang-format ├── README.md └── LICENSE /mixbench-opencl/check-half2-def.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char* argv[]) { 4 | cl_half2 dummy; 5 | } 6 | -------------------------------------------------------------------------------- /mixbench-cuda/mix_kernels_cuda.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cuda.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #pragma once 8 | 9 | extern "C" void mixbenchGPU(double*, long size); 10 | 11 | -------------------------------------------------------------------------------- /mixbench-hip/mix_kernels_hip.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_hip.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #pragma once 8 | 9 | extern "C" void mixbenchGPU(double*, long size); 10 | 11 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.cpp text 8 | *.cu text 9 | *.h text 10 | *.cl text 11 | Makefile text 12 | -------------------------------------------------------------------------------- /mixbench-cpu/mix_kernels_cpu.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cpu.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _MIX_KERNELS_CPU_H_ 8 | #define _MIX_KERNELS_CPU_H_ 9 | 10 | void mixbenchCPU(double*, size_t); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /mixbench-sycl/mix_kernels_sycl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_sycl.h: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 
4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #ifndef _MIX_KERNELS_SYCL_H_ 9 | #define _MIX_KERNELS_SYCL_H_ 10 | 11 | void mixbenchGPU(const sycl::device&, void*, long, bool, size_t); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /mixbench-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:rolling 2 | 3 | RUN apt-get update \ 4 | && apt-get install -y \ 5 | g++ \ 6 | cmake 7 | 8 | ADD . /root/mixbench-cpu 9 | 10 | WORKDIR /root/mixbench-cpu 11 | 12 | RUN cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS="-march=native -funroll-loops" -B build-docker ./ 13 | RUN cmake --build build-docker 14 | 15 | CMD /root/mixbench-cpu/build-docker/mixbench-cpu 16 | -------------------------------------------------------------------------------- /mixbench-opencl/mix_kernels_ocl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_ocl_ro.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #pragma once 8 | 9 | #ifdef __APPLE__ 10 | # include 11 | #else 12 | # include 13 | #endif 14 | 15 | extern "C" void mixbenchGPU(cl_device_id, double*, long, bool, bool, bool, size_t, unsigned int, unsigned int); 16 | 17 | -------------------------------------------------------------------------------- /include/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | auto benchmark(Op op) { 8 | auto duration = op(); // drop first measurement 9 | std::vector measurements; 10 | for (int i = 1; i < total_runs; i++) { 11 | duration = op(); 12 | measurements.push_back(duration); 13 | } 14 | return *std::min_element(std::begin(measurements), std::end(measurements)); 15 | } 16 | -------------------------------------------------------------------------------- /mixbench-opencl/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-opencl 2 | 3 | This is the OpenCL implementation of mixbench. 4 | 5 | ## Building notes 6 | 7 | Occasionally, (depending on the CMake version) the OpenCL files might not be 8 | discovered automatically. 9 | In such cases you might need to provide the OpenCL directories explicitly, 10 | as seen in the examples below: 11 | 12 | ``` 13 | cmake ../mixbench-opencl -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ 14 | cmake ../mixbench-opencl -DOpenCL_LIBRARY=/opt/rocm/lib/libOpenCL.so -DOpenCL_INCLUDE_DIR=/opt/rocm/opencl/include/ 15 | cmake ../mixbench-opencl -DOpenCL_LIBRARY=/opt/amdgpu-pro/lib/x86_64-linux-gnu/libOpenCL.so 16 | ``` 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. 
Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Specific executables 32 | mixbench-cuda 33 | mixbench-hip 34 | mixbench-ocl 35 | mixbench-sycl 36 | mixbench-cpu 37 | 38 | # But not the code itself 39 | !mixbench-cuda/ 40 | !mixbench-hip/ 41 | !mixbench-ocl/ 42 | !mixbench-sycl/ 43 | !mixbench-cpu/ 44 | 45 | # Debug files 46 | *.dSYM/ 47 | 48 | # Build folders 49 | build*/ 50 | mixbench-cuda/build*/ 51 | mixbench-hip/build*/ 52 | mixbench-opencl/build*/ 53 | mixbench-sycl/build*/ 54 | mixbench-cpu/build*/ 55 | 56 | # Other 57 | version_info.h 58 | -------------------------------------------------------------------------------- /mixbench-cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | project(mixbench LANGUAGES CXX CUDA) 3 | 4 | # Include CUDA header directory in cpp files 5 | include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 6 | 7 | string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") 8 | string(APPEND CMAKE_CUDA_FLAGS " -Wno-deprecated-gpu-targets") 9 | string(APPEND CMAKE_CUDA_FLAGS " --cudart=static") 10 | 11 | # Get version info from git tag 12 | execute_process(COMMAND git describe --tags --always 13 | OUTPUT_VARIABLE GIT_REV 14 | ERROR_QUIET) 15 | 16 | if ("${GIT_REV}" STREQUAL "") 17 | set(GIT_REV "") 18 | endif() 19 | string(STRIP "${GIT_REV}" GIT_REV) 20 | file(WRITE "version_info.h" "#define VERSION_INFO \"") 21 | file(APPEND "version_info.h" ${GIT_REV}) 22 | file(APPEND "version_info.h" "\"") 23 | 24 | add_executable(mixbench-cuda main-cuda.cpp mix_kernels_cuda.h lcutil.h version_info.h mix_kernels_cuda.cu mix_kernels_cuda.h lcutil.h) 25 | 26 | target_compile_features(mixbench-cuda PUBLIC cxx_std_11) 27 | 28 | -------------------------------------------------------------------------------- /mixbench-cuda/main-cuda.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-cuda.cpp: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lcutil.h" 13 | #include "mix_kernels_cuda.h" 14 | #include "version_info.h" 15 | 16 | #define VECTOR_SIZE (32 * 1024 * 1024) 17 | 18 | int main(int argc, char* argv[]) { 19 | printf("mixbench (%s)\n", VERSION_INFO); 20 | 21 | unsigned int datasize = VECTOR_SIZE * sizeof(double); 22 | 23 | cudaSetDevice(0); 24 | StoreDeviceInfo(stdout); 25 | 26 | size_t freeCUDAMem, totalCUDAMem; 27 | cudaMemGetInfo(&freeCUDAMem, &totalCUDAMem); 28 | printf("Total GPU memory %lu, free %lu\n", totalCUDAMem, freeCUDAMem); 29 | printf("Buffer size: %dMB\n", datasize / (1024 * 1024)); 30 | 31 | double* c; 32 | c = (double*)malloc(datasize); 33 | 34 | mixbenchGPU(c, VECTOR_SIZE); 35 | 36 | free(c); 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /mixbench-cuda/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-cuda 2 | 3 | This is the CUDA implementation of mixbench. 4 | It is actually the original implementation of this benchmark. 5 | 6 | ## Building 7 | 8 | To build the executable, run the following commands. 9 | 10 | > The minimum required CMake version is 3.18. 
11 | 12 | ```sh 13 | mkdir build 14 | cd build 15 | cmake ../mixbench-cuda -DCMAKE_CUDA_ARCHITECTURES=native 16 | cmake --build ./ 17 | ``` 18 | 19 | This will build and write a `mixbench-cuda` executable file in the `build/` 20 | directory, compiled with support for the native CUDA architecture. Note that 21 | the `-arch=native` flag was [introduced in CUDA 11.5 update 1][1]. If you 22 | are using a prior version, or wish to compile the program for a specific 23 | architecture, replace `native` in the above command with the architecture. 24 | For example, to compile for the `sm_120` architecture, we would run: 25 | 26 | ``` 27 | mkdir build 28 | cd build 29 | cmake ../mixbench-cuda -DCMAKE_CUDA_ARCHITECTURES=sm_120 30 | cmake --build ./ 31 | ``` 32 | 33 | [1]: https://docs.nvidia.com/cuda/cuda-features-archive/index.html#compiler 34 | -------------------------------------------------------------------------------- /include/timestamp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __linux__ 4 | 5 | #include 6 | #include 7 | #ifndef CLOCK_MONOTONIC_RAW 8 | #include 9 | 10 | typedef struct timeval timestamp; 11 | inline timestamp getTimestamp(void){ 12 | struct timeval t; 13 | gettimeofday(&t, NULL); 14 | return t; 15 | } 16 | inline float getElapsedtime(timestamp t){ 17 | struct timeval tn; 18 | gettimeofday(&tn, NULL); 19 | return (tn.tv_sec - t.tv_sec) * 1000.0f + (tn.tv_usec - t.tv_usec) / 1000.0f; 20 | } 21 | #else 22 | typedef struct timespec timestamp; 23 | inline timestamp getTimestamp(void){ 24 | struct timespec t; 25 | clock_gettime(CLOCK_MONOTONIC_RAW, &t); 26 | return t; 27 | } 28 | inline double getElapsedtime(timestamp t){ 29 | struct timespec tn; 30 | clock_gettime(CLOCK_MONOTONIC_RAW, &tn); 31 | return (double)(tn.tv_sec - t.tv_sec) * 1000.0 + (tn.tv_nsec - t.tv_nsec) / 1000000.0; 32 | } 33 | #endif 34 | 35 | #else 36 | 37 | #include 38 | 39 | typedef clock_t timestamp; 40 | inline timestamp getTimestamp(void){ 41 | return clock(); 42 | } 43 | inline double getElapsedtime(timestamp t){ 44 | return ((double)clock()-t) / CLOCKS_PER_SEC * 1000.0; 45 | } 46 | 47 | #endif 48 | 49 | -------------------------------------------------------------------------------- /mixbench-sycl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # required cmake version 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | 7 | project(mixbench-sycl LANGUAGES CXX) 8 | 9 | include_directories("../include") 10 | 11 | # Set default build type to RelWithDebInfo if not specified 12 | if (NOT CMAKE_BUILD_TYPE) 13 | message (STATUS "Default CMAKE_BUILD_TYPE not set. 
Using Release with Debug Info") 14 | set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE 15 | STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" 16 | FORCE) 17 | endif() 18 | 19 | # Get version info from git tag 20 | execute_process(COMMAND git describe --tags --always 21 | OUTPUT_VARIABLE GIT_REV 22 | ERROR_QUIET) 23 | 24 | if ("${GIT_REV}" STREQUAL "") 25 | set(GIT_REV "") 26 | endif() 27 | string(STRIP "${GIT_REV}" GIT_REV) 28 | file(WRITE "version_info.h" "#define VERSION_INFO \"") 29 | file(APPEND "version_info.h" ${GIT_REV}) 30 | file(APPEND "version_info.h" "\"") 31 | 32 | add_executable(mixbench-sycl main-sycl.cpp lsyclutil.h mix_kernels_sycl.cpp) 33 | 34 | set_target_properties(mixbench-sycl PROPERTIES CXX_EXTENSIONS OFF) 35 | -------------------------------------------------------------------------------- /mixbench-hip/main-hip.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-hip.cpp: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lhiputil.h" 12 | #include "mix_kernels_hip.h" 13 | #include "version_info.h" 14 | 15 | #define VECTOR_SIZE (32 * 1024 * 1024) 16 | 17 | void init_vector(double* v, size_t datasize) { 18 | for (int i = 0; i < (int)datasize; i++) 19 | v[i] = i; 20 | } 21 | 22 | int main(int argc, char* argv[]) { 23 | printf("mixbench-hip (%s)\n", VERSION_INFO); 24 | 25 | unsigned int datasize = VECTOR_SIZE * sizeof(double); 26 | 27 | HIP_SAFE_CALL(hipSetDevice(0)); 28 | StoreDeviceInfo(stdout); 29 | 30 | size_t freeCUDAMem, totalCUDAMem; 31 | HIP_SAFE_CALL(hipMemGetInfo(&freeCUDAMem, &totalCUDAMem)); 32 | printf("Total GPU memory %lu, free %lu\n", totalCUDAMem, freeCUDAMem); 33 | printf("Buffer size: %dMB\n", datasize / (1024 * 1024)); 34 | 35 | double* c; 36 | c = (double*)malloc(datasize); 37 | init_vector(c, VECTOR_SIZE); 38 | 39 | mixbenchGPU(c, VECTOR_SIZE); 40 | 41 | free(c); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /mixbench-opencl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7 FATAL_ERROR) 2 | project(mixbench-ocl LANGUAGES CXX) 3 | 4 | find_package(OpenCL REQUIRED) 5 | 6 | include_directories(${OpenCL_INCLUDE_DIR} "../include") 7 | 8 | # Get version info from git tag 9 | execute_process(COMMAND git describe --tags --always 10 | OUTPUT_VARIABLE GIT_REV 11 | ERROR_QUIET) 12 | 13 | # Store version info 14 | if ("${GIT_REV}" STREQUAL "") 15 | set(GIT_REV "") 16 | endif() 17 | string(STRIP "${GIT_REV}" GIT_REV) 18 | file(WRITE "version_info.h" "#define VERSION_INFO \"") 19 | file(APPEND "version_info.h" ${GIT_REV}) 20 | file(APPEND "version_info.h" "\"") 21 | 22 | add_executable(mixbench-ocl main-ocl.cpp loclutil.h mix_kernels_ocl.cpp mix_kernels_ocl.h loclutil.h) 23 | file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/mix_kernels.cl 24 | DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) 25 | 26 | try_compile(HAVE_HALF2 ${CMAKE_BINARY_DIR} 27 | ${CMAKE_SOURCE_DIR}/check-half2-def.cpp 28 | CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${OpenCL_INCLUDE_DIR}") 29 | 30 | if(NOT ${HAVE_HALF2}) 31 | message( "cl_half2 workaround path" ) 32 | target_compile_definitions(mixbench-ocl PRIVATE HF_WORKAROUND) 33 | endif() 34 | 35 | target_link_libraries(mixbench-ocl ${OpenCL_LIBRARY}) 36 | 
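# Note (see mixbench-opencl/README.md): if find_package(OpenCL) cannot locate the
# SDK automatically, the library and include paths can be passed explicitly at
# configure time, for example:
#   cmake ../mixbench-opencl \
#         -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so \
#         -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/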
-------------------------------------------------------------------------------- /mixbench-cpu/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-cpu 2 | 3 | This is the OpenMP implementation of mixbench, targeted to CPUs. 4 | Theoretically, it could also target GPU accelerators but it has been developed 5 | with the CPUs in mind. 6 | In particular, it has been tailored for GCC compiler (see below for more info). 7 | 8 | ## Running in docker 9 | 10 | The easiest way to run CPU version is by docker: 11 | `docker run --rm elkondis/mixbench-cpu` 12 | 13 | This docker image re-compiles by tuning on your CPU architecture and executes the 14 | benchmark. 15 | 16 | ## Notes 17 | 18 | `mixbench-cpu` has been developed with `g++` (`gcc`) in mind. 19 | As such, it has been validated on the particular compiler that it vectorizes and properly 20 | unrolls the vectorized instructions as intended, in order to approach peak performance. 21 | `clang` on the other hand, at the time of development, has been observed that it does not 22 | properly produce optimum machine instruction sequences. 23 | The nature of computations for loop iteration in this benchmark is inherently sequential. 24 | So, it is essential that the compiler adequatelly unrolls the loop in the generated code 25 | so the CPU does not stall due to dependencies. 26 | 27 | ## Building notes 28 | 29 | The proper flags passed to the compiler (`-fopenmp -march=native -funroll-loops`) is taken care 30 | by the CMakeLists script. 31 | Thus, a simple cmake build invocation should be enough. 32 | -------------------------------------------------------------------------------- /mixbench-opencl/mix_kernels.cl: -------------------------------------------------------------------------------- 1 | #ifdef ENABLE_DP 2 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 3 | #endif 4 | #ifdef ENABLE_HP 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #endif 7 | 8 | bool is_equal(const class_T a, const class_T b){ 9 | #ifdef ENABLE_HP 10 | return a.x==b.x && a.y==b.y; 11 | #else 12 | return a==b; 13 | #endif 14 | } 15 | 16 | __kernel __attribute__((reqd_work_group_size(blockdim, 1, 1))) 17 | void benchmark_func(class_T seed, global class_T *g_data){ 18 | const unsigned int blockSize = blockdim; 19 | #ifdef BLOCK_STRIDED 20 | const int stride = blockSize; 21 | const int idx = get_group_id(0)*blockSize*ELEMENTS_PER_THREAD + get_local_id(0); 22 | #else 23 | const int grid_size = blockSize * get_num_groups(0); 24 | const int stride = grid_size; 25 | const int idx = get_global_id(0); 26 | #endif 27 | const int big_stride = get_num_groups(0)*blockSize*ELEMENTS_PER_THREAD; 28 | 29 | class_T tmps[ELEMENTS_PER_THREAD]; 30 | for(int k=0; k 5 | **/ 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "mix_kernels_cpu.h" 15 | #include "version_info.h" 16 | 17 | constexpr auto DEF_VECTOR_SIZE_PER_THREAD = 4 * 1024 * 1024; 18 | 19 | using ArgParams = struct { unsigned int vecwidth; }; 20 | 21 | // Argument parsing 22 | // returns whether program execution should continue (true) or just print help 23 | // output (false) 24 | bool argument_parsing(int argc, char* argv[], ArgParams* output) { 25 | int arg_count = 0; 26 | for (int i = 1; i < argc; i++) { 27 | if ((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { 28 | return false; 29 | } else { 30 | unsigned long value = strtoul(argv[i], NULL, 10); 31 | switch (arg_count) { 32 | // device selection 33 | case 0: 
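// Note: unlike the GPU variants, the single positional argument here is the
// working array size in units of 1024*1024 elements, not a device index.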
34 | output->vecwidth = value; 35 | arg_count++; 36 | break; 37 | default: 38 | return false; 39 | } 40 | } 41 | } 42 | return true; 43 | } 44 | 45 | int main(int argc, char* argv[]) { 46 | std::cout << "mixbench-cpu (" << VERSION_INFO << ")" << std::endl; 47 | 48 | const auto hardware_concurrency = omp_get_max_threads(); 49 | 50 | ArgParams args{static_cast( 51 | hardware_concurrency * DEF_VECTOR_SIZE_PER_THREAD / (1024 * 1024))}; 52 | 53 | if (!argument_parsing(argc, argv, &args)) { 54 | std::cout << "Usage: mixbench-cpu [options] [array size(1024^2)]" 55 | << std::endl 56 | << std::endl 57 | << "Options:" << std::endl 58 | << " -h or --help Show this message" << std::endl; 59 | 60 | exit(1); 61 | } 62 | 63 | std::cout << "Use \"-h\" argument to see available options" << std::endl; 64 | 65 | const size_t VEC_WIDTH = 1024 * 1024 * args.vecwidth; 66 | 67 | std::unique_ptr c; 68 | 69 | c.reset(new (std::align_val_t(64)) double[VEC_WIDTH]); 70 | 71 | std::cout << "Working memory size: " << args.vecwidth * sizeof(double) << "MB" 72 | << std::endl; 73 | std::cout << "Total threads: " << hardware_concurrency << std::endl; 74 | 75 | mixbenchCPU(c.get(), VEC_WIDTH); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /mixbench-sycl/lsyclutil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * lsyclutil.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _CUTIL_H_ 8 | #define _CUTIL_H_ 9 | 10 | #include 11 | #include 12 | 13 | using namespace cl; 14 | 15 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 16 | 17 | 18 | // Print basic device information 19 | inline void StoreDeviceInfo(const sycl::device &device){ 20 | auto platform = device.get_platform(); 21 | try{ 22 | auto platform_name = platform.get_info(); 23 | auto device_name = device.get_info(); 24 | auto vendor_name = device.get_info(); 25 | auto device_drv = device.get_info(); 26 | 27 | auto device_addrbits = device.get_info(); 28 | auto device_freq = device.get_info(); 29 | auto device_gmem = device.get_info(); 30 | auto device_maxalloc = device.get_info(); 31 | auto device_syclver = device.get_info(); 32 | auto device_CUs = device.get_info(); 33 | 34 | std::cout << "------------------------ Device specifications ------------------------" << std::endl; 35 | std::cout << "Platform: " << platform_name << std::endl; 36 | std::cout << "Device: " << device_name << '/' << vendor_name << std::endl; 37 | std::cout << "Driver version: " << device_drv << std::endl; 38 | std::cout << "Address bits: " << device_addrbits << std::endl; 39 | std::cout << "GPU clock rate: " << device_freq << " MHz" << std::endl; 40 | std::cout << "Total global mem: " << device_gmem/1024/1024 << " MB" << std::endl; 41 | std::cout << "Max allowed buffer: " << device_maxalloc/1024/1024 << " MB" << std::endl; 42 | std::cout << "SYCL version: " << device_syclver << std::endl; 43 | std::cout << "Total CUs: " << device_CUs << std::endl; 44 | std::cout << "-----------------------------------------------------------------------" << std::endl; 45 | } 46 | catch (sycl::exception const &exc) { 47 | std::cerr << "Could not get full device info: "; 48 | std::cerr << exc.what() << std::endl; 49 | } 50 | } 51 | 52 | inline size_t GetMaxDeviceWGSize(const sycl::device &device){ 53 | auto wgsize = device.get_info(); 54 | return wgsize; 55 | } 56 | 57 | 58 | 
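// Usage sketch (cf. main-sycl.cpp): the helpers above are intended to be called
// right after device selection, e.g.:
//   sycl::device device = sycl::device::get_devices().at(device_index - 1);
//   StoreDeviceInfo(device);                           // print device specifications
//   const size_t max_wg = GetMaxDeviceWGSize(device);  // bound for the workgroup-size argument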
#endif 59 | -------------------------------------------------------------------------------- /mixbench-sycl/main-sycl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-sycl.cpp: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include "lsyclutil.h" 10 | #include "mix_kernels_sycl.h" 11 | #include "version_info.h" 12 | 13 | #define DEF_VECTOR_SIZE (32*1024*1024) 14 | 15 | typedef struct{ 16 | int device_index; 17 | bool use_os_timer; 18 | int wg_size; 19 | unsigned int vecwidth; 20 | } ArgParams; 21 | 22 | // Argument parsing 23 | // returns whether program execution should continue (true) or just print help output (false) 24 | bool argument_parsing(int argc, char* argv[], ArgParams *output){ 25 | int arg_count = 0; 26 | for(int i=1; iuse_os_timer = true; 31 | } else { 32 | unsigned long value = strtoul(argv[i], NULL, 10); 33 | switch( arg_count ){ 34 | // device selection 35 | case 0: 36 | output->device_index = value; 37 | arg_count++; 38 | break; 39 | // workgroup size 40 | case 1: 41 | output->wg_size = value; 42 | arg_count++; 43 | break; 44 | // array size (x1024^2) 45 | case 2: 46 | output->vecwidth = value; 47 | arg_count++; 48 | break; 49 | default: 50 | return false; 51 | } 52 | } 53 | } 54 | return true; 55 | } 56 | 57 | int main(int argc, char* argv[]) { 58 | std::cout << "mixbench-sycl (" << VERSION_INFO << ")" << std::endl; 59 | 60 | ArgParams args{1, false, 256, DEF_VECTOR_SIZE/(1024*1024)}; 61 | 62 | if (!argument_parsing(argc, argv, &args)) { 63 | std::cout << "Usage: mixbench-sycl [options] [device index [workgroup size [array size(1024^2)]]]" << std::endl 64 | << std::endl 65 | << "Options:" << std::endl 66 | << " -h or --help Show this message" << std::endl 67 | << " -t or --use-os-timer Use standard OS timer instead of SYCL profiling timer" << std::endl; 68 | 69 | auto devices = sycl::device::get_devices(); 70 | std::cout << "Available SYCL devices:" << std::endl; 71 | int cur_dev_idx = 1; 72 | for(auto device:devices){ 73 | std::cout << " " << cur_dev_idx++ << ". " << device.get_info() << '/' 74 | << device.get_platform().get_info() << std::endl; 75 | } 76 | exit(1); 77 | } 78 | 79 | std::cout << "Use \"-h\" argument to see available options" << std::endl; 80 | 81 | const size_t VEC_WIDTH = 1024*1024*args.vecwidth; 82 | unsigned int datasize = VEC_WIDTH*sizeof(double); 83 | 84 | std::unique_ptr c(new double[VEC_WIDTH]); 85 | 86 | try { 87 | sycl::device device = sycl::device::get_devices().at(args.device_index-1); 88 | 89 | StoreDeviceInfo(device); 90 | 91 | const size_t totalDeviceMem = device.get_info(); 92 | std::cout << "Total GPU memory: " << totalDeviceMem << std::endl; 93 | std::cout << "Buffer size: " << datasize/(1024*1024) << "MB" << std::endl; 94 | 95 | mixbenchGPU(device, c.get(), VEC_WIDTH, args.use_os_timer, args.wg_size); 96 | } 97 | catch (sycl::exception const &exc) { 98 | std::cerr << "SYCL exception caught: " << exc.what(); 99 | std::exit(1); 100 | } 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /mixbench-opencl/main-ocl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-ocl.cpp: This file is part of the mixbench GPU micro-benchmark suite. 
3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include "loclutil.h" 11 | #include "mix_kernels_ocl.h" 12 | #include "version_info.h" 13 | 14 | #define DEF_VECTOR_SIZE (32*1024*1024) 15 | 16 | typedef struct{ 17 | int device_index; 18 | bool block_strided; 19 | bool host_allocated; 20 | bool use_os_timer; 21 | int wg_size; 22 | unsigned int vecwidth; 23 | unsigned int elements_per_wi; 24 | unsigned int fusion_degree; 25 | } ArgParams; 26 | 27 | // Argument parsing 28 | // returns whether program execution should continue (true) or just print help output (false) 29 | bool argument_parsing(int argc, char* argv[], ArgParams *output){ 30 | int arg_count = 0; 31 | for(int i=1; iblock_strided = true; 36 | } else if( (strcmp(argv[i], "-H")==0) || (strcmp(argv[i], "--host-alloc")==0) ) { 37 | output->host_allocated = true; 38 | } else if( (strcmp(argv[i], "-t")==0) || (strcmp(argv[i], "--use-os-timer")==0) ) { 39 | output->use_os_timer = true; 40 | } else { 41 | unsigned long value = strtoul(argv[i], NULL, 10); 42 | switch( arg_count ){ 43 | // device selection 44 | case 0: 45 | output->device_index = value; 46 | arg_count++; 47 | break; 48 | // workgroup size 49 | case 1: 50 | output->wg_size = value; 51 | arg_count++; 52 | break; 53 | // array size (x1024^2) 54 | case 2: 55 | output->vecwidth = value; 56 | arg_count++; 57 | break; 58 | // elements per workitem 59 | case 3: 60 | output->elements_per_wi = value; 61 | arg_count++; 62 | break; 63 | case 4: 64 | output->fusion_degree = value; 65 | arg_count++; 66 | break; 67 | default: 68 | return false; 69 | } 70 | } 71 | } 72 | return true; 73 | } 74 | 75 | int main(int argc, char* argv[]) { 76 | printf("mixbench-ocl (%s)\n", VERSION_INFO); 77 | 78 | ArgParams args = {1, false, false, false, 256, DEF_VECTOR_SIZE/(1024*1024), 8, 4}; 79 | 80 | if( !argument_parsing(argc, argv, &args) ){ 81 | printf("Usage: mixbench-ocl [options] [device index [workgroup size [array size(1024^2) [elements per workitem [fusion degree]]]]]\n"); 82 | printf("\nOptions:\n" 83 | " -h or --help Show this message\n" 84 | " -H or --host-alloc Use host allocated buffer (CL_MEM_ALLOC_HOST_PTR)\n" 85 | " -w or --workgroup-stride Workitem strides equal to the width of a workgroup length (default: NDRange length)\n" 86 | " -t or --use-os-timer Use standard OS timer instead of OpenCL profiling timer\n" 87 | "\n"); 88 | 89 | GetDeviceID(0, stdout); 90 | exit(1); 91 | } 92 | 93 | printf("Use \"-h\" argument to see available options\n"); 94 | 95 | const size_t VEC_WIDTH = 1024*1024*args.vecwidth; 96 | unsigned int datasize = VEC_WIDTH*sizeof(double); 97 | 98 | cl_device_id dev_id = GetDeviceID(args.device_index, NULL); 99 | 100 | if( dev_id == NULL ){ 101 | fprintf(stderr, "Error: No OpenCL device selected\n"); 102 | exit(1); 103 | } 104 | StoreDeviceInfo(dev_id, stdout); 105 | 106 | printf("Buffer size: %dMB\n", datasize/(1024*1024)); 107 | printf("Workgroup size: %d\n", args.wg_size); 108 | printf("Elements per workitem: %d\n", args.elements_per_wi); 109 | printf("Workitem fusion degree: %d\n", args.fusion_degree); 110 | // Check if selected workgroup size is supported 111 | if( GetMaxDeviceWGSize(dev_id)<(size_t)args.wg_size ){ 112 | fprintf(stderr, "Error: Unsupported workgroup size (%u).\n", args.wg_size); 113 | exit(1); 114 | } 115 | 116 | double *c; 117 | c = (double*)malloc(datasize); 118 | 119 | mixbenchGPU(dev_id, c, VEC_WIDTH, args.block_strided, args.host_allocated, args.use_os_timer, args.wg_size, args.elements_per_wi, 
args.fusion_degree); 120 | 121 | free(c); 122 | 123 | return 0; 124 | } 125 | -------------------------------------------------------------------------------- /mixbench-hip/lhiputil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * lhiputil.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _HIPUTIL_H_ 8 | #define _HIPUTIL_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #define HIP_SAFE_CALL( call) { \ 15 | hipError_t err = call; \ 16 | if( hipSuccess != err) { \ 17 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 18 | __FILE__, __LINE__, hipGetErrorString( err) ); \ 19 | exit(EXIT_FAILURE); \ 20 | } } 21 | 22 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 23 | 24 | static inline int _ConvertSMVer2Cores(int major, int minor){ 25 | #ifdef __HIP_PLATFORM_HCC__ 26 | return 64; 27 | #else 28 | switch(major){ 29 | case 1: return 8; 30 | case 2: switch(minor){ 31 | case 1: return 48; 32 | default: return 32; 33 | } 34 | case 3: return 192; 35 | default: return 128; 36 | } 37 | #endif 38 | } 39 | 40 | static inline void GetDevicePeakInfo(double *aGIPS, double *aGBPS, hipDeviceProp_t *aDeviceProp = NULL){ 41 | hipDeviceProp_t deviceProp; 42 | int current_device; 43 | if( aDeviceProp ) 44 | deviceProp = *aDeviceProp; 45 | else{ 46 | HIP_SAFE_CALL( hipGetDevice(¤t_device) ); 47 | HIP_SAFE_CALL( hipGetDeviceProperties(&deviceProp, current_device) ); 48 | } 49 | const int TotalSPs = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)*deviceProp.multiProcessorCount; 50 | *aGIPS = 1000.0 * deviceProp.clockRate * TotalSPs / (1000.0 * 1000.0 * 1000.0); // Giga instructions/sec 51 | *aGBPS = 2.0 * (double)deviceProp.memoryClockRate * 1000.0 * (double)deviceProp.memoryBusWidth / 8.0; 52 | } 53 | 54 | static inline hipDeviceProp_t GetDeviceProperties(void){ 55 | hipDeviceProp_t deviceProp; 56 | int current_device; 57 | HIP_SAFE_CALL( hipGetDevice(¤t_device) ); 58 | HIP_SAFE_CALL( hipGetDeviceProperties(&deviceProp, current_device) ); 59 | return deviceProp; 60 | } 61 | 62 | // Print basic device information 63 | static void StoreDeviceInfo(FILE *fout){ 64 | hipDeviceProp_t deviceProp; 65 | int current_device, driver_version; 66 | HIP_SAFE_CALL( hipGetDevice(¤t_device) ); 67 | HIP_SAFE_CALL( hipGetDeviceProperties(&deviceProp, current_device) ); 68 | HIP_SAFE_CALL( hipDriverGetVersion(&driver_version) ); 69 | fprintf(fout, "------------------------ Device specifications ------------------------\n"); 70 | fprintf(fout, "Device: %s\n", deviceProp.name); 71 | fprintf(fout, "CUDA driver version: %d.%d\n", driver_version/1000, driver_version%1000); 72 | fprintf(fout, "GPU clock rate: %d MHz\n", deviceProp.clockRate/1000); 73 | //fprintf(fout, "Memory clock rate: %d MHz\n", deviceProp.memoryClockRate/1000/2); 74 | //fprintf(fout, "Memory bus width: %d bits\n", deviceProp.memoryBusWidth); 75 | fprintf(fout, "WarpSize: %d\n", deviceProp.warpSize); 76 | fprintf(fout, "L2 cache size: %d KB\n", deviceProp.l2CacheSize/1024); 77 | fprintf(fout, "Total global mem: %d MB\n", (int)(deviceProp.totalGlobalMem/1024/1024)); 78 | //fprintf(fout, "ECC enabled: %s\n", deviceProp.ECCEnabled?"Yes":"No"); 79 | #ifdef __HIP_PLATFORM_NVCC__ 80 | fprintf(fout, "Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor); 81 | #endif 82 | const int TotalSPs = _ConvertSMVer2Cores(deviceProp.major, 
deviceProp.minor)*deviceProp.multiProcessorCount; 83 | fprintf(fout, "Total SPs: %d (%d MPs x %d SPs/MP)\n", TotalSPs, deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 84 | double InstrThroughput, MemBandwidth; 85 | GetDevicePeakInfo(&InstrThroughput, &MemBandwidth, &deviceProp); 86 | fprintf(fout, "Compute throughput: %.2f GFlops (theoretical single precision FMAs)\n", 2.0*InstrThroughput); 87 | fprintf(fout, "Memory bandwidth: %.2f GB/sec\n", MemBandwidth/(1000.0*1000.0*1000.0)); 88 | fprintf(fout, "-----------------------------------------------------------------------\n"); 89 | } 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /mixbench-opencl/loclutil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * loclutil.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _OCLUTIL_H_ 8 | #define _OCLUTIL_H_ 9 | 10 | #include 11 | #include 12 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 13 | #define CL_TARGET_OPENCL_VERSION 120 14 | #include 15 | 16 | #if defined(_MSC_VER) 17 | #include 18 | #define alloca _alloca 19 | #endif 20 | 21 | #define OCL_SAFE_CALL(call) { \ 22 | cl_int err = call; \ 23 | if( CL_SUCCESS != err) { \ 24 | fprintf(stderr, "OpenCL error in file '%s' in line %i : Code %d.\n", \ 25 | __FILE__, __LINE__, err ); \ 26 | exit(EXIT_FAILURE); \ 27 | } } 28 | 29 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 30 | 31 | inline cl_device_id GetDeviceID(int index, FILE *fout){ 32 | cl_uint cnt_platforms, cnt_device_ids; 33 | cl_device_id device_selected = NULL; 34 | char dev_name[256], plat_name[256]; 35 | 36 | OCL_SAFE_CALL( clGetPlatformIDs(0, NULL, &cnt_platforms) ); 37 | cl_platform_id *platform_ids = (cl_platform_id*)alloca(sizeof(cl_platform_id)*cnt_platforms); 38 | cl_device_id device_ids[256]; 39 | OCL_SAFE_CALL( clGetPlatformIDs(cnt_platforms, platform_ids, NULL) ); 40 | 41 | if( fout ) 42 | fprintf(fout, "Available OpenCL devices:\n"); 43 | int cur_dev_idx = 1; 44 | for(int i=0; i<(int)cnt_platforms; i++){ 45 | size_t sz_name_len; 46 | OCL_SAFE_CALL( clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 0, NULL, &sz_name_len) ); 47 | sz_name_len = sz_name_len>sizeof(plat_name) ? sizeof(plat_name) : sz_name_len; 48 | OCL_SAFE_CALL( clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sz_name_len, plat_name, NULL) ); 49 | 50 | OCL_SAFE_CALL( clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &cnt_device_ids) ); 51 | OCL_SAFE_CALL( clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, cnt_device_ids, device_ids, NULL) ); 52 | for(int d=0; d<(int)cnt_device_ids; d++){ 53 | if( fout ){ 54 | OCL_SAFE_CALL( clGetDeviceInfo(device_ids[d], CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL) ); 55 | fprintf(fout, " %d. 
%s/%s\n", cur_dev_idx, dev_name, plat_name); 56 | } 57 | if( cur_dev_idx==index ) 58 | device_selected = device_ids[d]; 59 | cur_dev_idx++; 60 | } 61 | } 62 | return device_selected; 63 | } 64 | 65 | // Print basic device information 66 | inline void StoreDeviceInfo(cl_device_id devID, FILE *fout){ 67 | char dev_platform[256], dev_name[256], dev_vendor[256], dev_clver[256], dev_drv[256]; 68 | cl_uint dev_freq, dev_cus, dev_addrbits; 69 | cl_ulong dev_gmem, dev_maxalloc; 70 | cl_platform_id dev_platform_id; 71 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_PLATFORM, sizeof(dev_platform_id), &dev_platform_id, NULL) ); 72 | OCL_SAFE_CALL( clGetPlatformInfo(dev_platform_id, CL_PLATFORM_NAME, sizeof(dev_platform), dev_platform, NULL) ); 73 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL) ); 74 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL) ); 75 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_VERSION, sizeof(dev_clver), dev_clver, NULL) ); 76 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_ADDRESS_BITS, sizeof(dev_addrbits), &dev_addrbits, NULL) ); 77 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(dev_freq), &dev_freq, NULL) ); 78 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(dev_gmem), &dev_gmem, NULL) ); 79 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(dev_maxalloc), &dev_maxalloc, NULL) ); 80 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DRIVER_VERSION, sizeof(dev_drv), dev_drv, NULL) ); 81 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(dev_cus), &dev_cus, NULL) ); 82 | fprintf(fout, "------------------------ Device specifications ------------------------\n"); 83 | fprintf(fout, "Platform: %s\n", dev_platform); 84 | fprintf(fout, "Device: %s/%s\n", dev_name, dev_vendor); 85 | fprintf(fout, "Driver version: %s\n", dev_drv); 86 | fprintf(fout, "Address bits: %d\n", dev_addrbits); 87 | fprintf(fout, "GPU clock rate: %d MHz\n", dev_freq); 88 | fprintf(fout, "Total global mem: %d MB\n", (int)(dev_gmem/1024/1024)); 89 | fprintf(fout, "Max allowed buffer: %d MB\n", (int)(dev_maxalloc/1024/1024)); 90 | fprintf(fout, "OpenCL version: %s\n", dev_clver); 91 | fprintf(fout, "Total CUs: %d\n", dev_cus); 92 | fprintf(fout, "-----------------------------------------------------------------------\n"); 93 | } 94 | 95 | inline size_t GetMaxDeviceWGSize(cl_device_id devID){ 96 | size_t wgsize; 97 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(wgsize), &wgsize, NULL) ); 98 | return wgsize; 99 | } 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /mixbench-cuda/lcutil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * lcutil.h: This file is part of the mixbench GPU micro-benchmark suite. 
3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _CUTIL_H_ 8 | #define _CUTIL_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #define CUDA_SAFE_CALL( call) { \ 16 | cudaError err = call; \ 17 | if( cudaSuccess != err) { \ 18 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 19 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 20 | exit(EXIT_FAILURE); \ 21 | } } 22 | 23 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 24 | 25 | static inline int _ConvertSMVer2Cores(int major, int minor) { 26 | switch (major) { 27 | case 1: 28 | return 8; 29 | case 2: 30 | switch (minor) { 31 | case 1: 32 | return 48; 33 | default: 34 | return 32; 35 | } 36 | case 3: 37 | return 192; 38 | case 6: 39 | switch (minor) { 40 | case 0: 41 | return 64; 42 | default: 43 | return 128; 44 | } 45 | case 7: 46 | return 64; 47 | case 8: 48 | switch (minor) { 49 | case 0: 50 | return 64; 51 | default: 52 | return 128; 53 | } 54 | default: 55 | return 128; 56 | } 57 | } 58 | 59 | static inline bool IsFP16Supported(void){ 60 | cudaDeviceProp deviceProp; 61 | int current_device; 62 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 63 | CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, current_device) ); 64 | return deviceProp.major>5 || (deviceProp.major == 5 && deviceProp.minor == 3); 65 | } 66 | 67 | static inline void GetDevicePeakInfo(double *aGIPS, double *aGBPS){ 68 | int current_device; 69 | int major, minor; 70 | int memoryBusWidth, clockRate, memoryClockRate, multiProcessorCount; 71 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 72 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device) ); 73 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device) ); 74 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device) ); 75 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&memoryBusWidth, cudaDevAttrGlobalMemoryBusWidth, current_device) ); 76 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device) ); 77 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, current_device) ); 78 | const int TotalSPs = _ConvertSMVer2Cores(major, minor)*multiProcessorCount; 79 | *aGIPS = 1000.0 * clockRate * TotalSPs / (1000.0 * 1000.0 * 1000.0); // Giga instructions/sec 80 | *aGBPS = 2.0 * (double)memoryClockRate * 1000.0 * (double)memoryBusWidth / 8.0; 81 | } 82 | 83 | static inline cudaDeviceProp GetDeviceProperties(void){ 84 | cudaDeviceProp deviceProp; 85 | int current_device; 86 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 87 | CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, current_device) ); 88 | return deviceProp; 89 | } 90 | 91 | // Print basic device information 92 | static void StoreDeviceInfo(FILE *fout){ 93 | cudaDeviceProp deviceProp; 94 | int current_device, driver_version; 95 | int clockRate, memoryClockRate; 96 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 97 | CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, current_device) ); 98 | CUDA_SAFE_CALL( cudaDriverGetVersion(&driver_version) ); 99 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device) ); 100 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, current_device) ); 101 | fprintf(fout, "------------------------ Device specifications ------------------------\n"); 102 | fprintf(fout, "Device: 
%s\n", deviceProp.name); 103 | fprintf(fout, "CUDA driver version: %d.%d\n", driver_version/1000, driver_version%1000); 104 | fprintf(fout, "GPU clock rate: %d MHz\n", clockRate/1000); 105 | fprintf(fout, "Memory clock rate: %d MHz\n", memoryClockRate/1000/2); 106 | fprintf(fout, "Memory bus width: %d bits\n", deviceProp.memoryBusWidth); 107 | fprintf(fout, "WarpSize: %d\n", deviceProp.warpSize); 108 | fprintf(fout, "L2 cache size: %d KB\n", deviceProp.l2CacheSize/1024); 109 | fprintf(fout, "Total global mem: %d MB\n", (int)(deviceProp.totalGlobalMem/1024/1024)); 110 | fprintf(fout, "ECC enabled: %s\n", deviceProp.ECCEnabled?"Yes":"No"); 111 | fprintf(fout, "Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor); 112 | const int TotalSPs = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)*deviceProp.multiProcessorCount; 113 | fprintf(fout, "Total SPs: %d (%d MPs x %d SPs/MP)\n", TotalSPs, deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 114 | double InstrThroughput, MemBandwidth; 115 | GetDevicePeakInfo(&InstrThroughput, &MemBandwidth); 116 | fprintf(fout, "Compute throughput: %.2f GFlops (theoretical single precision FMAs)\n", 2.0*InstrThroughput); 117 | fprintf(fout, "Memory bandwidth: %.2f GB/sec\n", MemBandwidth/(1000.0*1000.0*1000.0)); 118 | fprintf(fout, "-----------------------------------------------------------------------\n"); 119 | } 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Chromium 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveMacros: false 7 | AlignConsecutiveAssignments: false 8 | AlignConsecutiveBitFields: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: Align 12 | AlignTrailingComments: true 13 | AllowAllArgumentsOnNextLine: true 14 | AllowAllConstructorInitializersOnNextLine: true 15 | AllowAllParametersOfDeclarationOnNextLine: false 16 | AllowShortEnumsOnASingleLine: true 17 | AllowShortBlocksOnASingleLine: Never 18 | AllowShortCaseLabelsOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: Inline 20 | AllowShortLambdasOnASingleLine: All 21 | AllowShortIfStatementsOnASingleLine: Never 22 | AllowShortLoopsOnASingleLine: false 23 | AlwaysBreakAfterDefinitionReturnType: None 24 | AlwaysBreakAfterReturnType: None 25 | AlwaysBreakBeforeMultilineStrings: true 26 | AlwaysBreakTemplateDeclarations: Yes 27 | BinPackArguments: true 28 | BinPackParameters: false 29 | BraceWrapping: 30 | AfterCaseLabel: false 31 | AfterClass: false 32 | AfterControlStatement: Never 33 | AfterEnum: false 34 | AfterFunction: false 35 | AfterNamespace: false 36 | AfterObjCDeclaration: false 37 | AfterStruct: false 38 | AfterUnion: false 39 | AfterExternBlock: false 40 | BeforeCatch: false 41 | BeforeElse: false 42 | BeforeLambdaBody: false 43 | BeforeWhile: false 44 | IndentBraces: false 45 | SplitEmptyFunction: true 46 | SplitEmptyRecord: true 47 | SplitEmptyNamespace: true 48 | BreakBeforeBinaryOperators: None 49 | BreakBeforeBraces: Attach 50 | BreakBeforeInheritanceComma: false 51 | BreakInheritanceList: BeforeColon 52 | BreakBeforeTernaryOperators: true 53 | BreakConstructorInitializersBeforeComma: false 54 | BreakConstructorInitializers: BeforeColon 55 | BreakAfterJavaFieldAnnotations: false 56 | BreakStringLiterals: true 57 | ColumnLimit: 80 58 | 
CommentPragmas: '^ IWYU pragma:' 59 | CompactNamespaces: false 60 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 61 | ConstructorInitializerIndentWidth: 4 62 | ContinuationIndentWidth: 4 63 | Cpp11BracedListStyle: true 64 | DeriveLineEnding: true 65 | DerivePointerAlignment: false 66 | DisableFormat: false 67 | ExperimentalAutoDetectBinPacking: false 68 | FixNamespaceComments: true 69 | ForEachMacros: 70 | - foreach 71 | - Q_FOREACH 72 | - BOOST_FOREACH 73 | IncludeBlocks: Preserve 74 | IncludeCategories: 75 | - Regex: '^' 76 | Priority: 2 77 | SortPriority: 0 78 | - Regex: '^<.*\.h>' 79 | Priority: 1 80 | SortPriority: 0 81 | - Regex: '^<.*' 82 | Priority: 2 83 | SortPriority: 0 84 | - Regex: '.*' 85 | Priority: 3 86 | SortPriority: 0 87 | IncludeIsMainRegex: '([-_](test|unittest))?$' 88 | IncludeIsMainSourceRegex: '' 89 | IndentCaseLabels: true 90 | IndentCaseBlocks: false 91 | IndentGotoLabels: true 92 | IndentPPDirectives: None 93 | IndentExternBlock: AfterExternBlock 94 | IndentWidth: 2 95 | IndentWrappedFunctionNames: false 96 | InsertTrailingCommas: None 97 | JavaScriptQuotes: Leave 98 | JavaScriptWrapImports: true 99 | KeepEmptyLinesAtTheStartOfBlocks: false 100 | MacroBlockBegin: '' 101 | MacroBlockEnd: '' 102 | MaxEmptyLinesToKeep: 1 103 | NamespaceIndentation: None 104 | ObjCBinPackProtocolList: Never 105 | ObjCBlockIndentWidth: 2 106 | ObjCBreakBeforeNestedBlockParam: true 107 | ObjCSpaceAfterProperty: false 108 | ObjCSpaceBeforeProtocolList: true 109 | PenaltyBreakAssignment: 2 110 | PenaltyBreakBeforeFirstCallParameter: 1 111 | PenaltyBreakComment: 300 112 | PenaltyBreakFirstLessLess: 120 113 | PenaltyBreakString: 1000 114 | PenaltyBreakTemplateDeclaration: 10 115 | PenaltyExcessCharacter: 1000000 116 | PenaltyReturnTypeOnItsOwnLine: 200 117 | PointerAlignment: Left 118 | RawStringFormats: 119 | - Language: Cpp 120 | Delimiters: 121 | - cc 122 | - CC 123 | - cpp 124 | - Cpp 125 | - CPP 126 | - 'c++' 127 | - 'C++' 128 | CanonicalDelimiter: '' 129 | BasedOnStyle: google 130 | - Language: TextProto 131 | Delimiters: 132 | - pb 133 | - PB 134 | - proto 135 | - PROTO 136 | EnclosingFunctions: 137 | - EqualsProto 138 | - EquivToProto 139 | - PARSE_PARTIAL_TEXT_PROTO 140 | - PARSE_TEST_PROTO 141 | - PARSE_TEXT_PROTO 142 | - ParseTextOrDie 143 | - ParseTextProtoOrDie 144 | - ParseTestProto 145 | - ParsePartialTestProto 146 | CanonicalDelimiter: '' 147 | BasedOnStyle: google 148 | ReflowComments: true 149 | SortIncludes: true 150 | SortUsingDeclarations: true 151 | SpaceAfterCStyleCast: false 152 | SpaceAfterLogicalNot: false 153 | SpaceAfterTemplateKeyword: true 154 | SpaceBeforeAssignmentOperators: true 155 | SpaceBeforeCpp11BracedList: false 156 | SpaceBeforeCtorInitializerColon: true 157 | SpaceBeforeInheritanceColon: true 158 | SpaceBeforeParens: ControlStatements 159 | SpaceBeforeRangeBasedForLoopColon: true 160 | SpaceInEmptyBlock: false 161 | SpaceInEmptyParentheses: false 162 | SpacesBeforeTrailingComments: 2 163 | SpacesInAngles: false 164 | SpacesInConditionalStatement: false 165 | SpacesInContainerLiterals: true 166 | SpacesInCStyleCastParentheses: false 167 | SpacesInParentheses: false 168 | SpacesInSquareBrackets: false 169 | SpaceBeforeSquareBrackets: false 170 | Standard: Auto 171 | StatementMacros: 172 | - Q_UNUSED 173 | - QT_REQUIRE_VERSION 174 | TabWidth: 8 175 | UseCRLF: false 176 | UseTab: Never 177 | WhitespaceSensitiveMacros: 178 | - STRINGIZE 179 | - PP_STRINGIZE 180 | - BOOST_PP_STRINGIZE 181 | ... 
182 | 183 | -------------------------------------------------------------------------------- /mixbench-hip/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-hip 2 | 3 | This is the HIP (AMD ROCm) implementation of mixbench. 4 | 5 | ## Building notes 6 | 7 | For HIP version, the HIP_PATH environment variable should be set to point to HIP installation directory. For more information follow the instructions on the following blog to properly install ROCK and ROCR drivers: 8 | * ROCm: 9 | https://github.com/RadeonOpenCompute/ROCm 10 | * HIP: 11 | https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP 12 | 13 | In case you want to retain the kernels' assembly code, you may pass the following parameter: 14 | ``` 15 | cmake ../mixbench-hip/ -D HIP_HIPCC_FLAGS="-save-temps" 16 | ``` 17 | 18 | ## Execution 19 | 20 | ``` 21 | $ ./mixbench-hip 22 | mixbench-hip (v0.04-1-g7a068df) 23 | ------------------------ Device specifications ------------------------ 24 | Device: 25 | CUDA driver version: 50221.151 26 | GPU clock rate: 1700 MHz 27 | WarpSize: 64 28 | L2 cache size: 8192 KB 29 | Total global mem: 65520 MB 30 | Total SPs: 6656 (104 MPs x 64 SPs/MP) 31 | Compute throughput: 22630.40 GFlops (theoretical single precision FMAs) 32 | Memory bandwidth: 1638.40 GB/sec 33 | ----------------------------------------------------------------------- 34 | Total GPU memory 68702699520, free 68702699520 35 | Buffer size: 256MB 36 | Trade-off type: compute with global memory (block strided) 37 | Elements per thread: 8 38 | Thread fusion degree: 1 39 | ----------------------------------------------------------------------------- CSV data ------------------------------------------------------------------------------------------------------------------- 40 | Experiment ID, Single Precision ops,,,, Packed Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, 41 | Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec 42 | 0, 0.250, 0.10, 350.12,1400.46, 0.250, 0.18, 363.14,1452.57, 0.125, 0.19, 180.48,1443.83, 0.500, 0.10, 697.89,1395.78, 0.250, 0.10, 352.47,1409.86 43 | 1, 0.750, 0.10, 1052.08,1402.78, 0.750, 0.18, 1097.99,1463.99, 0.375, 0.18, 549.00,1463.99, 1.500, 0.10, 2097.15,1398.10, 0.750, 0.10, 1055.61,1407.48 44 | 2, 1.250, 0.10, 1753.49,1402.79, 1.250, 0.18, 1823.61,1458.89, 0.625, 0.19, 904.73,1447.57, 2.500, 0.10, 3501.09,1400.44, 1.250, 0.10, 1753.47,1402.78 45 | 3, 1.750, 0.10, 2442.61,1395.78, 1.750, 0.18, 2555.29,1460.17, 0.875, 0.18, 1273.21,1455.10, 3.500, 0.10, 4901.52,1400.44, 1.750, 0.10, 2458.97,1405.13 46 | 4, 2.250, 0.10, 3130.08,1391.15, 2.250, 0.18, 3279.66,1457.63, 1.125, 0.19, 1629.91,1448.81, 4.500, 0.10, 6291.52,1398.12, 2.250, 0.10, 3172.16,1409.85 47 | 5, 2.750, 0.10, 3870.58,1407.48, 2.750, 0.18, 4015.46,1460.17, 1.375, 0.18, 2009.48,1461.44, 5.500, 0.10, 7664.01,1393.46, 2.750, 0.10, 3851.20,1400.44 48 | 6, 3.250, 0.10, 4506.28,1386.55, 3.250, 0.18, 4766.28,1466.55, 1.625, 0.18, 2387.30,1469.11, 6.500, 0.10, 9072.63,1395.79, 3.250, 0.10, 4536.27,1395.78 49 | 7, 3.750, 0.10, 5251.63,1400.44, 3.750, 0.18, 5466.11,1457.63, 1.875, 0.18, 2723.59,1452.58, 7.500, 0.10,10433.59,1391.15, 3.750, 0.10, 5234.21,1395.79 50 | 8, 4.250, 0.10, 5922.19,1393.46, 4.250, 0.18, 6205.71,1460.17, 2.125, 0.18, 3100.14,1458.89, 8.500, 0.10,11785.77,1386.56, 4.250, 0.10, 
5932.04,1395.78 51 | 9, 4.750, 0.10, 6641.05,1398.12, 4.750, 0.18, 6966.06,1466.54, 2.375, 0.18, 3480.01,1465.27, 9.500, 0.10,13194.00,1388.84, 4.750, 0.10, 6542.91,1377.45 52 | 10, 5.250, 0.10, 7376.92,1405.13, 5.250, 0.18, 7712.86,1469.12, 2.625, 0.18, 3853.04,1467.82, 10.500, 0.10,14631.45,1393.47, 5.250, 0.10, 7046.43,1342.18 53 | 11, 5.750, 0.10, 7999.17,1391.16, 5.750, 0.18, 8388.61,1458.89, 2.875, 0.18, 4205.30,1462.71, 11.500, 0.10,15918.98,1384.26, 5.750, 0.11, 7330.54,1274.88 54 | 12, 6.250, 0.10, 8738.13,1398.10, 6.250, 0.19, 9047.30,1447.57, 3.125, 0.18, 4559.05,1458.90, 12.500, 0.10,17360.53,1388.84, 6.250, 0.11, 7710.12,1233.62 55 | 13, 6.750, 0.10, 9374.78,1388.86, 6.750, 0.18, 9881.87,1463.98, 3.375, 0.18, 4940.96,1463.99, 13.500, 0.10,18444.01,1366.22, 6.750, 0.12, 7853.41,1163.47 56 | 14, 7.250, 0.10,10035.88,1384.26, 7.250, 0.18,10540.34,1453.84, 3.625, 0.18, 5288.50,1458.90, 14.500, 0.10,20104.93,1386.55, 7.250, 0.12, 8034.07,1108.15 57 | 15, 7.750, 0.10,10853.37,1400.44, 7.750, 0.18,11326.14,1461.44, 3.875, 0.18, 5677.88,1465.26, 15.500, 0.10,21385.65,1379.72, 7.750, 0.13, 8167.30,1053.85 58 | 16, 8.250, 0.10,11534.46,1398.12, 8.250, 0.18,12067.31,1462.70, 4.125, 0.18, 6033.69,1462.71, 16.500, 0.10,22840.27,1384.26, 8.250, 0.13, 8328.10,1009.47 59 | 17, 8.750, 0.10,12213.03,1395.78, 8.750, 0.18,12843.52,1467.83, 4.375, 0.18, 6467.02,1478.18, 17.500, 0.10,23947.90,1368.45, 8.750, 0.14, 8475.79, 968.66 60 | 18, 9.250, 0.10,12783.30,1381.98, 9.250, 0.18,13541.89,1463.99, 4.625, 0.18, 6794.63,1469.11, 18.500, 0.10,25275.38,1366.24, 9.250, 0.15, 8555.09, 924.87 61 | 20, 10.250, 0.10,14118.90,1377.45, 10.250, 0.18,15098.02,1472.98, 5.125, 0.18, 7470.35,1457.63, 20.500, 0.10,26869.76,1310.72, 10.250, 0.16, 8764.91, 855.11 62 | 22, 11.250, 0.10,15624.48,1388.84, 11.250, 0.18,16629.49,1478.18, 5.625, 0.18, 8307.43,1476.88, 22.500, 0.11,28170.70,1252.03, 11.250, 0.17, 8844.65, 786.19 63 | 24, 12.250, 0.10,16845.98,1375.18, 12.250, 0.18,18059.93,1474.28, 6.125, 0.18, 8966.93,1463.99, 24.500, 0.11,29193.31,1191.56, 12.250, 0.18, 9014.12, 735.85 64 | 28, 14.250, 0.10,18390.41,1290.56, 14.250, 0.19,20381.64,1430.29, 7.125, 0.19,10269.56,1441.34, 28.500, 0.12,30690.03,1076.84, 14.250, 0.21, 9202.33, 645.78 65 | 32, 16.250, 0.11,19390.45,1193.26, 16.250, 0.19,23442.07,1442.59, 8.125, 0.19,11571.78,1424.22, 32.500, 0.14,31886.52, 981.12, 16.250, 0.23, 9343.08, 574.96 66 | 40, 20.250, 0.13,20945.66,1034.35, 20.250, 0.20,27824.62,1374.06, 10.125, 0.20,13844.35,1367.34, 40.500, 0.16,33307.91, 822.42, 20.250, 0.29, 9532.54, 470.74 67 | 48, 24.250, 0.15,21733.31, 896.22, 24.250, 0.21,30452.80,1255.79, 12.125, 0.21,15422.65,1271.97, 48.500, 0.19,34595.87, 713.32, 24.250, 0.34, 9714.66, 400.60 68 | 56, 28.250, 0.17,22569.35, 798.92, 28.250, 0.23,32619.30,1154.67, 14.125, 0.23,16365.97,1158.65, 56.500, 0.21,35609.21, 630.25, 28.250, 0.39, 9792.51, 346.64 69 | 64, 32.250, 0.19,23043.79, 714.54, 32.250, 0.26,33859.03,1049.89, 16.125, 0.25,17420.07,1080.31, 64.500, 0.24,36288.75, 562.62, 32.250, 0.45, 9566.24, 296.63 70 | 80, 40.250, 0.23,23912.39, 594.10, 40.250, 0.30,35938.54, 892.88, 20.125, 0.30,17969.27, 892.88, 80.500, 0.30,36383.90, 451.97, 40.250, 0.55, 9735.95, 241.89 71 | 96, 48.250, 0.26,24456.31, 506.87, 48.250, 0.35,36728.81, 761.22, 24.125, 0.35,18372.79, 761.57, 96.500, 0.35,37304.29, 386.57, 48.250, 0.66, 9828.84, 203.71 72 | 128, 64.250, 0.45,19324.86, 300.78, 64.250, 0.46,37260.23, 579.93, 32.125, 0.46,18772.92, 584.37, 128.500, 0.45,38360.80, 298.53, 64.250, 0.87, 9955.10, 
154.94 73 | 256, 128.250, 0.85,20184.64, 157.39, 128.250, 0.88,39071.82, 304.65, 64.125, 0.88,19660.84, 306.60, 256.500, 0.86,40188.33, 156.68, 128.250, 1.69,10164.80, 79.26 74 | 512, 256.250, 1.66,20679.06, 80.70, 256.250, 1.72,40037.03, 156.24, 128.125, 1.71,20140.43, 157.19, 512.500, 1.67,41195.62, 80.38, 256.250, 3.35,10279.21, 40.11 75 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /mixbench-sycl/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-sycl 2 | 3 | This is the SYCL implementation of mixbench. 4 | As SYCL is supported by multiple implementations, not all of them have been tested. 5 | 6 | ## Building notes 7 | 8 | ### Intel clang/DPCPP 9 | 10 | Using the latest version of OneAPI toolkit from Intel, you may try building as follows: 11 | 12 | ``` 13 | cmake ../mixbench-sycl -D CMAKE_CXX_COMPILER=icpx -D CMAKE_CXX_FLAGS="-fsycl -fsycl-device-code-split=per_kernel" 14 | ``` 15 | 16 | Note: `per_kernel` mode facilitates cases where the device lacks support for computations on a particular data type, e.g. double precision. 17 | 18 | If you are building under Windows/DPC++ try: 19 | ``` 20 | cmake ..\mixbench-sycl -T "Intel(R) oneAPI DPC++ Compiler 2024" -D CMAKE_CXX_COMPILER=icpx -D CMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel /EHsc" 21 | ``` 22 | Note: Adjust the platform toolset argument (*"Intel(R) oneAPI DPC++ Compiler"*) to whatever required, e.g. *"Intel(R) oneAPI DPC++ Compiler 2022"* for DPC++ 2022.1. 23 | 24 | ### AMD GPU via hipSYCL/ROCm 25 | 26 | Here building for two device architectures (*gfx803* & *gfx1012*): 27 | 28 | ``` 29 | cmake ../mixbench-sycl -D CMAKE_CXX_COMPILER=syclcc -D CMAKE_CXX_FLAGS="--hipsycl-targets='omp;hip:gfx803,gfx1012' -O2" 30 | ``` 31 | Note: Older ROCm releases might require adding `--rocm-device-lib-path=/opt/rocm/amdgcn/bitcode` to CMAKE_CXX_FLAGS 32 | 33 | ### NVidia clang/DPCPP 34 | ``` 35 | cmake ../mixbench-sycl -D CMAKE_CXX_COMPILER=clang++ -D CMAKE_CXX_FLAGS="-fsycl -std=c++17 -fsycl-targets=nvptx64-nvidia-cuda" 36 | ``` 37 | 38 | ## Execution 39 | 40 | In order to select the desired device to run the benchmark on, do pass the `-h` parameter 41 | so the available devices are listed: 42 | 43 | ``` 44 | $ ./mixbench-sycl -h 45 | mixbench-sycl (v0.04-3-g664f025) 46 | Usage: mixbench-sycl [options] [device index [workgroup size [array size(1024^2)]]] 47 | 48 | Options: 49 | -h or --help Show this message 50 | -t or --use-os-timer Use standard OS timer instead of SYCL profiling timer 51 | Available SYCL devices: 52 | 1. Intel(R) FPGA Emulation Device/Intel(R) FPGA Emulation Platform for OpenCL(TM) 53 | 2. Intel(R) Core(TM) i3-8109U CPU @ 3.00GHz/Intel(R) OpenCL 54 | 3. Intel(R) Iris(R) Plus Graphics 655 [0x3ea5]/Intel(R) OpenCL HD Graphics 55 | 4. Intel(R) Core(TM) i3-8109U CPU @ 3.00GHz/Intel(R) OpenCL 56 | 5. Intel(R) Iris(R) Plus Graphics 655 [0x3ea5]/Intel(R) Level-Zero 57 | ``` 58 | 59 | ... 
and then pass the device number as the argument: 60 | 61 | ``` 62 | $ ./mixbench-sycl 5 63 | mixbench-sycl (v0.04-3-g664f025) 64 | Use "-h" argument to see available options 65 | ------------------------ Device specifications ------------------------ 66 | Platform: Intel(R) Level-Zero 67 | Device: Intel(R) Iris(R) Plus Graphics 655 [0x3ea5]/Intel(R) Corporation 68 | Driver version: 1.2.21270 69 | Address bits: 64 70 | GPU clock rate: 0 MHz 71 | Total global mem: 12690 MB 72 | Max allowed buffer: 4095 MB 73 | SYCL version: 1.1 74 | Total CUs: 48 75 | ----------------------------------------------------------------------- 76 | Total GPU memory: 13307101184 77 | Buffer size: 256MB 78 | Elements per thread: 8 79 | Thread fusion degree: 4 80 | Timer: SYCL event based 81 | ----------------------------------------------------------------------------- CSV data ----------------------------------------------------------------------------- 82 | Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, 83 | Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec 84 | 0, 0.250, 5.29, 6.34, 25.36, 0.125, 9.63, 3.49, 27.88, 0.500, 5.04, 13.32, 26.64, 0.250, 5.23, 6.42, 25.68 85 | 1, 0.750, 5.10, 19.74, 26.32, 0.375, 9.49, 10.60, 28.27, 1.500, 5.00, 40.25, 26.83, 0.750, 5.18, 19.43, 25.90 86 | 2, 1.250, 5.29, 31.73, 25.38, 0.625, 9.78, 17.16, 27.45, 2.500, 4.62, 72.59, 29.03, 1.250, 5.21, 32.23, 25.78 87 | 3, 1.750, 5.27, 44.55, 25.46, 0.875, 9.72, 24.15, 27.61, 3.500, 5.12, 91.78, 26.22, 1.750, 5.01, 46.87, 26.78 88 | 4, 2.250, 5.26, 57.45, 25.53, 1.125, 9.09, 33.21, 29.52, 4.500, 4.62, 130.62, 29.03, 2.250, 4.61, 65.57, 29.14 89 | 5, 2.750, 5.25, 70.35, 25.58, 1.375, 9.11, 40.51, 29.46, 5.500, 5.26, 140.31, 25.51, 2.750, 4.90, 75.28, 27.37 90 | 6, 3.250, 5.06, 86.15, 26.51, 1.625, 9.09, 48.01, 29.54, 6.500, 4.62, 188.85, 29.05, 3.250, 4.86, 89.79, 27.63 91 | 7, 3.750, 4.53, 111.09, 29.62, 1.875, 9.63, 52.27, 27.88, 7.500, 4.65, 216.63, 28.88, 3.750, 4.70, 107.11, 28.56 92 | 8, 4.250, 4.74, 120.33, 28.31, 2.125, 9.37, 60.91, 28.66, 8.500, 4.60, 247.78, 29.15, 4.250, 4.55, 125.23, 29.47 93 | 9, 4.750, 4.38, 145.42, 30.61, 2.375, 9.23, 69.07, 29.08, 9.500, 4.76, 268.08, 28.22, 4.750, 4.43, 143.86, 30.29 94 | 10, 5.250, 4.73, 149.09, 28.40, 2.625, 9.31, 75.67, 28.83, 10.500, 4.55, 309.77, 29.50, 5.250, 4.42, 159.29, 30.34 95 | 11, 5.750, 4.59, 168.17, 29.25, 2.875, 9.11, 84.69, 29.46, 11.500, 4.75, 324.78, 28.24, 5.750, 4.46, 173.02, 30.09 96 | 12, 6.250, 4.39, 190.94, 30.55, 3.125, 8.90, 94.21, 30.15, 12.500, 4.40, 381.52, 30.52, 6.250, 4.50, 186.48, 29.84 97 | 13, 6.750, 4.38, 206.79, 30.64, 3.375, 9.00, 100.67, 29.83, 13.500, 4.45, 406.98, 30.15, 6.750, 4.54, 199.70, 29.59 98 | 14, 7.250, 4.41, 220.72, 30.44, 3.625, 9.00, 108.09, 29.82, 14.500, 4.41, 441.06, 30.42, 7.250, 4.56, 213.21, 29.41 99 | 15, 7.750, 4.38, 237.52, 30.65, 3.875, 9.02, 115.35, 29.77, 15.500, 4.74, 439.27, 28.34, 7.750, 4.70, 221.36, 28.56 100 | 16, 8.250, 4.39, 252.08, 30.55, 4.125, 9.03, 122.61, 29.72, 16.500, 4.43, 499.64, 30.28, 8.250, 4.89, 226.35, 27.44 101 | 17, 8.750, 4.38, 268.11, 30.64, 4.375, 9.09, 129.14, 29.52, 17.500, 4.82, 487.16, 27.84, 8.750, 5.34, 219.77, 25.12 102 | 18, 9.250, 4.39, 282.65, 30.56, 4.625, 9.09, 136.54, 29.52, 18.500, 4.44, 559.87, 30.26, 9.250, 5.41, 229.49, 24.81 103 | 20, 10.250, 4.38, 314.04, 30.64, 5.125, 9.38, 146.63, 28.61, 20.500, 4.45, 
617.83, 30.14, 10.250, 5.93, 231.87, 22.62 104 | 22, 11.250, 4.36, 345.92, 30.75, 5.625, 9.16, 164.93, 29.32, 22.500, 4.45, 679.14, 30.18, 11.250, 6.46, 233.80, 20.78 105 | 24, 12.250, 4.35, 377.65, 30.83, 6.125, 9.85, 166.94, 27.26, 24.500, 4.46, 737.88, 30.12, 12.250, 6.98, 235.42, 19.22 106 | 28, 14.250, 4.37, 437.21, 30.68, 7.125, 11.26, 169.93, 23.85, 28.500, 4.48, 854.50, 29.98, 14.250, 8.05, 237.58, 16.67 107 | 32, 16.250, 4.36, 499.84, 30.76, 8.125, 12.77, 170.75, 21.02, 32.500, 4.50, 968.97, 29.81, 16.250, 9.12, 239.18, 14.72 108 | 40, 20.250, 4.41, 616.54, 30.45, 10.125, 15.88, 171.17, 16.91, 40.500, 5.02, 1083.80, 26.76, 20.250, 11.23, 242.07, 11.95 109 | 48, 24.250, 4.81, 676.04, 27.88, 12.125, 18.48, 176.11, 14.52, 48.500, 5.91, 1101.80, 22.72, 24.250, 13.37, 243.40, 10.04 110 | 56, 28.250, 5.48, 691.78, 24.49, 14.125, 21.38, 177.33, 12.55, 56.500, 6.34, 1195.28, 21.16, 28.250, 15.52, 244.32, 8.65 111 | 64, 32.250, 6.17, 701.22, 21.74, 16.125, 24.28, 178.27, 11.06, 64.500, 6.99, 1238.30, 19.20, 32.250, 17.66, 245.05, 7.60 112 | 80, 40.250, 7.59, 712.18, 17.69, 20.125, 30.55, 176.85, 8.79, 80.500, 8.37, 1290.93, 16.04, 40.250, 22.45, 240.58, 5.98 113 | 96, 48.250, 8.99, 720.68, 14.94, 24.125, 35.85, 180.66, 7.49, 96.500, 9.78, 1323.72, 13.72, 48.250, 26.77, 241.96, 5.01 114 | 128, 64.250, 11.84, 728.22, 11.33, 32.125, 47.47, 181.66, 5.65, 128.500, 12.62, 1366.48, 10.63, 64.250, 35.32, 244.14, 3.80 115 | 192, 96.250, 19.89, 649.40, 6.75, 48.125, 71.77, 180.00, 3.74, 192.500, 18.37, 1406.37, 7.31, 96.250, 52.49, 246.11, 2.56 116 | 256, 128.250, 25.72, 669.32, 5.22, 64.125, 94.85, 181.48, 2.83, 256.500, 32.84, 1048.44, 4.09, 128.250, 69.62, 247.23, 1.93 117 | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 118 | ``` 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mixbench 2 | The purpose of this benchmark tool is to evaluate performance bounds of GPUs (or CPUs) on mixed operational intensity kernels. The executed kernel is customized on a range of different operational intensity values. Modern GPUs are able to hide memory latency by switching execution to threads able to perform compute operations. Using this tool one can assess the practical optimum balance in both types of operations for a compute device. CUDA, HIP, OpenCL and SYCL implementations have been developed, targeting GPUs, or OpenMP when using a CPU as a target. 3 | 4 | ## Implementations 5 | 6 | * CUDA: `mixbench-cuda` 7 | * OpenCL: `mixbench-opencl` 8 | * HIP: `mixbench-hip` 9 | * SYCL: `mixbench-sycl` 10 | * CPU/OpenMP: `mixbench-cpu` 11 | 12 | Since each implementation resides in a separate folder, please check the documentation available within each sub-project's folder. 13 | 14 | ## Kernel types 15 | 16 | Four types of experiments are executed combined with global memory accesses: 17 | 18 | 1. Single precision Flops (multiply-additions) 19 | 2. Double precision Flops (multiply-additions) 20 | 3. Half precision Flops (multiply-additions, for GPUs only) 21 | 4. Integer multiply-addition operations 22 | 23 | ## How to build 24 | 25 | Building is based on CMake files. 26 | Thus, to build a particular implementation use the proper `CMakeLists.txt` residing in each subdirectory, 27 | e.g. 
for the OpenCL implementation you may use the commands as follows: 28 | 29 | ``` 30 | mkdir build 31 | cd build 32 | cmake ../mixbench-opencl 33 | cmake --build ./ 34 | ``` 35 | 36 | For more information, check available READMEs within each subfolder. 37 | 38 | ## Execution results 39 | 40 | A typical execution output on an NVidia RTX-2070 GPU is: 41 | ``` 42 | mixbench/read-only (v0.03-2-gbccfd71) 43 | ------------------------ Device specifications ------------------------ 44 | Device: GeForce RTX 2070 45 | CUDA driver version: 10.20 46 | GPU clock rate: 1620 MHz 47 | Memory clock rate: 3500 MHz 48 | Memory bus width: 256 bits 49 | WarpSize: 32 50 | L2 cache size: 4096 KB 51 | Total global mem: 7979 MB 52 | ECC enabled: No 53 | Compute Capability: 7.5 54 | Total SPs: 2304 (36 MPs x 64 SPs/MP) 55 | Compute throughput: 7464.96 GFlops (theoretical single precision FMAs) 56 | Memory bandwidth: 448.06 GB/sec 57 | ----------------------------------------------------------------------- 58 | Total GPU memory 8366784512, free 7941521408 59 | Buffer size: 256MB 60 | Trade-off type: compute with global memory (block strided) 61 | Elements per thread: 8 62 | Thread fusion degree: 4 63 | ----------------------------------------------------------------------------- CSV data ----------------------------------------------------------------------------- 64 | Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, 65 | Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec 66 | 0, 0.250, 0.32, 104.42, 417.68, 0.125, 0.63, 53.04, 424.35, 0.500, 0.32, 211.41, 422.81, 0.250, 0.32, 105.58, 422.30 67 | 1, 0.750, 0.32, 316.34, 421.79, 0.375, 0.63, 158.69, 423.18, 1.500, 0.32, 634.22, 422.81, 0.750, 0.32, 317.30, 423.07 68 | 2, 1.250, 0.32, 528.46, 422.77, 0.625, 0.78, 215.91, 345.45, 2.500, 0.32, 1055.97, 422.39, 1.250, 0.32, 528.57, 422.86 69 | 3, 1.750, 0.32, 738.81, 422.17, 0.875, 1.08, 218.17, 249.34, 3.500, 0.32, 1478.95, 422.56, 1.750, 0.32, 740.59, 423.20 70 | 4, 2.250, 0.32, 951.33, 422.81, 1.125, 1.38, 219.57, 195.17, 4.500, 0.32, 1902.66, 422.81, 2.250, 0.32, 950.66, 422.51 71 | 5, 2.750, 0.32, 1162.74, 422.81, 1.375, 1.67, 220.38, 160.28, 5.500, 0.32, 2328.52, 423.37, 2.750, 0.32, 1162.74, 422.81 72 | 6, 3.250, 0.32, 1374.56, 422.94, 1.625, 1.97, 220.99, 135.99, 6.500, 0.32, 2756.62, 424.10, 3.250, 0.32, 1375.81, 423.32 73 | 7, 3.750, 0.32, 1592.45, 424.65, 1.875, 2.27, 221.38, 118.07, 7.500, 0.32, 3169.50, 422.60, 3.750, 0.32, 1585.55, 422.81 74 | 8, 4.250, 0.32, 1796.95, 422.81, 2.125, 2.57, 221.71, 104.33, 8.500, 0.32, 3587.76, 422.09, 4.250, 0.37, 1545.63, 363.68 75 | 9, 4.750, 0.32, 2006.34, 422.39, 2.375, 2.87, 221.85, 93.41, 9.500, 0.32, 3995.38, 420.57, 4.750, 0.32, 1998.29, 420.69 76 | 10, 5.250, 0.32, 2209.52, 420.86, 2.625, 3.17, 222.02, 84.58, 10.500, 0.32, 4439.54, 422.81, 5.250, 0.32, 2220.44, 422.94 77 | 11, 5.750, 0.32, 2434.12, 423.32, 2.875, 3.47, 222.17, 77.28, 11.500, 0.32, 4855.01, 422.17, 5.750, 0.32, 2426.77, 422.05 78 | 12, 6.250, 0.32, 2638.06, 422.09, 3.125, 3.78, 222.18, 71.10, 12.500, 0.32, 5227.20, 418.18, 6.250, 0.38, 2202.15, 352.34 79 | 13, 6.750, 0.32, 2841.95, 421.03, 3.375, 4.08, 222.30, 65.87, 13.500, 0.32, 5712.58, 423.15, 6.750, 0.32, 2850.54, 422.30 80 | 14, 7.250, 0.32, 3065.39, 422.81, 3.625, 4.37, 222.45, 61.36, 14.500, 0.32, 6135.74, 423.15, 7.250, 0.32, 3065.08, 422.77 81 | 15, 7.750, 0.33, 3143.40, 
405.60, 3.875, 4.67, 222.57, 57.44, 15.500, 0.32, 6546.34, 422.34, 7.750, 0.32, 3268.89, 421.79 82 | 16, 8.250, 0.32, 3482.59, 422.13, 4.125, 4.98, 222.57, 53.96, 16.500, 0.32, 6957.48, 421.67, 8.250, 0.39, 2803.68, 339.84 83 | 17, 8.750, 0.32, 3693.66, 422.13, 4.375, 5.28, 222.53, 50.86, 17.500, 0.32, 7396.24, 422.64, 8.750, 0.32, 3694.77, 422.26 84 | 18, 9.250, 0.32, 3901.58, 421.79, 4.625, 5.58, 222.58, 48.12, 18.500, 0.32, 7786.72, 420.90, 9.250, 0.32, 3897.66, 421.37 85 | 20, 10.250, 0.32, 4312.53, 420.73, 5.125, 6.18, 222.66, 43.45, 20.500, 0.32, 8640.66, 421.50, 10.250, 0.41, 3374.54, 329.22 86 | 22, 11.250, 0.32, 4729.94, 420.44, 5.625, 6.78, 222.74, 39.60, 22.500, 0.32, 9452.31, 420.10, 11.250, 0.32, 4734.21, 420.82 87 | 24, 12.250, 0.32, 5148.83, 420.31, 6.125, 7.36, 223.51, 36.49, 24.500, 0.32,10346.40, 422.30, 12.250, 0.42, 3900.12, 318.38 88 | 28, 14.250, 0.32, 6009.94, 421.75, 7.125, 8.53, 224.23, 31.47, 28.500, 0.32,11975.32, 420.19, 14.250, 0.44, 4368.11, 306.53 89 | 32, 16.250, 0.32, 6795.36, 418.18, 8.125, 9.72, 224.31, 27.61, 32.500, 0.32,13605.64, 418.64, 16.250, 0.45, 4797.12, 295.21 90 | 40, 20.250, 0.34, 7899.43, 390.10, 10.125, 12.11, 224.50, 22.17, 40.500, 0.33,16371.37, 404.23, 20.250, 0.50, 5464.85, 269.87 91 | 48, 24.250, 0.41, 8029.04, 331.09, 12.125, 14.49, 224.58, 18.52, 48.500, 0.40,16468.89, 339.56, 24.250, 0.54, 5986.22, 246.85 92 | 56, 28.250, 0.47, 8114.58, 287.24, 14.125, 16.88, 224.65, 15.90, 56.500, 0.46,16443.12, 291.03, 28.250, 0.60, 6342.42, 224.51 93 | 64, 32.250, 0.53, 8154.47, 252.85, 16.125, 19.26, 224.72, 13.94, 64.500, 0.52,16536.22, 256.38, 32.250, 0.66, 6591.93, 204.40 94 | 80, 40.250, 0.66, 8242.80, 204.79, 20.125, 24.03, 224.79, 11.17, 80.500, 0.65,16644.88, 206.77, 40.250, 0.78, 6909.54, 171.67 95 | 96, 48.250, 0.78, 8321.35, 172.46, 24.125, 28.80, 224.85, 9.32, 96.500, 0.78,16685.23, 172.90, 48.250, 0.91, 7108.62, 147.33 96 | 128, 64.250, 1.03, 8337.22, 129.76, 32.125, 38.34, 224.91, 7.00, 128.500, 1.03,16775.65, 130.55, 64.250, 1.18, 7295.18, 113.54 97 | 192, 96.250, 1.54, 8414.49, 87.42, 48.125, 57.42, 224.97, 4.67, 192.500, 1.53,16847.93, 87.52, 96.250, 1.74, 7431.64, 77.21 98 | 256, 128.250, 2.06, 8362.01, 65.20, 64.125, 76.50, 225.02, 3.51, 256.500, 2.06,16693.65, 65.08, 128.250, 2.30, 7477.75, 58.31 99 | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 100 | ``` 101 | 102 | And here is a chart illustrating the results extracted above: 103 | 104 | ![RTX-2070 execution results](https://raw.githubusercontent.com/ekondis/mixbench/gh-pages/img/rtx2070-sp-roofline.png "mixbench execution results on NVidia RTX-2070 (CUDA/ro implementation)") 105 | 106 | ## Publications 107 | 108 | If you use this benchmark tool for a research work please provide citation to any of the following papers: 109 | 110 | Elias Konstantinidis, Yiannis Cotronis, 111 | "A quantitative roofline model for GPU kernel performance estimation using micro-benchmarks and hardware metric profiling", 112 | Journal of Parallel and Distributed Computing, Volume 107, September 2017, Pages 37-56, ISSN 0743-7315, 113 | https://doi.org/10.1016/j.jpdc.2017.04.002. 
114 | URL: http://www.sciencedirect.com/science/article/pii/S0743731517301247 115 | 116 | Konstantinidis, E., Cotronis, Y., 117 | "A Practical Performance Model for Compute and Memory Bound GPU Kernels", 118 | Parallel, Distributed and Network-Based Processing (PDP), 2015 23rd Euromicro International Conference on , vol., no., pp.651-658, 4-6 March 2015 119 | doi: 10.1109/PDP.2015.51 120 | URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7092788&isnumber=7092002 121 | -------------------------------------------------------------------------------- /mixbench-cuda/mix_kernels_cuda.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cuda_ro.cu: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lcutil.h" 13 | 14 | #define ELEMENTS_PER_THREAD (8) 15 | #define FUSION_DEGREE (4) 16 | 17 | template 18 | inline __device__ T conv_int(const int i){ return static_cast(i); } 19 | 20 | template 21 | inline __device__ T mad(const T a, const T b, const T c){ return a*b+c; } 22 | 23 | template 24 | inline __device__ bool equal(const T a, const T b){ return a==b; } 25 | 26 | #if __CUDA_ARCH__ >= 530 27 | template<> 28 | inline __device__ half2 conv_int(const int i){ return __half2half2( __int2half_rd(i) ); } 29 | template<> 30 | inline __device__ half2 mad(const half2 a, const half2 b, const half2 c){ return __hfma2(a, b, c)/*__hadd2(__hmul2(a, b), c)*/; } 31 | template<> 32 | inline __device__ bool equal(const half2 a, const half2 b){ return __hbeq2(a, b); } 33 | #else 34 | // a dummy implementations as a workaround 35 | template<> 36 | inline __device__ half2 conv_int(const int i){ return half2(); } 37 | template<> 38 | inline __device__ half2 mad(const half2 a, const half2 b, const half2 c){ return half2(); } 39 | template<> 40 | inline __device__ bool equal(const half2 a, const half2 b){ return false; } 41 | #endif 42 | 43 | template 44 | __global__ void benchmark_func(T seed, T *g_data){ 45 | const unsigned int blockSize = blockdim; 46 | const int stride = blockSize; 47 | int idx = blockIdx.x*blockSize*granularity + threadIdx.x; 48 | const int big_stride = gridDim.x*blockSize*granularity; 49 | 50 | T tmps[granularity]; 51 | for(int k=0; k(0); 64 | #pragma unroll 65 | for(int j=0; j(-1)) ) // Designed so it never executes 69 | g_data[idx+k*big_stride] = sum; 70 | } 71 | } 72 | 73 | void initializeEvents(cudaEvent_t *start, cudaEvent_t *stop){ 74 | CUDA_SAFE_CALL( cudaEventCreate(start) ); 75 | CUDA_SAFE_CALL( cudaEventCreate(stop) ); 76 | CUDA_SAFE_CALL( cudaEventRecord(*start, 0) ); 77 | } 78 | 79 | float finalizeEvents(cudaEvent_t start, cudaEvent_t stop){ 80 | CUDA_SAFE_CALL( cudaGetLastError() ); 81 | CUDA_SAFE_CALL( cudaEventRecord(stop, 0) ); 82 | CUDA_SAFE_CALL( cudaEventSynchronize(stop) ); 83 | float kernel_time; 84 | CUDA_SAFE_CALL( cudaEventElapsedTime(&kernel_time, start, stop) ); 85 | CUDA_SAFE_CALL( cudaEventDestroy(start) ); 86 | CUDA_SAFE_CALL( cudaEventDestroy(stop) ); 87 | return kernel_time; 88 | } 89 | 90 | void runbench_warmup(double *cd, long size){ 91 | const long reduced_grid_size = size/(ELEMENTS_PER_THREAD)/128; 92 | const int BLOCK_SIZE = 256; 93 | const int TOTAL_REDUCED_BLOCKS = reduced_grid_size/BLOCK_SIZE; 94 | 95 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 96 | dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1); 97 | 98 | benchmark_func< short, BLOCK_SIZE, 
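                    // Warm-up instantiation (its template argument list continues below):
                    // a short element type, zero compute iterations and a reduced grid,
                    // i.e. an essentially memory-only pass whose purpose appears to be
                    // warming the device before the timed runs in runbench().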
ELEMENTS_PER_THREAD, FUSION_DEGREE, 0, true ><<< dimReducedGrid, dimBlock >>>((short)1, (short*)cd); 99 | CUDA_SAFE_CALL( cudaGetLastError() ); 100 | CUDA_SAFE_CALL( cudaDeviceSynchronize() ); 101 | } 102 | 103 | int out_config = 1; 104 | 105 | template 106 | void runbench(double *cd, long size, bool doHalfs){ 107 | const long compute_grid_size = size/ELEMENTS_PER_THREAD/FUSION_DEGREE; 108 | const int BLOCK_SIZE = 256; 109 | const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE; 110 | const long long computations = (ELEMENTS_PER_THREAD*(long long)compute_grid_size+(2*ELEMENTS_PER_THREAD*compute_iterations)*(long long)compute_grid_size)*FUSION_DEGREE; 111 | const long long memoryoperations = size; 112 | 113 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 114 | dim3 dimGrid(TOTAL_BLOCKS, 1, 1); 115 | cudaEvent_t start, stop; 116 | 117 | initializeEvents(&start, &stop); 118 | benchmark_func< float, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, false ><<< dimGrid, dimBlock >>>(1.0f, (float*)cd); 119 | float kernel_time_mad_sp = finalizeEvents(start, stop); 120 | 121 | initializeEvents(&start, &stop); 122 | benchmark_func< double, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, false ><<< dimGrid, dimBlock >>>(1.0, cd); 123 | float kernel_time_mad_dp = finalizeEvents(start, stop); 124 | 125 | float kernel_time_mad_hp = 0.f; 126 | if( doHalfs ){ 127 | initializeEvents(&start, &stop); 128 | half2 h_ones; 129 | *((int32_t*)&h_ones) = 15360 + (15360 << 16); // 1.0 as half 130 | benchmark_func< half2, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, false ><<< dimGrid, dimBlock >>>(h_ones, (half2*)cd); 131 | kernel_time_mad_hp = finalizeEvents(start, stop); 132 | } 133 | 134 | initializeEvents(&start, &stop); 135 | benchmark_func< int, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, true ><<< dimGrid, dimBlock >>>(1, (int*)cd); 136 | float kernel_time_mad_int = finalizeEvents(start, stop); 137 | 138 | printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", 139 | compute_iterations, 140 | ((double)computations)/((double)memoryoperations*sizeof(float)), 141 | kernel_time_mad_sp, 142 | ((double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000), 143 | ((double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.), 144 | ((double)computations)/((double)memoryoperations*sizeof(double)), 145 | kernel_time_mad_dp, 146 | ((double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), 147 | ((double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), 148 | ((double)2*computations)/((double)memoryoperations*sizeof(half2)), 149 | kernel_time_mad_hp, 150 | ((double)2*computations)/kernel_time_mad_hp*1000./(double)(1000*1000*1000), 151 | ((double)memoryoperations*sizeof(half2))/kernel_time_mad_hp*1000./(1000.*1000.*1000.), 152 | ((double)computations)/((double)memoryoperations*sizeof(int)), 153 | kernel_time_mad_int, 154 | ((double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), 155 | ((double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) 
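        // Units: kernel times from cudaEventElapsedTime() are milliseconds, so the
        // GFLOPS columns are ops / ms * 1000 / 1e9 and the GB/sec columns are
        // bytes / ms * 1000 / 1e9. Half precision uses 2*computations and
        // sizeof(half2) because every half2 operation acts on two packed values.
        // Sanity check against the CSV samples: single-precision intensity works out
        // to (1 + 2*compute_iterations) / 4 flops/byte, e.g. 0.250 for iteration 0.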
); 156 | } 157 | 158 | extern "C" void mixbenchGPU(double *c, long size){ 159 | const char *benchtype = "compute with global memory (block strided)"; 160 | 161 | printf("Trade-off type: %s\n", benchtype); 162 | printf("Elements per thread: %d\n", ELEMENTS_PER_THREAD); 163 | printf("Thread fusion degree: %d\n", FUSION_DEGREE); 164 | double *cd; 165 | bool doHalfs = IsFP16Supported(); 166 | if( !doHalfs ) 167 | printf("Warning: Half precision computations are not supported\n"); 168 | 169 | CUDA_SAFE_CALL( cudaMalloc((void**)&cd, size*sizeof(double)) ); 170 | 171 | // Copy data to device memory 172 | CUDA_SAFE_CALL( cudaMemset(cd, 0, size*sizeof(double)) ); // initialize to zeros 173 | 174 | // Synchronize in order to wait for memory operations to finish 175 | CUDA_SAFE_CALL( cudaDeviceSynchronize() ); 176 | 177 | printf("----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------\n"); 178 | printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, \n"); 179 | printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); 180 | 181 | runbench_warmup(cd, size); 182 | 183 | runbench<0>(cd, size, doHalfs); 184 | runbench<1>(cd, size, doHalfs); 185 | runbench<2>(cd, size, doHalfs); 186 | runbench<3>(cd, size, doHalfs); 187 | runbench<4>(cd, size, doHalfs); 188 | runbench<5>(cd, size, doHalfs); 189 | runbench<6>(cd, size, doHalfs); 190 | runbench<7>(cd, size, doHalfs); 191 | runbench<8>(cd, size, doHalfs); 192 | runbench<9>(cd, size, doHalfs); 193 | runbench<10>(cd, size, doHalfs); 194 | runbench<11>(cd, size, doHalfs); 195 | runbench<12>(cd, size, doHalfs); 196 | runbench<13>(cd, size, doHalfs); 197 | runbench<14>(cd, size, doHalfs); 198 | runbench<15>(cd, size, doHalfs); 199 | runbench<16>(cd, size, doHalfs); 200 | runbench<17>(cd, size, doHalfs); 201 | runbench<18>(cd, size, doHalfs); 202 | runbench<20>(cd, size, doHalfs); 203 | runbench<22>(cd, size, doHalfs); 204 | runbench<24>(cd, size, doHalfs); 205 | runbench<28>(cd, size, doHalfs); 206 | runbench<32>(cd, size, doHalfs); 207 | runbench<40>(cd, size, doHalfs); 208 | runbench<48>(cd, size, doHalfs); 209 | runbench<56>(cd, size, doHalfs); 210 | runbench<64>(cd, size, doHalfs); 211 | runbench<80>(cd, size, doHalfs); 212 | runbench<96>(cd, size, doHalfs); 213 | runbench<128>(cd, size, doHalfs); 214 | runbench<192>(cd, size, doHalfs); 215 | runbench<256>(cd, size, doHalfs); 216 | runbench<512>(cd, size, doHalfs); 217 | runbench<1024>(cd, size, doHalfs); 218 | 219 | printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); 220 | 221 | // Copy results back to host memory 222 | CUDA_SAFE_CALL( cudaMemcpy(c, cd, size*sizeof(double), cudaMemcpyDeviceToHost) ); 223 | 224 | CUDA_SAFE_CALL( cudaFree(cd) ); 225 | 226 | CUDA_SAFE_CALL( cudaDeviceReset() ); 227 | } 228 | -------------------------------------------------------------------------------- /mixbench-cpu/mix_kernels_cpu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cpu.cpp: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 
4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | const auto base_omp_get_max_threads = omp_get_max_threads(); 18 | 19 | using benchmark_clock = std::chrono::steady_clock; 20 | 21 | #ifdef BASELINE_IMPL 22 | 23 | template 24 | Element __attribute__((noinline)) bench_block(Element* data) { 25 | Element sum = 0; 26 | Element f = data[0]; 27 | 28 | #pragma omp simd aligned(data : 64) reduction(+ : sum) 29 | for (size_t i = 0; i < static_chunk_size; i++) { 30 | Element t = data[i]; 31 | for (size_t j = 0; j < compute_iterations; j++) { 32 | t = t * t + f; 33 | } 34 | sum += t; 35 | } 36 | return sum; 37 | } 38 | 39 | #else 40 | 41 | template 42 | Element __attribute__((noinline)) bench_block(Element* data) { 43 | Element sum = 0; 44 | 45 | Element f[] = {data[0], data[1], data[2], data[3], 46 | data[4], data[5], data[6], data[7]}; 47 | 48 | #pragma omp simd aligned(data : 64) reduction(+ : sum) 49 | for (size_t i = 0; i < static_chunk_size; i++) { 50 | Element t[] = {data[i], data[i], data[i], data[i], 51 | data[i], data[i], data[i], data[i]}; 52 | for (size_t j = 0; j < compute_iterations / 8; j++) { 53 | t[0] = t[0] * t[0] + f[0]; 54 | t[1] = t[1] * t[1] + f[1]; 55 | t[2] = t[2] * t[2] + f[2]; 56 | t[3] = t[3] * t[3] + f[3]; 57 | t[4] = t[4] * t[4] + f[4]; 58 | t[5] = t[5] * t[5] + f[5]; 59 | t[6] = t[6] * t[6] + f[6]; 60 | t[7] = t[7] * t[7] + f[7]; 61 | } 62 | if constexpr (compute_iterations % 8 > 0) { 63 | t[0] = t[0] * t[0] + f[0]; 64 | } 65 | if constexpr (compute_iterations % 8 > 1) { 66 | t[1] = t[1] * t[1] + f[1]; 67 | } 68 | if constexpr (compute_iterations % 8 > 2) { 69 | t[2] = t[2] * t[2] + f[2]; 70 | } 71 | if constexpr (compute_iterations % 8 > 3) { 72 | t[3] = t[3] * t[3] + f[3]; 73 | } 74 | if constexpr (compute_iterations % 8 > 4) { 75 | t[4] = t[4] * t[4] + f[4]; 76 | } 77 | if constexpr (compute_iterations % 8 > 5) { 78 | t[5] = t[5] * t[5] + f[5]; 79 | } 80 | if constexpr (compute_iterations % 8 > 6) { 81 | t[6] = t[6] * t[6] + f[6]; 82 | } 83 | sum += t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 84 | } 85 | return sum; 86 | } 87 | 88 | #endif 89 | 90 | template 91 | __attribute__((optimize("unroll-loops"))) size_t bench(size_t len, 92 | const Element seed1, 93 | const Element seed2, 94 | Element* src) { 95 | Element sum = 0; 96 | constexpr size_t static_chunk_size = 4096; 97 | 98 | #pragma omp parallel for reduction(+ : sum) schedule(static) 99 | for (size_t it_base = 0; it_base < len; it_base += static_chunk_size) { 100 | sum += bench_block( 101 | &src[it_base]); 102 | } 103 | 104 | *src = sum; 105 | return len; 106 | } 107 | 108 | auto runbench_warmup(double* c, size_t size) { 109 | auto timer_start = benchmark_clock::now(); 110 | 111 | bench(size, 1., -1., c); 112 | 113 | auto timer_duration = benchmark_clock::now() - timer_start; 114 | return std::chrono::duration_cast(timer_duration) 115 | .count(); 116 | } 117 | 118 | template 119 | auto measure_operation(Op op) { 120 | auto timer_start = benchmark_clock::now(); 121 | op(); 122 | auto timer_duration = benchmark_clock::now() - timer_start; 123 | return std::chrono::duration_cast(timer_duration) 124 | .count() / 125 | 1000.; 126 | } 127 | 128 | template 129 | auto benchmark_omp(Op op) { 130 | constexpr int total_runs = 20; 131 | constexpr int total_half_thread_runs = 10; 132 | 133 | auto duration = op(); // drop first measurement 134 | std::vector measurements; 135 | 136 | // 1st try with full 
threading 137 | omp_set_num_threads(base_omp_get_max_threads); 138 | 139 | for (int i = 1; i < total_runs; i++) { 140 | duration = op(); 141 | measurements.push_back(duration); 142 | } 143 | 144 | // then try with half threading 145 | if (base_omp_get_max_threads > 1) { 146 | omp_set_num_threads(base_omp_get_max_threads / 2); 147 | 148 | for (int i = 1; i < total_half_thread_runs; i++) { 149 | duration = op(); 150 | measurements.push_back(duration); 151 | } 152 | } 153 | 154 | return *std::min_element(std::begin(measurements), std::end(measurements)); 155 | } 156 | 157 | class ComputeSpace { 158 | size_t memory_space_{0}; 159 | int compute_iterations_{0}; 160 | 161 | public: 162 | ComputeSpace(size_t memory_space, int compute_iterations) 163 | : memory_space_{memory_space}, compute_iterations_{compute_iterations} {} 164 | 165 | template 166 | size_t compute_ops() const { 167 | const auto total_elements = element_count(); 168 | const long long computations = 169 | total_elements /* Vector length */ 170 | * compute_iterations_ /* Core loop iteration count */ 171 | * 2 /* Flops per core loop iteration */ 172 | * 1 /* FMAs in the inner most loop */ 173 | + total_elements - 1 /* Due to sum reduction */ 174 | ; 175 | return computations; 176 | } 177 | 178 | size_t memory_traffic() const { return memory_space_; } 179 | 180 | template 181 | size_t element_count() const { 182 | return memory_space_ / sizeof(T); 183 | } 184 | }; 185 | 186 | template 187 | void runbench(double* c, size_t size) { 188 | ComputeSpace cs{size * sizeof(double), compute_iterations}; 189 | 190 | // floating point part (single prec) 191 | auto kernel_time_mad_sp = benchmark_omp([&] { 192 | return measure_operation([&] { 193 | bench(cs.element_count(), 1.f, -1.f, 194 | reinterpret_cast(c)); 195 | }); 196 | }); 197 | 198 | // floating point part (double prec) 199 | auto kernel_time_mad_dp = benchmark_omp([&] { 200 | return measure_operation([&] { 201 | bench(cs.element_count(), 1., -1., c); 202 | }); 203 | }); 204 | 205 | // integer part 206 | auto kernel_time_mad_int = benchmark_omp([&] { 207 | return measure_operation([&] { 208 | bench(cs.element_count(), 1, -1, 209 | reinterpret_cast(c)); 210 | }); 211 | }); 212 | 213 | const auto computations_sp = cs.compute_ops(); 214 | const auto computations_dp = cs.compute_ops(); 215 | const auto computations_int = cs.compute_ops(); 216 | const auto memory_traffic = cs.memory_traffic(); 217 | 218 | const auto setw = std::setw; 219 | const auto setprecision = std::setprecision; 220 | std::cout << std::fixed << " " << std::setw(4) << compute_iterations 221 | << ", " << setw(8) << setprecision(3) 222 | << static_cast(computations_sp) / 223 | static_cast(memory_traffic) 224 | << "," << setw(8) << setprecision(2) << kernel_time_mad_sp << "," 225 | << setw(8) << setprecision(2) 226 | << static_cast(computations_sp) / kernel_time_mad_sp * 227 | 1000. / static_cast(1000 * 1000 * 1000) 228 | << "," << setw(7) << setprecision(2) 229 | << static_cast(memory_traffic) / kernel_time_mad_sp * 230 | 1000. / (1000. * 1000. * 1000.) 231 | 232 | << ", " << setw(8) << setprecision(3) 233 | << static_cast(computations_dp) / 234 | static_cast(memory_traffic) 235 | << "," << setw(8) << setprecision(2) << kernel_time_mad_dp << "," 236 | << setw(8) << setprecision(2) 237 | << static_cast(computations_dp) / kernel_time_mad_dp * 238 | 1000. / static_cast(1000 * 1000 * 1000) 239 | << "," << setw(7) << setprecision(2) 240 | << static_cast(memory_traffic) / kernel_time_mad_dp * 241 | 1000. / (1000. * 1000. * 1000.) 
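                 // Each four-column group (single precision, double precision, integer)
                 // follows the same pattern: operational intensity in ops/byte, minimum
                 // measured time in ms, GFLOPS or GIOPS (ops / ms * 1000 / 1e9) and
                 // effective bandwidth in GB/s, with op counts from ComputeSpace and the
                 // memory traffic taken as the whole buffer size in bytes.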
242 | 243 | << ", " << setw(8) << setprecision(3) 244 | << static_cast(computations_int) / 245 | static_cast(memory_traffic) 246 | << "," << setw(8) << setprecision(2) << kernel_time_mad_int << "," 247 | << setw(8) << setprecision(2) 248 | << static_cast(computations_int) / kernel_time_mad_int * 249 | 1000. / static_cast(1000 * 1000 * 1000) 250 | << "," << setw(7) << setprecision(2) 251 | << static_cast(memory_traffic) / kernel_time_mad_int * 252 | 1000. / (1000. * 1000. * 1000.) 253 | 254 | << std::endl; 255 | } 256 | 257 | // Variadic template helper to ease multiple configuration invocations 258 | template 259 | void runbench_range(double* cd, long size) { 260 | runbench(cd, size); 261 | } 262 | 263 | template 264 | void runbench_range(double* cd, long size) { 265 | runbench_range(cd, size); 266 | runbench_range(cd, size); 267 | } 268 | 269 | void mixbenchCPU(double* c, size_t size) { 270 | // Initialize data to zeros on memory by respecting 1st touch policy 271 | #pragma omp parallel for schedule(static) 272 | for (size_t i = 0; i < size; i++) 273 | c[i] = 0.0; 274 | 275 | std::cout << "--------------------------------------------" 276 | "-------------- CSV data " 277 | "--------------------------------------------" 278 | "--------------" 279 | << std::endl; 280 | std::cout << "Experiment ID, Single Precision ops,,,, Double " 281 | "precision ops,,,, Integer operations,,, " 282 | << std::endl; 283 | std::cout << "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, " 284 | "Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, " 285 | "GIOPS, GB/sec" 286 | << std::endl; 287 | 288 | runbench_warmup(c, size); 289 | 290 | runbench_range<0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24, 28, 32, 40, 6 * 8, 7 * 8, 291 | 8 * 8, 10 * 8, 13 * 8, 15 * 8, 16 * 8, 20 * 8, 24 * 8, 32 * 8, 292 | 40 * 8, 64 * 8>(c, size); 293 | 294 | std::cout << "---------------------------------------------------------------" 295 | "---------------------------------------------------------------" 296 | << std::endl; 297 | } 298 | -------------------------------------------------------------------------------- /mixbench-hip/mix_kernels_hip.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_hip.cpp: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 
4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #ifdef __CUDACC__ 14 | #include 15 | #define GPU_INF(_T) (_T)(CUDART_INF) 16 | #else 17 | #include 18 | #define GPU_INF(_T) std::numeric_limits<_T>::infinity() 19 | #endif 20 | 21 | typedef __half2 half2; 22 | 23 | #include 24 | #include "lhiputil.h" 25 | 26 | #define ELEMENTS_PER_THREAD (8) 27 | 28 | template 29 | inline __device__ T mad(const T& a, const T& b, const T& c) { 30 | return a * b + c; 31 | } 32 | 33 | template <> 34 | inline __device__ double mad(const double& a, 35 | const double& b, 36 | const double& c) { 37 | return fma(a, b, c); 38 | } 39 | 40 | template <> 41 | inline __device__ half2 mad(const half2& a, const half2& b, const half2& c) { 42 | return __hfma2(a, b, c); 43 | } 44 | 45 | template 46 | inline __device__ bool is_equal(const T& a, const T& b) { 47 | return a == b; 48 | } 49 | 50 | template <> 51 | inline __device__ bool is_equal(const half2& a, const half2& b) { 52 | return __hbeq2(a, b); 53 | } 54 | 55 | template 59 | __global__ void benchmark_func(T seed, T* g_data) { 60 | const int stride = blockSize; 61 | const int idx = hipBlockIdx_x * blockSize * granularity + hipThreadIdx_x; 62 | 63 | T tmps[granularity]; 64 | #pragma unroll 65 | for (int j = 0; j < granularity; j++) { 66 | // Load elements (memory intensive part) 67 | tmps[j] = g_data[idx + j * stride]; 68 | // Perform computations (compute intensive part) 69 | for (int i = 0; i < compute_iterations; i++) { 70 | tmps[j] = mad(tmps[j], tmps[j], seed); 71 | } 72 | } 73 | // Multiply add reduction 74 | T sum = static_cast(0); 75 | #pragma unroll 76 | for (int j = 0; j < granularity; j += 2) { 77 | sum = mad(tmps[j], tmps[j + 1], sum); 78 | } 79 | // Dummy code 80 | if (is_equal(sum, static_cast(-1))) // Designed so it never executes 81 | g_data[idx] = sum; 82 | } 83 | 84 | void initializeEvents_ext(hipEvent_t* start, hipEvent_t* stop) { 85 | HIP_SAFE_CALL(hipEventCreate(start)); 86 | HIP_SAFE_CALL(hipEventCreate(stop)); 87 | } 88 | 89 | float finalizeEvents_ext(hipEvent_t start, hipEvent_t stop) { 90 | HIP_SAFE_CALL(hipGetLastError()); 91 | HIP_SAFE_CALL(hipEventSynchronize(stop)); 92 | float kernel_time; 93 | HIP_SAFE_CALL(hipEventElapsedTime(&kernel_time, start, stop)); 94 | HIP_SAFE_CALL(hipEventDestroy(start)); 95 | HIP_SAFE_CALL(hipEventDestroy(stop)); 96 | return kernel_time; 97 | } 98 | 99 | void runbench_warmup(double* cd, long size) { 100 | const long reduced_grid_size = size / (ELEMENTS_PER_THREAD) / 128; 101 | const int BLOCK_SIZE = 256; 102 | const int TOTAL_REDUCED_BLOCKS = reduced_grid_size / BLOCK_SIZE; 103 | 104 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 105 | dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1); 106 | 107 | hipLaunchKernelGGL( 108 | HIP_KERNEL_NAME( 109 | benchmark_func), 110 | dim3(dimReducedGrid), dim3(dimBlock), 0, 0, (short)1, (short*)cd); 111 | HIP_SAFE_CALL(hipGetLastError()); 112 | HIP_SAFE_CALL(hipDeviceSynchronize()); 113 | } 114 | 115 | template 116 | void runbench(double* cd, long size) { 117 | const long compute_grid_size = size / ELEMENTS_PER_THREAD; 118 | const int BLOCK_SIZE = 256; 119 | const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE; 120 | const long long computations = 121 | ELEMENTS_PER_THREAD * (long long)compute_grid_size + 122 | (2 * ELEMENTS_PER_THREAD * compute_iterations) * 123 | (long long)compute_grid_size; 124 | const long long memoryoperations = size; 125 | 126 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 127 | dim3 
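  // Operation accounting used above: every work-item performs compute_iterations
  // MADs per element (2 flops each) plus ELEMENTS_PER_THREAD/2 reduction MADs, and
  // the grid spans size/ELEMENTS_PER_THREAD work-items, so computations equals
  // size * (1 + 2*compute_iterations) while memoryoperations equals the number of
  // elements transferred.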
dimGrid(TOTAL_BLOCKS, 1, 1); 128 | hipEvent_t start, stop; 129 | 130 | constexpr auto total_bench_iterations = 3; 131 | 132 | float kernel_time_mad_sp = benchmark([&]() { 133 | initializeEvents_ext(&start, &stop); 134 | hipExtLaunchKernelGGL( 135 | HIP_KERNEL_NAME(benchmark_func), 137 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, 1.0f, (float*)cd); 138 | return finalizeEvents_ext(start, stop); 139 | }); 140 | 141 | float kernel_time_mad_sp2 = benchmark([&]() { 142 | initializeEvents_ext(&start, &stop); 143 | hipExtLaunchKernelGGL( 144 | HIP_KERNEL_NAME(benchmark_func), 146 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, float2{1.0f}, 147 | (float2*)cd); 148 | return finalizeEvents_ext(start, stop); 149 | }); 150 | 151 | float kernel_time_mad_dp = benchmark([&]() { 152 | initializeEvents_ext(&start, &stop); 153 | hipExtLaunchKernelGGL( 154 | HIP_KERNEL_NAME(benchmark_func), 156 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, 1.0, cd); 157 | return finalizeEvents_ext(start, stop); 158 | }); 159 | 160 | float kernel_time_mad_hp = benchmark([&]() { 161 | initializeEvents_ext(&start, &stop); 162 | half2 h_ones(1.0f); 163 | hipExtLaunchKernelGGL( 164 | HIP_KERNEL_NAME(benchmark_func), 166 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, h_ones, 167 | (half2*)cd); 168 | return finalizeEvents_ext(start, stop); 169 | }); 170 | 171 | float kernel_time_mad_int = benchmark([&]() { 172 | initializeEvents_ext(&start, &stop); 173 | hipExtLaunchKernelGGL( 174 | HIP_KERNEL_NAME(benchmark_func), 176 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, 1, (int*)cd); 177 | return finalizeEvents_ext(start, stop); 178 | }); 179 | 180 | printf( 181 | " %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, " 182 | "%8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, " 183 | "%8.3f,%8.2f,%8.2f,%7.2f\n", 184 | compute_iterations, 185 | // SP 186 | ((double)computations) / ((double)memoryoperations * sizeof(float)), 187 | kernel_time_mad_sp, 188 | ((double)computations) / kernel_time_mad_sp * 1000. / 189 | (double)(1000 * 1000 * 1000), 190 | ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / 191 | (1000. * 1000. * 1000.), 192 | // Packed SP 193 | ((double)2 * computations) / ((double)memoryoperations * sizeof(float2)), 194 | kernel_time_mad_sp2, 195 | ((double)2 * computations) / kernel_time_mad_sp2 * 1000. / 196 | (double)(1000 * 1000 * 1000), 197 | ((double)memoryoperations * sizeof(float2)) / kernel_time_mad_sp2 * 198 | 1000. / (1000. * 1000. * 1000.), 199 | // DP 200 | ((double)computations) / ((double)memoryoperations * sizeof(double)), 201 | kernel_time_mad_dp, 202 | ((double)computations) / kernel_time_mad_dp * 1000. / 203 | (double)(1000 * 1000 * 1000), 204 | ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / 205 | (1000. * 1000. * 1000.), 206 | // Packed HP 207 | ((double)2 * computations) / ((double)memoryoperations * sizeof(half2)), 208 | kernel_time_mad_hp, 209 | ((double)2 * computations) / kernel_time_mad_hp * 1000. / 210 | (double)(1000 * 1000 * 1000), 211 | ((double)memoryoperations * sizeof(half2)) / kernel_time_mad_hp * 1000. / 212 | (1000. * 1000. * 1000.), 213 | // Int 214 | ((double)computations) / ((double)memoryoperations * sizeof(int)), 215 | kernel_time_mad_int, 216 | ((double)computations) / kernel_time_mad_int * 1000. / 217 | (double)(1000 * 1000 * 1000), 218 | ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / 219 | (1000. * 1000. 
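      // The launches above rely on hipExtLaunchKernelGGL, which accepts the
      // start/stop events as launch arguments and records them around the kernel;
      // each kernel is run total_bench_iterations (= 3) times through the
      // benchmark<> helper before its time is reported.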
* 1000.)); 220 | } 221 | 222 | extern "C" void mixbenchGPU(double* c, long size) { 223 | const char* benchtype = "compute with global memory (block strided)"; 224 | 225 | printf("Trade-off type: %s\n", benchtype); 226 | printf("Elements per thread: %d\n", ELEMENTS_PER_THREAD); 227 | printf("Thread fusion degree: %d\n", 1); 228 | double* cd; 229 | 230 | HIP_SAFE_CALL(hipMalloc((void**)&cd, size * sizeof(double))); 231 | 232 | // Copy data to device memory 233 | HIP_SAFE_CALL( 234 | hipMemset(cd, 0, size * sizeof(double))); // initialize to zeros 235 | 236 | // Synchronize in order to wait for memory operations to finish 237 | HIP_SAFE_CALL(hipDeviceSynchronize()); 238 | 239 | printf( 240 | "------------------------------------------------------------------------" 241 | "----- CSV data " 242 | "------------------------------------------------------------------------" 243 | "-------------------------------------------\n"); 244 | printf( 245 | "Experiment ID, Single Precision ops,,,, Packed Single " 246 | "Precision ops,,,, Double precision ops,,,, Half " 247 | "precision ops,,,, Integer operations,,, \n"); 248 | printf( 249 | "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, " 250 | "ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, " 251 | "Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, " 252 | "GB/sec\n"); 253 | 254 | runbench_warmup(cd, size); 255 | 256 | runbench<0>(cd, size); 257 | runbench<1>(cd, size); 258 | runbench<2>(cd, size); 259 | runbench<3>(cd, size); 260 | runbench<4>(cd, size); 261 | runbench<5>(cd, size); 262 | runbench<6>(cd, size); 263 | runbench<7>(cd, size); 264 | runbench<8>(cd, size); 265 | runbench<9>(cd, size); 266 | runbench<10>(cd, size); 267 | runbench<11>(cd, size); 268 | runbench<12>(cd, size); 269 | runbench<13>(cd, size); 270 | runbench<14>(cd, size); 271 | runbench<15>(cd, size); 272 | runbench<16>(cd, size); 273 | runbench<17>(cd, size); 274 | runbench<18>(cd, size); 275 | runbench<20>(cd, size); 276 | runbench<22>(cd, size); 277 | runbench<24>(cd, size); 278 | runbench<28>(cd, size); 279 | runbench<32>(cd, size); 280 | runbench<40>(cd, size); 281 | runbench<48>(cd, size); 282 | runbench<56>(cd, size); 283 | runbench<64>(cd, size); 284 | runbench<80>(cd, size); 285 | runbench<96>(cd, size); 286 | runbench<128>(cd, size); 287 | runbench<256>(cd, size); 288 | runbench<512>(cd, size); 289 | 290 | printf( 291 | "------------------------------------------------------------------------" 292 | "------------------------------------------------------------------------" 293 | "----------------------------------------------------------\n"); 294 | 295 | // Copy results back to host memory 296 | HIP_SAFE_CALL(hipMemcpy(c, cd, size * sizeof(double), hipMemcpyDeviceToHost)); 297 | 298 | HIP_SAFE_CALL(hipFree(cd)); 299 | 300 | HIP_SAFE_CALL(hipDeviceReset()); 301 | } 302 | -------------------------------------------------------------------------------- /mixbench-sycl/mix_kernels_sycl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_sycl.cpp: This file is part of the mixbench GPU micro-benchmark suite. 
3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lsyclutil.h" 13 | 14 | #define ELEMENTS_PER_THREAD (8) 15 | #define FUSION_DEGREE (4) 16 | 17 | #ifdef __HIPSYCL__ 18 | #include 19 | #else 20 | using half2 = sycl::half2; 21 | using half = sycl::half; 22 | #endif 23 | 24 | template 25 | struct MADOperator { 26 | T operator()(T a, T b, T c) { return a * b + c; } 27 | }; 28 | 29 | #ifndef __HIPSYCL__ 30 | // Use partial specialization for calling sycl::mad() for generic floating point types 31 | template 32 | struct MADOperator::value>> { 33 | T operator()(T a, T b, T c) { 34 | return sycl::mad(a, b, c); 35 | } 36 | }; 37 | #else 38 | #ifdef SYCL_DEVICE_ONLY 39 | // Packed half precision operation support via ROCm 40 | // 41 | template <> 42 | struct MADOperator { 43 | half2 operator()(half2 a, half2 b, half2 c) { 44 | return __hfma2(a, b, c); 45 | } 46 | }; 47 | #endif 48 | #endif 49 | 50 | template 51 | struct EqualOperator { 52 | bool operator()(T a, T b) { 53 | return a == b; 54 | } 55 | }; 56 | 57 | template <> 58 | struct EqualOperator { 59 | bool operator()(half2 a, half2 b) { 60 | #ifdef __HIPSYCL__ 61 | return __hbeq2(a, b); 62 | #else 63 | return a[0] == b[0] && a[1] == b[1]; 64 | #endif 65 | } 66 | }; 67 | 68 | template 69 | struct FromIntOperator { 70 | T operator()(const int i) { 71 | return static_cast(i); 72 | } 73 | }; 74 | 75 | template <> 76 | struct FromIntOperator { 77 | half2 operator()(const int i) { 78 | #ifdef __HIPSYCL__ 79 | return half2{i,i}; 80 | #else 81 | return sycl::int2{i}.convert(); 82 | #endif 83 | } 84 | }; 85 | 86 | template 87 | void benchmark_func(T seed, T *g_data, sycl::nd_item<1> item_ct1) { 88 | const unsigned int blockSize = item_ct1.get_local_range(0); 89 | const int stride = blockSize; 90 | int idx = item_ct1.get_group(0) * blockSize * granularity + item_ct1.get_local_id(0); 91 | const int big_stride = item_ct1.get_group_range(0) * blockSize * granularity; 92 | /* 93 | #ifdef BLOCK_STRIDED 94 | const int stride = blockSize; 95 | const int idx = get_group_id(0)*blockSize*ELEMENTS_PER_THREAD + get_local_id(0); 96 | #else 97 | const int grid_size = blockSize * get_num_groups(0); 98 | const int stride = grid_size; 99 | const int idx = get_global_id(0); 100 | #endif 101 | const int big_stride = get_num_groups(0)*blockSize*ELEMENTS_PER_THREAD; 102 | */ 103 | // Type specialized functors 104 | MADOperator mad_op; 105 | EqualOperator equal_op; 106 | FromIntOperator from_int_op; 107 | T tmps[granularity]; 108 | for (int k = 0; k < fusion_degree; k++) { 109 | #pragma unroll 110 | for (int j = 0; j < granularity; j++) { 111 | // Load elements (memory intensive part) 112 | tmps[j] = g_data[idx + j * stride + k * big_stride]; 113 | // Perform computations (compute intensive part) 114 | for (int i = 0; i < compute_iterations; i++) { 115 | tmps[j] = mad_op(tmps[j], tmps[j], seed); 116 | } 117 | } 118 | // Multiply add reduction 119 | T sum = from_int_op(0); 120 | //#pragma unroll 121 | for (int j = 0; j < granularity; j += 2) { 122 | sum = mad_op(tmps[j], tmps[j + 1], sum); 123 | } 124 | // Dummy code just to avoid dead code elimination 125 | if (equal_op(sum, from_int_op(-1))) { // Designed so it never executes 126 | g_data[idx + k * big_stride] = sum; 127 | } 128 | } 129 | } 130 | 131 | using time_point = std::chrono::time_point; 132 | 133 | time_point initializeEvents(void) { 134 | return std::chrono::high_resolution_clock::now(); 135 | } 136 | 137 | double 
finalizeEvents(bool use_host_timer, sycl::event ev_krn_execution, const time_point &tp_start_compute) { 138 | ev_krn_execution.wait(); 139 | if (use_host_timer) { 140 | const time_point tp_stop_compute = std::chrono::high_resolution_clock::now(); 141 | return std::chrono::duration(tp_stop_compute - tp_start_compute).count(); 142 | } else { 143 | // Disabled for hipSYCL: error: no matching member function for call to 'get_profiling_info' 144 | return (ev_krn_execution.get_profiling_info() - 145 | ev_krn_execution.get_profiling_info()) / 146 | 1000000.0; 147 | } 148 | } 149 | 150 | void runbench_warmup(sycl::queue &queue, void *cd, long size) { 151 | const long reduced_grid_size = size / (ELEMENTS_PER_THREAD) / 128; 152 | const int BLOCK_SIZE = 256; 153 | const int TOTAL_REDUCED_BLOCKS = reduced_grid_size / BLOCK_SIZE; 154 | 155 | sycl::range<1> dimBlock(BLOCK_SIZE); 156 | sycl::range<1> dimReducedGrid(TOTAL_REDUCED_BLOCKS); 157 | 158 | queue.submit([&](sycl::handler &cgh) { 159 | cgh.parallel_for( 160 | sycl::nd_range<1>(dimReducedGrid * dimBlock, dimBlock), 161 | [=](sycl::nd_item<1> item_ct1) { 162 | benchmark_func( 163 | (short)1, (short *)cd, item_ct1); 164 | }); 165 | }); 166 | 167 | queue.wait(); 168 | } 169 | 170 | // forward declarations of kernel classes 171 | template 172 | class krn_float; 173 | template 174 | class krn_double; 175 | template 176 | class krn_half; 177 | template 178 | class krn_int; 179 | 180 | template 181 | void runbench(sycl::queue &queue, void *cd, long size, bool doHalfs, bool doDoubles, bool use_os_timer, size_t workgroupsize) { 182 | const long compute_grid_size = size / ELEMENTS_PER_THREAD / FUSION_DEGREE; 183 | const int BLOCK_SIZE = workgroupsize; 184 | const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE; 185 | 186 | const sycl::range<1> dimBlock{static_cast(BLOCK_SIZE)}; 187 | const sycl::range<1> dimGrid{static_cast(TOTAL_BLOCKS)}; 188 | 189 | constexpr auto total_bench_iterations = 3; 190 | 191 | // floating point part (single prec) 192 | auto kernel_time_mad_sp = benchmark([&]() { 193 | time_point tp_start_compute = initializeEvents(); 194 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 195 | cgh.parallel_for>( 196 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 197 | [=](sycl::nd_item<1> item_ct1) { 198 | benchmark_func(-1.0f, (float*)cd, item_ct1); 200 | }); 201 | }); 202 | return finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 203 | }); 204 | 205 | // floating point part (double prec) 206 | double kernel_time_mad_dp = 0.; 207 | if (doDoubles) { 208 | kernel_time_mad_dp = benchmark([&]() { 209 | time_point tp_start_compute = initializeEvents(); 210 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 211 | cgh.parallel_for>( 212 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 213 | [=](sycl::nd_item<1> item_ct1) { 214 | benchmark_func(-1.0, reinterpret_cast(cd), item_ct1); 216 | }); 217 | }); 218 | return finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 219 | }); 220 | } 221 | 222 | double kernel_time_mad_hp = 0.; 223 | // floating point part (half prec) 224 | if (doHalfs) { 225 | kernel_time_mad_hp = benchmark([&]() { 226 | time_point tp_start_compute = initializeEvents(); 227 | half2 h_ones{-1.0f, -1.0f}; 228 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 229 | cgh.parallel_for>( 230 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 231 | [=](sycl::nd_item<1> item_ct1) { 232 | benchmark_func( 234 | h_ones, reinterpret_cast(cd), item_ct1); 235 | }); 236 | }); 237 | return 
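            // krn_float / krn_double / krn_half / krn_int above are forward-declared
            // solely to act as explicit kernel names for each instantiation;
            // presumably this is what allows -fsycl-device-code-split=per_kernel
            // (see the SYCL README) to keep kernels for unsupported datatypes out of
            // the selected device image.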
finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 238 | }); 239 | } 240 | 241 | // integer part 242 | auto kernel_time_mad_int = benchmark([&]() { 243 | time_point tp_start_compute = initializeEvents(); 244 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 245 | cgh.parallel_for>( 246 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 247 | [=](sycl::nd_item<1> item_ct1) { 248 | benchmark_func( 250 | -1, (int*)cd, item_ct1); // seed 1 causes unwanted code 251 | // elimination optimization 252 | }); 253 | }); 254 | return finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 255 | }); 256 | 257 | const long long computations = (ELEMENTS_PER_THREAD * (long long)compute_grid_size + (2 * ELEMENTS_PER_THREAD * compute_iterations) * (long long)compute_grid_size) * FUSION_DEGREE; 258 | const long long memoryoperations = size; 259 | 260 | const auto setw = std::setw; 261 | const auto setprecision = std::setprecision; 262 | std::cout << std::fixed << " " << std::setw(4) << compute_iterations 263 | << ", " << setw(8) << setprecision(3) << ((double)computations) / ((double)memoryoperations * sizeof(float)) 264 | << "," << setw(8) << setprecision(2) << kernel_time_mad_sp 265 | << "," << setw(8) << setprecision(2) << ((double)computations) / kernel_time_mad_sp * 1000. / (double)(1000 * 1000 * 1000) 266 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / (1000. * 1000. * 1000.) 267 | 268 | << ", " << setw(8) << setprecision(3) << ((double)computations) / ((double)memoryoperations * sizeof(double)) 269 | << "," << setw(8) << setprecision(2) << kernel_time_mad_dp 270 | << "," << setw(8) << setprecision(2) << ((double)computations) / kernel_time_mad_dp * 1000. / (double)(1000 * 1000 * 1000) 271 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / (1000. * 1000. * 1000.) 272 | 273 | << ", " << setw(8) << setprecision(3) << ((double)2 * computations) / ((double)memoryoperations * sizeof(half2)) 274 | << "," << setw(8) << setprecision(2) << kernel_time_mad_hp 275 | << "," << setw(8) << setprecision(2) << ((double)2 * computations) / kernel_time_mad_hp * 1000. / (double)(1000 * 1000 * 1000) 276 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(half2)) / kernel_time_mad_hp * 1000. / (1000. * 1000. * 1000.) 277 | 278 | << ", " << setw(8) << setprecision(3) << ((double)computations) / ((double)memoryoperations * sizeof(int)) 279 | << "," << setw(8) << setprecision(2) << kernel_time_mad_int 280 | << "," << setw(8) << setprecision(2) << ((double)computations) / kernel_time_mad_int * 1000. / (double)(1000 * 1000 * 1000) 281 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / (1000. * 1000. * 1000.) 
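                 // If fp64 or fp16 is unsupported, the corresponding kernel_time stays
                 // at 0.0, so the derived GFLOPS and GB/sec values in those column
                 // groups should be ignored for such devices.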
282 | 283 | << std::endl; 284 | } 285 | 286 | // Variadic template helper to ease multiple configuration invocations 287 | template 288 | void runbench_range(sycl::queue &queue, void *cd, long size, bool doHalfs, bool doDoubles, bool use_os_timer, size_t workgroupsize) { 289 | runbench(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 290 | } 291 | 292 | template 293 | void runbench_range(sycl::queue &queue, void *cd, long size, bool doHalfs, bool doDoubles, bool use_os_timer, size_t workgroupsize) { 294 | runbench_range(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 295 | runbench_range(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 296 | } 297 | 298 | void mixbenchGPU(const sycl::device &dev, void *c, long size, bool use_os_timer, size_t workgroupsize) { 299 | const sycl::property_list queue_prop_list = use_os_timer ? sycl::property_list{} : sycl::property_list{sycl::property::queue::enable_profiling()}; 300 | sycl::queue queue{dev, queue_prop_list}; 301 | 302 | std::cout << "Elements per thread: " << ELEMENTS_PER_THREAD << std::endl; 303 | std::cout << "Thread fusion degree: " << FUSION_DEGREE << std::endl; 304 | std::cout << "Timer: " << (use_os_timer ? "OS based" : "SYCL event based") << std::endl; 305 | 306 | #ifndef __HIPSYCL__ 307 | const bool doHalfs = dev.has(sycl::aspect::fp16); 308 | if (!doHalfs) { 309 | std::cout << "Warning: Half precision computations are not supported" << std::endl; 310 | } 311 | 312 | const bool doDoubles = dev.has(sycl::aspect::fp64); 313 | if (!doDoubles) { 314 | std::cout << "Warning: Double precision computations are not supported" << std::endl; 315 | } 316 | #else 317 | const bool doHalfs = true; 318 | const bool doDoubles = true; 319 | std::cout << "Warning: hipSYCL - Assuming half and double precision support" << std::endl; 320 | #endif 321 | 322 | double *cd = sycl::malloc_device(size, queue); 323 | 324 | // Initialize data to zeros on device memory 325 | queue.memset(cd, 0, size * sizeof(double)); 326 | 327 | // Synchronize in order to wait for memory operations to finish 328 | queue.wait(); 329 | 330 | std::cout << "----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------" << std::endl; 331 | std::cout << "Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, " << std::endl; 332 | std::cout << "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec" << std::endl; 333 | 334 | runbench_warmup(queue, cd, size); 335 | 336 | runbench_range<0, 1, 2, 3, 4, 5, 6, 7, 8, 337 | 9, 10, 11, 12, 13, 14, 15, 16, 338 | 17, 18, 20, 22, 24, 28, 32, 40, 339 | 48, 56, 64, 80, 96, 128, 192, 256>(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 340 | 341 | std::cout << "--------------------------------------------------------------------------------------------------------------------------------------------------------------------" << std::endl; 342 | 343 | // Copy results to host memory and release device memory 344 | queue.memcpy(c, cd, size * sizeof(double)).wait(); 345 | 346 | sycl::free(cd, queue); 347 | } 348 | -------------------------------------------------------------------------------- /mixbench-opencl/mix_kernels_ocl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * 
mix_kernels_ocl.cpp: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "loclutil.h" 14 | 15 | #if defined(_MSC_VER) 16 | #define SIZE_T_FORMAT "%lu" 17 | #else 18 | #define SIZE_T_FORMAT "%zu" 19 | #endif 20 | 21 | enum KrnDataType { kdt_int, kdt_float, kdt_double, kdt_half }; 22 | 23 | const int compute_iterations[] = { 24 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 25 | 17, 18, 20, 22, 24, 28, 32, 40, 48, 56, 64, 80, 96, 128, 192, 256}; 26 | const int compute_iterations_len = 27 | sizeof(compute_iterations) / sizeof(*compute_iterations); 28 | 29 | #ifdef HF_WORKAROUND 30 | typedef short cl_half2[2]; 31 | #endif 32 | 33 | char* ReadFile(const char* filename) { 34 | char* buffer = NULL; 35 | int file_size, read_size; 36 | FILE* file = fopen(filename, "r"); 37 | if (!file) 38 | return NULL; 39 | // Seek EOF 40 | fseek(file, 0, SEEK_END); 41 | // Get offset 42 | file_size = ftell(file); 43 | rewind(file); 44 | buffer = (char*)malloc(sizeof(char) * (file_size + 1)); 45 | read_size = fread(buffer, sizeof(char), file_size, file); 46 | buffer[file_size] = '\0'; 47 | if (file_size != read_size) { 48 | free(buffer); 49 | buffer = NULL; 50 | } 51 | return buffer; 52 | } 53 | 54 | void flushed_printf(const char* format, ...) { 55 | va_list args; 56 | va_start(args, format); 57 | vprintf(format, args); 58 | va_end(args); 59 | fflush(stdout); 60 | } 61 | 62 | void show_progress_init(int length) { 63 | flushed_printf("["); 64 | for (int i = 0; i < length; i++) 65 | flushed_printf(" "); 66 | flushed_printf("]"); 67 | for (int i = 0; i <= length; i++) 68 | flushed_printf("\b"); 69 | } 70 | 71 | void show_progress_step(int domove, char newchar) { 72 | flushed_printf("%c", newchar); 73 | if (!domove) 74 | flushed_printf("\b"); 75 | } 76 | 77 | void show_progress_done(void) { 78 | flushed_printf("\n"); 79 | } 80 | 81 | double get_event_duration(cl_event ev) { 82 | cl_ulong ev_t_start, ev_t_finish; 83 | OCL_SAFE_CALL(clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, 84 | sizeof(cl_ulong), &ev_t_start, NULL)); 85 | OCL_SAFE_CALL(clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, 86 | sizeof(cl_ulong), &ev_t_finish, NULL)); 87 | double time = (ev_t_finish - ev_t_start) / 1000000.0; 88 | return time; 89 | } 90 | 91 | cl_kernel BuildKernel(cl_context context, 92 | cl_device_id dev_id, 93 | const char* source, 94 | const char* parameters) { 95 | cl_int errno; 96 | const char** sources = &source; 97 | cl_program program = 98 | clCreateProgramWithSource(context, 1, sources, NULL, &errno); 99 | OCL_SAFE_CALL(errno); 100 | errno = clBuildProgram(program, 1, &dev_id, parameters, NULL, NULL); 101 | if (errno != CL_SUCCESS) { 102 | fprintf(stderr, "Program built error code: %d\n", errno); 103 | size_t log_size; 104 | OCL_SAFE_CALL(clGetProgramBuildInfo(program, dev_id, CL_PROGRAM_BUILD_LOG, 105 | 0, NULL, &log_size)); 106 | char* log = (char*)alloca(log_size); 107 | OCL_SAFE_CALL(clGetProgramBuildInfo(program, dev_id, CL_PROGRAM_BUILD_LOG, 108 | log_size, log, NULL)); 109 | OCL_SAFE_CALL(clReleaseProgram(program)); 110 | fprintf(stderr, 111 | "------------------------------------ Kernel compilation log " 112 | "----------------------------------\n"); 113 | fprintf(stderr, "%s", log); 114 | fprintf(stderr, 115 | "------------------------------------------------------------------" 116 | "----------------------------\n"); 117 | exit(EXIT_FAILURE); 
118 | } 119 | // Kernel creation 120 | cl_kernel kernel = clCreateKernel(program, "benchmark_func", &errno); 121 | OCL_SAFE_CALL(errno); 122 | return kernel; 123 | } 124 | 125 | void ReleaseKernelNProgram(cl_kernel kernel) { 126 | cl_program program_tmp; 127 | OCL_SAFE_CALL(clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program_tmp), 128 | &program_tmp, NULL)); 129 | OCL_SAFE_CALL(clReleaseKernel(kernel)); 130 | OCL_SAFE_CALL(clReleaseProgram(program_tmp)); 131 | } 132 | 133 | void runbench_warmup(cl_command_queue queue, 134 | cl_kernel kernel, 135 | cl_mem cbuffer, 136 | long size, 137 | size_t workgroupsize) { 138 | const long reduced_grid_size = size / 256; 139 | 140 | const size_t dimBlock[1] = {workgroupsize}; 141 | const size_t dimReducedGrid[1] = {(size_t)reduced_grid_size}; 142 | 143 | const short seed = 1; 144 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_short), &seed)); 145 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 146 | 147 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimReducedGrid, 148 | dimBlock, 0, NULL, NULL)); 149 | } 150 | 151 | void runbench(const int compute_iterations[], 152 | unsigned int krn_idx, 153 | cl_command_queue queue, 154 | cl_kernel kernels[kdt_double + 1][compute_iterations_len], 155 | cl_mem cbuffer, 156 | long size, 157 | size_t workgroupsize, 158 | unsigned int elements_per_wi, 159 | unsigned int fusion_degree, 160 | bool use_os_timer) { 161 | const long compute_grid_size = size / elements_per_wi / fusion_degree; 162 | const int current_compute_iterations = compute_iterations[krn_idx]; 163 | const long long computations = 164 | (elements_per_wi * (long long)compute_grid_size + 165 | (2 * elements_per_wi * current_compute_iterations) * 166 | (long long)compute_grid_size) * 167 | fusion_degree; 168 | const long long memoryoperations = size; 169 | 170 | const size_t dimBlock[1] = {workgroupsize}; 171 | const size_t dimGrid[1] = {(size_t)compute_grid_size}; 172 | 173 | constexpr auto total_bench_iterations = 3; 174 | 175 | double kernel_time_mad_sp = benchmark<total_bench_iterations>([&]() { 176 | const cl_float seed_f = 1.0f; 177 | cl_kernel kernel = kernels[kdt_float][krn_idx]; 178 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_float), &seed_f)); 179 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 180 | auto ts_start = getTimestamp(); 181 | cl_event event; 182 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 183 | dimBlock, 0, NULL, &event)); 184 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 185 | auto duration = 186 | use_os_timer ? getElapsedtime(ts_start) : get_event_duration(event); 187 | OCL_SAFE_CALL(clReleaseEvent(event)); 188 | return duration; 189 | }); 190 | 191 | double kernel_time_mad_dp = benchmark<total_bench_iterations>([&]() { 192 | const cl_double seed_d = 1.0; 193 | cl_kernel kernel = kernels[kdt_double][krn_idx]; 194 | if (kernel) { 195 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_double), &seed_d)); 196 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 197 | auto ts_start = getTimestamp(); 198 | cl_event event; 199 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 200 | dimBlock, 0, NULL, &event)); 201 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 202 | auto duration = 203 | use_os_timer ? 
getElapsedtime(ts_start) : get_event_duration(event); 204 | OCL_SAFE_CALL(clReleaseEvent(event)); 205 | return duration; 206 | } else 207 | return 0.0; 208 | }); 209 | 210 | double kernel_time_mad_hp = benchmark<total_bench_iterations>([&]() { 211 | const cl_half2 seed_h = {15360, 15360}; // {1.0, 1.0} 212 | cl_kernel kernel = kernels[kdt_half][krn_idx]; 213 | if (kernel) { 214 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_half2), &seed_h)); 215 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 216 | auto ts_start = getTimestamp(); 217 | cl_event event; 218 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 219 | dimBlock, 0, NULL, &event)); 220 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 221 | auto duration = 222 | use_os_timer ? getElapsedtime(ts_start) : get_event_duration(event); 223 | OCL_SAFE_CALL(clReleaseEvent(event)); 224 | return duration; 225 | } else 226 | return 0.0; 227 | }); 228 | 229 | double kernel_time_mad_int = benchmark<total_bench_iterations>([&]() { 230 | const cl_int seed_i = static_cast<cl_int>(1.0); 231 | cl_kernel kernel = kernels[kdt_int][krn_idx]; 232 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_int), &seed_i)); 233 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 234 | auto ts_start = getTimestamp(); 235 | cl_event event; 236 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 237 | dimBlock, 0, NULL, &event)); 238 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 239 | auto duration = 240 | use_os_timer ? getElapsedtime(ts_start) : get_event_duration(event); 241 | OCL_SAFE_CALL(clReleaseEvent(event)); 242 | return duration; 243 | }); 244 | 245 | printf( 246 | " %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, " 247 | "%8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", 248 | current_compute_iterations, 249 | ((double)computations) / ((double)memoryoperations * sizeof(float)), 250 | kernel_time_mad_sp, 251 | ((double)computations) / kernel_time_mad_sp * 1000. / 252 | (double)(1000 * 1000 * 1000), 253 | ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / 254 | (1000. * 1000. * 1000.), 255 | ((double)computations) / ((double)memoryoperations * sizeof(double)), 256 | kernel_time_mad_dp, 257 | ((double)computations) / kernel_time_mad_dp * 1000. / 258 | (double)(1000 * 1000 * 1000), 259 | ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / 260 | (1000. * 1000. * 1000.), 261 | ((double)2 * computations) / 262 | ((double)memoryoperations * sizeof(cl_half2)), 263 | kernel_time_mad_hp, 264 | ((double)2 * computations) / kernel_time_mad_hp * 1000. / 265 | (double)(1000 * 1000 * 1000), 266 | ((double)memoryoperations * sizeof(cl_half2)) / kernel_time_mad_hp * 267 | 1000. / (1000. * 1000. * 1000.), 268 | ((double)computations) / ((double)memoryoperations * sizeof(int)), 269 | kernel_time_mad_int, 270 | ((double)computations) / kernel_time_mad_int * 1000. / 271 | (double)(1000 * 1000 * 1000), 272 | ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / 273 | (1000. * 1000. 
* 1000.)); 274 | } 275 | 276 | extern "C" void mixbenchGPU(cl_device_id dev_id, 277 | double* c, 278 | long size, 279 | bool block_strided, 280 | bool host_allocated, 281 | bool use_os_timer, 282 | size_t workgroupsize, 283 | unsigned int elements_per_wi, 284 | unsigned int fusion_degree) { 285 | const char* benchtype; 286 | if (block_strided) 287 | benchtype = "Workgroup"; 288 | else 289 | benchtype = "NDRange"; 290 | printf("Workitem stride: %s\n", benchtype); 291 | const char* buffer_allocation = 292 | host_allocated ? "Host allocated" : "Device allocated"; 293 | printf("Buffer allocation: %s\n", buffer_allocation); 294 | printf("Timer: %s\n", 295 | use_os_timer ? "OS based" : "CL event based"); 296 | 297 | // Set context properties 298 | cl_platform_id p_id; 299 | OCL_SAFE_CALL( 300 | clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL)); 301 | size_t length; 302 | OCL_SAFE_CALL( 303 | clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length)); 304 | char* extensions = (char*)alloca(length); 305 | OCL_SAFE_CALL( 306 | clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL)); 307 | bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL; 308 | if (!enable_dp) 309 | printf( 310 | "Warning: Double precision computations are not " 311 | "supported\n"); 312 | bool enable_hp = strstr(extensions, "cl_khr_fp16") != NULL; 313 | if (!enable_hp) 314 | printf( 315 | "Warning: Half precision computations are not " 316 | "supported\n"); 317 | 318 | cl_context_properties ctxProps[] = {CL_CONTEXT_PLATFORM, 319 | (cl_context_properties)p_id, 0}; 320 | 321 | cl_int errno; 322 | // Create context 323 | cl_context context = 324 | clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &errno); 325 | OCL_SAFE_CALL(errno); 326 | 327 | cl_mem_flags buf_flags = CL_MEM_READ_WRITE; 328 | if (host_allocated) 329 | buf_flags |= CL_MEM_ALLOC_HOST_PTR; 330 | cl_mem c_buffer = 331 | clCreateBuffer(context, buf_flags, size * sizeof(double), NULL, &errno); 332 | OCL_SAFE_CALL(errno); 333 | 334 | // Create command queue 335 | cl_command_queue cmd_queue = clCreateCommandQueue( 336 | context, dev_id, use_os_timer ? 0 : CL_QUEUE_PROFILING_ENABLE, &errno); 337 | OCL_SAFE_CALL(errno); 338 | 339 | // Set data on device memory 340 | cl_int* mapped_data = 341 | (cl_int*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, 342 | size * sizeof(double), 0, NULL, NULL, &errno); 343 | OCL_SAFE_CALL(errno); 344 | for (int i = 0; i < size; i++) 345 | mapped_data[i] = 0; 346 | clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL); 347 | 348 | // Load source, create program and all kernels 349 | printf("Loading kernel source file...\n"); 350 | const char c_param_format_str[] = 351 | "-cl-std=CL1.2 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT 352 | " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s " 353 | "%s"; 354 | const char* c_empty = ""; 355 | const char* c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty; 356 | const char *c_enable_dp = "-DENABLE_DP", *c_enable_hp = "-DENABLE_HP"; 357 | char c_build_params[256]; 358 | const char* c_kernel_source = {ReadFile("mix_kernels.cl")}; 359 | printf("Precompilation of kernels... 
"); 360 | sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, 361 | c_striding, c_empty); 362 | 363 | cl_kernel kernel_warmup = 364 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 365 | 366 | show_progress_init(compute_iterations_len); 367 | cl_kernel kernels[kdt_half + 1][compute_iterations_len]; 368 | for (int i = 0; i < compute_iterations_len; i++) { 369 | show_progress_step(0, '\\'); 370 | sprintf(c_build_params, c_param_format_str, "float", workgroupsize, 371 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 372 | c_empty); 373 | // printf("%s\n",c_build_params); 374 | kernels[kdt_float][i] = 375 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 376 | 377 | show_progress_step(0, '|'); 378 | sprintf(c_build_params, c_param_format_str, "int", workgroupsize, 379 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 380 | c_empty); 381 | // printf("%s\n",c_build_params); 382 | kernels[kdt_int][i] = 383 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 384 | 385 | if (enable_dp) { 386 | show_progress_step(0, '/'); 387 | sprintf(c_build_params, c_param_format_str, "double", workgroupsize, 388 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 389 | c_enable_dp); 390 | // printf("%s\n",c_build_params); 391 | kernels[kdt_double][i] = 392 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 393 | } else 394 | kernels[kdt_double][i] = 0; 395 | 396 | if (enable_hp) { 397 | show_progress_step(0, '-'); 398 | sprintf(c_build_params, c_param_format_str, "half2", workgroupsize, 399 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 400 | c_enable_hp); 401 | kernels[kdt_half][i] = 402 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 403 | } else 404 | kernels[kdt_half][i] = 0; 405 | 406 | show_progress_step(1, '>'); 407 | } 408 | show_progress_done(); 409 | free((char*)c_kernel_source); 410 | 411 | runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize); 412 | 413 | // Synchronize in order to wait for memory operations to finish 414 | OCL_SAFE_CALL(clFinish(cmd_queue)); 415 | 416 | printf( 417 | "------------------------------------------------------------------------" 418 | "----- CSV data " 419 | "------------------------------------------------------------------------" 420 | "-----\n"); 421 | printf( 422 | "Experiment ID, Single Precision ops,,,, Double precision " 423 | "ops,,,, Half precision ops,,,, Integer " 424 | "operations,,, \n"); 425 | printf( 426 | "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, " 427 | "ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, " 428 | "Iops/byte, ex.time, GIOPS, GB/sec\n"); 429 | 430 | for (int i = 0; i < compute_iterations_len; i++) 431 | runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, 432 | workgroupsize, elements_per_wi, fusion_degree, use_os_timer); 433 | 434 | printf( 435 | "------------------------------------------------------------------------" 436 | "------------------------------------------------------------------------" 437 | "--------------------\n"); 438 | 439 | // Copy results back to host memory 440 | OCL_SAFE_CALL(clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, 441 | size * sizeof(double), c, 0, NULL, NULL)); 442 | 443 | // Release kernels and program 444 | ReleaseKernelNProgram(kernel_warmup); 445 | for (int i = 0; i < compute_iterations_len; i++) { 446 | ReleaseKernelNProgram(kernels[kdt_float][i]); 447 | 
ReleaseKernelNProgram(kernels[kdt_int][i]); 448 | if (enable_dp) 449 | ReleaseKernelNProgram(kernels[kdt_double][i]); 450 | if (enable_hp) 451 | ReleaseKernelNProgram(kernels[kdt_half][i]); 452 | } 453 | 454 | // Release buffer 455 | OCL_SAFE_CALL(clReleaseMemObject(c_buffer)); 456 | OCL_SAFE_CALL(clReleaseCommandQueue(cmd_queue)); 457 | OCL_SAFE_CALL(clReleaseContext(context)); 458 | } 459 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. 
If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. 
Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | --------------------------------------------------------------------------------