├── mixbench-opencl ├── check-half2-def.cpp ├── mix_kernels_ocl.h ├── README.md ├── CMakeLists.txt ├── mix_kernels.cl ├── main-ocl.cpp ├── loclutil.h └── mix_kernels_ocl.cpp ├── mixbench-cuda ├── mix_kernels_cuda.h ├── CMakeLists.txt ├── main-cuda.cpp ├── README.md ├── lcutil.h └── mix_kernels_cuda.cu ├── mixbench-hip ├── mix_kernels_hip.h ├── main-hip.cpp ├── CMakeLists.txt ├── lhiputil.h ├── README.md └── mix_kernels_hip.cpp ├── .gitattributes ├── mixbench-cpu ├── mix_kernels_cpu.h ├── Dockerfile ├── README.md ├── CMakeLists.txt ├── main.cpp └── mix_kernels_cpu.cpp ├── mixbench-sycl ├── mix_kernels_sycl.h ├── CMakeLists.txt ├── lsyclutil.h ├── main-sycl.cpp ├── README.md └── mix_kernels_sycl.cpp ├── include ├── common.h └── timestamp.h ├── .gitignore ├── .clang-format ├── README.md └── LICENSE /mixbench-opencl/check-half2-def.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(int argc, char* argv[]) { 4 | cl_half2 dummy; 5 | } 6 | -------------------------------------------------------------------------------- /mixbench-cuda/mix_kernels_cuda.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cuda.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #pragma once 8 | 9 | extern "C" void mixbenchGPU(double*, long size); 10 | 11 | -------------------------------------------------------------------------------- /mixbench-hip/mix_kernels_hip.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_hip.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #pragma once 8 | 9 | extern "C" void mixbenchGPU(double*, long size); 10 | 11 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, in case people don't have core.autocrlf set. 2 | * text=auto 3 | 4 | # Explicitly declare text files you want to always be normalized and converted 5 | # to native line endings on checkout. 6 | *.c text 7 | *.cpp text 8 | *.cu text 9 | *.h text 10 | *.cl text 11 | Makefile text 12 | -------------------------------------------------------------------------------- /mixbench-cpu/mix_kernels_cpu.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cpu.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _MIX_KERNELS_CPU_H_ 8 | #define _MIX_KERNELS_CPU_H_ 9 | 10 | void mixbenchCPU(double*, size_t); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /mixbench-sycl/mix_kernels_sycl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_sycl.h: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 
4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #ifndef _MIX_KERNELS_SYCL_H_ 9 | #define _MIX_KERNELS_SYCL_H_ 10 | 11 | void mixbenchGPU(const sycl::device&, void*, long, bool, size_t); 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /mixbench-cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:rolling 2 | 3 | RUN apt-get update \ 4 | && apt-get install -y \ 5 | g++ \ 6 | cmake 7 | 8 | ADD . /root/mixbench-cpu 9 | 10 | WORKDIR /root/mixbench-cpu 11 | 12 | RUN cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS="-march=native -funroll-loops" -B build-docker ./ 13 | RUN cmake --build build-docker 14 | 15 | CMD /root/mixbench-cpu/build-docker/mixbench-cpu 16 | -------------------------------------------------------------------------------- /mixbench-opencl/mix_kernels_ocl.h: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_ocl_ro.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #pragma once 8 | 9 | #ifdef __APPLE__ 10 | # include 11 | #else 12 | # include 13 | #endif 14 | 15 | extern "C" void mixbenchGPU(cl_device_id, double*, long, bool, bool, bool, size_t, unsigned int, unsigned int); 16 | 17 | -------------------------------------------------------------------------------- /include/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | auto benchmark(Op op) { 8 | auto duration = op(); // drop first measurement 9 | std::vector measurements; 10 | for (int i = 1; i < total_runs; i++) { 11 | duration = op(); 12 | measurements.push_back(duration); 13 | } 14 | return *std::min_element(std::begin(measurements), std::end(measurements)); 15 | } 16 | -------------------------------------------------------------------------------- /mixbench-opencl/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-opencl 2 | 3 | This is the OpenCL implementation of mixbench. 4 | 5 | ## Building notes 6 | 7 | Occasionally, (depending on the CMake version) the OpenCL files might not be 8 | discovered automatically. 9 | In such cases you might need to provide the OpenCL directories explicitly, 10 | as seen in the examples below: 11 | 12 | ``` 13 | cmake ../mixbench-opencl -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ 14 | cmake ../mixbench-opencl -DOpenCL_LIBRARY=/opt/rocm/lib/libOpenCL.so -DOpenCL_INCLUDE_DIR=/opt/rocm/opencl/include/ 15 | cmake ../mixbench-opencl -DOpenCL_LIBRARY=/opt/amdgpu-pro/lib/x86_64-linux-gnu/libOpenCL.so 16 | ``` 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Object files 2 | *.o 3 | *.ko 4 | *.obj 5 | *.elf 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Libraries 12 | *.lib 13 | *.a 14 | *.la 15 | *.lo 16 | 17 | # Shared objects (inc. 
Windows DLLs) 18 | *.dll 19 | *.so 20 | *.so.* 21 | *.dylib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | *.app 27 | *.i*86 28 | *.x86_64 29 | *.hex 30 | 31 | # Specific executables 32 | mixbench-cuda 33 | mixbench-hip 34 | mixbench-ocl 35 | mixbench-sycl 36 | mixbench-cpu 37 | 38 | # But not the code itself 39 | !mixbench-cuda/ 40 | !mixbench-hip/ 41 | !mixbench-ocl/ 42 | !mixbench-sycl/ 43 | !mixbench-cpu/ 44 | 45 | # Debug files 46 | *.dSYM/ 47 | 48 | # Build folders 49 | build*/ 50 | mixbench-cuda/build*/ 51 | mixbench-hip/build*/ 52 | mixbench-opencl/build*/ 53 | mixbench-sycl/build*/ 54 | mixbench-cpu/build*/ 55 | 56 | # Other 57 | version_info.h 58 | -------------------------------------------------------------------------------- /mixbench-cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18 FATAL_ERROR) 2 | project(mixbench LANGUAGES CXX CUDA) 3 | 4 | # Include CUDA header directory in cpp files 5 | include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) 6 | 7 | string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") 8 | string(APPEND CMAKE_CUDA_FLAGS " -Wno-deprecated-gpu-targets") 9 | string(APPEND CMAKE_CUDA_FLAGS " --cudart=static") 10 | 11 | # Get version info from git tag 12 | execute_process(COMMAND git describe --tags --always 13 | OUTPUT_VARIABLE GIT_REV 14 | ERROR_QUIET) 15 | 16 | if ("${GIT_REV}" STREQUAL "") 17 | set(GIT_REV "") 18 | endif() 19 | string(STRIP "${GIT_REV}" GIT_REV) 20 | file(WRITE "version_info.h" "#define VERSION_INFO \"") 21 | file(APPEND "version_info.h" ${GIT_REV}) 22 | file(APPEND "version_info.h" "\"") 23 | 24 | add_executable(mixbench-cuda main-cuda.cpp mix_kernels_cuda.h lcutil.h version_info.h mix_kernels_cuda.cu mix_kernels_cuda.h lcutil.h) 25 | 26 | target_compile_features(mixbench-cuda PUBLIC cxx_std_11) 27 | 28 | -------------------------------------------------------------------------------- /mixbench-cuda/main-cuda.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-cuda.cpp: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lcutil.h" 13 | #include "mix_kernels_cuda.h" 14 | #include "version_info.h" 15 | 16 | #define VECTOR_SIZE (32 * 1024 * 1024) 17 | 18 | int main(int argc, char* argv[]) { 19 | printf("mixbench (%s)\n", VERSION_INFO); 20 | 21 | unsigned int datasize = VECTOR_SIZE * sizeof(double); 22 | 23 | cudaSetDevice(0); 24 | StoreDeviceInfo(stdout); 25 | 26 | size_t freeCUDAMem, totalCUDAMem; 27 | cudaMemGetInfo(&freeCUDAMem, &totalCUDAMem); 28 | printf("Total GPU memory %lu, free %lu\n", totalCUDAMem, freeCUDAMem); 29 | printf("Buffer size: %dMB\n", datasize / (1024 * 1024)); 30 | 31 | double* c; 32 | c = (double*)malloc(datasize); 33 | 34 | mixbenchGPU(c, VECTOR_SIZE); 35 | 36 | free(c); 37 | 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /mixbench-cuda/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-cuda 2 | 3 | This is the CUDA implementation of mixbench. 4 | It is actually the original implementation of this benchmark. 5 | 6 | ## Building 7 | 8 | To build the executable, run the following commands. 9 | 10 | > The minimum required CMake version is 3.18. 
11 | 12 | ```sh 13 | mkdir build 14 | cd build 15 | cmake ../mixbench-cuda -DCMAKE_CUDA_ARCHITECTURES=native 16 | cmake --build ./ 17 | ``` 18 | 19 | This will build and write a `mixbench-cuda` executable file in the `build/` 20 | directory, compiled with support for the native CUDA architecture. Note that 21 | the `-arch=native` flag was [introduced in CUDA 11.5 update 1][1]. If you 22 | are using a prior version, or wish to compile the program for a specific 23 | architecture, replace `native` in the above command with the architecture. 24 | For example, to compile for the `sm_120` architecture, we would run: 25 | 26 | ``` 27 | mkdir build 28 | cd build 29 | cmake ../mixbench-cuda -DCMAKE_CUDA_ARCHITECTURES=sm_120 30 | cmake --build ./ 31 | ``` 32 | 33 | [1]: https://docs.nvidia.com/cuda/cuda-features-archive/index.html#compiler 34 | -------------------------------------------------------------------------------- /include/timestamp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __linux__ 4 | 5 | #include 6 | #include 7 | #ifndef CLOCK_MONOTONIC_RAW 8 | #include 9 | 10 | typedef struct timeval timestamp; 11 | inline timestamp getTimestamp(void){ 12 | struct timeval t; 13 | gettimeofday(&t, NULL); 14 | return t; 15 | } 16 | inline float getElapsedtime(timestamp t){ 17 | struct timeval tn; 18 | gettimeofday(&tn, NULL); 19 | return (tn.tv_sec - t.tv_sec) * 1000.0f + (tn.tv_usec - t.tv_usec) / 1000.0f; 20 | } 21 | #else 22 | typedef struct timespec timestamp; 23 | inline timestamp getTimestamp(void){ 24 | struct timespec t; 25 | clock_gettime(CLOCK_MONOTONIC_RAW, &t); 26 | return t; 27 | } 28 | inline double getElapsedtime(timestamp t){ 29 | struct timespec tn; 30 | clock_gettime(CLOCK_MONOTONIC_RAW, &tn); 31 | return (double)(tn.tv_sec - t.tv_sec) * 1000.0 + (tn.tv_nsec - t.tv_nsec) / 1000000.0; 32 | } 33 | #endif 34 | 35 | #else 36 | 37 | #include 38 | 39 | typedef clock_t timestamp; 40 | inline timestamp getTimestamp(void){ 41 | return clock(); 42 | } 43 | inline double getElapsedtime(timestamp t){ 44 | return ((double)clock()-t) / CLOCKS_PER_SEC * 1000.0; 45 | } 46 | 47 | #endif 48 | 49 | -------------------------------------------------------------------------------- /mixbench-sycl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # required cmake version 2 | cmake_minimum_required(VERSION 3.5) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 6 | 7 | project(mixbench-sycl LANGUAGES CXX) 8 | 9 | include_directories("../include") 10 | 11 | # Set default build type to RelWithDebInfo if not specified 12 | if (NOT CMAKE_BUILD_TYPE) 13 | message (STATUS "Default CMAKE_BUILD_TYPE not set. 
Using Release with Debug Info") 14 | set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE 15 | STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" 16 | FORCE) 17 | endif() 18 | 19 | # Get version info from git tag 20 | execute_process(COMMAND git describe --tags --always 21 | OUTPUT_VARIABLE GIT_REV 22 | ERROR_QUIET) 23 | 24 | if ("${GIT_REV}" STREQUAL "") 25 | set(GIT_REV "") 26 | endif() 27 | string(STRIP "${GIT_REV}" GIT_REV) 28 | file(WRITE "version_info.h" "#define VERSION_INFO \"") 29 | file(APPEND "version_info.h" ${GIT_REV}) 30 | file(APPEND "version_info.h" "\"") 31 | 32 | add_executable(mixbench-sycl main-sycl.cpp lsyclutil.h mix_kernels_sycl.cpp) 33 | 34 | set_target_properties(mixbench-sycl PROPERTIES CXX_EXTENSIONS OFF) 35 | -------------------------------------------------------------------------------- /mixbench-hip/main-hip.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-hip.cpp: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "lhiputil.h" 12 | #include "mix_kernels_hip.h" 13 | #include "version_info.h" 14 | 15 | #define VECTOR_SIZE (32 * 1024 * 1024) 16 | 17 | void init_vector(double* v, size_t datasize) { 18 | for (int i = 0; i < (int)datasize; i++) 19 | v[i] = i; 20 | } 21 | 22 | int main(int argc, char* argv[]) { 23 | printf("mixbench-hip (%s)\n", VERSION_INFO); 24 | 25 | unsigned int datasize = VECTOR_SIZE * sizeof(double); 26 | 27 | HIP_SAFE_CALL(hipSetDevice(0)); 28 | StoreDeviceInfo(stdout); 29 | 30 | size_t freeCUDAMem, totalCUDAMem; 31 | HIP_SAFE_CALL(hipMemGetInfo(&freeCUDAMem, &totalCUDAMem)); 32 | printf("Total GPU memory %lu, free %lu\n", totalCUDAMem, freeCUDAMem); 33 | printf("Buffer size: %dMB\n", datasize / (1024 * 1024)); 34 | 35 | double* c; 36 | c = (double*)malloc(datasize); 37 | init_vector(c, VECTOR_SIZE); 38 | 39 | mixbenchGPU(c, VECTOR_SIZE); 40 | 41 | free(c); 42 | 43 | return 0; 44 | } 45 | -------------------------------------------------------------------------------- /mixbench-opencl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7 FATAL_ERROR) 2 | project(mixbench-ocl LANGUAGES CXX) 3 | 4 | find_package(OpenCL REQUIRED) 5 | 6 | include_directories(${OpenCL_INCLUDE_DIR} "../include") 7 | 8 | # Get version info from git tag 9 | execute_process(COMMAND git describe --tags --always 10 | OUTPUT_VARIABLE GIT_REV 11 | ERROR_QUIET) 12 | 13 | # Store version info 14 | if ("${GIT_REV}" STREQUAL "") 15 | set(GIT_REV "") 16 | endif() 17 | string(STRIP "${GIT_REV}" GIT_REV) 18 | file(WRITE "version_info.h" "#define VERSION_INFO \"") 19 | file(APPEND "version_info.h" ${GIT_REV}) 20 | file(APPEND "version_info.h" "\"") 21 | 22 | add_executable(mixbench-ocl main-ocl.cpp loclutil.h mix_kernels_ocl.cpp mix_kernels_ocl.h loclutil.h) 23 | file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/mix_kernels.cl 24 | DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) 25 | 26 | try_compile(HAVE_HALF2 ${CMAKE_BINARY_DIR} 27 | ${CMAKE_SOURCE_DIR}/check-half2-def.cpp 28 | CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${OpenCL_INCLUDE_DIR}") 29 | 30 | if(NOT ${HAVE_HALF2}) 31 | message( "cl_half2 workaround path" ) 32 | target_compile_definitions(mixbench-ocl PRIVATE HF_WORKAROUND) 33 | endif() 34 | 35 | target_link_libraries(mixbench-ocl ${OpenCL_LIBRARY}) 36 | 
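# Note (see mixbench-opencl/README.md): if find_package(OpenCL) cannot locate the
# SDK automatically, the library and include paths can be passed explicitly at
# configure time, for example:
#   cmake ../mixbench-opencl \
#         -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so \
#         -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/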
-------------------------------------------------------------------------------- /mixbench-cpu/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-cpu 2 | 3 | This is the OpenMP implementation of mixbench, targeted to CPUs. 4 | Theoretically, it could also target GPU accelerators but it has been developed 5 | with the CPUs in mind. 6 | In particular, it has been tailored for GCC compiler (see below for more info). 7 | 8 | ## Running in docker 9 | 10 | The easiest way to run CPU version is by docker: 11 | `docker run --rm elkondis/mixbench-cpu` 12 | 13 | This docker image re-compiles by tuning on your CPU architecture and executes the 14 | benchmark. 15 | 16 | ## Notes 17 | 18 | `mixbench-cpu` has been developed with `g++` (`gcc`) in mind. 19 | As such, it has been validated on the particular compiler that it vectorizes and properly 20 | unrolls the vectorized instructions as intended, in order to approach peak performance. 21 | `clang` on the other hand, at the time of development, has been observed that it does not 22 | properly produce optimum machine instruction sequences. 23 | The nature of computations for loop iteration in this benchmark is inherently sequential. 24 | So, it is essential that the compiler adequatelly unrolls the loop in the generated code 25 | so the CPU does not stall due to dependencies. 26 | 27 | ## Building notes 28 | 29 | The proper flags passed to the compiler (`-fopenmp -march=native -funroll-loops`) is taken care 30 | by the CMakeLists script. 31 | Thus, a simple cmake build invocation should be enough. 32 | -------------------------------------------------------------------------------- /mixbench-opencl/mix_kernels.cl: -------------------------------------------------------------------------------- 1 | #ifdef ENABLE_DP 2 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 3 | #endif 4 | #ifdef ENABLE_HP 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 6 | #endif 7 | 8 | bool is_equal(const class_T a, const class_T b){ 9 | #ifdef ENABLE_HP 10 | return a.x==b.x && a.y==b.y; 11 | #else 12 | return a==b; 13 | #endif 14 | } 15 | 16 | __kernel __attribute__((reqd_work_group_size(blockdim, 1, 1))) 17 | void benchmark_func(class_T seed, global class_T *g_data){ 18 | const unsigned int blockSize = blockdim; 19 | #ifdef BLOCK_STRIDED 20 | const int stride = blockSize; 21 | const int idx = get_group_id(0)*blockSize*ELEMENTS_PER_THREAD + get_local_id(0); 22 | #else 23 | const int grid_size = blockSize * get_num_groups(0); 24 | const int stride = grid_size; 25 | const int idx = get_global_id(0); 26 | #endif 27 | const int big_stride = get_num_groups(0)*blockSize*ELEMENTS_PER_THREAD; 28 | 29 | class_T tmps[ELEMENTS_PER_THREAD]; 30 | for(int k=0; k 5 | **/ 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include "mix_kernels_cpu.h" 15 | #include "version_info.h" 16 | 17 | constexpr auto DEF_VECTOR_SIZE_PER_THREAD = 4 * 1024 * 1024; 18 | 19 | using ArgParams = struct { unsigned int vecwidth; }; 20 | 21 | // Argument parsing 22 | // returns whether program execution should continue (true) or just print help 23 | // output (false) 24 | bool argument_parsing(int argc, char* argv[], ArgParams* output) { 25 | int arg_count = 0; 26 | for (int i = 1; i < argc; i++) { 27 | if ((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0)) { 28 | return false; 29 | } else { 30 | unsigned long value = strtoul(argv[i], NULL, 10); 31 | switch (arg_count) { 32 | // device selection 33 | case 0: 
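// Note: unlike the GPU variants, the single positional argument here is the
// working array size in units of 1024*1024 elements, not a device index.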
34 | output->vecwidth = value; 35 | arg_count++; 36 | break; 37 | default: 38 | return false; 39 | } 40 | } 41 | } 42 | return true; 43 | } 44 | 45 | int main(int argc, char* argv[]) { 46 | std::cout << "mixbench-cpu (" << VERSION_INFO << ")" << std::endl; 47 | 48 | const auto hardware_concurrency = omp_get_max_threads(); 49 | 50 | ArgParams args{static_cast( 51 | hardware_concurrency * DEF_VECTOR_SIZE_PER_THREAD / (1024 * 1024))}; 52 | 53 | if (!argument_parsing(argc, argv, &args)) { 54 | std::cout << "Usage: mixbench-cpu [options] [array size(1024^2)]" 55 | << std::endl 56 | << std::endl 57 | << "Options:" << std::endl 58 | << " -h or --help Show this message" << std::endl; 59 | 60 | exit(1); 61 | } 62 | 63 | std::cout << "Use \"-h\" argument to see available options" << std::endl; 64 | 65 | const size_t VEC_WIDTH = 1024 * 1024 * args.vecwidth; 66 | 67 | std::unique_ptr c; 68 | 69 | c.reset(new (std::align_val_t(64)) double[VEC_WIDTH]); 70 | 71 | std::cout << "Working memory size: " << args.vecwidth * sizeof(double) << "MB" 72 | << std::endl; 73 | std::cout << "Total threads: " << hardware_concurrency << std::endl; 74 | 75 | mixbenchCPU(c.get(), VEC_WIDTH); 76 | 77 | return 0; 78 | } 79 | -------------------------------------------------------------------------------- /mixbench-sycl/lsyclutil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * lsyclutil.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _CUTIL_H_ 8 | #define _CUTIL_H_ 9 | 10 | #include 11 | #include 12 | 13 | using namespace cl; 14 | 15 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 16 | 17 | 18 | // Print basic device information 19 | inline void StoreDeviceInfo(const sycl::device &device){ 20 | auto platform = device.get_platform(); 21 | try{ 22 | auto platform_name = platform.get_info(); 23 | auto device_name = device.get_info(); 24 | auto vendor_name = device.get_info(); 25 | auto device_drv = device.get_info(); 26 | 27 | auto device_addrbits = device.get_info(); 28 | auto device_freq = device.get_info(); 29 | auto device_gmem = device.get_info(); 30 | auto device_maxalloc = device.get_info(); 31 | auto device_syclver = device.get_info(); 32 | auto device_CUs = device.get_info(); 33 | 34 | std::cout << "------------------------ Device specifications ------------------------" << std::endl; 35 | std::cout << "Platform: " << platform_name << std::endl; 36 | std::cout << "Device: " << device_name << '/' << vendor_name << std::endl; 37 | std::cout << "Driver version: " << device_drv << std::endl; 38 | std::cout << "Address bits: " << device_addrbits << std::endl; 39 | std::cout << "GPU clock rate: " << device_freq << " MHz" << std::endl; 40 | std::cout << "Total global mem: " << device_gmem/1024/1024 << " MB" << std::endl; 41 | std::cout << "Max allowed buffer: " << device_maxalloc/1024/1024 << " MB" << std::endl; 42 | std::cout << "SYCL version: " << device_syclver << std::endl; 43 | std::cout << "Total CUs: " << device_CUs << std::endl; 44 | std::cout << "-----------------------------------------------------------------------" << std::endl; 45 | } 46 | catch (sycl::exception const &exc) { 47 | std::cerr << "Could not get full device info: "; 48 | std::cerr << exc.what() << std::endl; 49 | } 50 | } 51 | 52 | inline size_t GetMaxDeviceWGSize(const sycl::device &device){ 53 | auto wgsize = device.get_info(); 54 | return wgsize; 55 | } 56 | 57 | 58 | 
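// Usage sketch (cf. main-sycl.cpp): the helpers above are intended to be called
// right after device selection, e.g.:
//   sycl::device device = sycl::device::get_devices().at(device_index - 1);
//   StoreDeviceInfo(device);                           // print device specifications
//   const size_t max_wg = GetMaxDeviceWGSize(device);  // bound for the workgroup-size argument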
#endif 59 | -------------------------------------------------------------------------------- /mixbench-sycl/main-sycl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-sycl.cpp: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include "lsyclutil.h" 10 | #include "mix_kernels_sycl.h" 11 | #include "version_info.h" 12 | 13 | #define DEF_VECTOR_SIZE (32*1024*1024) 14 | 15 | typedef struct{ 16 | int device_index; 17 | bool use_os_timer; 18 | int wg_size; 19 | unsigned int vecwidth; 20 | } ArgParams; 21 | 22 | // Argument parsing 23 | // returns whether program execution should continue (true) or just print help output (false) 24 | bool argument_parsing(int argc, char* argv[], ArgParams *output){ 25 | int arg_count = 0; 26 | for(int i=1; iuse_os_timer = true; 31 | } else { 32 | unsigned long value = strtoul(argv[i], NULL, 10); 33 | switch( arg_count ){ 34 | // device selection 35 | case 0: 36 | output->device_index = value; 37 | arg_count++; 38 | break; 39 | // workgroup size 40 | case 1: 41 | output->wg_size = value; 42 | arg_count++; 43 | break; 44 | // array size (x1024^2) 45 | case 2: 46 | output->vecwidth = value; 47 | arg_count++; 48 | break; 49 | default: 50 | return false; 51 | } 52 | } 53 | } 54 | return true; 55 | } 56 | 57 | int main(int argc, char* argv[]) { 58 | std::cout << "mixbench-sycl (" << VERSION_INFO << ")" << std::endl; 59 | 60 | ArgParams args{1, false, 256, DEF_VECTOR_SIZE/(1024*1024)}; 61 | 62 | if (!argument_parsing(argc, argv, &args)) { 63 | std::cout << "Usage: mixbench-sycl [options] [device index [workgroup size [array size(1024^2)]]]" << std::endl 64 | << std::endl 65 | << "Options:" << std::endl 66 | << " -h or --help Show this message" << std::endl 67 | << " -t or --use-os-timer Use standard OS timer instead of SYCL profiling timer" << std::endl; 68 | 69 | auto devices = sycl::device::get_devices(); 70 | std::cout << "Available SYCL devices:" << std::endl; 71 | int cur_dev_idx = 1; 72 | for(auto device:devices){ 73 | std::cout << " " << cur_dev_idx++ << ". " << device.get_info() << '/' 74 | << device.get_platform().get_info() << std::endl; 75 | } 76 | exit(1); 77 | } 78 | 79 | std::cout << "Use \"-h\" argument to see available options" << std::endl; 80 | 81 | const size_t VEC_WIDTH = 1024*1024*args.vecwidth; 82 | unsigned int datasize = VEC_WIDTH*sizeof(double); 83 | 84 | std::unique_ptr c(new double[VEC_WIDTH]); 85 | 86 | try { 87 | sycl::device device = sycl::device::get_devices().at(args.device_index-1); 88 | 89 | StoreDeviceInfo(device); 90 | 91 | const size_t totalDeviceMem = device.get_info(); 92 | std::cout << "Total GPU memory: " << totalDeviceMem << std::endl; 93 | std::cout << "Buffer size: " << datasize/(1024*1024) << "MB" << std::endl; 94 | 95 | mixbenchGPU(device, c.get(), VEC_WIDTH, args.use_os_timer, args.wg_size); 96 | } 97 | catch (sycl::exception const &exc) { 98 | std::cerr << "SYCL exception caught: " << exc.what(); 99 | std::exit(1); 100 | } 101 | 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /mixbench-opencl/main-ocl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * main-ocl.cpp: This file is part of the mixbench GPU micro-benchmark suite. 
3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include "loclutil.h" 11 | #include "mix_kernels_ocl.h" 12 | #include "version_info.h" 13 | 14 | #define DEF_VECTOR_SIZE (32*1024*1024) 15 | 16 | typedef struct{ 17 | int device_index; 18 | bool block_strided; 19 | bool host_allocated; 20 | bool use_os_timer; 21 | int wg_size; 22 | unsigned int vecwidth; 23 | unsigned int elements_per_wi; 24 | unsigned int fusion_degree; 25 | } ArgParams; 26 | 27 | // Argument parsing 28 | // returns whether program execution should continue (true) or just print help output (false) 29 | bool argument_parsing(int argc, char* argv[], ArgParams *output){ 30 | int arg_count = 0; 31 | for(int i=1; iblock_strided = true; 36 | } else if( (strcmp(argv[i], "-H")==0) || (strcmp(argv[i], "--host-alloc")==0) ) { 37 | output->host_allocated = true; 38 | } else if( (strcmp(argv[i], "-t")==0) || (strcmp(argv[i], "--use-os-timer")==0) ) { 39 | output->use_os_timer = true; 40 | } else { 41 | unsigned long value = strtoul(argv[i], NULL, 10); 42 | switch( arg_count ){ 43 | // device selection 44 | case 0: 45 | output->device_index = value; 46 | arg_count++; 47 | break; 48 | // workgroup size 49 | case 1: 50 | output->wg_size = value; 51 | arg_count++; 52 | break; 53 | // array size (x1024^2) 54 | case 2: 55 | output->vecwidth = value; 56 | arg_count++; 57 | break; 58 | // elements per workitem 59 | case 3: 60 | output->elements_per_wi = value; 61 | arg_count++; 62 | break; 63 | case 4: 64 | output->fusion_degree = value; 65 | arg_count++; 66 | break; 67 | default: 68 | return false; 69 | } 70 | } 71 | } 72 | return true; 73 | } 74 | 75 | int main(int argc, char* argv[]) { 76 | printf("mixbench-ocl (%s)\n", VERSION_INFO); 77 | 78 | ArgParams args = {1, false, false, false, 256, DEF_VECTOR_SIZE/(1024*1024), 8, 4}; 79 | 80 | if( !argument_parsing(argc, argv, &args) ){ 81 | printf("Usage: mixbench-ocl [options] [device index [workgroup size [array size(1024^2) [elements per workitem [fusion degree]]]]]\n"); 82 | printf("\nOptions:\n" 83 | " -h or --help Show this message\n" 84 | " -H or --host-alloc Use host allocated buffer (CL_MEM_ALLOC_HOST_PTR)\n" 85 | " -w or --workgroup-stride Workitem strides equal to the width of a workgroup length (default: NDRange length)\n" 86 | " -t or --use-os-timer Use standard OS timer instead of OpenCL profiling timer\n" 87 | "\n"); 88 | 89 | GetDeviceID(0, stdout); 90 | exit(1); 91 | } 92 | 93 | printf("Use \"-h\" argument to see available options\n"); 94 | 95 | const size_t VEC_WIDTH = 1024*1024*args.vecwidth; 96 | unsigned int datasize = VEC_WIDTH*sizeof(double); 97 | 98 | cl_device_id dev_id = GetDeviceID(args.device_index, NULL); 99 | 100 | if( dev_id == NULL ){ 101 | fprintf(stderr, "Error: No OpenCL device selected\n"); 102 | exit(1); 103 | } 104 | StoreDeviceInfo(dev_id, stdout); 105 | 106 | printf("Buffer size: %dMB\n", datasize/(1024*1024)); 107 | printf("Workgroup size: %d\n", args.wg_size); 108 | printf("Elements per workitem: %d\n", args.elements_per_wi); 109 | printf("Workitem fusion degree: %d\n", args.fusion_degree); 110 | // Check if selected workgroup size is supported 111 | if( GetMaxDeviceWGSize(dev_id)<(size_t)args.wg_size ){ 112 | fprintf(stderr, "Error: Unsupported workgroup size (%u).\n", args.wg_size); 113 | exit(1); 114 | } 115 | 116 | double *c; 117 | c = (double*)malloc(datasize); 118 | 119 | mixbenchGPU(dev_id, c, VEC_WIDTH, args.block_strided, args.host_allocated, args.use_os_timer, args.wg_size, args.elements_per_wi, 
args.fusion_degree); 120 | 121 | free(c); 122 | 123 | return 0; 124 | } 125 | -------------------------------------------------------------------------------- /mixbench-hip/lhiputil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * lhiputil.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _HIPUTIL_H_ 8 | #define _HIPUTIL_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #define HIP_SAFE_CALL( call) { \ 15 | hipError_t err = call; \ 16 | if( hipSuccess != err) { \ 17 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 18 | __FILE__, __LINE__, hipGetErrorString( err) ); \ 19 | exit(EXIT_FAILURE); \ 20 | } } 21 | 22 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 23 | 24 | static inline int _ConvertSMVer2Cores(int major, int minor){ 25 | #ifdef __HIP_PLATFORM_HCC__ 26 | return 64; 27 | #else 28 | switch(major){ 29 | case 1: return 8; 30 | case 2: switch(minor){ 31 | case 1: return 48; 32 | default: return 32; 33 | } 34 | case 3: return 192; 35 | default: return 128; 36 | } 37 | #endif 38 | } 39 | 40 | static inline void GetDevicePeakInfo(double *aGIPS, double *aGBPS, hipDeviceProp_t *aDeviceProp = NULL){ 41 | hipDeviceProp_t deviceProp; 42 | int current_device; 43 | if( aDeviceProp ) 44 | deviceProp = *aDeviceProp; 45 | else{ 46 | HIP_SAFE_CALL( hipGetDevice(¤t_device) ); 47 | HIP_SAFE_CALL( hipGetDeviceProperties(&deviceProp, current_device) ); 48 | } 49 | const int TotalSPs = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)*deviceProp.multiProcessorCount; 50 | *aGIPS = 1000.0 * deviceProp.clockRate * TotalSPs / (1000.0 * 1000.0 * 1000.0); // Giga instructions/sec 51 | *aGBPS = 2.0 * (double)deviceProp.memoryClockRate * 1000.0 * (double)deviceProp.memoryBusWidth / 8.0; 52 | } 53 | 54 | static inline hipDeviceProp_t GetDeviceProperties(void){ 55 | hipDeviceProp_t deviceProp; 56 | int current_device; 57 | HIP_SAFE_CALL( hipGetDevice(¤t_device) ); 58 | HIP_SAFE_CALL( hipGetDeviceProperties(&deviceProp, current_device) ); 59 | return deviceProp; 60 | } 61 | 62 | // Print basic device information 63 | static void StoreDeviceInfo(FILE *fout){ 64 | hipDeviceProp_t deviceProp; 65 | int current_device, driver_version; 66 | HIP_SAFE_CALL( hipGetDevice(¤t_device) ); 67 | HIP_SAFE_CALL( hipGetDeviceProperties(&deviceProp, current_device) ); 68 | HIP_SAFE_CALL( hipDriverGetVersion(&driver_version) ); 69 | fprintf(fout, "------------------------ Device specifications ------------------------\n"); 70 | fprintf(fout, "Device: %s\n", deviceProp.name); 71 | fprintf(fout, "CUDA driver version: %d.%d\n", driver_version/1000, driver_version%1000); 72 | fprintf(fout, "GPU clock rate: %d MHz\n", deviceProp.clockRate/1000); 73 | //fprintf(fout, "Memory clock rate: %d MHz\n", deviceProp.memoryClockRate/1000/2); 74 | //fprintf(fout, "Memory bus width: %d bits\n", deviceProp.memoryBusWidth); 75 | fprintf(fout, "WarpSize: %d\n", deviceProp.warpSize); 76 | fprintf(fout, "L2 cache size: %d KB\n", deviceProp.l2CacheSize/1024); 77 | fprintf(fout, "Total global mem: %d MB\n", (int)(deviceProp.totalGlobalMem/1024/1024)); 78 | //fprintf(fout, "ECC enabled: %s\n", deviceProp.ECCEnabled?"Yes":"No"); 79 | #ifdef __HIP_PLATFORM_NVCC__ 80 | fprintf(fout, "Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor); 81 | #endif 82 | const int TotalSPs = _ConvertSMVer2Cores(deviceProp.major, 
deviceProp.minor)*deviceProp.multiProcessorCount; 83 | fprintf(fout, "Total SPs: %d (%d MPs x %d SPs/MP)\n", TotalSPs, deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 84 | double InstrThroughput, MemBandwidth; 85 | GetDevicePeakInfo(&InstrThroughput, &MemBandwidth, &deviceProp); 86 | fprintf(fout, "Compute throughput: %.2f GFlops (theoretical single precision FMAs)\n", 2.0*InstrThroughput); 87 | fprintf(fout, "Memory bandwidth: %.2f GB/sec\n", MemBandwidth/(1000.0*1000.0*1000.0)); 88 | fprintf(fout, "-----------------------------------------------------------------------\n"); 89 | } 90 | 91 | #endif 92 | -------------------------------------------------------------------------------- /mixbench-opencl/loclutil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * loclutil.h: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _OCLUTIL_H_ 8 | #define _OCLUTIL_H_ 9 | 10 | #include 11 | #include 12 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 13 | #define CL_TARGET_OPENCL_VERSION 120 14 | #include 15 | 16 | #if defined(_MSC_VER) 17 | #include 18 | #define alloca _alloca 19 | #endif 20 | 21 | #define OCL_SAFE_CALL(call) { \ 22 | cl_int err = call; \ 23 | if( CL_SUCCESS != err) { \ 24 | fprintf(stderr, "OpenCL error in file '%s' in line %i : Code %d.\n", \ 25 | __FILE__, __LINE__, err ); \ 26 | exit(EXIT_FAILURE); \ 27 | } } 28 | 29 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 30 | 31 | inline cl_device_id GetDeviceID(int index, FILE *fout){ 32 | cl_uint cnt_platforms, cnt_device_ids; 33 | cl_device_id device_selected = NULL; 34 | char dev_name[256], plat_name[256]; 35 | 36 | OCL_SAFE_CALL( clGetPlatformIDs(0, NULL, &cnt_platforms) ); 37 | cl_platform_id *platform_ids = (cl_platform_id*)alloca(sizeof(cl_platform_id)*cnt_platforms); 38 | cl_device_id device_ids[256]; 39 | OCL_SAFE_CALL( clGetPlatformIDs(cnt_platforms, platform_ids, NULL) ); 40 | 41 | if( fout ) 42 | fprintf(fout, "Available OpenCL devices:\n"); 43 | int cur_dev_idx = 1; 44 | for(int i=0; i<(int)cnt_platforms; i++){ 45 | size_t sz_name_len; 46 | OCL_SAFE_CALL( clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, 0, NULL, &sz_name_len) ); 47 | sz_name_len = sz_name_len>sizeof(plat_name) ? sizeof(plat_name) : sz_name_len; 48 | OCL_SAFE_CALL( clGetPlatformInfo(platform_ids[i], CL_PLATFORM_NAME, sz_name_len, plat_name, NULL) ); 49 | 50 | OCL_SAFE_CALL( clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &cnt_device_ids) ); 51 | OCL_SAFE_CALL( clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, cnt_device_ids, device_ids, NULL) ); 52 | for(int d=0; d<(int)cnt_device_ids; d++){ 53 | if( fout ){ 54 | OCL_SAFE_CALL( clGetDeviceInfo(device_ids[d], CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL) ); 55 | fprintf(fout, " %d. 
%s/%s\n", cur_dev_idx, dev_name, plat_name); 56 | } 57 | if( cur_dev_idx==index ) 58 | device_selected = device_ids[d]; 59 | cur_dev_idx++; 60 | } 61 | } 62 | return device_selected; 63 | } 64 | 65 | // Print basic device information 66 | inline void StoreDeviceInfo(cl_device_id devID, FILE *fout){ 67 | char dev_platform[256], dev_name[256], dev_vendor[256], dev_clver[256], dev_drv[256]; 68 | cl_uint dev_freq, dev_cus, dev_addrbits; 69 | cl_ulong dev_gmem, dev_maxalloc; 70 | cl_platform_id dev_platform_id; 71 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_PLATFORM, sizeof(dev_platform_id), &dev_platform_id, NULL) ); 72 | OCL_SAFE_CALL( clGetPlatformInfo(dev_platform_id, CL_PLATFORM_NAME, sizeof(dev_platform), dev_platform, NULL) ); 73 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_NAME, sizeof(dev_name), dev_name, NULL) ); 74 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_VENDOR, sizeof(dev_vendor), dev_vendor, NULL) ); 75 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_VERSION, sizeof(dev_clver), dev_clver, NULL) ); 76 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_ADDRESS_BITS, sizeof(dev_addrbits), &dev_addrbits, NULL) ); 77 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(dev_freq), &dev_freq, NULL) ); 78 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(dev_gmem), &dev_gmem, NULL) ); 79 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(dev_maxalloc), &dev_maxalloc, NULL) ); 80 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DRIVER_VERSION, sizeof(dev_drv), dev_drv, NULL) ); 81 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(dev_cus), &dev_cus, NULL) ); 82 | fprintf(fout, "------------------------ Device specifications ------------------------\n"); 83 | fprintf(fout, "Platform: %s\n", dev_platform); 84 | fprintf(fout, "Device: %s/%s\n", dev_name, dev_vendor); 85 | fprintf(fout, "Driver version: %s\n", dev_drv); 86 | fprintf(fout, "Address bits: %d\n", dev_addrbits); 87 | fprintf(fout, "GPU clock rate: %d MHz\n", dev_freq); 88 | fprintf(fout, "Total global mem: %d MB\n", (int)(dev_gmem/1024/1024)); 89 | fprintf(fout, "Max allowed buffer: %d MB\n", (int)(dev_maxalloc/1024/1024)); 90 | fprintf(fout, "OpenCL version: %s\n", dev_clver); 91 | fprintf(fout, "Total CUs: %d\n", dev_cus); 92 | fprintf(fout, "-----------------------------------------------------------------------\n"); 93 | } 94 | 95 | inline size_t GetMaxDeviceWGSize(cl_device_id devID){ 96 | size_t wgsize; 97 | OCL_SAFE_CALL( clGetDeviceInfo (devID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(wgsize), &wgsize, NULL) ); 98 | return wgsize; 99 | } 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /mixbench-cuda/lcutil.h: -------------------------------------------------------------------------------- 1 | /** 2 | * lcutil.h: This file is part of the mixbench GPU micro-benchmark suite. 
3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #ifndef _CUTIL_H_ 8 | #define _CUTIL_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #define CUDA_SAFE_CALL( call) { \ 16 | cudaError err = call; \ 17 | if( cudaSuccess != err) { \ 18 | fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ 19 | __FILE__, __LINE__, cudaGetErrorString( err) ); \ 20 | exit(EXIT_FAILURE); \ 21 | } } 22 | 23 | #define FRACTION_CEILING(numerator, denominator) ((numerator+denominator-1)/(denominator)) 24 | 25 | static inline int _ConvertSMVer2Cores(int major, int minor) { 26 | switch (major) { 27 | case 1: 28 | return 8; 29 | case 2: 30 | switch (minor) { 31 | case 1: 32 | return 48; 33 | default: 34 | return 32; 35 | } 36 | case 3: 37 | return 192; 38 | case 6: 39 | switch (minor) { 40 | case 0: 41 | return 64; 42 | default: 43 | return 128; 44 | } 45 | case 7: 46 | return 64; 47 | case 8: 48 | switch (minor) { 49 | case 0: 50 | return 64; 51 | default: 52 | return 128; 53 | } 54 | default: 55 | return 128; 56 | } 57 | } 58 | 59 | static inline bool IsFP16Supported(void){ 60 | cudaDeviceProp deviceProp; 61 | int current_device; 62 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 63 | CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, current_device) ); 64 | return deviceProp.major>5 || (deviceProp.major == 5 && deviceProp.minor == 3); 65 | } 66 | 67 | static inline void GetDevicePeakInfo(double *aGIPS, double *aGBPS){ 68 | int current_device; 69 | int major, minor; 70 | int memoryBusWidth, clockRate, memoryClockRate, multiProcessorCount; 71 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 72 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device) ); 73 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device) ); 74 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device) ); 75 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&memoryBusWidth, cudaDevAttrGlobalMemoryBusWidth, current_device) ); 76 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device) ); 77 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, current_device) ); 78 | const int TotalSPs = _ConvertSMVer2Cores(major, minor)*multiProcessorCount; 79 | *aGIPS = 1000.0 * clockRate * TotalSPs / (1000.0 * 1000.0 * 1000.0); // Giga instructions/sec 80 | *aGBPS = 2.0 * (double)memoryClockRate * 1000.0 * (double)memoryBusWidth / 8.0; 81 | } 82 | 83 | static inline cudaDeviceProp GetDeviceProperties(void){ 84 | cudaDeviceProp deviceProp; 85 | int current_device; 86 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 87 | CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, current_device) ); 88 | return deviceProp; 89 | } 90 | 91 | // Print basic device information 92 | static void StoreDeviceInfo(FILE *fout){ 93 | cudaDeviceProp deviceProp; 94 | int current_device, driver_version; 95 | int clockRate, memoryClockRate; 96 | CUDA_SAFE_CALL( cudaGetDevice(¤t_device) ); 97 | CUDA_SAFE_CALL( cudaGetDeviceProperties(&deviceProp, current_device) ); 98 | CUDA_SAFE_CALL( cudaDriverGetVersion(&driver_version) ); 99 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device) ); 100 | CUDA_SAFE_CALL( cudaDeviceGetAttribute(&memoryClockRate, cudaDevAttrMemoryClockRate, current_device) ); 101 | fprintf(fout, "------------------------ Device specifications ------------------------\n"); 102 | fprintf(fout, "Device: 
%s\n", deviceProp.name); 103 | fprintf(fout, "CUDA driver version: %d.%d\n", driver_version/1000, driver_version%1000); 104 | fprintf(fout, "GPU clock rate: %d MHz\n", clockRate/1000); 105 | fprintf(fout, "Memory clock rate: %d MHz\n", memoryClockRate/1000/2); 106 | fprintf(fout, "Memory bus width: %d bits\n", deviceProp.memoryBusWidth); 107 | fprintf(fout, "WarpSize: %d\n", deviceProp.warpSize); 108 | fprintf(fout, "L2 cache size: %d KB\n", deviceProp.l2CacheSize/1024); 109 | fprintf(fout, "Total global mem: %d MB\n", (int)(deviceProp.totalGlobalMem/1024/1024)); 110 | fprintf(fout, "ECC enabled: %s\n", deviceProp.ECCEnabled?"Yes":"No"); 111 | fprintf(fout, "Compute Capability: %d.%d\n", deviceProp.major, deviceProp.minor); 112 | const int TotalSPs = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)*deviceProp.multiProcessorCount; 113 | fprintf(fout, "Total SPs: %d (%d MPs x %d SPs/MP)\n", TotalSPs, deviceProp.multiProcessorCount, _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)); 114 | double InstrThroughput, MemBandwidth; 115 | GetDevicePeakInfo(&InstrThroughput, &MemBandwidth); 116 | fprintf(fout, "Compute throughput: %.2f GFlops (theoretical single precision FMAs)\n", 2.0*InstrThroughput); 117 | fprintf(fout, "Memory bandwidth: %.2f GB/sec\n", MemBandwidth/(1000.0*1000.0*1000.0)); 118 | fprintf(fout, "-----------------------------------------------------------------------\n"); 119 | } 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Chromium 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveMacros: false 7 | AlignConsecutiveAssignments: false 8 | AlignConsecutiveBitFields: false 9 | AlignConsecutiveDeclarations: false 10 | AlignEscapedNewlines: Left 11 | AlignOperands: Align 12 | AlignTrailingComments: true 13 | AllowAllArgumentsOnNextLine: true 14 | AllowAllConstructorInitializersOnNextLine: true 15 | AllowAllParametersOfDeclarationOnNextLine: false 16 | AllowShortEnumsOnASingleLine: true 17 | AllowShortBlocksOnASingleLine: Never 18 | AllowShortCaseLabelsOnASingleLine: false 19 | AllowShortFunctionsOnASingleLine: Inline 20 | AllowShortLambdasOnASingleLine: All 21 | AllowShortIfStatementsOnASingleLine: Never 22 | AllowShortLoopsOnASingleLine: false 23 | AlwaysBreakAfterDefinitionReturnType: None 24 | AlwaysBreakAfterReturnType: None 25 | AlwaysBreakBeforeMultilineStrings: true 26 | AlwaysBreakTemplateDeclarations: Yes 27 | BinPackArguments: true 28 | BinPackParameters: false 29 | BraceWrapping: 30 | AfterCaseLabel: false 31 | AfterClass: false 32 | AfterControlStatement: Never 33 | AfterEnum: false 34 | AfterFunction: false 35 | AfterNamespace: false 36 | AfterObjCDeclaration: false 37 | AfterStruct: false 38 | AfterUnion: false 39 | AfterExternBlock: false 40 | BeforeCatch: false 41 | BeforeElse: false 42 | BeforeLambdaBody: false 43 | BeforeWhile: false 44 | IndentBraces: false 45 | SplitEmptyFunction: true 46 | SplitEmptyRecord: true 47 | SplitEmptyNamespace: true 48 | BreakBeforeBinaryOperators: None 49 | BreakBeforeBraces: Attach 50 | BreakBeforeInheritanceComma: false 51 | BreakInheritanceList: BeforeColon 52 | BreakBeforeTernaryOperators: true 53 | BreakConstructorInitializersBeforeComma: false 54 | BreakConstructorInitializers: BeforeColon 55 | BreakAfterJavaFieldAnnotations: false 56 | BreakStringLiterals: true 57 | ColumnLimit: 80 58 | 
CommentPragmas: '^ IWYU pragma:' 59 | CompactNamespaces: false 60 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 61 | ConstructorInitializerIndentWidth: 4 62 | ContinuationIndentWidth: 4 63 | Cpp11BracedListStyle: true 64 | DeriveLineEnding: true 65 | DerivePointerAlignment: false 66 | DisableFormat: false 67 | ExperimentalAutoDetectBinPacking: false 68 | FixNamespaceComments: true 69 | ForEachMacros: 70 | - foreach 71 | - Q_FOREACH 72 | - BOOST_FOREACH 73 | IncludeBlocks: Preserve 74 | IncludeCategories: 75 | - Regex: '^' 76 | Priority: 2 77 | SortPriority: 0 78 | - Regex: '^<.*\.h>' 79 | Priority: 1 80 | SortPriority: 0 81 | - Regex: '^<.*' 82 | Priority: 2 83 | SortPriority: 0 84 | - Regex: '.*' 85 | Priority: 3 86 | SortPriority: 0 87 | IncludeIsMainRegex: '([-_](test|unittest))?$' 88 | IncludeIsMainSourceRegex: '' 89 | IndentCaseLabels: true 90 | IndentCaseBlocks: false 91 | IndentGotoLabels: true 92 | IndentPPDirectives: None 93 | IndentExternBlock: AfterExternBlock 94 | IndentWidth: 2 95 | IndentWrappedFunctionNames: false 96 | InsertTrailingCommas: None 97 | JavaScriptQuotes: Leave 98 | JavaScriptWrapImports: true 99 | KeepEmptyLinesAtTheStartOfBlocks: false 100 | MacroBlockBegin: '' 101 | MacroBlockEnd: '' 102 | MaxEmptyLinesToKeep: 1 103 | NamespaceIndentation: None 104 | ObjCBinPackProtocolList: Never 105 | ObjCBlockIndentWidth: 2 106 | ObjCBreakBeforeNestedBlockParam: true 107 | ObjCSpaceAfterProperty: false 108 | ObjCSpaceBeforeProtocolList: true 109 | PenaltyBreakAssignment: 2 110 | PenaltyBreakBeforeFirstCallParameter: 1 111 | PenaltyBreakComment: 300 112 | PenaltyBreakFirstLessLess: 120 113 | PenaltyBreakString: 1000 114 | PenaltyBreakTemplateDeclaration: 10 115 | PenaltyExcessCharacter: 1000000 116 | PenaltyReturnTypeOnItsOwnLine: 200 117 | PointerAlignment: Left 118 | RawStringFormats: 119 | - Language: Cpp 120 | Delimiters: 121 | - cc 122 | - CC 123 | - cpp 124 | - Cpp 125 | - CPP 126 | - 'c++' 127 | - 'C++' 128 | CanonicalDelimiter: '' 129 | BasedOnStyle: google 130 | - Language: TextProto 131 | Delimiters: 132 | - pb 133 | - PB 134 | - proto 135 | - PROTO 136 | EnclosingFunctions: 137 | - EqualsProto 138 | - EquivToProto 139 | - PARSE_PARTIAL_TEXT_PROTO 140 | - PARSE_TEST_PROTO 141 | - PARSE_TEXT_PROTO 142 | - ParseTextOrDie 143 | - ParseTextProtoOrDie 144 | - ParseTestProto 145 | - ParsePartialTestProto 146 | CanonicalDelimiter: '' 147 | BasedOnStyle: google 148 | ReflowComments: true 149 | SortIncludes: true 150 | SortUsingDeclarations: true 151 | SpaceAfterCStyleCast: false 152 | SpaceAfterLogicalNot: false 153 | SpaceAfterTemplateKeyword: true 154 | SpaceBeforeAssignmentOperators: true 155 | SpaceBeforeCpp11BracedList: false 156 | SpaceBeforeCtorInitializerColon: true 157 | SpaceBeforeInheritanceColon: true 158 | SpaceBeforeParens: ControlStatements 159 | SpaceBeforeRangeBasedForLoopColon: true 160 | SpaceInEmptyBlock: false 161 | SpaceInEmptyParentheses: false 162 | SpacesBeforeTrailingComments: 2 163 | SpacesInAngles: false 164 | SpacesInConditionalStatement: false 165 | SpacesInContainerLiterals: true 166 | SpacesInCStyleCastParentheses: false 167 | SpacesInParentheses: false 168 | SpacesInSquareBrackets: false 169 | SpaceBeforeSquareBrackets: false 170 | Standard: Auto 171 | StatementMacros: 172 | - Q_UNUSED 173 | - QT_REQUIRE_VERSION 174 | TabWidth: 8 175 | UseCRLF: false 176 | UseTab: Never 177 | WhitespaceSensitiveMacros: 178 | - STRINGIZE 179 | - PP_STRINGIZE 180 | - BOOST_PP_STRINGIZE 181 | ... 
182 | 183 | -------------------------------------------------------------------------------- /mixbench-hip/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-hip 2 | 3 | This is the HIP (AMD ROCm) implementation of mixbench. 4 | 5 | ## Building notes 6 | 7 | For HIP version, the HIP_PATH environment variable should be set to point to HIP installation directory. For more information follow the instructions on the following blog to properly install ROCK and ROCR drivers: 8 | * ROCm: 9 | https://github.com/RadeonOpenCompute/ROCm 10 | * HIP: 11 | https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP 12 | 13 | In case you want to retain the kernels' assembly code, you may pass the following parameter: 14 | ``` 15 | cmake ../mixbench-hip/ -D HIP_HIPCC_FLAGS="-save-temps" 16 | ``` 17 | 18 | ## Execution 19 | 20 | ``` 21 | $ ./mixbench-hip 22 | mixbench-hip (v0.04-1-g7a068df) 23 | ------------------------ Device specifications ------------------------ 24 | Device: 25 | CUDA driver version: 50221.151 26 | GPU clock rate: 1700 MHz 27 | WarpSize: 64 28 | L2 cache size: 8192 KB 29 | Total global mem: 65520 MB 30 | Total SPs: 6656 (104 MPs x 64 SPs/MP) 31 | Compute throughput: 22630.40 GFlops (theoretical single precision FMAs) 32 | Memory bandwidth: 1638.40 GB/sec 33 | ----------------------------------------------------------------------- 34 | Total GPU memory 68702699520, free 68702699520 35 | Buffer size: 256MB 36 | Trade-off type: compute with global memory (block strided) 37 | Elements per thread: 8 38 | Thread fusion degree: 1 39 | ----------------------------------------------------------------------------- CSV data ------------------------------------------------------------------------------------------------------------------- 40 | Experiment ID, Single Precision ops,,,, Packed Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, 41 | Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec 42 | 0, 0.250, 0.10, 350.12,1400.46, 0.250, 0.18, 363.14,1452.57, 0.125, 0.19, 180.48,1443.83, 0.500, 0.10, 697.89,1395.78, 0.250, 0.10, 352.47,1409.86 43 | 1, 0.750, 0.10, 1052.08,1402.78, 0.750, 0.18, 1097.99,1463.99, 0.375, 0.18, 549.00,1463.99, 1.500, 0.10, 2097.15,1398.10, 0.750, 0.10, 1055.61,1407.48 44 | 2, 1.250, 0.10, 1753.49,1402.79, 1.250, 0.18, 1823.61,1458.89, 0.625, 0.19, 904.73,1447.57, 2.500, 0.10, 3501.09,1400.44, 1.250, 0.10, 1753.47,1402.78 45 | 3, 1.750, 0.10, 2442.61,1395.78, 1.750, 0.18, 2555.29,1460.17, 0.875, 0.18, 1273.21,1455.10, 3.500, 0.10, 4901.52,1400.44, 1.750, 0.10, 2458.97,1405.13 46 | 4, 2.250, 0.10, 3130.08,1391.15, 2.250, 0.18, 3279.66,1457.63, 1.125, 0.19, 1629.91,1448.81, 4.500, 0.10, 6291.52,1398.12, 2.250, 0.10, 3172.16,1409.85 47 | 5, 2.750, 0.10, 3870.58,1407.48, 2.750, 0.18, 4015.46,1460.17, 1.375, 0.18, 2009.48,1461.44, 5.500, 0.10, 7664.01,1393.46, 2.750, 0.10, 3851.20,1400.44 48 | 6, 3.250, 0.10, 4506.28,1386.55, 3.250, 0.18, 4766.28,1466.55, 1.625, 0.18, 2387.30,1469.11, 6.500, 0.10, 9072.63,1395.79, 3.250, 0.10, 4536.27,1395.78 49 | 7, 3.750, 0.10, 5251.63,1400.44, 3.750, 0.18, 5466.11,1457.63, 1.875, 0.18, 2723.59,1452.58, 7.500, 0.10,10433.59,1391.15, 3.750, 0.10, 5234.21,1395.79 50 | 8, 4.250, 0.10, 5922.19,1393.46, 4.250, 0.18, 6205.71,1460.17, 2.125, 0.18, 3100.14,1458.89, 8.500, 0.10,11785.77,1386.56, 4.250, 0.10, 
5932.04,1395.78 51 | 9, 4.750, 0.10, 6641.05,1398.12, 4.750, 0.18, 6966.06,1466.54, 2.375, 0.18, 3480.01,1465.27, 9.500, 0.10,13194.00,1388.84, 4.750, 0.10, 6542.91,1377.45 52 | 10, 5.250, 0.10, 7376.92,1405.13, 5.250, 0.18, 7712.86,1469.12, 2.625, 0.18, 3853.04,1467.82, 10.500, 0.10,14631.45,1393.47, 5.250, 0.10, 7046.43,1342.18 53 | 11, 5.750, 0.10, 7999.17,1391.16, 5.750, 0.18, 8388.61,1458.89, 2.875, 0.18, 4205.30,1462.71, 11.500, 0.10,15918.98,1384.26, 5.750, 0.11, 7330.54,1274.88 54 | 12, 6.250, 0.10, 8738.13,1398.10, 6.250, 0.19, 9047.30,1447.57, 3.125, 0.18, 4559.05,1458.90, 12.500, 0.10,17360.53,1388.84, 6.250, 0.11, 7710.12,1233.62 55 | 13, 6.750, 0.10, 9374.78,1388.86, 6.750, 0.18, 9881.87,1463.98, 3.375, 0.18, 4940.96,1463.99, 13.500, 0.10,18444.01,1366.22, 6.750, 0.12, 7853.41,1163.47 56 | 14, 7.250, 0.10,10035.88,1384.26, 7.250, 0.18,10540.34,1453.84, 3.625, 0.18, 5288.50,1458.90, 14.500, 0.10,20104.93,1386.55, 7.250, 0.12, 8034.07,1108.15 57 | 15, 7.750, 0.10,10853.37,1400.44, 7.750, 0.18,11326.14,1461.44, 3.875, 0.18, 5677.88,1465.26, 15.500, 0.10,21385.65,1379.72, 7.750, 0.13, 8167.30,1053.85 58 | 16, 8.250, 0.10,11534.46,1398.12, 8.250, 0.18,12067.31,1462.70, 4.125, 0.18, 6033.69,1462.71, 16.500, 0.10,22840.27,1384.26, 8.250, 0.13, 8328.10,1009.47 59 | 17, 8.750, 0.10,12213.03,1395.78, 8.750, 0.18,12843.52,1467.83, 4.375, 0.18, 6467.02,1478.18, 17.500, 0.10,23947.90,1368.45, 8.750, 0.14, 8475.79, 968.66 60 | 18, 9.250, 0.10,12783.30,1381.98, 9.250, 0.18,13541.89,1463.99, 4.625, 0.18, 6794.63,1469.11, 18.500, 0.10,25275.38,1366.24, 9.250, 0.15, 8555.09, 924.87 61 | 20, 10.250, 0.10,14118.90,1377.45, 10.250, 0.18,15098.02,1472.98, 5.125, 0.18, 7470.35,1457.63, 20.500, 0.10,26869.76,1310.72, 10.250, 0.16, 8764.91, 855.11 62 | 22, 11.250, 0.10,15624.48,1388.84, 11.250, 0.18,16629.49,1478.18, 5.625, 0.18, 8307.43,1476.88, 22.500, 0.11,28170.70,1252.03, 11.250, 0.17, 8844.65, 786.19 63 | 24, 12.250, 0.10,16845.98,1375.18, 12.250, 0.18,18059.93,1474.28, 6.125, 0.18, 8966.93,1463.99, 24.500, 0.11,29193.31,1191.56, 12.250, 0.18, 9014.12, 735.85 64 | 28, 14.250, 0.10,18390.41,1290.56, 14.250, 0.19,20381.64,1430.29, 7.125, 0.19,10269.56,1441.34, 28.500, 0.12,30690.03,1076.84, 14.250, 0.21, 9202.33, 645.78 65 | 32, 16.250, 0.11,19390.45,1193.26, 16.250, 0.19,23442.07,1442.59, 8.125, 0.19,11571.78,1424.22, 32.500, 0.14,31886.52, 981.12, 16.250, 0.23, 9343.08, 574.96 66 | 40, 20.250, 0.13,20945.66,1034.35, 20.250, 0.20,27824.62,1374.06, 10.125, 0.20,13844.35,1367.34, 40.500, 0.16,33307.91, 822.42, 20.250, 0.29, 9532.54, 470.74 67 | 48, 24.250, 0.15,21733.31, 896.22, 24.250, 0.21,30452.80,1255.79, 12.125, 0.21,15422.65,1271.97, 48.500, 0.19,34595.87, 713.32, 24.250, 0.34, 9714.66, 400.60 68 | 56, 28.250, 0.17,22569.35, 798.92, 28.250, 0.23,32619.30,1154.67, 14.125, 0.23,16365.97,1158.65, 56.500, 0.21,35609.21, 630.25, 28.250, 0.39, 9792.51, 346.64 69 | 64, 32.250, 0.19,23043.79, 714.54, 32.250, 0.26,33859.03,1049.89, 16.125, 0.25,17420.07,1080.31, 64.500, 0.24,36288.75, 562.62, 32.250, 0.45, 9566.24, 296.63 70 | 80, 40.250, 0.23,23912.39, 594.10, 40.250, 0.30,35938.54, 892.88, 20.125, 0.30,17969.27, 892.88, 80.500, 0.30,36383.90, 451.97, 40.250, 0.55, 9735.95, 241.89 71 | 96, 48.250, 0.26,24456.31, 506.87, 48.250, 0.35,36728.81, 761.22, 24.125, 0.35,18372.79, 761.57, 96.500, 0.35,37304.29, 386.57, 48.250, 0.66, 9828.84, 203.71 72 | 128, 64.250, 0.45,19324.86, 300.78, 64.250, 0.46,37260.23, 579.93, 32.125, 0.46,18772.92, 584.37, 128.500, 0.45,38360.80, 298.53, 64.250, 0.87, 9955.10, 
154.94 73 | 256, 128.250, 0.85,20184.64, 157.39, 128.250, 0.88,39071.82, 304.65, 64.125, 0.88,19660.84, 306.60, 256.500, 0.86,40188.33, 156.68, 128.250, 1.69,10164.80, 79.26 74 | 512, 256.250, 1.66,20679.06, 80.70, 256.250, 1.72,40037.03, 156.24, 128.125, 1.71,20140.43, 157.19, 512.500, 1.67,41195.62, 80.38, 256.250, 3.35,10279.21, 40.11 75 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /mixbench-sycl/README.md: -------------------------------------------------------------------------------- 1 | # mixbench-sycl 2 | 3 | This is the SYCL implementation of mixbench. 4 | As SYCL is supported by multiple implementations, not all of them have been tested. 5 | 6 | ## Building notes 7 | 8 | ### Intel clang/DPCPP 9 | 10 | Using the latest version of OneAPI toolkit from Intel, you may try building as follows: 11 | 12 | ``` 13 | cmake ../mixbench-sycl -D CMAKE_CXX_COMPILER=icpx -D CMAKE_CXX_FLAGS="-fsycl -fsycl-device-code-split=per_kernel" 14 | ``` 15 | 16 | Note: `per_kernel` mode facilitates cases where the device lacks support for computations on a particular data type, e.g. double precision. 17 | 18 | If you are building under Windows/DPC++ try: 19 | ``` 20 | cmake ..\mixbench-sycl -T "Intel(R) oneAPI DPC++ Compiler 2024" -D CMAKE_CXX_COMPILER=icpx -D CMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel /EHsc" 21 | ``` 22 | Note: Adjust the platform toolset argument (*"Intel(R) oneAPI DPC++ Compiler"*) to whatever required, e.g. *"Intel(R) oneAPI DPC++ Compiler 2022"* for DPC++ 2022.1. 23 | 24 | ### AMD GPU via hipSYCL/ROCm 25 | 26 | Here building for two device architectures (*gfx803* & *gfx1012*): 27 | 28 | ``` 29 | cmake ../mixbench-sycl -D CMAKE_CXX_COMPILER=syclcc -D CMAKE_CXX_FLAGS="--hipsycl-targets='omp;hip:gfx803,gfx1012' -O2" 30 | ``` 31 | Note: Older ROCm releases might require adding `--rocm-device-lib-path=/opt/rocm/amdgcn/bitcode` to CMAKE_CXX_FLAGS 32 | 33 | ### NVidia clang/DPCPP 34 | ``` 35 | cmake ../mixbench-sycl -D CMAKE_CXX_COMPILER=clang++ -D CMAKE_CXX_FLAGS="-fsycl -std=c++17 -fsycl-targets=nvptx64-nvidia-cuda" 36 | ``` 37 | 38 | ## Execution 39 | 40 | In order to select the desired device to run the benchmark on, do pass the `-h` parameter 41 | so the available devices are listed: 42 | 43 | ``` 44 | $ ./mixbench-sycl -h 45 | mixbench-sycl (v0.04-3-g664f025) 46 | Usage: mixbench-sycl [options] [device index [workgroup size [array size(1024^2)]]] 47 | 48 | Options: 49 | -h or --help Show this message 50 | -t or --use-os-timer Use standard OS timer instead of SYCL profiling timer 51 | Available SYCL devices: 52 | 1. Intel(R) FPGA Emulation Device/Intel(R) FPGA Emulation Platform for OpenCL(TM) 53 | 2. Intel(R) Core(TM) i3-8109U CPU @ 3.00GHz/Intel(R) OpenCL 54 | 3. Intel(R) Iris(R) Plus Graphics 655 [0x3ea5]/Intel(R) OpenCL HD Graphics 55 | 4. Intel(R) Core(TM) i3-8109U CPU @ 3.00GHz/Intel(R) OpenCL 56 | 5. Intel(R) Iris(R) Plus Graphics 655 [0x3ea5]/Intel(R) Level-Zero 57 | ``` 58 | 59 | ... 
and then pass the device number as the argument: 60 | 61 | ``` 62 | $ ./mixbench-sycl 5 63 | mixbench-sycl (v0.04-3-g664f025) 64 | Use "-h" argument to see available options 65 | ------------------------ Device specifications ------------------------ 66 | Platform: Intel(R) Level-Zero 67 | Device: Intel(R) Iris(R) Plus Graphics 655 [0x3ea5]/Intel(R) Corporation 68 | Driver version: 1.2.21270 69 | Address bits: 64 70 | GPU clock rate: 0 MHz 71 | Total global mem: 12690 MB 72 | Max allowed buffer: 4095 MB 73 | SYCL version: 1.1 74 | Total CUs: 48 75 | ----------------------------------------------------------------------- 76 | Total GPU memory: 13307101184 77 | Buffer size: 256MB 78 | Elements per thread: 8 79 | Thread fusion degree: 4 80 | Timer: SYCL event based 81 | ----------------------------------------------------------------------------- CSV data ----------------------------------------------------------------------------- 82 | Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, 83 | Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec 84 | 0, 0.250, 5.29, 6.34, 25.36, 0.125, 9.63, 3.49, 27.88, 0.500, 5.04, 13.32, 26.64, 0.250, 5.23, 6.42, 25.68 85 | 1, 0.750, 5.10, 19.74, 26.32, 0.375, 9.49, 10.60, 28.27, 1.500, 5.00, 40.25, 26.83, 0.750, 5.18, 19.43, 25.90 86 | 2, 1.250, 5.29, 31.73, 25.38, 0.625, 9.78, 17.16, 27.45, 2.500, 4.62, 72.59, 29.03, 1.250, 5.21, 32.23, 25.78 87 | 3, 1.750, 5.27, 44.55, 25.46, 0.875, 9.72, 24.15, 27.61, 3.500, 5.12, 91.78, 26.22, 1.750, 5.01, 46.87, 26.78 88 | 4, 2.250, 5.26, 57.45, 25.53, 1.125, 9.09, 33.21, 29.52, 4.500, 4.62, 130.62, 29.03, 2.250, 4.61, 65.57, 29.14 89 | 5, 2.750, 5.25, 70.35, 25.58, 1.375, 9.11, 40.51, 29.46, 5.500, 5.26, 140.31, 25.51, 2.750, 4.90, 75.28, 27.37 90 | 6, 3.250, 5.06, 86.15, 26.51, 1.625, 9.09, 48.01, 29.54, 6.500, 4.62, 188.85, 29.05, 3.250, 4.86, 89.79, 27.63 91 | 7, 3.750, 4.53, 111.09, 29.62, 1.875, 9.63, 52.27, 27.88, 7.500, 4.65, 216.63, 28.88, 3.750, 4.70, 107.11, 28.56 92 | 8, 4.250, 4.74, 120.33, 28.31, 2.125, 9.37, 60.91, 28.66, 8.500, 4.60, 247.78, 29.15, 4.250, 4.55, 125.23, 29.47 93 | 9, 4.750, 4.38, 145.42, 30.61, 2.375, 9.23, 69.07, 29.08, 9.500, 4.76, 268.08, 28.22, 4.750, 4.43, 143.86, 30.29 94 | 10, 5.250, 4.73, 149.09, 28.40, 2.625, 9.31, 75.67, 28.83, 10.500, 4.55, 309.77, 29.50, 5.250, 4.42, 159.29, 30.34 95 | 11, 5.750, 4.59, 168.17, 29.25, 2.875, 9.11, 84.69, 29.46, 11.500, 4.75, 324.78, 28.24, 5.750, 4.46, 173.02, 30.09 96 | 12, 6.250, 4.39, 190.94, 30.55, 3.125, 8.90, 94.21, 30.15, 12.500, 4.40, 381.52, 30.52, 6.250, 4.50, 186.48, 29.84 97 | 13, 6.750, 4.38, 206.79, 30.64, 3.375, 9.00, 100.67, 29.83, 13.500, 4.45, 406.98, 30.15, 6.750, 4.54, 199.70, 29.59 98 | 14, 7.250, 4.41, 220.72, 30.44, 3.625, 9.00, 108.09, 29.82, 14.500, 4.41, 441.06, 30.42, 7.250, 4.56, 213.21, 29.41 99 | 15, 7.750, 4.38, 237.52, 30.65, 3.875, 9.02, 115.35, 29.77, 15.500, 4.74, 439.27, 28.34, 7.750, 4.70, 221.36, 28.56 100 | 16, 8.250, 4.39, 252.08, 30.55, 4.125, 9.03, 122.61, 29.72, 16.500, 4.43, 499.64, 30.28, 8.250, 4.89, 226.35, 27.44 101 | 17, 8.750, 4.38, 268.11, 30.64, 4.375, 9.09, 129.14, 29.52, 17.500, 4.82, 487.16, 27.84, 8.750, 5.34, 219.77, 25.12 102 | 18, 9.250, 4.39, 282.65, 30.56, 4.625, 9.09, 136.54, 29.52, 18.500, 4.44, 559.87, 30.26, 9.250, 5.41, 229.49, 24.81 103 | 20, 10.250, 4.38, 314.04, 30.64, 5.125, 9.38, 146.63, 28.61, 20.500, 4.45, 
617.83, 30.14, 10.250, 5.93, 231.87, 22.62 104 | 22, 11.250, 4.36, 345.92, 30.75, 5.625, 9.16, 164.93, 29.32, 22.500, 4.45, 679.14, 30.18, 11.250, 6.46, 233.80, 20.78 105 | 24, 12.250, 4.35, 377.65, 30.83, 6.125, 9.85, 166.94, 27.26, 24.500, 4.46, 737.88, 30.12, 12.250, 6.98, 235.42, 19.22 106 | 28, 14.250, 4.37, 437.21, 30.68, 7.125, 11.26, 169.93, 23.85, 28.500, 4.48, 854.50, 29.98, 14.250, 8.05, 237.58, 16.67 107 | 32, 16.250, 4.36, 499.84, 30.76, 8.125, 12.77, 170.75, 21.02, 32.500, 4.50, 968.97, 29.81, 16.250, 9.12, 239.18, 14.72 108 | 40, 20.250, 4.41, 616.54, 30.45, 10.125, 15.88, 171.17, 16.91, 40.500, 5.02, 1083.80, 26.76, 20.250, 11.23, 242.07, 11.95 109 | 48, 24.250, 4.81, 676.04, 27.88, 12.125, 18.48, 176.11, 14.52, 48.500, 5.91, 1101.80, 22.72, 24.250, 13.37, 243.40, 10.04 110 | 56, 28.250, 5.48, 691.78, 24.49, 14.125, 21.38, 177.33, 12.55, 56.500, 6.34, 1195.28, 21.16, 28.250, 15.52, 244.32, 8.65 111 | 64, 32.250, 6.17, 701.22, 21.74, 16.125, 24.28, 178.27, 11.06, 64.500, 6.99, 1238.30, 19.20, 32.250, 17.66, 245.05, 7.60 112 | 80, 40.250, 7.59, 712.18, 17.69, 20.125, 30.55, 176.85, 8.79, 80.500, 8.37, 1290.93, 16.04, 40.250, 22.45, 240.58, 5.98 113 | 96, 48.250, 8.99, 720.68, 14.94, 24.125, 35.85, 180.66, 7.49, 96.500, 9.78, 1323.72, 13.72, 48.250, 26.77, 241.96, 5.01 114 | 128, 64.250, 11.84, 728.22, 11.33, 32.125, 47.47, 181.66, 5.65, 128.500, 12.62, 1366.48, 10.63, 64.250, 35.32, 244.14, 3.80 115 | 192, 96.250, 19.89, 649.40, 6.75, 48.125, 71.77, 180.00, 3.74, 192.500, 18.37, 1406.37, 7.31, 96.250, 52.49, 246.11, 2.56 116 | 256, 128.250, 25.72, 669.32, 5.22, 64.125, 94.85, 181.48, 2.83, 256.500, 32.84, 1048.44, 4.09, 128.250, 69.62, 247.23, 1.93 117 | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 118 | ``` 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mixbench 2 | The purpose of this benchmark tool is to evaluate performance bounds of GPUs (or CPUs) on mixed operational intensity kernels. The executed kernel is customized on a range of different operational intensity values. Modern GPUs are able to hide memory latency by switching execution to threads able to perform compute operations. Using this tool one can assess the practical optimum balance in both types of operations for a compute device. CUDA, HIP, OpenCL and SYCL implementations have been developed, targeting GPUs, or OpenMP when using a CPU as a target. 3 | 4 | ## Implementations 5 | 6 | * CUDA: `mixbench-cuda` 7 | * OpenCL: `mixbench-opencl` 8 | * HIP: `mixbench-hip` 9 | * SYCL: `mixbench-sycl` 10 | * CPU/OpenMP: `mixbench-cpu` 11 | 12 | Since each implementation resides in a separate folder, please check the documentation available within each sub-project's folder. 13 | 14 | ## Kernel types 15 | 16 | Four types of experiments are executed combined with global memory accesses: 17 | 18 | 1. Single precision Flops (multiply-additions) 19 | 2. Double precision Flops (multiply-additions) 20 | 3. Half precision Flops (multiply-additions, for GPUs only) 21 | 4. Integer multiply-addition operations 22 | 23 | ## How to build 24 | 25 | Building is based on CMake files. 26 | Thus, to build a particular implementation use the proper `CMakeLists.txt` residing in each subdirectory, 27 | e.g. 
for the OpenCL implementation you may use the commands as follows: 28 | 29 | ``` 30 | mkdir build 31 | cd build 32 | cmake ../mixbench-opencl 33 | cmake --build ./ 34 | ``` 35 | 36 | For more information, check available READMEs within each subfolder. 37 | 38 | ## Execution results 39 | 40 | A typical execution output on an NVidia RTX-2070 GPU is: 41 | ``` 42 | mixbench/read-only (v0.03-2-gbccfd71) 43 | ------------------------ Device specifications ------------------------ 44 | Device: GeForce RTX 2070 45 | CUDA driver version: 10.20 46 | GPU clock rate: 1620 MHz 47 | Memory clock rate: 3500 MHz 48 | Memory bus width: 256 bits 49 | WarpSize: 32 50 | L2 cache size: 4096 KB 51 | Total global mem: 7979 MB 52 | ECC enabled: No 53 | Compute Capability: 7.5 54 | Total SPs: 2304 (36 MPs x 64 SPs/MP) 55 | Compute throughput: 7464.96 GFlops (theoretical single precision FMAs) 56 | Memory bandwidth: 448.06 GB/sec 57 | ----------------------------------------------------------------------- 58 | Total GPU memory 8366784512, free 7941521408 59 | Buffer size: 256MB 60 | Trade-off type: compute with global memory (block strided) 61 | Elements per thread: 8 62 | Thread fusion degree: 4 63 | ----------------------------------------------------------------------------- CSV data ----------------------------------------------------------------------------- 64 | Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, 65 | Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec 66 | 0, 0.250, 0.32, 104.42, 417.68, 0.125, 0.63, 53.04, 424.35, 0.500, 0.32, 211.41, 422.81, 0.250, 0.32, 105.58, 422.30 67 | 1, 0.750, 0.32, 316.34, 421.79, 0.375, 0.63, 158.69, 423.18, 1.500, 0.32, 634.22, 422.81, 0.750, 0.32, 317.30, 423.07 68 | 2, 1.250, 0.32, 528.46, 422.77, 0.625, 0.78, 215.91, 345.45, 2.500, 0.32, 1055.97, 422.39, 1.250, 0.32, 528.57, 422.86 69 | 3, 1.750, 0.32, 738.81, 422.17, 0.875, 1.08, 218.17, 249.34, 3.500, 0.32, 1478.95, 422.56, 1.750, 0.32, 740.59, 423.20 70 | 4, 2.250, 0.32, 951.33, 422.81, 1.125, 1.38, 219.57, 195.17, 4.500, 0.32, 1902.66, 422.81, 2.250, 0.32, 950.66, 422.51 71 | 5, 2.750, 0.32, 1162.74, 422.81, 1.375, 1.67, 220.38, 160.28, 5.500, 0.32, 2328.52, 423.37, 2.750, 0.32, 1162.74, 422.81 72 | 6, 3.250, 0.32, 1374.56, 422.94, 1.625, 1.97, 220.99, 135.99, 6.500, 0.32, 2756.62, 424.10, 3.250, 0.32, 1375.81, 423.32 73 | 7, 3.750, 0.32, 1592.45, 424.65, 1.875, 2.27, 221.38, 118.07, 7.500, 0.32, 3169.50, 422.60, 3.750, 0.32, 1585.55, 422.81 74 | 8, 4.250, 0.32, 1796.95, 422.81, 2.125, 2.57, 221.71, 104.33, 8.500, 0.32, 3587.76, 422.09, 4.250, 0.37, 1545.63, 363.68 75 | 9, 4.750, 0.32, 2006.34, 422.39, 2.375, 2.87, 221.85, 93.41, 9.500, 0.32, 3995.38, 420.57, 4.750, 0.32, 1998.29, 420.69 76 | 10, 5.250, 0.32, 2209.52, 420.86, 2.625, 3.17, 222.02, 84.58, 10.500, 0.32, 4439.54, 422.81, 5.250, 0.32, 2220.44, 422.94 77 | 11, 5.750, 0.32, 2434.12, 423.32, 2.875, 3.47, 222.17, 77.28, 11.500, 0.32, 4855.01, 422.17, 5.750, 0.32, 2426.77, 422.05 78 | 12, 6.250, 0.32, 2638.06, 422.09, 3.125, 3.78, 222.18, 71.10, 12.500, 0.32, 5227.20, 418.18, 6.250, 0.38, 2202.15, 352.34 79 | 13, 6.750, 0.32, 2841.95, 421.03, 3.375, 4.08, 222.30, 65.87, 13.500, 0.32, 5712.58, 423.15, 6.750, 0.32, 2850.54, 422.30 80 | 14, 7.250, 0.32, 3065.39, 422.81, 3.625, 4.37, 222.45, 61.36, 14.500, 0.32, 6135.74, 423.15, 7.250, 0.32, 3065.08, 422.77 81 | 15, 7.750, 0.33, 3143.40, 
405.60, 3.875, 4.67, 222.57, 57.44, 15.500, 0.32, 6546.34, 422.34, 7.750, 0.32, 3268.89, 421.79 82 | 16, 8.250, 0.32, 3482.59, 422.13, 4.125, 4.98, 222.57, 53.96, 16.500, 0.32, 6957.48, 421.67, 8.250, 0.39, 2803.68, 339.84 83 | 17, 8.750, 0.32, 3693.66, 422.13, 4.375, 5.28, 222.53, 50.86, 17.500, 0.32, 7396.24, 422.64, 8.750, 0.32, 3694.77, 422.26 84 | 18, 9.250, 0.32, 3901.58, 421.79, 4.625, 5.58, 222.58, 48.12, 18.500, 0.32, 7786.72, 420.90, 9.250, 0.32, 3897.66, 421.37 85 | 20, 10.250, 0.32, 4312.53, 420.73, 5.125, 6.18, 222.66, 43.45, 20.500, 0.32, 8640.66, 421.50, 10.250, 0.41, 3374.54, 329.22 86 | 22, 11.250, 0.32, 4729.94, 420.44, 5.625, 6.78, 222.74, 39.60, 22.500, 0.32, 9452.31, 420.10, 11.250, 0.32, 4734.21, 420.82 87 | 24, 12.250, 0.32, 5148.83, 420.31, 6.125, 7.36, 223.51, 36.49, 24.500, 0.32,10346.40, 422.30, 12.250, 0.42, 3900.12, 318.38 88 | 28, 14.250, 0.32, 6009.94, 421.75, 7.125, 8.53, 224.23, 31.47, 28.500, 0.32,11975.32, 420.19, 14.250, 0.44, 4368.11, 306.53 89 | 32, 16.250, 0.32, 6795.36, 418.18, 8.125, 9.72, 224.31, 27.61, 32.500, 0.32,13605.64, 418.64, 16.250, 0.45, 4797.12, 295.21 90 | 40, 20.250, 0.34, 7899.43, 390.10, 10.125, 12.11, 224.50, 22.17, 40.500, 0.33,16371.37, 404.23, 20.250, 0.50, 5464.85, 269.87 91 | 48, 24.250, 0.41, 8029.04, 331.09, 12.125, 14.49, 224.58, 18.52, 48.500, 0.40,16468.89, 339.56, 24.250, 0.54, 5986.22, 246.85 92 | 56, 28.250, 0.47, 8114.58, 287.24, 14.125, 16.88, 224.65, 15.90, 56.500, 0.46,16443.12, 291.03, 28.250, 0.60, 6342.42, 224.51 93 | 64, 32.250, 0.53, 8154.47, 252.85, 16.125, 19.26, 224.72, 13.94, 64.500, 0.52,16536.22, 256.38, 32.250, 0.66, 6591.93, 204.40 94 | 80, 40.250, 0.66, 8242.80, 204.79, 20.125, 24.03, 224.79, 11.17, 80.500, 0.65,16644.88, 206.77, 40.250, 0.78, 6909.54, 171.67 95 | 96, 48.250, 0.78, 8321.35, 172.46, 24.125, 28.80, 224.85, 9.32, 96.500, 0.78,16685.23, 172.90, 48.250, 0.91, 7108.62, 147.33 96 | 128, 64.250, 1.03, 8337.22, 129.76, 32.125, 38.34, 224.91, 7.00, 128.500, 1.03,16775.65, 130.55, 64.250, 1.18, 7295.18, 113.54 97 | 192, 96.250, 1.54, 8414.49, 87.42, 48.125, 57.42, 224.97, 4.67, 192.500, 1.53,16847.93, 87.52, 96.250, 1.74, 7431.64, 77.21 98 | 256, 128.250, 2.06, 8362.01, 65.20, 64.125, 76.50, 225.02, 3.51, 256.500, 2.06,16693.65, 65.08, 128.250, 2.30, 7477.75, 58.31 99 | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- 100 | ``` 101 | 102 | And here is a chart illustrating the results extracted above: 103 | 104 | ![RTX-2070 execution results](https://raw.githubusercontent.com/ekondis/mixbench/gh-pages/img/rtx2070-sp-roofline.png "mixbench execution results on NVidia RTX-2070 (CUDA/ro implementation)") 105 | 106 | ## Publications 107 | 108 | If you use this benchmark tool for a research work please provide citation to any of the following papers: 109 | 110 | Elias Konstantinidis, Yiannis Cotronis, 111 | "A quantitative roofline model for GPU kernel performance estimation using micro-benchmarks and hardware metric profiling", 112 | Journal of Parallel and Distributed Computing, Volume 107, September 2017, Pages 37-56, ISSN 0743-7315, 113 | https://doi.org/10.1016/j.jpdc.2017.04.002. 
114 | URL: http://www.sciencedirect.com/science/article/pii/S0743731517301247 115 | 116 | Konstantinidis, E., Cotronis, Y., 117 | "A Practical Performance Model for Compute and Memory Bound GPU Kernels", 118 | Parallel, Distributed and Network-Based Processing (PDP), 2015 23rd Euromicro International Conference on , vol., no., pp.651-658, 4-6 March 2015 119 | doi: 10.1109/PDP.2015.51 120 | URL: http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7092788&isnumber=7092002 121 | -------------------------------------------------------------------------------- /mixbench-cuda/mix_kernels_cuda.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cuda_ro.cu: This file is part of the mixbench GPU micro-benchmark suite. 3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lcutil.h" 13 | 14 | #define ELEMENTS_PER_THREAD (8) 15 | #define FUSION_DEGREE (4) 16 | 17 | template 18 | inline __device__ T conv_int(const int i){ return static_cast(i); } 19 | 20 | template 21 | inline __device__ T mad(const T a, const T b, const T c){ return a*b+c; } 22 | 23 | template 24 | inline __device__ bool equal(const T a, const T b){ return a==b; } 25 | 26 | #if __CUDA_ARCH__ >= 530 27 | template<> 28 | inline __device__ half2 conv_int(const int i){ return __half2half2( __int2half_rd(i) ); } 29 | template<> 30 | inline __device__ half2 mad(const half2 a, const half2 b, const half2 c){ return __hfma2(a, b, c)/*__hadd2(__hmul2(a, b), c)*/; } 31 | template<> 32 | inline __device__ bool equal(const half2 a, const half2 b){ return __hbeq2(a, b); } 33 | #else 34 | // a dummy implementations as a workaround 35 | template<> 36 | inline __device__ half2 conv_int(const int i){ return half2(); } 37 | template<> 38 | inline __device__ half2 mad(const half2 a, const half2 b, const half2 c){ return half2(); } 39 | template<> 40 | inline __device__ bool equal(const half2 a, const half2 b){ return false; } 41 | #endif 42 | 43 | template 44 | __global__ void benchmark_func(T seed, T *g_data){ 45 | const unsigned int blockSize = blockdim; 46 | const int stride = blockSize; 47 | int idx = blockIdx.x*blockSize*granularity + threadIdx.x; 48 | const int big_stride = gridDim.x*blockSize*granularity; 49 | 50 | T tmps[granularity]; 51 | for(int k=0; k(0); 64 | #pragma unroll 65 | for(int j=0; j(-1)) ) // Designed so it never executes 69 | g_data[idx+k*big_stride] = sum; 70 | } 71 | } 72 | 73 | void initializeEvents(cudaEvent_t *start, cudaEvent_t *stop){ 74 | CUDA_SAFE_CALL( cudaEventCreate(start) ); 75 | CUDA_SAFE_CALL( cudaEventCreate(stop) ); 76 | CUDA_SAFE_CALL( cudaEventRecord(*start, 0) ); 77 | } 78 | 79 | float finalizeEvents(cudaEvent_t start, cudaEvent_t stop){ 80 | CUDA_SAFE_CALL( cudaGetLastError() ); 81 | CUDA_SAFE_CALL( cudaEventRecord(stop, 0) ); 82 | CUDA_SAFE_CALL( cudaEventSynchronize(stop) ); 83 | float kernel_time; 84 | CUDA_SAFE_CALL( cudaEventElapsedTime(&kernel_time, start, stop) ); 85 | CUDA_SAFE_CALL( cudaEventDestroy(start) ); 86 | CUDA_SAFE_CALL( cudaEventDestroy(stop) ); 87 | return kernel_time; 88 | } 89 | 90 | void runbench_warmup(double *cd, long size){ 91 | const long reduced_grid_size = size/(ELEMENTS_PER_THREAD)/128; 92 | const int BLOCK_SIZE = 256; 93 | const int TOTAL_REDUCED_BLOCKS = reduced_grid_size/BLOCK_SIZE; 94 | 95 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 96 | dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1); 97 | 98 | benchmark_func< short, BLOCK_SIZE, 
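                    // Warm-up instantiation (its template argument list continues below):
                    // a short element type, zero compute iterations and a reduced grid,
                    // i.e. an essentially memory-only pass whose purpose appears to be
                    // warming the device before the timed runs in runbench().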
ELEMENTS_PER_THREAD, FUSION_DEGREE, 0, true ><<< dimReducedGrid, dimBlock >>>((short)1, (short*)cd); 99 | CUDA_SAFE_CALL( cudaGetLastError() ); 100 | CUDA_SAFE_CALL( cudaDeviceSynchronize() ); 101 | } 102 | 103 | int out_config = 1; 104 | 105 | template 106 | void runbench(double *cd, long size, bool doHalfs){ 107 | const long compute_grid_size = size/ELEMENTS_PER_THREAD/FUSION_DEGREE; 108 | const int BLOCK_SIZE = 256; 109 | const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE; 110 | const long long computations = (ELEMENTS_PER_THREAD*(long long)compute_grid_size+(2*ELEMENTS_PER_THREAD*compute_iterations)*(long long)compute_grid_size)*FUSION_DEGREE; 111 | const long long memoryoperations = size; 112 | 113 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 114 | dim3 dimGrid(TOTAL_BLOCKS, 1, 1); 115 | cudaEvent_t start, stop; 116 | 117 | initializeEvents(&start, &stop); 118 | benchmark_func< float, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, false ><<< dimGrid, dimBlock >>>(1.0f, (float*)cd); 119 | float kernel_time_mad_sp = finalizeEvents(start, stop); 120 | 121 | initializeEvents(&start, &stop); 122 | benchmark_func< double, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, false ><<< dimGrid, dimBlock >>>(1.0, cd); 123 | float kernel_time_mad_dp = finalizeEvents(start, stop); 124 | 125 | float kernel_time_mad_hp = 0.f; 126 | if( doHalfs ){ 127 | initializeEvents(&start, &stop); 128 | half2 h_ones; 129 | *((int32_t*)&h_ones) = 15360 + (15360 << 16); // 1.0 as half 130 | benchmark_func< half2, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, false ><<< dimGrid, dimBlock >>>(h_ones, (half2*)cd); 131 | kernel_time_mad_hp = finalizeEvents(start, stop); 132 | } 133 | 134 | initializeEvents(&start, &stop); 135 | benchmark_func< int, BLOCK_SIZE, ELEMENTS_PER_THREAD, FUSION_DEGREE, compute_iterations, true ><<< dimGrid, dimBlock >>>(1, (int*)cd); 136 | float kernel_time_mad_int = finalizeEvents(start, stop); 137 | 138 | printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", 139 | compute_iterations, 140 | ((double)computations)/((double)memoryoperations*sizeof(float)), 141 | kernel_time_mad_sp, 142 | ((double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000), 143 | ((double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.), 144 | ((double)computations)/((double)memoryoperations*sizeof(double)), 145 | kernel_time_mad_dp, 146 | ((double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), 147 | ((double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), 148 | ((double)2*computations)/((double)memoryoperations*sizeof(half2)), 149 | kernel_time_mad_hp, 150 | ((double)2*computations)/kernel_time_mad_hp*1000./(double)(1000*1000*1000), 151 | ((double)memoryoperations*sizeof(half2))/kernel_time_mad_hp*1000./(1000.*1000.*1000.), 152 | ((double)computations)/((double)memoryoperations*sizeof(int)), 153 | kernel_time_mad_int, 154 | ((double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), 155 | ((double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) 
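        // Units: kernel times from cudaEventElapsedTime() are milliseconds, so the
        // GFLOPS columns are ops / ms * 1000 / 1e9 and the GB/sec columns are
        // bytes / ms * 1000 / 1e9. Half precision uses 2*computations and
        // sizeof(half2) because every half2 operation acts on two packed values.
        // Sanity check against the CSV samples: single-precision intensity works out
        // to (1 + 2*compute_iterations) / 4 flops/byte, e.g. 0.250 for iteration 0.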
); 156 | } 157 | 158 | extern "C" void mixbenchGPU(double *c, long size){ 159 | const char *benchtype = "compute with global memory (block strided)"; 160 | 161 | printf("Trade-off type: %s\n", benchtype); 162 | printf("Elements per thread: %d\n", ELEMENTS_PER_THREAD); 163 | printf("Thread fusion degree: %d\n", FUSION_DEGREE); 164 | double *cd; 165 | bool doHalfs = IsFP16Supported(); 166 | if( !doHalfs ) 167 | printf("Warning: Half precision computations are not supported\n"); 168 | 169 | CUDA_SAFE_CALL( cudaMalloc((void**)&cd, size*sizeof(double)) ); 170 | 171 | // Copy data to device memory 172 | CUDA_SAFE_CALL( cudaMemset(cd, 0, size*sizeof(double)) ); // initialize to zeros 173 | 174 | // Synchronize in order to wait for memory operations to finish 175 | CUDA_SAFE_CALL( cudaDeviceSynchronize() ); 176 | 177 | printf("----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------\n"); 178 | printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, \n"); 179 | printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); 180 | 181 | runbench_warmup(cd, size); 182 | 183 | runbench<0>(cd, size, doHalfs); 184 | runbench<1>(cd, size, doHalfs); 185 | runbench<2>(cd, size, doHalfs); 186 | runbench<3>(cd, size, doHalfs); 187 | runbench<4>(cd, size, doHalfs); 188 | runbench<5>(cd, size, doHalfs); 189 | runbench<6>(cd, size, doHalfs); 190 | runbench<7>(cd, size, doHalfs); 191 | runbench<8>(cd, size, doHalfs); 192 | runbench<9>(cd, size, doHalfs); 193 | runbench<10>(cd, size, doHalfs); 194 | runbench<11>(cd, size, doHalfs); 195 | runbench<12>(cd, size, doHalfs); 196 | runbench<13>(cd, size, doHalfs); 197 | runbench<14>(cd, size, doHalfs); 198 | runbench<15>(cd, size, doHalfs); 199 | runbench<16>(cd, size, doHalfs); 200 | runbench<17>(cd, size, doHalfs); 201 | runbench<18>(cd, size, doHalfs); 202 | runbench<20>(cd, size, doHalfs); 203 | runbench<22>(cd, size, doHalfs); 204 | runbench<24>(cd, size, doHalfs); 205 | runbench<28>(cd, size, doHalfs); 206 | runbench<32>(cd, size, doHalfs); 207 | runbench<40>(cd, size, doHalfs); 208 | runbench<48>(cd, size, doHalfs); 209 | runbench<56>(cd, size, doHalfs); 210 | runbench<64>(cd, size, doHalfs); 211 | runbench<80>(cd, size, doHalfs); 212 | runbench<96>(cd, size, doHalfs); 213 | runbench<128>(cd, size, doHalfs); 214 | runbench<192>(cd, size, doHalfs); 215 | runbench<256>(cd, size, doHalfs); 216 | runbench<512>(cd, size, doHalfs); 217 | runbench<1024>(cd, size, doHalfs); 218 | 219 | printf("--------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"); 220 | 221 | // Copy results back to host memory 222 | CUDA_SAFE_CALL( cudaMemcpy(c, cd, size*sizeof(double), cudaMemcpyDeviceToHost) ); 223 | 224 | CUDA_SAFE_CALL( cudaFree(cd) ); 225 | 226 | CUDA_SAFE_CALL( cudaDeviceReset() ); 227 | } 228 | -------------------------------------------------------------------------------- /mixbench-cpu/mix_kernels_cpu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_cpu.cpp: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 
4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | const auto base_omp_get_max_threads = omp_get_max_threads(); 18 | 19 | using benchmark_clock = std::chrono::steady_clock; 20 | 21 | #ifdef BASELINE_IMPL 22 | 23 | template 24 | Element __attribute__((noinline)) bench_block(Element* data) { 25 | Element sum = 0; 26 | Element f = data[0]; 27 | 28 | #pragma omp simd aligned(data : 64) reduction(+ : sum) 29 | for (size_t i = 0; i < static_chunk_size; i++) { 30 | Element t = data[i]; 31 | for (size_t j = 0; j < compute_iterations; j++) { 32 | t = t * t + f; 33 | } 34 | sum += t; 35 | } 36 | return sum; 37 | } 38 | 39 | #else 40 | 41 | template 42 | Element __attribute__((noinline)) bench_block(Element* data) { 43 | Element sum = 0; 44 | 45 | Element f[] = {data[0], data[1], data[2], data[3], 46 | data[4], data[5], data[6], data[7]}; 47 | 48 | #pragma omp simd aligned(data : 64) reduction(+ : sum) 49 | for (size_t i = 0; i < static_chunk_size; i++) { 50 | Element t[] = {data[i], data[i], data[i], data[i], 51 | data[i], data[i], data[i], data[i]}; 52 | for (size_t j = 0; j < compute_iterations / 8; j++) { 53 | t[0] = t[0] * t[0] + f[0]; 54 | t[1] = t[1] * t[1] + f[1]; 55 | t[2] = t[2] * t[2] + f[2]; 56 | t[3] = t[3] * t[3] + f[3]; 57 | t[4] = t[4] * t[4] + f[4]; 58 | t[5] = t[5] * t[5] + f[5]; 59 | t[6] = t[6] * t[6] + f[6]; 60 | t[7] = t[7] * t[7] + f[7]; 61 | } 62 | if constexpr (compute_iterations % 8 > 0) { 63 | t[0] = t[0] * t[0] + f[0]; 64 | } 65 | if constexpr (compute_iterations % 8 > 1) { 66 | t[1] = t[1] * t[1] + f[1]; 67 | } 68 | if constexpr (compute_iterations % 8 > 2) { 69 | t[2] = t[2] * t[2] + f[2]; 70 | } 71 | if constexpr (compute_iterations % 8 > 3) { 72 | t[3] = t[3] * t[3] + f[3]; 73 | } 74 | if constexpr (compute_iterations % 8 > 4) { 75 | t[4] = t[4] * t[4] + f[4]; 76 | } 77 | if constexpr (compute_iterations % 8 > 5) { 78 | t[5] = t[5] * t[5] + f[5]; 79 | } 80 | if constexpr (compute_iterations % 8 > 6) { 81 | t[6] = t[6] * t[6] + f[6]; 82 | } 83 | sum += t[0] + t[1] + t[2] + t[3] + t[4] + t[5] + t[6] + t[7]; 84 | } 85 | return sum; 86 | } 87 | 88 | #endif 89 | 90 | template 91 | __attribute__((optimize("unroll-loops"))) size_t bench(size_t len, 92 | const Element seed1, 93 | const Element seed2, 94 | Element* src) { 95 | Element sum = 0; 96 | constexpr size_t static_chunk_size = 4096; 97 | 98 | #pragma omp parallel for reduction(+ : sum) schedule(static) 99 | for (size_t it_base = 0; it_base < len; it_base += static_chunk_size) { 100 | sum += bench_block( 101 | &src[it_base]); 102 | } 103 | 104 | *src = sum; 105 | return len; 106 | } 107 | 108 | auto runbench_warmup(double* c, size_t size) { 109 | auto timer_start = benchmark_clock::now(); 110 | 111 | bench(size, 1., -1., c); 112 | 113 | auto timer_duration = benchmark_clock::now() - timer_start; 114 | return std::chrono::duration_cast(timer_duration) 115 | .count(); 116 | } 117 | 118 | template 119 | auto measure_operation(Op op) { 120 | auto timer_start = benchmark_clock::now(); 121 | op(); 122 | auto timer_duration = benchmark_clock::now() - timer_start; 123 | return std::chrono::duration_cast(timer_duration) 124 | .count() / 125 | 1000.; 126 | } 127 | 128 | template 129 | auto benchmark_omp(Op op) { 130 | constexpr int total_runs = 20; 131 | constexpr int total_half_thread_runs = 10; 132 | 133 | auto duration = op(); // drop first measurement 134 | std::vector measurements; 135 | 136 | // 1st try with full 
threading 137 | omp_set_num_threads(base_omp_get_max_threads); 138 | 139 | for (int i = 1; i < total_runs; i++) { 140 | duration = op(); 141 | measurements.push_back(duration); 142 | } 143 | 144 | // then try with half threading 145 | if (base_omp_get_max_threads > 1) { 146 | omp_set_num_threads(base_omp_get_max_threads / 2); 147 | 148 | for (int i = 1; i < total_half_thread_runs; i++) { 149 | duration = op(); 150 | measurements.push_back(duration); 151 | } 152 | } 153 | 154 | return *std::min_element(std::begin(measurements), std::end(measurements)); 155 | } 156 | 157 | class ComputeSpace { 158 | size_t memory_space_{0}; 159 | int compute_iterations_{0}; 160 | 161 | public: 162 | ComputeSpace(size_t memory_space, int compute_iterations) 163 | : memory_space_{memory_space}, compute_iterations_{compute_iterations} {} 164 | 165 | template 166 | size_t compute_ops() const { 167 | const auto total_elements = element_count(); 168 | const long long computations = 169 | total_elements /* Vector length */ 170 | * compute_iterations_ /* Core loop iteration count */ 171 | * 2 /* Flops per core loop iteration */ 172 | * 1 /* FMAs in the inner most loop */ 173 | + total_elements - 1 /* Due to sum reduction */ 174 | ; 175 | return computations; 176 | } 177 | 178 | size_t memory_traffic() const { return memory_space_; } 179 | 180 | template 181 | size_t element_count() const { 182 | return memory_space_ / sizeof(T); 183 | } 184 | }; 185 | 186 | template 187 | void runbench(double* c, size_t size) { 188 | ComputeSpace cs{size * sizeof(double), compute_iterations}; 189 | 190 | // floating point part (single prec) 191 | auto kernel_time_mad_sp = benchmark_omp([&] { 192 | return measure_operation([&] { 193 | bench(cs.element_count(), 1.f, -1.f, 194 | reinterpret_cast(c)); 195 | }); 196 | }); 197 | 198 | // floating point part (double prec) 199 | auto kernel_time_mad_dp = benchmark_omp([&] { 200 | return measure_operation([&] { 201 | bench(cs.element_count(), 1., -1., c); 202 | }); 203 | }); 204 | 205 | // integer part 206 | auto kernel_time_mad_int = benchmark_omp([&] { 207 | return measure_operation([&] { 208 | bench(cs.element_count(), 1, -1, 209 | reinterpret_cast(c)); 210 | }); 211 | }); 212 | 213 | const auto computations_sp = cs.compute_ops(); 214 | const auto computations_dp = cs.compute_ops(); 215 | const auto computations_int = cs.compute_ops(); 216 | const auto memory_traffic = cs.memory_traffic(); 217 | 218 | const auto setw = std::setw; 219 | const auto setprecision = std::setprecision; 220 | std::cout << std::fixed << " " << std::setw(4) << compute_iterations 221 | << ", " << setw(8) << setprecision(3) 222 | << static_cast(computations_sp) / 223 | static_cast(memory_traffic) 224 | << "," << setw(8) << setprecision(2) << kernel_time_mad_sp << "," 225 | << setw(8) << setprecision(2) 226 | << static_cast(computations_sp) / kernel_time_mad_sp * 227 | 1000. / static_cast(1000 * 1000 * 1000) 228 | << "," << setw(7) << setprecision(2) 229 | << static_cast(memory_traffic) / kernel_time_mad_sp * 230 | 1000. / (1000. * 1000. * 1000.) 231 | 232 | << ", " << setw(8) << setprecision(3) 233 | << static_cast(computations_dp) / 234 | static_cast(memory_traffic) 235 | << "," << setw(8) << setprecision(2) << kernel_time_mad_dp << "," 236 | << setw(8) << setprecision(2) 237 | << static_cast(computations_dp) / kernel_time_mad_dp * 238 | 1000. / static_cast(1000 * 1000 * 1000) 239 | << "," << setw(7) << setprecision(2) 240 | << static_cast(memory_traffic) / kernel_time_mad_dp * 241 | 1000. / (1000. * 1000. * 1000.) 
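                 // Each four-column group (single precision, double precision, integer)
                 // follows the same pattern: operational intensity in ops/byte, minimum
                 // measured time in ms, GFLOPS or GIOPS (ops / ms * 1000 / 1e9) and
                 // effective bandwidth in GB/s, with op counts from ComputeSpace and the
                 // memory traffic taken as the whole buffer size in bytes.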
242 | 243 | << ", " << setw(8) << setprecision(3) 244 | << static_cast(computations_int) / 245 | static_cast(memory_traffic) 246 | << "," << setw(8) << setprecision(2) << kernel_time_mad_int << "," 247 | << setw(8) << setprecision(2) 248 | << static_cast(computations_int) / kernel_time_mad_int * 249 | 1000. / static_cast(1000 * 1000 * 1000) 250 | << "," << setw(7) << setprecision(2) 251 | << static_cast(memory_traffic) / kernel_time_mad_int * 252 | 1000. / (1000. * 1000. * 1000.) 253 | 254 | << std::endl; 255 | } 256 | 257 | // Variadic template helper to ease multiple configuration invocations 258 | template 259 | void runbench_range(double* cd, long size) { 260 | runbench(cd, size); 261 | } 262 | 263 | template 264 | void runbench_range(double* cd, long size) { 265 | runbench_range(cd, size); 266 | runbench_range(cd, size); 267 | } 268 | 269 | void mixbenchCPU(double* c, size_t size) { 270 | // Initialize data to zeros on memory by respecting 1st touch policy 271 | #pragma omp parallel for schedule(static) 272 | for (size_t i = 0; i < size; i++) 273 | c[i] = 0.0; 274 | 275 | std::cout << "--------------------------------------------" 276 | "-------------- CSV data " 277 | "--------------------------------------------" 278 | "--------------" 279 | << std::endl; 280 | std::cout << "Experiment ID, Single Precision ops,,,, Double " 281 | "precision ops,,,, Integer operations,,, " 282 | << std::endl; 283 | std::cout << "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, " 284 | "Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, " 285 | "GIOPS, GB/sec" 286 | << std::endl; 287 | 288 | runbench_warmup(c, size); 289 | 290 | runbench_range<0, 1, 2, 3, 4, 6, 8, 12, 16, 20, 24, 28, 32, 40, 6 * 8, 7 * 8, 291 | 8 * 8, 10 * 8, 13 * 8, 15 * 8, 16 * 8, 20 * 8, 24 * 8, 32 * 8, 292 | 40 * 8, 64 * 8>(c, size); 293 | 294 | std::cout << "---------------------------------------------------------------" 295 | "---------------------------------------------------------------" 296 | << std::endl; 297 | } 298 | -------------------------------------------------------------------------------- /mixbench-hip/mix_kernels_hip.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_hip.cpp: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 
4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #ifdef __CUDACC__ 14 | #include 15 | #define GPU_INF(_T) (_T)(CUDART_INF) 16 | #else 17 | #include 18 | #define GPU_INF(_T) std::numeric_limits<_T>::infinity() 19 | #endif 20 | 21 | typedef __half2 half2; 22 | 23 | #include 24 | #include "lhiputil.h" 25 | 26 | #define ELEMENTS_PER_THREAD (8) 27 | 28 | template 29 | inline __device__ T mad(const T& a, const T& b, const T& c) { 30 | return a * b + c; 31 | } 32 | 33 | template <> 34 | inline __device__ double mad(const double& a, 35 | const double& b, 36 | const double& c) { 37 | return fma(a, b, c); 38 | } 39 | 40 | template <> 41 | inline __device__ half2 mad(const half2& a, const half2& b, const half2& c) { 42 | return __hfma2(a, b, c); 43 | } 44 | 45 | template 46 | inline __device__ bool is_equal(const T& a, const T& b) { 47 | return a == b; 48 | } 49 | 50 | template <> 51 | inline __device__ bool is_equal(const half2& a, const half2& b) { 52 | return __hbeq2(a, b); 53 | } 54 | 55 | template 59 | __global__ void benchmark_func(T seed, T* g_data) { 60 | const int stride = blockSize; 61 | const int idx = hipBlockIdx_x * blockSize * granularity + hipThreadIdx_x; 62 | 63 | T tmps[granularity]; 64 | #pragma unroll 65 | for (int j = 0; j < granularity; j++) { 66 | // Load elements (memory intensive part) 67 | tmps[j] = g_data[idx + j * stride]; 68 | // Perform computations (compute intensive part) 69 | for (int i = 0; i < compute_iterations; i++) { 70 | tmps[j] = mad(tmps[j], tmps[j], seed); 71 | } 72 | } 73 | // Multiply add reduction 74 | T sum = static_cast(0); 75 | #pragma unroll 76 | for (int j = 0; j < granularity; j += 2) { 77 | sum = mad(tmps[j], tmps[j + 1], sum); 78 | } 79 | // Dummy code 80 | if (is_equal(sum, static_cast(-1))) // Designed so it never executes 81 | g_data[idx] = sum; 82 | } 83 | 84 | void initializeEvents_ext(hipEvent_t* start, hipEvent_t* stop) { 85 | HIP_SAFE_CALL(hipEventCreate(start)); 86 | HIP_SAFE_CALL(hipEventCreate(stop)); 87 | } 88 | 89 | float finalizeEvents_ext(hipEvent_t start, hipEvent_t stop) { 90 | HIP_SAFE_CALL(hipGetLastError()); 91 | HIP_SAFE_CALL(hipEventSynchronize(stop)); 92 | float kernel_time; 93 | HIP_SAFE_CALL(hipEventElapsedTime(&kernel_time, start, stop)); 94 | HIP_SAFE_CALL(hipEventDestroy(start)); 95 | HIP_SAFE_CALL(hipEventDestroy(stop)); 96 | return kernel_time; 97 | } 98 | 99 | void runbench_warmup(double* cd, long size) { 100 | const long reduced_grid_size = size / (ELEMENTS_PER_THREAD) / 128; 101 | const int BLOCK_SIZE = 256; 102 | const int TOTAL_REDUCED_BLOCKS = reduced_grid_size / BLOCK_SIZE; 103 | 104 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 105 | dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1); 106 | 107 | hipLaunchKernelGGL( 108 | HIP_KERNEL_NAME( 109 | benchmark_func), 110 | dim3(dimReducedGrid), dim3(dimBlock), 0, 0, (short)1, (short*)cd); 111 | HIP_SAFE_CALL(hipGetLastError()); 112 | HIP_SAFE_CALL(hipDeviceSynchronize()); 113 | } 114 | 115 | template 116 | void runbench(double* cd, long size) { 117 | const long compute_grid_size = size / ELEMENTS_PER_THREAD; 118 | const int BLOCK_SIZE = 256; 119 | const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE; 120 | const long long computations = 121 | ELEMENTS_PER_THREAD * (long long)compute_grid_size + 122 | (2 * ELEMENTS_PER_THREAD * compute_iterations) * 123 | (long long)compute_grid_size; 124 | const long long memoryoperations = size; 125 | 126 | dim3 dimBlock(BLOCK_SIZE, 1, 1); 127 | dim3 
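  // Operation accounting used above: every work-item performs compute_iterations
  // MADs per element (2 flops each) plus ELEMENTS_PER_THREAD/2 reduction MADs, and
  // the grid spans size/ELEMENTS_PER_THREAD work-items, so computations equals
  // size * (1 + 2*compute_iterations) while memoryoperations equals the number of
  // elements transferred.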
dimGrid(TOTAL_BLOCKS, 1, 1); 128 | hipEvent_t start, stop; 129 | 130 | constexpr auto total_bench_iterations = 3; 131 | 132 | float kernel_time_mad_sp = benchmark([&]() { 133 | initializeEvents_ext(&start, &stop); 134 | hipExtLaunchKernelGGL( 135 | HIP_KERNEL_NAME(benchmark_func), 137 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, 1.0f, (float*)cd); 138 | return finalizeEvents_ext(start, stop); 139 | }); 140 | 141 | float kernel_time_mad_sp2 = benchmark([&]() { 142 | initializeEvents_ext(&start, &stop); 143 | hipExtLaunchKernelGGL( 144 | HIP_KERNEL_NAME(benchmark_func), 146 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, float2{1.0f}, 147 | (float2*)cd); 148 | return finalizeEvents_ext(start, stop); 149 | }); 150 | 151 | float kernel_time_mad_dp = benchmark([&]() { 152 | initializeEvents_ext(&start, &stop); 153 | hipExtLaunchKernelGGL( 154 | HIP_KERNEL_NAME(benchmark_func), 156 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, 1.0, cd); 157 | return finalizeEvents_ext(start, stop); 158 | }); 159 | 160 | float kernel_time_mad_hp = benchmark([&]() { 161 | initializeEvents_ext(&start, &stop); 162 | half2 h_ones(1.0f); 163 | hipExtLaunchKernelGGL( 164 | HIP_KERNEL_NAME(benchmark_func), 166 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, h_ones, 167 | (half2*)cd); 168 | return finalizeEvents_ext(start, stop); 169 | }); 170 | 171 | float kernel_time_mad_int = benchmark([&]() { 172 | initializeEvents_ext(&start, &stop); 173 | hipExtLaunchKernelGGL( 174 | HIP_KERNEL_NAME(benchmark_func), 176 | dim3(dimGrid), dim3(dimBlock), 0, 0, start, stop, 0, 1, (int*)cd); 177 | return finalizeEvents_ext(start, stop); 178 | }); 179 | 180 | printf( 181 | " %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, " 182 | "%8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, " 183 | "%8.3f,%8.2f,%8.2f,%7.2f\n", 184 | compute_iterations, 185 | // SP 186 | ((double)computations) / ((double)memoryoperations * sizeof(float)), 187 | kernel_time_mad_sp, 188 | ((double)computations) / kernel_time_mad_sp * 1000. / 189 | (double)(1000 * 1000 * 1000), 190 | ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / 191 | (1000. * 1000. * 1000.), 192 | // Packed SP 193 | ((double)2 * computations) / ((double)memoryoperations * sizeof(float2)), 194 | kernel_time_mad_sp2, 195 | ((double)2 * computations) / kernel_time_mad_sp2 * 1000. / 196 | (double)(1000 * 1000 * 1000), 197 | ((double)memoryoperations * sizeof(float2)) / kernel_time_mad_sp2 * 198 | 1000. / (1000. * 1000. * 1000.), 199 | // DP 200 | ((double)computations) / ((double)memoryoperations * sizeof(double)), 201 | kernel_time_mad_dp, 202 | ((double)computations) / kernel_time_mad_dp * 1000. / 203 | (double)(1000 * 1000 * 1000), 204 | ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / 205 | (1000. * 1000. * 1000.), 206 | // Packed HP 207 | ((double)2 * computations) / ((double)memoryoperations * sizeof(half2)), 208 | kernel_time_mad_hp, 209 | ((double)2 * computations) / kernel_time_mad_hp * 1000. / 210 | (double)(1000 * 1000 * 1000), 211 | ((double)memoryoperations * sizeof(half2)) / kernel_time_mad_hp * 1000. / 212 | (1000. * 1000. * 1000.), 213 | // Int 214 | ((double)computations) / ((double)memoryoperations * sizeof(int)), 215 | kernel_time_mad_int, 216 | ((double)computations) / kernel_time_mad_int * 1000. / 217 | (double)(1000 * 1000 * 1000), 218 | ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / 219 | (1000. * 1000. 
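      // The launches above rely on hipExtLaunchKernelGGL, which accepts the
      // start/stop events as launch arguments and records them around the kernel;
      // each kernel is run total_bench_iterations (= 3) times through the
      // benchmark<> helper before its time is reported.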
* 1000.)); 220 | } 221 | 222 | extern "C" void mixbenchGPU(double* c, long size) { 223 | const char* benchtype = "compute with global memory (block strided)"; 224 | 225 | printf("Trade-off type: %s\n", benchtype); 226 | printf("Elements per thread: %d\n", ELEMENTS_PER_THREAD); 227 | printf("Thread fusion degree: %d\n", 1); 228 | double* cd; 229 | 230 | HIP_SAFE_CALL(hipMalloc((void**)&cd, size * sizeof(double))); 231 | 232 | // Copy data to device memory 233 | HIP_SAFE_CALL( 234 | hipMemset(cd, 0, size * sizeof(double))); // initialize to zeros 235 | 236 | // Synchronize in order to wait for memory operations to finish 237 | HIP_SAFE_CALL(hipDeviceSynchronize()); 238 | 239 | printf( 240 | "------------------------------------------------------------------------" 241 | "----- CSV data " 242 | "------------------------------------------------------------------------" 243 | "-------------------------------------------\n"); 244 | printf( 245 | "Experiment ID, Single Precision ops,,,, Packed Single " 246 | "Precision ops,,,, Double precision ops,,,, Half " 247 | "precision ops,,,, Integer operations,,, \n"); 248 | printf( 249 | "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, " 250 | "ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, " 251 | "Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, " 252 | "GB/sec\n"); 253 | 254 | runbench_warmup(cd, size); 255 | 256 | runbench<0>(cd, size); 257 | runbench<1>(cd, size); 258 | runbench<2>(cd, size); 259 | runbench<3>(cd, size); 260 | runbench<4>(cd, size); 261 | runbench<5>(cd, size); 262 | runbench<6>(cd, size); 263 | runbench<7>(cd, size); 264 | runbench<8>(cd, size); 265 | runbench<9>(cd, size); 266 | runbench<10>(cd, size); 267 | runbench<11>(cd, size); 268 | runbench<12>(cd, size); 269 | runbench<13>(cd, size); 270 | runbench<14>(cd, size); 271 | runbench<15>(cd, size); 272 | runbench<16>(cd, size); 273 | runbench<17>(cd, size); 274 | runbench<18>(cd, size); 275 | runbench<20>(cd, size); 276 | runbench<22>(cd, size); 277 | runbench<24>(cd, size); 278 | runbench<28>(cd, size); 279 | runbench<32>(cd, size); 280 | runbench<40>(cd, size); 281 | runbench<48>(cd, size); 282 | runbench<56>(cd, size); 283 | runbench<64>(cd, size); 284 | runbench<80>(cd, size); 285 | runbench<96>(cd, size); 286 | runbench<128>(cd, size); 287 | runbench<256>(cd, size); 288 | runbench<512>(cd, size); 289 | 290 | printf( 291 | "------------------------------------------------------------------------" 292 | "------------------------------------------------------------------------" 293 | "----------------------------------------------------------\n"); 294 | 295 | // Copy results back to host memory 296 | HIP_SAFE_CALL(hipMemcpy(c, cd, size * sizeof(double), hipMemcpyDeviceToHost)); 297 | 298 | HIP_SAFE_CALL(hipFree(cd)); 299 | 300 | HIP_SAFE_CALL(hipDeviceReset()); 301 | } 302 | -------------------------------------------------------------------------------- /mixbench-sycl/mix_kernels_sycl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * mix_kernels_sycl.cpp: This file is part of the mixbench GPU micro-benchmark suite. 
3 | * 4 | * Contact: Elias Konstantinidis 5 | **/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "lsyclutil.h" 13 | 14 | #define ELEMENTS_PER_THREAD (8) 15 | #define FUSION_DEGREE (4) 16 | 17 | #ifdef __HIPSYCL__ 18 | #include 19 | #else 20 | using half2 = sycl::half2; 21 | using half = sycl::half; 22 | #endif 23 | 24 | template 25 | struct MADOperator { 26 | T operator()(T a, T b, T c) { return a * b + c; } 27 | }; 28 | 29 | #ifndef __HIPSYCL__ 30 | // Use partial specialization for calling sycl::mad() for generic floating point types 31 | template 32 | struct MADOperator::value>> { 33 | T operator()(T a, T b, T c) { 34 | return sycl::mad(a, b, c); 35 | } 36 | }; 37 | #else 38 | #ifdef SYCL_DEVICE_ONLY 39 | // Packed half precision operation support via ROCm 40 | // 41 | template <> 42 | struct MADOperator { 43 | half2 operator()(half2 a, half2 b, half2 c) { 44 | return __hfma2(a, b, c); 45 | } 46 | }; 47 | #endif 48 | #endif 49 | 50 | template 51 | struct EqualOperator { 52 | bool operator()(T a, T b) { 53 | return a == b; 54 | } 55 | }; 56 | 57 | template <> 58 | struct EqualOperator { 59 | bool operator()(half2 a, half2 b) { 60 | #ifdef __HIPSYCL__ 61 | return __hbeq2(a, b); 62 | #else 63 | return a[0] == b[0] && a[1] == b[1]; 64 | #endif 65 | } 66 | }; 67 | 68 | template 69 | struct FromIntOperator { 70 | T operator()(const int i) { 71 | return static_cast(i); 72 | } 73 | }; 74 | 75 | template <> 76 | struct FromIntOperator { 77 | half2 operator()(const int i) { 78 | #ifdef __HIPSYCL__ 79 | return half2{i,i}; 80 | #else 81 | return sycl::int2{i}.convert(); 82 | #endif 83 | } 84 | }; 85 | 86 | template 87 | void benchmark_func(T seed, T *g_data, sycl::nd_item<1> item_ct1) { 88 | const unsigned int blockSize = item_ct1.get_local_range(0); 89 | const int stride = blockSize; 90 | int idx = item_ct1.get_group(0) * blockSize * granularity + item_ct1.get_local_id(0); 91 | const int big_stride = item_ct1.get_group_range(0) * blockSize * granularity; 92 | /* 93 | #ifdef BLOCK_STRIDED 94 | const int stride = blockSize; 95 | const int idx = get_group_id(0)*blockSize*ELEMENTS_PER_THREAD + get_local_id(0); 96 | #else 97 | const int grid_size = blockSize * get_num_groups(0); 98 | const int stride = grid_size; 99 | const int idx = get_global_id(0); 100 | #endif 101 | const int big_stride = get_num_groups(0)*blockSize*ELEMENTS_PER_THREAD; 102 | */ 103 | // Type specialized functors 104 | MADOperator mad_op; 105 | EqualOperator equal_op; 106 | FromIntOperator from_int_op; 107 | T tmps[granularity]; 108 | for (int k = 0; k < fusion_degree; k++) { 109 | #pragma unroll 110 | for (int j = 0; j < granularity; j++) { 111 | // Load elements (memory intensive part) 112 | tmps[j] = g_data[idx + j * stride + k * big_stride]; 113 | // Perform computations (compute intensive part) 114 | for (int i = 0; i < compute_iterations; i++) { 115 | tmps[j] = mad_op(tmps[j], tmps[j], seed); 116 | } 117 | } 118 | // Multiply add reduction 119 | T sum = from_int_op(0); 120 | //#pragma unroll 121 | for (int j = 0; j < granularity; j += 2) { 122 | sum = mad_op(tmps[j], tmps[j + 1], sum); 123 | } 124 | // Dummy code just to avoid dead code elimination 125 | if (equal_op(sum, from_int_op(-1))) { // Designed so it never executes 126 | g_data[idx + k * big_stride] = sum; 127 | } 128 | } 129 | } 130 | 131 | using time_point = std::chrono::time_point; 132 | 133 | time_point initializeEvents(void) { 134 | return std::chrono::high_resolution_clock::now(); 135 | } 136 | 137 | double 
finalizeEvents(bool use_host_timer, sycl::event ev_krn_execution, const time_point &tp_start_compute) { 138 | ev_krn_execution.wait(); 139 | if (use_host_timer) { 140 | const time_point tp_stop_compute = std::chrono::high_resolution_clock::now(); 141 | return std::chrono::duration(tp_stop_compute - tp_start_compute).count(); 142 | } else { 143 | // Disabled for hipSYCL: error: no matching member function for call to 'get_profiling_info' 144 | return (ev_krn_execution.get_profiling_info() - 145 | ev_krn_execution.get_profiling_info()) / 146 | 1000000.0; 147 | } 148 | } 149 | 150 | void runbench_warmup(sycl::queue &queue, void *cd, long size) { 151 | const long reduced_grid_size = size / (ELEMENTS_PER_THREAD) / 128; 152 | const int BLOCK_SIZE = 256; 153 | const int TOTAL_REDUCED_BLOCKS = reduced_grid_size / BLOCK_SIZE; 154 | 155 | sycl::range<1> dimBlock(BLOCK_SIZE); 156 | sycl::range<1> dimReducedGrid(TOTAL_REDUCED_BLOCKS); 157 | 158 | queue.submit([&](sycl::handler &cgh) { 159 | cgh.parallel_for( 160 | sycl::nd_range<1>(dimReducedGrid * dimBlock, dimBlock), 161 | [=](sycl::nd_item<1> item_ct1) { 162 | benchmark_func( 163 | (short)1, (short *)cd, item_ct1); 164 | }); 165 | }); 166 | 167 | queue.wait(); 168 | } 169 | 170 | // forward declarations of kernel classes 171 | template 172 | class krn_float; 173 | template 174 | class krn_double; 175 | template 176 | class krn_half; 177 | template 178 | class krn_int; 179 | 180 | template 181 | void runbench(sycl::queue &queue, void *cd, long size, bool doHalfs, bool doDoubles, bool use_os_timer, size_t workgroupsize) { 182 | const long compute_grid_size = size / ELEMENTS_PER_THREAD / FUSION_DEGREE; 183 | const int BLOCK_SIZE = workgroupsize; 184 | const int TOTAL_BLOCKS = compute_grid_size / BLOCK_SIZE; 185 | 186 | const sycl::range<1> dimBlock{static_cast(BLOCK_SIZE)}; 187 | const sycl::range<1> dimGrid{static_cast(TOTAL_BLOCKS)}; 188 | 189 | constexpr auto total_bench_iterations = 3; 190 | 191 | // floating point part (single prec) 192 | auto kernel_time_mad_sp = benchmark([&]() { 193 | time_point tp_start_compute = initializeEvents(); 194 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 195 | cgh.parallel_for>( 196 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 197 | [=](sycl::nd_item<1> item_ct1) { 198 | benchmark_func(-1.0f, (float*)cd, item_ct1); 200 | }); 201 | }); 202 | return finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 203 | }); 204 | 205 | // floating point part (double prec) 206 | double kernel_time_mad_dp = 0.; 207 | if (doDoubles) { 208 | kernel_time_mad_dp = benchmark([&]() { 209 | time_point tp_start_compute = initializeEvents(); 210 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 211 | cgh.parallel_for>( 212 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 213 | [=](sycl::nd_item<1> item_ct1) { 214 | benchmark_func(-1.0, reinterpret_cast(cd), item_ct1); 216 | }); 217 | }); 218 | return finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 219 | }); 220 | } 221 | 222 | double kernel_time_mad_hp = 0.; 223 | // floating point part (half prec) 224 | if (doHalfs) { 225 | kernel_time_mad_hp = benchmark([&]() { 226 | time_point tp_start_compute = initializeEvents(); 227 | half2 h_ones{-1.0f, -1.0f}; 228 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 229 | cgh.parallel_for>( 230 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 231 | [=](sycl::nd_item<1> item_ct1) { 232 | benchmark_func( 234 | h_ones, reinterpret_cast(cd), item_ct1); 235 | }); 236 | }); 237 | return 
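            // krn_float / krn_double / krn_half / krn_int above are forward-declared
            // solely to act as explicit kernel names for each instantiation;
            // presumably this is what allows -fsycl-device-code-split=per_kernel
            // (see the SYCL README) to keep kernels for unsupported datatypes out of
            // the selected device image.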
finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 238 | }); 239 | } 240 | 241 | // integer part 242 | auto kernel_time_mad_int = benchmark([&]() { 243 | time_point tp_start_compute = initializeEvents(); 244 | auto ev_exec = queue.submit([&](sycl::handler& cgh) { 245 | cgh.parallel_for>( 246 | sycl::nd_range<1>(dimGrid * dimBlock, dimBlock), 247 | [=](sycl::nd_item<1> item_ct1) { 248 | benchmark_func( 250 | -1, (int*)cd, item_ct1); // seed 1 causes unwanted code 251 | // elimination optimization 252 | }); 253 | }); 254 | return finalizeEvents(use_os_timer, ev_exec, tp_start_compute); 255 | }); 256 | 257 | const long long computations = (ELEMENTS_PER_THREAD * (long long)compute_grid_size + (2 * ELEMENTS_PER_THREAD * compute_iterations) * (long long)compute_grid_size) * FUSION_DEGREE; 258 | const long long memoryoperations = size; 259 | 260 | const auto setw = std::setw; 261 | const auto setprecision = std::setprecision; 262 | std::cout << std::fixed << " " << std::setw(4) << compute_iterations 263 | << ", " << setw(8) << setprecision(3) << ((double)computations) / ((double)memoryoperations * sizeof(float)) 264 | << "," << setw(8) << setprecision(2) << kernel_time_mad_sp 265 | << "," << setw(8) << setprecision(2) << ((double)computations) / kernel_time_mad_sp * 1000. / (double)(1000 * 1000 * 1000) 266 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / (1000. * 1000. * 1000.) 267 | 268 | << ", " << setw(8) << setprecision(3) << ((double)computations) / ((double)memoryoperations * sizeof(double)) 269 | << "," << setw(8) << setprecision(2) << kernel_time_mad_dp 270 | << "," << setw(8) << setprecision(2) << ((double)computations) / kernel_time_mad_dp * 1000. / (double)(1000 * 1000 * 1000) 271 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / (1000. * 1000. * 1000.) 272 | 273 | << ", " << setw(8) << setprecision(3) << ((double)2 * computations) / ((double)memoryoperations * sizeof(half2)) 274 | << "," << setw(8) << setprecision(2) << kernel_time_mad_hp 275 | << "," << setw(8) << setprecision(2) << ((double)2 * computations) / kernel_time_mad_hp * 1000. / (double)(1000 * 1000 * 1000) 276 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(half2)) / kernel_time_mad_hp * 1000. / (1000. * 1000. * 1000.) 277 | 278 | << ", " << setw(8) << setprecision(3) << ((double)computations) / ((double)memoryoperations * sizeof(int)) 279 | << "," << setw(8) << setprecision(2) << kernel_time_mad_int 280 | << "," << setw(8) << setprecision(2) << ((double)computations) / kernel_time_mad_int * 1000. / (double)(1000 * 1000 * 1000) 281 | << "," << setw(7) << setprecision(2) << ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / (1000. * 1000. * 1000.) 
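                 // If fp64 or fp16 is unsupported, the corresponding kernel_time stays
                 // at 0.0, so the derived GFLOPS and GB/sec values in those column
                 // groups should be ignored for such devices.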
282 | 283 | << std::endl; 284 | } 285 | 286 | // Variadic template helper to ease multiple configuration invocations 287 | template 288 | void runbench_range(sycl::queue &queue, void *cd, long size, bool doHalfs, bool doDoubles, bool use_os_timer, size_t workgroupsize) { 289 | runbench(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 290 | } 291 | 292 | template 293 | void runbench_range(sycl::queue &queue, void *cd, long size, bool doHalfs, bool doDoubles, bool use_os_timer, size_t workgroupsize) { 294 | runbench_range(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 295 | runbench_range(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 296 | } 297 | 298 | void mixbenchGPU(const sycl::device &dev, void *c, long size, bool use_os_timer, size_t workgroupsize) { 299 | const sycl::property_list queue_prop_list = use_os_timer ? sycl::property_list{} : sycl::property_list{sycl::property::queue::enable_profiling()}; 300 | sycl::queue queue{dev, queue_prop_list}; 301 | 302 | std::cout << "Elements per thread: " << ELEMENTS_PER_THREAD << std::endl; 303 | std::cout << "Thread fusion degree: " << FUSION_DEGREE << std::endl; 304 | std::cout << "Timer: " << (use_os_timer ? "OS based" : "SYCL event based") << std::endl; 305 | 306 | #ifndef __HIPSYCL__ 307 | const bool doHalfs = dev.has(sycl::aspect::fp16); 308 | if (!doHalfs) { 309 | std::cout << "Warning: Half precision computations are not supported" << std::endl; 310 | } 311 | 312 | const bool doDoubles = dev.has(sycl::aspect::fp64); 313 | if (!doDoubles) { 314 | std::cout << "Warning: Double precision computations are not supported" << std::endl; 315 | } 316 | #else 317 | const bool doHalfs = true; 318 | const bool doDoubles = true; 319 | std::cout << "Warning: hipSYCL - Assuming half and double precision support" << std::endl; 320 | #endif 321 | 322 | double *cd = sycl::malloc_device(size, queue); 323 | 324 | // Initialize data to zeros on device memory 325 | queue.memset(cd, 0, size * sizeof(double)); 326 | 327 | // Synchronize in order to wait for memory operations to finish 328 | queue.wait(); 329 | 330 | std::cout << "----------------------------------------------------------------------------- CSV data -----------------------------------------------------------------------------" << std::endl; 331 | std::cout << "Experiment ID, Single Precision ops,,,, Double precision ops,,,, Half precision ops,,,, Integer operations,,, " << std::endl; 332 | std::cout << "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec" << std::endl; 333 | 334 | runbench_warmup(queue, cd, size); 335 | 336 | runbench_range<0, 1, 2, 3, 4, 5, 6, 7, 8, 337 | 9, 10, 11, 12, 13, 14, 15, 16, 338 | 17, 18, 20, 22, 24, 28, 32, 40, 339 | 48, 56, 64, 80, 96, 128, 192, 256>(queue, cd, size, doHalfs, doDoubles, use_os_timer, workgroupsize); 340 | 341 | std::cout << "--------------------------------------------------------------------------------------------------------------------------------------------------------------------" << std::endl; 342 | 343 | // Copy results to host memory and release device memory 344 | queue.memcpy(c, cd, size * sizeof(double)).wait(); 345 | 346 | sycl::free(cd, queue); 347 | } 348 | -------------------------------------------------------------------------------- /mixbench-opencl/mix_kernels_ocl.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * 
mix_kernels_ocl.cpp: This file is part of the mixbench GPU micro-benchmark 3 | *suite. 4 | * 5 | * Contact: Elias Konstantinidis 6 | **/ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "loclutil.h" 14 | 15 | #if defined(_MSC_VER) 16 | #define SIZE_T_FORMAT "%lu" 17 | #else 18 | #define SIZE_T_FORMAT "%zu" 19 | #endif 20 | 21 | enum KrnDataType { kdt_int, kdt_float, kdt_double, kdt_half }; 22 | 23 | const int compute_iterations[] = { 24 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 25 | 17, 18, 20, 22, 24, 28, 32, 40, 48, 56, 64, 80, 96, 128, 192, 256}; 26 | const int compute_iterations_len = 27 | sizeof(compute_iterations) / sizeof(*compute_iterations); 28 | 29 | #ifdef HF_WORKAROUND 30 | typedef short cl_half2[2]; 31 | #endif 32 | 33 | char* ReadFile(const char* filename) { 34 | char* buffer = NULL; 35 | int file_size, read_size; 36 | FILE* file = fopen(filename, "r"); 37 | if (!file) 38 | return NULL; 39 | // Seek EOF 40 | fseek(file, 0, SEEK_END); 41 | // Get offset 42 | file_size = ftell(file); 43 | rewind(file); 44 | buffer = (char*)malloc(sizeof(char) * (file_size + 1)); 45 | read_size = fread(buffer, sizeof(char), file_size, file); 46 | buffer[file_size] = '\0'; 47 | if (file_size != read_size) { 48 | free(buffer); 49 | buffer = NULL; 50 | } 51 | return buffer; 52 | } 53 | 54 | void flushed_printf(const char* format, ...) { 55 | va_list args; 56 | va_start(args, format); 57 | vprintf(format, args); 58 | va_end(args); 59 | fflush(stdout); 60 | } 61 | 62 | void show_progress_init(int length) { 63 | flushed_printf("["); 64 | for (int i = 0; i < length; i++) 65 | flushed_printf(" "); 66 | flushed_printf("]"); 67 | for (int i = 0; i <= length; i++) 68 | flushed_printf("\b"); 69 | } 70 | 71 | void show_progress_step(int domove, char newchar) { 72 | flushed_printf("%c", newchar); 73 | if (!domove) 74 | flushed_printf("\b"); 75 | } 76 | 77 | void show_progress_done(void) { 78 | flushed_printf("\n"); 79 | } 80 | 81 | double get_event_duration(cl_event ev) { 82 | cl_ulong ev_t_start, ev_t_finish; 83 | OCL_SAFE_CALL(clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, 84 | sizeof(cl_ulong), &ev_t_start, NULL)); 85 | OCL_SAFE_CALL(clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, 86 | sizeof(cl_ulong), &ev_t_finish, NULL)); 87 | double time = (ev_t_finish - ev_t_start) / 1000000.0; 88 | return time; 89 | } 90 | 91 | cl_kernel BuildKernel(cl_context context, 92 | cl_device_id dev_id, 93 | const char* source, 94 | const char* parameters) { 95 | cl_int errno; 96 | const char** sources = &source; 97 | cl_program program = 98 | clCreateProgramWithSource(context, 1, sources, NULL, &errno); 99 | OCL_SAFE_CALL(errno); 100 | errno = clBuildProgram(program, 1, &dev_id, parameters, NULL, NULL); 101 | if (errno != CL_SUCCESS) { 102 | fprintf(stderr, "Program built error code: %d\n", errno); 103 | size_t log_size; 104 | OCL_SAFE_CALL(clGetProgramBuildInfo(program, dev_id, CL_PROGRAM_BUILD_LOG, 105 | 0, NULL, &log_size)); 106 | char* log = (char*)alloca(log_size); 107 | OCL_SAFE_CALL(clGetProgramBuildInfo(program, dev_id, CL_PROGRAM_BUILD_LOG, 108 | log_size, log, NULL)); 109 | OCL_SAFE_CALL(clReleaseProgram(program)); 110 | fprintf(stderr, 111 | "------------------------------------ Kernel compilation log " 112 | "----------------------------------\n"); 113 | fprintf(stderr, "%s", log); 114 | fprintf(stderr, 115 | "------------------------------------------------------------------" 116 | "----------------------------\n"); 117 | exit(EXIT_FAILURE); 
118 | } 119 | // Kernel creation 120 | cl_kernel kernel = clCreateKernel(program, "benchmark_func", &errno); 121 | OCL_SAFE_CALL(errno); 122 | return kernel; 123 | } 124 | 125 | void ReleaseKernelNProgram(cl_kernel kernel) { 126 | cl_program program_tmp; 127 | OCL_SAFE_CALL(clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(program_tmp), 128 | &program_tmp, NULL)); 129 | OCL_SAFE_CALL(clReleaseKernel(kernel)); 130 | OCL_SAFE_CALL(clReleaseProgram(program_tmp)); 131 | } 132 | 133 | void runbench_warmup(cl_command_queue queue, 134 | cl_kernel kernel, 135 | cl_mem cbuffer, 136 | long size, 137 | size_t workgroupsize) { 138 | const long reduced_grid_size = size / 256; 139 | 140 | const size_t dimBlock[1] = {workgroupsize}; 141 | const size_t dimReducedGrid[1] = {(size_t)reduced_grid_size}; 142 | 143 | const short seed = 1; 144 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_short), &seed)); 145 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 146 | 147 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimReducedGrid, 148 | dimBlock, 0, NULL, NULL)); 149 | } 150 | 151 | void runbench(const int compute_iterations[], 152 | unsigned int krn_idx, 153 | cl_command_queue queue, 154 | cl_kernel kernels[kdt_double + 1][compute_iterations_len], 155 | cl_mem cbuffer, 156 | long size, 157 | size_t workgroupsize, 158 | unsigned int elements_per_wi, 159 | unsigned int fusion_degree, 160 | bool use_os_timer) { 161 | const long compute_grid_size = size / elements_per_wi / fusion_degree; 162 | const int current_compute_iterations = compute_iterations[krn_idx]; 163 | const long long computations = 164 | (elements_per_wi * (long long)compute_grid_size + 165 | (2 * elements_per_wi * current_compute_iterations) * 166 | (long long)compute_grid_size) * 167 | fusion_degree; 168 | const long long memoryoperations = size; 169 | 170 | const size_t dimBlock[1] = {workgroupsize}; 171 | const size_t dimGrid[1] = {(size_t)compute_grid_size}; 172 | 173 | constexpr auto total_bench_iterations = 3; 174 | 175 | double kernel_time_mad_sp = benchmark<total_bench_iterations>([&]() { 176 | const cl_float seed_f = 1.0f; 177 | cl_kernel kernel = kernels[kdt_float][krn_idx]; 178 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_float), &seed_f)); 179 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 180 | auto ts_start = getTimestamp(); 181 | cl_event event; 182 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 183 | dimBlock, 0, NULL, &event)); 184 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 185 | auto duration = 186 | use_os_timer ? getElapsedtime(ts_start) : get_event_duration(event); 187 | OCL_SAFE_CALL(clReleaseEvent(event)); 188 | return duration; 189 | }); 190 | 191 | double kernel_time_mad_dp = benchmark<total_bench_iterations>([&]() { 192 | const cl_double seed_d = 1.0; 193 | cl_kernel kernel = kernels[kdt_double][krn_idx]; 194 | if (kernel) { 195 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_double), &seed_d)); 196 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 197 | auto ts_start = getTimestamp(); 198 | cl_event event; 199 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 200 | dimBlock, 0, NULL, &event)); 201 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 202 | auto duration = 203 | use_os_timer ? 
getElapsedtime(ts_start) : get_event_duration(event); 204 | OCL_SAFE_CALL(clReleaseEvent(event)); 205 | return duration; 206 | } else 207 | return 0.0; 208 | }); 209 | 210 | double kernel_time_mad_hp = benchmark<total_bench_iterations>([&]() { 211 | const cl_half2 seed_h = {15360, 15360}; // {1.0, 1.0} 212 | cl_kernel kernel = kernels[kdt_half][krn_idx]; 213 | if (kernel) { 214 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_half2), &seed_h)); 215 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 216 | auto ts_start = getTimestamp(); 217 | cl_event event; 218 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 219 | dimBlock, 0, NULL, &event)); 220 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 221 | auto duration = 222 | use_os_timer ? getElapsedtime(ts_start) : get_event_duration(event); 223 | OCL_SAFE_CALL(clReleaseEvent(event)); 224 | return duration; 225 | } else 226 | return 0.0; 227 | }); 228 | 229 | double kernel_time_mad_int = benchmark<total_bench_iterations>([&]() { 230 | const cl_int seed_i = static_cast<cl_int>(1.0); 231 | cl_kernel kernel = kernels[kdt_int][krn_idx]; 232 | OCL_SAFE_CALL(clSetKernelArg(kernel, 0, sizeof(cl_int), &seed_i)); 233 | OCL_SAFE_CALL(clSetKernelArg(kernel, 1, sizeof(cl_mem), &cbuffer)); 234 | auto ts_start = getTimestamp(); 235 | cl_event event; 236 | OCL_SAFE_CALL(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, dimGrid, 237 | dimBlock, 0, NULL, &event)); 238 | OCL_SAFE_CALL(clWaitForEvents(1, &event)); 239 | auto duration = 240 | use_os_timer ? getElapsedtime(ts_start) : get_event_duration(event); 241 | OCL_SAFE_CALL(clReleaseEvent(event)); 242 | return duration; 243 | }); 244 | 245 | printf( 246 | " %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, " 247 | "%8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", 248 | current_compute_iterations, 249 | ((double)computations) / ((double)memoryoperations * sizeof(float)), 250 | kernel_time_mad_sp, 251 | ((double)computations) / kernel_time_mad_sp * 1000. / 252 | (double)(1000 * 1000 * 1000), 253 | ((double)memoryoperations * sizeof(float)) / kernel_time_mad_sp * 1000. / 254 | (1000. * 1000. * 1000.), 255 | ((double)computations) / ((double)memoryoperations * sizeof(double)), 256 | kernel_time_mad_dp, 257 | ((double)computations) / kernel_time_mad_dp * 1000. / 258 | (double)(1000 * 1000 * 1000), 259 | ((double)memoryoperations * sizeof(double)) / kernel_time_mad_dp * 1000. / 260 | (1000. * 1000. * 1000.), 261 | ((double)2 * computations) / 262 | ((double)memoryoperations * sizeof(cl_half2)), 263 | kernel_time_mad_hp, 264 | ((double)2 * computations) / kernel_time_mad_hp * 1000. / 265 | (double)(1000 * 1000 * 1000), 266 | ((double)memoryoperations * sizeof(cl_half2)) / kernel_time_mad_hp * 267 | 1000. / (1000. * 1000. * 1000.), 268 | ((double)computations) / ((double)memoryoperations * sizeof(int)), 269 | kernel_time_mad_int, 270 | ((double)computations) / kernel_time_mad_int * 1000. / 271 | (double)(1000 * 1000 * 1000), 272 | ((double)memoryoperations * sizeof(int)) / kernel_time_mad_int * 1000. / 273 | (1000. * 1000. 
* 1000.)); 274 | } 275 | 276 | extern "C" void mixbenchGPU(cl_device_id dev_id, 277 | double* c, 278 | long size, 279 | bool block_strided, 280 | bool host_allocated, 281 | bool use_os_timer, 282 | size_t workgroupsize, 283 | unsigned int elements_per_wi, 284 | unsigned int fusion_degree) { 285 | const char* benchtype; 286 | if (block_strided) 287 | benchtype = "Workgroup"; 288 | else 289 | benchtype = "NDRange"; 290 | printf("Workitem stride: %s\n", benchtype); 291 | const char* buffer_allocation = 292 | host_allocated ? "Host allocated" : "Device allocated"; 293 | printf("Buffer allocation: %s\n", buffer_allocation); 294 | printf("Timer: %s\n", 295 | use_os_timer ? "OS based" : "CL event based"); 296 | 297 | // Set context properties 298 | cl_platform_id p_id; 299 | OCL_SAFE_CALL( 300 | clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL)); 301 | size_t length; 302 | OCL_SAFE_CALL( 303 | clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length)); 304 | char* extensions = (char*)alloca(length); 305 | OCL_SAFE_CALL( 306 | clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL)); 307 | bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL; 308 | if (!enable_dp) 309 | printf( 310 | "Warning: Double precision computations are not " 311 | "supported\n"); 312 | bool enable_hp = strstr(extensions, "cl_khr_fp16") != NULL; 313 | if (!enable_hp) 314 | printf( 315 | "Warning: Half precision computations are not " 316 | "supported\n"); 317 | 318 | cl_context_properties ctxProps[] = {CL_CONTEXT_PLATFORM, 319 | (cl_context_properties)p_id, 0}; 320 | 321 | cl_int errno; 322 | // Create context 323 | cl_context context = 324 | clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &errno); 325 | OCL_SAFE_CALL(errno); 326 | 327 | cl_mem_flags buf_flags = CL_MEM_READ_WRITE; 328 | if (host_allocated) 329 | buf_flags |= CL_MEM_ALLOC_HOST_PTR; 330 | cl_mem c_buffer = 331 | clCreateBuffer(context, buf_flags, size * sizeof(double), NULL, &errno); 332 | OCL_SAFE_CALL(errno); 333 | 334 | // Create command queue 335 | cl_command_queue cmd_queue = clCreateCommandQueue( 336 | context, dev_id, use_os_timer ? 0 : CL_QUEUE_PROFILING_ENABLE, &errno); 337 | OCL_SAFE_CALL(errno); 338 | 339 | // Set data on device memory 340 | cl_int* mapped_data = 341 | (cl_int*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, 342 | size * sizeof(double), 0, NULL, NULL, &errno); 343 | OCL_SAFE_CALL(errno); 344 | for (int i = 0; i < size; i++) 345 | mapped_data[i] = 0; 346 | clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL); 347 | 348 | // Load source, create program and all kernels 349 | printf("Loading kernel source file...\n"); 350 | const char c_param_format_str[] = 351 | "-cl-std=CL1.2 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT 352 | " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s " 353 | "%s"; 354 | const char* c_empty = ""; 355 | const char* c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty; 356 | const char *c_enable_dp = "-DENABLE_DP", *c_enable_hp = "-DENABLE_HP"; 357 | char c_build_params[256]; 358 | const char* c_kernel_source = {ReadFile("mix_kernels.cl")}; 359 | printf("Precompilation of kernels... 
"); 360 | sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, 361 | c_striding, c_empty); 362 | 363 | cl_kernel kernel_warmup = 364 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 365 | 366 | show_progress_init(compute_iterations_len); 367 | cl_kernel kernels[kdt_half + 1][compute_iterations_len]; 368 | for (int i = 0; i < compute_iterations_len; i++) { 369 | show_progress_step(0, '\\'); 370 | sprintf(c_build_params, c_param_format_str, "float", workgroupsize, 371 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 372 | c_empty); 373 | // printf("%s\n",c_build_params); 374 | kernels[kdt_float][i] = 375 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 376 | 377 | show_progress_step(0, '|'); 378 | sprintf(c_build_params, c_param_format_str, "int", workgroupsize, 379 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 380 | c_empty); 381 | // printf("%s\n",c_build_params); 382 | kernels[kdt_int][i] = 383 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 384 | 385 | if (enable_dp) { 386 | show_progress_step(0, '/'); 387 | sprintf(c_build_params, c_param_format_str, "double", workgroupsize, 388 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 389 | c_enable_dp); 390 | // printf("%s\n",c_build_params); 391 | kernels[kdt_double][i] = 392 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 393 | } else 394 | kernels[kdt_double][i] = 0; 395 | 396 | if (enable_hp) { 397 | show_progress_step(0, '-'); 398 | sprintf(c_build_params, c_param_format_str, "half2", workgroupsize, 399 | compute_iterations[i], elements_per_wi, fusion_degree, c_striding, 400 | c_enable_hp); 401 | kernels[kdt_half][i] = 402 | BuildKernel(context, dev_id, c_kernel_source, c_build_params); 403 | } else 404 | kernels[kdt_half][i] = 0; 405 | 406 | show_progress_step(1, '>'); 407 | } 408 | show_progress_done(); 409 | free((char*)c_kernel_source); 410 | 411 | runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize); 412 | 413 | // Synchronize in order to wait for memory operations to finish 414 | OCL_SAFE_CALL(clFinish(cmd_queue)); 415 | 416 | printf( 417 | "------------------------------------------------------------------------" 418 | "----- CSV data " 419 | "------------------------------------------------------------------------" 420 | "-----\n"); 421 | printf( 422 | "Experiment ID, Single Precision ops,,,, Double precision " 423 | "ops,,,, Half precision ops,,,, Integer " 424 | "operations,,, \n"); 425 | printf( 426 | "Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, " 427 | "ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, " 428 | "Iops/byte, ex.time, GIOPS, GB/sec\n"); 429 | 430 | for (int i = 0; i < compute_iterations_len; i++) 431 | runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, 432 | workgroupsize, elements_per_wi, fusion_degree, use_os_timer); 433 | 434 | printf( 435 | "------------------------------------------------------------------------" 436 | "------------------------------------------------------------------------" 437 | "--------------------\n"); 438 | 439 | // Copy results back to host memory 440 | OCL_SAFE_CALL(clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, 441 | size * sizeof(double), c, 0, NULL, NULL)); 442 | 443 | // Release kernels and program 444 | ReleaseKernelNProgram(kernel_warmup); 445 | for (int i = 0; i < compute_iterations_len; i++) { 446 | ReleaseKernelNProgram(kernels[kdt_float][i]); 447 | 
ReleaseKernelNProgram(kernels[kdt_int][i]); 448 | if (enable_dp) 449 | ReleaseKernelNProgram(kernels[kdt_double][i]); 450 | if (enable_hp) 451 | ReleaseKernelNProgram(kernels[kdt_half][i]); 452 | } 453 | 454 | // Release buffer 455 | OCL_SAFE_CALL(clReleaseMemObject(c_buffer)); 456 | OCL_SAFE_CALL(clReleaseCommandQueue(cmd_queue)); 457 | OCL_SAFE_CALL(clReleaseContext(context)); 458 | } 459 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 
55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. 
If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. 
Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | --------------------------------------------------------------------------------