├── .github └── workflows │ └── main.yaml ├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── CMakeLists.txt ├── LICENSE ├── README.md ├── babelstream.png ├── cmake ├── Modules │ ├── ComputeCppCompilerChecks.cmake │ ├── ComputeCppIRMap.cmake │ └── FindComputeCpp.cmake ├── register_models.cmake └── toolchains │ ├── arm-gcc-poky.cmake │ └── gcc-generic.cmake ├── docs └── spack_instructions.md ├── results ├── v1.0 │ ├── cuda │ │ ├── nvidia-gtx-580.txt │ │ ├── nvidia-gtx-780-ti.txt │ │ ├── nvidia-gtx-980-ti.txt │ │ ├── nvidia-gtx-980.txt │ │ ├── nvidia-gtx-titan_x.txt │ │ ├── nvidia-k20c-ecc-off.txt │ │ ├── nvidia-k20c-ecc-on.txt │ │ ├── nvidia-k40c-ecc-off.txt │ │ ├── nvidia-k40c-ecc-on.txt │ │ ├── nvidia-k80-ecc-off.txt │ │ └── nvidia-k80-ecc-on.txt │ ├── hip │ │ ├── amd-fiji-nano.txt │ │ └── nvidia-gtx-titan_x.txt │ ├── opencl-results.csv │ ├── opencl-results.xlsx │ └── opencl │ │ ├── amd-a10-7850k-radeon-r7.txt │ │ ├── amd-firepro-s10000.txt │ │ ├── amd-firepro-s9150-ecc-off.txt │ │ ├── amd-r9-fury-x.txt │ │ ├── amd-radeon-7970.txt │ │ ├── amd-radeon-r9-290x.txt │ │ ├── amd-radeon-r9-295x2.txt │ │ ├── intel-phi-se10p.txt │ │ ├── nvidia-gtx-580.txt │ │ ├── nvidia-gtx-780-ti.txt │ │ ├── nvidia-gtx-980-ti.txt │ │ ├── nvidia-gtx-980.txt │ │ ├── nvidia-k20c-ecc-off.txt │ │ ├── nvidia-k20c-ecc-on.txt │ │ ├── nvidia-k40c-ecc-off.txt │ │ ├── nvidia-k40c-ecc-on.txt │ │ ├── nvidia-k80-ecc-off.txt │ │ └── nvidia-k80-ecc-on.txt ├── v2.0 │ ├── broadwell │ │ ├── acc-pgi-kernel.txt │ │ ├── acc-pgi-loops.txt │ │ ├── kokkos-gcc.txt │ │ ├── mccalpin-cray.txt │ │ ├── ocl.txt │ │ ├── omp-cray.txt │ │ └── raja-gcc.txt │ ├── furynano │ │ └── hip.txt │ ├── furyx │ │ ├── ocl.txt │ │ └── sycl.txt │ ├── gtx980ti │ │ ├── acc-pgi-loops.txt │ │ ├── cuda.txt │ │ ├── kokkos.txt │ │ ├── ocl.txt │ │ └── raja.txt │ ├── haswell │ │ ├── acc-pgi-kernel.txt │ │ ├── acc-pgi-loops.txt │ │ ├── cuda.txt │ │ ├── kokkos-gcc.txt │ │ ├── mccalpin-cray.txt │ │ ├── ocl.txt │ │ ├── omp-cray.txt │ │ └── 
raja-gcc.txt │ ├── ivybridge │ │ ├── acc-pgi-kernel.txt │ │ ├── acc-pgi-loops.txt │ │ ├── cuda.txt │ │ ├── kokkos-gcc.txt │ │ ├── mccalpin-intel.txt │ │ ├── ocl.txt │ │ ├── omp-intel.txt │ │ ├── raja-gcc.txt │ │ └── sycl.txt │ ├── k20x │ │ ├── acc-cray.txt │ │ ├── cuda.txt │ │ ├── kokkos.txt │ │ ├── ocl.txt │ │ ├── omp-cray.txt │ │ └── raja.txt │ ├── k40 │ │ ├── acc-cray.txt │ │ ├── cuda.txt │ │ ├── kokkos.txt │ │ ├── ocl.txt │ │ └── raja.txt │ ├── k80 │ │ ├── acc-cray.txt │ │ ├── cuda.txt │ │ ├── kokkos.txt │ │ ├── ocl.txt │ │ └── raja.txt │ ├── knl │ │ ├── acc-pgi-kernel.txt │ │ ├── kokkos-intel-128threads.txt │ │ ├── mccalpin-intel.txt │ │ ├── ocl.txt │ │ ├── omp-intel.txt │ │ ├── raja-intel.txt │ │ └── sycl.txt │ ├── power8 │ │ ├── kokkos-xl.txt │ │ ├── mccalpin-xl.txt │ │ ├── omp-xl.txt │ │ ├── raja-gcc.txt │ │ └── raja-xl.txt │ ├── s9150 │ │ ├── acc-pgi-loops.txt │ │ ├── ocl.txt │ │ └── sycl.txt │ ├── s9300x2 │ │ └── hip.txt │ ├── sandybridge │ │ ├── acc-pgi-kernel.txt │ │ ├── acc-pgi-loops.txt │ │ ├── cuda.txt │ │ ├── kokkos-gcc.txt │ │ ├── mccalpin-intel.txt │ │ ├── ocl.txt │ │ ├── omp-intel.txt │ │ └── raja-gcc.txt │ └── titanx │ │ └── hip.txt └── v3.3 │ └── titanxp │ ├── cuda.txt │ └── ocl.txt └── src ├── .gitignore ├── Stream.h ├── acc ├── ACCStream.cpp ├── ACCStream.h └── model.cmake ├── ci-prepare-bionic.sh ├── ci-test-compile.sh ├── cuda ├── CUDAStream.cu ├── CUDAStream.h └── model.cmake ├── dpl_shim.h ├── fortran ├── ArrayStream.F90 ├── BabelStreamTypes.F90 ├── CUDAKernelStream.F90 ├── CUDAStream.F90 ├── DoConcurrentStream.F90 ├── Makefile ├── OpenACCArrayStream.F90 ├── OpenACCStream.F90 ├── OpenMPStream.F90 ├── OpenMPTargetLoopStream.F90 ├── OpenMPTargetStream.F90 ├── OpenMPTaskloopStream.F90 ├── OpenMPWorkshareStream.F90 ├── SequentialStream.F90 ├── build.sh ├── main.F90 ├── make.inc.amd ├── make.inc.arm ├── make.inc.cray ├── make.inc.fj ├── make.inc.gcc ├── make.inc.nvhpc ├── make.inc.oneapi └── run.sh ├── futhark ├── FutharkStream.cpp ├── 
FutharkStream.h ├── babelstream.fut └── model.cmake ├── hip ├── HIPStream.cpp ├── HIPStream.h └── model.cmake ├── java └── java-stream │ ├── .gitignore │ ├── .mvn │ └── wrapper │ │ ├── maven-wrapper.jar │ │ └── maven-wrapper.properties │ ├── README.md │ ├── mvnw │ ├── mvnw.cmd │ ├── pom.xml │ └── src │ ├── main │ └── java │ │ └── javastream │ │ ├── FractionalMaths.java │ │ ├── JavaStream.java │ │ ├── Main.java │ │ ├── aparapi │ │ ├── AparapiStreams.java │ │ ├── GenericAparapiStreamKernel.java │ │ ├── SpecialisedDoubleKernel.java │ │ └── SpecialisedFloatKernel.java │ │ ├── jdk │ │ ├── GenericPlainStream.java │ │ ├── GenericStream.java │ │ ├── JdkStreams.java │ │ ├── PlainStream.java │ │ ├── SpecialisedDoubleStream.java │ │ ├── SpecialisedFloatStream.java │ │ ├── SpecialisedPlainDoubleStream.java │ │ └── SpecialisedPlainFloatStream.java │ │ └── tornadovm │ │ ├── GenericTornadoVMStream.java │ │ ├── SpecialisedDouble.java │ │ ├── SpecialisedFloat.java │ │ └── TornadoVMStreams.java │ └── test │ └── java │ └── javastream │ └── SmokeTest.java ├── julia └── JuliaStream.jl │ ├── .JuliaFormatter.toml │ ├── .gitignore │ ├── AMDGPU │ ├── Manifest.toml │ └── Project.toml │ ├── CUDA │ ├── Manifest.toml │ └── Project.toml │ ├── KernelAbstractions │ ├── Manifest.toml │ └── Project.toml │ ├── Manifest.toml │ ├── Project.toml │ ├── README.md │ ├── Threaded │ ├── Manifest.toml │ └── Project.toml │ ├── oneAPI │ ├── Manifest.toml │ └── Project.toml │ ├── src │ ├── AMDGPUStream.jl │ ├── CUDAStream.jl │ ├── DistributedStream.jl │ ├── JuliaStream.jl │ ├── KernelAbstractionsStream.jl │ ├── PlainStream.jl │ ├── Stream.jl │ ├── StreamData.jl │ ├── ThreadedStream.jl │ └── oneAPIStream.jl │ └── update_all.sh ├── kokkos ├── KokkosStream.cpp ├── KokkosStream.hpp └── model.cmake ├── legacy ├── HCStream.cpp └── HCStream.h ├── main.cpp ├── ocl ├── CL │ └── cl2.hpp ├── OCLStream.cpp ├── OCLStream.h └── model.cmake ├── omp ├── OMPStream.cpp ├── OMPStream.h └── model.cmake ├── raja ├── RAJAStream.cpp 
├── RAJAStream.hpp └── model.cmake ├── rust └── rust-stream │ ├── .cargo │ └── config.toml │ ├── .gitignore │ ├── Cargo.lock │ ├── Cargo.toml │ ├── README.md │ ├── rustfmt.toml │ ├── src │ ├── arc_stream.rs │ ├── crossbeam_stream.rs │ ├── lib.rs │ ├── main.rs │ ├── plain_stream.rs │ ├── rayon_stream.rs │ ├── stream.rs │ └── unsafe_stream.rs │ └── tests │ └── integration_test.rs ├── scala └── scala-stream │ ├── .gitignore │ ├── .jvmopts │ ├── .scalafmt.conf │ ├── README.md │ ├── build.sbt │ ├── project │ ├── build.properties │ └── plugins.sbt │ ├── reflect-config.json │ ├── sbt │ ├── sbt-dist │ ├── bin │ │ ├── java9-rt-export.jar │ │ ├── sbt │ │ ├── sbt-launch-lib.bash │ │ ├── sbt-launch.jar │ │ └── sbt.bat │ └── conf │ │ ├── sbtconfig.txt │ │ └── sbtopts │ └── src │ └── main │ └── scala │ └── scalastream │ ├── J8SStream.scala │ ├── ParStream.scala │ ├── PlainStream.scala │ ├── ScalaStream.scala │ └── ThreadStream.scala ├── std-data ├── STDDataStream.cpp ├── STDDataStream.h └── model.cmake ├── std-indices ├── STDIndicesStream.cpp ├── STDIndicesStream.h └── model.cmake ├── std-ranges ├── STDRangesStream.cpp ├── STDRangesStream.hpp └── model.cmake ├── sycl ├── SYCLStream.cpp ├── SYCLStream.h └── model.cmake ├── sycl2020-acc ├── SYCLStream2020.cpp ├── SYCLStream2020.h └── model.cmake ├── sycl2020-usm ├── SYCLStream2020.cpp ├── SYCLStream2020.h └── model.cmake ├── tbb ├── TBBStream.cpp ├── TBBStream.hpp └── model.cmake └── thrust ├── ThrustStream.cu ├── ThrustStream.h └── model.cmake /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | cuda-stream 3 | ocl-stream 4 | omp-stream 5 | acc-stream 6 | raja-stream 7 | kokkos-stream 8 | std-stream 9 | sycl-stream 10 | hip-stream 11 | tbb-stream 12 | 13 | src/fortran/BabelStream 14 | src/fortran/BabelStream.* 15 | 16 | *.o 17 | *.bc 18 | *.sycl 19 | *.tar 20 | *.gz 21 | *.a 22 | *.mod 23 | *.cub 24 | *.ptx 25 | 26 | KokkosCore_config.* 27 | 28 | .DS_Store 29 | 30 | Makefile 31 | 
32 | build/ 33 | cmake-build-*/ 34 | CMakeFiles/ 35 | .idea/ 36 | .vscode/ 37 | .directory 38 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.1.0 2 | message: If you use this software, please cite it as below. 3 | authors: 4 | - family-names: Deakin 5 | given-names: Tom 6 | affiliation: University of Bristol 7 | website: https://hpc.tomdeakin.com 8 | - family-names: McIntosh-Smith 9 | given-names: Simon 10 | affiliation: University of Bristol 11 | website: https://uob-hpc.github.io 12 | title: BabelStream 13 | version: 3.4 14 | date-released: 2019-04-10 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | *============================================================================== 2 | *------------------------------------------------------------------------------ 3 | * Copyright 2015-16: Tom Deakin, Simon McIntosh-Smith, University of Bristol HPC 4 | * Based on John D. McCalpin’s original STREAM benchmark for CPUs 5 | *------------------------------------------------------------------------------ 6 | * License: 7 | * 1. You are free to use this program and/or to redistribute 8 | * this program. 9 | * 2. You are free to modify this program for your own use, 10 | * including commercial use, subject to the publication 11 | * restrictions in item 3. 12 | * 3. You are free to publish results obtained from running this 13 | * program, or from works that you derive from this program, 14 | * with the following limitations: 15 | * 3a. In order to be referred to as "BabelStream benchmark results", 16 | * published results must be in conformance to the BabelStream 17 | * Run Rules published at 18 | * http://github.com/UoB-HPC/BabelStream/wiki/Run-Rules 19 | * and incorporated herein by reference. 
20 | * The copyright holders retain the 21 | * right to determine conformity with the Run Rules. 22 | * 3b. Results based on modified source code or on runs not in 23 | * accordance with the BabelStream Run Rules must be clearly 24 | * labelled whenever they are published. Examples of 25 | * proper labelling include: 26 | * "tuned BabelStream benchmark results" 27 | * "based on a variant of the BabelStream benchmark code" 28 | * Other comparable, clear and reasonable labelling is 29 | * acceptable. 30 | * 3c. Submission of results to the BabelStream benchmark web site 31 | * is encouraged, but not required. 32 | * 4. Use of this program or creation of derived works based on this 33 | * program constitutes acceptance of these licensing restrictions. 34 | * 5. Absolutely no warranty is expressed or implied. 35 | *———————————————————————————————————------------------------------------------- 36 | -------------------------------------------------------------------------------- /babelstream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/BabelStream/2f00dfb7f8b7cfe8c53d20d5c770bccbf8673440/babelstream.png -------------------------------------------------------------------------------- /cmake/Modules/ComputeCppIRMap.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.3) 2 | 3 | # These should match the types of IR output by compute++ 4 | set(IR_MAP_spir bc) 5 | set(IR_MAP_spir64 bc) 6 | set(IR_MAP_spir32 bc) 7 | set(IR_MAP_spirv spv) 8 | set(IR_MAP_spirv64 spv) 9 | set(IR_MAP_spirv32 spv) 10 | set(IR_MAP_aorta-x86_64 o) 11 | set(IR_MAP_aorta-aarch64 o) 12 | set(IR_MAP_aorta-rcar-cve o) 13 | set(IR_MAP_custom-spir64 bc) 14 | set(IR_MAP_custom-spir32 bc) 15 | set(IR_MAP_custom-spirv64 spv) 16 | set(IR_MAP_custom-spirv32 spv) 17 | set(IR_MAP_ptx64 s) 18 | set(IR_MAP_amdgcn s) 19 | 
-------------------------------------------------------------------------------- /cmake/toolchains/arm-gcc-poky.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_SYSTEM_NAME Linux) 2 | set(CMAKE_SYSTEM_PROCESSOR ARM64) 3 | set(SDK_POKY_ROOT $ENV{SDK_POKY_ROOT}) 4 | 5 | if(NOT SDK_POKY_ROOT) 6 | message(FATAL_ERROR 7 | "Please set SDK_POKY_ROOT in the environment when crosscompiling.") 8 | endif() 9 | 10 | set(COMPUTECPP_TARGET_TRIPLE aarch64-poky-linux) 11 | set(COMPUTECPP_TOOLCHAIN_DIR ${SDK_POKY_ROOT}/x86_64-pokysdk-linux) 12 | set(COMPUTECPP_SYSROOT_DIR ${SDK_POKY_ROOT}/aarch64-poky-linux) 13 | # Adding this as the GCC toolchain makes compute++ not find headers 14 | set(COMPUTECPP_DONT_USE_TOOLCHAIN ON) 15 | 16 | set(CMAKE_C_COMPILER "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-gcc" CACHE PATH "gcc") 17 | set(CMAKE_CXX_COMPILER "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-g++" CACHE PATH "g++") 18 | set(CMAKE_AR "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-ar" CACHE PATH "archive") 19 | set(CMAKE_LINKER "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-ld" CACHE PATH "linker") 20 | set(CMAKE_NM "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-nm" CACHE PATH "nm") 21 | set(CMAKE_OBJCOPY "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-objcopy" CACHE PATH "objcopy") 22 | set(CMAKE_OBJDUMP "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-objdump" CACHE PATH "objdump") 23 | set(CMAKE_STRIP "${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-strip" CACHE PATH "strip") 24 | set(CMAKE_RANLIB 
"${COMPUTECPP_TOOLCHAIN_DIR}/usr/bin/${COMPUTECPP_TARGET_TRIPLE}/${COMPUTECPP_TARGET_TRIPLE}-ranlib" CACHE PATH "ranlib") 25 | 26 | set(CMAKE_FIND_ROOT_PATH ${COMPUTECPP_SYSROOT_DIR}) 27 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 28 | 29 | set(CMAKE_SYSROOT "${COMPUTECPP_SYSROOT_DIR}") 30 | 31 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__aarch64__ --sysroot=${COMPUTECPP_SYSROOT_DIR}" CACHE INTERNAL "") 32 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__aarch64__ --sysroot=${COMPUTECPP_SYSROOT_DIR}" CACHE INTERNAL "") 33 | 34 | set(CMAKE_CXX_LINK_EXECUTABLE " -o " CACHE INTERNAL "") 35 | -------------------------------------------------------------------------------- /cmake/toolchains/gcc-generic.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_SYSTEM_NAME Linux) 2 | set(COMPUTECPP_SYSROOT_DIR $ENV{COMPUTECPP_SYSROOT_DIR}) 3 | set(COMPUTECPP_TOOLCHAIN_DIR $ENV{COMPUTECPP_TOOLCHAIN_DIR}) 4 | set(COMPUTECPP_TARGET_TRIPLE $ENV{COMPUTECPP_TARGET_TRIPLE}) 5 | 6 | if(NOT COMPUTECPP_SYSROOT_DIR OR 7 | NOT COMPUTECPP_TOOLCHAIN_DIR OR 8 | NOT COMPUTECPP_TARGET_TRIPLE 9 | ) 10 | message(FATAL_ERROR 11 | "Please set all of COMPUTECPP_TARGET_TRIPLE, COMPUTECPP_SYSROOT_DIR and " 12 | "COMPUTECPP_TOOLCHAIN_DIR in the environment when crosscompiling.") 13 | endif() 14 | 15 | set(CMAKE_SYSROOT ${COMPUTECPP_SYSROOT_DIR}) 16 | set(CMAKE_C_COMPILER ${COMPUTECPP_TOOLCHAIN_DIR}/bin/${COMPUTECPP_TARGET_TRIPLE}-gcc) 17 | set(CMAKE_CXX_COMPILER ${COMPUTECPP_TOOLCHAIN_DIR}/bin/${COMPUTECPP_TARGET_TRIPLE}-g++) 18 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER) 19 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-gtx-580.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | 5 | Warning: array size must divide 1024 6 | Resizing array from 50000000 to 49999872 7 | Using CUDA device GeForce GTX 
580 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 174970 0.00457222 0.00457604 0.00457437 10 | Mul 175003 0.00457135 0.00457712 0.00457405 11 | Add 172211 0.0069682 0.00703834 0.00702217 12 | Triad 172091 0.00697306 0.00702229 0.00700782 13 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-gtx-780-ti.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | 5 | Warning: array size must divide 1024 6 | Resizing array from 50000000 to 49999872 7 | Using CUDA device GeForce GTX 780 Ti 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 278448 0.00287306 0.00288167 0.00287746 10 | Mul 278463 0.00287292 0.00288136 0.00287668 11 | Add 280658 0.00427566 0.00429018 0.00428196 12 | Triad 280592 0.00427667 0.00429209 0.00428346 13 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-gtx-980-ti.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | 5 | Warning: array size must divide 1024 6 | Resizing array from 50000000 to 49999872 7 | Using CUDA device GeForce GTX 980 Ti 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 263520 0.00303582 0.00304059 0.00303816 10 | Mul 263561 0.00303535 0.00304045 0.00303786 11 | Add 269207 0.00445754 0.0044628 0.00446007 12 | Triad 269163 0.00445826 0.00446236 0.00446001 13 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-gtx-980.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | 5 | Warning: array size must divide 1024 6 | Resizing array from 50000000 to 49999872 7 | Using CUDA device GeForce GTX 980 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 169256 0.00472657 
0.00473075 0.00472877 10 | Mul 169305 0.0047252 0.00473152 0.00472861 11 | Add 171685 0.00698954 0.00699297 0.00699159 12 | Triad 171660 0.00699056 0.00699286 0.006992 13 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-gtx-titan_x.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 1.0 3 | Implementation: HIP 4 | GridSize: 52428800 work-items 5 | GroupSize: 1024 work-items 6 | Operations/Work-item: 1 7 | Precision: double 8 | 9 | Running kernels 10 times 10 | Array size: 400.0 MB (=0.4 GB) 0 bytes padding 11 | Total size: 1200.0 MB (=1.2 GB) 12 | Using HIP device GeForce GTX TITAN X (compute_units=24) 13 | Driver: 4 14 | d_a=0x1306d80000 15 | d_b=0x131fd80000 16 | d_c=0x1338d80000 17 | Function MBytes/sec Min (sec) Max Average 18 | Copy 263042.207 0.00319 0.00320 0.00319 19 | Mul 262972.033 0.00319 0.00320 0.00319 20 | Add 268732.653 0.00468 0.00469 0.00469 21 | Triad 268706.197 0.00468 0.00469 0.00469 22 | GEOMEAN 265847.929 23 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-k20c-ecc-off.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using CUDA device Tesla K20c 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 171245.377 0.00467 0.00470 0.00468 13 | Mul 172831.350 0.00463 0.00466 0.00465 14 | Add 173598.938 0.00691 0.00694 0.00692 15 | Triad 173616.092 0.00691 0.00693 0.00692 16 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-k20c-ecc-on.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 
2 | Version: 0.9 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using CUDA device Tesla K20c 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 152296.519 0.00551 0.00553 0.00552 12 | Mul 152284.216 0.00551 0.00553 0.00552 13 | Add 150549.336 0.00836 0.00838 0.00837 14 | Triad 150597.842 0.00836 0.00838 0.00837 15 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-k40c-ecc-off.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using CUDA device Tesla K40c 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 220178.580 0.00363 0.00364 0.00364 13 | Mul 219898.608 0.00364 0.00364 0.00364 14 | Add 220694.367 0.00544 0.00544 0.00544 15 | Triad 220662.874 0.00544 0.00545 0.00544 16 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-k40c-ecc-on.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using CUDA device Tesla K40c 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 194335.669 0.00432 0.00433 0.00432 12 | Mul 194171.527 0.00432 0.00433 0.00433 13 | Add 191294.438 0.00658 0.00659 0.00658 14 | Triad 191240.187 0.00658 0.00659 0.00658 15 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-k80-ecc-off.txt: -------------------------------------------------------------------------------- 
1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using CUDA device Tesla K80 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 203910.595 0.00392 0.00393 0.00393 13 | Mul 203682.941 0.00393 0.00393 0.00393 14 | Add 203489.986 0.00590 0.00590 0.00590 15 | Triad 203421.823 0.00590 0.00590 0.00590 16 | -------------------------------------------------------------------------------- /results/v1.0/cuda/nvidia-k80-ecc-on.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using CUDA device Tesla K80 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 180192.706 0.00444 0.00497 0.00448 13 | Mul 180093.404 0.00444 0.00459 0.00446 14 | Add 177047.859 0.00678 0.00696 0.00679 15 | Triad 176954.341 0.00678 0.00696 0.00680 16 | -------------------------------------------------------------------------------- /results/v1.0/hip/amd-fiji-nano.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 1.0 3 | Implementation: CUDA 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using CUDA device Fiji 10 | Driver: 4 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 375822.410 0.00223 0.00225 0.00224 13 | Mul 375086.879 0.00224 0.00227 0.00224 14 | Add 425650.718 0.00296 0.00298 0.00297 15 | Triad 424710.113 0.00296 0.00298 0.00298 16 | -------------------------------------------------------------------------------- /results/v1.0/hip/nvidia-gtx-titan_x.txt: 
-------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 1.0 3 | Implementation: HIP 4 | GridSize: 52428800 work-items 5 | GroupSize: 1024 work-items 6 | Operations/Work-item: 1 7 | Precision: double 8 | 9 | Running kernels 10 times 10 | Array size: 400.0 MB (=0.4 GB) 0 bytes padding 11 | Total size: 1200.0 MB (=1.2 GB) 12 | Using HIP device GeForce GTX TITAN X (compute_units=24) 13 | Driver: 4 14 | d_a=0x1306d80000 15 | d_b=0x131fd80000 16 | d_c=0x1338d80000 17 | Function MBytes/sec Min (sec) Max Average 18 | Copy 263042.207 0.00319 0.00320 0.00319 19 | Mul 262972.033 0.00319 0.00320 0.00319 20 | Add 268732.653 0.00468 0.00469 0.00469 21 | Triad 268706.197 0.00468 0.00469 0.00469 22 | GEOMEAN 265847.929 23 | -------------------------------------------------------------------------------- /results/v1.0/opencl-results.csv: -------------------------------------------------------------------------------- 1 | Platform,Device,ECC,Copy,Mul,Add,Triad,Best GB/s,Peak,Percentage 2 | AMD,FirePro S10000,Off,202334.005,202607.623,206313.435,205839.449,206,240,86.0 3 | AMD,FirePro S9150,Off,175275.054,172765.213,271254.828,271794.459,272,320,84.9 4 | AMD,Radeon R9 295X2,Off,257397.667,266888.442,272780.981,270801.525,273,320,85.2 5 | AMD,Radeon R9 290X,Off,257320.921,265161.454,272112.619,271556.605,272,320,85.0 6 | AMD,Radeon 7970,Off,211122.437,212108.366,219478.174,219383.16,219,264,83.1 7 | AMD,A10 7850K Radeon R7,Off,16897.342,16690.679,16331.946,16262.937,17,34.1,49.6 8 | AMD,R9 Fury X,Off,351139.404,400256.512,388737.861,385242.562,400,512,78.2 9 | NVIDIA,GTX 580,Off,176151,176314,172099,171998,176,192.384,91.6 10 | NVIDIA,GTX 780 Ti,Off,272926,275240,281845,281601,282,336.5,83.8 11 | NVIDIA,GTX 980,Off,168866,168856,171015,171096,171,224,76.4 12 | NVIDIA,GTX 980 Ti,Off,263595,263768,268115,268015,268,336,79.8 13 | NVIDIA,K20c,Off,169228.501,169313.85,172816.441,172945.582,173,208,83.1 14 | 
NVIDIA,K20c,On,150397.867,150241.232,151673.787,151699.186,152,182,83.4 15 | NVIDIA,K40c,Off,213696.013,214286.39,217204.275,217420.998,217,288,75.5 16 | NVIDIA,K40c,On,190354.872,190199.107,190946.38,190991.101,191,252,75.8 17 | NVIDIA,K80,Off,204118.601,204369.626,204705.951,204615.651,205,240,85.3 18 | NVIDIA,K80,On,181989.324,181996.901,182062.121,181988.951,182,210,86.7 19 | ,,,,,,,,, 20 | ,,,,,,,,, 21 | ,,,,,,,,, 22 | ,,,,,,,,, 23 | ,,,,,,,,, 24 | ,,,,,,,,, -------------------------------------------------------------------------------- /results/v1.0/opencl-results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/BabelStream/2f00dfb7f8b7cfe8c53d20d5c770bccbf8673440/results/v1.0/opencl-results.xlsx -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-a10-7850k-radeon-r7.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 16897.342 0.04964 0.05011 0.04988 12 | Mul 16690.679 0.05026 0.05098 0.05058 13 | Add 16331.946 0.07704 0.07776 0.07728 14 | Triad 16262.937 0.07737 0.07922 0.07780 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-firepro-s10000.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 202334.005 0.00415 0.00420 0.00418 12 | Mul 202607.623 0.00414 0.00417 0.00415 13 | 
Add 206313.435 0.00610 0.00614 0.00612 14 | Triad 205839.449 0.00611 0.00615 0.00614 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-firepro-s9150-ecc-off.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device Hawaii 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 175275.054 0.00479 0.00498 0.00488 12 | Mul 172765.213 0.00486 0.00497 0.00493 13 | Add 271254.828 0.00464 0.00473 0.00470 14 | Triad 271794.459 0.00463 0.00472 0.00469 15 | 16 | -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-r9-fury-x.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 1.0 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device AMD Radeon (TM) R9 Fury Series 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 351139.404 0.00239 0.00258 0.00246 12 | Mul 400256.512 0.00210 0.00213 0.00212 13 | Add 388737.861 0.00324 0.00340 0.00332 14 | Triad 385242.562 0.00327 0.00339 0.00332 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-radeon-7970.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 211122.437 0.00397 0.00411 0.00401 12 | Mul 212108.366 0.00395 0.00414 0.00402 13 | Add 219478.174 
0.00573 0.00593 0.00580 14 | Triad 219383.160 0.00574 0.00600 0.00583 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-radeon-r9-290x.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device Hawaii 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 257320.921 0.00326 0.00343 0.00333 12 | Mul 265161.454 0.00316 0.00324 0.00320 13 | Add 272112.619 0.00462 0.00476 0.00469 14 | Triad 271556.605 0.00463 0.00479 0.00470 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/amd-radeon-r9-295x2.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 257397.667 0.00326 0.00341 0.00332 12 | Mul 266888.442 0.00314 0.00321 0.00318 13 | Add 272780.981 0.00461 0.00466 0.00464 14 | Triad 270801.525 0.00465 0.00466 0.00465 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/intel-phi-se10p.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using OpenCL device Intel(R) Many Integrated Core Acceleration Card 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 96934.200 0.00825 0.00911 0.00887 13 | Mul 
101522.583 0.00788 0.00806 0.00795 14 | Add 110415.617 0.01087 0.01261 0.01113 15 | Triad 109458.809 0.01096 0.01216 0.01155 16 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-gtx-580.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | 5 | Using OpenCL device GeForce GTX 580 6 | Function MBytes/sec Min (sec) Max Average 7 | Copy 176151 0.00454157 0.00454387 0.00454229 8 | Mul 176314 0.00453736 0.00453922 0.0045382 9 | Add 172099 0.00697274 0.00697767 0.00697497 10 | Triad 171998 0.00697684 0.0069785 0.00697772 11 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-gtx-780-ti.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | 5 | Using OpenCL device GeForce GTX 780 Ti 6 | Function MBytes/sec Min (sec) Max Average 7 | Copy 272926 0.0029312 0.0029951 0.00294692 8 | Mul 275240 0.00290655 0.00297589 0.00292918 9 | Add 281845 0.00425766 0.00432828 0.00427445 10 | Triad 281601 0.00426134 0.00433837 0.00427603 11 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-gtx-980-ti.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | 5 | Using OpenCL device GeForce GTX 980 Ti 6 | Function MBytes/sec Min (sec) Max Average 7 | Copy 263595 0.00303496 0.00303947 0.00303653 8 | Mul 263768 0.00303296 0.00303712 0.00303467 9 | Add 268115 0.0044757 0.00448098 0.00447908 10 | Triad 268015 0.00447737 0.00448109 0.00447909 11 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-gtx-980.txt: -------------------------------------------------------------------------------- 
1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | 5 | Using OpenCL device GeForce GTX 980 6 | Function MBytes/sec Min (sec) Max Average 7 | Copy 168866 0.00473748 0.00474083 0.00473952 8 | Mul 168856 0.00473776 0.00474205 0.00473983 9 | Add 171015 0.00701691 0.00702005 0.00701843 10 | Triad 171096 0.00701359 0.00702061 0.00701662 11 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-k20c-ecc-off.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using OpenCL device Tesla K20c 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 169228.501 0.00473 0.00476 0.00474 13 | Mul 169313.850 0.00472 0.00476 0.00474 14 | Add 172816.441 0.00694 0.00697 0.00696 15 | Triad 172945.582 0.00694 0.00697 0.00696 16 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-k20c-ecc-on.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device Tesla K20c 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 150397.867 0.00558 0.00561 0.00559 12 | Mul 150241.232 0.00558 0.00560 0.00559 13 | Add 151673.787 0.00830 0.00833 0.00831 14 | Triad 151699.186 0.00829 0.00833 0.00831 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-k40c-ecc-off.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | 
Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using OpenCL device Tesla K40c 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 213696.013 0.00374 0.00377 0.00376 13 | Mul 214286.390 0.00373 0.00378 0.00376 14 | Add 217204.275 0.00552 0.00554 0.00553 15 | Triad 217420.998 0.00552 0.00554 0.00553 16 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-k40c-ecc-on.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.9 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Running kernels 10 times 7 | Array size: 400.0 MB (=0.4 GB) 8 | Total size: 1200.0 MB (=1.2 GB) 9 | Using OpenCL device Tesla K40c 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 190354.872 0.00441 0.00443 0.00442 12 | Mul 190199.107 0.00441 0.00443 0.00442 13 | Add 190946.380 0.00659 0.00660 0.00660 14 | Triad 190991.101 0.00659 0.00661 0.00660 15 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-k80-ecc-off.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using OpenCL device Tesla K80 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 204118.601 0.00392 0.00397 0.00395 13 | Mul 204369.626 0.00391 0.00396 0.00395 14 | Add 204705.951 0.00586 0.00592 0.00591 15 | Triad 204615.651 0.00586 0.00593 0.00591 16 | -------------------------------------------------------------------------------- /results/v1.0/opencl/nvidia-k80-ecc-on.txt: 
-------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 0.0 3 | Implementation: OpenCL 4 | Precision: double 5 | 6 | Warning: array size must divide 1024 7 | Resizing array from 50000000 to 49999872 8 | Array size: 381.5 MB (=0.4 GB) 9 | Total size: 1144.4 MB (=1.1 GB) 10 | Using OpenCL device Tesla K80 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 181989.324 0.00440 0.00468 0.00442 13 | Mul 181996.901 0.00440 0.00446 0.00441 14 | Add 182062.121 0.00659 0.00695 0.00662 15 | Triad 181988.951 0.00659 0.00719 0.00663 16 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/acc-pgi-kernel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 37575.728 0.01429 0.01440 0.01434 10 | Mul 37848.869 0.01418 0.01433 0.01425 11 | Add 45589.244 0.01766 0.01781 0.01772 12 | Triad 46657.287 0.01726 0.01736 0.01730 13 | Application 1454136 resources: utime ~287s, stime ~1s, Rss ~789384, inblocks ~374, outblocks ~387 14 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/acc-pgi-loops.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 34338.639 0.01563 0.01574 0.01569 10 | Mul 38249.125 0.01404 0.01413 0.01408 11 | Add 40155.137 0.02005 0.02015 0.02009 12 | Triad 40995.666 0.01964 0.01971 0.01967 13 | Application 1396691 resources: utime ~357s, stime ~1s, Rss ~789348, inblocks ~365, 
outblocks ~350 14 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/kokkos-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 86500.072 0.00621 0.00627 0.00622 10 | Mul 84168.782 0.00638 0.00643 0.00640 11 | Add 94162.571 0.00855 0.00867 0.00858 12 | Triad 96282.261 0.00836 0.00843 0.00838 13 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/mccalpin-cray.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 33554432 (elements), Offset = 0 (elements) 7 | Memory per array = 256.0 MiB (= 0.2 GiB). 8 | Total memory required = 768.0 MiB (= 0.8 GiB). 9 | Each kernel will be executed 100 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 44 14 | Number of Threads counted = 44 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 4125 microseconds. 18 | (= 4125 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 
21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 92980.4 0.005803 0.005774 0.005843 28 | Scale: 97951.2 0.005527 0.005481 0.005586 29 | Add: 123058.1 0.006592 0.006544 0.006655 30 | Triad: 124799.5 0.006492 0.006453 0.006544 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | Application 1396471 resources: utime ~111s, stime ~3s, Rss ~788804, inblocks ~5778, outblocks ~14259 35 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz 9 | Driver: 1.2.0.57 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 43567.760 0.01232 0.01257 0.01247 12 | Mul 42995.296 0.01249 0.01257 0.01253 13 | Add 48537.031 0.01659 0.01672 0.01664 14 | Triad 49238.925 0.01636 0.01652 0.01645 15 | Application 1407386 resources: utime ~130s, stime ~1s, Rss ~1647432, inblocks ~817, outblocks ~464 16 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/omp-cray.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: Reference OpenMP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | 
Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 88171.606 0.00609 0.00614 0.00611 10 | Mul 85751.031 0.00626 0.00632 0.00628 11 | Add 96048.112 0.00838 0.00845 0.00842 12 | Triad 98169.628 0.00820 0.00827 0.00824 13 | Application 1396470 resources: utime ~133s, stime ~3s, Rss ~1576044, inblocks ~6345, outblocks ~16023 14 | -------------------------------------------------------------------------------- /results/v2.0/broadwell/raja-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 88094.593 0.00609 0.00619 0.00615 10 | Mul 85183.815 0.00630 0.00643 0.00633 11 | Add 95834.438 0.00840 0.00850 0.00846 12 | Triad 97943.551 0.00822 0.00836 0.00827 13 | -------------------------------------------------------------------------------- /results/v2.0/furynano/hip.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: HIP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using HIP device Fiji 9 | Driver: 4 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 414098.238 0.00130 0.00132 0.00131 12 | Mul 416699.068 0.00129 0.00134 0.00131 13 | Add 422965.910 0.00190 0.00195 0.00192 14 | Triad 417453.151 0.00193 0.00196 0.00194 15 | -------------------------------------------------------------------------------- /results/v2.0/furyx/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL 
device Fiji 9 | Driver: 1912.5 (VM) 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 429587.115 0.00125 0.00141 0.00128 12 | Mul 429295.476 0.00125 0.00135 0.00128 13 | Add 442443.451 0.00182 0.00192 0.00186 14 | Triad 442069.177 0.00182 0.00194 0.00186 15 | -------------------------------------------------------------------------------- /results/v2.0/furyx/sycl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: SYCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using SYCL device Fiji 9 | Driver: 1912.5 (VM) 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 419830.223 0.00128 0.00141 0.00130 12 | Mul 419811.510 0.00128 0.00136 0.00130 13 | Add 432957.387 0.00186 0.00193 0.00188 14 | Triad 430761.906 0.00187 0.00194 0.00189 15 | -------------------------------------------------------------------------------- /results/v2.0/gtx980ti/acc-pgi-loops.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 258666.277 0.00208 0.00208 0.00208 10 | Mul 258730.975 0.00208 0.00208 0.00208 11 | Add 265497.286 0.00303 0.00304 0.00304 12 | Triad 266836.306 0.00302 0.00303 0.00302 13 | -------------------------------------------------------------------------------- /results/v2.0/gtx980ti/cuda.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device GeForce GTX 980 Ti 9 | Driver: 7050 10 | Function MBytes/sec Min 
(sec) Max Average 11 | Copy 263497.383 0.00204 0.00204 0.00204 12 | Mul 263283.395 0.00204 0.00204 0.00204 13 | Add 269113.982 0.00299 0.00300 0.00300 14 | Triad 269153.828 0.00299 0.00300 0.00300 15 | -------------------------------------------------------------------------------- /results/v2.0/gtx980ti/kokkos.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 263006.605 0.00204 0.00205 0.00204 10 | Mul 262996.298 0.00204 0.00205 0.00204 11 | Add 268536.157 0.00300 0.00301 0.00300 12 | Triad 268594.912 0.00300 0.00301 0.00300 13 | -------------------------------------------------------------------------------- /results/v2.0/gtx980ti/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device GeForce GTX 980 Ti 9 | Driver: 352.21 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 262864.310 0.00204 0.00206 0.00205 12 | Mul 262886.578 0.00204 0.00205 0.00205 13 | Add 268781.736 0.00300 0.00300 0.00300 14 | Triad 268727.204 0.00300 0.00301 0.00300 15 | -------------------------------------------------------------------------------- /results/v2.0/gtx980ti/raja.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 262849.638 0.00204 0.00205 0.00204 10 | Mul 262842.175 0.00204 0.00205 0.00205 11 | Add 
268802.910 0.00300 0.00300 0.00300 12 | Triad 268830.368 0.00300 0.00300 0.00300 13 | -------------------------------------------------------------------------------- /results/v2.0/haswell/acc-pgi-kernel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 50148.428 0.01071 0.01077 0.01073 10 | Mul 51051.215 0.01052 0.01059 0.01055 11 | Add 55360.902 0.01455 0.01462 0.01458 12 | Triad 54556.116 0.01476 0.01486 0.01481 13 | Application 1454139 resources: utime ~167s, stime ~1s, Rss ~789120, inblocks ~316, outblocks ~387 14 | -------------------------------------------------------------------------------- /results/v2.0/haswell/acc-pgi-loops.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 46401.864 0.01157 0.01163 0.01160 10 | Mul 40767.924 0.01317 0.01326 0.01321 11 | Add 42062.027 0.01915 0.01925 0.01920 12 | Triad 42666.377 0.01887 0.01900 0.01893 13 | Application 1396695 resources: utime ~240s, stime ~1s, Rss ~789084, inblocks ~307, outblocks ~350 14 | -------------------------------------------------------------------------------- /results/v2.0/haswell/cuda.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device DEVICE EMULATION MODE 9 | Driver: PGI 10 | Function MBytes/sec Min (sec) Max Average 11 | 
Copy 38778.163 0.01384 0.01391 0.01388 12 | Mul 38124.361 0.01408 0.01412 0.01410 13 | Add 41817.646 0.01926 0.01934 0.01930 14 | Triad 42446.352 0.01897 0.01906 0.01901 15 | -------------------------------------------------------------------------------- /results/v2.0/haswell/kokkos-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 79740.484 0.00673 0.00688 0.00678 10 | Mul 77939.661 0.00689 0.00701 0.00694 11 | Add 87154.341 0.00924 0.00954 0.00935 12 | Triad 88503.861 0.00910 0.00945 0.00919 13 | -------------------------------------------------------------------------------- /results/v2.0/haswell/mccalpin-cray.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 33554432 (elements), Offset = 0 (elements) 7 | Memory per array = 256.0 MiB (= 0.2 GiB). 8 | Total memory required = 768.0 MiB (= 0.8 GiB). 9 | Each kernel will be executed 100 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 32 14 | Number of Threads counted = 32 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 4685 microseconds. 
18 | (= 4685 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 84400.3 0.006386 0.006361 0.006415 28 | Scale: 99272.6 0.005457 0.005408 0.005546 29 | Add: 118080.7 0.006854 0.006820 0.006892 30 | Triad: 116271.9 0.006969 0.006926 0.007042 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | Application 1396734 resources: utime ~84s, stime ~2s, Rss ~791400, inblocks ~490, outblocks ~54 35 | -------------------------------------------------------------------------------- /results/v2.0/haswell/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Intel(R) Xeon(R) CPU E5-2698 v3 @ 2.30GHz 9 | Driver: 1.2.0.57 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 39316.996 0.01365 0.01413 0.01388 12 | Mul 39127.564 0.01372 0.01418 0.01392 13 | Add 43768.720 0.01840 0.01955 0.01871 14 | Triad 44121.647 0.01825 0.01892 0.01847 15 | Application 1407392 resources: utime ~106s, stime ~1s, Rss ~1642860, inblocks ~459, outblocks ~464 16 | -------------------------------------------------------------------------------- /results/v2.0/haswell/omp-cray.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 
3 | Implementation: Reference OpenMP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 81012.503 0.00663 0.00668 0.00665 10 | Mul 79344.699 0.00677 0.00683 0.00678 11 | Add 89615.218 0.00899 0.00907 0.00901 12 | Triad 90999.378 0.00885 0.00893 0.00887 13 | Application 1396725 resources: utime ~104s, stime ~2s, Rss ~1578772, inblocks ~544, outblocks ~213 14 | -------------------------------------------------------------------------------- /results/v2.0/haswell/raja-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 80888.406 0.00664 0.00671 0.00667 10 | Mul 79024.295 0.00679 0.00686 0.00682 11 | Add 89360.767 0.00901 0.00910 0.00905 12 | Triad 90744.543 0.00887 0.00893 0.00890 13 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/acc-pgi-kernel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 82520.672 0.00651 0.01660 0.00835 10 | Mul 80283.532 0.00669 0.01680 0.00862 11 | Add 84622.881 0.00952 0.02310 0.01205 12 | Triad 83561.609 0.00964 0.02359 0.01253 13 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/acc-pgi-loops.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 
| Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 77879.649 0.00689 0.02005 0.00893 10 | Mul 59127.697 0.00908 0.01900 0.01069 11 | Add 63608.223 0.01266 0.02617 0.01511 12 | Triad 64017.868 0.01258 0.02615 0.01513 13 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/cuda.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device DEVICE EMULATION MODE 9 | Driver: PGI 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 57308.251 0.00937 0.02134 0.01109 12 | Mul 55999.151 0.00959 0.02233 0.01134 13 | Add 63534.754 0.01268 0.02962 0.01492 14 | Triad 64546.130 0.01248 0.02873 0.01492 15 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/kokkos-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 56720.246 0.00947 0.01173 0.00999 10 | Mul 55256.554 0.00972 0.03029 0.01052 11 | Add 62562.358 0.01287 0.03304 0.01384 12 | Triad 62965.518 0.01279 0.02534 0.01364 13 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/mccalpin-intel.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per 
array element. 5 | ------------------------------------------------------------- 6 | Array size = 33554432 (elements), Offset = 0 (elements) 7 | Memory per array = 256.0 MiB (= 0.2 GiB). 8 | Total memory required = 768.0 MiB (= 0.8 GiB). 9 | Each kernel will be executed 100 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 24 14 | Number of Threads counted = 24 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 2 microseconds. 17 | Each test below will take on the order of 7369 microseconds. 18 | (= 3684 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 57739.9 0.009647 0.009298 0.010455 28 | Scale: 74390.5 0.008626 0.007217 0.010923 29 | Add: 83859.7 0.010991 0.009603 0.013830 30 | Triad: 82738.1 0.011216 0.009733 0.015579 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz 9 | Driver: 1.2.0.92 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 47971.490 0.01119 0.02317 0.01306 12 | Mul 46385.194 0.01157 0.02247 0.01341 13 | Add 53319.761 0.01510 0.02831 0.01769 14 | Triad 53374.243 0.01509 0.02794 0.01707 15 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/omp-intel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: Reference OpenMP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 56676.385 0.00947 0.01567 0.01040 10 | Mul 55505.151 0.00967 0.01513 0.01075 11 | Add 61874.931 0.01302 0.01930 0.01435 12 | Triad 62073.488 0.01297 0.01899 0.01420 13 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/raja-gcc.txt: 
-------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 55500.859 0.00967 0.01992 0.01139 10 | Mul 55232.718 0.00972 0.01978 0.01151 11 | Add 62568.702 0.01287 0.02616 0.01523 12 | Triad 64105.913 0.01256 0.02570 0.01497 13 | -------------------------------------------------------------------------------- /results/v2.0/ivybridge/sycl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: SYCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using SYCL device Intel(R) Xeon(R) CPU E5-2697 v2 @ 2.70GHz 9 | Driver: 1.2.0.92 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 39756.124 0.01350 0.02534 0.01567 12 | Mul 38899.994 0.01380 0.02237 0.01554 13 | Add 46878.810 0.01718 0.02802 0.01919 14 | Triad 51324.819 0.01569 0.02555 0.01748 15 | -------------------------------------------------------------------------------- /results/v2.0/k20x/acc-cray.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 116702.138 0.00460 0.00462 0.00461 10 | Mul 174325.378 0.00308 0.00310 0.00309 11 | Add 261591.864 0.00308 0.00310 0.00309 12 | Triad 175206.996 0.00460 0.00462 0.00461 13 | Application 1396462 resources: utime ~2s, stime ~1s, Rss ~876780, inblocks ~601, outblocks ~323 14 | -------------------------------------------------------------------------------- /results/v2.0/k20x/cuda.txt: 
-------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Tesla K20X 9 | Driver: 7050 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 181833.763 0.00295 0.00298 0.00297 12 | Mul 181354.354 0.00296 0.00305 0.00297 13 | Add 179955.484 0.00448 0.00449 0.00448 14 | Triad 179798.066 0.00448 0.00450 0.00449 15 | Application 1396457 resources: utime ~3s, stime ~1s, Rss ~871996, inblocks ~690, outblocks ~1373 16 | -------------------------------------------------------------------------------- /results/v2.0/k20x/kokkos.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 182239.282 0.00295 0.00299 0.00296 10 | Mul 182179.668 0.00295 0.00298 0.00296 11 | Add 182333.793 0.00442 0.00451 0.00443 12 | Triad 182213.531 0.00442 0.00444 0.00443 13 | -------------------------------------------------------------------------------- /results/v2.0/k20x/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Tesla K20X 9 | Driver: 352.68 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 182194.259 0.00295 0.00298 0.00296 12 | Mul 182081.736 0.00295 0.00296 0.00296 13 | Add 182723.055 0.00441 0.00443 0.00442 14 | Triad 182719.573 0.00441 0.00443 0.00442 15 | Application 1396458 resources: utime ~3s, stime ~1s, Rss ~1670780, inblocks ~2549, outblocks 
~464 16 | -------------------------------------------------------------------------------- /results/v2.0/k20x/omp-cray.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenMP 4.5 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 174380.925 0.00308 0.00310 0.00309 10 | Mul 174416.162 0.00308 0.00310 0.00309 11 | Add 175158.103 0.00460 0.00462 0.00461 12 | Triad 175104.249 0.00460 0.00462 0.00461 13 | Application 1396463 resources: utime ~2s, stime ~1s, Rss ~876708, inblocks ~600, outblocks ~327 14 | -------------------------------------------------------------------------------- /results/v2.0/k20x/raja.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 182375.663 0.00294 0.00296 0.00295 10 | Mul 182515.909 0.00294 0.00296 0.00295 11 | Add 181695.649 0.00443 0.00445 0.00444 12 | Triad 181436.686 0.00444 0.00445 0.00445 13 | -------------------------------------------------------------------------------- /results/v2.0/k40/acc-cray.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 120256.706 0.00446 0.00458 0.00448 10 | Mul 185251.125 0.00290 0.00292 0.00291 11 | Add 277727.285 0.00290 0.00388 0.00292 12 | Triad 181094.123 0.00445 0.00448 0.00446 13 | 
-------------------------------------------------------------------------------- /results/v2.0/k40/cuda.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Tesla K40m 9 | Driver: 7050 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 194135.310 0.00277 0.00278 0.00277 12 | Mul 194049.073 0.00277 0.00280 0.00278 13 | Add 190956.372 0.00422 0.00423 0.00422 14 | Triad 190822.844 0.00422 0.00423 0.00422 15 | -------------------------------------------------------------------------------- /results/v2.0/k40/kokkos.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 192464.515 0.00279 0.00283 0.00281 10 | Mul 192505.853 0.00279 0.00283 0.00281 11 | Add 193303.390 0.00417 0.00419 0.00417 12 | Triad 193249.349 0.00417 0.00419 0.00417 13 | -------------------------------------------------------------------------------- /results/v2.0/k40/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Tesla K40m 9 | Driver: 352.79 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 190876.305 0.00281 0.00285 0.00283 12 | Mul 190558.963 0.00282 0.00284 0.00283 13 | Add 191437.004 0.00421 0.00422 0.00421 14 | Triad 191420.077 0.00421 0.00423 0.00421 15 | -------------------------------------------------------------------------------- 
/results/v2.0/k40/raja.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 191811.130 0.00280 0.00282 0.00281 10 | Mul 191720.029 0.00280 0.00282 0.00281 11 | Add 192768.490 0.00418 0.00419 0.00418 12 | Triad 192718.253 0.00418 0.00419 0.00418 13 | -------------------------------------------------------------------------------- /results/v2.0/k80/acc-cray.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 113606.114 0.00473 0.00560 0.00476 10 | Mul 175729.637 0.00306 0.00367 0.00309 11 | Add 263518.810 0.00306 0.00369 0.00309 12 | Triad 170709.791 0.00472 0.00573 0.00477 13 | -------------------------------------------------------------------------------- /results/v2.0/k80/cuda.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Tesla K80 9 | Driver: 7050 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 179851.070 0.00299 0.00357 0.00303 12 | Mul 179720.423 0.00299 0.00358 0.00303 13 | Add 176265.359 0.00457 0.00539 0.00461 14 | Triad 176116.986 0.00457 0.00540 0.00461 15 | -------------------------------------------------------------------------------- /results/v2.0/k80/kokkos.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 
| Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 181880.210 0.00295 0.00352 0.00301 10 | Mul 181906.524 0.00295 0.00353 0.00301 11 | Add 179304.662 0.00449 0.00532 0.00456 12 | Triad 179172.535 0.00449 0.00531 0.00456 13 | -------------------------------------------------------------------------------- /results/v2.0/k80/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Tesla K80 9 | Driver: 352.79 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 181392.835 0.00296 0.00357 0.00303 12 | Mul 181350.127 0.00296 0.00356 0.00303 13 | Add 181786.662 0.00443 0.00532 0.00453 14 | Triad 181670.318 0.00443 0.00533 0.00454 15 | -------------------------------------------------------------------------------- /results/v2.0/k80/raja.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 182711.634 0.00294 0.00354 0.00302 10 | Mul 182628.907 0.00294 0.00354 0.00302 11 | Add 178679.382 0.00451 0.00533 0.00462 12 | Triad 178467.177 0.00451 0.00534 0.00462 13 | -------------------------------------------------------------------------------- /results/v2.0/knl/acc-pgi-kernel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB 
(=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 323365.370 0.00166 0.00176 0.00172 10 | Mul 162148.544 0.00331 0.00590 0.00432 11 | Add 425349.852 0.00189 0.00203 0.00193 12 | Triad 239556.020 0.00336 0.00664 0.00549 13 | -------------------------------------------------------------------------------- /results/v2.0/knl/kokkos-intel-128threads.txt: -------------------------------------------------------------------------------- 1 | OMP_NUM_THREADS=128 numactl -m 1 ./gpu-stream-kokkos 2 | GPU-STREAM 3 | Version: 2.0 4 | Implementation: KOKKOS 5 | Running kernels 100 times 6 | Precision: double 7 | Array size: 268.4 MB (=0.3 GB) 8 | Total size: 805.3 MB (=0.8 GB) 9 | Function MBytes/sec Min (sec) Max Average 10 | Copy 284255.707 0.00189 0.00209 0.00199 11 | Mul 259925.621 0.00207 0.00483 0.00426 12 | Add 301882.418 0.00267 0.00295 0.00279 13 | Triad 293037.412 0.00275 0.00314 0.00293 14 | -------------------------------------------------------------------------------- /results/v2.0/knl/mccalpin-intel.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 33554432 (elements), Offset = 0 (elements) 7 | Memory per array = 256.0 MiB (= 0.2 GiB). 8 | Total memory required = 768.0 MiB (= 0.8 GiB). 9 | Each kernel will be executed 100 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 64 14 | Number of Threads counted = 64 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 
17 | Each test below will take on the order of 1507 microseconds. 18 | (= 1507 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 387306.5 0.001418 0.001386 0.001456 28 | Scale: 414238.4 0.001321 0.001296 0.001368 29 | Add: 444668.2 0.001849 0.001811 0.001875 30 | Triad: 447436.7 0.001855 0.001800 0.001949 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /results/v2.0/knl/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz 9 | Driver: 1.2.0.10002 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 218490.851 0.00246 0.00431 0.00257 12 | Mul 216827.480 0.00248 0.00276 0.00258 13 | Add 233472.011 0.00345 0.00365 0.00354 14 | Triad 236852.515 0.00340 0.00365 0.00351 15 | -------------------------------------------------------------------------------- /results/v2.0/knl/omp-intel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: Reference OpenMP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total 
size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 355432.548 0.00151 0.00159 0.00154 10 | Mul 303409.465 0.00177 0.00420 0.00214 11 | Add 317176.372 0.00254 0.00266 0.00259 12 | Triad 296841.725 0.00271 0.00307 0.00288 13 | -------------------------------------------------------------------------------- /results/v2.0/knl/raja-intel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 358479.622 0.00150 0.00166 0.00157 10 | Mul 301367.207 0.00178 0.00369 0.00201 11 | Add 317005.071 0.00254 0.00271 0.00261 12 | Triad 298105.168 0.00270 0.00303 0.00286 13 | -------------------------------------------------------------------------------- /results/v2.0/knl/sycl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: SYCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using SYCL device Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz 9 | Driver: 1.2.0.10002 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 188284.193 0.00285 0.00447 0.00298 12 | Mul 185567.824 0.00289 0.00762 0.00307 13 | Add 207104.230 0.00389 0.00614 0.00404 14 | Triad 207078.189 0.00389 0.01483 0.00415 15 | -------------------------------------------------------------------------------- /results/v2.0/power8/kokkos-xl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 260801.154 0.00206 
0.00955 0.00234 10 | Mul 231963.990 0.00231 0.01149 0.00264 11 | Add 292167.544 0.00276 0.01150 0.00309 12 | Triad 298266.810 0.00270 0.01533 0.00316 13 | -------------------------------------------------------------------------------- /results/v2.0/power8/mccalpin-xl.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 5 | ------------------------------------------------------------- 6 | Array size = 33554432 (elements), Offset = 0 (elements) 7 | Memory per array = 256.0 MiB (= 0.2 GiB). 8 | Total memory required = 768.0 MiB (= 0.8 GiB). 9 | Each kernel will be executed 100 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 20 14 | Number of Threads counted = 20 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 2250 microseconds. 18 | (= 2250 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 261745.9 0.002101 0.002051 0.002517 28 | Scale: 253352.8 0.002188 0.002119 0.003140 29 | Add: 239468.3 0.003499 0.003363 0.004400 30 | Triad: 245151.7 0.003468 0.003285 0.004771 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /results/v2.0/power8/omp-xl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: Reference OpenMP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 223266.147 0.00240 0.00271 0.00248 10 | Mul 196756.047 0.00273 0.00372 0.00297 11 | Add 210090.244 0.00383 0.00441 0.00396 12 | Triad 212958.097 0.00378 0.00500 0.00409 13 | -------------------------------------------------------------------------------- /results/v2.0/power8/raja-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 227063.854 0.00236 0.01007 0.00449 10 | Mul 218404.455 0.00246 0.01080 0.00449 11 | Add 257927.160 0.00312 0.01815 0.00647 12 | Triad 253019.962 0.00318 0.01535 0.00635 13 | -------------------------------------------------------------------------------- /results/v2.0/power8/raja-xl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: 
RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 257796.702 0.00208 0.01017 0.00423 10 | Mul 218399.746 0.00246 0.01163 0.00440 11 | Add 269553.023 0.00299 0.01575 0.00620 12 | Triad 279022.596 0.00289 0.01569 0.00614 13 | -------------------------------------------------------------------------------- /results/v2.0/s9150/acc-pgi-loops.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 256996.540 0.00209 0.00217 0.00212 10 | Mul 258109.030 0.00208 0.00215 0.00212 11 | Add 265716.643 0.00303 0.00315 0.00306 12 | Triad 265499.387 0.00303 0.00307 0.00306 13 | -------------------------------------------------------------------------------- /results/v2.0/s9150/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Hawaii 9 | Driver: 1912.5 (VM) 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 259805.874 0.00207 0.00214 0.00211 12 | Mul 260956.698 0.00206 0.00214 0.00211 13 | Add 268425.077 0.00300 0.00306 0.00304 14 | Triad 267506.939 0.00301 0.00307 0.00305 15 | -------------------------------------------------------------------------------- /results/v2.0/s9150/sycl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: SYCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 
| Using SYCL device Hawaii 9 | Driver: 1912.5 (VM) 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 222466.991 0.00241 0.00251 0.00245 12 | Mul 224827.470 0.00239 0.00247 0.00244 13 | Add 271092.068 0.00297 0.00305 0.00301 14 | Triad 269725.824 0.00299 0.00304 0.00302 15 | -------------------------------------------------------------------------------- /results/v2.0/s9300x2/hip.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: HIP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using HIP device Fiji 9 | Driver: 4 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 442194.067 0.00121 0.03320 0.00331 12 | Mul 442500.583 0.00121 0.00195 0.00149 13 | Add 459234.293 0.00175 0.00355 0.00248 14 | Triad 458682.906 0.00176 0.00357 0.00246 15 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/acc-pgi-kernel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 28718.686 0.01869 0.03758 0.01919 10 | Mul 27994.499 0.01918 0.02873 0.01984 11 | Add 28014.895 0.02875 0.03684 0.02941 12 | Triad 28070.552 0.02869 0.04073 0.02952 13 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/acc-pgi-loops.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenACC 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 28017.895 
0.01916 0.01995 0.01942 10 | Mul 22199.944 0.02418 0.02536 0.02469 11 | Add 23458.271 0.03433 0.03588 0.03497 12 | Triad 23644.549 0.03406 0.03532 0.03465 13 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/cuda.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device DEVICE EMULATION MODE 9 | Driver: PGI 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 21626.429 0.02482 0.03784 0.02526 12 | Mul 21321.415 0.02518 0.02603 0.02551 13 | Add 23394.375 0.03442 0.03588 0.03506 14 | Triad 23527.878 0.03423 0.03550 0.03486 15 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/kokkos-gcc.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: KOKKOS 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 51337.144 0.01046 0.01126 0.01063 10 | Mul 51613.662 0.01040 0.01464 0.01064 11 | Add 54473.120 0.01478 0.01544 0.01506 12 | Triad 54461.048 0.01479 0.01568 0.01523 13 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/mccalpin-intel.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------- 2 | STREAM version $Revision: 5.10 $ 3 | ------------------------------------------------------------- 4 | This system uses 8 bytes per array element. 
5 | ------------------------------------------------------------- 6 | Array size = 33554432 (elements), Offset = 0 (elements) 7 | Memory per array = 256.0 MiB (= 0.2 GiB). 8 | Total memory required = 768.0 MiB (= 0.8 GiB). 9 | Each kernel will be executed 100 times. 10 | The *best* time for each kernel (excluding the first iteration) 11 | will be used to compute the reported bandwidth. 12 | ------------------------------------------------------------- 13 | Number of Threads requested = 16 14 | Number of Threads counted = 16 15 | ------------------------------------------------------------- 16 | Your clock granularity/precision appears to be 1 microseconds. 17 | Each test below will take on the order of 6841 microseconds. 18 | (= 6841 clock ticks) 19 | Increase the size of the arrays if this shows that 20 | you are not getting at least 20 clock ticks per test. 21 | ------------------------------------------------------------- 22 | WARNING -- The above is only a rough guideline. 23 | For best results, please be sure you know the 24 | precision of your system timer. 
25 | ------------------------------------------------------------- 26 | Function Best Rate MB/s Avg time Min time Max time 27 | Copy: 57290.4 0.009506 0.009371 0.009757 28 | Scale: 66890.4 0.008211 0.008026 0.008573 29 | Add: 65196.5 0.012791 0.012352 0.023453 30 | Triad: 64351.8 0.012753 0.012514 0.013659 31 | ------------------------------------------------------------- 32 | Solution Validates: avg error less than 1.000000e-13 on all three arrays 33 | ------------------------------------------------------------- 34 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/ocl.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz 9 | Driver: 1.2.0.8 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 27240.466 0.01971 0.02092 0.02015 12 | Mul 26733.421 0.02008 0.02266 0.02043 13 | Add 29405.722 0.02739 0.03030 0.02808 14 | Triad 29734.601 0.02708 0.02990 0.02783 15 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/omp-intel.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: Reference OpenMP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 54991.394 0.00976 0.01134 0.01002 10 | Mul 48003.655 0.01118 0.01986 0.01146 11 | Add 52111.180 0.01545 0.01621 0.01575 12 | Triad 52985.444 0.01520 0.01608 0.01557 13 | -------------------------------------------------------------------------------- /results/v2.0/sandybridge/raja-gcc.txt: 
-------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: RAJA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Function MBytes/sec Min (sec) Max Average 9 | Copy 51576.355 0.01041 0.01123 0.01064 10 | Mul 50943.953 0.01054 0.01359 0.01074 11 | Add 53535.927 0.01504 0.01598 0.01535 12 | Triad 53928.576 0.01493 0.01579 0.01534 13 | -------------------------------------------------------------------------------- /results/v2.0/titanx/hip.txt: -------------------------------------------------------------------------------- 1 | GPU-STREAM 2 | Version: 2.0 3 | Implementation: HIP 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using HIP device GeForce GTX TITAN X 9 | Driver: 4 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 263048.615 0.00204 0.00205 0.00205 12 | Mul 262831.366 0.00204 0.00205 0.00205 13 | Add 268754.019 0.00300 0.00301 0.00300 14 | Triad 268630.840 0.00300 0.00301 0.00300 15 | -------------------------------------------------------------------------------- /results/v3.3/titanxp/cuda.txt: -------------------------------------------------------------------------------- 1 | BabelStream 2 | Version: 3.3 3 | Implementation: CUDA 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using CUDA device TITAN Xp 9 | Driver: 9010 10 | Function MBytes/sec Min (sec) Max Average 11 | Copy 433182.244 0.00124 0.00128 0.00125 12 | Mul 431744.565 0.00124 0.00125 0.00125 13 | Add 436451.083 0.00185 0.00186 0.00185 14 | Triad 436491.299 0.00184 0.00186 0.00185 15 | Dot 435969.304 0.00123 0.00125 0.00124 16 | -------------------------------------------------------------------------------- /results/v3.3/titanxp/ocl.txt: 
-------------------------------------------------------------------------------- 1 | BabelStream 2 | Version: 3.3 3 | Implementation: OpenCL 4 | Running kernels 100 times 5 | Precision: double 6 | Array size: 268.4 MB (=0.3 GB) 7 | Total size: 805.3 MB (=0.8 GB) 8 | Using OpenCL device TITAN Xp 9 | Driver: 390.30 10 | Reduction kernel config: 120 groups of size 1024 11 | Function MBytes/sec Min (sec) Max Average 12 | Copy 433585.615 0.00124 0.00131 0.00125 13 | Mul 432684.751 0.00124 0.00125 0.00125 14 | Add 435230.162 0.00185 0.00187 0.00186 15 | Triad 435177.949 0.00185 0.00187 0.00186 16 | Dot 431076.539 0.00125 0.00127 0.00126 17 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | **/cuda-stream 3 | **/ocl-stream 4 | **/omp-stream 5 | **/acc-stream 6 | **/raja-stream 7 | **/kokkos-stream 8 | **/std-stream 9 | **/sycl-stream 10 | **/hip-stream 11 | 12 | **/*.o 13 | **/*.bc 14 | **/*.sycl 15 | **/*.tar 16 | **/*.gz 17 | **/*.a 18 | 19 | **/*.swp 20 | 21 | **/KokkosCore_Config_* 22 | 23 | **/.DS_Store 24 | 25 | 26 | build/ 27 | cmake-build-*/ 28 | CMakeFiles/ 29 | .idea/ 30 | .vscode/ 31 | .directory 32 | -------------------------------------------------------------------------------- /src/Stream.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | // Array values 14 | #define startA (0.1) 15 | #define startB (0.2) 16 | #define startC (0.0) 17 | #define startScalar (0.4) 18 | 19 | template 20 | class Stream 21 | { 22 | public: 23 | 24 | virtual ~Stream(){} 25 | 26 | // Kernels 27 | // These must be blocking calls 28 | virtual void 
copy() = 0; 29 | virtual void mul() = 0; 30 | virtual void add() = 0; 31 | virtual void triad() = 0; 32 | virtual void nstream() = 0; 33 | virtual T dot() = 0; 34 | 35 | // Copy memory between host and device 36 | virtual void init_arrays(T initA, T initB, T initC) = 0; 37 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; 38 | 39 | }; 40 | 41 | 42 | // Implementation specific device functions 43 | void listDevices(void); 44 | std::string getDeviceName(const int); 45 | std::string getDeviceDriver(const int); 46 | 47 | -------------------------------------------------------------------------------- /src/acc/ACCStream.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | #include "Stream.h" 14 | 15 | #include 16 | 17 | #define IMPLEMENTATION_STRING "OpenACC" 18 | 19 | template 20 | class ACCStream : public Stream 21 | { 22 | 23 | struct A{ 24 | T *a; 25 | T *b; 26 | T *c; 27 | }; 28 | 29 | protected: 30 | // Size of arrays 31 | int array_size; 32 | A aa; 33 | // Device side pointers 34 | T *a; 35 | T *b; 36 | T *c; 37 | 38 | public: 39 | ACCStream(const int, int); 40 | ~ACCStream(); 41 | 42 | virtual void copy() override; 43 | virtual void add() override; 44 | virtual void mul() override; 45 | virtual void triad() override; 46 | virtual void nstream() override; 47 | virtual T dot() override; 48 | 49 | virtual void init_arrays(T initA, T initB, T initC) override; 50 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 51 | 52 | 53 | 54 | }; 55 | -------------------------------------------------------------------------------- /src/cuda/CUDAStream.h: 
-------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "Stream.h" 15 | 16 | #define IMPLEMENTATION_STRING "CUDA" 17 | 18 | #define TBSIZE 1024 19 | 20 | template 21 | class CUDAStream : public Stream 22 | { 23 | protected: 24 | // Size of arrays 25 | int array_size; 26 | 27 | // Host array for partial sums for dot kernel 28 | T *sums; 29 | 30 | // Device side pointers to arrays 31 | T *d_a; 32 | T *d_b; 33 | T *d_c; 34 | T *d_sum; 35 | 36 | // Number of blocks for dot kernel 37 | int dot_num_blocks; 38 | 39 | public: 40 | 41 | CUDAStream(const int, const int); 42 | ~CUDAStream(); 43 | 44 | virtual void copy() override; 45 | virtual void add() override; 46 | virtual void mul() override; 47 | virtual void triad() override; 48 | virtual void nstream() override; 49 | virtual T dot() override; 50 | 51 | virtual void init_arrays(T initA, T initB, T initC) override; 52 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 53 | 54 | }; 55 | -------------------------------------------------------------------------------- /src/cuda/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation" 4 | "c++") 5 | 6 | register_flag_optional(MEM "Device memory mode: 7 | DEFAULT - allocate host and device memory pointers. 8 | MANAGED - use CUDA Managed Memory. 9 | PAGEFAULT - shared memory, only host pointers allocated." 
10 | "DEFAULT") 11 | 12 | register_flag_required(CMAKE_CUDA_COMPILER 13 | "Path to the CUDA nvcc compiler") 14 | 15 | # XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly 16 | register_flag_required(CUDA_ARCH 17 | "Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc") 18 | 19 | register_flag_optional(CUDA_EXTRA_FLAGS 20 | "Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" 21 | "") 22 | 23 | 24 | macro(setup) 25 | 26 | # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes 27 | if(POLICY CMP0104) 28 | cmake_policy(SET CMP0104 OLD) 29 | endif() 30 | 31 | enable_language(CUDA) 32 | register_definitions(${MEM}) 33 | 34 | # add -forward-unknown-to-host-compiler for compatibility reasons 35 | set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) 36 | string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") 37 | 38 | # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG 39 | # appended later 40 | wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) 41 | 42 | message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") 43 | endmacro() 44 | 45 | -------------------------------------------------------------------------------- /src/dpl_shim.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifndef ALIGNMENT 7 | #define ALIGNMENT (2*1024*1024) // 2MB 8 | #endif 9 | 10 | #ifdef USE_ONEDPL 11 | 12 | // oneDPL C++17 PSTL 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #if ONEDPL_USE_DPCPP_BACKEND 19 | 20 | #include 21 | 22 | const static auto exe_policy = oneapi::dpl::execution::device_policy<>{ 23 | oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{}) 24 | }; 25 | 26 | template 27 | T 
*alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue()); } 28 | 29 | template 30 | void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } 31 | 32 | #else 33 | 34 | // auto exe_policy = dpl::execution::seq; 35 | // auto exe_policy = dpl::execution::par; 36 | static constexpr auto exe_policy = dpl::execution::par_unseq; 37 | #define USE_STD_PTR_ALLOC_DEALLOC 38 | 39 | #endif 40 | 41 | #else 42 | 43 | // Normal C++17 PSTL 44 | 45 | #include 46 | #include 47 | #include 48 | 49 | // auto exe_policy = std::execution::seq; 50 | // auto exe_policy = std::execution::par; 51 | static constexpr auto exe_policy = std::execution::par_unseq; 52 | #define USE_STD_PTR_ALLOC_DEALLOC 53 | 54 | 55 | #endif 56 | 57 | #ifdef USE_STD_PTR_ALLOC_DEALLOC 58 | 59 | #if defined(__HIPSYCL__) || defined(__OPENSYCL__) 60 | #include 61 | 62 | // TODO We temporarily use malloc_shared/free here for hipSYCL stdpar because there's a linking issue if we let it hijack new/delete 63 | // for this to work, we compile with --hipsycl-stdpar-system-usm so that hijacking is disabled 64 | static cl::sycl::queue queue{cl::sycl::default_selector_v}; 65 | template T *alloc_raw(size_t size) { return cl::sycl::malloc_shared(size, queue); } 66 | template void dealloc_raw(T *ptr) { cl::sycl::free(ptr, queue); } 67 | 68 | #else 69 | template 70 | T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } 71 | 72 | template 73 | void dealloc_raw(T *ptr) { free(ptr); } 74 | #endif 75 | 76 | #endif 77 | -------------------------------------------------------------------------------- /src/fortran/BabelStreamTypes.F90: -------------------------------------------------------------------------------- 1 | module BabelStreamTypes 2 | use, intrinsic :: ISO_Fortran_env, only: REAL64,REAL32,INT64,INT32 3 | 4 | implicit none 5 | 6 | #ifdef USE_FLOAT 7 | integer, parameter :: StreamRealKind = REAL32 8 | character(len=6) :: StreamRealName = "REAL32" 9 | #else 10 | 
#!/bin/bash

# Builds every BabelStream Fortran implementation with every Fortran compiler
# detected on this machine (see the COMPILERS probing below).

# uncomment to disable GPU targets
#HAS_GPU=0

# Orin
#if [ "x${compiler}" == "xgcc" ] ; then
#  export MCPU=cortex-a78ae
#fi
#if [ "x${compiler}" == "xarm" ] ; then
#  export MCPU=cortex-a78
#fi

COMPILERS="gcc"
# command -v is quieter and more reliable than `which` for existence checks.
if command -v nvfortran >/dev/null 2>&1 ; then
  COMPILERS="${COMPILERS} nvhpc"
fi
if command -v crayftn >/dev/null 2>&1 ; then
  COMPILERS="${COMPILERS} cray"
fi
if [ "$(uname -m)" == "aarch64" ] ; then
  if command -v armflang >/dev/null 2>&1 ; then
    COMPILERS="${COMPILERS} arm"
  fi
  if command -v frt >/dev/null 2>&1 ; then
    COMPILERS="${COMPILERS} fj"
  fi
elif [ "$(uname -m)" == "x86_64" ] ; then
  # BUG FIX: the original test expanded an unquoted command substitution, so on
  # non-Intel CPUs (empty output) `[` saw `== GenuineIntel` and errored out.
  # Probe lscpu explicitly, then compare a quoted result.
  CPU_VENDOR=""
  if command -v lscpu >/dev/null 2>&1 ; then
    CPU_VENDOR=$(lscpu | awk '/GenuineIntel/ {print $3; exit}')
  fi
  if [ "${CPU_VENDOR}" == "GenuineIntel" ] ; then
    COMPILERS="${COMPILERS} oneapi"
    if [ -f /opt/intel/oneapi/setvars.sh ] ; then
      . /opt/intel/oneapi/setvars.sh >& /dev/null
    fi
  else
    # ^ this detection can be improved
    COMPILERS="${COMPILERS} amd"
  fi
fi

for compiler in ${COMPILERS} ; do
  # CPU-only targets are always built; GPU targets only unless HAS_GPU=0.
  TARGETS="DoConcurrent Array OpenMP OpenMPTaskloop OpenMPWorkshare"
  if [ "${HAS_GPU}" != "0" ] ; then
    TARGETS="${TARGETS} OpenMPTarget OpenMPTargetLoop"
    if [ "x${compiler}" == "xnvhpc" ] ; then
      TARGETS="${TARGETS} CUDA CUDAKernel"
    fi
  fi
  # OpenACC is only attempted for compilers known to support it.
  if [ "x${compiler}" == "xnvhpc" ] || [ "x${compiler}" == "xgcc" ] || [ "x${compiler}" == "xcray" ] ; then
    TARGETS="${TARGETS} OpenACC OpenACCArray"
  fi
  for implementation in ${TARGETS} ; do
    make COMPILER=${compiler} IMPLEMENTATION=${implementation}
  done
done
# Compiler configuration for the Arm Compiler for Linux Fortran front end
# (armflang), included by the Fortran Makefile via COMPILER=arm.
FC = armflang
FCFLAGS = -std=f2018 -O3
FCFLAGS += -Wall -Wno-unused-variable

# Pick CPU tuning: -mcpu on AArch64, -march elsewhere; override via MCPU/MARCH.
# MARCH=neoverse-v1,neoverse-n1,icelake-server,znver3,cortex-a78
ARCH=$(shell uname -m)
ifeq ($(ARCH),aarch64)
ifdef MCPU
FCFLAGS += -mcpu=$(MCPU)
else
FCFLAGS += -mcpu=native
endif
else
ifdef MARCH
FCFLAGS += -march=$(MARCH)
else
FCFLAGS += -march=native
endif
endif

# Per-implementation flags consumed by the Makefile. OPENACC_FLAG is defined
# but the OpenACC implementations are rejected below, so it is never used.
DOCONCURRENT_FLAG = -fopenmp
ARRAY_FLAG = -fopenmp
OPENMP_FLAG = -fopenmp
OPENACC_FLAG = -fopenacc
CUDA_FLAG =
SEQUENTIAL_FLAG =

# Fail early for implementations this compiler cannot build.
# NOTE(review): build.sh uses the value "CUDAKernel" (singular), so the
# "CUDAKernels" guard below may never match — verify the canonical spelling.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
# Compiler configuration for the Fujitsu Fortran compiler (frt, A64FX),
# included by the Fortran Makefile via COMPILER=fj.
FC := frt
# FJ Fortran system_clock is low resolution, hence -DUSE_OMP_GET_WTIME=1
FCFLAGS = -X08 -Kfast -KA64FX -KSVE -KARMV8_3_A -Kzfill=100 -Kprefetch_sequential=soft -Kprefetch_line=8 -Kprefetch_line_L2=16 -Koptmsg=2 -Keval -DUSE_OMP_GET_WTIME=1

DOCONCURRENT_FLAG = -Kparallel,reduction -DNOTSHARED
ARRAY_FLAG = -Kparallel,reduction
OPENMP_FLAG = -fopenmp
OPENACC_FLAG =
# CPU only
OPENACC_FLAG +=
CUDA_FLAG =
SEQUENTIAL_FLAG =

# Fail early for implementations this compiler cannot build.
# BUG FIX: the guard previously tested "OPENACC", but callers (build.sh) pass
# IMPLEMENTATION=OpenACC, so the check never fired; also guard OpenACCArray
# and the "CUDAKernel" spelling actually used by build.sh.
ifeq ($(IMPLEMENTATION),OpenACC)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),OpenACCArray)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDA)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernel)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
ifeq ($(IMPLEMENTATION),CUDAKernels)
$(error IMPLEMENTATION=$(IMPLEMENTATION) is not supported by this compiler.)
endif
# Compiler configuration for NVIDIA HPC SDK Fortran (nvfortran),
# included by the Fortran Makefile via COMPILER=nvhpc.
FC := nvfortran
#FCFLAGS := -O3 -Minform=inform -Minfo=all
FCFLAGS := -O3 -Minform=warn

# Offload target for stdpar/OpenMP/OpenACC: gpu or multicore.
#TARGET=gpu
TARGET=multicore

# Detect the GPU architecture from nvidia-smi and map it to -gpu=ccNN.
NVARCH=$(shell which nvidia-smi > /dev/null && nvidia-smi -q | grep "Product Architecture")
ifeq ($(findstring Ampere,$(NVARCH)),Ampere)
$(info Ampere detected)
GPU = cc80
endif
ifeq ($(findstring Turing,$(NVARCH)),Turing)
$(info Turing detected)
GPU = cc75
endif
ifeq ($(findstring Volta,$(NVARCH)),Volta)
$(info Volta detected)
GPU = cc70
endif
ifeq ($(findstring Pascal,$(NVARCH)),Pascal)
$(info Pascal detected)
GPU = cc60,cc61
endif
ifeq ($(shell which jetson_clocks > /dev/null && echo 1),1)
$(info Jetson AGX Orin detected)
# BUG FIX: was "ccn87,cc86" — "ccn87" is not a valid compute capability token
# and nvfortran rejects it; Jetson AGX Orin's GPU is compute capability 8.7.
GPU = cc87,cc86
# figure out Xavier later
#GPU = cc72
endif
ifeq ($(GPU),)
$(error Your GPU architecture could not be detected. Set it manually.)
endif
GPUFLAG = -gpu=$(GPU)

# CPU tuning via -tp; only specific Neoverse names are passed through on AArch64.
# MARCH=neoverse-v1,neoverse-n1,zen3
ARCH=$(shell uname -m)
ifdef MARCH
ifeq ($(ARCH),aarch64)
ifeq ($(MARCH),neoverse-n1)
FCFLAGS += -tp=$(MARCH)
else
ifeq ($(MARCH),neoverse-v1)
FCFLAGS += -tp=$(MARCH)
else
FCFLAGS += -tp=native
endif
endif
else
FCFLAGS += -tp=$(MARCH)
endif
else
FCFLAGS += -tp=native
endif

# this is to allow apples-to-apples comparison with DC in non-DC GPU impls
# set exactly one of these!
#MANAGED = -DUSE_MANAGED -gpu=managed
#DEVICE = -DUSE_DEVICE -cuda -gpu=nomanaged

DOCONCURRENT_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(DEVICE)
ARRAY_FLAG = $(GPUFLAG) -stdpar=$(TARGET) $(MANAGED)
OPENMP_FLAG = $(GPUFLAG) -mp=$(TARGET) $(MANAGED)
OPENACC_FLAG = $(GPUFLAG) -acc=$(TARGET) $(MANAGED)
CUDA_FLAG = $(GPUFLAG) -cuda -acc=gpu $(MANAGED)
SEQUENTIAL_FLAG =

ifeq ($(IMPLEMENTATION),OpenMPTaskloop)
$(error IMPLEMENTATION=OpenMPTaskloop is not supported by this compiler.)
endif
32 | endif 33 | -------------------------------------------------------------------------------- /src/fortran/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cat ./run.sh 4 | 5 | if [ `uname -s` == Darwin ] ; then 6 | NUM_HWTHREADS=`sysctl -n hw.ncpu` 7 | MEMORY_BYTES=`sysctl -n hw.memsize` 8 | else 9 | NUM_HWTHREADS=`nproc` 10 | MEMORY_KILOS=`grep MemTotal /proc/meminfo | awk '{print $2}'` 11 | fi 12 | 13 | M=128 14 | 15 | export OMP_NUM_THREADS=8 16 | export OMP_PROC_BIND=close 17 | export OMP_PLACES=threads 18 | 19 | export ACC_NUM_CORES=${OMP_NUM_THREADS} 20 | 21 | AFFCONTROL="numactl -N 0 -m 0 -C `seq -s "," 0 $((${OMP_NUM_THREADS}-1))`" 22 | 23 | for compiler in gcc nvhpc cray oneapi arm amd fj ; do 24 | #if [ "x$compiler" == "xgcc" ] ; then 25 | # export LD_PRELOAD=/usr/lib/gcc/aarch64-linux-gnu/11/libgomp.so 26 | #fi 27 | for implementation in OpenMP OpenMPTaskloop OpenMPWorkshare DoConcurrent Array OpenACC OpenACCArray CUDA CUDAKernel ; do 28 | if [ -f BabelStream.${compiler}.${implementation} ] ; then 29 | echo "BabelStream.${compiler}.${implementation}" 30 | ldd BabelStream.${compiler}.${implementation} 31 | time $AFFCONTROL \ 32 | ./BabelStream.${compiler}.${implementation} -s $((1024*1024*${M})) 33 | fi 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /src/futhark/FutharkStream.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 2 | // University of Bristol HPC 3 | // Copyright (c) 2022 Troels Henriksen 4 | // University of Copenhagen 5 | // 6 | // For full license terms please see the LICENSE file distributed with this 7 | // source code 8 | 9 | #pragma once 10 | 11 | #include 12 | #include 13 | 14 | #include "Stream.h" 15 | #include "babelstream.h" 16 | 17 | #if defined(FUTHARK_BACKEND_c) 18 | #define IMPLEMENTATION_STRING "Futhark 
(sequential)" 19 | #elif defined(FUTHARK_BACKEND_multicore) 20 | #define IMPLEMENTATION_STRING "Futhark (parallel CPU)" 21 | #elif defined(FUTHARK_BACKEND_opencl) 22 | #define IMPLEMENTATION_STRING "Futhark (OpencL)" 23 | #elif defined(FUTHARK_BACKEND_cuda) 24 | #define IMPLEMENTATION_STRING "Futhark (CUDA)" 25 | #else 26 | #define IMPLEMENTATION_STRING "Futhark (unknown backend)" 27 | #endif 28 | 29 | template 30 | class FutharkStream : public Stream 31 | { 32 | protected: 33 | // Size of arrays 34 | int array_size; 35 | // For device selection. 36 | std::string device; 37 | 38 | // Futhark stuff 39 | struct futhark_context_config *cfg; 40 | struct futhark_context *ctx; 41 | 42 | // Device side arrays 43 | void* a; 44 | void* b; 45 | void* c; 46 | 47 | public: 48 | FutharkStream(const int, int); 49 | ~FutharkStream(); 50 | 51 | virtual void copy() override; 52 | virtual void add() override; 53 | virtual void mul() override; 54 | virtual void triad() override; 55 | virtual void nstream() override; 56 | virtual T dot() override; 57 | 58 | virtual void init_arrays(T initA, T initB, T initC) override; 59 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 60 | }; 61 | -------------------------------------------------------------------------------- /src/futhark/babelstream.fut: -------------------------------------------------------------------------------- 1 | module type kernels = { 2 | type t 3 | val copy [n] : [n]t -> *[n]t 4 | val mul [n] : t -> [n]t -> [n]t 5 | val add [n] : [n]t -> [n]t -> [n]t 6 | val triad [n] : t -> [n]t -> [n]t -> [n]t 7 | val dot [n] : [n]t -> [n]t -> t 8 | -- Uniqueness allows nstream to mutate the 'a' array. 
# Use
#
#   cmake -Bbuild -H. -DMODEL=futhark -DFUTHARK_BACKEND=foo -DFUTHARK_COMPILER=foo/bar/bin/futhark
#
# to use the Futhark backend, where 'foo' must be one of 'multicore',
# 'c', 'opencl', or 'cuda'. Defaults to 'multicore'.
#
# Use -DFUTHARK_COMPILER to set the path to the Futhark compiler
# binary. Defaults to 'futhark' on the PATH.

register_flag_optional(FUTHARK_BACKEND
    "Use a specific Futhark backend, possible options are:
        - c
        - multicore
        - opencl
        - cuda"
    "multicore")

register_flag_optional(FUTHARK_COMPILER
    "Absolute path to the Futhark compiler, defaults to the futhark compiler on PATH"
    "futhark")

macro(setup)
    # Generate babelstream.c/.h from the .fut source at build time; the chosen
    # backend determines which runtime (pthreads/OpenCL/CUDA) the C code targets.
    add_custom_command(
        OUTPUT
        ${CMAKE_CURRENT_BINARY_DIR}/babelstream.c
        ${CMAKE_CURRENT_BINARY_DIR}/babelstream.h
        COMMAND ${FUTHARK_COMPILER} ${FUTHARK_BACKEND}
        --library src/futhark/babelstream.fut
        -o ${CMAKE_CURRENT_BINARY_DIR}/babelstream
        DEPENDS src/futhark/babelstream.fut
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        VERBATIM
    )
    # Link whatever runtime libraries the generated C code needs per backend.
    if (${FUTHARK_BACKEND} STREQUAL "c")
        # Nothing to do.
    elseif (${FUTHARK_BACKEND} STREQUAL "multicore")
        set(THREADS_PREFER_PTHREAD_FLAG ON)
        find_package(Threads REQUIRED)
        register_link_library(Threads::Threads)
    elseif (${FUTHARK_BACKEND} STREQUAL "opencl")
        find_package(OpenCL REQUIRED)
        register_link_library(OpenCL::OpenCL)
    elseif (${FUTHARK_BACKEND} STREQUAL "cuda")
        # NOTE(review): find_package(CUDA) is the deprecated FindCUDA module
        # (superseded by FindCUDAToolkit in CMake >= 3.17) — consider migrating.
        find_package(CUDA REQUIRED)
        register_link_library("nvrtc" "cuda" "cudart")
    else ()
        message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}")
    endif()
endmacro()

macro(setup_target)
    # Compile the generated C file into the benchmark executable and make the
    # generated babelstream.h visible to FutharkStream.cpp.
    target_sources(${EXE_NAME} PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/babelstream.c")
    include_directories("${CMAKE_CURRENT_BINARY_DIR}")
endmacro()
# Build configuration for the HIP model; hipcc acts as both host and device
# compiler, so no extra language setup is required.

register_flag_required(CMAKE_CXX_COMPILER
    "Absolute path to the AMD HIP C++ compiler")

register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
MANAGED - use HIP Managed Memory.
PAGEFAULT - shared memory, only host pointers allocated."
    "DEFAULT")

macro(setup)
    # nothing to do here as hipcc does everything correctly, what a surprise!
    # Propagate the chosen memory mode as a compile definition (DEFAULT/MANAGED/PAGEFAULT).
    register_definitions(${MEM})
endmacro()
Log file 80 | *.log 81 | 82 | # BlueJ files 83 | *.ctxt 84 | 85 | # Mobile Tools for Java (J2ME) 86 | .mtj.tmp/ 87 | 88 | # Package Files # 89 | *.jar 90 | *.war 91 | *.ear 92 | *.zip 93 | *.tar.gz 94 | *.rar 95 | 96 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 97 | hs_err_pid* 98 | ### macOS template 99 | *.DS_Store 100 | .AppleDouble 101 | .LSOverride 102 | 103 | # Icon must end with two \r 104 | Icon 105 | 106 | # Thumbnails 107 | ._* 108 | 109 | # Files that might appear in the root of a volume 110 | .DocumentRevisions-V100 111 | .fseventsd 112 | .Spotlight-V100 113 | .TemporaryItems 114 | .Trashes 115 | .VolumeIcon.icns 116 | .com.apple.timemachine.donotpresent 117 | 118 | # Directories potentially created on remote AFP share 119 | .AppleDB 120 | .AppleDesktop 121 | Network Trash Folder 122 | Temporary Items 123 | .apdisk 124 | 125 | 126 | !.mvn/**/* 127 | 128 | settings.xml 129 | -------------------------------------------------------------------------------- /src/java/java-stream/.mvn/wrapper/maven-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/BabelStream/2f00dfb7f8b7cfe8c53d20d5c770bccbf8673440/src/java/java-stream/.mvn/wrapper/maven-wrapper.jar -------------------------------------------------------------------------------- /src/java/java-stream/.mvn/wrapper/maven-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionUrl=https://repo1.maven.org/maven2/org/apache/maven/apache-maven/3.5.0/apache-maven-3.5.0-bin.zip -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/FractionalMaths.java: -------------------------------------------------------------------------------- 1 | package javastream; 2 | 3 | /** 4 | * This class represents our Fractional typeclass. 
Java's type system isn't unified so we have to do 5 | * insane things for parametric operations on fractional types. 6 | */ 7 | @SuppressWarnings("unchecked") 8 | public final class FractionalMaths { 9 | 10 | private FractionalMaths() { 11 | throw new AssertionError(); 12 | } 13 | 14 | public static T from(Class evidence, Number n) { 15 | if (evidence == Double.TYPE || evidence == Double.class) 16 | return (T) Double.valueOf(n.doubleValue()); 17 | else if (evidence == Float.TYPE || evidence == Float.class) 18 | return (T) Float.valueOf(n.floatValue()); 19 | throw new IllegalArgumentException(); 20 | } 21 | 22 | public static T plus(T x, T y) { 23 | if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() + y.doubleValue()); 24 | else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() + y.floatValue()); 25 | throw new IllegalArgumentException(); 26 | } 27 | 28 | static T minus(T x, T y) { 29 | if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() - y.doubleValue()); 30 | else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() - y.floatValue()); 31 | throw new IllegalArgumentException(); 32 | } 33 | 34 | public static T times(T x, T y) { 35 | if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() * y.doubleValue()); 36 | else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() * y.floatValue()); 37 | throw new IllegalArgumentException(); 38 | } 39 | 40 | static T divide(T x, T y) { 41 | if (x instanceof Double) return (T) Double.valueOf(x.doubleValue() / y.doubleValue()); 42 | else if (x instanceof Float) return (T) Float.valueOf(x.floatValue() / y.floatValue()); 43 | throw new IllegalArgumentException(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/aparapi/GenericAparapiStreamKernel.java: -------------------------------------------------------------------------------- 1 | package 
javastream.aparapi; 2 | 3 | import com.aparapi.Kernel; 4 | import com.aparapi.Range; 5 | import javastream.JavaStream.Data; 6 | import javastream.Main.Config; 7 | 8 | abstract class GenericAparapiStreamKernel extends Kernel { 9 | 10 | protected static final int FN_COPY = 1; 11 | protected static final int FN_MUL = 2; 12 | protected static final int FN_ADD = 3; 13 | protected static final int FN_TRIAD = 4; 14 | protected static final int FN_NSTREAM = 5; 15 | protected static final int FN_DOT = 6; 16 | protected final Config config; 17 | protected final int arraysize, numGroups, workGroupSize; 18 | 19 | interface Factory { 20 | GenericAparapiStreamKernel create(Config config, int numGroups, int workGroupSize); 21 | } 22 | 23 | GenericAparapiStreamKernel(Config config, int numGroups, int workGroupSize) { 24 | this.config = config; 25 | this.arraysize = config.options.arraysize; 26 | this.numGroups = numGroups; 27 | this.workGroupSize = workGroupSize; 28 | setExplicit(true); 29 | } 30 | 31 | protected int function; 32 | 33 | public abstract void init(); 34 | 35 | public void copy() { 36 | function = FN_COPY; 37 | execute(arraysize); 38 | } 39 | 40 | public void mul() { 41 | function = FN_MUL; 42 | execute(arraysize); 43 | } 44 | 45 | public void add() { 46 | function = FN_ADD; 47 | execute(arraysize); 48 | } 49 | 50 | public void triad() { 51 | function = FN_TRIAD; 52 | execute(arraysize); 53 | } 54 | 55 | public void nstream() { 56 | function = FN_NSTREAM; 57 | execute(arraysize); 58 | } 59 | 60 | protected Kernel partialDot() { 61 | function = FN_DOT; 62 | return execute(Range.create(numGroups * workGroupSize, workGroupSize)); 63 | } 64 | 65 | abstract T dot(); 66 | 67 | abstract Data syncAndDispose(); 68 | } 69 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedDoubleKernel.java: -------------------------------------------------------------------------------- 1 | 
package javastream.aparapi; 2 | 3 | import java.util.Arrays; 4 | import javastream.JavaStream; 5 | import javastream.JavaStream.Data; 6 | import javastream.Main.Config; 7 | 8 | final class SpecialisedDoubleKernel extends GenericAparapiStreamKernel { 9 | private final double scalar; 10 | final double[] a, b, c; 11 | private final double[] partialSum; 12 | @Local private final double[] workGroupSum; 13 | 14 | SpecialisedDoubleKernel(Config config, int numGroups, int workGroupSize) { 15 | super(config, numGroups, workGroupSize); 16 | this.scalar = config.scalar; 17 | this.a = new double[this.arraysize]; 18 | this.b = new double[this.arraysize]; 19 | this.c = new double[this.arraysize]; 20 | 21 | this.partialSum = new double[numGroups]; 22 | this.workGroupSum = new double[workGroupSize]; 23 | } 24 | 25 | @SuppressWarnings("DuplicatedCode") 26 | @Override 27 | public void run() { 28 | int i = getGlobalId(); 29 | if (function == FN_COPY) { 30 | c[i] = a[i]; 31 | } else if (function == FN_MUL) { 32 | b[i] = scalar * c[i]; 33 | } else if (function == FN_ADD) { 34 | c[i] = a[i] + b[i]; 35 | } else if (function == FN_TRIAD) { 36 | a[i] = b[i] + scalar * c[i]; 37 | } else if (function == FN_NSTREAM) { 38 | a[i] += b[i] + scalar * c[i]; 39 | } else if (function == FN_DOT) { 40 | int localId = getLocalId(0); 41 | workGroupSum[localId] = 0.0; 42 | for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i]; 43 | for (int offset = getLocalSize(0) / 2; offset > 0; offset /= 2) { 44 | localBarrier(); 45 | if (localId < offset) { 46 | workGroupSum[localId] += workGroupSum[localId + offset]; 47 | } 48 | } 49 | if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId]; 50 | } 51 | } 52 | 53 | @Override 54 | public void init() { 55 | Arrays.fill(a, config.initA); 56 | Arrays.fill(b, config.initB); 57 | Arrays.fill(c, config.initC); 58 | put(a).put(b).put(c); 59 | } 60 | 61 | @Override 62 | public Double dot() { 63 | partialDot().get(partialSum); 64 | 
double sum = 0; 65 | for (double v : partialSum) sum += v; 66 | return sum; 67 | } 68 | 69 | @Override 70 | public Data syncAndDispose() { 71 | get(a).get(b).get(c).dispose(); 72 | return new Data<>(JavaStream.boxed(a), JavaStream.boxed(b), JavaStream.boxed(c)); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/aparapi/SpecialisedFloatKernel.java: -------------------------------------------------------------------------------- 1 | package javastream.aparapi; 2 | 3 | import static javastream.JavaStream.boxed; 4 | 5 | import java.util.Arrays; 6 | import javastream.JavaStream.Data; 7 | import javastream.Main.Config; 8 | 9 | final class SpecialisedFloatKernel extends GenericAparapiStreamKernel { 10 | private final float scalar; 11 | final float[] a, b, c; 12 | private final float[] partialSum; 13 | @Local private final float[] workGroupSum; 14 | 15 | SpecialisedFloatKernel(Config config, int numGroups, int workGroupSize) { 16 | super(config, numGroups, workGroupSize); 17 | this.scalar = config.scalar; 18 | this.a = new float[this.arraysize]; 19 | this.b = new float[this.arraysize]; 20 | this.c = new float[this.arraysize]; 21 | 22 | this.partialSum = new float[numGroups]; 23 | this.workGroupSum = new float[workGroupSize]; 24 | } 25 | 26 | @SuppressWarnings("DuplicatedCode") 27 | @Override 28 | public void run() { 29 | int i = getGlobalId(); 30 | if (function == FN_COPY) { 31 | c[i] = a[i]; 32 | } else if (function == FN_MUL) { 33 | b[i] = scalar * c[i]; 34 | } else if (function == FN_ADD) { 35 | c[i] = a[i] + b[i]; 36 | } else if (function == FN_TRIAD) { 37 | a[i] = b[i] + scalar * c[i]; 38 | } else if (function == FN_NSTREAM) { 39 | a[i] += b[i] + scalar * c[i]; 40 | } else if (function == FN_DOT) { 41 | int localId = getLocalId(0); 42 | workGroupSum[localId] = 0.f; 43 | for (; i < arraysize; i += getGlobalSize(0)) workGroupSum[localId] += a[i] * b[i]; 44 | for (int offset 
= getLocalSize(0) / 2; offset > 0; offset /= 2) { 45 | localBarrier(); 46 | if (localId < offset) { 47 | workGroupSum[localId] += workGroupSum[localId + offset]; 48 | } 49 | } 50 | if (localId == 0) partialSum[getGroupId(0)] = workGroupSum[localId]; 51 | } 52 | } 53 | 54 | @Override 55 | public void init() { 56 | Arrays.fill(a, config.initA); 57 | Arrays.fill(b, config.initB); 58 | Arrays.fill(c, config.initC); 59 | put(a).put(b).put(c); 60 | } 61 | 62 | @Override 63 | public Float dot() { 64 | partialDot().get(partialSum); 65 | float sum = 0; 66 | for (float v : partialSum) sum += v; 67 | return sum; 68 | } 69 | 70 | @Override 71 | public Data syncAndDispose() { 72 | get(a).get(b).get(c).dispose(); 73 | return new Data<>(boxed(a), boxed(b), boxed(c)); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/GenericPlainStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import static javastream.FractionalMaths.from; 4 | import static javastream.FractionalMaths.plus; 5 | import static javastream.FractionalMaths.times; 6 | 7 | import java.lang.reflect.Array; 8 | import java.util.Collections; 9 | import java.util.List; 10 | import javastream.JavaStream; 11 | import javastream.Main.Config; 12 | 13 | final class GenericPlainStream extends JavaStream { 14 | 15 | private final T[] a; 16 | private final T[] b; 17 | private final T[] c; 18 | 19 | @SuppressWarnings("unchecked") 20 | GenericPlainStream(Config config) { 21 | super(config); 22 | this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize); 23 | this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize); 24 | this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize); 25 | } 26 | 27 | @Override 28 | public List listDevices() { 29 | return Collections.singletonList("JVM"); 30 | } 31 | 32 | 
@Override 33 | public void initArrays() { 34 | for (int i = 0; i < config.options.arraysize; i++) { 35 | a[i] = config.initA; 36 | b[i] = config.initB; 37 | c[i] = config.initC; 38 | } 39 | } 40 | 41 | @SuppressWarnings("ManualArrayCopy") 42 | @Override 43 | public void copy() { 44 | for (int i = 0; i < config.options.arraysize; i++) { 45 | c[i] = a[i]; 46 | } 47 | } 48 | 49 | @Override 50 | public void mul() { 51 | for (int i = 0; i < config.options.arraysize; i++) { 52 | b[i] = times(config.scalar, c[i]); 53 | } 54 | } 55 | 56 | @Override 57 | public void add() { 58 | 59 | for (int i = 0; i < config.options.arraysize; i++) { 60 | c[i] = plus(a[i], b[i]); 61 | } 62 | } 63 | 64 | @Override 65 | public void triad() { 66 | 67 | for (int i = 0; i < config.options.arraysize; i++) { 68 | a[i] = plus(b[i], times(config.scalar, c[i])); 69 | } 70 | } 71 | 72 | @Override 73 | public void nstream() { 74 | for (int i = 0; i < config.options.arraysize; i++) { 75 | a[i] = plus(a[i], plus(b[i], times(config.scalar, c[i]))); 76 | } 77 | } 78 | 79 | @Override 80 | public T dot() { 81 | T acc = from(config.evidence, 0); 82 | for (int i = 0; i < config.options.arraysize; i++) { 83 | acc = plus(acc, times(a[i], b[i])); 84 | } 85 | return acc; 86 | } 87 | 88 | @Override 89 | public Data readArrays() { 90 | return new Data<>(a, b, c); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/GenericStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import static javastream.FractionalMaths.from; 4 | import static javastream.FractionalMaths.plus; 5 | import static javastream.FractionalMaths.times; 6 | 7 | import java.lang.reflect.Array; 8 | import java.util.Arrays; 9 | import java.util.Collections; 10 | import java.util.List; 11 | import java.util.stream.IntStream; 12 | import javastream.FractionalMaths; 13 | import 
javastream.JavaStream; 14 | import javastream.Main.Config; 15 | 16 | /** 17 | * We use 18 | * 19 | *
Arrays.parallelSetAll
20 | * 21 | *

here as it internally calls 22 | * 23 | *

IntStream.range(0, array.length).parallel().forEach(...)
24 | */ 25 | final class GenericStream extends JavaStream { 26 | 27 | private final T[] a, b, c; 28 | 29 | @SuppressWarnings("unchecked") 30 | GenericStream(Config config) { 31 | super(config); 32 | this.a = (T[]) Array.newInstance(config.evidence, config.options.arraysize); 33 | this.b = (T[]) Array.newInstance(config.evidence, config.options.arraysize); 34 | this.c = (T[]) Array.newInstance(config.evidence, config.options.arraysize); 35 | } 36 | 37 | @Override 38 | public List listDevices() { 39 | return Collections.singletonList("JVM"); 40 | } 41 | 42 | @Override 43 | public void initArrays() { 44 | Arrays.parallelSetAll(a, i -> config.initA); 45 | Arrays.parallelSetAll(b, i -> config.initB); 46 | Arrays.parallelSetAll(c, i -> config.initC); 47 | } 48 | 49 | @Override 50 | public void copy() { 51 | Arrays.parallelSetAll(c, i -> a[i]); 52 | } 53 | 54 | @Override 55 | public void mul() { 56 | Arrays.parallelSetAll(b, i -> times(config.scalar, c[i])); 57 | } 58 | 59 | @Override 60 | public void add() { 61 | Arrays.parallelSetAll(c, i -> plus(a[i], b[i])); 62 | } 63 | 64 | @Override 65 | public void triad() { 66 | Arrays.parallelSetAll(a, i -> plus(b[i], times(config.scalar, c[i]))); 67 | } 68 | 69 | @Override 70 | public void nstream() { 71 | Arrays.parallelSetAll(a, i -> plus(a[i], plus(b[i], times(config.scalar, c[i])))); 72 | } 73 | 74 | @Override 75 | public T dot() { 76 | return IntStream.range(0, config.options.arraysize) 77 | .parallel() 78 | .mapToObj(i -> times(a[i], b[i])) 79 | .reduce(from(config.evidence, 0), FractionalMaths::plus); 80 | } 81 | 82 | @Override 83 | public Data readArrays() { 84 | return new Data<>(a, b, c); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/JdkStreams.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import java.util.AbstractMap.SimpleImmutableEntry; 
4 | import java.util.function.Function; 5 | import javastream.JavaStream; 6 | import javastream.JavaStream.EnumeratedStream; 7 | import javastream.Main.Config; 8 | 9 | public final class JdkStreams { 10 | 11 | private JdkStreams() {} 12 | 13 | public static final Function, JavaStream> FLOAT = 14 | config -> 15 | new EnumeratedStream<>( 16 | config, 17 | new SimpleImmutableEntry<>("specialised", SpecialisedFloatStream::new), 18 | new SimpleImmutableEntry<>("generic", GenericStream::new)); 19 | 20 | public static final Function, JavaStream> DOUBLE = 21 | config -> 22 | new EnumeratedStream<>( 23 | config, 24 | new SimpleImmutableEntry<>("specialised", SpecialisedDoubleStream::new), 25 | new SimpleImmutableEntry<>("generic", GenericStream::new)); 26 | } 27 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/PlainStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import java.util.AbstractMap.SimpleImmutableEntry; 4 | import java.util.function.Function; 5 | import javastream.JavaStream; 6 | import javastream.JavaStream.EnumeratedStream; 7 | import javastream.Main.Config; 8 | 9 | public final class PlainStream { 10 | 11 | private PlainStream() {} 12 | 13 | public static final Function, JavaStream> FLOAT = 14 | config -> 15 | new EnumeratedStream<>( 16 | config, 17 | new SimpleImmutableEntry<>("specialised", SpecialisedPlainFloatStream::new), 18 | new SimpleImmutableEntry<>("generic", GenericPlainStream::new)); 19 | 20 | public static final Function, JavaStream> DOUBLE = 21 | config -> 22 | new EnumeratedStream<>( 23 | config, 24 | new SimpleImmutableEntry<>("specialised", SpecialisedPlainDoubleStream::new), 25 | new SimpleImmutableEntry<>("generic", GenericPlainStream::new)); 26 | } 27 | -------------------------------------------------------------------------------- 
/src/java/java-stream/src/main/java/javastream/jdk/SpecialisedDoubleStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | import java.util.stream.IntStream; 6 | import javastream.JavaStream; 7 | import javastream.Main.Config; 8 | 9 | final class SpecialisedDoubleStream extends JavaStream { 10 | 11 | private final double[] a, b, c; 12 | 13 | SpecialisedDoubleStream(Config config) { 14 | super(config); 15 | this.a = new double[config.options.arraysize]; 16 | this.b = new double[config.options.arraysize]; 17 | this.c = new double[config.options.arraysize]; 18 | } 19 | 20 | @Override 21 | public List listDevices() { 22 | return Collections.singletonList("JVM"); 23 | } 24 | 25 | @Override 26 | public void initArrays() { 27 | IntStream.range(0, config.options.arraysize) // 28 | .parallel() 29 | .forEach( 30 | i -> { 31 | a[i] = config.initA; 32 | b[i] = config.initB; 33 | c[i] = config.initC; 34 | }); 35 | } 36 | 37 | @Override 38 | public void copy() { 39 | IntStream.range(0, config.options.arraysize) // 40 | .parallel() 41 | .forEach(i -> c[i] = a[i]); 42 | } 43 | 44 | @Override 45 | public void mul() { 46 | IntStream.range(0, config.options.arraysize) // 47 | .parallel() 48 | .forEach(i -> b[i] = config.scalar * c[i]); 49 | } 50 | 51 | @Override 52 | public void add() { 53 | IntStream.range(0, config.options.arraysize) // 54 | .parallel() 55 | .forEach(i -> c[i] = a[i] + b[i]); 56 | } 57 | 58 | @Override 59 | public void triad() { 60 | IntStream.range(0, config.options.arraysize) // 61 | .parallel() 62 | .forEach(i -> a[i] = b[i] + config.scalar * c[i]); 63 | } 64 | 65 | @Override 66 | public void nstream() { 67 | IntStream.range(0, config.options.arraysize) // 68 | .parallel() 69 | .forEach(i -> a[i] += b[i] + config.scalar * c[i]); 70 | } 71 | 72 | @Override 73 | public Double dot() { 74 | return IntStream.range(0, 
config.options.arraysize) 75 | .parallel() 76 | .mapToDouble(i -> a[i] * b[i]) 77 | .reduce(0f, Double::sum); 78 | } 79 | 80 | @Override 81 | public Data readArrays() { 82 | return new Data<>(boxed(a), boxed(b), boxed(c)); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/SpecialisedFloatStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | import java.util.stream.IntStream; 6 | import javastream.JavaStream; 7 | import javastream.Main.Config; 8 | 9 | final class SpecialisedFloatStream extends JavaStream { 10 | 11 | private final float[] a, b, c; 12 | 13 | SpecialisedFloatStream(Config config) { 14 | super(config); 15 | this.a = new float[config.options.arraysize]; 16 | this.b = new float[config.options.arraysize]; 17 | this.c = new float[config.options.arraysize]; 18 | } 19 | 20 | @Override 21 | public List listDevices() { 22 | return Collections.singletonList("JVM"); 23 | } 24 | 25 | @Override 26 | public void initArrays() { 27 | IntStream.range(0, config.options.arraysize) // 28 | .parallel() 29 | .forEach( 30 | i -> { 31 | a[i] = config.initA; 32 | b[i] = config.initB; 33 | c[i] = config.initC; 34 | }); 35 | } 36 | 37 | @Override 38 | public void copy() { 39 | IntStream.range(0, config.options.arraysize) // 40 | .parallel() 41 | .forEach(i -> c[i] = a[i]); 42 | } 43 | 44 | @Override 45 | public void mul() { 46 | IntStream.range(0, config.options.arraysize) // 47 | .parallel() 48 | .forEach(i -> b[i] = config.scalar * c[i]); 49 | } 50 | 51 | @Override 52 | public void add() { 53 | IntStream.range(0, config.options.arraysize) // 54 | .parallel() 55 | .forEach(i -> c[i] = a[i] + b[i]); 56 | } 57 | 58 | @Override 59 | public void triad() { 60 | IntStream.range(0, config.options.arraysize) // 61 | .parallel() 62 | .forEach(i -> a[i] 
= b[i] + config.scalar * c[i]); 63 | } 64 | 65 | @Override 66 | public void nstream() { 67 | IntStream.range(0, config.options.arraysize) // 68 | .parallel() 69 | .forEach(i -> a[i] += b[i] + config.scalar * c[i]); 70 | } 71 | 72 | @Override 73 | public Float dot() { 74 | return IntStream.range(0, config.options.arraysize) // 75 | .parallel() 76 | .mapToObj(i -> a[i] * b[i]) // XXX there isn't a specialised Stream for floats 77 | .reduce(0f, Float::sum); 78 | } 79 | 80 | @Override 81 | public Data readArrays() { 82 | return new Data<>(boxed(a), boxed(b), boxed(c)); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainDoubleStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | import javastream.JavaStream; 6 | import javastream.Main.Config; 7 | 8 | final class SpecialisedPlainDoubleStream extends JavaStream { 9 | 10 | private final double[] a; 11 | private final double[] b; 12 | private final double[] c; 13 | 14 | SpecialisedPlainDoubleStream(Config config) { 15 | super(config); 16 | this.a = new double[config.options.arraysize]; 17 | this.b = new double[config.options.arraysize]; 18 | this.c = new double[config.options.arraysize]; 19 | } 20 | 21 | @Override 22 | public List listDevices() { 23 | return Collections.singletonList("JVM"); 24 | } 25 | 26 | @Override 27 | public void initArrays() { 28 | for (int i = 0; i < config.options.arraysize; i++) { 29 | a[i] = config.initA; 30 | b[i] = config.initB; 31 | c[i] = config.initC; 32 | } 33 | } 34 | 35 | @SuppressWarnings("ManualArrayCopy") 36 | @Override 37 | public void copy() { 38 | for (int i = 0; i < config.options.arraysize; i++) { 39 | c[i] = a[i]; 40 | } 41 | } 42 | 43 | @Override 44 | public void mul() { 45 | for (int i = 0; i < config.options.arraysize; i++) { 46 
| b[i] = config.scalar * c[i]; 47 | } 48 | } 49 | 50 | @Override 51 | public void add() { 52 | for (int i = 0; i < config.options.arraysize; i++) { 53 | c[i] = a[i] + b[i]; 54 | } 55 | } 56 | 57 | @Override 58 | public void triad() { 59 | for (int i = 0; i < config.options.arraysize; i++) { 60 | a[i] = b[i] + config.scalar * c[i]; 61 | } 62 | } 63 | 64 | @Override 65 | public void nstream() { 66 | for (int i = 0; i < config.options.arraysize; i++) { 67 | a[i] += b[i] + config.scalar * c[i]; 68 | } 69 | } 70 | 71 | @Override 72 | public Double dot() { 73 | double acc = 0f; 74 | for (int i = 0; i < config.options.arraysize; i++) { 75 | acc += a[i] * b[i]; 76 | } 77 | return acc; 78 | } 79 | 80 | @Override 81 | public Data readArrays() { 82 | return new Data<>(boxed(a), boxed(b), boxed(c)); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/jdk/SpecialisedPlainFloatStream.java: -------------------------------------------------------------------------------- 1 | package javastream.jdk; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | import javastream.JavaStream; 6 | import javastream.Main.Config; 7 | 8 | final class SpecialisedPlainFloatStream extends JavaStream { 9 | 10 | private final float[] a; 11 | private final float[] b; 12 | private final float[] c; 13 | 14 | SpecialisedPlainFloatStream(Config config) { 15 | super(config); 16 | this.a = new float[config.options.arraysize]; 17 | this.b = new float[config.options.arraysize]; 18 | this.c = new float[config.options.arraysize]; 19 | } 20 | 21 | @Override 22 | public List listDevices() { 23 | return Collections.singletonList("JVM"); 24 | } 25 | 26 | @Override 27 | public void initArrays() { 28 | for (int i = 0; i < config.options.arraysize; i++) { 29 | a[i] = config.initA; 30 | b[i] = config.initB; 31 | c[i] = config.initC; 32 | } 33 | } 34 | 35 | @SuppressWarnings("ManualArrayCopy") 36 | @Override 37 | 
public void copy() { 38 | for (int i = 0; i < config.options.arraysize; i++) { 39 | c[i] = a[i]; 40 | } 41 | } 42 | 43 | @Override 44 | public void mul() { 45 | for (int i = 0; i < config.options.arraysize; i++) { 46 | b[i] = config.scalar * c[i]; 47 | } 48 | } 49 | 50 | @Override 51 | public void add() { 52 | for (int i = 0; i < config.options.arraysize; i++) { 53 | c[i] = a[i] + b[i]; 54 | } 55 | } 56 | 57 | @Override 58 | public void triad() { 59 | for (int i = 0; i < config.options.arraysize; i++) { 60 | a[i] = b[i] + config.scalar * c[i]; 61 | } 62 | } 63 | 64 | @Override 65 | public void nstream() { 66 | for (int i = 0; i < config.options.arraysize; i++) { 67 | a[i] += b[i] + config.scalar * c[i]; 68 | } 69 | } 70 | 71 | @Override 72 | public Float dot() { 73 | float acc = 0f; 74 | for (int i = 0; i < config.options.arraysize; i++) { 75 | acc += a[i] * b[i]; 76 | } 77 | return acc; 78 | } 79 | 80 | @Override 81 | public Data readArrays() { 82 | return new Data<>(boxed(a), boxed(b), boxed(c)); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/tornadovm/GenericTornadoVMStream.java: -------------------------------------------------------------------------------- 1 | package javastream.tornadovm; 2 | 3 | import java.util.List; 4 | import java.util.stream.Collectors; 5 | import javastream.JavaStream; 6 | import javastream.Main.Config; 7 | import uk.ac.manchester.tornado.api.TornadoExecutionPlan; 8 | import uk.ac.manchester.tornado.api.TornadoRuntimeInterface; 9 | import uk.ac.manchester.tornado.api.common.TornadoDevice; 10 | import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; 11 | 12 | abstract class GenericTornadoVMStream extends JavaStream { 13 | 14 | protected final TornadoDevice device; 15 | 16 | protected TornadoExecutionPlan copyTask; 17 | protected TornadoExecutionPlan mulTask; 18 | protected TornadoExecutionPlan addTask; 19 | protected 
TornadoExecutionPlan triadTask; 20 | protected TornadoExecutionPlan nstreamTask; 21 | protected TornadoExecutionPlan dotTask; 22 | 23 | GenericTornadoVMStream(Config config) { 24 | super(config); 25 | 26 | try { 27 | TornadoRuntimeInterface runtime = TornadoRuntime.getTornadoRuntime(); 28 | List devices = TornadoVMStreams.enumerateDevices(runtime); 29 | device = devices.get(config.options.device); 30 | 31 | if (config.options.isVerboseBenchmark()) { 32 | System.out.println("Using TornadoVM device:"); 33 | System.out.println(" - Name : " + device.getDescription()); 34 | System.out.println(" - Id : " + device.getDeviceName()); 35 | System.out.println(" - Platform : " + device.getPlatformName()); 36 | System.out.println(" - Backend : " + device.getTornadoVMBackend().name()); 37 | } 38 | } catch (Throwable e) { 39 | throw new RuntimeException( 40 | "Unable to initialise TornadoVM, make sure you are running the binary with the `tornado -jar ...` wrapper and not `java -jar ...`", 41 | e); 42 | } 43 | } 44 | 45 | @Override 46 | public List listDevices() { 47 | return TornadoVMStreams.enumerateDevices(TornadoRuntime.getTornadoRuntime()).stream() 48 | .map(d -> d.getDescription() + "(" + d.getDeviceName() + ")") 49 | .collect(Collectors.toList()); 50 | } 51 | 52 | @Override 53 | public void initArrays() { 54 | this.copyTask.withWarmUp(); 55 | this.mulTask.withWarmUp(); 56 | this.addTask.withWarmUp(); 57 | this.triadTask.withWarmUp(); 58 | this.nstreamTask.withWarmUp(); 59 | this.dotTask.withWarmUp(); 60 | } 61 | 62 | @Override 63 | public void copy() { 64 | this.copyTask.execute(); 65 | } 66 | 67 | @Override 68 | public void mul() { 69 | this.mulTask.execute(); 70 | } 71 | 72 | @Override 73 | public void add() { 74 | this.addTask.execute(); 75 | } 76 | 77 | @Override 78 | public void triad() { 79 | this.triadTask.execute(); 80 | } 81 | 82 | @Override 83 | public void nstream() { 84 | this.nstreamTask.execute(); 85 | } 86 | 87 | protected abstract T getSum(); 88 | 89 | 
@Override 90 | public T dot() { 91 | this.dotTask.execute(); 92 | return getSum(); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/java/java-stream/src/main/java/javastream/tornadovm/TornadoVMStreams.java: -------------------------------------------------------------------------------- 1 | package javastream.tornadovm; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | import java.util.function.Function; 6 | import java.util.stream.Collectors; 7 | import java.util.stream.IntStream; 8 | import javastream.JavaStream; 9 | import javastream.Main.Config; 10 | import uk.ac.manchester.tornado.api.TornadoRuntimeInterface; 11 | import uk.ac.manchester.tornado.api.common.Event; 12 | import uk.ac.manchester.tornado.api.common.TornadoDevice; 13 | import uk.ac.manchester.tornado.api.memory.TornadoDeviceObjectState; 14 | import uk.ac.manchester.tornado.api.memory.TornadoGlobalObjectState; 15 | import uk.ac.manchester.tornado.api.runtime.TornadoRuntime; 16 | 17 | public final class TornadoVMStreams { 18 | 19 | private TornadoVMStreams() {} 20 | 21 | static void allocAndXferToDevice(TornadoDevice device, Object... xs) { 22 | for (Object x : xs) { 23 | TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); 24 | device.allocateObjects( 25 | new Object[] {x}, 0, new TornadoDeviceObjectState[] {state.getDeviceState(device)}); 26 | List writeEvent = device.ensurePresent(x, state.getDeviceState(device), null, 0, 0); 27 | if (writeEvent != null) writeEvent.forEach(e -> device.resolveEvent(e).waitOn()); 28 | } 29 | } 30 | 31 | static void xferFromDevice(TornadoDevice device, Object... 
xs) { 32 | Arrays.stream(xs) 33 | .map( 34 | x -> { 35 | TornadoGlobalObjectState state = TornadoRuntime.getTornadoRuntime().resolveObject(x); 36 | return device.resolveEvent( 37 | device.streamOut(x, 0, state.getDeviceState(device), null)); 38 | }) 39 | .collect(Collectors.toList()) 40 | .forEach(Event::waitOn); 41 | } 42 | 43 | static List enumerateDevices(TornadoRuntimeInterface runtime) { 44 | return IntStream.range(0, runtime.getNumDrivers()) 45 | .mapToObj(runtime::getDriver) 46 | .flatMap(d -> IntStream.range(0, d.getDeviceCount()).mapToObj(d::getDevice)) 47 | .collect(Collectors.toList()); 48 | } 49 | 50 | public static final Function, JavaStream> FLOAT = SpecialisedFloat::new; 51 | public static final Function, JavaStream> DOUBLE = SpecialisedDouble::new; 52 | } 53 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/.JuliaFormatter.toml: -------------------------------------------------------------------------------- 1 | indent = 2 2 | margin = 100 -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/.gitignore: -------------------------------------------------------------------------------- 1 | *.jl.cov 2 | *.jl.*.cov 3 | *.jl.mem 4 | /docs/build/ 5 | /docs/Manifest.toml -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/AMDGPU/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" 3 | ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" 4 | Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" 5 | 6 | [compat] 7 | julia = "1.9" 8 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/CUDA/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | ArgParse = 
"c7e460c6-2fb9-53a9-8c5b-16f535851c63" 3 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 4 | Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" 5 | 6 | [compat] 7 | julia = "1.9" 8 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/KernelAbstractions/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" 3 | ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" 4 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 5 | CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" 6 | KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" 7 | Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" 8 | ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" 9 | 10 | [compat] 11 | julia = "1.9" 12 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/Project.toml: -------------------------------------------------------------------------------- 1 | name = "JuliaStream" 2 | uuid = "1bdcc9b7-f5ed-4705-bc7b-be1b748ec681" 3 | authors = ["Wei-Chen Lin "] 4 | version = "4.0" 5 | 6 | [deps] 7 | AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" 8 | ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" 9 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 10 | CUDAKernels = "72cfdca4-0801-4ab0-bf6a-d52aa10adc57" 11 | Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" 12 | ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" 13 | KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" 14 | Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" 15 | ROCKernels = "7eb9e9f0-4bd3-4c4c-8bef-26bd9629d9b9" 16 | oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" 17 | 18 | [compat] 19 | julia = "1.9" 20 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/README.md: -------------------------------------------------------------------------------- 1 | 
JuliaStream.jl 2 | ============== 3 | 4 | This is an implementation of BabelStream in Julia which contains the following variants: 5 | 6 | * `PlainStream.jl` - Single threaded `for` 7 | * `ThreadedStream.jl` - Threaded implementation with `Threads.@threads` macros 8 | * `DistributedStream.jl` - Process based parallelism with `@distributed` macros 9 | * `CUDAStream.jl` - Direct port of BabelStream's native CUDA implementation using [CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) 10 | * `AMDGPUStream.jl` - Direct port of BabelStream's native HIP implementation using [AMDGPU.jl](https://github.com/JuliaGPU/AMDGPU.jl) 11 | * `oneAPIStream.jl` - Direct port of BabelStream's native SYCL implementation using [oneAPI.jl](https://github.com/JuliaGPU/oneAPI.jl) 12 | * `KernelAbstractions.jl` - Direct port of miniBUDE's native CUDA implementation using [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl) 13 | 14 | ### Build & Run 15 | 16 | Prerequisites 17 | 18 | * Julia >= 1.6+ 19 | 20 | A set of reduced dependency projects are available for the following backend and implementations: 21 | 22 | * `AMDGPU` supports: 23 | - `AMDGPUStream.jl` 24 | * `CUDA` supports: 25 | - `CUDAStream.jl` 26 | * `oneAPI` supports: 27 | - `oneAPIStream.jl` 28 | * `KernelAbstractions` supports: 29 | - `KernelAbstractionsStream.jl` 30 | * `Threaded` supports: 31 | - `PlainStream.jl` 32 | - `ThreadedStream.jl` 33 | - `DistributedStream.jl` 34 | 35 | With Julia on path, run your selected benchmark with: 36 | 37 | ```shell 38 | > cd JuliaStream.jl 39 | > julia --project= -e 'import Pkg; Pkg.instantiate()' # only required on first run 40 | > julia --project= src/Stream.jl 41 | ``` 42 | 43 | For example. to run the CUDA implementation: 44 | 45 | ```shell 46 | > cd JuliaStream.jl 47 | > julia --project=CUDA -e 'import Pkg; Pkg.instantiate()' 48 | > julia --project=CUDA src/CUDAStream.jl 49 | ``` 50 | 51 | **Important:** 52 | * Julia is 1-indexed, so N >= 1 in `--device N`. 
53 | * Thread count for `ThreadedStream` must be set via the `JULIA_NUM_THREADS` environment variable (e.g `export JULIA_NUM_THREADS=$(nproc)`) otherwise it defaults to 1. 54 | * Worker count for `DistributedStream` is set with `-p ` as per the [documentation](https://docs.julialang.org/en/v1/manual/distributed-computing). 55 | * Certain implementations such as CUDA and AMDGPU will do hardware detection at runtime and may download and/or compile further software packages for the platform. 56 | 57 | *** 58 | 59 | Alternatively, the top-level project `Project.toml` contains all dependencies needed to run all implementations in `src`. 60 | There may be instances where some packages are locked to an older version because of transitive dependency requirements. 61 | 62 | To run the benchmark using the top-level project, run the benchmark with: 63 | ```shell 64 | > cd JuliaStream.jl 65 | > julia --project -e 'import Pkg; Pkg.instantiate()' 66 | > julia --project src/Stream.jl 67 | ``` -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/Threaded/Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | julia_version = "1.9.3" 4 | manifest_format = "2.0" 5 | project_hash = "fbff310f722a52622a273a48a8a6b3b64f06b029" 6 | 7 | [[deps.ArgParse]] 8 | deps = ["Logging", "TextWrap"] 9 | git-tree-sha1 = "3102bce13da501c9104df33549f511cd25264d7d" 10 | uuid = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" 11 | version = "1.1.4" 12 | 13 | [[deps.Logging]] 14 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" 15 | 16 | [[deps.OrderedCollections]] 17 | git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" 18 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" 19 | version = "1.6.2" 20 | 21 | [[deps.Parameters]] 22 | deps = ["OrderedCollections", "UnPack"] 23 | git-tree-sha1 = "34c0e9ad262e5f7fc75b10a9952ca7692cfc5fbe" 24 | 
uuid = "d96e819e-fc66-5662-9728-84c9c7592b0a" 25 | version = "0.12.3" 26 | 27 | [[deps.TextWrap]] 28 | git-tree-sha1 = "9250ef9b01b66667380cf3275b3f7488d0e25faf" 29 | uuid = "b718987f-49a8-5099-9789-dcd902bef87d" 30 | version = "1.0.1" 31 | 32 | [[deps.UnPack]] 33 | git-tree-sha1 = "387c1f73762231e86e0c9c5443ce3b4a0a9a0c2b" 34 | uuid = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" 35 | version = "1.0.2" 36 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/Threaded/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" 3 | Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" 4 | 5 | [compat] 6 | julia = "1.9" 7 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/oneAPI/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" 3 | Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" 4 | oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" 5 | 6 | [compat] 7 | julia = "1.9" 8 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/src/DistributedStream.jl: -------------------------------------------------------------------------------- 1 | using Distributed 2 | 3 | @everywhere using Pkg 4 | @everywhere Pkg.activate("."; io = devnull) # don't spam `Activating environment at...` 5 | @everywhere include("StreamData.jl") 6 | @everywhere include("Stream.jl") 7 | @everywhere using SharedArrays 8 | @everywhere const SharedArrayData = StreamData{T,SharedArray{T}} where {T} 9 | 10 | function devices()::Vector{DeviceWithRepr} 11 | return [(undef, "CPU (localhost) $(nworkers())P", "Distributed.jl")] 12 | end 13 | 14 | function make_stream( 15 | arraysize::Int, 16 | scalar::T, 17 | _::DeviceWithRepr, 18 | silent::Bool, 19 | 
)::Tuple{SharedArrayData{T},Nothing} where {T} 20 | 21 | if !silent 22 | println("Using max $(nworkers()) process(es) + 1 master") 23 | end 24 | return ( 25 | SharedArrayData{T}( 26 | SharedArray{T}(arraysize), 27 | SharedArray{T}(arraysize), 28 | SharedArray{T}(arraysize), 29 | scalar, 30 | arraysize, 31 | ), 32 | nothing, 33 | ) 34 | end 35 | 36 | function init_arrays!(data::SharedArrayData{T}, _, init::Tuple{T,T,T}) where {T} 37 | 38 | @sync @distributed for i = 1:data.size 39 | @inbounds data.a[i] = init[1] 40 | @inbounds data.b[i] = init[2] 41 | @inbounds data.c[i] = init[3] 42 | end 43 | end 44 | 45 | function copy!(data::SharedArrayData{T}, _) where {T} 46 | @sync @distributed for i = 1:data.size 47 | @inbounds data.c[i] = data.a[i] 48 | end 49 | end 50 | 51 | function mul!(data::SharedArrayData{T}, _) where {T} 52 | @sync @distributed for i = 1:data.size 53 | @inbounds data.b[i] = data.scalar * data.c[i] 54 | end 55 | end 56 | 57 | function add!(data::SharedArrayData{T}, _) where {T} 58 | @sync @distributed for i = 1:data.size 59 | @inbounds data.c[i] = data.a[i] + data.b[i] 60 | end 61 | end 62 | 63 | function triad!(data::SharedArrayData{T}, _) where {T} 64 | @sync @distributed for i = 1:data.size 65 | @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) 66 | end 67 | end 68 | 69 | function nstream!(data::SharedArrayData{T}, _) where {T} 70 | @sync @distributed for i = 1:data.size 71 | @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] 72 | end 73 | end 74 | 75 | function dot(data::SharedArrayData{T}, _) where {T} 76 | return @distributed (+) for i = 1:data.size 77 | @inbounds data.a[i] * data.b[i] 78 | end 79 | end 80 | 81 | function read_data(data::SharedArrayData{T}, _)::VectorData{T} where {T} 82 | return VectorData{T}(data.a, data.b, data.c, data.scalar, data.size) 83 | end 84 | 85 | main() -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/src/JuliaStream.jl: 
-------------------------------------------------------------------------------- 1 | module JuliaStream 2 | end 3 | 4 | println("Please run benchmarks directly via `julia --project src/Stream.jl`") -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/src/PlainStream.jl: -------------------------------------------------------------------------------- 1 | include("Stream.jl") 2 | 3 | function devices()::Vector{DeviceWithRepr} 4 | return [(undef, "CPU", "Palin")] 5 | end 6 | 7 | function make_stream( 8 | arraysize::Int, 9 | scalar::T, 10 | _::DeviceWithRepr, 11 | silent::Bool, 12 | )::Tuple{VectorData{T},Nothing} where {T} 13 | return ( 14 | VectorData{T}( 15 | Vector{T}(undef, arraysize), 16 | Vector{T}(undef, arraysize), 17 | Vector{T}(undef, arraysize), 18 | scalar, 19 | arraysize, 20 | ), 21 | nothing 22 | ) 23 | end 24 | 25 | function init_arrays!(data::VectorData{T}, _, init::Tuple{T,T,T}) where {T} 26 | for i = 1:data.size 27 | @inbounds data.a[i] = init[1] 28 | @inbounds data.b[i] = init[2] 29 | @inbounds data.c[i] = init[3] 30 | end 31 | end 32 | 33 | function copy!(data::VectorData{T}, _) where {T} 34 | for i = 1:data.size 35 | @inbounds data.c[i] = data.a[i] 36 | end 37 | end 38 | 39 | function mul!(data::VectorData{T}, _) where {T} 40 | for i = 1:data.size 41 | @inbounds data.b[i] = data.scalar * data.c[i] 42 | end 43 | end 44 | 45 | function add!(data::VectorData{T}, _) where {T} 46 | for i = 1:data.size 47 | @inbounds data.c[i] = data.a[i] + data.b[i] 48 | end 49 | end 50 | 51 | function triad!(data::VectorData{T}, _) where {T} 52 | for i = 1:data.size 53 | @inbounds data.a[i] = data.b[i] + (data.scalar * data.c[i]) 54 | end 55 | end 56 | 57 | function nstream!(data::VectorData{T}, _) where {T} 58 | for i = 1:data.size 59 | @inbounds data.a[i] += data.b[i] + data.scalar * data.c[i] 60 | end 61 | end 62 | 63 | function dot(data::VectorData{T}, _) where {T} 64 | sum = zero(T) 65 | for i = 1:data.size 66 
| @inbounds sum += data.a[i] * data.b[i] 67 | end 68 | return sum 69 | end 70 | 71 | function read_data(data::VectorData{T}, _)::VectorData{T} where {T} 72 | return data 73 | end 74 | 75 | main() -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/src/StreamData.jl: -------------------------------------------------------------------------------- 1 | struct StreamData{T,C<:AbstractArray{T}} 2 | a::C 3 | b::C 4 | c::C 5 | scalar::T 6 | size::Int 7 | end 8 | -------------------------------------------------------------------------------- /src/julia/JuliaStream.jl/update_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC2034 disable=SC2153 3 | 4 | for BACKEND in "." "AMDGPU" "CUDA" "oneAPI" "Threaded" "KernelAbstractions" 5 | do 6 | echo "Updating subproject $BACKEND" 7 | julia --project="$BACKEND" -e 'import Pkg; Pkg.resolve(); Pkg.instantiate(); Pkg.update(); Pkg.gc();' 8 | done 9 | -------------------------------------------------------------------------------- /src/kokkos/KokkosStream.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 2 | // University of Bristol HPC 3 | // 4 | // For full license terms please see the LICENSE file distributed with this 5 | // source code 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include "Stream.h" 14 | 15 | #define IMPLEMENTATION_STRING "Kokkos" 16 | 17 | template 18 | class KokkosStream : public Stream 19 | { 20 | protected: 21 | // Size of arrays 22 | int array_size; 23 | 24 | // Device side pointers to arrays 25 | typename Kokkos::View* d_a; 26 | typename Kokkos::View* d_b; 27 | typename Kokkos::View* d_c; 28 | typename Kokkos::View::HostMirror* hm_a; 29 | typename Kokkos::View::HostMirror* hm_b; 30 | typename Kokkos::View::HostMirror* hm_c; 31 | 32 | public: 33 | 
34 | KokkosStream(const int, const int); 35 | ~KokkosStream(); 36 | 37 | virtual void copy() override; 38 | virtual void add() override; 39 | virtual void mul() override; 40 | virtual void triad() override; 41 | virtual void nstream() override; 42 | virtual T dot() override; 43 | 44 | virtual void init_arrays(T initA, T initB, T initC) override; 45 | virtual void read_arrays( 46 | std::vector& a, std::vector& b, std::vector& c) override; 47 | }; 48 | 49 | -------------------------------------------------------------------------------- /src/kokkos/model.cmake: -------------------------------------------------------------------------------- 1 | register_flag_optional(CMAKE_CXX_COMPILER 2 | "Any CXX compiler that is supported by CMake detection and RAJA. 3 | See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" 4 | "c++") 5 | 6 | register_flag_optional(KOKKOS_IN_TREE 7 | "Absolute path to the *source* distribution directory of Kokkos. 8 | Remember to append Kokkos specific flags as well, for example: 9 | -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... 10 | See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") 11 | 12 | register_flag_optional(KOKKOS_IN_PACKAGE 13 | "Absolute path to package R-Path containing Kokkos libs. 14 | Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." 
"") 15 | 16 | # compiler vendor and arch specific flags 17 | set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) 18 | 19 | macro(setup) 20 | 21 | set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 22 | cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md 23 | 24 | 25 | if (EXISTS "${KOKKOS_IN_TREE}") 26 | message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") 27 | add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) 28 | register_link_library(Kokkos::kokkos) 29 | elseif (EXISTS "${KOKKOS_IN_PACKAGE}") 30 | message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") 31 | set (Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") 32 | find_package(Kokkos REQUIRED) 33 | register_link_library(Kokkos::kokkos) 34 | else() 35 | message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") 36 | endif () 37 | 38 | register_append_compiler_and_arch_specific_cxx_flags( 39 | KOKKOS_FLAGS_CPU 40 | ${CMAKE_CXX_COMPILER_ID} 41 | ${CMAKE_SYSTEM_PROCESSOR} 42 | ) 43 | 44 | endmacro() 45 | -------------------------------------------------------------------------------- /src/legacy/HCStream.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #include "Stream.h" 15 | #include "hc.hpp" 16 | 17 | #define IMPLEMENTATION_STRING "HC" 18 | 19 | template 20 | class HCStream : public Stream 21 | { 22 | protected: 23 | // Size of arrays 24 | int array_size; 25 | // Device side pointers to arrays 26 | hc::array d_a; 27 | hc::array d_b; 28 | hc::array d_c; 29 | 30 | 31 | public: 32 | 33 | HCStream(const int, const int); 34 | ~HCStream(); 35 | 36 | virtual void copy() override; 
37 | virtual void add() override; 38 | virtual void mul() override; 39 | virtual void triad() override; 40 | virtual T dot() override; 41 | T dot_impl(); 42 | 43 | virtual void init_arrays(T initA, T initB, T initC) override; 44 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 45 | 46 | }; 47 | -------------------------------------------------------------------------------- /src/ocl/OCLStream.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #define CL_HPP_ENABLE_EXCEPTIONS 15 | #define CL_HPP_TARGET_OPENCL_VERSION 120 16 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 17 | 18 | #include "CL/cl2.hpp" 19 | 20 | #include "Stream.h" 21 | 22 | #define IMPLEMENTATION_STRING "OpenCL" 23 | 24 | template 25 | class OCLStream : public Stream 26 | { 27 | protected: 28 | // Size of arrays 29 | int array_size; 30 | 31 | // Host array for partial sums for dot kernel 32 | std::vector sums; 33 | 34 | // OpenCL objects 35 | cl::Device device; 36 | cl::Context context; 37 | cl::CommandQueue queue; 38 | 39 | // Device side pointers to arrays 40 | cl::Buffer d_a; 41 | cl::Buffer d_b; 42 | cl::Buffer d_c; 43 | cl::Buffer d_sum; 44 | 45 | cl::KernelFunctor *init_kernel; 46 | cl::KernelFunctor *copy_kernel; 47 | cl::KernelFunctor * mul_kernel; 48 | cl::KernelFunctor *add_kernel; 49 | cl::KernelFunctor *triad_kernel; 50 | cl::KernelFunctor *nstream_kernel; 51 | cl::KernelFunctor *dot_kernel; 52 | 53 | // NDRange configuration for the dot kernel 54 | size_t dot_num_groups; 55 | size_t dot_wgsize; 56 | 57 | public: 58 | 59 | OCLStream(const int, const int); 60 | ~OCLStream(); 61 | 62 | virtual void copy() override; 63 | virtual void add() 
override; 64 | virtual void mul() override; 65 | virtual void triad() override; 66 | virtual void nstream() override; 67 | virtual T dot() override; 68 | 69 | virtual void init_arrays(T initA, T initB, T initC) override; 70 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 71 | 72 | }; 73 | 74 | // Populate the devices list 75 | void getDeviceList(void); 76 | -------------------------------------------------------------------------------- /src/ocl/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection" 4 | "c++") 5 | 6 | register_flag_optional(OpenCL_LIBRARY 7 | "Path to OpenCL library, usually called libOpenCL.so" 8 | "${OpenCL_LIBRARY}") 9 | 10 | 11 | macro(setup) 12 | setup_opencl_header_includes() 13 | find_package(OpenCL REQUIRED) 14 | register_link_library(OpenCL::OpenCL) 15 | endmacro() 16 | 17 | -------------------------------------------------------------------------------- /src/omp/OMPStream.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | #include "Stream.h" 14 | 15 | #include 16 | 17 | #define IMPLEMENTATION_STRING "OpenMP" 18 | 19 | template 20 | class OMPStream : public Stream 21 | { 22 | protected: 23 | // Size of arrays 24 | int array_size; 25 | 26 | // Device side pointers 27 | T *a; 28 | T *b; 29 | T *c; 30 | 31 | public: 32 | OMPStream(const int, int); 33 | ~OMPStream(); 34 | 35 | virtual void copy() override; 36 | virtual void add() override; 37 | virtual void mul() override; 38 | virtual void triad() override; 39 | virtual void nstream() override; 40 | virtual T 
dot() override; 41 | 42 | virtual void init_arrays(T initA, T initB, T initC) override; 43 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 44 | 45 | 46 | 47 | }; 48 | -------------------------------------------------------------------------------- /src/raja/RAJAStream.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 2 | // University of Bristol HPC 3 | // 4 | // For full license terms please see the LICENSE file distributed with this 5 | // source code 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include "RAJA/RAJA.hpp" 12 | 13 | #include "Stream.h" 14 | 15 | #define IMPLEMENTATION_STRING "RAJA" 16 | 17 | #ifdef RAJA_TARGET_CPU 18 | // TODO verify old and new templates are semantically equal 19 | //typedef RAJA::ExecPolicy< 20 | // RAJA::seq_segit, 21 | // RAJA::omp_parallel_for_exec> policy; 22 | 23 | typedef RAJA::omp_parallel_for_exec policy; 24 | typedef RAJA::omp_reduce reduce_policy; 25 | #else 26 | const size_t block_size = 128; 27 | // TODO verify old and new templates are semantically equal 28 | //typedef RAJA::IndexSet::ExecPolicy< 29 | // RAJA::seq_segit, 30 | // RAJA::cuda_exec> policy; 31 | //typedef RAJA::cuda_reduce reduce_policy; 32 | typedef RAJA::cuda_exec policy; 33 | typedef RAJA::cuda_reduce reduce_policy; 34 | #endif 35 | 36 | using RAJA::RangeSegment; 37 | 38 | 39 | template 40 | class RAJAStream : public Stream 41 | { 42 | protected: 43 | // Size of arrays 44 | const int array_size; 45 | const RangeSegment range; 46 | 47 | // Device side pointers to arrays 48 | T* d_a; 49 | T* d_b; 50 | T* d_c; 51 | 52 | public: 53 | 54 | RAJAStream(const int, const int); 55 | ~RAJAStream(); 56 | 57 | virtual void copy() override; 58 | virtual void add() override; 59 | virtual void mul() override; 60 | virtual void triad() override; 61 | virtual void nstream() override; 62 | virtual T dot() override; 63 | 64 | 
virtual void init_arrays(T initA, T initB, T initC) override; 65 | virtual void read_arrays( 66 | std::vector& a, std::vector& b, std::vector& c) override; 67 | }; 68 | 69 | -------------------------------------------------------------------------------- /src/rust/rust-stream/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] -------------------------------------------------------------------------------- /src/rust/rust-stream/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .idea 3 | -------------------------------------------------------------------------------- /src/rust/rust-stream/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust-stream" 3 | version = "5.0.0" 4 | authors = ["Wei-Chen Lin "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | num-traits = "0.2.15" 11 | structopt = "0.3.26" 12 | tabular = "0.2.0" 13 | rayon = "1.5.3" 14 | crossbeam = "0.8.2" 15 | num_cpus = "1.13.1" 16 | rustversion = "1.0.9" 17 | libc = "0.2.134" 18 | core_affinity = "0.5.10" 19 | colour = "0.6.0" 20 | 21 | [dev-dependencies] 22 | rstest = "0.13.0" 23 | 24 | [build-dependencies] 25 | rustversion = "1.0" 26 | 27 | [profile.dev] 28 | opt-level = 2 29 | overflow-checks = true 30 | 31 | 32 | [profile.release] 33 | opt-level = 3 34 | lto = "thin" # fully enabling this (i.e true) negatively affects performance as tested on both AMD and Intel 35 | -------------------------------------------------------------------------------- /src/rust/rust-stream/rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 100 2 | hard_tabs = false 3 | tab_spaces = 2 4 | newline_style = "Auto" 5 | 
use_small_heuristics = "Max" 6 | indent_style = "Block" 7 | wrap_comments = false 8 | format_code_in_doc_comments = false 9 | comment_width = 80 10 | normalize_comments = false 11 | normalize_doc_attributes = false 12 | license_template_path = "" 13 | format_strings = false 14 | format_macro_matchers = false 15 | format_macro_bodies = true 16 | empty_item_single_line = true 17 | struct_lit_single_line = true 18 | fn_single_line = true 19 | where_single_line = true 20 | imports_indent = "Block" 21 | imports_layout = "Mixed" 22 | imports_granularity = "Preserve" 23 | group_imports = "Preserve" 24 | reorder_imports = true 25 | reorder_modules = true 26 | reorder_impl_items = false 27 | type_punctuation_density = "Wide" 28 | space_before_colon = false 29 | space_after_colon = true 30 | spaces_around_ranges = false 31 | binop_separator = "Front" 32 | remove_nested_parens = true 33 | combine_control_expr = true 34 | overflow_delimited_expr = false 35 | struct_field_align_threshold = 0 36 | enum_discrim_align_threshold = 0 37 | match_arm_blocks = true 38 | match_arm_leading_pipes = "Never" 39 | force_multiline_blocks = false 40 | fn_args_layout = "Compressed" 41 | brace_style = "PreferSameLine" 42 | control_brace_style = "AlwaysSameLine" 43 | trailing_semicolon = true 44 | trailing_comma = "Vertical" 45 | match_block_trailing_comma = false 46 | blank_lines_upper_bound = 1 47 | blank_lines_lower_bound = 0 48 | edition = "2015" 49 | version = "One" 50 | inline_attribute_width = 0 51 | merge_derives = true 52 | use_try_shorthand = false 53 | use_field_init_shorthand = false 54 | force_explicit_abi = true 55 | condense_wildcard_suffixes = false 56 | color = "Auto" 57 | required_version = "1.6.0" 58 | unstable_features = false 59 | disable_all_formatting = false 60 | skip_children = false 61 | hide_parse_errors = false 62 | error_on_line_overflow = false 63 | error_on_unformatted = false 64 | report_todo = "Never" 65 | report_fixme = "Never" 66 | ignore = [] 67 | emit_mode = 
"Files" 68 | make_backup = false 69 | -------------------------------------------------------------------------------- /src/rust/rust-stream/src/main.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | if !rust_stream::run(&std::env::args().collect::>()) { 3 | std::process::exit(1); 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /src/rust/rust-stream/src/plain_stream.rs: -------------------------------------------------------------------------------- 1 | use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; 2 | use core_affinity::CoreId; 3 | 4 | pub struct SerialDevice { 5 | pub(crate) pin: bool, 6 | } 7 | 8 | // single threaded version 9 | impl RustStream for StreamData { 10 | fn init_arrays(&mut self) { 11 | if self.device.pin { 12 | core_affinity::set_for_current( 13 | match core_affinity::get_core_ids().as_ref().map(|x| x.first()) { 14 | Some(Some(x)) => *x, 15 | _ => CoreId { id: 0 }, 16 | }, 17 | ); 18 | } 19 | self.a.fill(self.init.0); 20 | self.b.fill(self.init.1); 21 | self.c.fill(self.init.2); 22 | } 23 | 24 | fn copy(&mut self) { 25 | for i in 0..self.size { 26 | self.c[i] = self.a[i]; 27 | } 28 | } 29 | 30 | fn mul(&mut self) { 31 | for i in 0..self.size { 32 | self.b[i] = self.scalar * self.c[i]; 33 | } 34 | } 35 | 36 | fn add(&mut self) { 37 | for i in 0..self.size { 38 | self.c[i] = self.a[i] + self.b[i]; 39 | } 40 | } 41 | 42 | fn triad(&mut self) { 43 | for i in 0..self.size { 44 | self.a[i] = self.b[i] + self.scalar * self.c[i]; 45 | } 46 | } 47 | 48 | fn nstream(&mut self) { 49 | for i in 0..self.size { 50 | self.a[i] += self.b[i] + self.scalar * self.c[i]; 51 | } 52 | } 53 | 54 | fn dot(&mut self) -> T { 55 | let mut sum = T::default(); 56 | for i in 0..self.size { 57 | sum += self.a[i] * self.b[i]; 58 | } 59 | sum 60 | } 61 | } 62 | -------------------------------------------------------------------------------- 
/src/rust/rust-stream/src/rayon_stream.rs: -------------------------------------------------------------------------------- 1 | use std::iter::Sum; 2 | 3 | use rayon::prelude::*; 4 | use rayon::ThreadPool; 5 | 6 | use crate::stream::{AllocatorType, ArrayType, RustStream, StreamData}; 7 | 8 | pub struct RayonDevice { 9 | pub(crate) pool: ThreadPool, 10 | } 11 | 12 | // Rayon version, it should be semantically equal to the single threaded version 13 | impl RustStream 14 | for StreamData 15 | { 16 | fn init_arrays(&mut self) { 17 | let init = self.init; 18 | self.a.par_iter_mut().for_each(|v| *v = init.0); 19 | self.b.par_iter_mut().for_each(|v| *v = init.1); 20 | self.c.par_iter_mut().for_each(|v| *v = init.2); 21 | } 22 | 23 | fn copy(&mut self) { 24 | let a = &self.a; 25 | let c = &mut self.c; 26 | self.device.pool.install(|| { 27 | (*c).par_iter_mut().enumerate().for_each(|(i, c)| *c = a[i]); 28 | }); 29 | } 30 | 31 | fn mul(&mut self) { 32 | let scalar = self.scalar; 33 | let c = &self.c; 34 | let b = &mut self.b; 35 | self 36 | .device 37 | .pool 38 | .install(|| (*b).par_iter_mut().enumerate().for_each(|(i, b)| *b = scalar * c[i])); 39 | } 40 | 41 | fn add(&mut self) { 42 | let a = &self.a; 43 | let b = &self.b; 44 | let c = &mut self.c; 45 | self.device.pool.install(|| (*c).par_iter_mut().enumerate().for_each(|(i, c)| *c = a[i] + b[i])) 46 | } 47 | 48 | fn triad(&mut self) { 49 | let scalar = self.scalar; 50 | let a = &mut self.a; 51 | let b = &self.b; 52 | let c = &self.c; 53 | self 54 | .device 55 | .pool 56 | .install(|| (*a).par_iter_mut().enumerate().for_each(|(i, a)| *a = b[i] + scalar * c[i])) 57 | } 58 | 59 | fn nstream(&mut self) { 60 | let scalar = self.scalar; 61 | let a = &mut self.a; 62 | let b = &self.b; 63 | let c = &self.c; 64 | self 65 | .device 66 | .pool 67 | .install(|| (*a).par_iter_mut().enumerate().for_each(|(i, a)| *a += b[i] + scalar * c[i])) 68 | } 69 | 70 | fn dot(&mut self) -> T { 71 | let a = &self.a; 72 | let b = &self.b; 73 | 
self.device.pool.install(|| { 74 | (0..self.size).into_par_iter().fold(|| T::default(), |acc, i| acc + a[i] * b[i]).sum::() 75 | }) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/rust/rust-stream/tests/integration_test.rs: -------------------------------------------------------------------------------- 1 | use rstest::rstest; 2 | 3 | #[rstest] 4 | fn test_main( 5 | #[values(0, 1, 2, 3, 4)] device: usize, // 6 | #[values("", "--pin")] pin: &str, // 7 | #[values("", "--malloc")] malloc: &str, // 8 | #[values("", "--init")] init: &str, // 9 | #[values("", "--triad-only", "--nstream-only")] option: &str, // 10 | ) { 11 | let line = format!( 12 | "rust-stream --arraysize 2048 --device {} {} {} {} {}", 13 | device, pin, malloc, init, option 14 | ); 15 | let args = line.split_whitespace().map(|s| s.to_string()).collect::>(); 16 | assert!(rust_stream::run(&args)); 17 | } 18 | -------------------------------------------------------------------------------- /src/scala/scala-stream/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .bsp/ 3 | -------------------------------------------------------------------------------- /src/scala/scala-stream/.jvmopts: -------------------------------------------------------------------------------- 1 | -Xmx4096m 2 | -Xss4m -------------------------------------------------------------------------------- /src/scala/scala-stream/.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = "3.7.14" 2 | runner.dialect = scala3 3 | 4 | style = defaultWithAlign 5 | 6 | maxColumn = 100 7 | 8 | align.preset = more 9 | 10 | rewrite.rules = [ 11 | AvoidInfix 12 | RedundantBraces 13 | RedundantParens 14 | AsciiSortImports 15 | PreferCurlyFors 16 | ] 17 | 18 | rewrite.neverInfix.excludeFilters = [until 19 | to 20 | by 21 | eq 22 | ne 23 | "should.*" 24 | "contain.*" 25 | "must.*" 26 | in 27 
| be 28 | taggedAs 29 | thrownBy 30 | synchronized 31 | have 32 | when 33 | size 34 | theSameElementsAs] -------------------------------------------------------------------------------- /src/scala/scala-stream/build.sbt: -------------------------------------------------------------------------------- 1 | lazy val mainCls = Some("scalastream.App") 2 | 3 | lazy val root = (project in file(".")) 4 | .enablePlugins(NativeImagePlugin) 5 | .settings( 6 | scalaVersion := "3.3.1", 7 | version := "5.0", 8 | organization := "uk.ac.bristol.uob-hpc", 9 | organizationName := "University of Bristol", 10 | Compile / mainClass := mainCls, 11 | assembly / mainClass := mainCls, 12 | scalacOptions ~= filterConsoleScalacOptions, 13 | assembly / assemblyJarName := "scala-stream.jar", 14 | assembly / assemblyMergeStrategy := { 15 | case PathList("module-info.class") => MergeStrategy.discard 16 | case PathList("META-INF", "versions", xs @ _, "module-info.class") => MergeStrategy.discard 17 | case x => (ThisBuild / assemblyMergeStrategy).value(x) 18 | }, 19 | nativeImageOptions := Seq( 20 | "--no-fallback", 21 | "-H:ReflectionConfigurationFiles=../../reflect-config.json" 22 | ), 23 | nativeImageVersion := "21.1.0", 24 | (Global / excludeLintKeys) += nativeImageVersion, 25 | name := "scala-stream", 26 | libraryDependencies ++= Seq( 27 | // Lazy val implementation in Scala 3 triggers an exception in nativeImage, use 2_13 for arg parsing for now otherwise we can't get to the benchmarking part 28 | ("com.github.scopt" %% "scopt" % "4.0.1").cross(CrossVersion.for3Use2_13), 29 | // par also uses lazy val at some point, so it doesn't work in nativeImage 30 | "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4", 31 | "net.openhft" % "affinity" % "3.23.2", 32 | "org.slf4j" % "slf4j-simple" % "2.0.5" // for affinity 33 | ) 34 | ) 35 | -------------------------------------------------------------------------------- /src/scala/scala-stream/project/build.properties: 
-------------------------------------------------------------------------------- 1 | sbt.version=1.9.2 2 | -------------------------------------------------------------------------------- /src/scala/scala-stream/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.timushev.sbt" % "sbt-updates" % "0.5.3") 2 | addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.20") 3 | addSbtPlugin("org.scalameta" % "sbt-native-image" % "0.3.0") 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.3") 5 | addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.27") 6 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.3") 7 | -------------------------------------------------------------------------------- /src/scala/scala-stream/reflect-config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "sun.misc.Unsafe", 4 | "fields": [ 5 | { 6 | "name": "theUnsafe", 7 | "allowUnsafeAccess": true 8 | } 9 | ] 10 | } 11 | ] -------------------------------------------------------------------------------- /src/scala/scala-stream/sbt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./sbt-dist/bin/sbt "$@" -------------------------------------------------------------------------------- /src/scala/scala-stream/sbt-dist/bin/java9-rt-export.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UoB-HPC/BabelStream/2f00dfb7f8b7cfe8c53d20d5c770bccbf8673440/src/scala/scala-stream/sbt-dist/bin/java9-rt-export.jar -------------------------------------------------------------------------------- /src/scala/scala-stream/sbt-dist/bin/sbt-launch.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UoB-HPC/BabelStream/2f00dfb7f8b7cfe8c53d20d5c770bccbf8673440/src/scala/scala-stream/sbt-dist/bin/sbt-launch.jar -------------------------------------------------------------------------------- /src/scala/scala-stream/sbt-dist/conf/sbtconfig.txt: -------------------------------------------------------------------------------- 1 | # Set the java args to high 2 | 3 | -Xmx512M 4 | 5 | -XX:MaxPermSize=256m 6 | 7 | -XX:ReservedCodeCacheSize=128m 8 | 9 | 10 | 11 | # Set the extra SBT options 12 | 13 | -Dsbt.log.format=true 14 | 15 | -------------------------------------------------------------------------------- /src/scala/scala-stream/sbt-dist/conf/sbtopts: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------ # 2 | # The SBT Configuration file. # 3 | # ------------------------------------------------ # 4 | 5 | 6 | # Disable ANSI color codes 7 | # 8 | #-no-colors 9 | 10 | # Starts sbt even if the current directory contains no sbt project. 11 | # 12 | -sbt-create 13 | 14 | # Path to global settings/plugins directory (default: ~/.sbt) 15 | # 16 | #-sbt-dir /etc/sbt 17 | 18 | # Path to shared boot directory (default: ~/.sbt/boot in 0.11 series) 19 | # 20 | #-sbt-boot ~/.sbt/boot 21 | 22 | # Path to local Ivy repository (default: ~/.ivy2) 23 | # 24 | #-ivy ~/.ivy2 25 | 26 | # set memory options 27 | # 28 | #-mem 29 | 30 | # Use local caches for projects, no sharing. 31 | # 32 | #-no-share 33 | 34 | # Put SBT in offline mode. 35 | # 36 | #-offline 37 | 38 | # Sets the SBT version to use. 
39 | #-sbt-version 0.11.3 40 | 41 | # Scala version (default: latest release) 42 | # 43 | #-scala-home 44 | #-scala-version 45 | 46 | # java version (default: java from PATH, currently $(java -version |& grep version)) 47 | # 48 | #-java-home 49 | 50 | -------------------------------------------------------------------------------- /src/scala/scala-stream/src/main/scala/scalastream/J8SStream.scala: -------------------------------------------------------------------------------- 1 | package scalastream 2 | 3 | import scalastream.App.{Config, Data} 4 | 5 | import scala.collection.immutable.ArraySeq 6 | import scala.reflect.{ClassTag, classTag} 7 | 8 | class J8SStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) 9 | extends ScalaStream[A]: 10 | 11 | private var a: Array[A] = _ 12 | private var b: Array[A] = _ 13 | private var c: Array[A] = _ 14 | private val scalar: A = config.scalar 15 | 16 | inline private def stream = 17 | java.util.stream.IntStream.range(0, config.options.arraysize).parallel() 18 | 19 | override inline def initArrays(): Unit = 20 | a = Array.ofDim(config.options.arraysize) 21 | b = Array.ofDim(config.options.arraysize) 22 | c = Array.ofDim(config.options.arraysize) 23 | stream.forEach { i => 24 | a(i) = config.init._1 25 | b(i) = config.init._2 26 | c(i) = config.init._3 27 | } 28 | 29 | override inline def copy(): Unit = stream.forEach(i => c(i) = a(i)) 30 | override inline def mul(): Unit = stream.forEach(i => b(i) = scalar * c(i)) 31 | override inline def add(): Unit = stream.forEach(i => c(i) = a(i) + b(i)) 32 | override inline def triad(): Unit = stream.forEach(i => a(i) = b(i) + scalar * c(i)) 33 | override inline def nstream(): Unit = stream.forEach(i => a(i) = b(i) * scalar * c(i)) 34 | override inline def dot(): A = 35 | // horrible special-case for double, there isn't a mapToFloat so we give up on that 36 | val cls = classTag[A].runtimeClass 37 | if java.lang.Double.TYPE == cls then 38 | stream 39 | 
.mapToDouble(i => (a(i) * b(i)).asInstanceOf[Double]) 40 | .reduce(0, (l: Double, r: Double) => l + r) 41 | .asInstanceOf[A] 42 | else stream.mapToObj[A](i => a(i) * b(i)).reduce(0.fractional, (l: A, r: A) => l + r) 43 | 44 | override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) 45 | -------------------------------------------------------------------------------- /src/scala/scala-stream/src/main/scala/scalastream/ParStream.scala: -------------------------------------------------------------------------------- 1 | package scalastream 2 | 3 | import scalastream.App.{Config, Data} 4 | 5 | import scala.collection.immutable.ArraySeq 6 | import scala.collection.parallel.CollectionConverters._ 7 | import scala.reflect.ClassTag 8 | class ParStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) 9 | extends ScalaStream[A]: 10 | 11 | private var a: Array[A] = _ 12 | private var b: Array[A] = _ 13 | private var c: Array[A] = _ 14 | private val scalar: A = config.scalar 15 | 16 | inline private def indices = (0 until config.options.arraysize).par 17 | 18 | override inline def initArrays(): Unit = 19 | a = Array.ofDim(config.options.arraysize) 20 | b = Array.ofDim(config.options.arraysize) 21 | c = Array.ofDim(config.options.arraysize) 22 | 23 | for i <- indices do 24 | a(i) = config.init._1 25 | b(i) = config.init._2 26 | c(i) = config.init._3 27 | 28 | override inline def copy(): Unit = for i <- indices do c(i) = a(i) 29 | override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i) 30 | override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i) 31 | override inline def triad(): Unit = for i <- indices do a(i) = b(i) + scalar * c(i) 32 | override inline def nstream(): Unit = for i <- indices do a(i) = b(i) * scalar * c(i) 33 | override inline def dot(): A = 34 | indices.aggregate[A](0.fractional)((acc, i) => acc + (a(i) * b(i)), _ + _) 35 | 36 | override inline def data(): Data[A] = 
Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) 37 | -------------------------------------------------------------------------------- /src/scala/scala-stream/src/main/scala/scalastream/PlainStream.scala: -------------------------------------------------------------------------------- 1 | package scalastream 2 | 3 | import scalastream.App.{Config, Data} 4 | 5 | import scala.collection.immutable.ArraySeq 6 | import scala.reflect.ClassTag 7 | class PlainStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) 8 | extends ScalaStream[A]: 9 | 10 | private var a: Array[A] = _ 11 | private var b: Array[A] = _ 12 | private var c: Array[A] = _ 13 | private val scalar: A = config.scalar 14 | 15 | override inline def initArrays(): Unit = 16 | a = Array.fill(config.options.arraysize)(config.init._1) 17 | b = Array.fill(config.options.arraysize)(config.init._2) 18 | c = Array.fill(config.options.arraysize)(config.init._3) 19 | 20 | private inline def indices = 0 until config.options.arraysize 21 | 22 | override inline def copy(): Unit = for i <- indices do c(i) = a(i) 23 | override inline def mul(): Unit = for i <- indices do b(i) = scalar * c(i) 24 | override inline def add(): Unit = for i <- indices do c(i) = a(i) + b(i) 25 | override inline def triad(): Unit = for i <- indices do a(i) = b(i) + (scalar * c(i)) 26 | override inline def nstream(): Unit = for i <- indices do a(i) = b(i) * scalar * c(i) 27 | override inline def dot(): A = 28 | var acc: A = 0.fractional 29 | for i <- indices do acc = acc + (a(i) * b(i)) 30 | acc 31 | override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) 32 | -------------------------------------------------------------------------------- /src/scala/scala-stream/src/main/scala/scalastream/ThreadStream.scala: -------------------------------------------------------------------------------- 1 | package scalastream 2 | 3 | import net.openhft.affinity.{AffinityStrategies, 
AffinityThreadFactory} 4 | import scalastream.App.{Config, Data} 5 | 6 | import java.util.concurrent.{Callable, Executors} 7 | import scala.collection.immutable.ArraySeq 8 | import scala.reflect.ClassTag 9 | object ThreadStream {} 10 | class ThreadStream[@specialized(Float, Double) A: Fractional: ClassTag](val config: Config[A]) 11 | extends ScalaStream[A]: 12 | 13 | private var a: Array[A] = _ 14 | private var b: Array[A] = _ 15 | private var c: Array[A] = _ 16 | private val scalar: A = config.scalar 17 | 18 | private val chunks: Int = sys.runtime.availableProcessors() 19 | 20 | private val pool = Executors.newFixedThreadPool( 21 | chunks, 22 | new AffinityThreadFactory("scala-stream", true, AffinityStrategies.DIFFERENT_CORE) 23 | ) 24 | 25 | private val indices = (0 until config.options.arraysize) 26 | .grouped(config.options.arraysize / chunks) 27 | .toSeq 28 | 29 | private inline def forEachAll[C](c: => C)(f: (C, Int) => Unit): Seq[C] = 30 | import scala.jdk.CollectionConverters._ 31 | val xs = pool 32 | .invokeAll( 33 | indices.map { r => 34 | { () => 35 | val ctx = c 36 | r.foreach(f(ctx, _)) 37 | ctx 38 | }: Callable[C] 39 | }.asJavaCollection 40 | ) 41 | .asScala 42 | .map(_.get()) 43 | .toSeq 44 | xs 45 | 46 | override inline def initArrays(): Unit = 47 | a = Array.ofDim(config.options.arraysize) 48 | b = Array.ofDim(config.options.arraysize) 49 | c = Array.ofDim(config.options.arraysize) 50 | forEachAll(()) { (_, i) => 51 | a(i) = config.init._1 52 | b(i) = config.init._2 53 | c(i) = config.init._3 54 | } 55 | () 56 | 57 | class Box(var value: A) 58 | override inline def copy(): Unit = { forEachAll(())((_, i) => c(i) = a(i)); () } 59 | override inline def mul(): Unit = { forEachAll(())((_, i) => b(i) = scalar * c(i)); () } 60 | override inline def add(): Unit = { forEachAll(())((_, i) => c(i) = a(i) + b(i)); () } 61 | override inline def triad(): Unit = { forEachAll(())((_, i) => a(i) = b(i) + scalar * c(i)); () } 62 | override inline def nstream(): Unit 
= { forEachAll(())((_, i) => a(i) = b(i) * scalar * c(i)); () } 63 | 64 | override inline def dot(): A = 65 | forEachAll(Box(0.fractional))((acc, i) => acc.value = acc.value + (a(i) * b(i))) 66 | .map(_.value) 67 | .fold(0.fractional)(_ + _) 68 | override inline def data(): Data[A] = Data(a.to(ArraySeq), b.to(ArraySeq), c.to(ArraySeq)) 69 | -------------------------------------------------------------------------------- /src/std-data/STDDataStream.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 2 | // Updated 2021 by University of Bristol 3 | // 4 | // For full license terms please see the LICENSE file distributed with this 5 | // source code 6 | 7 | #pragma once 8 | #include "dpl_shim.h" 9 | 10 | #include 11 | #include 12 | #include "Stream.h" 13 | 14 | #define IMPLEMENTATION_STRING "STD (data-oriented)" 15 | 16 | 17 | template 18 | class STDDataStream : public Stream 19 | { 20 | protected: 21 | // Size of arrays 22 | int array_size; 23 | 24 | // Device side pointers 25 | T *a, *b, *c; 26 | 27 | public: 28 | STDDataStream(const int, int) noexcept; 29 | ~STDDataStream(); 30 | 31 | virtual void copy() override; 32 | virtual void add() override; 33 | virtual void mul() override; 34 | virtual void triad() override; 35 | virtual void nstream() override; 36 | virtual T dot() override; 37 | 38 | virtual void init_arrays(T initA, T initB, T initC) override; 39 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 40 | }; 41 | 42 | -------------------------------------------------------------------------------- /src/std-data/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection" 4 | "c++") 5 | 6 | register_flag_optional(NVHPC_OFFLOAD 7 | "Enable offloading support (via the non-standard 
`-stdpar`) for the new NVHPC SDK. 8 | The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) 9 | 10 | Possible values are: 11 | cc35 - Compile for compute capability 3.5 12 | cc50 - Compile for compute capability 5.0 13 | cc60 - Compile for compute capability 6.0 14 | cc62 - Compile for compute capability 6.2 15 | cc70 - Compile for compute capability 7.0 16 | cc72 - Compile for compute capability 7.2 17 | cc75 - Compile for compute capability 7.5 18 | cc80 - Compile for compute capability 8.0 19 | ccall - Compile for all supported compute capabilities" 20 | "") 21 | 22 | register_flag_optional(USE_TBB 23 | "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 24 | "OFF") 25 | 26 | register_flag_optional(USE_ONEDPL 27 | "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. 28 | 29 | Possible values are: 30 | OPENMP - Implements policies using OpenMP. 31 | CMake will handle any flags needed to enable OpenMP if the compiler supports it. 32 | TBB - Implements policies using TBB. 33 | TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. 34 | DPCPP - Implements policies through SYCL2020. 35 | This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
36 | "OFF") 37 | 38 | macro(setup) 39 | set(CMAKE_CXX_STANDARD 17) 40 | if (NVHPC_OFFLOAD) 41 | set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) 42 | # propagate flags to linker so that it links with the gpu stuff as well 43 | register_append_cxx_flags(ANY ${NVHPC_FLAGS}) 44 | register_append_link_flags(${NVHPC_FLAGS}) 45 | endif () 46 | if (USE_TBB) 47 | register_link_library(TBB::tbb) 48 | endif () 49 | if (USE_ONEDPL) 50 | register_definitions(USE_ONEDPL) 51 | register_link_library(oneDPL) 52 | endif () 53 | endmacro() 54 | -------------------------------------------------------------------------------- /src/std-indices/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection" 4 | "c++") 5 | 6 | register_flag_optional(NVHPC_OFFLOAD 7 | "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. 8 | The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) 9 | 10 | Possible values are: 11 | cc35 - Compile for compute capability 3.5 12 | cc50 - Compile for compute capability 5.0 13 | cc60 - Compile for compute capability 6.0 14 | cc62 - Compile for compute capability 6.2 15 | cc70 - Compile for compute capability 7.0 16 | cc72 - Compile for compute capability 7.2 17 | cc75 - Compile for compute capability 7.5 18 | cc80 - Compile for compute capability 8.0 19 | ccall - Compile for all supported compute capabilities" 20 | "") 21 | 22 | register_flag_optional(USE_TBB 23 | "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 24 | "OFF") 25 | 26 | register_flag_optional(USE_ONEDPL 27 | "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. 28 | 29 | Possible values are: 30 | OPENMP - Implements policies using OpenMP. 
31 | CMake will handle any flags needed to enable OpenMP if the compiler supports it. 32 | TBB - Implements policies using TBB. 33 | TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. 34 | DPCPP - Implements policies through SYCL2020. 35 | This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 36 | "OFF") 37 | 38 | macro(setup) 39 | set(CMAKE_CXX_STANDARD 17) 40 | if (NVHPC_OFFLOAD) 41 | set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) 42 | # propagate flags to linker so that it links with the gpu stuff as well 43 | register_append_cxx_flags(ANY ${NVHPC_FLAGS}) 44 | register_append_link_flags(${NVHPC_FLAGS}) 45 | endif () 46 | if (USE_TBB) 47 | register_link_library(TBB::tbb) 48 | endif () 49 | if (USE_ONEDPL) 50 | register_definitions(USE_ONEDPL) 51 | register_link_library(oneDPL) 52 | endif () 53 | endmacro() 54 | -------------------------------------------------------------------------------- /src/std-ranges/STDRangesStream.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Tom Deakin 2 | // University of Bristol HPC 3 | // 4 | // For full license terms please see the LICENSE file distributed with this 5 | // source code 6 | 7 | #pragma once 8 | #include "dpl_shim.h" 9 | 10 | #include 11 | #include 12 | #include "Stream.h" 13 | 14 | #define IMPLEMENTATION_STRING "STD C++ ranges" 15 | 16 | template 17 | class STDRangesStream : public Stream 18 | { 19 | protected: 20 | // Size of arrays 21 | int array_size; 22 | 23 | // Device side pointers 24 | T *a, *b, *c; 25 | 26 | public: 27 | STDRangesStream(const int, int) noexcept; 28 | ~STDRangesStream(); 29 | 30 | virtual void copy() override; 31 | virtual void add() override; 32 | virtual void mul() override; 33 | virtual void triad() override; 34 | virtual void nstream() override; 35 | virtual T dot() override; 36 | 37 | virtual void init_arrays(T initA, T initB, T initC) override; 38 
| virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 39 | 40 | }; 41 | 42 | -------------------------------------------------------------------------------- /src/std-ranges/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" 4 | "c++") 5 | 6 | register_flag_optional(USE_TBB 7 | "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 8 | "OFF") 9 | 10 | register_flag_optional(USE_ONEDPL 11 | "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. 12 | 13 | Possible values are: 14 | OPENMP - Implements policies using OpenMP. 15 | CMake will handle any flags needed to enable OpenMP if the compiler supports it. 16 | TBB - Implements policies using TBB. 17 | TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. 18 | DPCPP - Implements policies through SYCL2020. 19 | This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
20 | "OFF") 21 | 22 | macro(setup) 23 | 24 | # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here 25 | 26 | # C++ 2a is too new, disable CMake's std flags completely: 27 | set(CMAKE_CXX_EXTENSIONS OFF) 28 | set(CMAKE_CXX_STANDARD_REQUIRED OFF) 29 | unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default 30 | # and append our own: 31 | register_append_cxx_flags(ANY -std=c++20) 32 | if (USE_TBB) 33 | register_link_library(TBB::tbb) 34 | endif () 35 | if (USE_ONEDPL) 36 | register_definitions(USE_ONEDPL) 37 | register_link_library(oneDPL) 38 | endif () 39 | endmacro() 40 | 41 | macro(setup_target NAME) 42 | if (USE_ONEDPL) 43 | target_compile_features(${NAME} INTERFACE cxx_std_20) 44 | target_compile_features(oneDPL INTERFACE cxx_std_20) 45 | endif () 46 | endmacro() 47 | -------------------------------------------------------------------------------- /src/sycl/SYCLStream.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | 12 | #include "Stream.h" 13 | #include "CL/sycl.hpp" 14 | 15 | #define IMPLEMENTATION_STRING "SYCL" 16 | 17 | namespace sycl_kernels 18 | { 19 | template class init; 20 | template class copy; 21 | template class mul; 22 | template class add; 23 | template class triad; 24 | template class nstream; 25 | template class dot; 26 | } 27 | 28 | template 29 | class SYCLStream : public Stream 30 | { 31 | protected: 32 | // Size of arrays 33 | size_t array_size; 34 | 35 | // SYCL objects 36 | cl::sycl::queue *queue; 37 | cl::sycl::buffer *d_a; 38 | cl::sycl::buffer *d_b; 39 | cl::sycl::buffer *d_c; 40 | cl::sycl::buffer *d_sum; 41 | 42 | // SYCL kernel names 43 | typedef sycl_kernels::init 
init_kernel; 44 | typedef sycl_kernels::copy copy_kernel; 45 | typedef sycl_kernels::mul mul_kernel; 46 | typedef sycl_kernels::add add_kernel; 47 | typedef sycl_kernels::triad triad_kernel; 48 | typedef sycl_kernels::nstream nstream_kernel; 49 | typedef sycl_kernels::dot dot_kernel; 50 | 51 | // NDRange configuration for the dot kernel 52 | size_t dot_num_groups; 53 | size_t dot_wgsize; 54 | 55 | public: 56 | 57 | SYCLStream(const int, const int); 58 | ~SYCLStream(); 59 | 60 | virtual void copy() override; 61 | virtual void add() override; 62 | virtual void mul() override; 63 | virtual void triad() override; 64 | virtual void nstream() override; 65 | virtual T dot() override; 66 | 67 | virtual void init_arrays(T initA, T initB, T initC) override; 68 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 69 | 70 | }; 71 | 72 | // Populate the devices list 73 | void getDeviceList(void); 74 | -------------------------------------------------------------------------------- /src/sycl2020-acc/SYCLStream2020.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | #include "Stream.h" 14 | 15 | #include 16 | 17 | #define IMPLEMENTATION_STRING "SYCL2020 accessors" 18 | 19 | template 20 | class SYCLStream : public Stream 21 | { 22 | protected: 23 | // Size of arrays 24 | size_t array_size; 25 | 26 | // SYCL objects 27 | // Queue is a pointer because we allow device selection 28 | std::unique_ptr queue; 29 | 30 | // Buffers 31 | sycl::buffer d_a; 32 | sycl::buffer d_b; 33 | sycl::buffer d_c; 34 | sycl::buffer d_sum; 35 | 36 | public: 37 | 38 | SYCLStream(const size_t, const int); 39 | ~SYCLStream() = default; 40 | 41 | virtual void copy() override; 42 | 
virtual void add() override; 43 | virtual void mul() override; 44 | virtual void triad() override; 45 | virtual void nstream() override; 46 | virtual T dot() override; 47 | 48 | virtual void init_arrays(T initA, T initB, T initC) override; 49 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 50 | 51 | }; 52 | 53 | // Populate the devices list 54 | void getDeviceList(void); 55 | -------------------------------------------------------------------------------- /src/sycl2020-usm/SYCLStream2020.h: -------------------------------------------------------------------------------- 1 | 2 | // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, 3 | // University of Bristol HPC 4 | // 5 | // For full license terms please see the LICENSE file distributed with this 6 | // source code 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | #include "Stream.h" 14 | 15 | #include 16 | 17 | #define IMPLEMENTATION_STRING "SYCL2020 USM" 18 | 19 | template 20 | class SYCLStream : public Stream 21 | { 22 | protected: 23 | // Size of arrays 24 | size_t array_size; 25 | 26 | // SYCL objects 27 | // Queue is a pointer because we allow device selection 28 | std::unique_ptr queue; 29 | 30 | // Buffers 31 | T *a{}; 32 | T *b{}; 33 | T *c{}; 34 | T *sum{}; 35 | 36 | public: 37 | 38 | SYCLStream(const size_t, const int); 39 | ~SYCLStream(); 40 | 41 | virtual void copy() override; 42 | virtual void add() override; 43 | virtual void mul() override; 44 | virtual void triad() override; 45 | virtual void nstream() override; 46 | virtual T dot() override; 47 | 48 | virtual void init_arrays(T initA, T initB, T initC) override; 49 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 50 | 51 | }; 52 | 53 | // Populate the devices list 54 | void getDeviceList(void); 55 | -------------------------------------------------------------------------------- /src/tbb/TBBStream.hpp: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Tom Deakin 2 | // University of Bristol HPC 3 | // 4 | // For full license terms please see the LICENSE file distributed with this 5 | // source code 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include "tbb/tbb.h" 12 | #include "Stream.h" 13 | 14 | #define IMPLEMENTATION_STRING "TBB" 15 | 16 | #if defined(PARTITIONER_AUTO) 17 | using tbb_partitioner = tbb::auto_partitioner; 18 | #define PARTITIONER_NAME "auto_partitioner" 19 | #elif defined(PARTITIONER_AFFINITY) 20 | using tbb_partitioner = tbb::affinity_partitioner; 21 | #define PARTITIONER_NAME "affinity_partitioner" 22 | #elif defined(PARTITIONER_STATIC) 23 | using tbb_partitioner = tbb::static_partitioner; 24 | #define PARTITIONER_NAME "static_partitioner" 25 | #elif defined(PARTITIONER_SIMPLE) 26 | using tbb_partitioner = tbb::simple_partitioner; 27 | #define PARTITIONER_NAME "simple_partitioner" 28 | #else 29 | // default to auto 30 | using tbb_partitioner = tbb::auto_partitioner; 31 | #define PARTITIONER_NAME "auto_partitioner" 32 | #endif 33 | 34 | 35 | template 36 | class TBBStream : public Stream 37 | { 38 | protected: 39 | 40 | tbb_partitioner partitioner; 41 | tbb::blocked_range range; 42 | // Device side pointers 43 | #ifdef USE_VECTOR 44 | std::vector a, b, c; 45 | #else 46 | size_t array_size; 47 | T *a, *b, *c; 48 | #endif 49 | 50 | 51 | 52 | public: 53 | TBBStream(const int, int); 54 | ~TBBStream() = default; 55 | 56 | virtual void copy() override; 57 | virtual void add() override; 58 | virtual void mul() override; 59 | virtual void triad() override; 60 | virtual void nstream() override; 61 | virtual T dot() override; 62 | 63 | virtual void init_arrays(T initA, T initB, T initC) override; 64 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 65 | 66 | }; 67 | 68 | -------------------------------------------------------------------------------- 
/src/tbb/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(ONE_TBB_DIR 3 | "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/. 4 | If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." 5 | "") 6 | 7 | 8 | register_flag_optional(PARTITIONER 9 | "Partitioner specifies how a loop template should partition its work among threads. 10 | Possible values are: 11 | AUTO - Optimize range subdivision based on work-stealing events. 12 | AFFINITY - Proportional splitting that optimizes for cache affinity. 13 | STATIC - Distribute work uniformly with no additional load balancing. 14 | SIMPLE - Recursively split its range until it cannot be further subdivided. 15 | See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." 16 | "AUTO") 17 | 18 | register_flag_optional(USE_VECTOR 19 | "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." 20 | "OFF") 21 | 22 | register_flag_optional(USE_TBB 23 | "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 
24 | "OFF") 25 | 26 | macro(setup) 27 | if(ONE_TBB_DIR) 28 | set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 29 | # docs on Intel's website refers to TBB_DIR which is not correct 30 | endif() 31 | if (NOT USE_TBB) 32 | # Only find TBB when we're not building in-tree 33 | find_package(TBB REQUIRED) 34 | endif() 35 | 36 | # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages 37 | register_link_library(TBB::tbb) 38 | register_definitions(PARTITIONER_${PARTITIONER}) 39 | if(USE_VECTOR) 40 | register_definitions(USE_VECTOR) 41 | endif() 42 | endmacro() 43 | -------------------------------------------------------------------------------- /src/thrust/ThrustStream.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Tom Deakin 2 | // University of Bristol HPC 3 | // 4 | // For full license terms please see the LICENSE file distributed with this 5 | // source code 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #if defined(MANAGED) 12 | #include 13 | #else 14 | #include 15 | #endif 16 | 17 | #include "Stream.h" 18 | 19 | #define IMPLEMENTATION_STRING "Thrust" 20 | 21 | template 22 | class ThrustStream : public Stream 23 | { 24 | protected: 25 | // Size of arrays 26 | int array_size; 27 | 28 | #if defined(MANAGED) 29 | thrust::universal_vector a; 30 | thrust::universal_vector b; 31 | thrust::universal_vector c; 32 | #else 33 | thrust::device_vector a; 34 | thrust::device_vector b; 35 | thrust::device_vector c; 36 | #endif 37 | 38 | public: 39 | ThrustStream(const int, int); 40 | ~ThrustStream() = default; 41 | 42 | virtual void copy() override; 43 | virtual void add() override; 44 | virtual void mul() override; 45 | virtual void triad() override; 46 | virtual void nstream() override; 47 | virtual T dot() override; 48 | 49 | virtual void init_arrays(T initA, T
initB, T initC) override; 50 | virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; 51 | 52 | }; 53 | 54 | --------------------------------------------------------------------------------