├── examples ├── kokkos │ ├── kokkos_build_scripts │ │ ├── README.md │ │ ├── kokkos_nvcc_cuda_build.sh │ │ ├── kokkos_clang_cuda_build.sh │ │ └── kokkos_build.sh │ ├── build.sh │ ├── CMakeLists.txt │ └── README.md ├── sgemm_interop │ ├── build.sh │ ├── README.md │ ├── CMakeLists.txt │ ├── sgemm.cu │ ├── sycl_sgemm_usm.cpp │ └── sycl_sgemm.cpp ├── vector_addition │ ├── build.sh │ ├── CMakeLists.txt │ ├── vector_addition.cpp │ ├── vector_addition_usm.cpp │ ├── README.md │ └── vector_addition.cu ├── hashing │ ├── tests │ │ ├── CMakeLists.txt │ │ └── tests_helpers.hpp │ ├── cmake │ │ ├── Modules │ │ │ ├── ComputeCppIRMap.cmake │ │ │ └── ComputeCppCompilerChecks.cmake │ │ └── FindSYCL.cmake │ ├── include │ │ ├── sycl_hash.hpp │ │ ├── hash_functions │ │ │ ├── md2.hpp │ │ │ ├── sha1.hpp │ │ │ ├── md5.hpp │ │ │ ├── sha256.hpp │ │ │ ├── keccak.hpp │ │ │ ├── blake2b.hpp │ │ │ └── blake3.hpp │ │ ├── internal │ │ │ ├── config.hpp │ │ │ ├── determine_kernel_config.hpp │ │ │ ├── handle.hpp │ │ │ ├── async_api.hpp │ │ │ └── sync_api.hpp │ │ └── tools │ │ │ ├── fill_rand.hpp │ │ │ ├── intrinsics.hpp │ │ │ ├── usm_smart_ptr.hpp │ │ │ ├── runtime_byte_array.hpp │ │ │ └── sycl_queue_helpers.hpp │ ├── src │ │ ├── tools │ │ │ └── queue_tester.cpp │ │ ├── benchmarks │ │ │ └── misc.hpp │ │ └── hash_functions │ │ │ ├── md2.cpp │ │ │ ├── sha1.cpp │ │ │ └── sha256.cpp │ ├── CMakeLists.txt │ ├── demo_main.cpp │ ├── doc │ │ └── README.md │ └── README.md ├── MPI │ ├── README.md │ └── Makefile ├── distrib_batch_gemm │ ├── README.md │ ├── main.cpp │ ├── Makefile │ ├── vadd_sycl.cpp │ └── vadd_cuda.cu └── fortran_interface │ ├── README.md │ ├── Makefile │ ├── saxpy.cpp │ └── saxpy.cuf ├── .github ├── dependabot.yml └── workflows │ └── scorecard.yml ├── SECURITY.md ├── .gitignore ├── Contributing.md ├── setup-script ├── sample │ ├── README.md │ ├── CMakeLists.txt │ ├── include │ │ ├── chrono.hpp │ │ ├── common.hpp │ │ └── usm_smart_ptr.hpp │ ├── mkl_matmult_usm.cpp │ └── mkl_matmult.cpp ├── 
README.md ├── build_minimal.sh ├── build.sh └── build_with_libcxx.sh └── README.md /examples/kokkos/kokkos_build_scripts/README.md: -------------------------------------------------------------------------------- 1 | These build scripts are provided for illustration only. They will almost certainly require modification before they work elsewhere. 2 | -------------------------------------------------------------------------------- /examples/sgemm_interop/build.sh: -------------------------------------------------------------------------------- 1 | rm -rf build && mkdir build && cd build 2 | cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++ 3 | make -j 4 | -------------------------------------------------------------------------------- /examples/vector_addition/build.sh: -------------------------------------------------------------------------------- 1 | rm -rf build && mkdir build && cd build 2 | cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++ -DCMAKE_EXPORT_COMPILE_COMMANDS=yes 3 | make -j 8 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Enable version updates for Github Actions 4 | - package-ecosystem: "github-actions" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | groups: 9 | github-actions: 10 | patterns: 11 | - "*" 12 | reviewers: 13 | - "codeplaysoftware/security-managers" 14 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | To report a vulnerability or a security issue please fill the security 6 | advisories form [here](../../security/advisories/new), send an email to 7 | security@codeplay.com or 
contact us using the [contact form on our web 8 | page](https://codeplay.com/company/contact/?q=Report%20Security%20Issue). 9 | -------------------------------------------------------------------------------- /examples/kokkos/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf build 4 | mkdir build 5 | cd build 6 | 7 | # Set the environment variable Kokkos_ROOT="[your/kokkos/installation]/lib/cmake/Kokkos" 8 | CXXFLAGS="-Xsycl-target-frontend -O3" \ 9 | LDFLAGS="-Xsycl-target-frontend -O3" \ 10 | cmake .. -G Ninja \ 11 | -DCMAKE_BUILD_TYPE=Debug \ 12 | -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ 13 | -DCMAKE_CXX_COMPILER=clang++ \ 14 | -DCMAKE_C_COMPILER=clang 15 | 16 | ninja 17 | 18 | cd .. 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # Temporaries 35 | *~ 36 | *# 37 | */build 38 | ] 39 | # vim 40 | *.swp 41 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ------------- 3 | 4 | New examples or improvements to the existing ones are welcomed. 5 | Please, follow 6 | [LLVM coding standards](https://llvm.org/docs/CodingStandards.html) when 7 | contributing code, since sometimes they will be contributed as tests to the 8 | [DPCPP project](https://github.com/intel/llvm) project. 
9 | 10 | When writing your commit message, please make sure to follow 11 | [LLVM developer policies](https://llvm.org/docs/DeveloperPolicy.html#commit-messages) 12 | on the subject. 13 | -------------------------------------------------------------------------------- /examples/hashing/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | include(FetchContent) 2 | FetchContent_Declare( 3 | googletest 4 | URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip 5 | ) 6 | # For Windows: Prevent overriding the parent project's compiler/linker settings 7 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) 8 | FetchContent_MakeAvailable(googletest) 9 | 10 | 11 | enable_testing() 12 | 13 | add_executable( 14 | test_all_hashes 15 | tests/tests.cpp 16 | ) 17 | 18 | add_sycl_to_target(TARGET test_all_hashes SOURCES tests/tests.cpp) 19 | 20 | target_link_libraries(test_all_hashes PUBLIC gtest_main sycl_hash) 21 | 22 | include(GoogleTest) 23 | gtest_discover_tests(test_all_hashes) -------------------------------------------------------------------------------- /examples/MPI/README.md: -------------------------------------------------------------------------------- 1 | ## MPI + SYCL example 2 | 3 | This example shows how to integrate MPI calls within the SYCL DAG using Host Tasks for integration. 4 | 5 | 6 | ## Requisites 7 | 8 | The Makefile provided assumes the MPICXX compiler points to the DPCPP compiler with CUDA support. 9 | That requires the MPI implementation to be built, or use, the DPCPP compiler. 10 | The MPI implementation needs to have been built with CUDA support (typically called "CUDA-aware" MPI") 11 | 12 | ## Compilation 13 | 14 | If MPICXX points to DPC++ with CUDA support and its on the path, "make" should build the program. 
15 | 16 | ## Execution 17 | 18 | The makefile contains a target to execute the problem in two processes: 19 | 20 | ```sh 21 | make run 22 | ``` 23 | 24 | The target assumes mpirun is on the PATH 25 | 26 | 27 | -------------------------------------------------------------------------------- /examples/hashing/cmake/Modules/ComputeCppIRMap.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.3) 2 | 3 | # These should match the types of IR output by compute++ 4 | set(IR_MAP_spir bc) 5 | set(IR_MAP_spir64 bc) 6 | set(IR_MAP_spir32 bc) 7 | set(IR_MAP_spirv spv) 8 | set(IR_MAP_spirv64 spv) 9 | set(IR_MAP_spirv32 spv) 10 | set(IR_MAP_aorta-x86_64 o) 11 | set(IR_MAP_aorta-aarch64 o) 12 | set(IR_MAP_aorta-rcar-cve o) 13 | set(IR_MAP_custom-spir64 bc) 14 | set(IR_MAP_custom-spir32 bc) 15 | set(IR_MAP_custom-spirv64 spv) 16 | set(IR_MAP_custom-spirv32 spv) 17 | set(IR_MAP_ptx64 s) 18 | set(IR_MAP_amdgcn s) 19 | 20 | # Retrieves the filename extension of the IR output of compute++ 21 | function(get_sycl_target_extension output) 22 | set(syclExtension ${IR_MAP_${COMPUTECPP_BITCODE}}) 23 | if (NOT syclExtension) 24 | # Needed when using multiple device targets 25 | set(syclExtension "bc") 26 | endif () 27 | set(${output} ${syclExtension} PARENT_SCOPE) 28 | endfunction() 29 | -------------------------------------------------------------------------------- /examples/kokkos/kokkos_build_scripts/kokkos_nvcc_cuda_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Install Kokkos w/ sycl-cuda support 3 | 4 | set -x #echo on 5 | 6 | # Set: 7 | # KOKKOS_INSTALL_DIR=[/your/install/dir] 8 | # KOKKOS_SOURCE_DIR=[/your/source/dir] 9 | # HWLOC_DIR=[/your/hwloc/dir] 10 | 11 | # Configure & build kokkos 12 | mkdir kokkos-build 13 | cd kokkos-build 14 | 15 | cmake $KOKKOS_SOURCE_DIR -G Ninja \ 16 | -DCMAKE_BUILD_TYPE=Release \ 17 | -DCMAKE_CXX_STANDARD=17 \ 18 | 
-DCMAKE_CXX_COMPILER=g++ \ 19 | -DCMAKE_CUDA_COMPILER=nvcc \ 20 | -DCMAKE_INSTALL_PREFIX=$KOKKOS_INSTALL_DIR \ 21 | -DKokkos_CXX_STANDARD=17 \ 22 | -DKokkos_ENABLE_SYCL=OFF \ 23 | -DKokkos_ENABLE_CUDA=ON \ 24 | -DKokkos_ARCH_HSW=ON \ 25 | -DKokkos_ARCH_AMPERE80=ON \ 26 | -DKokkos_ENABLE_HWLOC=ON \ 27 | -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ 28 | -DKokkos_ENABLE_TESTS=OFF \ 29 | -DKokkos_HWLOC_DIR=$HWLOC_DIR 30 | 31 | ninja install 32 | 33 | cd .. 34 | -------------------------------------------------------------------------------- /setup-script/sample/README.md: -------------------------------------------------------------------------------- 1 | # oneMKL samples 2 | 3 | The code runs a small benchmarks of your blas implementation using multiplication of square matrices. You can pass the size as an argument of the executable. 4 | 5 | Two versions are provided, one of which is using the USM inferface. 6 | 7 | If the environment is correctly set you should be able to run the sample with: 8 | 9 | ``` 10 | mkdir build; 11 | cd build 12 | CXX=clang++ cmake .. -DCMAKE_BUILD_TYPE=Release 13 | cmake --build . 14 | ``` 15 | 16 | ### Detail 17 | 18 | - `sycl_unique` is a unique pointer to a USM allocated memory which wraps a `std::unique_ptr` with a custom deleter and holds the allocated size. 19 | - `fill_rand` fills a `std::vector` or `sycl_unique` with random values. 
20 | 21 | ### Refs 22 | 23 | - Working example adapted 24 | from [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/api-based-programming/intel-oneapi-math-kernel-library-onemkl/onemkl-code-sample.html) -------------------------------------------------------------------------------- /setup-script/sample/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(Sycl_Tests LANGUAGES CXX) 3 | set(CMAKE_CXX_STANDARD 20) 4 | 5 | set(WARNING_FLAGS "-Wall -Wextra -Wshadow -Wdouble-promotion -Wshadow -Wuninitialized -Wmissing-declarations -Woverloaded-virtual") 6 | set(DISABLED_WARNINGS "-Wno-c++20-extensions -Wno-unknown-cuda-version -Wno-unused -Wno-unused-parameter") 7 | 8 | set(OPT_FLAGS "-march=native -mtune=native -Ofast -fomit-frame-pointer") 9 | 10 | SET(CMAKE_CXX_FLAGS "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -sycl-std=2020 -fsycl-unnamed-lambda") 11 | SET(CMAKE_EXE_LINKER_FLAGS "-lonemkl_blas_cublas -lonemkl") 12 | 13 | set(CMAKE_CXX_FLAGS_RELEASE "${OPT_FLAGS} ${WARNING_FLAGS} ${DISABLED_WARNINGS}") 14 | set(CMAKE_CXX_FLAGS_DEBUG " ${WARNING_FLAGS} ${DISABLED_WARNINGS} -g3 -Og") 15 | 16 | include_directories(include/) 17 | include_directories($ENV{DPCPP_HOME}/deploy/include) 18 | link_directories($ENV{DPCPP_HOME}/deploy/lib) 19 | 20 | add_executable(mkl_kernel mkl_matmult.cpp) 21 | add_executable(mkl_kernel_usm mkl_matmult_usm.cpp) 22 | 23 | -------------------------------------------------------------------------------- /examples/kokkos/kokkos_build_scripts/kokkos_clang_cuda_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Install Kokkos w/ sycl-cuda support 3 | 4 | set -x #echo on 5 | 6 | # Set: 7 | # KOKKOS_INSTALL_DIR=[/your/install/dir] 8 | # KOKKOS_SOURCE_DIR=[/your/source/dir] 9 | # HWLOC_DIR=[/your/hwloc/dir] 10 | 11 | # Configure & build kokkos 12 
| mkdir kokkos-build 13 | cd kokkos-build 14 | 15 | CXXFLAGS="-Xsycl-target-frontend -O3 -fgpu-inline-threshold=100000" \ 16 | LDFLAGS="-Xsycl-target-frontend -O3" \ 17 | cmake $KOKKOS_SOURCE_DIR -G Ninja \ 18 | -DCMAKE_BUILD_TYPE=Release \ 19 | -DCMAKE_CXX_STANDARD=17 \ 20 | -DCMAKE_CXX_COMPILER=clang++ \ 21 | -DCMAKE_INSTALL_PREFIX=$KOKKOS_INSTALL_DIR \ 22 | -DKokkos_CXX_STANDARD=17 \ 23 | -DKokkos_ENABLE_SYCL=OFF \ 24 | -DKokkos_ENABLE_CUDA=ON \ 25 | -DKokkos_ARCH_HSW=ON \ 26 | -DKokkos_ARCH_AMPERE80=ON \ 27 | -DKokkos_ENABLE_HWLOC=ON \ 28 | -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ 29 | -DKokkos_ENABLE_TESTS=OFF \ 30 | -DKokkos_HWLOC_DIR=$HWLOC_DIR 31 | 32 | ninja install 33 | 34 | cd .. 35 | -------------------------------------------------------------------------------- /examples/kokkos/kokkos_build_scripts/kokkos_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Install Kokkos w/ sycl-cuda support 3 | 4 | set -x #echo on 5 | 6 | # Set: 7 | # KOKKOS_INSTALL_DIR=[/your/install/dir] 8 | # KOKKOS_SOURCE_DIR=[/your/source/dir] 9 | # HWLOC_DIR=[/your/hwloc/dir] 10 | 11 | # Configure & build kokkos 12 | mkdir kokkos-build 13 | cd kokkos-build 14 | 15 | CXXFLAGS="-Xsycl-target-frontend -O3 -fgpu-inline-threshold=100000 -Wno-unknown-cuda-version -Wno-deprecated-declarations -Wno-linker-warnings -ffast-math" \ 16 | LDFLAGS="-Xsycl-target-frontend -O3" \ 17 | cmake $KOKKOS_SOURCE_DIR -G Ninja \ 18 | -DCMAKE_BUILD_TYPE=Release \ 19 | -DCMAKE_CXX_STANDARD=17 \ 20 | -DCMAKE_CXX_COMPILER=clang++ \ 21 | -DCMAKE_INSTALL_PREFIX=$KOKKOS_INSTALL_DIR \ 22 | -DKokkos_CXX_STANDARD=17 \ 23 | -DKokkos_ENABLE_SYCL=ON \ 24 | -DKokkos_ARCH_HSW=ON \ 25 | -DKokkos_ARCH_AMPERE80=ON \ 26 | -DKokkos_ENABLE_HWLOC=ON \ 27 | -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ 28 | -DKokkos_ENABLE_TESTS=OFF \ 29 | -DKokkos_HWLOC_DIR=$HWLOC_DIR 30 | 31 | ninja install 32 | 33 | cd .. 
34 | -------------------------------------------------------------------------------- /examples/hashing/include/sycl_hash.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sycl_hash.hpp 20 | * 21 | * Description: 22 | * SYCL hashing 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include "internal/config.hpp" 27 | #include "internal/sync_api.hpp" 28 | #include "internal/async_api.hpp" 29 | -------------------------------------------------------------------------------- /examples/hashing/src/tools/queue_tester.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * queue_tester.cpp 20 | * 21 | * Description: 22 | * Queue tester 23 | **************************************************************************/ 24 | #include 25 | #include 26 | 27 | void queue_tester(sycl::queue &q) { 28 | q.submit([](sycl::handler &cgh) { 29 | cgh.single_task([]() {}); 30 | }).wait_and_throw(); 31 | } -------------------------------------------------------------------------------- /examples/kokkos/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #/*************************************************************************** 2 | # * 3 | # * Copyright (C) Codeplay Software Ltd. 4 | # * 5 | # * Licensed under the Apache License, Version 2.0 (the "License"); 6 | # * you may not use this file except in compliance with the License. 7 | # * You may obtain a copy of the License at 8 | # * 9 | # * http://www.apache.org/licenses/LICENSE-2.0 10 | # * 11 | # * Unless required by applicable law or agreed to in writing, software 12 | # * distributed under the License is distributed on an "AS IS" BASIS, 13 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # * See the License for the specific language governing permissions and 15 | # * limitations under the License. 
16 | # * 17 | # * Codeplay's SYCL-For-CUDA-Examples 18 | # * 19 | # * CMakeLists.txt 20 | # * 21 | # * Description: 22 | # * CMake for kokkos example 23 | # **************************************************************************/ 24 | cmake_minimum_required (VERSION 3.10) 25 | cmake_policy(SET CMP0074 NEW) 26 | project (Kokkos_Test_Case) 27 | 28 | set(Kokkos_DIR "$ENV{Kokkos_ROOT}" CACHE STRING "Kokkos root directory") 29 | find_package(Kokkos REQUIRED) 30 | 31 | add_executable(test_case test_case.cpp) 32 | target_link_libraries(test_case Kokkos::kokkos) 33 | 34 | 35 | -------------------------------------------------------------------------------- /examples/hashing/include/hash_functions/md2.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * md2.hpp 20 | * 21 | * Description: 22 | * MD2 hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | constexpr dword MD2_BLOCK_SIZE = 16; 30 | 31 | namespace hash::internal { 32 | class md2_kernel; 33 | 34 | using namespace usm_smart_ptr; 35 | 36 | 37 | sycl::event launch_md2_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch); 38 | 39 | } -------------------------------------------------------------------------------- /examples/hashing/include/hash_functions/sha1.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sha1.hpp 20 | * 21 | * Description: 22 | * SHA1 hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | 30 | constexpr dword SHA1_BLOCK_SIZE = 20; 31 | 32 | namespace hash::internal { 33 | class sha1_kernel; 34 | 35 | using namespace usm_smart_ptr; 36 | 37 | sycl::event launch_sha1_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch); 38 | 39 | } -------------------------------------------------------------------------------- /examples/distrib_batch_gemm/README.md: -------------------------------------------------------------------------------- 1 | ## Distributed Batch GEMM example 2 | 3 | This example shows how to integrate MPI calls within the SYCL DAG using Host Tasks to distribute Batch GEMM accross MPI process. 4 | 5 | 6 | ## Requisites 7 | 8 | The Makefile provided assumes the MPICXX compiler points to the DPCPP compiler with CUDA support. 9 | That requires the MPI implementation to be built, or use, the DPCPP compiler. 10 | The MPI implementation needs to have been built with CUDA support (typically called "CUDA-aware" MPI") 11 | 12 | The example uses [SYCL-BLAS](https://github.com/codeplaysoftware/sycl-blas) library to call the GEMM routine. 13 | The SYCL-BLAS Library should be [compiled by DPCPP compiler](https://github.com/codeplaysoftware/sycl-blas#compile-with-dpc) to target CUDA backend. 
The following command line is used to build SYCL-BLAS library: 14 | 15 | ```bash 16 | cmake -GNinja ../ -DTARGET=NVIDIA_GPU -DSYCL_COMPILER=dpcpp -DBLAS_DATA_TYPES=float -DGEMM_VECTORIZATION_SUPPORT=ON -DBLAS_ENABLE_TESTING=OFF -DENABLE_EXPRESSION_TESTS=OFF -DBLAS_ENABLE_BENCHMARK=OFF -DBLAS_VERIFY_BENCHMARK=OFF -DBLAS_BUILD_SAMPLES=OFF 17 | ``` 18 | 19 | ## Compilation 20 | 21 | If MPICXX points to DPC++ with CUDA support and its on the path, "make" should build the program. 22 | 23 | ## Execution 24 | 25 | The makefile contains a target to execute the problem in two processes: 26 | 27 | ```sh 28 | make run 29 | ``` 30 | 31 | The target assumes mpirun is on the PATH 32 | -------------------------------------------------------------------------------- /examples/MPI/Makefile: -------------------------------------------------------------------------------- 1 | #/*************************************************************************** 2 | # * 3 | # * Copyright (C) Codeplay Software Ltd. 4 | # * 5 | # * Licensed under the Apache License, Version 2.0 (the "License"); 6 | # * you may not use this file except in compliance with the License. 7 | # * You may obtain a copy of the License at 8 | # * 9 | # * http://www.apache.org/licenses/LICENSE-2.0 10 | # * 11 | # * Unless required by applicable law or agreed to in writing, software 12 | # * distributed under the License is distributed on an "AS IS" BASIS, 13 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # * See the License for the specific language governing permissions and 15 | # * limitations under the License. 
16 | # * 17 | # * Codeplay's SYCL-For-CUDA-Examples 18 | # * 19 | # * Makefile 20 | # * 21 | # * Description: 22 | # * Makefile for MPI example 23 | # **************************************************************************/ 24 | MPICOMP=mpicxx -I${SYCL_ROOT_DIR}/include/sycl/ -O1 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Wno-linker-warnings 25 | 26 | sycl-mpi-sample: SYCL-MPI-Sample.o 27 | $(MPICOMP) SYCL-MPI-Sample.o -o sycl-mpi-sample 28 | 29 | SYCL-MPI-Sample.o: SYCL-MPI-Sample.cpp 30 | $(MPICOMP) -c SYCL-MPI-Sample.cpp 31 | 32 | run: sycl-mpi-sample 33 | mpirun -np 2 ./sycl-mpi-sample 34 | 35 | .PHONY: clean 36 | 37 | clean: 38 | rm -f sycl-mpi-sample *.o 39 | -------------------------------------------------------------------------------- /examples/fortran_interface/README.md: -------------------------------------------------------------------------------- 1 | CUDA Fortran and SYCL integration 2 | ====================================== 3 | 4 | This directory shows an example of how to call a SYCL function 5 | from a CUDA fortran code. 6 | 7 | The SYCL routine is called using the Fortran ISO bindings like 8 | any other C function. 9 | 10 | ```fortran 11 | interface saxpy_sycl 12 | subroutine saxpy_call(x, y, a, N) & 13 | bind(C,name='saxpy_sycl_cuda_wrapper') 14 | implicit none 15 | real :: x(:), y(:) 16 | real, value :: a 17 | integer, value :: N 18 | end subroutine 19 | end interface 20 | ``` 21 | 22 | The SYCL code implemented in the C++ version of the code works as usual with one minor modification: 23 | Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared. 24 | 25 | The following snipped highligts the construction of a SYCL context associated with the Primary context. 26 | To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA 27 | stream, instead of creating a new stream. 
28 | It is possible to create a normal stream, just by using the default SYCL queue constructor on the CUDA 29 | context. Said queue will run concurrently (i.e. won't sync) to the main queue. 30 | 31 | ```cpp 32 | sycl::context c{sycl::property::context::cuda::use_primary_context()}; 33 | sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()}; 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /setup-script/sample/include/chrono.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | /** 8 | * Small Chrono class that prints the time spent in a scope. 9 | */ 10 | class Chrono { 11 | public: 12 | inline Chrono(); 13 | 14 | inline explicit Chrono(std::string &&caller_name); 15 | 16 | inline Chrono(const Chrono &) = delete; 17 | 18 | Chrono &operator=(const Chrono &) = delete; 19 | 20 | inline double stop(); 21 | 22 | inline ~Chrono(); 23 | 24 | private: 25 | std::string caller; 26 | 27 | const std::chrono::time_point>> start; 28 | }; 29 | 30 | inline Chrono::Chrono() 31 | : start(std::chrono::high_resolution_clock::now()) { 32 | } 33 | 34 | inline Chrono::~Chrono() { 35 | double elapsed_seconds = Chrono::stop(); 36 | if (!caller.empty()) { 37 | std::cerr << "time in " << caller << " : " << elapsed_seconds << "s" << std::endl; 38 | } else { 39 | std::cerr << "time " << elapsed_seconds << "s" << std::endl; 40 | } 41 | } 42 | 43 | inline Chrono::Chrono(std::string &&caller_name) 44 | : Chrono() { 45 | caller = caller_name; 46 | } 47 | 48 | inline double Chrono::stop() { 49 | auto end = std::chrono::high_resolution_clock::now(); 50 | auto duration = std::chrono::duration_cast(end - start); 51 | return static_cast(duration.count()) / 1000000.0; 52 | } 53 | -------------------------------------------------------------------------------- /examples/hashing/include/hash_functions/md5.hpp: 
-------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * md5.hpp 20 | * 21 | * Description: 22 | * MD5 hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | /****************************** MACROS ******************************/ 30 | constexpr dword MD5_BLOCK_SIZE = 16; // MD5 outputs a 16 byte digest 31 | 32 | namespace hash::internal { 33 | class md5_kernel; 34 | 35 | using namespace usm_smart_ptr; 36 | 37 | sycl::event launch_md5_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch); 38 | 39 | } -------------------------------------------------------------------------------- /examples/sgemm_interop/README.md: -------------------------------------------------------------------------------- 1 | SYCL interop with CUDA library 2 | ------------------------------- 3 | 4 | The example shows how to interop with CUBLAS from a SYCL for CUDA application. 5 | The example uses Codeplay's extension *interop_task* to call the **SGEMM** 6 | routine in CUBLAS. 
Parameters are extracted using the interop handler conversion. 7 | 8 | Pre-requisites 9 | --------------- 10 | 11 | These instructions assume that example [docker image](https://hub.docker.com/r/ruyman/dpcpp_cuda_examples/dockerfile) is being used. This image 12 | simplifies accessing these examples as the environment is set up correctly. 13 | For details on how to get started with the example docker image, refer to the 14 | root README file. 15 | 16 | Building the example 17 | ===================== 18 | 19 | ``` sh 20 | $ bash build.sh 21 | ``` 22 | 23 | or (SYCL version only): 24 | 25 | ``` 26 | ${SYCL_ROOT_DIR}/bin/clang++ -DCUDA_NO_HALF -isystem /usr/local/cuda/include -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -std=gnu++17 -L/usr/local/cuda/lib64 -lcublas -lcudart -lcuda -o sycl_sgemm sycl_sgemm.cpp 27 | ``` 28 | Example 29 | ========= 30 | 31 | Two source codes are provided. `sgemm.cu` is the original CUDA code calling 32 | CUBLAS library to perform the matrix multiplication. `sycl_sgemm.cpp` is the 33 | SYCL variant that calls CUBLAS underneath. 34 | 35 | Both implementations perform the multiplication of square matrices A and B, 36 | where A is a matrix full of ones, and B is an identity matrix. 37 | The expected output on C is a matrix full of ones. 38 | -------------------------------------------------------------------------------- /examples/hashing/include/hash_functions/sha256.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sha256.hpp 20 | * 21 | * Description: 22 | * SHA256 hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | /****************************** MACROS ******************************/ 30 | constexpr dword SHA256_BLOCK_SIZE = 32; // SHA256 outputs a 32 byte digest 31 | 32 | namespace hash::internal { 33 | class sha256_kernel; 34 | 35 | using namespace usm_smart_ptr; 36 | 37 | 38 | sycl::event launch_sha256_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch); 39 | 40 | 41 | } -------------------------------------------------------------------------------- /examples/hashing/include/hash_functions/keccak.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * keccak.hpp 20 | * 21 | * Description: 22 | * Keccak hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | 30 | constexpr dword KECCAK_ROUND = 24; 31 | constexpr dword KECCAK_STATE_SIZE = 25; 32 | constexpr dword KECCAK_Q_SIZE = 192; 33 | 34 | namespace hash::internal { 35 | 36 | template 37 | class keccak_kernel; 38 | 39 | using namespace usm_smart_ptr; 40 | 41 | 42 | sycl::event 43 | launch_keccak_kernel(bool is_sha3, sycl::queue &item, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch, dword n_outbit); 44 | 45 | 46 | } -------------------------------------------------------------------------------- /examples/fortran_interface/Makefile: -------------------------------------------------------------------------------- 1 | #/*************************************************************************** 2 | # * 3 | # * Copyright (C) Codeplay Software Ltd. 4 | # * 5 | # * Licensed under the Apache License, Version 2.0 (the "License"); 6 | # * you may not use this file except in compliance with the License. 7 | # * You may obtain a copy of the License at 8 | # * 9 | # * http://www.apache.org/licenses/LICENSE-2.0 10 | # * 11 | # * Unless required by applicable law or agreed to in writing, software 12 | # * distributed under the License is distributed on an "AS IS" BASIS, 13 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # * See the License for the specific language governing permissions and 15 | # * limitations under the License. 
16 | # * 17 | # * Codeplay's SYCL-For-CUDA-Examples 18 | # * 19 | # * Makefile 20 | # * 21 | # * Description: 22 | # * Makefile for fortran interface 23 | # **************************************************************************/ 24 | CXX=clang++ 25 | FORT=nvfortran 26 | FFLAGS=-c++libs -cuda 27 | CXXFLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -Wno-linker-warnings 28 | DPCPP_PATH=/home/ruyman/sycl_workspace/build_dpcpp/install 29 | 30 | default: final.exe 31 | 32 | saxpy_sycl.so: saxpy.cpp 33 | $(CXX) $(CXXFLAGS) -fPIC --shared saxpy.cpp -o saxpy_sycl.so 34 | 35 | saxpy_cuf.o: saxpy.cuf 36 | $(FORT) $(FFLAGS) -c saxpy.cuf -o saxpy_cuf.o 37 | 38 | final.exe: saxpy_cuf.o saxpy_sycl.so 39 | $(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L${DPCPP_PATH}/lib/ -lsycl 40 | 41 | .PHONY: clean 42 | 43 | clean: 44 | rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod 45 | 46 | -------------------------------------------------------------------------------- /examples/hashing/include/internal/config.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * config.hpp 20 | * 21 | * Description: 22 | * Hashing function configuration 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | /** 31 | * To update on every abi update so two you won't be able to link the new declarations against an older library. 32 | */ 33 | #define abi_rev v_1 34 | 35 | using byte = uint8_t; 36 | using dword = uint32_t; 37 | using qword = uint64_t; 38 | 39 | //#define IMPLICIT_MEMORY_COPY 1 // ONLY ON LINUX AND MACOS 40 | 41 | namespace hash { 42 | /** 43 | * Defines the various types of hashes supported. 44 | */ 45 | enum class method { 46 | sha256, 47 | keccak, 48 | blake2b, 49 | sha1, 50 | sha3, 51 | md5, 52 | md2 53 | }; 54 | 55 | 56 | } -------------------------------------------------------------------------------- /examples/vector_addition/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10 FATAL_ERROR) 2 | # Don't complain about empty CMAKE_CUDA_ARCHITECTURES 3 | cmake_policy(SET CMP0104 OLD) 4 | 5 | project(cmake_and_cuda LANGUAGES CXX CUDA) 6 | 7 | include(CTest) 8 | 9 | # SYCL installation 10 | if (NOT SYCL_ROOT) 11 | message(FATAL_ERROR "No SYCL installation detected") 12 | endif(NOT SYCL_ROOT) 13 | 14 | set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/") 15 | set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so") 16 | set(SYCL_FLAGS "-fsycl" 17 | "-fsycl-targets=nvptx64-nvidia-cuda" 18 | "-fsycl-unnamed-lambda" 19 | "-Wno-linker-warnings") 20 | 21 | # Build the CUDA code 22 | add_executable(vector_addition vector_addition.cu) 23 | target_compile_features(vector_addition PUBLIC cxx_std_11) 24 | set_target_properties(vector_addition PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 25 | set_property(TARGET vector_addition PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}") 26 | 27 | 
# Build the SYCL code 28 | add_executable (sycl_vector_addition vector_addition.cpp) 29 | target_compile_features(sycl_vector_addition PUBLIC cxx_std_17) 30 | target_compile_options(sycl_vector_addition PUBLIC ${SYCL_FLAGS}) 31 | target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS}) 32 | target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR}) 33 | target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB}) 34 | 35 | 36 | # Build the SYCL (USM) code 37 | add_executable (sycl_vector_addition_usm vector_addition_usm.cpp) 38 | target_compile_features(sycl_vector_addition_usm PUBLIC cxx_std_17) 39 | target_compile_options(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS}) 40 | target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS}) 41 | target_include_directories(sycl_vector_addition_usm PUBLIC ${SYCL_INCLUDE_DIR}) 42 | target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_LIB}) 43 | 44 | -------------------------------------------------------------------------------- /examples/fortran_interface/saxpy.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * saxpy.cpp 20 | * 21 | * Description: 22 | * SAXPY in SYCL 23 | **************************************************************************/ 24 | #include 25 | #include 26 | 27 | extern "C" { 28 | void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N); 29 | }; 30 | 31 | 32 | void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) { 33 | sycl::context c{sycl::property::context::cuda::use_primary_context()}; 34 | sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()}; 35 | { 36 | sycl::buffer bX {x, sycl::range<1>(N)}; 37 | sycl::buffer bY {y, sycl::range<1>(N)}; 38 | 39 | q.submit([&](sycl::handler& h) { 40 | auto aX = bX.get_access(h); 41 | auto aY = bY.get_access(h); 42 | h.parallel_for(sycl::range<1>(N), [=](sycl::id<1> id) { 43 | if (id[0] < N) 44 | aY[id] = aX[id] * a + aY[id]; 45 | }); 46 | }); 47 | 48 | q.wait_and_throw(); 49 | } 50 | return; 51 | } 52 | -------------------------------------------------------------------------------- /.github/workflows/scorecard.yml: -------------------------------------------------------------------------------- 1 | # Scorecards' GitHub action 2 | 3 | name: Scorecard supply-chain security 4 | on: 5 | # For Branch-Protection check. Only the default branch is supported. See 6 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 7 | branch_protection_rule: 8 | schedule: 9 | - cron: '7 12 * * 4' 10 | push: 11 | branches: [ "master" ] 12 | 13 | # Declare default permissions as read only. 14 | permissions: read-all 15 | 16 | jobs: 17 | analysis: 18 | name: Scorecard analysis 19 | runs-on: ubuntu-latest 20 | permissions: 21 | # Needed to upload the results to code-scanning dashboard. 22 | security-events: write 23 | # Needed to publish results and get a badge (see publish_results below). 
24 | id-token: write 25 | 26 | steps: 27 | - name: "Checkout code" 28 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 29 | with: 30 | persist-credentials: false 31 | 32 | - name: "Run analysis" 33 | uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1 34 | with: 35 | results_file: results.sarif 36 | results_format: sarif 37 | publish_results: true 38 | 39 | # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF 40 | # format to the repository Actions tab. 41 | - name: "Upload artifact" 42 | uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 43 | with: 44 | name: SARIF file 45 | path: results.sarif 46 | retention-days: 5 47 | 48 | # Upload the results to GitHub's code scanning dashboard (optional). 49 | # Commenting out will disable upload of results to your repo's Code Scanning dashboard 50 | - name: "Upload to code-scanning" 51 | uses: github/codeql-action/upload-sarif@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10 52 | with: 53 | sarif_file: results.sarif 54 | -------------------------------------------------------------------------------- /examples/distrib_batch_gemm/main.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * main.cpp 20 | * 21 | * Description: 22 | * Demonstrates simple vector addition 23 | **************************************************************************/ 24 | #include 25 | #include 26 | 27 | template 28 | void simple_vadd_sycl(const std::array& VA, const std::array& VB, 29 | std::array& VC); 30 | 31 | template 32 | void simple_vadd_cuda(const std::array& VA, const std::array& VB, 33 | std::array& VC); 34 | 35 | int main() { 36 | const size_t array_size = 4; 37 | std::array A = {{1, 2, 3, 4}}, 38 | B = {{1, 2, 3, 4}}, C; 39 | std::array D = {{1.f, 2.f, 3.f, 4.f}}, 40 | E = {{1.f, 2.f, 3.f, 4.f}}, F; 41 | simple_vadd_sycl(A, B, C); 42 | simple_vadd_cuda(D, E, F); 43 | for (unsigned int i = 0; i < array_size; i++) { 44 | if (C[i] != A[i] + B[i]) { 45 | std::cout << "The results are incorrect (element " << i << " is " << C[i] 46 | << "!\n"; 47 | return 1; 48 | } 49 | if (F[i] != D[i] + E[i]) { 50 | std::cout << "The results are incorrect (element " << i << " is " << F[i] 51 | << "!\n"; 52 | return 1; 53 | } 54 | } 55 | std::cout << "The results are correct!\n"; 56 | return 0; 57 | } 58 | -------------------------------------------------------------------------------- /examples/distrib_batch_gemm/Makefile: -------------------------------------------------------------------------------- 1 | #/*************************************************************************** 2 | # * 3 | # * Copyright (C) Codeplay Software Ltd. 4 | # * 5 | # * Licensed under the Apache License, Version 2.0 (the "License"); 6 | # * you may not use this file except in compliance with the License. 
7 | # * You may obtain a copy of the License at 8 | # * 9 | # * http://www.apache.org/licenses/LICENSE-2.0 10 | # * 11 | # * Unless required by applicable law or agreed to in writing, software 12 | # * distributed under the License is distributed on an "AS IS" BASIS, 13 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # * See the License for the specific language governing permissions and 15 | # * limitations under the License. 16 | # * 17 | # * Codeplay's SYCL-For-CUDA-Examples 18 | # * 19 | # * Makefile 20 | # * 21 | # * Description: 22 | # * Makefile for distributed batch gemm 23 | # **************************************************************************/ 24 | 25 | SYCLCXX=clang++ 26 | SYCLFLAGS=-O2 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -Wno-linker-warnings 27 | OBJS=main.o vadd_sycl.o vadd_cuda.o 28 | CUFLAGS=--cuda-gpu-arch=sm_80 -std=c++11 29 | 30 | 31 | %.o: %.cpp 32 | ${SYCLCXX} ${SYCLFLAGS} -c -o $@ $< 33 | 34 | %.o: %.cu 35 | ${SYCLCXX} ${CUFLAGS} -c -o $@ $< 36 | 37 | main.exe: ${OBJS} 38 | ${SYCLCXX} ${SYCLFLAGS} ${CUFLAGS} ${OBJS} -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -o $@ 39 | 40 | clean: 41 | rm -f ${OBJS} 42 | 43 | MPICOMP = mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -I$(HOME)/sycl-blas/include -I$(HOME)/sycl-blas/external/computecpp-sdk/include/ -L$(HOME)/sycl-blas/build -O3 -fsycl-unnamed-lambda -std=c++17 -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -lsycl_blas 44 | 45 | distributed-batch-gemm: distributed-batch-gemm.o 46 | $(MPICOMP) distributed-batch-gemm.o -o distributed-batch-gemm 47 | 48 | distributed-batch-gemm.o: distributed-batch-gemm.cpp 49 | $(MPICOMP) -c distributed-batch-gemm.cpp 50 | 51 | run: distributed-batch-gemm 52 | LD_LIBRARY_PATH=~/sycl_workspace/build_dpcpp/install/lib:$(HOME)/sycl-blas/build mpirun -np 2 --mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./distributed-batch-gemm 53 | 54 | .PHONY: clean 55 
| 56 | 57 | -------------------------------------------------------------------------------- /examples/hashing/cmake/FindSYCL.cmake: -------------------------------------------------------------------------------- 1 | set(A_SYCL_FOUND false) 2 | 3 | find_package(hipSYCL CONFIG) 4 | 5 | #[ { "name": "My Compiler Kit", "compilers": { "C": "/home/michel/sycl_workspace/deploy/bin/clang-13", "CXX": "/home/michel/sycl_workspace/deploy/bin/clang++" },"environmentVariables":{"LD_PRELOAD":"/opt/intel/opencl/libOpenCL.so.1"} } ] 6 | 7 | 8 | if (hipSYCL_FOUND) 9 | set(A_SYCL_FOUND true) 10 | if (NOT CMAKE_BUILD_TYPE) 11 | set(CMAKE_BUILD_TYPE Release) 12 | endif () 13 | 14 | cmake_policy(SET CMP0005 NEW) 15 | add_definitions(-DHIPSYCL_DEBUG_LEVEL=0) 16 | 17 | if (NOT HIPSYCL_DEBUG_LEVEL) 18 | if (CMAKE_BUILD_TYPE MATCHES "Debug") 19 | set(HIPSYCL_DEBUG_LEVEL 3 CACHE STRING 20 | "Choose the debug level, options are: 0 (no debug), 1 (print errors), 2 (also print warnings), 3 (also print general information)" 21 | FORCE) 22 | else () 23 | set(HIPSYCL_DEBUG_LEVEL 2 CACHE STRING 24 | "Choose the debug level, options are: 0 (no debug), 1 (print errors), 2 (also print warnings), 3 (also print general information)" 25 | FORCE) 26 | endif () 27 | endif () 28 | endif () 29 | 30 | 31 | if (ComputeCpp_DIR) 32 | include(cmake/Modules/FindComputeCpp.cmake) 33 | 34 | if (ComputeCpp_ROOT_DIR) 35 | set(A_SYCL_FOUND true) 36 | endif () 37 | add_compile_definitions(USING_COMPUTECPP) 38 | message(STATUS " Using ComputeCpp CMake") 39 | message(STATUS " Path to ComputeCpp implementation: ${COMPUTECPP_PACKAGE_ROOT_DIR} ") 40 | #set(CMAKE_CXX_STANDARD 11) 41 | include(FindOpenCL) 42 | endif () 43 | 44 | 45 | if (TRISYCL_INCLUDE_DIR AND NOT A_SYCL_FOUND) 46 | set(A_SYCL_FOUND true) 47 | message(STATUS " Using triSYCL CMake") 48 | include(FindTriSYCL) 49 | endif () 50 | 51 | # We expect the DPCPP compiler to have used 52 | if (NOT A_SYCL_FOUND) 53 | function(add_sycl_to_target arg1 arg2) 54 | 
target_compile_options(${arg2} PRIVATE $<$:${DPCPP_FLAGS} -sycl-std=2020 -std=c++20 -fsycl-unnamed-lambda>) 55 | target_link_options(${arg2} PRIVATE ${DPCPP_FLAGS} -sycl-std=2020 -std=c++20 -fsycl-unnamed-lambda) 56 | endfunction() 57 | 58 | endif () 59 | -------------------------------------------------------------------------------- /examples/fortran_interface/saxpy.cuf: -------------------------------------------------------------------------------- 1 | !************************************************************************** 2 | ! 3 | ! Copyright (C) Codeplay Software Ltd. 4 | ! 5 | ! Licensed under the Apache License, Version 2.0 (the "License"); 6 | ! you may not use this file except in compliance with the License. 7 | ! You may obtain a copy of the License at 8 | ! 9 | ! http://www.apache.org/licenses/LICENSE-2.0 10 | ! 11 | ! Unless required by applicable law or agreed to in writing, software 12 | ! distributed under the License is distributed on an "AS IS" BASIS, 13 | ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | ! See the License for the specific language governing permissions and 15 | ! limitations under the License. 16 | ! 17 | ! Codeplay's SYCL-For-CUDA-Examples 18 | ! 19 | ! saxpy.cuf 20 | ! 21 | ! Description: 22 | ! 
CUDA Fortran code calling SAXPY from SYCL 23 | !*************************************************************************/ 24 | 25 | module mathOps 26 | contains 27 | attributes(global) subroutine saxpy(x, y, a) 28 | implicit none 29 | real :: x(:), y(:) 30 | real, value :: a 31 | integer :: i, n 32 | n = size(x) 33 | i = blockDim%x * (blockIdx%x - 1) + threadIdx%x 34 | if (i <= n) y(i) = y(i) + a*x(i) 35 | end subroutine saxpy 36 | end module mathOps 37 | 38 | program testSaxpy 39 | use mathOps 40 | use cudafor 41 | 42 | implicit none 43 | 44 | interface saxpy_sycl 45 | subroutine saxpy_call(x, y, a, N) & 46 | bind(C,name='saxpy_sycl_cuda_wrapper') 47 | implicit none 48 | real :: x(:), y(:) 49 | real, value :: a 50 | integer, value :: N 51 | end subroutine 52 | end interface 53 | 54 | 55 | integer, parameter :: N = 1024 56 | real :: x(N), y(N), a 57 | real, device :: x_d(N), y_d(N) 58 | type(dim3) :: grid, tBlock 59 | 60 | tBlock = dim3(256,1,1) 61 | grid = dim3(ceiling(real(N)/tBlock%x),1,1) 62 | 63 | write (*,*) 'CUDA version: ' 64 | x = 1.0; y = 2.0; a = 2.0 65 | x_d = x 66 | y_d = y 67 | call saxpy<<>>(x_d, y_d, a) 68 | y = y_d 69 | write(*,*) 'Max error: ', maxval(abs(y-4.0)) 70 | write(*,*) 'N ', N 71 | 72 | write (*,*) 'SYCL version: ' 73 | y = 2.0; 74 | call saxpy_call(x, y, a, N); 75 | write(*,*) 'Max error: ', maxval(abs(y-4.0)) 76 | 77 | end program testSaxpy 78 | -------------------------------------------------------------------------------- /examples/kokkos/README.md: -------------------------------------------------------------------------------- 1 | Simple Test Case for Kokkos 2 | ---- 3 | 4 | This is a simple standalone test case taken from the Kokkos repository & packaged up here for use with SYCL. 5 | It's doing a vector-matrix-vector product. It's an identity matrix with two vectors of 1s, so the expected answer 6 | is just equal to the problem size. 
7 | 8 | Building the test case 9 | ----- 10 | 11 | test_case.cpp contains a simple kernel which has been copied straight from the Kokkos Tutorials (Exercises/02/Solution). 12 | 13 | Build it with build.sh, after setting the environment variable: 14 | ``` 15 | Kokkos_ROOT="[your/kokkos/installation]/lib/cmake/Kokkos" 16 | ``` 17 | 18 | Running the test case 19 | ---- 20 | 21 | Just launch it! There are optional flags: 22 | 23 | -N : number of rows 24 | -M : number of columns 25 | -S : total size 26 | -nrepeat : how many times to repeat the test (default 100) 27 | 28 | Obviously, not all of N, M & S should be set. The test case will sanity check your args anyway. 29 | 30 | Building Kokkos 31 | ------ 32 | 33 | In case you don't have an existing Kokkos build, there are some build scripts in `./kokkos_build_scripts`. 34 | There are scripts for building Kokkos with SYCL, or CUDA (nvcc or clang). 35 | 36 | Set the following environment variables: 37 | ``` 38 | KOKKOS_INSTALL_DIR=[/your/install/dir] 39 | KOKKOS_SOURCE_DIR=[/your/source/dir] 40 | HWLOC_DIR=[/your/hwloc/dir] 41 | ``` 42 | 43 | HWLOC 44 | ------ 45 | 46 | The [Portable Hardware Locality](https://www.open-mpi.org/projects/hwloc/) (hwloc) package is an optional dependency which enables Kokkos to query the hardware topology of the system on which it is running. If you do not have a HWLOC installation, this option can be removed & Kokkos will be built without HWLOC support. 47 | 48 | SYCL backend 49 | ------------- 50 | 51 | Kokkos should work with any SYCL backend, though the focus of this examples repo is SYCL-For-CUDA. 52 | Previous work at Codeplay has involved running Kokkos with SYCL on Nvidia hardware with Ampere architecture, hence the flag: 53 | ``` 54 | -DKokkos_ARCH_AMPERE80=ON \ 55 | ``` 56 | This flag is not strictly necessary, but it enables Ahead of Time (AoT) compilation, which can give a significant performance gain when building large projects built on Kokkos. 
57 | You should modify the cmake command for your GPU arch. 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /setup-script/sample/mkl_matmult_usm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace usm_smart_ptr; 9 | 10 | int main(int argc, char *argv[]) { 11 | using T = float; 12 | size_t n_laps = 30; 13 | size_t mat_size = 16384; // Bound by your GPU's memory. 14 | 15 | if (argc > 1) { 16 | mat_size = std::stoul(argv[1], nullptr, 10); 17 | } 18 | T alpha = 1, beta = 0; // gemm parameters 19 | 20 | sycl::queue my_queue = try_get_queue(cuda_selector{}); 21 | 22 | std::cout << "Initalizing the matrices..." << std::endl; 23 | long n = mat_size, m = mat_size, k = mat_size, ldA = mat_size, ldB = mat_size, ldC = mat_size; 24 | // Initializing USM shared memory in an std::unique_ptr for auto mem management 25 | auto A = make_unique_ptr(mat_size * mat_size, my_queue); 26 | auto B = make_unique_ptr(mat_size * mat_size, my_queue); 27 | auto C = make_unique_ptr(mat_size * mat_size, my_queue); 28 | fill_rand(A.get(), A.count()); 29 | fill_rand(B.get(), B.count()); 30 | 31 | std::cout << "Running on:" << my_queue.get_device().get_info() << std::endl; 32 | Chrono c("computing + error handling"); 33 | 34 | try { 35 | sycl::event e; 36 | for (size_t i = 0; i < n_laps; i++) { 37 | std::cout << i << '/' << n_laps << '\n'; 38 | using oneapi::mkl::transpose; 39 | using oneapi::mkl::blas::column_major::gemm; 40 | // C <- alpha*OP(A)*OP(B) + beta*C 41 | e = gemm(my_queue, transpose::nontrans, transpose::nontrans, m, n, k, alpha, A.get(), ldA, B.get(), ldB, beta, C.get(), ldC, {e}); 42 | } 43 | e.wait_and_throw(); 44 | } 45 | catch (sycl::exception const &e) { 46 | std::cout << "Caught synchronous SYCL exception during GEMM: " << e.what() << std::endl; 47 | } 48 | catch (std::exception const &e) { 49 | 
std::cout << "Caught synchronous STL exception during GEMM: " << e.what() << std::endl; 50 | } 51 | 52 | uint64_t operations_performed = n_laps * mat_size * mat_size * (2 * mat_size - 1); 53 | std::cout << "Gflops : " << operations_performed / 1000000000 / c.stop() << std::endl; 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- /examples/vector_addition/vector_addition.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * SYCL FOR CUDA : Vector Addition Example 3 | * 4 | * Copyright 2020 Codeplay Software Ltd. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | * 18 | * @File: vector_addition.cpp 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | int main(int argc, char *argv[]) { 28 | constexpr const size_t N = 100000; 29 | const sycl::range VecSize{N}; 30 | 31 | sycl::buffer bufA{VecSize}; 32 | sycl::buffer bufB{VecSize}; 33 | sycl::buffer bufC{VecSize}; 34 | 35 | // Initialize input data 36 | { 37 | sycl::host_accessor h_a{bufA, sycl::write_only}; 38 | sycl::host_accessor h_b{bufB, sycl::write_only}; 39 | 40 | for (int i = 0; i < N; i++) { 41 | h_a[i] = sin(i) * sin(i); 42 | h_b[i] = cos(i) * cos(i); 43 | } 44 | } 45 | 46 | auto CUDASelector = [](sycl::device const &dev) { 47 | if (dev.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda) { 48 | std::cout << " CUDA device found " << std::endl; 49 | return 1; 50 | } else { 51 | return -1; 52 | } 53 | }; 54 | sycl::queue myQueue{CUDASelector}; 55 | 56 | // Command Group creation 57 | auto cg = [&](sycl::handler &h) { 58 | const auto read_t = sycl::access::mode::read; 59 | const auto write_t = sycl::access::mode::write; 60 | 61 | auto a = bufA.get_access(h); 62 | auto b = bufB.get_access(h); 63 | auto c = bufC.get_access(h); 64 | 65 | h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; }); 66 | }; 67 | 68 | myQueue.submit(cg); 69 | 70 | { 71 | sycl::host_accessor h_c{bufC, sycl::read_only}; 72 | 73 | double sum = 0.0f; 74 | for (int i = 0; i < N; i++) { 75 | sum += h_c[i]; 76 | } 77 | std::cout << "Sum is : " << sum << std::endl; 78 | } 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /setup-script/sample/mkl_matmult.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | int main(int argc, char *argv[]) { 8 | using T = float; 9 | size_t n_laps = 30; 10 | size_t mat_size = 16384; 11 | if (argc > 1) { 12 | mat_size = std::stoul(argv[1], nullptr, 10); 13 | } 
14 | T alpha = 1, beta = 0; // gemm parameters 15 | 16 | sycl::queue my_queue = try_get_queue(cuda_selector{}); 17 | 18 | std::cout << "Initalizing the matrices..." << std::endl; 19 | size_t n = mat_size, m = mat_size, k = mat_size, ldA = mat_size, ldB = mat_size, ldC = mat_size; 20 | std::vector A(mat_size * mat_size); 21 | std::vector B(mat_size * mat_size); 22 | std::vector C(mat_size * mat_size); 23 | fill_rand(A); 24 | fill_rand(B); 25 | 26 | // create sycl buffers of matrix data for offloading between device and host 27 | sycl::buffer A_buffer(A.data(), A.size()); 28 | sycl::buffer B_buffer(B.data(), B.size()); 29 | sycl::buffer C_buffer(C.data(), C.size()); 30 | 31 | std::cout << "Running on:" << my_queue.get_device().get_info() << std::endl; 32 | Chrono c("computing + error handling"); 33 | for (size_t i = 0; i < n_laps; i++) { 34 | std::cout << i << '/' << n_laps << '\n'; 35 | // add oneapi::mkl::blas::gemm to execution queue and catch any synchronous exceptions 36 | try { 37 | using oneapi::mkl::transpose; 38 | using oneapi::mkl::blas::column_major::gemm; // row_major not implemented on cublas 39 | // C <- alpha*OP(A)*OP(B) + beta*C 40 | gemm(my_queue, transpose::nontrans, transpose::nontrans, m, n, k, alpha, A_buffer, ldA, B_buffer, ldB, beta, 41 | C_buffer, ldC); 42 | } 43 | catch (sycl::exception const &e) { 44 | std::cout << "Caught synchronous SYCL exception during GEMM: " << e.what() << std::endl; 45 | } 46 | catch (std::exception const &e) { 47 | std::cout << "Caught synchronous STL exception during GEMM: " << e.what() << std::endl; 48 | } 49 | // ensure any asynchronous exceptions caught are handled before proceeding 50 | my_queue.wait_and_throw(); 51 | } 52 | uint64_t operations_performed = n_laps * mat_size * mat_size * (2 * mat_size - 1); 53 | std::cout << "Gflops : " << operations_performed / 1000000000 / c.stop() << std::endl; 54 | 55 | return 0; 56 | } -------------------------------------------------------------------------------- 
/examples/hashing/include/hash_functions/blake2b.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * blake2b.hpp 20 | * 21 | * Description: 22 | * Blake2 hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | constexpr dword BLAKE2B_ROUNDS = 12; 30 | constexpr dword BLAKE2B_BLOCK_LENGTH = 128; 31 | constexpr dword BLAKE2B_CHAIN_SIZE = 8; 32 | constexpr dword BLAKE2B_CHAIN_LENGTH = (BLAKE2B_CHAIN_SIZE * sizeof(qword)); 33 | constexpr dword BLAKE2B_STATE_SIZE = 16; 34 | constexpr dword BLAKE2B_STATE_LENGTH = (BLAKE2B_STATE_SIZE * sizeof(qword)); 35 | 36 | struct blake2b_ctx { 37 | int64_t digestlen{}; 38 | dword keylen{}; 39 | dword pos{}; 40 | qword t0{}; 41 | qword t1{}; 42 | qword f0{}; 43 | byte buff[BLAKE2B_BLOCK_LENGTH] = {0}; 44 | qword chain[BLAKE2B_CHAIN_SIZE] = {0}; 45 | qword state[BLAKE2B_STATE_SIZE] = {0}; 46 | }; 47 | 48 | namespace hash::internal { 49 | class blake2b_kernel; 50 | 51 | using namespace usm_smart_ptr; 52 | 53 | usm_shared_ptr get_blake2b_ctx(sycl::queue &q, const byte *key, dword keylen, dword n_outbit); 54 | 55 | 56 | sycl::event 57 
| launch_blake2b_kernel(sycl::queue &item, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch, dword n_outbit, const byte *key, 58 | dword keylen); 59 | 60 | sycl::event 61 | launch_blake2b_kernel(sycl::queue &item, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch, dword n_outbit, const byte *key, 62 | dword keylen, device_accessible_ptr); 63 | 64 | } -------------------------------------------------------------------------------- /examples/distrib_batch_gemm/vadd_sycl.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * vadd_sycl.cpp 20 | * 21 | * Description: 22 | * Vector addition in SYCL 23 | **************************************************************************/ 24 | /* This example is a very small one designed to show how compact SYCL code 25 | * can be. That said, it includes no error checking and is rather terse. 
*/ 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | constexpr cl::sycl::access::mode sycl_read = cl::sycl::access::mode::read; 32 | constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write; 33 | 34 | /* This is the class used to name the kernel for the runtime. 35 | * This must be done when the kernel is expressed as a lambda. */ 36 | template 37 | class SimpleVadd; 38 | 39 | template 40 | void simple_vadd_sycl(const std::array& VA, const std::array& VB, 41 | std::array& VC) { 42 | cl::sycl::queue deviceQueue; 43 | cl::sycl::range<1> numOfItems{N}; 44 | cl::sycl::buffer bufferA(VA.data(), numOfItems); 45 | cl::sycl::buffer bufferB(VB.data(), numOfItems); 46 | cl::sycl::buffer bufferC(VC.data(), numOfItems); 47 | 48 | deviceQueue.submit([&](cl::sycl::handler& cgh) { 49 | auto accessorA = bufferA.template get_access(cgh); 50 | auto accessorB = bufferB.template get_access(cgh); 51 | auto accessorC = bufferC.template get_access(cgh); 52 | 53 | auto kern = [=](cl::sycl::id<1> wiID) { 54 | accessorC[wiID] = accessorA[wiID] + accessorB[wiID]; 55 | }; 56 | cgh.parallel_for>(numOfItems, kern); 57 | }); 58 | } 59 | 60 | template void simple_vadd_sycl(const std::array& VA, const std::array& VB, 61 | std::array& VC); 62 | template void simple_vadd_sycl(const std::array& VA, const std::array& VB, 63 | std::array& VC); 64 | -------------------------------------------------------------------------------- /examples/vector_addition/vector_addition_usm.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * SYCL FOR CUDA : Vector Addition Example 3 | * 4 | * Copyright 2020 Codeplay Software Ltd. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * @File: vector_addition.cpp 19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | 27 | int main(int argc, char *argv[]) { 28 | constexpr const size_t n = 100000; 29 | 30 | // Create a sycl queue with our CUDASelector 31 | auto CUDASelector = [](sycl::device const &dev) { 32 | if (dev.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda) { 33 | std::cout << " CUDA device found " << std::endl; 34 | return 1; 35 | } else { 36 | return -1; 37 | } 38 | }; 39 | sycl::queue myQueue{CUDASelector}; 40 | 41 | // Host input vectors 42 | double *h_a; 43 | double *h_b; 44 | // Host output vector 45 | double *h_c; 46 | 47 | // Device input vectors 48 | double *d_a; 49 | double *d_b; 50 | // Device output vector 51 | double *d_c; 52 | 53 | // Size, in bytes, of each vector 54 | size_t bytes = n * sizeof(double); 55 | 56 | // Allocate memory for each vector on host 57 | h_a = (double *)malloc(bytes); 58 | h_b = (double *)malloc(bytes); 59 | h_c = (double *)malloc(bytes); 60 | 61 | // Allocate memory for each vector on GPU 62 | d_a = sycl::malloc_device(n, myQueue); 63 | d_b = sycl::malloc_device(n, myQueue); 64 | d_c = sycl::malloc_device(n, myQueue); 65 | 66 | // Initialize vectors on host 67 | for (int i = 0; i < n; i++) { 68 | h_a[i] = sin(i) * sin(i); 69 | h_b[i] = cos(i) * cos(i); 70 | } 71 | 72 | myQueue.memcpy(d_a, h_a, bytes).wait(); 73 | myQueue.memcpy(d_b, h_b, bytes).wait(); 74 | 75 | // Command Group creation 76 | auto cg = [&](sycl::handler &h) { 77 | 
h.parallel_for(sycl::range(n), 78 | [=](sycl::id<1> i) { 79 | d_c[i] = d_a[i] + d_b[i]; 80 | }); 81 | }; 82 | 83 | // Run the kernel defined above 84 | myQueue.submit(cg).wait(); 85 | 86 | // Copy the result back to host 87 | myQueue.memcpy(h_c, d_c, bytes).wait(); 88 | 89 | double sum = 0.0f; 90 | for (int i = 0; i < n; i++) { 91 | sum += h_c[i]; 92 | } 93 | std::cout << "Sum is : " << sum << std::endl; 94 | 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /examples/distrib_batch_gemm/vadd_cuda.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * vadd_cuda.cu 20 | * 21 | * Description: 22 | * Vector addition in CUDA 23 | **************************************************************************/ 24 | #include 25 | 26 | // CUDA kernel. 
Each thread takes care of one element of c 27 | template 28 | __global__ void vecAdd(T *a, T *b, T *c, int n) 29 | { 30 | // Get our global thread ID 31 | int id = blockIdx.x*blockDim.x+threadIdx.x; 32 | 33 | // Make sure we do not go out of bounds 34 | if (id < n) 35 | c[id] = a[id] + b[id]; 36 | } 37 | 38 | template 39 | void simple_vadd_cuda(const std::array& VA, const std::array& VB, 40 | std::array& VC) { 41 | // Device input vectors 42 | T *d_a; 43 | T *d_b; 44 | //Device output vector 45 | T *d_c; 46 | 47 | // Size, in bytes, of each vector 48 | const size_t bytes = N*sizeof(T); 49 | 50 | // Allocate memory for each vector on GPU 51 | cudaMalloc(&d_a, bytes); 52 | cudaMalloc(&d_b, bytes); 53 | cudaMalloc(&d_c, bytes); 54 | 55 | // Copy host vectors to device 56 | cudaMemcpy( d_a, VA.data(), bytes, cudaMemcpyHostToDevice); 57 | cudaMemcpy( d_b, VB.data(), bytes, cudaMemcpyHostToDevice); 58 | 59 | int blockSize, gridSize; 60 | 61 | // Number of threads in each thread block 62 | blockSize = 1024; 63 | 64 | // Number of thread blocks in grid 65 | gridSize = (int)ceil((float)N/blockSize); 66 | 67 | // Execute the kernel 68 | vecAdd<<>>(d_a, d_b, d_c, N); 69 | 70 | // Copy array back to host 71 | cudaMemcpy( VC.data(), d_c, bytes, cudaMemcpyDeviceToHost ); 72 | 73 | // Release device memory 74 | cudaFree(d_a); 75 | cudaFree(d_b); 76 | cudaFree(d_c); 77 | 78 | } 79 | 80 | 81 | template void simple_vadd_cuda(const std::array& VA, const std::array& VB, 82 | std::array& VC); 83 | template void simple_vadd_cuda(const std::array& VA, const std::array& VB, 84 | std::array& VC); 85 | 86 | -------------------------------------------------------------------------------- /examples/sgemm_interop/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #/*************************************************************************** 2 | # * 3 | # * Copyright (C) Codeplay Software Ltd. 
# *
# *  Licensed under the Apache License, Version 2.0 (the "License");
# *  you may not use this file except in compliance with the License.
# *  You may obtain a copy of the License at
# *
# *      http://www.apache.org/licenses/LICENSE-2.0
# *
# *  Unless required by applicable law or agreed to in writing, software
# *  distributed under the License is distributed on an "AS IS" BASIS,
# *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# *  See the License for the specific language governing permissions and
# *  limitations under the License.
# *
# *  Codeplay's SYCL-For-CUDA-Examples
# *
# *  CMakeLists.txt
# *
# *  Description:
# *    CMake for SGEMM
# **************************************************************************/
cmake_minimum_required(VERSION 3.17 FATAL_ERROR)

# Don't complain about empty CMAKE_CUDA_ARCHITECTURES
cmake_policy(SET CMP0104 OLD)

project(sycl_cuda_interop LANGUAGES CXX CUDA)

find_package(CUDAToolkit)

# SYCL installation
if (NOT SYCL_ROOT)
    message(FATAL_ERROR "No SYCL installation detected")
endif(NOT SYCL_ROOT)

set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
set(SYCL_FLAGS "-fsycl"
        "-fsycl-targets=nvptx64-nvidia-cuda"
        "-fsycl-unnamed-lambda"
        "-Wno-linker-warnings")


# Build the CUDA code
add_executable(cuda_sgemm sgemm.cu)
target_compile_features(cuda_sgemm PUBLIC cxx_std_11)
set_target_properties(cuda_sgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_property(TARGET cuda_sgemm PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
target_link_libraries(cuda_sgemm CUDA::toolkit CUDA::cublas)

# Build the SYCL executables. The buffer (sycl_sgemm) and USM
# (sycl_sgemm_usm) variants use identical settings, so configure them in a
# loop instead of duplicating the six commands per target.
foreach(sycl_target IN ITEMS sycl_sgemm sycl_sgemm_usm)
    add_executable(${sycl_target} ${sycl_target}.cpp)
    target_compile_features(${sycl_target} PUBLIC cxx_std_17)
    target_compile_options(${sycl_target} PUBLIC ${SYCL_FLAGS})
    target_compile_definitions(${sycl_target} PUBLIC CUDA_NO_HALF)
    target_link_libraries(${sycl_target} PUBLIC ${SYCL_FLAGS})
    target_include_directories(${sycl_target} PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
    target_link_libraries(${sycl_target} PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)
endforeach()
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * fill_rand.hpp 20 | * 21 | * Description: 22 | * Random generation for containers 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "usm_smart_ptr.hpp" 31 | 32 | using namespace usm_smart_ptr; 33 | 34 | /** 35 | * Fills a container/array with random numbers from positions first to last 36 | */ 37 | template 38 | static inline void do_fill_rand_on_host(ForwardIt first, ForwardIt last) { 39 | static std::random_device dev; 40 | static std::mt19937 engine(dev()); 41 | auto generator = [&]() { 42 | if constexpr (std::is_integral::value) { 43 | static std::uniform_int_distribution distribution; 44 | return distribution(engine); 45 | } else if constexpr (std::is_floating_point::value) { 46 | static std::uniform_real_distribution distribution; 47 | return distribution(engine); 48 | } else if constexpr (std::is_same_v) { 49 | static std::uniform_real_distribution distribution; 50 | return distribution(engine); 51 | } 52 | }; 53 | std::generate(first, last, generator); 54 | } 55 | 56 | 57 | /** 58 | * This function accepts only memory that is accessible from the CPU 59 | * To achive this it uses fantom types that wraps the pointer. 60 | * This could be done by calling the runtime to check where is the 61 | * usm memory allocated, but here we can avoid doing that. 
62 | */ 63 | template 64 | static inline void fill_rand(host_accessible_ptr v, size_t count) { 65 | do_fill_rand_on_host((T *) v, (T *) v + count); 66 | } 67 | 68 | /** 69 | * This function would only accept device allocated memory 70 | */ 71 | /*template 72 | typename std::enable_if::type 73 | fill_rand(const usm_ptr &v, size_t count) { 74 | do_fill_rand_on_device(+v, v + count); 75 | }*/ 76 | 77 | template 78 | static inline void fill_rand(std::vector &v) { 79 | do_fill_rand_on_host(v.begin(), v.end()); 80 | } 81 | -------------------------------------------------------------------------------- /examples/hashing/include/internal/determine_kernel_config.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * determine_kernel_config.hpp 20 | * 21 | * Description: 22 | * Functions for SYCL kernel configuration 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | namespace hash::internal { 31 | 32 | 33 | struct kernel_config { 34 | size_t wg_size; 35 | size_t block; 36 | }; 37 | 38 | 39 | inline kernel_config get_kernel_sizes(const sycl::queue &q, size_t job_size) { 40 | kernel_config config{.wg_size= 1, .block= job_size}; 41 | if (q.get_device().is_gpu()) { 42 | /** 43 | * If the device is a GPU we will try to have as many threads in each work group as possible. 44 | * We need to bound the value of `max_work_group_size` as it can be ANY 64-bit integer 45 | */ 46 | config.wg_size = std::min(std::max(1ul, 2 * q.get_device().get_info()), job_size); 47 | config.wg_size = std::min(config.wg_size, 64ul); //TODO Find a better alternative than a hardcoded 64 ? 48 | config.block = (job_size / config.wg_size) + (job_size % config.wg_size != 0); 49 | } else { 50 | /** 51 | * We need that case because on a CPU, one work group runs on one thread, and threads are expensive to launch 52 | * We'll multiply the thread count by a factor in order to allow the scheduler to better balance the work load. 
53 | */ 54 | config.block = std::min((size_t) std::max(1u, 2 * q.get_device().get_info()), job_size); 55 | config.wg_size = job_size / config.block + (job_size % config.block != 0); 56 | 57 | /* We check that the work groups are not too big */ 58 | size_t max_wg_size = std::min(std::max(1ul, q.get_device().get_info()), job_size); 59 | if (config.wg_size > max_wg_size) { 60 | config.wg_size = max_wg_size; 61 | config.block = (job_size / config.wg_size) + (job_size % config.wg_size != 0); 62 | } 63 | 64 | } 65 | assert(config.block * config.wg_size >= job_size); 66 | return config; 67 | } 68 | 69 | } -------------------------------------------------------------------------------- /setup-script/README.md: -------------------------------------------------------------------------------- 1 | # oneAPI on CUDA setup script 2 | 3 | This script allows you to build and setup the DPC++ compiler, oneMKL and oneDNN with Nvidia GPUs support. 4 | 5 | ### Use 6 | 7 | 1. If needed, set `$DPCPP_HOME` and `$CUDA_ROOT` in the script. 8 | 2. Run the script with ```./build.sh```. 9 | 10 | Everything will be installed to `$DPCPP_HOME/deploy`. 11 | 12 | To build with testing support and run the tests for DPC++, oneMKL, oneTBB, oneDNN and Lapack, 13 | run: ```DPCPP_TESTS=ON ./build.sh``` 14 | 15 | If you want to build libc++, use `CC=clang-X CXX=clang++-X` with another version of clang, gcc won't compile the libc as 16 | the `asm` syntax is not the same. Using this clang/dpc++ won't work either, there's a bug. Then link 17 | with `-stdlib=libc++`. 18 | 19 | ### Environment variables 20 | 21 | Once everything was built, add the first four exports of the script in your environment or add them in your shell's 22 | config file. 23 | 24 | ### Using the CUDA Backend 25 | 26 | 1. 
Use the following selector: 27 | 28 | ```C++ 29 | class CUDADeviceSelector : public sycl::device_selector { 30 | public: 31 | int operator()(const sycl::device &device) const override { 32 |    return device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda ? 1 : -1; 33 | } 34 | }; 35 | ``` 36 | 37 | 2. Build with `-fsycl -fsycl-targets=nvptx64-nvidia-cuda` 38 | 3. For oneMKL, link with: `-lonemkl_blas_cublas -lonemkl` 39 | 40 | ### Intel's openCL 41 | 42 | You can also install Intel's openCL driver (`intel-oneapi-runtime-opencl`) so you can target your CPU 43 | with `-fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice`. For the setup 44 | see [here](https://software.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top/installation/install-using-package-managers.html) 45 | . 46 | 47 | ### Dependencies 48 | 49 | * Even though the latest version of CUDA seems to work with DPC++, we need to use CUDA 10.2 as oneMKL uses cuBLAS 10.2. 50 | On RHEL: `cuda-10-2 libcublas-devel-10-2`. For the setup 51 | see: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#pre-installation-actions. 52 | * `libatomic` 53 | * to be able to target Intel's CPUs with openCL. 54 | * `ninja-build cmake libxml2-devel ...` and other "build essentials" required for building a compiler. 55 | * `libcudnn8 libcudnn8-devel` for oneDNN. 56 | * `gcc-gfortran` or another Fortran compiler for NETLIB Lapack. 57 | * `hwloc-devel` for oneTBB 58 | * `mpfr-devel` when building libcxx 59 | 60 | ### Caveats 61 | 62 | * CLion integration: you need to add `cidr.compiler.clang.fsycl=true` in the registry of CLion [see](https://www.jetbrains.com/help/clion/tuning-the-ide.html?keymap=secondary_macos#configure-platform-properties) 63 | * oneMKL does not support testing when building for both cuBLAS and cuRAND 64 | * for oneMKL with cuda you should use the namespace `oneapi::mkl::blas::column_major::` as cuBLAS is column_major. 
65 | 66 | ### Repositories used 67 | 68 | - OpenCL Headers+Loaders: https://github.com/KhronosGroup/OpenCL-Headers.git 69 | , https://github.com/KhronosGroup/OpenCL-ICD-Loader.git 70 | - DPC++ Compiler: https://github.com/intel/llvm.git 71 | - NETLIB Lapack: https://github.com/Reference-LAPACK/lapack.git 72 | - oneTBB: https://github.com/oneapi-src/oneTBB.git 73 | - oneMKL: https://github.com/oneapi-src/oneMKL.git 74 | - oneDNN: https://github.com/oneapi-src/oneDNN.git 75 | -------------------------------------------------------------------------------- /examples/hashing/cmake/Modules/ComputeCppCompilerChecks.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.3) 2 | 3 | if (CMAKE_COMPILER_IS_GNUCXX) 4 | if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8) 5 | message(FATAL_ERROR "host compiler - gcc version must be > 4.8") 6 | endif () 7 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 8 | if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6) 9 | message(FATAL_ERROR "host compiler - clang version must be > 3.6") 10 | endif () 11 | endif () 12 | 13 | if (MSVC) 14 | set(ComputeCpp_STL_CHECK_SRC __STL_check) 15 | file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp 16 | "#include \n" 17 | "int main() { return 0; }\n") 18 | set(_stl_test_command ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE} 19 | -sycl 20 | ${COMPUTECPP_DEVICE_COMPILER_FLAGS} 21 | -isystem ${ComputeCpp_INCLUDE_DIRS} 22 | -isystem ${OpenCL_INCLUDE_DIRS} 23 | -o ${ComputeCpp_STL_CHECK_SRC}.sycl 24 | -c ${ComputeCpp_STL_CHECK_SRC}.cpp) 25 | execute_process( 26 | COMMAND ${_stl_test_command} 27 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 28 | RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT 29 | ERROR_VARIABLE ComputeCpp_STL_CHECK_ERROR_OUTPUT 30 | OUTPUT_QUIET) 31 | if (NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0) 32 | # Try disabling compiler version checks 33 | execute_process( 34 | COMMAND ${_stl_test_command} 35 | 
-D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH 36 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 37 | RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT 38 | ERROR_VARIABLE ComputeCpp_STL_CHECK_ERROR_OUTPUT 39 | OUTPUT_QUIET) 40 | if (NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0) 41 | # Try again with __CUDACC__ and _HAS_CONDITIONAL_EXPLICIT=0. This relaxes the restritions in the MSVC headers 42 | execute_process( 43 | COMMAND ${_stl_test_command} 44 | -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH 45 | -D_HAS_CONDITIONAL_EXPLICIT=0 46 | -D__CUDACC__ 47 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} 48 | RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT 49 | ERROR_VARIABLE ComputeCpp_STL_CHECK_ERROR_OUTPUT 50 | OUTPUT_QUIET) 51 | if (NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0) 52 | message(FATAL_ERROR "compute++ cannot consume hosted STL headers. This means that compute++ can't \ 53 | compile a simple program in this platform and will fail when used in this system. \ 54 | \n ${ComputeCpp_STL_CHECK_ERROR_OUTPUT}") 55 | else () 56 | list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH 57 | -D_HAS_CONDITIONAL_EXPLICIT=0 58 | -D__CUDACC__) 59 | endif () 60 | else () 61 | list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH) 62 | endif () 63 | endif () 64 | file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp 65 | ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp.sycl) 66 | endif (MSVC) 67 | -------------------------------------------------------------------------------- /examples/hashing/include/tools/intrinsics.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * intrinsics.hpp 20 | * 21 | * Description: 22 | * Intrinsic operations for hashing functions 23 | **************************************************************************/ 24 | /** 25 | Copyright 2021 Codeplay Software Ltd. 26 | 27 | Licensed under the Apache License, Version 2.0 (the "License"); 28 | you may not use these files except in compliance with the License. 29 | You may obtain a copy of the License at 30 | 31 | http://www.apache.org/licenses/LICENSE-2.0 32 | 33 | For your convenience, a copy of the License has been included in this 34 | repository. 35 | 36 | Unless required by applicable law or agreed to in writing, software 37 | distributed under the License is distributed on an "AS IS" BASIS, 38 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 39 | See the License for the specific language governing permissions and 40 | limitations under the License. 
41 | */ 42 | 43 | #pragma once 44 | 45 | #include 46 | 47 | namespace sbb { 48 | 49 | template 50 | static inline std::enable_if_t || std::is_same_v, uint32_t> 51 | upsample(const T hi_hi, const T hi, const T lo, const T lo_lo) { 52 | uint16_t hi_upsampled = (uint16_t(hi_hi) << 8) + uint16_t(hi); 53 | uint16_t lo_upsampled = (uint16_t(lo) << 8) + uint16_t(lo_lo); 54 | return (uint32_t(hi_upsampled) << 16) + uint32_t(lo_upsampled); 55 | } 56 | 57 | 58 | static inline sycl::event memcpy_with_dependency(sycl::queue &q, void *dest, const void *src, size_t numBytes, sycl::event depEvent) { 59 | return q.submit([=](sycl::handler &cgh) { 60 | cgh.depends_on(depEvent); 61 | cgh.memcpy(dest, src, numBytes); 62 | }); 63 | } 64 | 65 | static inline sycl::event memcpy_with_dependency(sycl::queue &q, void *dest, const void *src, size_t numBytes, const std::vector &depEvent) { 66 | return q.submit([&](sycl::handler &cgh) { 67 | cgh.depends_on(depEvent); 68 | cgh.memcpy(dest, src, numBytes); 69 | }); 70 | } 71 | 72 | 73 | template 74 | static inline constexpr uint8_t get_byte(const T &word, const uint &idx) { 75 | static_assert(std::is_integral_v && std::is_unsigned_v); 76 | return (word >> (8 * idx)) & 0xFF; 77 | } 78 | 79 | template 80 | static inline constexpr T set_byte(const T &word, const uint8_t &byte_in, const uint &idx) { 81 | static_assert(std::is_integral_v && std::is_unsigned_v); 82 | T select_mask = ~(T(0xFF) << (idx * 8)); 83 | T new_val = (T(byte_in) & 0xFF) << (idx * 8); 84 | return (word & select_mask) + new_val; 85 | } 86 | } 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /examples/sgemm_interop/sgemm.cu: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 
4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sgemm.cu 20 | * 21 | * Description: 22 | * SGEMM operation in CUDA 23 | **************************************************************************/ 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | #include 30 | 31 | #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC) 32 | 33 | void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) { 34 | if (status != CUBLAS_STATUS_SUCCESS) { 35 | std::cout << msg << " - " << status << std::endl; 36 | exit(EXIT_FAILURE); 37 | } 38 | } 39 | 40 | void inline checkCudaErrorMsg(cudaError status, const char *msg) { 41 | if (status != cudaSuccess) { 42 | std::cout << msg << " - " << status << std::endl; 43 | exit(EXIT_FAILURE); 44 | } 45 | } 46 | 47 | int main() { 48 | constexpr size_t WIDTH = 1024; 49 | constexpr size_t HEIGHT = 1024; 50 | constexpr float ALPHA = 1.0f; 51 | constexpr float BETA = 0.0f; 52 | 53 | std::vector h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT), 54 | h_C(WIDTH * HEIGHT); 55 | 56 | std::cout << "Size: " << h_C.size() << std::endl; 57 | float *d_A, *d_B, *d_C; 58 | 59 | // A is an identity matrix 60 | std::fill(std::begin(h_A), std::end(h_A), 0.0f); 61 | for (size_t i = 0; i < WIDTH; i++) { 62 | h_A[i * WIDTH + i] = 1.0f; 63 | } 64 | 65 | // B is a matrix fill with 1 66 | std::fill(std::begin(h_B), std::end(h_B), 1.0f); 67 | 68 | 
const size_t numBytes = WIDTH * HEIGHT * sizeof(float); 69 | 70 | CHECK_ERROR(cudaMalloc((void **)&d_A, numBytes)); 71 | CHECK_ERROR(cudaMalloc((void **)&d_B, numBytes)); 72 | CHECK_ERROR(cudaMalloc((void **)&d_C, numBytes)); 73 | 74 | CHECK_ERROR(cudaMemcpy(d_A, h_A.data(), numBytes, cudaMemcpyHostToDevice)); 75 | CHECK_ERROR(cudaMemcpy(d_B, h_B.data(), numBytes, cudaMemcpyHostToDevice)); 76 | 77 | cublasHandle_t handle; 78 | CHECK_ERROR(cublasCreate(&handle)); 79 | 80 | // C = A * B 81 | CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, 82 | WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, d_C, 83 | WIDTH)); 84 | 85 | CHECK_ERROR(cudaMemcpy(h_C.data(), d_C, numBytes, cudaMemcpyDeviceToHost)); 86 | 87 | // C must be all ones 88 | const bool allEqual = std::all_of(std::begin(h_C), std::end(h_C), 89 | [](float num) { return num == 1; }); 90 | 91 | if (!allEqual) { 92 | std::cout << " Incorrect result " << std::endl; 93 | } else { 94 | std::cout << " Correct! " << std::endl; 95 | } 96 | 97 | CHECK_ERROR(cublasDestroy(handle)); 98 | CHECK_ERROR(cudaFree(d_A)); 99 | CHECK_ERROR(cudaFree(d_B)); 100 | CHECK_ERROR(cudaFree(d_C)); 101 | 102 | return allEqual ? 
EXIT_SUCCESS : EXIT_FAILURE; 103 | } 104 | -------------------------------------------------------------------------------- /setup-script/sample/include/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace usm_smart_ptr; 10 | 11 | class cuda_selector : public sycl::device_selector { 12 | public: 13 | int operator()(const sycl::device &device) const override { 14 | return device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda; 15 | //return device.is_gpu() && (device.get_info().find("CUDA") != std::string::npos); 16 | } 17 | }; 18 | 19 | /** 20 | * Tries to get a CUDA device else returns the host device 21 | */ 22 | sycl::queue try_get_queue(const sycl::device_selector &selector) { 23 | sycl::device dev; 24 | try { 25 | dev = sycl::device(selector); 26 | } 27 | catch (...) { 28 | dev = sycl::device(sycl::host_selector()); 29 | std::cout << "Warning: GPU device not found! 
Fall back on: " << dev.get_info() 30 | << std::endl; 31 | } 32 | auto exception_handler = [](const sycl::exception_list &exceptions) { 33 | for (std::exception_ptr const &e : exceptions) { 34 | try { 35 | std::rethrow_exception(e); 36 | } 37 | catch (sycl::exception const &e) { 38 | std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl; 39 | } 40 | catch (std::exception const &e) { 41 | std::cout << "Caught asynchronous STL exception: " << e.what() << std::endl; 42 | } 43 | } 44 | }; 45 | 46 | return sycl::queue(dev, exception_handler); 47 | } 48 | 49 | 50 | /** 51 | * Fills a container/array with random numbers from positions first to last 52 | */ 53 | template 54 | void do_fill_rand_on_host(ForwardIt first, ForwardIt last) { 55 | static std::random_device dev; 56 | static std::mt19937 engine(dev()); 57 | auto generator = [&]() { 58 | if constexpr (std::is_integral::value) { 59 | static std::uniform_int_distribution distribution; 60 | return distribution(engine); 61 | } else if constexpr (std::is_floating_point::value) { 62 | static std::uniform_real_distribution distribution; 63 | return distribution(engine); 64 | } else if constexpr (std::is_same_v) { 65 | static std::uniform_real_distribution distribution; 66 | return distribution(engine); 67 | } 68 | }; 69 | std::generate(first, last, generator); 70 | } 71 | 72 | 73 | /** 74 | * This function accepts only memory that is accessible from the CPU 75 | * To achive this it uses fantom types that wraps the pointer. 76 | * This could be done by calling the runtime to check where is the 77 | * usm memory allocated, but here we can avoid doing that. 
78 | */ 79 | template 80 | typename std::enable_if::type 81 | fill_rand(const usm_ptr &v, size_t count) { 82 | do_fill_rand_on_host(+v, v + count); 83 | } 84 | 85 | /** 86 | * This function would only accept device allocated memory 87 | */ 88 | /*template 89 | typename std::enable_if::type 90 | fill_rand(const usm_ptr &v, size_t count) { 91 | do_fill_rand_on_device(+v, v + count); 92 | }*/ 93 | 94 | template 95 | void fill_rand(std::vector &v) { 96 | do_fill_rand_on_host(v.begin(), v.end()); 97 | } 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SYCL for CUDA examples 2 | ========================== 3 | 4 | [![OpenSSF Scorecard](https://api.scorecard.dev/projects/github.com/codeplaysoftware/SYCL-For-CUDA-Examples/badge)](https://scorecard.dev/viewer/?uri=github.com/codeplaysoftware/SYCL-For-CUDA-Examples) 5 | 6 | This repository contains examples that demonstrate how to use the CUDA backend 7 | in SYCL. 8 | 9 | The examples are built and test in Linux with GCC 7.4, NVCC 10.1 and the 10 | experimental support for CUDA in the DPC++ SYCL implementation. 11 | 12 | CUDA is a registered trademark of NVIDIA Corporation 13 | SYCL is a trademark of the Khronos Group Inc. 14 | 15 | Prerequisites 16 | ------------- 17 | 18 | These examples are intended to be used with this [docker image](https://hub.docker.com/r/ruyman/dpcpp_cuda_examples). 19 | It provides all the examples, libraries and the required environment variables. 20 | 21 | [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker) must be installed to run the image. 22 | 23 | A useful guide for setting up docker and the NVIDIA Container Toolkit can be found [here](https://www.pugetsystems.com/labs/hpc/Workstation-Setup-for-Docker-with-the-New-NVIDIA-Container-Toolkit-nvidia-docker2-is-deprecated-1568). 
24 | 25 | Getting Started 26 | ------------- 27 | 28 | Once docker and the NVIDIA Container Toolkit are installed, we can create a new container and run the examples within it. 29 | 30 | ``` sh 31 | $ sudo docker run --gpus all -it ruyman/dpcpp_cuda_examples 32 | ``` 33 | 34 | Once inside the docker image, navigate to `/home/examples/` to find a local clone of this repo. Make sure to pull the latest changes: 35 | 36 | ``` sh 37 | $ cd /home/examples/SYCL-For-CUDA-Examples 38 | $ git pull 39 | ``` 40 | 41 | Refer to each example and/or exercise for detailed instructions on how to run it. 42 | 43 | Examples 44 | ========= 45 | 46 | [Vector Addition](examples/vector_addition) 47 | -------------------------------------------- 48 | 49 | This trivial example can be used to compare a simple vector addition in CUDA to 50 | an equivalent implementation in SYCL for CUDA. The aim of the example is also 51 | to highlight how to build an application with SYCL for CUDA using DPC++ support, 52 | for which an example CMakefile is provided. 53 | 54 | [Fortran Interface](examples/fortran_interface) 55 | -------------------------------------------- 56 | 57 | This demonstrates an example of how to call a SYCL function from a CUDA fortran code. 58 | 59 | [MPI](examples/MPI) 60 | -------------------------------------------- 61 | 62 | This example shows how to integrate MPI calls within the SYCL DAG using Host Tasks for integration. 63 | 64 | 65 | [SGEMM Interop](examples/sgemm_interop) 66 | -------------------------- 67 | 68 | This demonstrates using SYCL's `host_task` for CUDA interoperability, calling CUBLAS's SGEMM routine for matrix multiplication. 69 | 70 | [Distributed (MPI) GEMM](examples/distrib_batch_gemm) 71 | -------------------------------------------- 72 | 73 | This example combines the MPI and SGEMM Interop examples to distribute a matrix multiplication problem between MPI ranks. 
74 | 75 | [Kokkos](examples/kokkos) 76 | -------------------------------------------- 77 | 78 | [Kokkos](https://github.com/kokkos/kokkos) is a middle-layer for scientific computing which features a SYCL backend. This example 79 | shows a small Kokkos test case (vector-matrix-vector multiplication), adapted from a test case in the Kokkos repo; 80 | there is no SYCL code in the example, but it includes scripts to build Kokkos with SYCL support. 81 | 82 | [Hashing Algorithms](examples/hashing) 83 | -------------------------------------------- 84 | 85 | This example is slightly different - it benchmarks a series of hashing algorithms. 86 | 87 | -------------------------------------------------------------------------------- /examples/hashing/include/hash_functions/blake3.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * blake3.hpp 20 | * 21 | * Description: 22 | * Blake3 hash function 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | 29 | 30 | #include 31 | #include 32 | 33 | #define BLAKE3_VERSION_STRING "0.3.7" 34 | #define BLAKE3_KEY_LEN 32 35 | #define BLAKE3_OUT_LEN 32 36 | #define BLAKE3_BLOCK_LEN 64 37 | #define BLAKE3_CHUNK_LEN 1024 38 | #define BLAKE3_MAX_DEPTH 54 39 | 40 | // This struct is a private implementation detail. It has to be here because 41 | // it's part of blake3_hasher below. 42 | struct blake3_chunk_state { 43 | uint32_t cv[8]; 44 | uint64_t chunk_counter; 45 | uint8_t buf[BLAKE3_BLOCK_LEN]; 46 | uint8_t buf_len; 47 | uint8_t blocks_compressed; 48 | uint8_t flags; 49 | }; 50 | 51 | struct blake3_hasher { 52 | uint32_t key[8]; 53 | blake3_chunk_state chunk; 54 | uint8_t cv_stack_len; 55 | // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, 56 | // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk 57 | // requires a 4th entry, rather than merging everything down to 1, because we 58 | // don't know whether more input is coming. This is different from how the 59 | // reference implementation does things. 
60 | uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; 61 | }; 62 | 63 | //const char *blake3_version(); 64 | 65 | //void blake3_hasher_init(blake3_hasher *self); 66 | 67 | //void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]); 68 | 69 | //void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); 70 | 71 | //void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, size_t context_len); 72 | 73 | //void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len); 74 | 75 | //void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len); 76 | 77 | //void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len); 78 | 79 | 80 | #include 81 | 82 | #include 83 | #include 84 | #include 85 | 86 | // internal flags 87 | enum blake3_flags { 88 | CHUNK_START = 1 << 0, 89 | CHUNK_END = 1 << 1, 90 | PARENT = 1 << 2, 91 | ROOT = 1 << 3, 92 | KEYED_HASH = 1 << 4, 93 | DERIVE_KEY_CONTEXT = 1 << 5, 94 | DERIVE_KEY_MATERIAL = 1 << 6, 95 | }; 96 | 97 | 98 | // There are some places where we want a static size that's equal to the 99 | // MAX_SIMD_DEGREE, but also at least 2. 100 | #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? 
MAX_SIMD_DEGREE : 2) 101 | 102 | 103 | namespace hash::internal { 104 | using namespace usm_smart_ptr; 105 | 106 | sycl::event 107 | launch_blake3_kernel(sycl::queue &item, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch, dword n_outbit, const byte *key, dword keylen); 108 | 109 | int test_blake3(); 110 | 111 | } -------------------------------------------------------------------------------- /examples/vector_addition/README.md: -------------------------------------------------------------------------------- 1 | Vector addition 2 | =============================== 3 | 4 | This trivial example can be used to compare a simple vector addition in CUDA to 5 | an equivalent implementation in SYCL for CUDA. The aim of the example is also 6 | to highlight how to build an application with SYCL for CUDA using DPC++ support, 7 | for which an example CMakefile is provided. For detailed documentation on how to 8 | migrate from CUDA to SYCL, see [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers). 9 | 10 | Pre-requisites 11 | --------------- 12 | 13 | These instructions assume that example [docker image](https://hub.docker.com/r/ruyman/dpcpp_cuda_examples/dockerfile) is being used. This image 14 | simplifies accessing these examples as the environment is set up correctly. 15 | For details on how to get started with the example docker image, refer to the 16 | root README file. 17 | 18 | Building the example 19 | --------------------- 20 | 21 | ``` sh 22 | $ mkdir build && cd build 23 | $ cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++ 24 | $ make -j 8 25 | ``` 26 | 27 | This should produce two binaries, `vector_addition` and `sycl_vector_addition` . 28 | The former is the unmodified CUDA source and the second is the SYCL for CUDA 29 | version. 
30 | 31 | Running the example 32 | -------------------- 33 | 34 | ``` 35 | $ ./sycl_vector_addition 36 | $ ./vector_addition 37 | ``` 38 | 39 | CMake Build script 40 | ------------------------ 41 | 42 | The provided CMake build script uses the native CUDA support to build the 43 | CUDA application. It also serves as a check that all CUDA requirements 44 | on the system are available (such as an installation of CUDA on the system). 45 | 46 | Two flags are required: `-DSYCL_ROOT` , which must point to the place where the 47 | DPC++ compiler is installed, and `-DCMAKE_CXX_COMPILER` , which must point to 48 | the Clang compiler provided by DPC++. 49 | 50 | The CMake target `sycl_vector_addition` will build the SYCL version of 51 | the application. 52 | 53 | Note the variable `SYCL_FLAGS` is used to store the Clang flags that enable 54 | the compilation of a SYCL application ( `-fsycl` ) but also the flag that specify 55 | which targets are built ( `-fsycl-targets` ). In this case, we will build the example 56 | for both NVPTX and SPIR64. This means the kernel for the vector addition will be 57 | compiled for both backends, and runtime selection to the right queue will 58 | decide which variant to use. 59 | 60 | Note the project is built with C++17 support, which enables the usage of 61 | [deduction guides](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) to reduce the number of template parameters used. 62 | 63 | SYCL Vector Addition code 64 | -------------------------- 65 | 66 | The vector addition example uses a simple approach to implement with a plain 67 | kernel that performs the add. Vectors are stored directly in buffers. Data is 68 | initialized on the host using host accessors. This approach avoids creating 69 | unnecessary storage on the host, and facilitates the SYCL runtime to use 70 | optimized memory paths. 
71 | 72 | The SYCL queue created later on uses a custom `CUDASelector` to select a CUDA 73 | device, or bail out if it's not there. The CUDA selector uses the 74 | `info::device::driver_version` to identify the device exported by the CUDA 75 | backend. If the NVIDIA OpenCL implementation is available on the system, it 76 | will be reported as another SYCL device. The driver version is the best way to 77 | differentiate between the two. 78 | 79 | The command group is created as a lambda expression that takes the 80 | `sycl::handler` parameter. Accessors are obtained from buffers using the 81 | `get_access` method. Finally the `parallel_for` with the SYCL kernel is invoked 82 | as usual. 83 | 84 | The command group is submitted to a queue which will convert all the operations 85 | into CUDA commands that will be executed once the host accessor is encountered 86 | later on. 87 | 88 | The host accessor will trigger a copy of the data back to the host, and then 89 | the values are reduced into a single sum element. 90 | -------------------------------------------------------------------------------- /examples/hashing/include/internal/handle.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * handle.hpp 20 | * 21 | * Description: 22 | * Handler for objects for hashing functions 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include "config.hpp" 29 | #include "../tools/usm_smart_ptr.hpp" 30 | 31 | 32 | namespace hash { 33 | using namespace usm_smart_ptr; 34 | struct handle_item { 35 | usm_unique_ptr input_dev_data_; 36 | usm_unique_ptr output_dev_data_; 37 | sycl::event dev_e_; 38 | }; 39 | 40 | /** 41 | * Holds unique pointers to the memory used by the different queues. 42 | * This object is thus not copyable. 43 | */ 44 | class handle { 45 | private: 46 | std::vector items_{}; 47 | public: 48 | /** 49 | * Move constructor. 50 | */ 51 | explicit handle(std::vector &&input) noexcept: 52 | items_(std::move(input)) { 53 | } 54 | 55 | handle() = default; 56 | 57 | /** 58 | * Rule of five, we need to redefine it. 59 | */ 60 | handle &operator=(handle &&other) noexcept { 61 | std::swap(items_, other.items_); 62 | return *this; 63 | } 64 | 65 | /** 66 | * Waits on all the events, then clears the vector 67 | * which results in freeing the USM allocated memory 68 | */ 69 | void wait() { 70 | for (auto &worker: items_) { 71 | worker.dev_e_.wait(); 72 | } 73 | items_.clear(); 74 | } 75 | 76 | /** 77 | * Waits and throws on all the events, then clears the queue 78 | * which results in freeing the USM allocated memory 79 | */ 80 | void wait_and_throw() { 81 | for (auto &worker: items_) { 82 | worker.dev_e_.wait_and_throw(); 83 | } 84 | items_.clear(); 85 | } 86 | 87 | 88 | /** 89 | * No copy constructor. 90 | */ 91 | handle(const handle &) = delete; 92 | 93 | /** 94 | * No assignement operator. 95 | */ 96 | handle &operator=(const handle) = delete; 97 | 98 | /** 99 | * We need to join all the SYCL kernels/events before freeing the memory they use. 
100 | */ 101 | ~handle() noexcept { 102 | if (!items_.empty()) { 103 | std::cerr << "Destroying handled that still holds data. Did you forget to call .wait()?\n"; 104 | for (auto &e: items_) { 105 | try { 106 | e.dev_e_.wait_and_throw(); 107 | } 108 | catch (sycl::exception const &e) { 109 | std::cerr << "Caught asynchronous SYCL exception at handle destruction: " << e.what() << std::endl; 110 | } 111 | catch (std::exception const &e) { 112 | std::cerr << "Caught asynchronous STL exception at handle destruction: " << e.what() << std::endl; 113 | } 114 | } 115 | items_.clear(); 116 | } 117 | } 118 | }; 119 | } -------------------------------------------------------------------------------- /examples/hashing/include/internal/async_api.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * async_api.hpp 20 | * 21 | * Description: 22 | * Asynchronous API foir hashing functions 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include "common.hpp" 28 | #include "handle.hpp" 29 | 30 | namespace hash { 31 | /** 32 | * Base class for hashing 33 | * @tparam M 34 | * @tparam n_outbit 35 | */ 36 | template 37 | class hasher { 38 | private: 39 | runners runners_; 40 | public: 41 | explicit hasher(runners v) : runners_(std::move(v)) {} 42 | 43 | handle hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, byte *key, dword keylen) { 44 | size_t size = runners_.size(); 45 | std::vector handles; 46 | handles.reserve(size); 47 | auto items = internal::get_hash_queue_work_item(runners_, indata, inlen, outdata, n_batch); 48 | for (size_t i = 0; i < size; ++i) { 49 | handles.emplace_back(internal::hash_with_data_copy(items[i], key, keylen)); 50 | } 51 | return handle(std::move(handles)); 52 | } 53 | 54 | handle hash(const byte *indata, dword inlen, byte *outdata, dword n_batch) { 55 | return hash(indata, inlen, outdata, n_batch, nullptr, 0); 56 | } 57 | 58 | 59 | }; 60 | 61 | 62 | /** 63 | * Blake 2B 64 | * @tparam n_outbit 65 | */ 66 | template 67 | class hasher { 68 | 69 | private: 70 | hash::runners runners_; 71 | std::vector> keyed_ctxts_{}; 72 | public: 73 | explicit hasher(const hash::runners &v, const byte *key, dword keylen) : runners_(v) { 74 | size_t size = v.size(); 75 | keyed_ctxts_.reserve(size); 76 | 77 | for (size_t i = 0; i < size; ++i) { 78 | keyed_ctxts_.emplace_back(internal::get_blake2b_ctx(runners_[i].q, key, keylen, n_outbit)); 79 | } 80 | 81 | } 82 | 83 | handle hash(const byte *indata, dword inlen, byte *outdata, dword n_batch) { 84 | size_t size = runners_.size(); 85 | std::vector handles; 86 | handles.reserve(2 * size); 87 | auto items = internal::get_hash_queue_work_item(runners_, indata, 
inlen, outdata, n_batch); 88 | for (size_t i = 0; i < size; ++i) { 89 | handles.emplace_back(internal::hash_with_data_copy(items[i], nullptr, 0, keyed_ctxts_[i].get())); 90 | } 91 | return handle(std::move(handles)); 92 | } 93 | }; 94 | 95 | 96 | using md2 = hasher; 97 | using md5 = hasher; 98 | using sha1 = hasher; 99 | using sha256 = hasher; 100 | 101 | template 102 | using keccak = hasher; 103 | 104 | template 105 | using sha3 = hasher; 106 | 107 | template 108 | using blake2b = hasher; 109 | } 110 | -------------------------------------------------------------------------------- /examples/hashing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | #/*************************************************************************** 2 | # * 3 | # * Copyright (C) Codeplay Software Ltd. 4 | # * 5 | # * Licensed under the Apache License, Version 2.0 (the "License"); 6 | # * you may not use this file except in compliance with the License. 7 | # * You may obtain a copy of the License at 8 | # * 9 | # * http://www.apache.org/licenses/LICENSE-2.0 10 | # * 11 | # * Unless required by applicable law or agreed to in writing, software 12 | # * distributed under the License is distributed on an "AS IS" BASIS, 13 | # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # * See the License for the specific language governing permissions and 15 | # * limitations under the License. 16 | # * 17 | # * Codeplay's SYCL-For-CUDA-Examples 18 | # * 19 | # * CMakeLists.txt 20 | # * 21 | # * Description: 22 | # * CMake for hash functions 23 | # **************************************************************************/ 24 | cmake_minimum_required(VERSION 3.0) 25 | project(SYCL_HASH LANGUAGES CXX) 26 | set(CMAKE_CXX_STANDARD 17) 27 | 28 | option(VERBOSE_LIB "Adds various prints in the code" OFF) 29 | if (VERBOSE_LIB) 30 | #message(WARNING "Verbose mode on. 
Did you forget it?") 31 | add_compile_definitions(VERBOSE_HASH_LIB) 32 | endif () 33 | 34 | # If you're using the DPCPP compiler, these flags will be used. Set here the devies you want to target 35 | set(DPCPP_FLAGS -fsycl -fsycl-targets=spir64_x86_64,nvptx64-nvidia-cuda -Xcuda-ptxas -v -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 -Wno-linker-warnings) 36 | include(cmake/FindSYCL.cmake) 37 | 38 | # Default C++ Flags for warnings and optimisation 39 | set(WARNING_FLAGS "-Wall -Wextra -Wshadow -Wdouble-promotion -fno-common -Winit-self -Wuninitialized -Wmissing-declarations -Woverloaded-virtual") 40 | set(EXTRA_W_FLAGS "-pedantic -Wall -Wextra -Wcast-align -Wctor-dtor-privacy -Wdisabled-optimization -Wformat=2 -Winit-self -Wmissing-declarations -Wmissing-include-dirs -Woverloaded-virtual -Wredundant-decls -Wshadow -Wsign-conversion -Wsign-promo -Wstrict-overflow=5") #-Wnoexcept -Wold-style-cast -Wstrict-null-sentinel -switch-default -Wlogical-op 41 | set(DISABLED_WARNINGS "-Wno-c++20-extensions -Wno-inline-namespace-reopened-noninline -Wno-undef -Wno-unused -Wno-unused-command-line-argument") 42 | set(OPT_FLAGS "-march=native -mtune=native -Ofast -fomit-frame-pointer") 43 | 44 | # Adding the flags to the targets 45 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${OPT_FLAGS} ${DISABLED_WARNINGS}") 46 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${EXTRA_W_FLAGS} ${DISABLED_WARNINGS} -g -Og") 47 | 48 | include_directories(${PROJECT_BINARY_DIR} ${PROJECT_SOURCE_DIR}/include) 49 | 50 | 51 | set(sycl_hash_all_kernels 52 | src/hash_functions/sha256.cpp 53 | src/hash_functions/blake2b.cpp 54 | src/hash_functions/sha1.cpp 55 | src/hash_functions/md5.cpp 56 | src/hash_functions/keccak.cpp 57 | src/hash_functions/md2.cpp 58 | src/tools/queue_tester.cpp 59 | ) 60 | 61 | set(sycl_hash_all_sources 62 | src/benchmarks/misc.hpp 63 | include/sycl_hash.hpp 64 | include/internal/config.hpp 65 | include/internal/handle.hpp 66 | 
include/internal/common.hpp 67 | include/internal/determine_kernel_config.hpp 68 | include/internal/sync_api.hpp 69 | include/internal/async_api.hpp 70 | include/hash_functions/sha256.hpp 71 | include/hash_functions/blake2b.hpp 72 | include/hash_functions/sha1.hpp 73 | include/hash_functions/md5.hpp 74 | include/hash_functions/keccak.hpp 75 | include/hash_functions/md2.hpp 76 | include/tools/fill_rand.hpp 77 | include/tools/sycl_queue_helpers.hpp 78 | include/tools/usm_smart_ptr.hpp 79 | include/tools/runtime_byte_array.hpp 80 | include/tools/intrinsics.hpp 81 | ) 82 | 83 | add_library(sycl_hash SHARED ${sycl_hash_all_sources} ${sycl_hash_all_kernels}) 84 | add_sycl_to_target(TARGET sycl_hash SOURCES ${sycl_hash_all_kernels}) 85 | 86 | add_executable(demo demo_main.cpp src/benchmarks/misc.hpp) 87 | target_link_libraries(demo PUBLIC sycl_hash) 88 | add_sycl_to_target(TARGET demo SOURCES demo_main.cpp) 89 | 90 | include(tests/CMakeLists.txt) 91 | -------------------------------------------------------------------------------- /examples/hashing/demo_main.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * demo_main.cpp 20 | * 21 | * Description: 22 | * Main function for hashing demo 23 | **************************************************************************/ 24 | /*Copyright 2021 Codeplay Software Ltd. 25 | 26 | Licensed under the Apache License, Version 2.0 (the "License"); 27 | you may not use these files except in compliance with the License. 28 | You may obtain a copy of the License at 29 | 30 | http://www.apache.org/licenses/LICENSE-2.0 31 | 32 | For your convenience, a copy of the License has been included in this 33 | repository. 34 | 35 | Unless required by applicable law or agreed to in writing, software 36 | distributed under the License is distributed on an "AS IS" BASIS, 37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 38 | See the License for the specific language governing permissions and 39 | limitations under the License. */ 40 | 41 | 42 | #include 43 | #include 44 | #include "src/benchmarks/misc.hpp" 45 | 46 | int main() { 47 | size_t input_block_size = 512 * 1024; //bytes 48 | size_t n_blocs = 1024 * 6; 49 | size_t n_iters = 40; 50 | auto cpu_q = try_get_queue(sycl::cpu_selector{}); 51 | auto cuda_q = try_get_queue(cuda_selector{}); 52 | 53 | // auto ptr = (byte *) malloc(input_block_size * 100 * sizeof(byte)); 54 | // auto out = (byte *) malloc(hash::get_block_size() * 100 * sizeof(byte)); 55 | // double cpu_speed = benchmark_one_queue(cpu_q, input_block_size, 80); 56 | // double gpu_speed = benchmark_one_queue(cuda_q, input_block_size, n_blocs, 5); 57 | // hash::sha256 hasher({{cpu_q, cpu_speed}, {cuda_q, gpu_speed}}); 58 | // auto e = hasher.hash(ptr, input_block_size, out, 100); 59 | // hash::compute_md2(cpu_q, ptr, input_block_size, out, n_blocs); 60 | // hash::compute_sha3<512>(cpu_q, ptr, input_block_size, out, n_blocs); 61 | 62 | 63 | //GPU 64 | run_benchmark(cuda_q, input_block_size, n_blocs, n_iters); 65 | run_benchmark(cuda_q, input_block_size, 
n_blocs, n_iters); 66 | run_benchmark(cuda_q, input_block_size, n_blocs, n_iters); 67 | run_benchmark(cuda_q, input_block_size, n_blocs, n_iters); 68 | run_benchmark(cuda_q, input_block_size, n_blocs, n_iters); 69 | run_benchmark(cuda_q, input_block_size, n_blocs, n_iters); 70 | run_benchmark(cuda_q, input_block_size, n_blocs, n_iters); 71 | 72 | //CPU 73 | run_benchmark(cpu_q, input_block_size, n_blocs, n_iters); 74 | run_benchmark(cpu_q, input_block_size, n_blocs, n_iters); 75 | run_benchmark(cpu_q, input_block_size, n_blocs, n_iters); 76 | run_benchmark(cpu_q, input_block_size, n_blocs, n_iters); 77 | run_benchmark(cpu_q, input_block_size, n_blocs, n_iters); 78 | run_benchmark(cpu_q, input_block_size, n_blocs, n_iters); 79 | 80 | 81 | // CPU == GPU ?? 82 | compare_two_devices(cuda_q, cpu_q, 1024, 4096); 83 | compare_two_devices(cuda_q, cpu_q, 1024, 4096); 84 | compare_two_devices(cuda_q, cpu_q, 1024, 4096); 85 | compare_two_devices(cuda_q, cpu_q, 1024, 4096); 86 | compare_two_devices(cuda_q, cpu_q, 1024, 4096); 87 | compare_two_devices(cuda_q, cpu_q, 1024, 4096); 88 | } 89 | 90 | -------------------------------------------------------------------------------- /examples/hashing/tests/tests_helpers.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// NOTE(review): HTML extraction stripped the angle-bracket text in this file
// (#include targets, template parameter lists, get_info<...> arguments).
// The bare `template` below presumably read `template<bool strict>`, given
// the `if constexpr (strict)` in the body — confirm against the repository.

// Build a queue on `in_dev`, optionally smoke-testing it; on any failure,
// fall back to the host device so callers always get a usable queue.
template
static inline sycl::queue try_get_queue_with_device(const sycl::device &in_dev) {
    // Asynchronous exceptions are only reported to stdout, never rethrown.
    auto exception_handler = [](const sycl::exception_list &exceptions) {
        for (std::exception_ptr const &e: exceptions) {
            try {
                std::rethrow_exception(e);
            }
            catch (sycl::exception const &e) {
                std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl;
            }
            catch (std::exception const &e) {
                std::cout << "Caught asynchronous STL exception: " << e.what() << std::endl;
            }
        }
    };

    sycl::device dev;
    sycl::queue q;
    try {
        dev = in_dev;
        q = sycl::queue(dev, exception_handler);
        if constexpr (strict) {
            // Run a trivial kernel to prove the queue actually works.
            if (dev.is_cpu() || dev.is_gpu()) { //Only CPU and GPU not host, dsp, fpga, ?...
                queue_tester(q);
            }
        }
    }
    catch (...) {
        // Requested device unusable: fall back to the host device.
        dev = sycl::device(sycl::host_selector());
        q = sycl::queue(dev, exception_handler);
        std::cout << "Warning: Expected device not found! Fall back on: " << dev.get_info() << std::endl;
    }
    return q;
}


// Print `len` bytes as zero-padded two-digit hex, then restore decimal mode.
void print_hex(byte *ptr, dword len) {
    for (size_t i = 0; i < len; ++i) // only the first block
        std::cout << std::hex << std::setfill('0') << std::setw(2) << (int) (ptr[i]) << " ";
    std::cout << std::dec << std::endl << std::endl;
}

// Tile the `item_len`-byte buffer `in` `count` times into `out`.
// `out` must hold at least item_len * count bytes.
void duplicate(byte *in, byte *out, dword item_len, dword count) {
    for (size_t i = 0; i < count; ++i) {
        std::memcpy(out + item_len * i, in, item_len);
    }
}


// Build one queue per SYCL device visible on the system.
std::vector get_all_queues_once() {
    std::vector devices1 = sycl::device::get_devices();
    std::vector queues1;
    std::for_each(devices1.begin(), devices1.end(), [&](auto &d) { queues1.emplace_back(try_get_queue_with_device(d)); });
    return queues1;
}


// Memoized accessor: the device scan runs only on the first call.
std::vector get_all_queues() {
    static std::vector queues = get_all_queues_once();
    return queues;
}


// Invoke `f` once per device with a single-queue runner set of weight 1.
template
void for_all_workers(Func f) {
    static auto queues = get_all_queues();
    {
        for (const auto &q: queues) {
            std::cout << "Running on: " << q.get_device().get_info() << std::endl;
            f(hash::runners(1, hash::runner{q, 1}));
        }
    }
}


// Invoke `f` on every ordered pair of devices (a device paired with itself included).
template
void for_all_workers_pairs(Func f) {
    auto queues = get_all_queues();
    for (const auto &q1: queues) {
        for (const auto &q2: queues) {
            std::cout << "Running on: " << q1.get_device().get_info() << " and: " << q2.get_device().get_info() << std::endl;
            f({{q1, 1},
               {q2, 1}});
        }

    }

}
hash::compute(queue, input_ptr, input_block_size, output_hashes, n_blocs);
hash::compute(queue, input_ptr, input_block_size, output_hashes, n_blocs);
This is the overload you would call if you have C++-allocated pointers to your memory (array on the stack, malloc, new[], ...). When calling this function, the memory will be copied behind the scenes to the device, as that is the safest behaviour.
95 | -------------------------------------------------------------------------------- /examples/sgemm_interop/sycl_sgemm_usm.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sycl_sgemm_usm.cpp 20 | * 21 | * Description: 22 | * SGEMM operation in SYCL with USM 23 | **************************************************************************/ 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | 30 | #include 31 | #include 32 | 33 | #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC) 34 | 35 | void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) { 36 | if (status != CUBLAS_STATUS_SUCCESS) { 37 | std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl; 38 | exit(EXIT_FAILURE); 39 | } 40 | } 41 | 42 | void inline checkCudaErrorMsg(cudaError status, const char *msg) { 43 | if (status != cudaSuccess) { 44 | std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl; 45 | exit(EXIT_FAILURE); 46 | } 47 | } 48 | 49 | int main() { 50 | using namespace sycl; 51 | 52 | constexpr size_t WIDTH = 1024; 53 | constexpr size_t HEIGHT = 1024; 54 | constexpr float ALPHA = 1.0f; 55 | constexpr float BETA = 0.0f; 56 | 57 | 
std::vector h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT), 58 | h_C(WIDTH * HEIGHT); 59 | 60 | std::cout << "Size: " << h_C.size() << std::endl; 61 | 62 | // A is an identity matrix 63 | std::fill(std::begin(h_A), std::end(h_A), 0.0f); 64 | for (size_t i = 0; i < WIDTH; i++) { 65 | h_A[i * WIDTH + i] = 1.0f; 66 | } 67 | 68 | // B is a matrix fill with 1 69 | std::fill(std::begin(h_B), std::end(h_B), 1.0f); 70 | 71 | sycl::queue q{[](auto &d) { 72 | return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda); 73 | }}; 74 | 75 | // Allocate memory on the device 76 | float *d_A = sycl::malloc_device(WIDTH * HEIGHT, q); 77 | float *d_B = sycl::malloc_device(WIDTH * HEIGHT, q); 78 | float *d_C = sycl::malloc_device(WIDTH * HEIGHT, q); 79 | 80 | // Copy matrices A & B to device from host vectors 81 | const size_t numBytes = WIDTH * HEIGHT * sizeof(float); 82 | q.memcpy(d_A, h_A.data(), numBytes).wait(); 83 | q.memcpy(d_B, h_B.data(), numBytes).wait(); 84 | 85 | // Create cublas handle 86 | cublasHandle_t handle; 87 | CHECK_ERROR(cublasCreate(&handle)); 88 | 89 | q.submit([&](handler &h) { 90 | h.host_task([=](sycl::interop_handle ih) { 91 | // Set the correct cuda context & stream 92 | cuCtxSetCurrent(ih.get_native_context()); 93 | auto cuStream = ih.get_native_queue(); 94 | cublasSetStream(handle, cuStream); 95 | 96 | // Call generalised matrix-matrix multiply 97 | CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT, 98 | WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, 99 | d_C, WIDTH)); 100 | cuStreamSynchronize(cuStream); 101 | }); 102 | }).wait(); 103 | 104 | // Copy the result back to host 105 | q.memcpy(h_C.data(), d_C, numBytes).wait(); 106 | 107 | // C must be all ones 108 | int i = 0; 109 | const bool allEqual = 110 | std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) { 111 | ++i; 112 | if (num != 1) { 113 | std::cout << i << " Not one : " << num << std::endl; 114 | } 115 | return num == 1; 116 | }); 117 | 118 | if (!allEqual) 
{ 119 | std::cout << " Incorrect result " << std::endl; 120 | } else { 121 | std::cout << " Correct! " << std::endl; 122 | } 123 | 124 | CHECK_ERROR(cublasDestroy(handle)); 125 | 126 | return allEqual ? EXIT_SUCCESS : EXIT_FAILURE; 127 | } 128 | -------------------------------------------------------------------------------- /examples/sgemm_interop/sycl_sgemm.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
// NOTE(review): recovered from an HTML scrape — the #include targets and most
// template argument lists (buffer element type, get_access<mode>(h), the
// backend arguments to get_native_context / get_native_queue / get_native_mem,
// and the reinterpret_cast target type) were stripped by the extraction;
// restore them from the repository before compiling.
#include 
#include 
#include 

#include 

#include 
#include 

// Wraps a cuBLAS/CUDA call and fails fast with the stringified call site.
#define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)

// Abort with a diagnostic when a cuBLAS call fails.
void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
  if (status != CUBLAS_STATUS_SUCCESS) {
    std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

// Abort with a diagnostic when a CUDA runtime call fails.
void inline checkCudaErrorMsg(cudaError status, const char *msg) {
  if (status != cudaSuccess) {
    std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
    exit(EXIT_FAILURE);
  }
}

/// SGEMM through cuBLAS using SYCL buffers + accessors:
/// C = ALPHA * A * B + BETA * C, with A the identity and B all ones, so the
/// result must be all ones. The buffers' destructors (end of the inner scope)
/// write the result back into h_C before verification.
int main() {
  using namespace sycl;

  constexpr size_t WIDTH = 1024;
  constexpr size_t HEIGHT = 1024;
  constexpr float ALPHA = 1.0f;
  constexpr float BETA = 0.0f;

  std::vector h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
      h_C(WIDTH * HEIGHT);

  std::cout << "Size: " << h_C.size() << std::endl;

  // A is an identity matrix
  std::fill(std::begin(h_A), std::end(h_A), 0.0f);
  for (size_t i = 0; i < WIDTH; i++) {
    h_A[i * WIDTH + i] = 1.0f;
  }

  // B is a matrix fill with 1
  std::fill(std::begin(h_B), std::end(h_B), 1.0f);

  // Select any device exposed by the CUDA backend.
  sycl::queue q{[](auto &d) {
    return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
  }};

  cublasHandle_t handle;
  CHECK_ERROR(cublasCreate(&handle));

  // Scope the buffers: leaving this block synchronises and copies back to the
  // host vectors.
  {
    buffer b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
    buffer b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
    buffer b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};

    q.submit([&](handler &h) {
      auto d_A = b_A.get_access(h);
      auto d_B = b_B.get_access(h);
      auto d_C = b_C.get_access(h);

      h.host_task([=](sycl::interop_handle ih) {
        // Set the correct cuda context & stream
        cuCtxSetCurrent(ih.get_native_context());
        auto cuStream = ih.get_native_queue();
        cublasSetStream(handle, cuStream);
        // Translate each accessor into the raw CUDA device pointer cuBLAS needs.
        auto cuA = reinterpret_cast(
            ih.get_native_mem(d_A));
        auto cuB = reinterpret_cast(
            ih.get_native_mem(d_B));
        auto cuC = reinterpret_cast(
            ih.get_native_mem(d_C));

        CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                                WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
                                cuC, WIDTH));
        // Block on the stream: cuBLAS launches asynchronously.
        cuStreamSynchronize(cuStream);
      });
    });
  }

  // C must be all ones
  int i = 0;
  const bool allEqual =
      std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
        ++i;
        if (num != 1) {
          std::cout << i << " Not one : " << num << std::endl;
        }
        return num == 1;
      });

  if (!allEqual) {
    std::cout << " Incorrect result " << std::endl;
  } else {
    std::cout << " Correct! " << std::endl;
  }

  CHECK_ERROR(cublasDestroy(handle));

  return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
}
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

// Macro for checking errors in GPU API calls
#define gpuErrorCheck(call)                                                                  \
do{                                                                                          \
    cudaError_t gpuErr = call;                                                               \
    if(cudaSuccess != gpuErr){                                                               \
        printf("GPU Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(gpuErr)); \
        exit(1);                                                                             \
    }                                                                                        \
}while(0)

// Size of array
#define N 1048576

// Kernel: element-wise c[id] = a[id] + b[id] over N doubles;
// the bounds check guards the last, partially-filled block.
__global__ void vector_addition(double *a, double *b, double *c)
{
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if(id < N) c[id] = a[id] + b[id];
}

// Main program.
// Restored vs. the scraped original: the host-array fill loop, the
// host-to-device copies, the launch configuration, the <<<...>>> kernel
// launch, and the verification comparison were eaten by the HTML extraction
// (everything between '<' and '>' was treated as markup).
int main()
{
    // Number of bytes to allocate for N doubles
    size_t bytes = N*sizeof(double);

    // Allocate memory for arrays A, B, and C on host
    double *A = (double*)malloc(bytes);
    double *B = (double*)malloc(bytes);
    double *C = (double*)malloc(bytes);

    // Allocate memory for arrays d_A, d_B, and d_C on device
    double *d_A, *d_B, *d_C;
    gpuErrorCheck( cudaMalloc(&d_A, bytes) );
    gpuErrorCheck( cudaMalloc(&d_B, bytes) );
    gpuErrorCheck( cudaMalloc(&d_C, bytes) );

    // Fill host arrays A, B, and C
    for(int i=0; i<N; i++)
    {
        A[i] = 1.0;
        B[i] = 2.0;
        C[i] = 0.0;
    }

    // Copy data from host arrays A and B to device arrays d_A and d_B
    gpuErrorCheck( cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice) );
    gpuErrorCheck( cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice) );

    // Set execution configuration parameters
    //     thr_per_blk: number of CUDA threads per grid block
    //     blk_in_grid: number of blocks in grid (rounded up to cover N)
    int thr_per_blk = 256;
    int blk_in_grid = ceil( float(N) / thr_per_blk );

    // Launch kernel
    vector_addition<<< blk_in_grid, thr_per_blk >>>(d_A, d_B, d_C);

    // Check for synchronous errors during kernel launch (e.g. invalid execution configuration paramters)
    gpuErrorCheck( cudaGetLastError() );

    // Check for asynchronous errors during GPU execution (after control is returned to CPU)
    gpuErrorCheck( cudaDeviceSynchronize() );

    // Copy data from device array d_C to host array C
    gpuErrorCheck( cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost) );

    // Verify results: every element must equal 1.0 + 2.0 = 3.0
    double tolerance = 1.0e-14;
    for(int i=0; i<N; i++)
    {
        if( fabs(C[i] - 3.0) > tolerance )
        {
            printf("Error: value of C[%d] = %f instead of 3.0\n", i, C[i]);
            exit(1);
        }
    }

    // Free CPU memory
    free(A);
    free(B);
    free(C);

    // Free GPU memory
    gpuErrorCheck( cudaFree(d_A) );
    gpuErrorCheck( cudaFree(d_B) );
    gpuErrorCheck( cudaFree(d_C) );

    printf("\n---------------------------\n");
    printf("__SUCCESS__\n");
    printf("---------------------------\n");
    printf("N = %d\n", N);
    printf("Threads Per Block = %d\n", thr_per_blk);
    printf("Blocks In Grid = %d\n", blk_in_grid);
    printf("---------------------------\n\n");

    return 0;
}
/**
 * SYCL USM Deleter. The std::unique_ptr deleter takes only the pointer
 * to delete as an argument, so the queue required by sycl::free must be
 * captured here — that's the only work-around.
 * NOTE(review): template parameter lists in this file were stripped by the
 * HTML extraction (each bare `template` below originally carried e.g.
 * <typename T> or <typename T, sycl::usm::alloc location>); restore from
 * the repository before compiling.
 */
template
struct usm_deleter {
    sycl::queue q_;

    explicit usm_deleter(sycl::queue q) : q_(std::move(q)) {}

    // No-op on null so a moved-from smart pointer destructs safely.
    void operator()(T *ptr) const noexcept {
        if (ptr)
            sycl::free(ptr, q_);
    }
};

/**
 * Wrapper for a std::unique_ptr that calls the SYCL deleter (sycl::free).
 * Also holds the number of elements allocated.
 */
template
class usm_unique_ptr : public std::unique_ptr> {
private:
    size_t count_;
public:
    usm_unique_ptr(T *ptr, usm_deleter deleter, size_t count)
            : std::unique_ptr>(ptr, deleter) { count_ = count; }

    // Size of the allocation in bytes.
    [[nodiscard]] inline size_t size() const noexcept { return count_ * sizeof(T); }

    // Number of elements allocated.
    [[nodiscard]] inline size_t count() const noexcept { return count_; }

    // Decorated pointer that records the USM allocation kind at compile time.
    [[nodiscard]] inline usm_ptr get() const noexcept {
        return usm_ptr(std::unique_ptr>::get());
    }

};

/**
 * Builds a usm_unique_ptr pointer
 * @tparam location indicates where is the memory allocated (device, host, or shared)
 */
template
usm_unique_ptr make_unique_ptr(size_t count, sycl::queue &q) {
    //return usm_unique_ptr(sycl::usm_allocator < T, location > {q}.allocate(count), usm_deleter{q}, count);
    if constexpr(location == alloc::shared)
        return usm_unique_ptr(sycl::malloc_shared(count, q), usm_deleter{q}, count);
    else if constexpr(location == alloc::host)
        return usm_unique_ptr(sycl::malloc_host(count, q), usm_deleter{q}, count);
    else if constexpr(location == alloc::device)
        return usm_unique_ptr(sycl::malloc_device(count, q), usm_deleter{q}, count);
    else static_assert(!std::is_same_v, "Invalid template parameter.");
}


// Single-element convenience overload.
template
usm_unique_ptr make_unique_ptr(sycl::queue &q) {
    return make_unique_ptr(1, q);
}


/**
 * Same interface as usm_unique_ptr
 * @tparam T
 */
template
class usm_shared_ptr : public std::shared_ptr {
private:
    size_t count_;

public:
    usm_shared_ptr(T *ptr, usm_deleter deleter, size_t count) : std::shared_ptr(ptr,
                                                                                deleter) { count_ = count; }

    // Size of the allocation in bytes.
    [[nodiscard]] inline size_t size() const noexcept { return count_ * sizeof(T); }

    // Number of elements allocated.
    [[nodiscard]] inline size_t count() const noexcept { return count_; }

    // Decorated pointer that records the USM allocation kind at compile time.
    [[nodiscard]] inline usm_ptr get() const noexcept {
        return usm_ptr(std::shared_ptr::get());
    }

};

// Allocates `count` elements in the requested USM region and wraps them in a
// shared_ptr that frees through the owning queue.
template
usm_shared_ptr make_shared_ptr(size_t count, sycl::queue &q) {
    //return usm_shared_ptr(sycl::usm_allocator < T, location > {q}.allocate(count), usm_deleter{q}, count);
    if constexpr(location == alloc::shared)
        return usm_shared_ptr(sycl::malloc_shared(count, q), usm_deleter{q}, count);
    else if constexpr(location == alloc::host)
        return usm_shared_ptr(sycl::malloc_host(count, q), usm_deleter{q}, count);
    else if constexpr(location == alloc::device)
        return usm_shared_ptr(sycl::malloc_device(count, q), usm_deleter{q}, count);
    else static_assert(!std::is_same_v, "Invalid template parameter.");
}

// Single-element convenience overload.
template
usm_shared_ptr make_sycl_shared(sycl::queue &q) {
    return make_shared_ptr(1, q);
}
} // namespace usm_smart_ptr
# Decide whether to run test suites: set DPCPP_TESTS in the environment to enable.
run_test=false
cmake_test="OFF"

# Fix: use POSIX `[ ]` — the script's shebang is /bin/sh, where the bash-only
# `[[ ]]` construct is not guaranteed to exist.
if [ -z "$DPCPP_TESTS" ]; then
  echo "Not testing"
else
  echo "testing"
  run_test=true
  cmake_test="ON"
fi

export CXXFLAGS="${CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=1"

# OpenCL headers+ICD
cd $DPCPP_HOME
(if cd OpenCL-Headers; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-Headers.git; fi)
(if cd OpenCL-ICD-Loader; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; fi)
cd OpenCL-ICD-Loader
mkdir -p build
cd build
cmake \
  -DOPENCL_ICD_LOADER_HEADERS_DIR=$DPCPP_HOME/OpenCL-Headers \
  -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/OpenCL-ICD-Loader/install \
  ..
make install -j $(nproc)

#sycl compiler with cuda
cd $DPCPP_HOME
(if cd llvm; then git pull; else git clone https://github.com/intel/llvm.git -b sycl; fi)
cd llvm
python3 ./buildbot/configure.py \
  --cuda \
  -t release \
  --cmake-opt="-DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy" \
  --cmake-opt="-DCUDA_SDK_ROOT_DIR=$CUDA_ROOT" \
  --cmake-opt="-DLLVM_BINUTILS_INCDIR=/usr/local/include" \
  --cmake-opt="-DLLVM_ENABLE_PROJECTS=clang;sycl;llvm-spirv;opencl;libdevice;xpti;xptifw;libclc;openmp" \
  --cmake-opt="-DLLVM_BUILD_TESTS=$cmake_test" \
  --cmake-opt="-DCMAKE_CXX_STANDARD=17" \
  --cmake-opt="-DLLVM_ENABLE_LTO=off" \
  --cmake-opt="-DLLVM_ENABLE_LLD=ON" \
  --cmake-opt="-Wno-dev"
cd build
ninja install -j $(nproc)
if $run_test; then
  echo "testing llvm"
  ninja check -j $(nproc)
fi

#Lapack Reference
cd $DPCPP_HOME
(if cd lapack; then git pull; else git clone https://github.com/Reference-LAPACK/lapack.git; fi)
cd lapack/
mkdir -p build
cd build/
cmake \
  -DBUILD_SHARED_LIBS=ON \
  -DCBLAS=ON \
  -DLAPACKE=ON \
  -DBUILD_TESTING=$cmake_test \
  -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/lapack/install \
  ..
cmake --build . -j $(nproc) --target install
if $run_test; then
  cmake --build . -j $(nproc) --target test
fi

#oneTBB
cd $DPCPP_HOME
(if cd oneTBB; then git pull; else git clone https://github.com/oneapi-src/oneTBB.git; fi)
cd oneTBB
mkdir -p build
cd build
cmake -GNinja \
  -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
  -DCMAKE_BUILD_TYPE=Release \
  -DTBB_STRICT=OFF \
  -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
  -DTBB_TEST=$cmake_test \
  ..
ninja install -j $(nproc)
if $run_test; then
  ninja test -j $(nproc)
fi

#oneMKL
cd $DPCPP_HOME
(if cd oneMKL; then git pull; else git clone https://github.com/oneapi-src/oneMKL.git; fi)
cd oneMKL
mkdir -p build
cd build
cmake -GNinja \
  -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_CXX_STANDARD=17 \
  -DTARGET_DOMAINS=blas \
  -DENABLE_MKLGPU_BACKEND=OFF \
  -DENABLE_CURAND_BACKEND=OFF \
  -DENABLE_MKLCPU_BACKEND=OFF \
  -DENABLE_CUBLAS_BACKEND=ON \
  -DENABLE_NETLIB_BACKEND=ON \
  -DREF_BLAS_ROOT=$DPCPP_HOME/lapack/install \
  -DNETLIB_ROOT=$DPCPP_HOME/lapack/install \
  -DOPENCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
  -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT \
  -DSYCL_LIBRARY=$DPCPP_HOME/deploy/lib/libsycl.so \
  -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
  -DBUILD_FUNCTIONAL_TESTS=$cmake_test \
  ..
ninja install -j $(nproc)
if $run_test; then
  ninja test -j $(nproc)
fi

#oneDNN
cd $DPCPP_HOME
(if cd oneDNN; then git pull; else git clone https://github.com/oneapi-src/oneDNN.git; fi)
cd oneDNN
mkdir -p build
cd build
cmake -GNinja \
  -DCMAKE_C_COMPILER=$DPCPP_HOME/deploy/bin/clang \
  -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
  -DCMAKE_BUILD_TYPE=Release \
  -DCMAKE_CXX_STANDARD=17 \
  -DDNNL_INSTALL_MODE=BUNDLE \
  -DDNNL_CPU_RUNTIME=DPCPP \
  -DDNNL_GPU_RUNTIME=DPCPP \
  -DDNNL_GPU_VENDOR=NVIDIA \
  -DTBBROOT=$DPCPP_HOME/deploy \
  -DCUDA_SDK_ROOT_DIR=$CUDA_ROOT \
  -DOPENCLROOT=$DPCPP_HOME/OpenCL-ICD-Loader/install \
  -DOpenCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
  -DCUBLAS_LIBRARY=$CUDA_ROOT/lib64/libcublas.so \
  -DCUBLAS_INCLUDE_DIR=$CUDA_ROOT/include \
  -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy \
  -DDNNL_BUILD_TESTS=$cmake_test \
  ..
ninja install -j $(nproc)
if $run_test; then
  LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib ninja test -j $(nproc)
fi
#pragma once

// NOTE(review): the #include targets and template parameter lists in this
// header were stripped by the HTML extraction (each bare `template` below
// originally carried e.g. <typename T> or <typename T, alloc location>, and
// sycl::malloc(count, q, location) was presumably sycl::malloc<T>(...));
// restore them from the repository before compiling.
#include 
#include "../internal/config.hpp"
#include 
#include 
#include 

#ifdef USING_COMPUTECPP
// ComputeCpp keeps the USM alloc kinds in an experimental namespace; alias
// them so the rest of this header can use sycl::usm::alloc uniformly.
namespace cl::sycl::usm {
using cl::sycl::experimental::usm::alloc;
}
#endif


#include "intrinsics.hpp"

namespace usm_smart_ptr {
using namespace sycl::usm;

// Raw pointer decorated (at compile time) with the USM region it points into.
template
struct usm_ptr {
private:
    T *val_;
public:
    explicit usm_ptr(T *t) : val_(t) {}

    operator T *() const noexcept { return val_; }
};


// Pointer statically known to be dereferenceable from the device
// (constructible from device- or shared-allocated usm_ptr).
template
struct device_accessible_ptr {
private:
    const T *val_;
public:
    explicit device_accessible_ptr(T *p) : val_(p) {};

    explicit device_accessible_ptr(const T *p) : val_(p) {};

    device_accessible_ptr(usm_ptr p) : val_((T *) p) {};

    device_accessible_ptr(usm_ptr p) : val_((T *) p) {};

    operator T *() const noexcept { return (T *) val_; }


};

// Pointer statically known to be dereferenceable from the host
// (constructible from host- or shared-allocated usm_ptr).
template
struct host_accessible_ptr {
private:
    T *val_;
public:
    host_accessible_ptr(usm_ptr p) : val_((T *) p) {};

    host_accessible_ptr(usm_ptr p) : val_((T *) p) {};

    operator T *() const noexcept { return val_; }


};


/**
 * SYCL USM Deleter. The std::unique_ptr deleter takes only the pointer
 * to delete as an argument, so the queue required by sycl::free must be
 * captured here — that's the only work-around.
 */
template
struct usm_deleter {
    sycl::queue q_;

    explicit usm_deleter(const sycl::queue &q) : q_(q) {}

    // No-op on null so a moved-from smart pointer destructs safely.
    void operator()(T *ptr) const noexcept {
        if (ptr)
            sycl::free(ptr, q_);
    }
};

/**
 * Wrapper for a std::unique_ptr that calls the SYCL deleter (sycl::free).
 * Also holds the number of elements allocated.
 */
template
class usm_unique_ptr : public std::unique_ptr> {
private:
    size_t count_;
public:
    // Allocates `count` elements via sycl::malloc in the region selected by
    // the (stripped) `location` template parameter.
    usm_unique_ptr(size_t count, sycl::queue q)
            : std::unique_ptr>(sycl::malloc(count, q, location), usm_deleter{q}) { count_ = count; }

    explicit usm_unique_ptr(sycl::queue q) :
            usm_unique_ptr(1, q) { count_ = 1; }


    // Size of the allocation in bytes.
    [[nodiscard]] inline size_t alloc_size() const noexcept { return count_ * sizeof(T); }

    // Number of elements allocated.
    [[nodiscard]] inline size_t alloc_count() const noexcept { return count_; }

    // Decorated pointer that remembers the USM allocation kind.
    [[nodiscard]] inline usm_ptr get() const noexcept {
        return usm_ptr(std::unique_ptr>::get());
    }

    // Undecorated raw pointer, for APIs that cannot take usm_ptr.
    [[nodiscard]] inline T *raw() const noexcept {
        return std::unique_ptr>::get();
    }

};


/**
 * Same interface as usm_unique_ptr
 * @tparam T
 */
template
class usm_shared_ptr : public std::shared_ptr {
private:
    size_t count_;

public:
    usm_shared_ptr(size_t count, sycl::queue q) : std::shared_ptr(sycl::malloc(count, q, location), usm_deleter{q}) { count_ = count; }

    explicit usm_shared_ptr(sycl::queue q) :
            usm_shared_ptr(1, q) { count_ = 1; }

    // Size of the allocation in bytes.
    [[nodiscard]] inline size_t alloc_size() const noexcept { return count_ * sizeof(T); }

    // Number of elements allocated.
    [[nodiscard]] inline size_t alloc_count() const noexcept { return count_; }

    // Decorated pointer that remembers the USM allocation kind.
    [[nodiscard]] inline usm_ptr get() const noexcept {
        return usm_ptr(std::shared_ptr::get());
    }

    // Undecorated raw pointer, for APIs that cannot take usm_ptr.
    [[nodiscard]] inline T *raw() const noexcept {
        return std::shared_ptr::get();
    }

};

} // namespace usm_smart_ptr
37 | make install -j $(nproc) 38 | 39 | #sycl compiler with cuda 40 | source /opt/intel/opencl/env/compiler_rt_vars.sh 41 | cd $DPCPP_HOME 42 | (if cd llvm; then git pull; else git clone https://github.com/intel/llvm.git -b sycl; fi) 43 | cd llvm 44 | python3 ./buildbot/configure.py \ 45 | --cuda \ 46 | -t release \ 47 | --cmake-opt="-DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy" \ 48 | --cmake-opt="-DCUDA_SDK_ROOT_DIR=$CUDA_ROOT" \ 49 | --cmake-opt="-DLLVM_BINUTILS_INCDIR=/usr/local/include" \ 50 | --cmake-opt="-DLLVM_ENABLE_PROJECTS=clang;sycl;llvm-spirv;libunwind;opencl;libdevice;xpti;xptifw;libclc;lld;lldb;libc;libcxx;libcxxabi;openmp;clang-tools-extra;compiler-rt" \ 51 | --cmake-opt="-DLLVM_BUILD_TESTS=$cmake_test" \ 52 | --cmake-opt="-DCMAKE_CXX_STANDARD=17" \ 53 | --cmake-opt="-DLLVM_ENABLE_LTO=off" \ 54 | --cmake-opt="-DLLVM_ENABLE_LLD=ON" \ 55 | --cmake-opt="-DSYCL_ENABLE_WERROR=OFF" \ 56 | --cmake-opt="-Wno-dev" 57 | cd build 58 | ninja install -j $(nproc) 59 | if $run_test; then 60 | echo "testing llvm" 61 | ninja check -j $(nproc) 62 | fi 63 | 64 | #Lapack Reference 65 | cd $DPCPP_HOME 66 | (if cd lapack; then git pull; else git clone https://github.com/Reference-LAPACK/lapack.git; fi) 67 | cd lapack/ 68 | mkdir -p build 69 | cd build/ 70 | cmake \ 71 | -DBUILD_SHARED_LIBS=ON \ 72 | -DCBLAS=ON \ 73 | -DLAPACKE=ON \ 74 | -DBUILD_TESTING=$cmake_test \ 75 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/lapack/install \ 76 | .. 77 | cmake --build . -j $(nproc) --target install 78 | if $run_test; then 79 | cmake --build . -j $(nproc) --target test 80 | fi 81 | 82 | #oneTBB 83 | cd $DPCPP_HOME 84 | (if cd oneTBB; then git pull; else git clone https://github.com/oneapi-src/oneTBB.git; fi) 85 | cd oneTBB 86 | mkdir -p build 87 | cd build 88 | cmake -GNinja \ 89 | -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \ 90 | -DCMAKE_BUILD_TYPE=Release \ 91 | -DTBB_STRICT=OFF \ 92 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \ 93 | -DTBB_TEST=$cmake_test \ 94 | .. 
95 | ninja install -j $(nproc) 96 | if $run_test; then 97 | ninja test -j $(nproc) 98 | fi 99 | 100 | #oneMKL 101 | cd $DPCPP_HOME 102 | (if cd oneMKL; then git pull; else git clone https://github.com/oneapi-src/oneMKL.git; fi) 103 | cd oneMKL 104 | mkdir -p build 105 | cd build 106 | cmake -GNinja \ 107 | -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \ 108 | -DCMAKE_BUILD_TYPE=Release \ 109 | -DCMAKE_CXX_STANDARD=17 \ 110 | -DTARGET_DOMAINS=blas \ 111 | -DENABLE_MKLGPU_BACKEND=OFF \ 112 | -DENABLE_CURAND_BACKEND=OFF \ 113 | -DENABLE_MKLCPU_BACKEND=OFF \ 114 | -DENABLE_CUBLAS_BACKEND=ON \ 115 | -DENABLE_NETLIB_BACKEND=ON \ 116 | -DREF_BLAS_ROOT=$DPCPP_HOME/lapack/install \ 117 | -DNETLIB_ROOT=$DPCPP_HOME/lapack/install \ 118 | -DOPENCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \ 119 | -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT \ 120 | -DSYCL_LIBRARY=$DPCPP_HOME/deploy/lib/libsycl.so \ 121 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \ 122 | -DBUILD_FUNCTIONAL_TESTS=$cmake_test \ 123 | .. 124 | ninja install -j $(nproc) 125 | if $run_test; then 126 | ninja test -j $(nproc) 127 | fi 128 | 129 | #oneDNN 130 | cd $DPCPP_HOME 131 | (if cd oneDNN; then git pull; else git clone https://github.com/oneapi-src/oneDNN.git; fi) 132 | cd oneDNN 133 | mkdir -p build 134 | cd build 135 | cmake -GNinja \ 136 | -DCMAKE_C_COMPILER=$DPCPP_HOME/deploy/bin/clang \ 137 | -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \ 138 | -DCMAKE_BUILD_TYPE=Release \ 139 | -DCMAKE_CXX_STANDARD=17 \ 140 | -DDNNL_INSTALL_MODE=BUNDLE \ 141 | -DDNNL_CPU_RUNTIME=DPCPP \ 142 | -DDNNL_GPU_RUNTIME=DPCPP \ 143 | -DDNNL_GPU_VENDOR=NVIDIA \ 144 | -DNNL_BUILD_EXAMPLES=OFF \ 145 | -DTBBROOT=$DPCPP_HOME/deploy \ 146 | -DCUDA_SDK_ROOT_DIR=$CUDA_ROOT \ 147 | -DOPENCLROOT=$DPCPP_HOME/OpenCL-ICD-Loader/install \ 148 | -DOpenCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \ 149 | -DCUBLAS_LIBRARY=$CUDA_ROOT/lib64/libcublas.so \ 150 | -DCUBLAS_INCLUDE_DIR=$CUDA_ROOT/include \ 151 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy \ 
152 | -DDNNL_BUILD_TESTS=$cmake_test \ 153 | .. 154 | ninja install -j $(nproc) 155 | if $run_test; then 156 | LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib ninja test -j $(nproc) 157 | fi 158 | -------------------------------------------------------------------------------- /examples/hashing/src/benchmarks/misc.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * misc.hpp 20 | * 21 | * Description: 22 | * Miscellaneous 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | 32 | using namespace usm_smart_ptr; 33 | 34 | 35 | template 36 | void compare_two_devices(sycl::queue q1, sycl::queue q2, size_t input_block_size, size_t n_blocs) { 37 | std::cout << "Comparing " << hash::get_name() << " on: " << q1.get_device().get_info() << " and: " << q2.get_device().get_info() 38 | << " ..."; 39 | size_t out_block_size = hash::get_block_size(); 40 | auto input_data1 = usm_unique_ptr(input_block_size * n_blocs, q1); 41 | auto output_hashes1 = usm_unique_ptr(out_block_size * n_blocs, q1); 42 | auto input_data2 = usm_unique_ptr(input_block_size * n_blocs, q2); 43 | auto output_hashes2 = usm_unique_ptr(out_block_size * n_blocs, q2); 44 | 45 | fill_rand(input_data1.get(), input_data1.alloc_count()); 46 | memcpy(input_data2.raw(), input_data1.raw(), input_data1.alloc_size()); 47 | 48 | 49 | if constexpr (M == hash::method::blake2b) { 50 | byte key[64]; 51 | std::memset(key, 1, 64); 52 | hash::compute(q1, input_data1.get(), input_block_size, output_hashes1.get(), n_blocs, key, 64); 53 | hash::compute(q2, input_data2.get(), input_block_size, output_hashes2.get(), n_blocs, key, 64); 54 | } else { 55 | hash::compute(q1, input_data1.get(), input_block_size, output_hashes1.get(), n_blocs); 56 | hash::compute(q2, input_data2.get(), input_block_size, output_hashes2.get(), n_blocs); 57 | } 58 | 59 | auto[idx1, idx2]= std::mismatch(output_hashes1.raw(), output_hashes1.raw() + output_hashes1.alloc_count(), output_hashes2.raw()); 60 | if (idx1 != output_hashes1.raw() + output_hashes1.alloc_count()) { 61 | std::cout << "mismatch" << std::endl; 62 | } else { 63 | std::cout << "pass" << std::endl; 64 | } 65 | } 66 | 67 | 68 | template 69 | double benchmark_one_queue(sycl::queue q, 
size_t input_block_size, size_t n_blocs, size_t n_iters = 1) { 70 | auto all_input_data = usm_unique_ptr(input_block_size * n_blocs, q); 71 | auto all_output_hashes = usm_unique_ptr(hash::get_block_size() * n_blocs, q); 72 | if constexpr (M == hash::method::blake2b) { 73 | byte key[64]; 74 | std::memset(key, 1, 64); 75 | hash::compute(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs, key, 64);/* Preheat */ 76 | auto before = std::chrono::steady_clock::now(); 77 | for (size_t i = 0; i < n_iters; ++i) { 78 | #ifdef VERBOSE_HASH_LIB 79 | std::cerr << i << " "; 80 | #endif 81 | hash::compute(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs, key, 64); 82 | } 83 | auto after = std::chrono::steady_clock::now(); 84 | auto time = std::chrono::duration(after - before).count(); 85 | return (double) n_iters / time * (double) (input_block_size * n_blocs) / 1e6; 86 | } else { 87 | hash::compute(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs);/* Preheat */ 88 | auto before = std::chrono::steady_clock::now(); 89 | for (size_t i = 0; i < n_iters; ++i) { 90 | #ifdef VERBOSE_HASH_LIB 91 | std::cerr << i << " "; 92 | #endif 93 | hash::compute(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs); 94 | } 95 | auto after = std::chrono::steady_clock::now(); 96 | auto time = std::chrono::duration(after - before).count(); 97 | return (double) n_iters / time * (double) (input_block_size * n_blocs) / 1e6; 98 | } 99 | } 100 | 101 | 102 | template 103 | void run_benchmark(sycl::queue q, size_t input_block_size, size_t n_blocs, size_t n_iters) { 104 | std::cout << "Running " << hash::get_name() << " on:" << q.get_device().get_info() << ": "; 105 | auto gflops = benchmark_one_queue(q, input_block_size, n_blocs, n_iters); 106 | std::cout << "\nGB hashed per sec: " << gflops << "\n\n"; 107 | } 108 | -------------------------------------------------------------------------------- 
/setup-script/build_with_libcxx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export DPCPP_HOME=~/sycl_workspace 3 | export CUDA_ROOT=/usr/local/cuda-10.2 4 | export LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib/:$DPCPP_HOME/deploy/lib64/:$DPCPP_HOME/lapack/install/lib64/:$DPCPP_HOME/OpenCL-ICD-Loader/install/lib64:$CUDA_ROOT/lib:$CUDA_ROOT/lib64:$LD_LIBRARY_PATH 5 | export PATH=$DPCPP_HOME/deploy/bin/:$CUDA_ROOT/bin:$PATH 6 | 7 | mkdir -p $DPCPP_HOME 8 | cd $DPCPP_HOME 9 | mkdir -p deploy 10 | 11 | export CXXFLAGS="${CXXFLAGS} -stdlib=libc++" 12 | #export LD_PRELOAD=/opt/intel/opencl/libOpenCL.so.1 13 | 14 | run_test=false 15 | cmake_test="OFF" 16 | 17 | if [[ -z "$DPCPP_TESTS" ]]; then 18 | echo "Not testing" 19 | else 20 | echo "testing" 21 | run_test=true 22 | cmake_test="ON" 23 | fi 24 | 25 | # OpenCL headers+ICD 26 | cd $DPCPP_HOME 27 | (if cd OpenCL-Headers; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-Headers.git; fi) 28 | (if cd OpenCL-ICD-Loader; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; fi) 29 | cd OpenCL-ICD-Loader 30 | mkdir -p build 31 | cd build 32 | cmake \ 33 | -DOPENCL_ICD_LOADER_HEADERS_DIR=$DPCPP_HOME/OpenCL-Headers \ 34 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/OpenCL-ICD-Loader/install \ 35 | .. 
36 | make install -j $(nproc) 37 | 38 | #sycl compiler with cuda 39 | source /opt/intel/opencl/env/compiler_rt_vars.sh 40 | cd $DPCPP_HOME 41 | (if cd llvm; then git pull; else git clone https://github.com/intel/llvm.git -b sycl; fi) 42 | cd llvm 43 | python3 ./buildbot/configure.py \ 44 | --cuda \ 45 | -t release \ 46 | --cmake-opt="-DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy" \ 47 | --cmake-opt="-DCUDA_SDK_ROOT_DIR=$CUDA_ROOT" \ 48 | --cmake-opt="-DLLVM_ENABLE_PROJECTS=libcxxabi;libunwind;libcxx;clang;sycl;llvm-spirv;opencl;opencl-aot;libdevice;xpti;xptifw;libclc;openmp;clang-tools-extra;compiler-rt" \ 49 | --cmake-opt="-DLLVM_BUILD_TESTS=$cmake_test" \ 50 | --cmake-opt="-DLIBC_COMPILE_OPTIONS_DEFAULT=-march=native" \ 51 | --cmake-opt="-DLLVM_LIBC_FULL_BUILD=ON" \ 52 | --cmake-opt="-DLIBCXXABI_USE_LLVM_UNWINDER=ON" \ 53 | --cmake-opt="-DLIBCXX_USE_COMPILER_RT=ON" \ 54 | --cmake-opt="-DSYCL_ENABLE_WERROR=OFF" \ 55 | --cmake-opt="-DCLANG_DEFAULT_CXX_STDLIB=libc++" \ 56 | --cmake-opt="-DCMAKE_CXX_STANDARD=17" \ 57 | --cmake-opt="-Wno-dev" 58 | cd build 59 | ninja install -j $(nproc) 60 | if $run_test; then 61 | echo "testing llvm" 62 | ninja check -j $(nproc) 63 | fi 64 | 65 | #Lapack Reference 66 | cd $DPCPP_HOME 67 | (if cd lapack; then git pull; else git clone https://github.com/Reference-LAPACK/lapack.git; fi) 68 | cd lapack/ 69 | mkdir -p build 70 | cd build/ 71 | cmake \ 72 | -DBUILD_SHARED_LIBS=ON \ 73 | -DCBLAS=ON \ 74 | -DCMAKE_CXX_FLAGS="-stdlib=libc++" \ 75 | -DLAPACKE=ON \ 76 | -DBUILD_TESTING=$cmake_test \ 77 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/lapack/install \ 78 | .. 79 | cmake --build . -j $(nproc) --target install 80 | if $run_test; then 81 | cmake --build . 
-j $(nproc) --target test 82 | fi 83 | 84 | #oneTBB 85 | cd $DPCPP_HOME 86 | (if cd oneTBB; then git pull; else git clone https://github.com/oneapi-src/oneTBB.git; fi) 87 | cd oneTBB 88 | mkdir -p build 89 | cd build 90 | cmake -GNinja \ 91 | -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \ 92 | -DCMAKE_BUILD_TYPE=Release \ 93 | -DCMAKE_CXX_FLAGS="-stdlib=libc++" \ 94 | -DTBB_STRICT=OFF \ 95 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \ 96 | -DTBB_TEST=$cmake_test \ 97 | .. 98 | ninja install -j $(nproc) 99 | if $run_test; then 100 | ninja test -j $(nproc) 101 | fi 102 | 103 | #oneMKL 104 | cd $DPCPP_HOME 105 | (if cd oneMKL; then git pull; else git clone https://github.com/oneapi-src/oneMKL.git; fi) 106 | cd oneMKL 107 | mkdir -p build 108 | cd build 109 | cmake -GNinja \ 110 | -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \ 111 | -DCMAKE_BUILD_TYPE=Release \ 112 | -DCMAKE_CXX_STANDARD=17 \ 113 | -DCMAKE_CXX_FLAGS="-stdlib=libc++ -lc++ -lc++abi" \ 114 | -DTARGET_DOMAINS=blas \ 115 | -DENABLE_MKLGPU_BACKEND=OFF \ 116 | -DENABLE_CURAND_BACKEND=OFF \ 117 | -DENABLE_MKLCPU_BACKEND=OFF \ 118 | -DENABLE_CUBLAS_BACKEND=ON \ 119 | -DENABLE_NETLIB_BACKEND=ON \ 120 | -DREF_BLAS_ROOT=$DPCPP_HOME/lapack/install \ 121 | -DNETLIB_ROOT=$DPCPP_HOME/lapack/install \ 122 | -DOPENCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \ 123 | -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT \ 124 | -DSYCL_LIBRARY=$DPCPP_HOME/deploy/lib/libsycl.so \ 125 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \ 126 | -DBUILD_FUNCTIONAL_TESTS=$cmake_test \ 127 | .. 
128 | ninja install -j $(nproc) 129 | if $run_test; then 130 | ninja test -j $(nproc) 131 | fi 132 | 133 | #oneDNN 134 | cd $DPCPP_HOME 135 | (if cd oneDNN; then git pull; else git clone https://github.com/oneapi-src/oneDNN.git; fi) 136 | cd oneDNN 137 | mkdir -p build 138 | cd build 139 | cmake -GNinja \ 140 | -DCMAKE_C_COMPILER=$DPCPP_HOME/deploy/bin/clang \ 141 | -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \ 142 | -DCMAKE_BUILD_TYPE=Release \ 143 | -DCMAKE_CXX_STANDARD=17 \ 144 | -DCMAKE_CXX_FLAGS="-stdlib=libc++" \ 145 | -DDNNL_INSTALL_MODE=BUNDLE \ 146 | -DDNNL_CPU_RUNTIME=DPCPP \ 147 | -DDNNL_GPU_RUNTIME=DPCPP \ 148 | -DDNNL_GPU_VENDOR=NVIDIA \ 149 | -DTBBROOT=$DPCPP_HOME/deploy \ 150 | -DCUDA_SDK_ROOT_DIR=$CUDA_ROOT \ 151 | -DOPENCLROOT=$DPCPP_HOME/OpenCL-ICD-Loader/install \ 152 | -DOpenCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \ 153 | -DCUBLAS_LIBRARY=$CUDA_ROOT/lib64/libcublas.so \ 154 | -DCUBLAS_INCLUDE_DIR=$CUDA_ROOT/include \ 155 | -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy \ 156 | -DDNNL_BUILD_TESTS=$cmake_test \ 157 | .. 158 | ninja install -j $(nproc) 159 | if $run_test; then 160 | LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib ninja test -j $(nproc) 161 | fi 162 | -------------------------------------------------------------------------------- /examples/hashing/README.md: -------------------------------------------------------------------------------- 1 | # SYCL Hashing Algorithms 2 | 3 | This repository contains hashing algorithms implemented using [SYCL](https://www.khronos.org/sycl/) which is a heterogeneous programming model based on standard C++. 4 | 5 | The following hashing methods are currently available: 6 | 7 | - sha256 8 | - sha1 (unsecure) 9 | - md2 (unsecure) 10 | - md5 (unsecure) 11 | - keccak (128 224 256 288 384 512) 12 | - sha3 (224 256 384 512) 13 | - blake2b 14 | 15 | ## Benchmarks 16 | 17 | Some functions were ported from a CUDA implementation. The SYCL code was tested unchanged across the different implementations and hardware. 
Here's how they perform (the values are in GB/s): 18 | 19 | | Function | Native CUDA | SYCL on DPC++ CUDA (optimised) | SYCL on ComputeCPP CPU (spir64/spirv64) | SYCL on DPC++ CPU (spir64_x86_64) | SYCL on hipSYCL (omp/cuda) | 20 | | -------- | ----------- | ------------------------------------------- | --------------------------------------- | --------------------------------- | -------------------------- | 21 | | keccak | 15.7 | 23.0 | 4.14 / 3.89 | 4.98 | 4.32 / 23.2 | 22 | | md5 | 14.6 | 20.3 | 6.26 / 5.89 | 9.93 | 9.27 / 20.2 | 23 | | blake2b | 14.7 | 21.6 | 9.46 / 10.0 | 12.4 | 7.71 / 17.9 | 24 | | sha1 | 14.7 | 19.34 | 3.61 / 3.35 | 3.30 | 4.39 / 19.2 | 25 | | sha256 | 13.5 | 19.15 | 2.23 / 2.00 | 2.91 | 2.93 / 19.1 | 26 | | md2 | 4.18 | 4.23 | 0.22 / 0.25 | 0.176 | 0.25 / 2.33 | 27 | 28 | ### Note 29 | 30 | Something broke the spir64 backend of DPC++ and it produces now very slow code 31 | 32 | Benchmark configuration: 33 | 34 | - block_size: 512 kiB 35 | - n_blocks: 4\*1536 36 | - n_outbit: 128 37 | - GPU: GTX 1660 Ti 38 | - OS: rhel8.4 39 | - CPU: 2x E5-2670 v2 40 | 41 | ### Remark 42 | 43 | These are not the "best" settings as the optimum changes with the algorithm. The benchmarks measure the time to run 40 iterations, without copying the memory between the device and the host. In a real application, you 44 | could be memory bound. 45 | 46 | ## How to build 47 | 48 | ```bash 49 | git clone https://github.com/Michoumichmich/SYCL-Hashing-Algorithms.git ; cd SYCL-Hashing-Algorithms; 50 | mkdir build; cd build 51 | CXX= cmake .. -DCMAKE_BUILD_TYPE=Release 52 | make 53 | ``` 54 | 55 | This will build the library, and a demo executable. Running it will perform a benchmark on your CPU and CUDA device (if available). 56 | 57 | You do not necessarily need to pass the `` to cmake, it depends on the implementation you're using and its toolchain. 
58 | 59 | ## How to use 60 | 61 | Let's assume you used this [script](https://github.com/Michoumichmich/oneAPI-setup-script) to setup the toolchain with CUDA support. 62 | 63 | Here's a minimal example: 64 | 65 | ```C++ 66 | #include // SYCL headers 67 | #include "sycl_hash.hpp" // The headers 68 | #include "tools/sycl_queue_helpers.hpp" // To make sycl queue 69 | using namespace hash; 70 | 71 | int main(){ 72 | auto cuda_q = try_get_queue(cuda_selector{}); // create a queue on a cuda device and attach an exception handler 73 | 74 | constexpr int hash_size = get_block_size(); 75 | constexpr int n_blocks = 20; // amount of hash to do in parallel 76 | constexpr int item_size = 1024; 77 | 78 | byte input[n_blocks * item_size]; // get an array of 20 same-sized data items to hash; 79 | byte output[n_blocks * hash_size]; // reserve space for the output 80 | 81 | compute(cuda_q, input, item_size, output, n_blocks); // do the computing 82 | compute_sha256(cuda_q, input, item_size, output, n_blocks); // identical 83 | 84 | /** 85 | * For SHA3 one could write: 86 | * compute_sha3<512>(cuda_q, input, item_size, output, n_blocks); 87 | */ 88 | 89 | return 0; 90 | } 91 | ``` 92 | 93 | And, for clang build with 94 | 95 | ``` 96 | -fsycl -fsycl-targets=spir64_x86_64,nvptx64-nvidia-cuda--sm_50 -I /libsycl_hash.a 97 | ``` 98 | 99 | And your hash will run on the GPU. 100 | 101 | # Sources 102 | 103 | You may find [here](https://github.com/Michoumichmich/cuda-hashing-algos-with-benchmark) the fork of the original CUDA implementations with the benchmarks added. 104 | 105 | # Tested implementations 106 | 107 | - [Intel's clang](https://github.com/intel/llvm) with OpenCL on CPU (using Intel's driver) and [Codeplay's CUDA backend](https://www.codeplay.com/solutions/oneapi/for-cuda/) 108 | - [hipSYCL](https://github.com/illuhad/hipSYCL) on macOS with the OpenMP backend (set `hipSYCL_DIR` then `cmake .. 
-DHIPSYCL_TARGETS="..."`) 109 | - [ComputeCPP](https://developer.codeplay.com/products/computecpp/ce/home) you can build with `cmake .. -DComputeCpp_DIR=/path_to_computecpp -DCOMPUTECPP_BITCODE=spir64 -DCMAKE_BUILD_TYPE=Release`, Tested on the host 110 | device, `spir64` and `spirv64`. See [ComputeCpp SDK](https://github.com/codeplaysoftware/computecpp-sdk) 111 | 112 | # Acknowledgements 113 | 114 | This repository contains code written by Matt Zweil & The Mochimo Core Contributor Team. Please see the [files](https://github.com/mochimodev/cuda-hashing-algos) for their respective licences. 115 | -------------------------------------------------------------------------------- /examples/hashing/include/tools/runtime_byte_array.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * runtime_byte_array.hpp 20 | * 21 | * Description: 22 | * Runtime byte array for hashing functions 23 | **************************************************************************/ 24 | /** 25 | Copyright 2021 Codeplay Software Ltd. 26 | 27 | Licensed under the Apache License, Version 2.0 (the "License"); 28 | you may not use these files except in compliance with the License. 
29 | You may obtain a copy of the License at 30 | 31 | http://www.apache.org/licenses/LICENSE-2.0 32 | 33 | For your convenience, a copy of the License has been included in this 34 | repository. 35 | 36 | Unless required by applicable law or agreed to in writing, software 37 | distributed under the License is distributed on an "AS IS" BASIS, 38 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 39 | See the License for the specific language governing permissions and 40 | limitations under the License. 41 | 42 | @author Michel Migdal. 43 | */ 44 | 45 | /** 46 | * Array of bytes accessible with runtimes indices and that is stored using larger types to reduce register look-up lacencies/ 47 | */ 48 | 49 | #pragma once 50 | 51 | #include 52 | #include 53 | #include 54 | #include 55 | 56 | namespace sbb { 57 | namespace runtime_idx_detail { 58 | 59 | template 60 | static inline constexpr void runtime_index_wrapper_internal_store_byte(array_t &arr, const uint &word_idx, const uint8_t &byte_in, const uint &byte_idx) { 61 | static_assert(idx_max >= 0 && idx_max < N); 62 | #pragma unroll 63 | for (uint i = 0; i < N; ++i) { 64 | arr[i] = (word_idx == i) ? 
set_byte(arr[i], byte_in, byte_idx) : arr[i]; 65 | } 66 | } 67 | 68 | 69 | template 70 | [[nodiscard]] static inline constexpr T runtime_index_wrapper_internal_read_copy(const array_t &arr, const uint &i) { 71 | static_assert(idx_max >= 0 && idx_max < N); 72 | if constexpr (idx_max == 0 || N == 1) { 73 | return arr[0]; 74 | } else { 75 | if (i == idx_max) { 76 | return arr[idx_max]; 77 | } else { 78 | return runtime_index_wrapper_internal_read_copy(arr, i); 79 | } 80 | } 81 | } 82 | 83 | template 84 | static inline constexpr uint8_t runtime_index_wrapper_store_byte(std::array &array, const uint &i, const uint8_t &val, const uint &byte_idx) { 85 | runtime_index_wrapper_internal_store_byte, N>(array, i, (T) val, byte_idx); 86 | return val; 87 | } 88 | 89 | 90 | template 91 | [[nodiscard]] static inline constexpr T runtime_index_wrapper(const std::array &array, const uint &i) { 92 | return runtime_index_wrapper_internal_read_copy, N>(array, i); 93 | } 94 | } 95 | 96 | } 97 | 98 | 99 | template 100 | class runtime_byte_array { 101 | public: 102 | 103 | static_assert(std::is_unsigned_v && std::is_integral_v); 104 | 105 | /** 106 | * Connstructor that takes a list of bytes 107 | * @param init 108 | */ 109 | constexpr runtime_byte_array(const std::initializer_list &init) { 110 | uint idx = 0; 111 | for (auto b: init) { 112 | write(idx, b); 113 | ++idx; 114 | } 115 | } 116 | 117 | /** 118 | * Reads the ith byte 119 | * @param i index 120 | * @return the byte 121 | */ 122 | [[nodiscard]] constexpr uint8_t read(const uint &i) const { 123 | storage_type word = sbb::runtime_idx_detail::runtime_index_wrapper(storage_array_, i / sizeof(storage_type)); 124 | return sbb::get_byte(word, i % sizeof(storage_type)); 125 | } 126 | 127 | /** 128 | * Reads the ith byte 129 | * @param i index 130 | * @return the byte 131 | */ 132 | [[nodiscard]] constexpr uint8_t operator[](const uint &i) const { 133 | return read(i); 134 | } 135 | 136 | /** 137 | * Write the ith byte 138 | * @param i 
index 139 | * @return the byte written 140 | */ 141 | constexpr uint8_t write(const uint &i, const uint8_t &write_byte) { 142 | return sbb::runtime_idx_detail::runtime_index_wrapper_store_byte(storage_array_, i / sizeof(storage_type), write_byte, i % sizeof(storage_type)); 143 | } 144 | 145 | private: 146 | 147 | static constexpr int get_storage_size() { 148 | return (N + sizeof(storage_type) - 1) / sizeof(storage_type); 149 | } 150 | 151 | std::array storage_array_{}; 152 | 153 | }; -------------------------------------------------------------------------------- /examples/hashing/src/hash_functions/md2.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * md2.cpp 20 | * 21 | * Description: 22 | * MD2 hash function 23 | **************************************************************************/ 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | using namespace usm_smart_ptr; 31 | using namespace hash; 32 | 33 | struct md2_ctx { 34 | int len = 0; 35 | runtime_byte_array<16> data{}; 36 | byte state[48]{}; 37 | byte checksum[16]{}; 38 | }; 39 | 40 | /**************************** VARIABLES *****************************/ 41 | 42 | 43 | /*********************** FUNCTION DEFINITIONS ***********************/ 44 | template 45 | static inline void md2_transform(md2_ctx *ctx, const T &data) { 46 | constexpr byte consts[256] 47 | {41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6, 48 | 19, 98, 167, 5, 243, 192, 199, 115, 140, 152, 147, 43, 217, 188, 76, 49 | 130, 202, 30, 155, 87, 60, 253, 212, 224, 22, 103, 66, 111, 24, 138, 50 | 23, 229, 18, 190, 78, 196, 214, 218, 158, 222, 73, 160, 251, 245, 142, 51 | 187, 47, 238, 122, 169, 104, 121, 145, 21, 178, 7, 63, 148, 194, 16, 52 | 137, 11, 34, 95, 33, 128, 127, 93, 154, 90, 144, 50, 39, 53, 62, 53 | 204, 231, 191, 247, 151, 3, 255, 25, 48, 179, 72, 165, 181, 209, 215, 54 | 94, 146, 42, 172, 86, 170, 198, 79, 184, 56, 210, 150, 164, 125, 182, 55 | 118, 252, 107, 226, 156, 116, 4, 241, 69, 157, 112, 89, 100, 113, 135, 56 | 32, 134, 91, 207, 101, 230, 45, 168, 2, 27, 96, 37, 173, 174, 176, 57 | 185, 246, 28, 70, 97, 105, 52, 64, 126, 15, 85, 71, 163, 35, 221, 58 | 81, 175, 58, 195, 92, 249, 206, 186, 197, 234, 38, 44, 83, 13, 110, 59 | 133, 40, 132, 9, 211, 223, 205, 244, 65, 129, 77, 82, 106, 220, 55, 60 | 200, 108, 193, 171, 250, 36, 225, 123, 8, 12, 189, 177, 74, 120, 136, 61 | 149, 139, 227, 99, 232, 109, 233, 203, 213, 254, 59, 0, 29, 57, 242, 62 | 239, 183, 14, 102, 88, 208, 228, 166, 119, 114, 248, 235, 117, 75, 10, 63 | 49, 68, 80, 180, 143, 237, 31, 26, 219, 153, 141, 51, 159, 17, 
131, 64 | 20}; 65 | 66 | #ifdef __NVPTX__ 67 | #pragma unroll 68 | #endif 69 | for (int j = 0; j < 16; ++j) { 70 | ctx->state[j + 32] = (ctx->state[j + 16] = data[j]) ^ ctx->state[j]; 71 | } 72 | 73 | dword t = 0; 74 | 75 | #ifdef __NVPTX__ 76 | #pragma unroll 77 | #endif 78 | for (dword j = 0; j < 18; ++j) { 79 | 80 | #ifdef __NVPTX__ 81 | #pragma unroll 82 | #endif 83 | for (unsigned char &k: ctx->state) { 84 | t = k ^= consts[t]; 85 | } 86 | t = (t + j) & 0xFF; 87 | } 88 | 89 | t = ctx->checksum[15]; 90 | 91 | #ifdef __NVPTX__ 92 | #pragma unroll 93 | #endif 94 | for (int j = 0; j < 16; ++j) { 95 | t = ctx->checksum[j] ^= consts[data[j] ^ t]; 96 | } 97 | } 98 | 99 | static inline void md2_update(md2_ctx *ctx, const byte *data, size_t len) { 100 | for (size_t i = 0; i < len; ++i) { 101 | ctx->data.write(ctx->len, data[i]); 102 | ctx->len++; 103 | if (ctx->len == MD2_BLOCK_SIZE) { 104 | md2_transform(ctx, ctx->data); 105 | ctx->len = 0; 106 | } 107 | } 108 | } 109 | 110 | static inline void md2_final(md2_ctx *ctx, byte *hash) { 111 | int to_pad = (int) MD2_BLOCK_SIZE - ctx->len; 112 | if (to_pad > 0) { 113 | #ifdef __NVPTX__ 114 | #pragma unroll 115 | #endif 116 | for (int i = ctx->len; i < MD2_BLOCK_SIZE; ++i) { 117 | ctx->data.write(i, (byte) to_pad); 118 | } 119 | } 120 | md2_transform(ctx, ctx->data); 121 | md2_transform(ctx, ctx->checksum); 122 | memcpy(hash, ctx->state, MD2_BLOCK_SIZE); 123 | } 124 | 125 | static inline void kernel_md2_hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, dword thread) { 126 | if (thread >= n_batch) { 127 | return; 128 | } 129 | const byte *in = indata + thread * inlen; 130 | byte *out = outdata + thread * MD2_BLOCK_SIZE; 131 | md2_ctx ctx{}; 132 | md2_update(&ctx, in, inlen); 133 | md2_final(&ctx, out); 134 | } 135 | 136 | namespace hash::internal { 137 | 138 | sycl::event 139 | launch_md2_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword 
n_batch) { 140 | auto config = get_kernel_sizes(q, n_batch); 141 | return q.submit([&](sycl::handler &cgh) { 142 | cgh.depends_on(e); 143 | cgh.parallel_for( 144 | sycl::nd_range<1>(sycl::range<1>(config.block) * sycl::range<1>(config.wg_size), sycl::range<1>(config.wg_size)), 145 | [=](sycl::nd_item<1> item) { 146 | kernel_md2_hash(indata, inlen, outdata, n_batch, item.get_global_linear_id()); 147 | }); 148 | }); 149 | } 150 | 151 | 152 | } 153 | 154 | -------------------------------------------------------------------------------- /examples/hashing/include/internal/sync_api.hpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sync_api.hpp 20 | * 21 | * Description: 22 | * Synchronous hashing API 23 | **************************************************************************/ 24 | #pragma once 25 | 26 | #include "handle.hpp" 27 | #include "common.hpp" 28 | 29 | #include "../tools/intrinsics.hpp" 30 | #include "../tools/sycl_queue_helpers.hpp" 31 | 32 | #include 33 | #include 34 | 35 | namespace hash { 36 | using namespace usm_smart_ptr; 37 | 38 | 39 | /** 40 | * Computes synchronously a hash. 
41 | * @tparam M Hash method 42 | * @param q Queue to run on 43 | * @param in Pointer to the input data in any memory accessible by the HOST. Contains an array of data. 44 | * @param inlen Size in bytes of one block to hash. 45 | * @param out Pointer to the output memory accessible by the HOST 46 | * @param n_batch Number of blocks to hash. In and Out pointers must have correct sizes. 47 | */ 48 | template > 49 | inline void compute(sycl::queue &q, const byte *in, dword inlen, byte *out, dword n_batch) { 50 | if (is_ptr_usable(in, q) && is_ptr_usable(out, q)) { 51 | internal::dispatch_hash(q, sycl::event{}, device_accessible_ptr(in), device_accessible_ptr(out), inlen, n_batch, nullptr, 0).wait(); 52 | } else { 53 | internal::hash_with_data_copy({q, in, out, n_batch, inlen}, nullptr, 0).dev_e_.wait(); 54 | } 55 | } 56 | 57 | /** 58 | * Computes synchronously a hash. 59 | * @tparam M Hash method 60 | * @tparam n_outbit Number of bits to output 61 | * @param q Queue to run on 62 | * @param in Pointer to the input data in any memory accessible by the HOST. Contains an array of data. 63 | * @param inlen Size in bytes of one block to hash. 64 | * @param out Pointer to the output memory accessible by the HOST 65 | * @param n_batch Number of blocks to hash. In and Out pointers must have correct sizes. 66 | */ 67 | template> 68 | inline void compute(sycl::queue &q, const byte *in, dword inlen, byte *out, dword n_batch) { 69 | if (is_ptr_usable(in, q) && is_ptr_usable(out, q)) { 70 | internal::dispatch_hash(q, sycl::event{}, device_accessible_ptr(in), device_accessible_ptr(out), inlen, n_batch, nullptr, 0).wait(); 71 | } else { 72 | internal::hash_with_data_copy({q, in, out, n_batch, inlen}, nullptr, 0).dev_e_.wait(); 73 | } 74 | } 75 | 76 | /** 77 | * Computes synchronously a hash. 78 | * @tparam M Hash method 79 | * @tparam n_outbit Number of bits to output 80 | * @param q Queue to run on 81 | * @param in Pointer to the input data in any memory accessible by the HOST. 
Contains an array of data. 82 | * @param inlen Size in bytes of one block to hash. 83 | * @param out Pointer to the output memory accessible by the HOST 84 | * @param n_batch Number of blocks to hash. In and Out pointers must have correct sizes. 85 | */ 86 | template> 87 | inline void compute(sycl::queue &q, const byte *in, dword inlen, byte *out, dword n_batch, byte *key, dword keylen) { 88 | if (is_ptr_usable(in, q) && is_ptr_usable(out, q)) { 89 | internal::dispatch_hash(q, sycl::event{}, device_accessible_ptr(in), device_accessible_ptr(out), inlen, n_batch, key, keylen).wait(); 90 | } else { 91 | internal::hash_with_data_copy({q, in, out, n_batch, inlen}, key, keylen).dev_e_.wait(); 92 | } 93 | } 94 | 95 | #define alias_sync_compute(alias_name, method) \ 96 | template \ 97 | auto alias_name(Args&&... args) -> decltype(compute(std::forward(args)...)) { \ 98 | return compute(std::forward(args)...); \ 99 | } 100 | 101 | #define alias_sync_compute_with_n_outbit(alias_name, method) \ 102 | template \ 103 | auto alias_name(Args&&... 
args) -> decltype(compute(std::forward(args)...)) { \ 104 | return compute(std::forward(args)...); \ 105 | } 106 | 107 | alias_sync_compute(compute_md2, hash::method::md2) 108 | 109 | alias_sync_compute(compute_md5, hash::method::md5) 110 | 111 | alias_sync_compute(compute_sha1, hash::method::sha1) 112 | 113 | alias_sync_compute(compute_sha256, hash::method::sha256) 114 | 115 | alias_sync_compute_with_n_outbit(compute_sha3, hash::method::sha3) 116 | 117 | alias_sync_compute_with_n_outbit(compute_blake2b, hash::method::blake2b) 118 | 119 | alias_sync_compute_with_n_outbit(compute_keccak, hash::method::keccak) 120 | 121 | #undef alias_sync_compute 122 | #undef alias_sync_compute_with_n_outbit 123 | 124 | 125 | } //namespace hash::v_1 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /examples/hashing/src/hash_functions/sha1.cpp: -------------------------------------------------------------------------------- 1 | /*************************************************************************** 2 | * 3 | * Copyright (C) Codeplay Software Ltd. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sha1.cpp 20 | * 21 | * Description: 22 | * SHA1 hash function 23 | **************************************************************************/ 24 | #include 25 | #include 26 | #include 27 | 28 | #include 29 | 30 | 31 | using namespace usm_smart_ptr; 32 | 33 | struct sha1_ctx { 34 | byte data[64]; 35 | dword datalen = 0; 36 | qword bitlen = 0; 37 | dword state[5]{0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xc3d2e1f0}; 38 | dword k[4]{0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6}; 39 | 40 | }; 41 | 42 | /****************************** MACROS ******************************/ 43 | #ifndef ROTLEFT 44 | #define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32-(b)))) 45 | #endif 46 | 47 | /*********************** FUNCTION DEFINITIONS ***********************/ 48 | void sha1_transform(sha1_ctx *ctx, const byte *data) { 49 | dword a, b, c, d, e, t, m[80]; 50 | 51 | #ifdef __NVPTX__ 52 | #pragma unroll 53 | #endif 54 | for (int i = 0, j = 0; i < 16; ++i, j += 4) { 55 | m[i] = sbb::upsample(data[j], data[j + 1], data[j + 2], data[j + 3]); 56 | } 57 | 58 | 59 | #ifdef __NVPTX__ 60 | #pragma unroll 61 | #endif 62 | for (qword i = 16; i < 80; ++i) { 63 | m[i] = (m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16]); 64 | m[i] = (m[i] << 1) | (m[i] >> 31); 65 | } 66 | 67 | a = ctx->state[0]; 68 | b = ctx->state[1]; 69 | c = ctx->state[2]; 70 | d = ctx->state[3]; 71 | e = ctx->state[4]; 72 | 73 | #ifdef __NVPTX__ 74 | #pragma unroll 75 | #endif 76 | for (dword i = 0; i < 20; ++i) { 77 | t = ROTLEFT(a, 5) + ((b & c) ^ (~b & d)) + e + ctx->k[0] + m[i]; 78 | e = d; 79 | d = c; 80 | c = ROTLEFT(b, 30); 81 | b = a; 82 | a = t; 83 | } 84 | #ifdef __NVPTX__ 85 | #pragma unroll 86 | #endif 87 | for (dword i = 20; i < 40; ++i) { 88 | t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[1] + m[i]; 89 | e = d; 90 | d = c; 91 | c = ROTLEFT(b, 30); 92 | b = a; 93 | a = t; 94 | } 95 | 96 | #ifdef __NVPTX__ 97 | #pragma unroll 98 | #endif 99 | for 
(dword i = 40; i < 60; ++i) { 100 | t = ROTLEFT(a, 5) + ((b & c) ^ (b & d) ^ (c & d)) + e + ctx->k[2] + m[i]; 101 | e = d; 102 | d = c; 103 | c = ROTLEFT(b, 30); 104 | b = a; 105 | a = t; 106 | } 107 | 108 | #ifdef __NVPTX__ 109 | #pragma unroll 110 | #endif 111 | for (dword i = 60; i < 80; ++i) { 112 | t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[3] + m[i]; 113 | e = d; 114 | d = c; 115 | c = ROTLEFT(b, 30); 116 | b = a; 117 | a = t; 118 | } 119 | 120 | ctx->state[0] += a; 121 | ctx->state[1] += b; 122 | ctx->state[2] += c; 123 | ctx->state[3] += d; 124 | ctx->state[4] += e; 125 | } 126 | 127 | void sha1_update(sha1_ctx *ctx, const byte *data, size_t len) { 128 | for (size_t i = 0; i < len; ++i) { 129 | ctx->data[ctx->datalen] = data[i]; 130 | ctx->datalen++; 131 | if (ctx->datalen == 64) { 132 | sha1_transform(ctx, ctx->data); 133 | ctx->bitlen += 512; 134 | ctx->datalen = 0; 135 | } 136 | } 137 | } 138 | 139 | void sha1_final(sha1_ctx *ctx, byte *hash) { 140 | dword i = ctx->datalen; 141 | 142 | // Pad whatever data is left in the buffer. 143 | if (ctx->datalen < 56) { 144 | ctx->data[i++] = 0x80; 145 | while (i < 56) 146 | ctx->data[i++] = 0x00; 147 | } else { 148 | ctx->data[i++] = 0x80; 149 | while (i < 64) 150 | ctx->data[i++] = 0x00; 151 | sha1_transform(ctx, ctx->data); 152 | memset(ctx->data, 0, 56); 153 | } 154 | 155 | // Append to the padding the total message's length in bits and transform. 
156 | ctx->bitlen += ctx->datalen * 8; 157 | ctx->data[63] = ctx->bitlen; 158 | ctx->data[62] = ctx->bitlen >> 8; 159 | ctx->data[61] = ctx->bitlen >> 16; 160 | ctx->data[60] = ctx->bitlen >> 24; 161 | ctx->data[59] = ctx->bitlen >> 32; 162 | ctx->data[58] = ctx->bitlen >> 40; 163 | ctx->data[57] = ctx->bitlen >> 48; 164 | ctx->data[56] = ctx->bitlen >> 56; 165 | sha1_transform(ctx, ctx->data); 166 | 167 | // Since this implementation uses little endian byte ordering and MD uses big endian, 168 | // reverse all the bytes when copying the final state to the output hash. 169 | for (i = 0; i < 4; ++i) { 170 | hash[i] = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff; 171 | hash[i + 4] = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff; 172 | hash[i + 8] = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff; 173 | hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff; 174 | hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff; 175 | } 176 | } 177 | 178 | void kernel_sha1_hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, dword thread) { 179 | if (thread >= n_batch) { 180 | return; 181 | } 182 | const byte *in = indata + thread * inlen; 183 | byte *out = outdata + thread * SHA1_BLOCK_SIZE; 184 | sha1_ctx ctx{}; 185 | sha1_update(&ctx, in, inlen); 186 | sha1_final(&ctx, out); 187 | } 188 | 189 | 190 | namespace hash::internal { 191 | sycl::event launch_sha1_kernel(sycl::queue &q, sycl::event e, const device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch) { 192 | auto config = get_kernel_sizes(q, n_batch); 193 | return q.submit([&](sycl::handler &cgh) { 194 | cgh.depends_on(e); 195 | cgh.parallel_for( 196 | sycl::nd_range<1>(sycl::range<1>(config.block) * sycl::range<1>(config.wg_size), sycl::range<1>(config.wg_size)), 197 | [=](sycl::nd_item<1> item) { 198 | kernel_sha1_hash(indata, inlen, outdata, n_batch, item.get_global_linear_id()); 199 | }); 200 | }); 201 | } 202 | 203 | } 204 | 
--------------------------------------------------------------------------------
/examples/hashing/include/tools/sycl_queue_helpers.hpp:
--------------------------------------------------------------------------------
/***************************************************************************
 *
 *  Copyright (C) Codeplay Software Ltd.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Codeplay's SYCL-For-CUDA-Examples
 *
 *  sycl_queue_helpers.hpp
 *
 *  Description:
 *    Helper functions relating to SYCL queues
 **************************************************************************/
#pragma once

// NOTE(review): the two system #include directives below lost their header
// names during text extraction (angle-bracket payloads stripped). The same
// stripping removed several template parameter lists further down in this
// file (bare "template" heads, "get_info()" with no info query). Restore
// them from the original file before building.
#include
#include
#include "../internal/common.hpp"

#ifdef USING_COMPUTECPP
class queue_kernel_tester;
namespace cl::sycl::usm{
using cl::sycl::experimental::usm::alloc;
}
#endif

/**
 * Selects a CUDA device (but sometimes returns an invalid one).
 */
class cuda_selector : public sycl::device_selector {
public:
    int operator()(const sycl::device &device) const override {
#if defined(SYCL_IMPLEMENTATION_ONEAPI) || defined(SYCL_IMPLEMENTATION_INTEL)
        // NOTE(review): get_info() lost its template argument in extraction -- TODO confirm the query used here.
        return device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda && device.get_info() ? 1 : -1;
#else
        // Fall back on matching the vendor string in the device name.
        return device.is_gpu() && (device.get_info().find("NVIDIA") != std::string::npos) ? 1 : -1;
#endif
    }
};


// Defined in src/tools/queue_tester.cpp: runs a trivial task on the queue.
void queue_tester(sycl::queue &q);


/**
 * Tries to get a queue from a selector else returns the host device
 * @tparam strict if true will check whether the queue can run a trivial task which implies
 * that the translation unit needs to be compiled with support for the device you're selecting.
 */
template
inline sycl::queue try_get_queue(const T &selector) {
    // Async-error handler: log and swallow, so a broken device only triggers the fallback path.
    auto exception_handler = [](const sycl::exception_list &exceptions) {
        for (std::exception_ptr const &e: exceptions) {
            try {
                std::rethrow_exception(e);
            }
            catch (sycl::exception const &e) {
                std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl;
            }
            catch (std::exception const &e) {
                std::cout << "Caught asynchronous STL exception: " << e.what() << std::endl;
            }
        }
    };

    sycl::device dev;
    sycl::queue q;
    try {
        dev = sycl::device(selector);
        q = sycl::queue(dev, exception_handler);

        try {
            if constexpr (strict) {
                if (dev.is_cpu() || dev.is_gpu()) { //Only CPU and GPU not host, dsp, fpga, ?...
                    queue_tester(q);
                }
            }
        } catch (...) {
            // Device was found but cannot run a kernel: fall back to the host device.
            std::cerr << "Warning: " << dev.get_info() << " found but not working! Fall back on: ";
            dev = sycl::device(sycl::host_selector());
            q = sycl::queue(dev, exception_handler);
            std::cerr << dev.get_info() << '\n';
            return q;
        }
    }
    catch (...) {
        // Selector matched no device at all: fall back to the host device.
        dev = sycl::device(sycl::host_selector());
        q = sycl::queue(dev, exception_handler);
        std::cerr << "Warning: Expected device not found! Fall back on: " << dev.get_info() << '\n';
    }
    return q;
}

#if defined(__linux__) || defined(__APPLE__) || defined(__LINUX__)

// NOTE(review): headers stripped in extraction -- presumably <unistd.h> and <sys/mman.h>. TODO confirm.
#include
#include

/**
 * Checks whether a pointer was allocated on the host device as the pointer query is not reliable on DPC++ on the host.
 * @see http://si-head.nl/articles/msync
 * @return Whether the memory was allocated on the host OS.
 */
template
inline bool valid_pointer(T *p) {
    // Get page size and calculate page mask
    auto pagesz = (size_t) sysconf(_SC_PAGESIZE);
    size_t pagemask = ~(pagesz - 1);
    // Calculate base address
    void *base = (void *) (((size_t) p) & pagemask);
    // msync on the containing page fails with ENOMEM for an unmapped address.
    // NOTE(review): this probes only the page holding the first byte of *p -- an
    // object spanning a page boundary is not fully validated. TODO confirm intent.
    return msync(base, sizeof(T), MS_ASYNC) == 0;
}

#else
// Non-POSIX platforms: no cheap validity probe available, conservatively report false.
template
inline bool valid_pointer(T *p) {
    return false;
}
#endif


// Returns whether `ptr` can be dereferenced by kernels submitted to `q`:
// USM shared or device memory, or host USM when running on the CPU.
template
inline bool is_ptr_usable([[maybe_unused]] const T *ptr, [[maybe_unused]] const sycl::queue &q) {
    if (q.get_device().is_host()) {
        return valid_pointer(ptr);
    }

    try {
        // Throws if the pointer is unknown to this context.
        sycl::get_pointer_device(ptr, q.get_context());
        sycl::usm::alloc alloc_type = sycl::get_pointer_type(ptr, q.get_context());
        if constexpr(debug) {
            std::cerr << "Allocated on:" << q.get_device().get_info() << " USM type: ";
            switch (alloc_type) {
                case sycl::usm::alloc::host:
                    std::cerr << "alloc::host" << '\n';
                    break;
                case sycl::usm::alloc::device:
                    std::cerr << "alloc::device" << '\n';
                    break;
                case sycl::usm::alloc::shared:
                    std::cerr << "alloc::shared" << '\n';
                    break;
                case sycl::usm::alloc::unknown:
                    std::cerr << "alloc::unknown" << '\n';
                    break;
            }
        }
        return alloc_type == sycl::usm::alloc::shared // Shared memory is ok
               || alloc_type == sycl::usm::alloc::device // Device memory is ok
               || (alloc_type == sycl::usm::alloc::host && q.get_device().is_cpu()) // We discard host allocated memory because of poor performance unless on the CPU
                ;
    } catch (...) {
        if constexpr (debug) {
            std::cerr << "Not allocated on:" << q.get_device().get_info() << '\n';
        }
        return false;
    }

}


/**
 * Useful for memory-bound computation.
 * Returns CPU devices that represent different NUMA nodes.
 * @return
 */
/* inline hash::runners get_cpu_runners_numa() {
    try {
        sycl::device d{sycl::cpu_selector{}};
        auto numa_nodes = d.create_sub_devices(sycl::info::partition_affinity_domain::numa);
        hash::runners runners_;
        std::transform(numa_nodes.begin(), numa_nodes.end(), runners_.begin(), [](auto &dev) -> hash::runner { return {try_get_queue(dev), 1}; });
        return runners_;
    }
    catch (...) {
        return {{sycl::queue{sycl::host_selector{}}, 1}};
    }
} */
--------------------------------------------------------------------------------
/examples/hashing/src/hash_functions/sha256.cpp:
--------------------------------------------------------------------------------
/***************************************************************************
 *
 *  Copyright (C) Codeplay Software Ltd.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
16 | * 17 | * Codeplay's SYCL-For-CUDA-Examples 18 | * 19 | * sha256.cpp 20 | * 21 | * Description: 22 | * SHA256 hash function 23 | **************************************************************************/ 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | 30 | using namespace usm_smart_ptr; 31 | 32 | /**************************** DATA TYPES ****************************/ 33 | struct sha256_ctx { 34 | byte data[64]; 35 | qword bitlen = 0; 36 | dword datalen = 0; 37 | dword state[8]{}; 38 | 39 | sha256_ctx() { 40 | state[0] = 0x6a09e667; 41 | state[1] = 0xbb67ae85; 42 | state[2] = 0x3c6ef372; 43 | state[3] = 0xa54ff53a; 44 | state[4] = 0x510e527f; 45 | state[5] = 0x9b05688c; 46 | state[6] = 0x1f83d9ab; 47 | state[7] = 0x5be0cd19; 48 | } 49 | }; 50 | 51 | /****************************** MACROS ******************************/ 52 | #ifndef ROTLEFT 53 | #define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32-(b)))) 54 | #endif 55 | 56 | #define ROTRIGHT(a, b) (((a) >> (b)) | ((a) << (32-(b)))) 57 | 58 | #define CH(x, y, z) (((x) & (y)) ^ (~(x) & (z))) 59 | #define MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) 60 | #define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22)) 61 | #define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25)) 62 | #define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3)) 63 | #define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10)) 64 | 65 | /**************************** VARIABLES *****************************/ 66 | 67 | 68 | /*********************** FUNCTION DEFINITIONS ***********************/ 69 | static void sha256_transform(sha256_ctx *ctx, const byte *data) { 70 | dword a, b, c, d, e, f, g, h, t1, t2, m[64]; 71 | 72 | static const dword consts[64] = 73 | {0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 74 | 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 75 | 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 76 | 0x0fc19dc6, 
0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 77 | 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 78 | 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 79 | 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 80 | 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 81 | 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 82 | 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 83 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2}; 84 | 85 | #ifdef __NVPTX__ 86 | #pragma unroll 87 | #endif 88 | for (int i = 0, j = 0; i < 16; ++i, j += 4) { 89 | m[i] = sbb::upsample(data[j], data[j + 1], data[j + 2], data[j + 3]); 90 | } 91 | 92 | #ifdef __NVPTX__ 93 | #pragma unroll 94 | #endif 95 | for (int i = 16; i < 64; ++i) { 96 | m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16]; 97 | } 98 | 99 | a = ctx->state[0]; 100 | b = ctx->state[1]; 101 | c = ctx->state[2]; 102 | d = ctx->state[3]; 103 | e = ctx->state[4]; 104 | f = ctx->state[5]; 105 | g = ctx->state[6]; 106 | h = ctx->state[7]; 107 | 108 | #ifdef __NVPTX__ 109 | #pragma unroll 110 | #endif 111 | for (int i = 0; i < 64; ++i) { 112 | t1 = h + EP1(e) + CH(e, f, g) + consts[i] + m[i]; 113 | t2 = EP0(a) + MAJ(a, b, c); 114 | h = g; 115 | g = f; 116 | f = e; 117 | e = d + t1; 118 | d = c; 119 | c = b; 120 | b = a; 121 | a = t1 + t2; 122 | } 123 | 124 | ctx->state[0] += a; 125 | ctx->state[1] += b; 126 | ctx->state[2] += c; 127 | ctx->state[3] += d; 128 | ctx->state[4] += e; 129 | ctx->state[5] += f; 130 | ctx->state[6] += g; 131 | ctx->state[7] += h; 132 | } 133 | 134 | 135 | static void sha256_update(sha256_ctx *ctx, const byte *data, size_t len) { 136 | for (dword i = 0; i < len; ++i) { 137 | ctx->data[ctx->datalen] = data[i]; 138 | ctx->datalen++; 139 | if (ctx->datalen == 64) { 140 | sha256_transform(ctx, ctx->data); 141 | ctx->bitlen += 512; 142 | ctx->datalen = 0; 
143 | } 144 | } 145 | } 146 | 147 | static void sha256_final(sha256_ctx *ctx, byte *hash) { 148 | dword i = ctx->datalen; 149 | // Pad whatever data is left in the buffer. 150 | if (ctx->datalen < 56) { 151 | ctx->data[i++] = 0x80; 152 | while (i < 56) { 153 | ctx->data[i++] = 0x00; 154 | } 155 | 156 | } else { 157 | ctx->data[i++] = 0x80; 158 | while (i < 64) { 159 | ctx->data[i++] = 0x00; 160 | } 161 | sha256_transform(ctx, ctx->data); 162 | std::memset(ctx->data, 0, 56); 163 | } 164 | 165 | // Append to the padding the total message's length in bits and transform. 166 | ctx->bitlen += ctx->datalen * 8; 167 | ctx->data[63] = ctx->bitlen; 168 | ctx->data[62] = ctx->bitlen >> 8; 169 | ctx->data[61] = ctx->bitlen >> 16; 170 | ctx->data[60] = ctx->bitlen >> 24; 171 | ctx->data[59] = ctx->bitlen >> 32; 172 | ctx->data[58] = ctx->bitlen >> 40; 173 | ctx->data[57] = ctx->bitlen >> 48; 174 | ctx->data[56] = ctx->bitlen >> 56; 175 | sha256_transform(ctx, ctx->data); 176 | 177 | // Since this implementation uses little endian byte ordering and SHA uses big endian, 178 | // reverse all the bytes when copying the final state to the output hash. 
179 | #pragma unroll 180 | for (i = 0; i < 4; ++i) { 181 | hash[i] = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff; 182 | hash[i + 4] = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff; 183 | hash[i + 8] = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff; 184 | hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff; 185 | hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff; 186 | hash[i + 20] = (ctx->state[5] >> (24 - i * 8)) & 0x000000ff; 187 | hash[i + 24] = (ctx->state[6] >> (24 - i * 8)) & 0x000000ff; 188 | hash[i + 28] = (ctx->state[7] >> (24 - i * 8)) & 0x000000ff; 189 | } 190 | } 191 | 192 | static void kernel_sha256_hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, dword thread) { 193 | if (thread >= n_batch) { 194 | return; 195 | } 196 | const byte *in = indata + thread * inlen; 197 | byte *out = outdata + thread * SHA256_BLOCK_SIZE; 198 | sha256_ctx ctx{}; 199 | sha256_update(&ctx, in, inlen); 200 | sha256_final(&ctx, out); 201 | } 202 | 203 | namespace hash::internal { 204 | 205 | sycl::event 206 | launch_sha256_kernel(sycl::queue &q, sycl::event e, const device_accessible_ptr indata, device_accessible_ptr outdata, dword inlen, dword n_batch) { 207 | auto config = get_kernel_sizes(q, n_batch); 208 | return q.submit([&](sycl::handler &cgh) { 209 | cgh.depends_on(e); 210 | cgh.parallel_for( 211 | sycl::nd_range<1>(sycl::range<1>(config.block) * sycl::range<1>(config.wg_size), sycl::range<1>(config.wg_size)), 212 | [=](sycl::nd_item<1> item) { 213 | kernel_sha256_hash(indata, inlen, outdata, n_batch, item.get_global_linear_id()); 214 | }); 215 | }); 216 | } 217 | 218 | 219 | } 220 | --------------------------------------------------------------------------------