├── examples
    ├── kokkos
    │   ├── kokkos_build_scripts
    │   │   ├── README.md
    │   │   ├── kokkos_nvcc_cuda_build.sh
    │   │   ├── kokkos_clang_cuda_build.sh
    │   │   └── kokkos_build.sh
    │   ├── build.sh
    │   ├── CMakeLists.txt
    │   └── README.md
    ├── sgemm_interop
    │   ├── build.sh
    │   ├── README.md
    │   ├── CMakeLists.txt
    │   ├── sgemm.cu
    │   ├── sycl_sgemm_usm.cpp
    │   └── sycl_sgemm.cpp
    ├── vector_addition
    │   ├── build.sh
    │   ├── CMakeLists.txt
    │   ├── vector_addition.cpp
    │   ├── vector_addition_usm.cpp
    │   ├── README.md
    │   └── vector_addition.cu
    ├── hashing
    │   ├── tests
    │   │   ├── CMakeLists.txt
    │   │   └── tests_helpers.hpp
    │   ├── cmake
    │   │   ├── Modules
    │   │   │   ├── ComputeCppIRMap.cmake
    │   │   │   └── ComputeCppCompilerChecks.cmake
    │   │   └── FindSYCL.cmake
    │   ├── include
    │   │   ├── sycl_hash.hpp
    │   │   ├── hash_functions
    │   │   │   ├── md2.hpp
    │   │   │   ├── sha1.hpp
    │   │   │   ├── md5.hpp
    │   │   │   ├── sha256.hpp
    │   │   │   ├── keccak.hpp
    │   │   │   ├── blake2b.hpp
    │   │   │   └── blake3.hpp
    │   │   ├── internal
    │   │   │   ├── config.hpp
    │   │   │   ├── determine_kernel_config.hpp
    │   │   │   ├── handle.hpp
    │   │   │   ├── async_api.hpp
    │   │   │   └── sync_api.hpp
    │   │   └── tools
    │   │   │   ├── fill_rand.hpp
    │   │   │   ├── intrinsics.hpp
    │   │   │   ├── usm_smart_ptr.hpp
    │   │   │   ├── runtime_byte_array.hpp
    │   │   │   └── sycl_queue_helpers.hpp
    │   ├── src
    │   │   ├── tools
    │   │   │   └── queue_tester.cpp
    │   │   ├── benchmarks
    │   │   │   └── misc.hpp
    │   │   └── hash_functions
    │   │   │   ├── md2.cpp
    │   │   │   ├── sha1.cpp
    │   │   │   └── sha256.cpp
    │   ├── CMakeLists.txt
    │   ├── demo_main.cpp
    │   ├── doc
    │   │   └── README.md
    │   └── README.md
    ├── MPI
    │   ├── README.md
    │   └── Makefile
    ├── distrib_batch_gemm
    │   ├── README.md
    │   ├── main.cpp
    │   ├── Makefile
    │   ├── vadd_sycl.cpp
    │   └── vadd_cuda.cu
    └── fortran_interface
    │   ├── README.md
    │   ├── Makefile
    │   ├── saxpy.cpp
    │   └── saxpy.cuf
├── .github
    ├── dependabot.yml
    └── workflows
    │   └── scorecard.yml
├── SECURITY.md
├── .gitignore
├── Contributing.md
├── setup-script
    ├── sample
    │   ├── README.md
    │   ├── CMakeLists.txt
    │   ├── include
    │   │   ├── chrono.hpp
    │   │   ├── common.hpp
    │   │   └── usm_smart_ptr.hpp
    │   ├── mkl_matmult_usm.cpp
    │   └── mkl_matmult.cpp
    ├── README.md
    ├── build_minimal.sh
    ├── build.sh
    └── build_with_libcxx.sh
└── README.md


/examples/kokkos/kokkos_build_scripts/README.md:
--------------------------------------------------------------------------------
1 | These build scripts are provided for illustration only. They will almost certainly require modification before they work elsewhere.
2 | 


--------------------------------------------------------------------------------
/examples/sgemm_interop/build.sh:
--------------------------------------------------------------------------------
1 | rm -rf build && mkdir build && cd build
2 | cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++
3 | make -j
4 | 


--------------------------------------------------------------------------------
/examples/vector_addition/build.sh:
--------------------------------------------------------------------------------
1 | rm -rf build && mkdir  build && cd build
2 | cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++ -DCMAKE_EXPORT_COMPILE_COMMANDS=yes
3 | make -j 8
4 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   # Enable version updates for Github Actions
 4 |   - package-ecosystem: "github-actions"
 5 |     directory: "/"
 6 |     schedule:
 7 |       interval: "monthly"
 8 |     groups:
 9 |       github-actions:
10 |         patterns:
11 |           - "*"
12 |     reviewers:
13 |       - "codeplaysoftware/security-managers"
14 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Reporting a Vulnerability
4 | 
5 | To report a vulnerability or a security issue please fill the security
6 | advisories form [here](../../security/advisories/new), send an email to
7 | security@codeplay.com or contact us using the [contact form on our web
8 | page](https://codeplay.com/company/contact/?q=Report%20Security%20Issue).
9 | 


--------------------------------------------------------------------------------
/examples/kokkos/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | rm -rf build
 4 | mkdir build
 5 | cd build
 6 | 
 7 | # Set the environment variable Kokkos_ROOT="[your/kokkos/installation]/lib/cmake/Kokkos"
 8 | CXXFLAGS="-Xsycl-target-frontend -O3" \
 9 | LDFLAGS="-Xsycl-target-frontend -O3" \
10 | cmake .. -G Ninja \
11 |       -DCMAKE_BUILD_TYPE=Debug \
12 |       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
13 |       -DCMAKE_CXX_COMPILER=clang++ \
14 |       -DCMAKE_C_COMPILER=clang
15 | 
16 | ninja
17 | 
18 | cd ..
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | # Temporaries
35 | *~
36 | *#
37 | */build
38 | ]
39 | # vim
40 | *.swp
41 | 


--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | -------------
 3 | 
 4 | New examples or improvements to the existing ones are welcomed. 
 5 | Please, follow 
 6 | [LLVM coding standards](https://llvm.org/docs/CodingStandards.html) when 
 7 | contributing code, since sometimes they will be contributed as tests to the
 8 | [DPCPP project](https://github.com/intel/llvm) project.
 9 | 
10 | When writing your commit message, please make sure to follow 
11 | [LLVM developer policies](https://llvm.org/docs/DeveloperPolicy.html#commit-messages) 
12 | on the subject.
13 | 


--------------------------------------------------------------------------------
/examples/hashing/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | include(FetchContent)
 2 | FetchContent_Declare(
 3 |         googletest
 4 |         URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
 5 | )
 6 | # For Windows: Prevent overriding the parent project's compiler/linker settings
 7 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 8 | FetchContent_MakeAvailable(googletest)
 9 | 
10 | 
11 | enable_testing()
12 | 
13 | add_executable(
14 |         test_all_hashes
15 |         tests/tests.cpp
16 | )
17 | 
18 | add_sycl_to_target(TARGET test_all_hashes SOURCES tests/tests.cpp)
19 | 
20 | target_link_libraries(test_all_hashes PUBLIC gtest_main sycl_hash)
21 | 
22 | include(GoogleTest)
23 | gtest_discover_tests(test_all_hashes)


--------------------------------------------------------------------------------
/examples/MPI/README.md:
--------------------------------------------------------------------------------
 1 | ## MPI + SYCL example
 2 | 
 3 | This example shows how to integrate MPI calls within the SYCL DAG using Host Tasks for integration.
 4 | 
 5 | 
 6 | ## Requisites
 7 | 
 8 | The Makefile provided assumes the MPICXX compiler points to the DPCPP compiler with CUDA support.
 9 | That requires the MPI implementation to be built, or use, the DPCPP compiler.
10 | The MPI implementation needs to have been built with CUDA support (typically called "CUDA-aware" MPI")
11 | 
12 | ## Compilation
13 | 
14 | If MPICXX points to DPC++ with CUDA support and its on the path, "make" should build the program.
15 | 
16 | ## Execution
17 | 
18 | The makefile contains a target to execute the problem in two processes:
19 | 
20 | ```sh
21 | make run
22 | ```
23 | 
24 | The target assumes mpirun is on the PATH
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/examples/hashing/cmake/Modules/ComputeCppIRMap.cmake:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.4.3)
 2 | 
 3 | # These should match the types of IR output by compute++
 4 | set(IR_MAP_spir bc)
 5 | set(IR_MAP_spir64 bc)
 6 | set(IR_MAP_spir32 bc)
 7 | set(IR_MAP_spirv spv)
 8 | set(IR_MAP_spirv64 spv)
 9 | set(IR_MAP_spirv32 spv)
10 | set(IR_MAP_aorta-x86_64 o)
11 | set(IR_MAP_aorta-aarch64 o)
12 | set(IR_MAP_aorta-rcar-cve o)
13 | set(IR_MAP_custom-spir64 bc)
14 | set(IR_MAP_custom-spir32 bc)
15 | set(IR_MAP_custom-spirv64 spv)
16 | set(IR_MAP_custom-spirv32 spv)
17 | set(IR_MAP_ptx64 s)
18 | set(IR_MAP_amdgcn s)
19 | 
20 | # Retrieves the filename extension of the IR output of compute++
21 | function(get_sycl_target_extension output)
22 |     set(syclExtension ${IR_MAP_${COMPUTECPP_BITCODE}})
23 |     if (NOT syclExtension)
24 |         # Needed when using multiple device targets
25 |         set(syclExtension "bc")
26 |     endif ()
27 |     set(${output} ${syclExtension} PARENT_SCOPE)
28 | endfunction()
29 | 


--------------------------------------------------------------------------------
/examples/kokkos/kokkos_build_scripts/kokkos_nvcc_cuda_build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Install Kokkos w/ sycl-cuda support
 3 | 
 4 | set -x #echo on
 5 | 
 6 | # Set:
 7 | # KOKKOS_INSTALL_DIR=[/your/install/dir]
 8 | # KOKKOS_SOURCE_DIR=[/your/source/dir]
 9 | # HWLOC_DIR=[/your/hwloc/dir]
10 | 
11 | # Configure & build kokkos
12 | mkdir kokkos-build
13 | cd kokkos-build
14 | 
15 | cmake $KOKKOS_SOURCE_DIR -G Ninja \
16 |       -DCMAKE_BUILD_TYPE=Release \
17 |       -DCMAKE_CXX_STANDARD=17 \
18 |       -DCMAKE_CXX_COMPILER=g++ \
19 |       -DCMAKE_CUDA_COMPILER=nvcc \
20 |       -DCMAKE_INSTALL_PREFIX=$KOKKOS_INSTALL_DIR \
21 |       -DKokkos_CXX_STANDARD=17 \
22 |       -DKokkos_ENABLE_SYCL=OFF \
23 |       -DKokkos_ENABLE_CUDA=ON \
24 |       -DKokkos_ARCH_HSW=ON \
25 |       -DKokkos_ARCH_AMPERE80=ON \
26 |       -DKokkos_ENABLE_HWLOC=ON \
27 |       -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
28 |       -DKokkos_ENABLE_TESTS=OFF \
29 |       -DKokkos_HWLOC_DIR=$HWLOC_DIR
30 | 
31 | ninja install
32 | 
33 | cd ..
34 | 


--------------------------------------------------------------------------------
/setup-script/sample/README.md:
--------------------------------------------------------------------------------
 1 | # oneMKL samples
 2 | 
 3 | The code runs a small benchmarks of your blas implementation using multiplication of square matrices. You can pass the size as an argument of the executable.
 4 | 
 5 | Two versions are provided, one of which is using the USM inferface.
 6 | 
 7 | If the environment is correctly set you should be able to run the sample with:
 8 | 
 9 | ```
10 | mkdir build; 
11 | cd build 
12 | CXX=clang++ cmake .. -DCMAKE_BUILD_TYPE=Release
13 | cmake --build .
14 | ```
15 | 
16 | ### Detail
17 | 
18 | - `sycl_unique<T>` is a unique pointer to a USM allocated memory which wraps a `std::unique_ptr<T>` with a custom deleter and holds the allocated size.
19 | - `fill_rand` fills a `std::vector<T>` or `sycl_unique<T>` with random values.
20 | 
21 | ### Refs
22 | 
23 | - Working example adapted
24 |   from [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-programming-guide/top/api-based-programming/intel-oneapi-math-kernel-library-onemkl/onemkl-code-sample.html)


--------------------------------------------------------------------------------
/setup-script/sample/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.9)
 2 | project(Sycl_Tests LANGUAGES CXX)
 3 | set(CMAKE_CXX_STANDARD 20)
 4 | 
 5 | set(WARNING_FLAGS "-Wall -Wextra -Wshadow -Wdouble-promotion -Wshadow -Wuninitialized -Wmissing-declarations -Woverloaded-virtual")
 6 | set(DISABLED_WARNINGS "-Wno-c++20-extensions -Wno-unknown-cuda-version -Wno-unused -Wno-unused-parameter")
 7 | 
 8 | set(OPT_FLAGS "-march=native -mtune=native -Ofast -fomit-frame-pointer")
 9 | 
10 | SET(CMAKE_CXX_FLAGS "-fsycl -fsycl-targets=nvptx64-nvidia-cuda -sycl-std=2020 -fsycl-unnamed-lambda")
11 | SET(CMAKE_EXE_LINKER_FLAGS "-lonemkl_blas_cublas -lonemkl")
12 | 
13 | set(CMAKE_CXX_FLAGS_RELEASE "${OPT_FLAGS} ${WARNING_FLAGS} ${DISABLED_WARNINGS}")
14 | set(CMAKE_CXX_FLAGS_DEBUG " ${WARNING_FLAGS}  ${DISABLED_WARNINGS} -g3 -Og")
15 | 
16 | include_directories(include/)
17 | include_directories($ENV{DPCPP_HOME}/deploy/include)
18 | link_directories($ENV{DPCPP_HOME}/deploy/lib)
19 | 
20 | add_executable(mkl_kernel mkl_matmult.cpp)
21 | add_executable(mkl_kernel_usm mkl_matmult_usm.cpp)
22 | 
23 | 


--------------------------------------------------------------------------------
/examples/kokkos/kokkos_build_scripts/kokkos_clang_cuda_build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Install Kokkos w/ sycl-cuda support
 3 | 
 4 | set -x #echo on
 5 | 
 6 | # Set:
 7 | # KOKKOS_INSTALL_DIR=[/your/install/dir]
 8 | # KOKKOS_SOURCE_DIR=[/your/source/dir]
 9 | # HWLOC_DIR=[/your/hwloc/dir]
10 | 
11 | # Configure & build kokkos
12 | mkdir kokkos-build
13 | cd kokkos-build
14 | 
15 | CXXFLAGS="-Xsycl-target-frontend -O3 -fgpu-inline-threshold=100000" \
16 | LDFLAGS="-Xsycl-target-frontend -O3" \
17 | cmake $KOKKOS_SOURCE_DIR -G Ninja \
18 |       -DCMAKE_BUILD_TYPE=Release \
19 |       -DCMAKE_CXX_STANDARD=17 \
20 |       -DCMAKE_CXX_COMPILER=clang++ \
21 |       -DCMAKE_INSTALL_PREFIX=$KOKKOS_INSTALL_DIR \
22 |       -DKokkos_CXX_STANDARD=17 \
23 |       -DKokkos_ENABLE_SYCL=OFF \
24 |       -DKokkos_ENABLE_CUDA=ON \
25 |       -DKokkos_ARCH_HSW=ON \
26 |       -DKokkos_ARCH_AMPERE80=ON \
27 |       -DKokkos_ENABLE_HWLOC=ON \
28 |       -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
29 |       -DKokkos_ENABLE_TESTS=OFF \
30 |       -DKokkos_HWLOC_DIR=$HWLOC_DIR
31 | 
32 | ninja install
33 | 
34 | cd ..
35 | 


--------------------------------------------------------------------------------
/examples/kokkos/kokkos_build_scripts/kokkos_build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Install Kokkos w/ sycl-cuda support
 3 | 
 4 | set -x #echo on
 5 | 
 6 | # Set:
 7 | # KOKKOS_INSTALL_DIR=[/your/install/dir]
 8 | # KOKKOS_SOURCE_DIR=[/your/source/dir]
 9 | # HWLOC_DIR=[/your/hwloc/dir]
10 | 
11 | # Configure & build kokkos
12 | mkdir kokkos-build
13 | cd kokkos-build
14 | 
15 | CXXFLAGS="-Xsycl-target-frontend -O3 -fgpu-inline-threshold=100000 -Wno-unknown-cuda-version -Wno-deprecated-declarations -Wno-linker-warnings -ffast-math" \
16 | LDFLAGS="-Xsycl-target-frontend -O3" \
17 | cmake $KOKKOS_SOURCE_DIR -G Ninja \
18 |       -DCMAKE_BUILD_TYPE=Release \
19 |       -DCMAKE_CXX_STANDARD=17 \
20 |       -DCMAKE_CXX_COMPILER=clang++ \
21 |       -DCMAKE_INSTALL_PREFIX=$KOKKOS_INSTALL_DIR \
22 |       -DKokkos_CXX_STANDARD=17 \
23 |       -DKokkos_ENABLE_SYCL=ON \
24 |       -DKokkos_ARCH_HSW=ON \
25 |       -DKokkos_ARCH_AMPERE80=ON \
26 |       -DKokkos_ENABLE_HWLOC=ON \
27 |       -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
28 |       -DKokkos_ENABLE_TESTS=OFF \
29 |       -DKokkos_HWLOC_DIR=$HWLOC_DIR
30 | 
31 | ninja install
32 | 
33 | cd ..
34 | 


--------------------------------------------------------------------------------
/examples/hashing/include/sycl_hash.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  sycl_hash.hpp
20 |  *
21 |  *  Description:
22 |  *    SYCL hashing
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include "internal/config.hpp"
27 | #include "internal/sync_api.hpp"
28 | #include "internal/async_api.hpp"
29 | 


--------------------------------------------------------------------------------
/examples/hashing/src/tools/queue_tester.cpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  queue_tester.cpp
20 |  *
21 |  *  Description:
22 |  *    Queue tester
23 |  **************************************************************************/
24 | #include <sycl/sycl.hpp>
25 | #include <tools/sycl_queue_helpers.hpp>
26 | 
27 | void queue_tester(sycl::queue &q) {
28 |     q.submit([](sycl::handler &cgh) {
29 |         cgh.single_task<class queue_kernel_tester>([]() {});
30 |     }).wait_and_throw();
31 | }


--------------------------------------------------------------------------------
/examples/kokkos/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #/***************************************************************************
 2 | # *
 3 | # *  Copyright (C) Codeplay Software Ltd.
 4 | # *
 5 | # *  Licensed under the Apache License, Version 2.0 (the "License");
 6 | # *  you may not use this file except in compliance with the License.
 7 | # *  You may obtain a copy of the License at
 8 | # *
 9 | # *      http://www.apache.org/licenses/LICENSE-2.0
10 | # *
11 | # *  Unless required by applicable law or agreed to in writing, software
12 | # *  distributed under the License is distributed on an "AS IS" BASIS,
13 | # *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # *  See the License for the specific language governing permissions and
15 | # *  limitations under the License.
16 | # *
17 | # *  Codeplay's SYCL-For-CUDA-Examples
18 | # *
19 | # *  CMakeLists.txt
20 | # *
21 | # *  Description:
22 | # *    CMake for kokkos example
23 | # **************************************************************************/
24 | cmake_minimum_required (VERSION 3.10)
25 | cmake_policy(SET CMP0074 NEW)
26 | project (Kokkos_Test_Case)
27 | 
28 | set(Kokkos_DIR "$ENV{Kokkos_ROOT}" CACHE STRING "Kokkos root directory")
29 | find_package(Kokkos REQUIRED)
30 | 
31 | add_executable(test_case test_case.cpp)
32 | target_link_libraries(test_case Kokkos::kokkos)
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/md2.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  md2.hpp
20 |  *
21 |  *  Description:
22 |  *    MD2 hash function
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <internal/config.hpp>
27 | #include <tools/usm_smart_ptr.hpp>
28 | 
29 | constexpr dword MD2_BLOCK_SIZE = 16;
30 | 
31 | namespace hash::internal {
32 |     class md2_kernel;
33 | 
34 |     using namespace usm_smart_ptr;
35 | 
36 | 
37 |     sycl::event launch_md2_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch);
38 | 
39 | }


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/sha1.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  sha1.hpp
20 |  *
21 |  *  Description:
22 |  *    SHA1 hash function
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <internal/config.hpp>
27 | #include <tools/usm_smart_ptr.hpp>
28 | 
29 | 
30 | constexpr dword SHA1_BLOCK_SIZE = 20;
31 | 
32 | namespace hash::internal {
33 |     class sha1_kernel;
34 | 
35 |     using namespace usm_smart_ptr;
36 | 
37 |     sycl::event launch_sha1_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch);
38 | 
39 | }


--------------------------------------------------------------------------------
/examples/distrib_batch_gemm/README.md:
--------------------------------------------------------------------------------
 1 | ## Distributed Batch GEMM example
 2 | 
 3 | This example shows how to integrate MPI calls within the SYCL DAG using Host Tasks to distribute Batch GEMM accross MPI process.
 4 | 
 5 | 
 6 | ## Requisites
 7 | 
 8 | The Makefile provided assumes the MPICXX compiler points to the DPCPP compiler with CUDA support.
 9 | That requires the MPI implementation to be built, or use, the DPCPP compiler.
10 | The MPI implementation needs to have been built with CUDA support (typically called "CUDA-aware" MPI")
11 | 
12 | The example uses [SYCL-BLAS](https://github.com/codeplaysoftware/sycl-blas) library to call the GEMM routine.
13 | The SYCL-BLAS Library should be [compiled by DPCPP compiler](https://github.com/codeplaysoftware/sycl-blas#compile-with-dpc) to target CUDA backend. The following command line is used to build SYCL-BLAS library:
14 | 
15 | ```bash
16 | cmake -GNinja ../ -DTARGET=NVIDIA_GPU -DSYCL_COMPILER=dpcpp -DBLAS_DATA_TYPES=float -DGEMM_VECTORIZATION_SUPPORT=ON -DBLAS_ENABLE_TESTING=OFF -DENABLE_EXPRESSION_TESTS=OFF -DBLAS_ENABLE_BENCHMARK=OFF -DBLAS_VERIFY_BENCHMARK=OFF -DBLAS_BUILD_SAMPLES=OFF
17 | ```
18 | 
19 | ## Compilation
20 | 
21 | If MPICXX points to DPC++ with CUDA support and its on the path, "make" should build the program.
22 | 
23 | ## Execution
24 | 
25 | The makefile contains a target to execute the problem in two processes:
26 | 
27 | ```sh
28 | make run
29 | ```
30 | 
31 | The target assumes mpirun is on the PATH
32 | 


--------------------------------------------------------------------------------
/examples/MPI/Makefile:
--------------------------------------------------------------------------------
 1 | #/***************************************************************************
 2 | # *
 3 | # *  Copyright (C) Codeplay Software Ltd.
 4 | # *
 5 | # *  Licensed under the Apache License, Version 2.0 (the "License");
 6 | # *  you may not use this file except in compliance with the License.
 7 | # *  You may obtain a copy of the License at
 8 | # *
 9 | # *      http://www.apache.org/licenses/LICENSE-2.0
10 | # *
11 | # *  Unless required by applicable law or agreed to in writing, software
12 | # *  distributed under the License is distributed on an "AS IS" BASIS,
13 | # *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # *  See the License for the specific language governing permissions and
15 | # *  limitations under the License.
16 | # *
17 | # *  Codeplay's SYCL-For-CUDA-Examples
18 | # *
19 | # *  Makefile
20 | # *
21 | # *  Description:
22 | # *    Makefile for MPI example
23 | # **************************************************************************/
24 | MPICOMP=mpicxx -I${SYCL_ROOT_DIR}/include/sycl/ -O1 -fsycl-unnamed-lambda -std=c++17  -fsycl -fsycl-targets=nvptx64-nvidia-cuda -Wno-linker-warnings
25 | 
26 | sycl-mpi-sample: SYCL-MPI-Sample.o
27 | 	$(MPICOMP) SYCL-MPI-Sample.o -o sycl-mpi-sample
28 | 
29 | SYCL-MPI-Sample.o: SYCL-MPI-Sample.cpp
30 | 	$(MPICOMP) -c SYCL-MPI-Sample.cpp
31 | 
32 | run: sycl-mpi-sample
33 | 	mpirun -np 2 ./sycl-mpi-sample
34 | 
35 | .PHONY: clean
36 | 
37 | clean:
38 | 	rm -f sycl-mpi-sample *.o
39 | 


--------------------------------------------------------------------------------
/examples/fortran_interface/README.md:
--------------------------------------------------------------------------------
 1 | CUDA Fortran and SYCL integration
 2 | ======================================
 3 | 
 4 | This directory shows an example of how to call a SYCL function
 5 | from a CUDA fortran code.
 6 | 
 7 | The SYCL routine is called using the Fortran ISO bindings like
 8 | any other C function.
 9 | 
10 | ```fortran
11 | interface saxpy_sycl
12 |   subroutine saxpy_call(x, y, a, N) &
13 |     bind(C,name='saxpy_sycl_cuda_wrapper')
14 |     implicit none
15 |     real :: x(:), y(:)
16 |     real, value :: a
17 |     integer, value :: N
18 |   end subroutine
19 | end interface
20 | ```
21 | 
22 | The SYCL code implemented in the C++ version of the code works as usual with one minor modification:
23 | Uses the CUDA Primary context to enable inter-operating with the CUDA Fortran code, ensuring the same resources are shared.
24 | 
25 | The following snipped highligts the construction of a SYCL context associated with the Primary context.
26 | To ensure synchronization with the CUDA Fortran code, the queue will also be mapped to the default CUDA
27 | stream, instead of creating a new stream.
28 | It is possible to create a normal stream,  just by using the default SYCL queue constructor on the CUDA
29 | context. Said queue will run concurrently (i.e. won't sync) to the main queue.
30 | 
31 | ```cpp
32 | sycl::context c{sycl::property::context::cuda::use_primary_context()};
33 | sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()};
34 | ```
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/setup-script/sample/include/chrono.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <iostream>
 5 | #include <chrono>
 6 | 
 7 | /**
 8 |  * Small Chrono class that prints the time spent in a scope.
 9 |  */
10 | class Chrono {
11 | public:
12 |     inline Chrono();
13 | 
14 |     inline explicit Chrono(std::string &&caller_name);
15 | 
16 |     inline Chrono(const Chrono &) = delete;
17 | 
18 |     Chrono &operator=(const Chrono &) = delete;
19 | 
20 |     inline double stop();
21 | 
22 |     inline ~Chrono();
23 | 
24 | private:
25 |     std::string caller;
26 | 
27 |     const std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::duration<long int, std::ratio<1, 1000000000>>> start;
28 | };
29 | 
30 | inline Chrono::Chrono()
31 |         : start(std::chrono::high_resolution_clock::now()) {
32 | }
33 | 
34 | inline Chrono::~Chrono() {
35 |     double elapsed_seconds = Chrono::stop();
36 |     if (!caller.empty()) {
37 |         std::cerr << "time in " << caller << " : " << elapsed_seconds << "s" << std::endl;
38 |     } else {
39 |         std::cerr << "time " << elapsed_seconds << "s" << std::endl;
40 |     }
41 | }
42 | 
43 | inline Chrono::Chrono(std::string &&caller_name)
44 |         : Chrono() {
45 |     caller = caller_name;
46 | }
47 | 
48 | inline double Chrono::stop() {
49 |     auto end = std::chrono::high_resolution_clock::now();
50 |     auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
51 |     return static_cast<double>(duration.count()) / 1000000.0;
52 | }
53 | 


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/md5.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  md5.hpp
20 |  *
21 |  *  Description:
22 |  *    MD5 hash function
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <internal/config.hpp>
27 | #include <tools/usm_smart_ptr.hpp>
28 | 
29 | /****************************** MACROS ******************************/
30 | constexpr dword MD5_BLOCK_SIZE = 16;            // MD5 outputs a 16 byte digest
31 | 
32 | namespace hash::internal {
33 |     class md5_kernel;
34 | 
35 |     using namespace usm_smart_ptr;
36 | 
37 |     sycl::event launch_md5_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch);
38 | 
39 | }


--------------------------------------------------------------------------------
/examples/sgemm_interop/README.md:
--------------------------------------------------------------------------------
 1 | SYCL interop with CUDA library
 2 | -------------------------------
 3 | 
 4 | The example shows how to interop with CUBLAS from a SYCL for CUDA application.
 5 | The example uses Codeplay's extension *interop_task* to call the **SGEMM** 
 6 | routine in CUBLAS. Parameters are extracted using the interop handler conversion.
 7 | 
 8 | Pre-requisites
 9 | ---------------
10 | 
11 | These instructions assume that example [docker image](https://hub.docker.com/r/ruyman/dpcpp_cuda_examples/dockerfile) is being used. This image 
12 | simplifies accessing these examples as the environment is set up correctly.
13 | For details on how to get started with the example docker image, refer to the 
14 | root README file.
15 | 
16 | Building the example
17 | =====================
18 | 
19 | ``` sh
20 | $ bash build.sh
21 | ```
22 | 
23 | or (SYCL version only):
24 | 
25 | ```
26 | ${SYCL_ROOT_DIR}/bin/clang++ -DCUDA_NO_HALF -isystem /usr/local/cuda/include -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -std=gnu++17 -L/usr/local/cuda/lib64  -lcublas -lcudart -lcuda -o sycl_sgemm sycl_sgemm.cpp
27 | ```
28 | Example
29 | =========
30 | 
31 | Two source codes are provided. `sgemm.cu` is the original CUDA code calling
32 | CUBLAS library to perform the matrix multiplication. `sycl_sgemm.cpp` is the 
33 | SYCL variant that calls CUBLAS underneath.
34 | 
35 | Both implementations perform the multiplication of square matrices A and B, 
36 | where A is a matrix full of ones, and B is an identity matrix.
37 | The expected output on C is a matrix full of ones.
38 | 


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/sha256.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  sha256.hpp
20 |  *
21 |  *  Description:
22 |  *    SHA256 hash function
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <internal/config.hpp>
27 | #include <tools/usm_smart_ptr.hpp>
28 | 
29 | /****************************** MACROS ******************************/
30 | constexpr dword SHA256_BLOCK_SIZE = 32;            // SHA256 outputs a 32 byte digest
31 | 
32 | namespace hash::internal {
33 |     class sha256_kernel;
34 | 
35 |     using namespace usm_smart_ptr;
36 | 
37 | 
38 |     sycl::event launch_sha256_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch);
39 | 
40 | 
41 | }


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/keccak.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  keccak.hpp
20 |  *
21 |  *  Description:
22 |  *    Keccak hash function
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <internal/config.hpp>
27 | #include <tools/usm_smart_ptr.hpp>
28 | 
29 | 
30 | constexpr dword KECCAK_ROUND = 24;
31 | constexpr dword KECCAK_STATE_SIZE = 25;
32 | constexpr dword KECCAK_Q_SIZE = 192;
33 | 
34 | namespace hash::internal {
35 | 
36 |     template<dword n_outbit>
37 |     class keccak_kernel;
38 | 
39 |     using namespace usm_smart_ptr;
40 | 
41 | 
42 |     sycl::event
43 |     launch_keccak_kernel(bool is_sha3, sycl::queue &item, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch, dword n_outbit);
44 | 
45 | 
46 | }


--------------------------------------------------------------------------------
/examples/fortran_interface/Makefile:
--------------------------------------------------------------------------------
 1 | #/***************************************************************************
 2 | # *
 3 | # *  Copyright (C) Codeplay Software Ltd.
 4 | # *
 5 | # *  Licensed under the Apache License, Version 2.0 (the "License");
 6 | # *  you may not use this file except in compliance with the License.
 7 | # *  You may obtain a copy of the License at
 8 | # *
 9 | # *      http://www.apache.org/licenses/LICENSE-2.0
10 | # *
11 | # *  Unless required by applicable law or agreed to in writing, software
12 | # *  distributed under the License is distributed on an "AS IS" BASIS,
13 | # *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # *  See the License for the specific language governing permissions and
15 | # *  limitations under the License.
16 | # *
17 | # *  Codeplay's SYCL-For-CUDA-Examples
18 | # *
19 | # *  Makefile
20 | # *
21 | # *  Description:
22 | # *    Makefile for fortran interface
23 | # **************************************************************************/
24 | CXX=clang++
25 | FORT=nvfortran
26 | FFLAGS=-c++libs -cuda  
27 | CXXFLAGS=-fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -Wno-linker-warnings
28 | DPCPP_PATH=/home/ruyman/sycl_workspace/build_dpcpp/install
29 | 
30 | default: final.exe
31 | 
32 | saxpy_sycl.so: saxpy.cpp
33 | 	$(CXX) $(CXXFLAGS) -fPIC --shared saxpy.cpp -o saxpy_sycl.so
34 | 
35 | saxpy_cuf.o: saxpy.cuf
36 | 	$(FORT) $(FFLAGS) -c saxpy.cuf -o saxpy_cuf.o
37 |        
38 | final.exe: saxpy_cuf.o saxpy_sycl.so
39 | 	$(FORT) $(FFLAGS) -o final.exe saxpy_cuf.o saxpy_sycl.so -L${DPCPP_PATH}/lib/ -lsycl 
40 | 
41 | .PHONY: clean
42 | 
43 | clean: 
44 | 	rm -f saxpy_cuf.o saxpy_sycl.so final.exe mathops.mod
45 | 
46 | 


--------------------------------------------------------------------------------
/examples/hashing/include/internal/config.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  config.hpp
20 |  *
21 |  *  Description:
22 |  *    Hashing function configuration
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <cstdint>
27 | #include <vector>
28 | #include <sycl/sycl.hpp>
29 | 
30 | /**
31 |  * To update on every abi update so two you won't be able to link the new declarations against an older library.
32 |  */
33 | #define abi_rev v_1
34 | 
35 | using byte = uint8_t;
36 | using dword = uint32_t;
37 | using qword = uint64_t;
38 | 
39 | //#define IMPLICIT_MEMORY_COPY 1 // ONLY ON LINUX AND MACOS
40 | 
41 | namespace hash {
42 |     /**
43 |      * Defines the various types of hashes supported.
44 |      */
45 |     enum class method {
46 |         sha256,
47 |         keccak,
48 |         blake2b,
49 |         sha1,
50 |         sha3,
51 |         md5,
52 |         md2
53 |     };
54 | 
55 | 
56 | }


--------------------------------------------------------------------------------
/examples/vector_addition/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
 2 | # Don't complain about empty CMAKE_CUDA_ARCHITECTURES
 3 | cmake_policy(SET CMP0104 OLD)
 4 | 
 5 | project(cmake_and_cuda LANGUAGES CXX CUDA)
 6 | 
 7 | include(CTest)
 8 | 
 9 | # SYCL installation
10 | if (NOT SYCL_ROOT) 
11 |   message(FATAL_ERROR "No SYCL installation detected")
12 | endif(NOT SYCL_ROOT)
13 | 
14 | set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
15 | set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
16 | set(SYCL_FLAGS "-fsycl"
17 |       "-fsycl-targets=nvptx64-nvidia-cuda"
18 |       "-fsycl-unnamed-lambda"
19 |       "-Wno-linker-warnings")
20 | 
21 | # Build the CUDA code
22 | add_executable(vector_addition vector_addition.cu)
23 | target_compile_features(vector_addition PUBLIC cxx_std_11)
24 | set_target_properties(vector_addition PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
25 | set_property(TARGET vector_addition PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
26 | 
27 | # Build the SYCL code
28 | add_executable (sycl_vector_addition vector_addition.cpp)
29 | target_compile_features(sycl_vector_addition PUBLIC cxx_std_17)
30 | target_compile_options(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
31 | target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_FLAGS})
32 | target_include_directories(sycl_vector_addition PUBLIC ${SYCL_INCLUDE_DIR})
33 | target_link_libraries(sycl_vector_addition PUBLIC ${SYCL_LIB})
34 | 
35 | 
36 | # Build the SYCL (USM) code
37 | add_executable (sycl_vector_addition_usm vector_addition_usm.cpp)
38 | target_compile_features(sycl_vector_addition_usm PUBLIC cxx_std_17)
39 | target_compile_options(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS})
40 | target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_FLAGS})
41 | target_include_directories(sycl_vector_addition_usm PUBLIC ${SYCL_INCLUDE_DIR})
42 | target_link_libraries(sycl_vector_addition_usm PUBLIC ${SYCL_LIB})
43 | 
44 | 


--------------------------------------------------------------------------------
/examples/fortran_interface/saxpy.cpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  saxpy.cpp
20 |  *
21 |  *  Description:
22 |  *    SAXPY in SYCL
23 |  **************************************************************************/
24 | #include <iostream>
25 | #include <CL/sycl.hpp>
26 | 
27 | extern "C" {
28 |    void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N);
29 | };
30 | 
31 | 
32 | void saxpy_sycl_cuda_wrapper (float* x, float* y, float a, int N) {
33 |    sycl::context c{sycl::property::context::cuda::use_primary_context()};
34 |    sycl::queue q{c, c.get_devices()[0], sycl::property::queue::cuda::use_default_stream()};
35 |    {
36 |       sycl::buffer bX {x, sycl::range<1>(N)};
37 |       sycl::buffer bY {y, sycl::range<1>(N)};
38 | 
39 |       q.submit([&](sycl::handler& h) {
40 |          auto aX = bX.get_access<sycl::access::mode::read_write>(h);
41 |          auto aY = bY.get_access<sycl::access::mode::read_write>(h);
42 |          h.parallel_for<class saxpy_kernel>(sycl::range<1>(N), [=](sycl::id<1> id) {
43 |             if (id[0] < N) 
44 |                aY[id] = aX[id] * a + aY[id];
45 |          });
46 |       });
47 | 
48 |       q.wait_and_throw();
49 |    }
50 |    return;
51 | }
52 | 


--------------------------------------------------------------------------------
/.github/workflows/scorecard.yml:
--------------------------------------------------------------------------------
 1 | # Scorecards' GitHub action
 2 | 
 3 | name: Scorecard supply-chain security
 4 | on:
 5 |   # For Branch-Protection check. Only the default branch is supported. See
 6 |   # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
 7 |   branch_protection_rule:
 8 |   schedule:
 9 |     - cron: '7 12 * * 4'
10 |   push:
11 |     branches: [ "master" ]
12 | 
13 | # Declare default permissions as read only.
14 | permissions: read-all
15 | 
16 | jobs:
17 |   analysis:
18 |     name: Scorecard analysis
19 |     runs-on: ubuntu-latest
20 |     permissions:
21 |       # Needed to upload the results to code-scanning dashboard.
22 |       security-events: write
23 |       # Needed to publish results and get a badge (see publish_results below).
24 |       id-token: write
25 | 
26 |     steps:
27 |       - name: "Checkout code"
28 |         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
29 |         with:
30 |           persist-credentials: false
31 | 
32 |       - name: "Run analysis"
33 |         uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1
34 |         with:
35 |           results_file: results.sarif
36 |           results_format: sarif
37 |           publish_results: true
38 | 
39 |       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
40 |       # format to the repository Actions tab.
41 |       - name: "Upload artifact"
42 |         uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1
43 |         with:
44 |           name: SARIF file
45 |           path: results.sarif
46 |           retention-days: 5
47 | 
48 |       # Upload the results to GitHub's code scanning dashboard (optional).
49 |       # Commenting out will disable upload of results to your repo's Code Scanning dashboard
50 |       - name: "Upload to code-scanning"
51 |         uses: github/codeql-action/upload-sarif@b56ba49b26e50535fa1e7f7db0f4f7b4bf65d80d # v3.28.10
52 |         with:
53 |           sarif_file: results.sarif
54 | 


--------------------------------------------------------------------------------
/examples/distrib_batch_gemm/main.cpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  main.cpp
20 |  *
21 |  *  Description:
22 |  *    Demonstrates simple vector addition
23 |  **************************************************************************/
24 | #include <array>
25 | #include <iostream>
26 | 
27 | template <typename T, size_t N>
28 | void simple_vadd_sycl(const std::array<T, N>& VA, const std::array<T, N>& VB,
29 |                  std::array<T, N>& VC);
30 | 
31 | template <typename T, size_t N>
32 | void simple_vadd_cuda(const std::array<T, N>& VA, const std::array<T, N>& VB,
33 |                  std::array<T, N>& VC);
34 | 
35 | int main() {
36 |   const size_t array_size = 4;
37 |   std::array<int, array_size> A = {{1, 2, 3, 4}},
38 |                                            B = {{1, 2, 3, 4}}, C;
39 |   std::array<float, array_size> D = {{1.f, 2.f, 3.f, 4.f}},
40 |                                              E = {{1.f, 2.f, 3.f, 4.f}}, F;
41 |   simple_vadd_sycl(A, B, C);
42 |   simple_vadd_cuda(D, E, F);
43 |   for (unsigned int i = 0; i < array_size; i++) {
44 |     if (C[i] != A[i] + B[i]) {
45 |       std::cout << "The results are incorrect (element " << i << " is " << C[i]
46 |                 << "!\n";
47 |       return 1;
48 |     }
49 |     if (F[i] != D[i] + E[i]) {
50 |       std::cout << "The results are incorrect (element " << i << " is " << F[i]
51 |                 << "!\n";
52 |       return 1;
53 |     }
54 |   }
55 |   std::cout << "The results are correct!\n";
56 |   return 0;
57 | }
58 | 


--------------------------------------------------------------------------------
/examples/distrib_batch_gemm/Makefile:
--------------------------------------------------------------------------------
 1 | #/***************************************************************************
 2 | # *
 3 | # *  Copyright (C) Codeplay Software Ltd.
 4 | # *
 5 | # *  Licensed under the Apache License, Version 2.0 (the "License");
 6 | # *  you may not use this file except in compliance with the License.
 7 | # *  You may obtain a copy of the License at
 8 | # *
 9 | # *      http://www.apache.org/licenses/LICENSE-2.0
10 | # *
11 | # *  Unless required by applicable law or agreed to in writing, software
12 | # *  distributed under the License is distributed on an "AS IS" BASIS,
13 | # *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # *  See the License for the specific language governing permissions and
15 | # *  limitations under the License.
16 | # *
17 | # *  Codeplay's SYCL-For-CUDA-Examples
18 | # *
19 | # *  Makefile
20 | # *
21 | # *  Description:
22 | # *    Makefile for distributed batch gemm
23 | # **************************************************************************/
24 | 
25 | SYCLCXX=clang++
26 | SYCLFLAGS=-O2 -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda -Wno-linker-warnings
27 | OBJS=main.o vadd_sycl.o vadd_cuda.o
28 | CUFLAGS=--cuda-gpu-arch=sm_80 -std=c++11 
29 | 
30 | 
31 | %.o: %.cpp
32 | 	${SYCLCXX} ${SYCLFLAGS} -c -o $@ $<
33 | 
34 | %.o: %.cu
35 | 	${SYCLCXX} ${CUFLAGS} -c -o $@ $<
36 | 
37 | main.exe: ${OBJS}
38 | 	${SYCLCXX} ${SYCLFLAGS} ${CUFLAGS} ${OBJS}  -L/usr/local/cuda/lib64 -lcudart_static -ldl -lrt -pthread -o $@
39 | 
40 | clean: 
41 | 	rm -f ${OBJS}
42 | 
43 | MPICOMP = mpicxx -I$(HOME)/sycl_workspace/build_dpcpp/install/include/sycl/ -I$(HOME)/sycl-blas/include -I$(HOME)/sycl-blas/external/computecpp-sdk/include/ -L$(HOME)/sycl-blas/build -O3 -fsycl-unnamed-lambda -std=c++17  -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -lsycl_blas
44 | 
45 | distributed-batch-gemm: distributed-batch-gemm.o
46 | 	$(MPICOMP) distributed-batch-gemm.o -o distributed-batch-gemm
47 | 
48 | distributed-batch-gemm.o: distributed-batch-gemm.cpp
49 | 	$(MPICOMP) -c distributed-batch-gemm.cpp
50 | 
51 | run: distributed-batch-gemm
52 | 	LD_LIBRARY_PATH=~/sycl_workspace/build_dpcpp/install/lib:$(HOME)/sycl-blas/build mpirun -np 2 --mca pml ucx -mca btl ^uct -x UCX_NET_DEVICES=mlx5_0:1 ./distributed-batch-gemm
53 | 
54 | .PHONY: clean
55 | 
56 | 
57 | 


--------------------------------------------------------------------------------
/examples/hashing/cmake/FindSYCL.cmake:
--------------------------------------------------------------------------------
 1 | set(A_SYCL_FOUND false)
 2 | 
 3 | find_package(hipSYCL CONFIG)
 4 | 
 5 | #[     {          "name": "My Compiler Kit",          "compilers":          {             "C": "/home/michel/sycl_workspace/deploy/bin/clang-13",             "CXX": "/home/michel/sycl_workspace/deploy/bin/clang++" },"environmentVariables":{"LD_PRELOAD":"/opt/intel/opencl/libOpenCL.so.1"}     } ]
 6 | 
 7 | 
 8 | if (hipSYCL_FOUND)
 9 |     set(A_SYCL_FOUND true)
10 |     if (NOT CMAKE_BUILD_TYPE)
11 |         set(CMAKE_BUILD_TYPE Release)
12 |     endif ()
13 | 
14 |     cmake_policy(SET CMP0005 NEW)
15 |     add_definitions(-DHIPSYCL_DEBUG_LEVEL=0)
16 | 
17 |     if (NOT HIPSYCL_DEBUG_LEVEL)
18 |         if (CMAKE_BUILD_TYPE MATCHES "Debug")
19 |             set(HIPSYCL_DEBUG_LEVEL 3 CACHE STRING
20 |                     "Choose the debug level, options are: 0 (no debug), 1 (print errors), 2 (also print warnings), 3 (also print general information)"
21 |                     FORCE)
22 |         else ()
23 |             set(HIPSYCL_DEBUG_LEVEL 2 CACHE STRING
24 |                     "Choose the debug level, options are: 0 (no debug), 1 (print errors), 2 (also print warnings), 3 (also print general information)"
25 |                     FORCE)
26 |         endif ()
27 |     endif ()
28 | endif ()
29 | 
30 | 
31 | if (ComputeCpp_DIR)
32 |     include(cmake/Modules/FindComputeCpp.cmake)
33 | 
34 |     if (ComputeCpp_ROOT_DIR)
35 |         set(A_SYCL_FOUND true)
36 |     endif ()
37 |     add_compile_definitions(USING_COMPUTECPP)
38 |     message(STATUS " Using ComputeCpp CMake")
39 |     message(STATUS " Path to ComputeCpp implementation: ${COMPUTECPP_PACKAGE_ROOT_DIR} ")
40 |     #set(CMAKE_CXX_STANDARD 11)
41 |     include(FindOpenCL)
42 | endif ()
43 | 
44 | 
45 | if (TRISYCL_INCLUDE_DIR AND NOT A_SYCL_FOUND)
46 |     set(A_SYCL_FOUND true)
47 |     message(STATUS " Using triSYCL CMake")
48 |     include(FindTriSYCL)
49 | endif ()
50 | 
51 | # We expect the DPCPP compiler to have used
52 | if (NOT A_SYCL_FOUND)
53 |     function(add_sycl_to_target arg1 arg2)
54 |         target_compile_options(${arg2} PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${DPCPP_FLAGS} -sycl-std=2020 -std=c++20 -fsycl-unnamed-lambda>)
55 |         target_link_options(${arg2} PRIVATE ${DPCPP_FLAGS} -sycl-std=2020 -std=c++20 -fsycl-unnamed-lambda)
56 |     endfunction()
57 | 
58 | endif ()
59 | 


--------------------------------------------------------------------------------
/examples/fortran_interface/saxpy.cuf:
--------------------------------------------------------------------------------
 1 | !**************************************************************************
 2 | !
 3 | !  Copyright (C) Codeplay Software Ltd.
 4 | !
 5 | !  Licensed under the Apache License, Version 2.0 (the "License");
 6 | !  you may not use this file except in compliance with the License.
 7 | !  You may obtain a copy of the License at
 8 | !
 9 | !      http://www.apache.org/licenses/LICENSE-2.0
10 | !
11 | !  Unless required by applicable law or agreed to in writing, software
12 | !  distributed under the License is distributed on an "AS IS" BASIS,
13 | !  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | !  See the License for the specific language governing permissions and
15 | !  limitations under the License.
16 | !
17 | !  Codeplay's SYCL-For-CUDA-Examples
18 | !
19 | !  saxpy.cuf
20 | !
21 | !  Description:
22 | !    CUDA Fortran code calling SAXPY from SYCL
23 | !*************************************************************************/
24 | 
25 | module mathOps
26 | contains
27 |   attributes(global) subroutine saxpy(x, y, a)
28 |     implicit none
29 |     real :: x(:), y(:)
30 |     real, value :: a
31 |     integer :: i, n
32 |     n = size(x)
33 |     i = blockDim%x * (blockIdx%x - 1) + threadIdx%x
34 |     if (i <= n) y(i) = y(i) + a*x(i)
35 |   end subroutine saxpy 
36 | end module mathOps
37 | 
38 | program testSaxpy
39 |   use mathOps
40 |   use cudafor
41 | 
42 |   implicit none
43 | 
44 | interface saxpy_sycl
45 |   subroutine saxpy_call(x, y, a, N) &
46 |     bind(C,name='saxpy_sycl_cuda_wrapper')
47 |     implicit none
48 |     real :: x(:), y(:)
49 |     real, value :: a
50 |     integer, value :: N
51 |   end subroutine
52 | end interface
53 | 
54 | 
55 |   integer, parameter :: N = 1024
56 |   real :: x(N), y(N), a
57 |   real, device :: x_d(N), y_d(N)
58 |   type(dim3) :: grid, tBlock
59 | 
60 |   tBlock = dim3(256,1,1)
61 |   grid = dim3(ceiling(real(N)/tBlock%x),1,1)
62 | 
63 |   write (*,*) 'CUDA version: '
64 |   x = 1.0; y = 2.0; a = 2.0
65 |   x_d = x
66 |   y_d = y
67 |   call saxpy<<<grid, tBlock>>>(x_d, y_d, a)
68 |   y = y_d
69 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
70 |   write(*,*) 'N ', N
71 | 
72 |   write (*,*) 'SYCL version: '
73 |   y = 2.0;
74 |   call saxpy_call(x, y, a, N);
75 |   write(*,*) 'Max error: ', maxval(abs(y-4.0))
76 | 
77 | end program testSaxpy
78 | 


--------------------------------------------------------------------------------
/examples/kokkos/README.md:
--------------------------------------------------------------------------------
 1 | Simple Test Case for Kokkos
 2 | ----
 3 | 
 4 | This is a simple standalone test case taken from the Kokkos repository & packaged up here for use with SYCL.
 5 | It's doing a vector-matrix-vector product. It's an identity matrix with two vectors of 1s, so the expected answer
 6 | is just equal to the problem size.
 7 | 
 8 | Building the test case
 9 | -----
10 | 
11 | test_case.cpp contains a simple kernel which has been copied straight from the Kokkos Tutorials (Exercises/02/Solution).
12 | 
13 | Build it with build.sh, after setting the environment variable:
14 | ```
15 | Kokkos_ROOT="[your/kokkos/installation]/lib/cmake/Kokkos"
16 | ```
17 | 
18 | Running the test case
19 | ----
20 | 
21 | Just launch it! There are optional flags:
22 | 
23 | -N : number of rows
24 | -M : number of columns
25 | -S : total size
26 | -nrepeat : how many times to repeat the test (default 100)
27 | 
28 | Obviously, not all of N, M & S should be set. The test case will sanity check your args anyway.
29 | 
30 | Building Kokkos
31 | ------
32 | 
33 | In case you don't have an existing Kokkos build, there are some build scripts in `./kokkos_build_scripts`.
34 | There are scripts for building Kokkos with SYCL, or CUDA (nvcc or clang).
35 | 
36 | Set the following environment variables:
37 | ```
38 | KOKKOS_INSTALL_DIR=[/your/install/dir]
39 | KOKKOS_SOURCE_DIR=[/your/source/dir]
40 | HWLOC_DIR=[/your/hwloc/dir]
41 | ```
42 | 
43 | HWLOC
44 | ------
45 | 
46 | The [Portable Hardware Locality](https://www.open-mpi.org/projects/hwloc/) (hwloc) package is an optional dependency which enables Kokkos to query the hardware topology of the system on which it is running. If you do not have a HWLOC installation, this option can be removed & Kokkos will be built without HWLOC support.
47 | 
48 | SYCL backend
49 | -------------
50 | 
51 | Kokkos should work with any SYCL backend, though the focus of this examples repo is SYCL-For-CUDA.
52 | Previous work at Codeplay has involved running Kokkos with SYCL on Nvidia hardware with Ampere architecture, hence the flag:
53 | ```
54 |       -DKokkos_ARCH_AMPERE80=ON \
55 | ```
56 | This flag is not strictly necessary, but it enables Ahead of Time (AoT) compilation, which can give a significant performance gain when building large projects built on Kokkos.
57 | You should modify the cmake command for your GPU arch.
58 | 
59 | 
60 | 
61 | 


--------------------------------------------------------------------------------
/setup-script/sample/mkl_matmult_usm.cpp:
--------------------------------------------------------------------------------
 1 | #include <sycl/sycl.hpp>
 2 | #include <oneapi/mkl.hpp>
 3 | 
 4 | #include <chrono.hpp>
 5 | #include <common.hpp>
 6 | #include <usm_smart_ptr.hpp>
 7 | 
 8 | using namespace usm_smart_ptr;
 9 | 
10 | int main(int argc, char *argv[]) {
11 |     using T = float;
12 |     size_t n_laps = 30;
13 |     size_t mat_size = 16384; // Bound by your GPU's memory.
14 | 
15 |     if (argc > 1) {
16 |         mat_size = std::stoul(argv[1], nullptr, 10);
17 |     }
18 |     T alpha = 1, beta = 0; // gemm parameters
19 | 
20 |     sycl::queue my_queue = try_get_queue(cuda_selector{});
21 | 
22 |     std::cout << "Initalizing the matrices..." << std::endl;
23 |     long n = mat_size, m = mat_size, k = mat_size, ldA = mat_size, ldB = mat_size, ldC = mat_size;
24 |     // Initializing USM shared memory in an std::unique_ptr for auto mem management
25 |     auto A = make_unique_ptr<T, alloc::shared>(mat_size * mat_size, my_queue);
26 |     auto B = make_unique_ptr<T, alloc::shared>(mat_size * mat_size, my_queue);
27 |     auto C = make_unique_ptr<T, alloc::device>(mat_size * mat_size, my_queue);
28 |     fill_rand(A.get(), A.count());
29 |     fill_rand(B.get(), B.count());
30 | 
31 |     std::cout << "Running on:" << my_queue.get_device().get_info<sycl::info::device::name>() << std::endl;
32 |     Chrono c("computing + error handling");
33 | 
34 |     try {
35 |         sycl::event e;
36 |         for (size_t i = 0; i < n_laps; i++) {
37 |             std::cout << i << '/' << n_laps << '\n';
38 |             using oneapi::mkl::transpose;
39 |             using oneapi::mkl::blas::column_major::gemm;
40 |             // C <- alpha*OP(A)*OP(B) + beta*C
41 |             e = gemm(my_queue, transpose::nontrans, transpose::nontrans, m, n, k, alpha, A.get(), ldA, B.get(), ldB, beta, C.get(), ldC, {e});
42 |         }
43 |         e.wait_and_throw();
44 |     }
45 |     catch (sycl::exception const &e) {
46 |         std::cout << "Caught synchronous SYCL exception during GEMM: " << e.what() << std::endl;
47 |     }
48 |     catch (std::exception const &e) {
49 |         std::cout << "Caught synchronous STL exception during GEMM: " << e.what() << std::endl;
50 |     }
51 | 
52 |     uint64_t operations_performed = n_laps * mat_size * mat_size * (2 * mat_size - 1);
53 |     std::cout << "Gflops : " << operations_performed / 1000000000 / c.stop() << std::endl;
54 | 
55 |     return 0;
56 | }


--------------------------------------------------------------------------------
/examples/vector_addition/vector_addition.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * SYCL FOR CUDA : Vector Addition Example
 3 |  *
 4 |  * Copyright 2020 Codeplay Software Ltd.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  *     Unless required by applicable law or agreed to in writing, software
13 |  *     distributed under the License is distributed on an "AS IS" BASIS,
14 |  *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  *     See the License for the specific language governing permissions and
16 |  *     limitations under the License.
17 |  *
18 |  * @File: vector_addition.cpp
19 |  */
20 | 
21 | #include <algorithm>
22 | #include <iostream>
23 | #include <vector>
24 | 
25 | #include <CL/sycl.hpp>
26 | 
27 | int main(int argc, char *argv[]) {
28 |   constexpr const size_t N = 100000;
29 |   const sycl::range VecSize{N};
30 | 
31 |   sycl::buffer<double> bufA{VecSize};
32 |   sycl::buffer<double> bufB{VecSize};
33 |   sycl::buffer<double> bufC{VecSize};
34 | 
35 |   // Initialize input data
36 |   {
37 |     sycl::host_accessor h_a{bufA, sycl::write_only};
38 |     sycl::host_accessor h_b{bufB, sycl::write_only};
39 | 
40 |     for (int i = 0; i < N; i++) {
41 |       h_a[i] = sin(i) * sin(i);
42 |       h_b[i] = cos(i) * cos(i);
43 |     }
44 |   }
45 | 
46 |   auto CUDASelector = [](sycl::device const &dev) {
47 |     if (dev.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda) {
48 |       std::cout << " CUDA device found " << std::endl;
49 |       return 1;
50 |     } else {
51 |       return -1;
52 |     }
53 |   };
54 |   sycl::queue myQueue{CUDASelector};
55 | 
56 |   // Command Group creation
57 |   auto cg = [&](sycl::handler &h) {
58 |     const auto read_t = sycl::access::mode::read;
59 |     const auto write_t = sycl::access::mode::write;
60 | 
61 |     auto a = bufA.get_access<read_t>(h);
62 |     auto b = bufB.get_access<read_t>(h);
63 |     auto c = bufC.get_access<write_t>(h);
64 | 
65 |     h.parallel_for(VecSize, [=](sycl::id<1> i) { c[i] = a[i] + b[i]; });
66 |   };
67 | 
68 |   myQueue.submit(cg);
69 | 
70 |   {
71 |     sycl::host_accessor h_c{bufC, sycl::read_only};
72 | 
73 |     double sum = 0.0f;
74 |     for (int i = 0; i < N; i++) {
75 |       sum += h_c[i];
76 |     }
77 |     std::cout << "Sum is : " << sum << std::endl;
78 |   }
79 | 
80 |   return 0;
81 | }
82 | 


--------------------------------------------------------------------------------
/setup-script/sample/mkl_matmult.cpp:
--------------------------------------------------------------------------------
 1 | #include <sycl/sycl.hpp>
 2 | #include <oneapi/mkl.hpp>
 3 | 
 4 | #include <chrono.hpp>
 5 | #include <common.hpp>
 6 | 
 7 | int main(int argc, char *argv[]) {
 8 |     using T = float;
 9 |     size_t n_laps = 30;
10 |     size_t mat_size = 16384;
11 |     if (argc > 1) {
12 |         mat_size = std::stoul(argv[1], nullptr, 10);
13 |     }
14 |     T alpha = 1, beta = 0; // gemm parameters
15 | 
16 |     sycl::queue my_queue = try_get_queue(cuda_selector{});
17 | 
18 |     std::cout << "Initalizing the matrices..." << std::endl;
19 |     size_t n = mat_size, m = mat_size, k = mat_size, ldA = mat_size, ldB = mat_size, ldC = mat_size;
20 |     std::vector<T> A(mat_size * mat_size);
21 |     std::vector<T> B(mat_size * mat_size);
22 |     std::vector<T> C(mat_size * mat_size);
23 |     fill_rand(A);
24 |     fill_rand(B);
25 | 
26 |     // create sycl buffers of matrix data for offloading between device and host
27 |     sycl::buffer<T, 1> A_buffer(A.data(), A.size());
28 |     sycl::buffer<T, 1> B_buffer(B.data(), B.size());
29 |     sycl::buffer<T, 1> C_buffer(C.data(), C.size());
30 | 
31 |     std::cout << "Running on:" << my_queue.get_device().get_info<sycl::info::device::name>() << std::endl;
32 |     Chrono c("computing + error handling");
33 |     for (size_t i = 0; i < n_laps; i++) {
34 |         std::cout << i << '/' << n_laps << '\n';
35 |         // add oneapi::mkl::blas::gemm to execution queue and catch any synchronous exceptions
36 |         try {
37 |             using oneapi::mkl::transpose;
38 |             using oneapi::mkl::blas::column_major::gemm; // row_major not implemented on cublas
39 |             // C <- alpha*OP(A)*OP(B) + beta*C
40 |             gemm(my_queue, transpose::nontrans, transpose::nontrans, m, n, k, alpha, A_buffer, ldA, B_buffer, ldB, beta,
41 |                  C_buffer, ldC);
42 |         }
43 |         catch (sycl::exception const &e) {
44 |             std::cout << "Caught synchronous SYCL exception during GEMM: " << e.what() << std::endl;
45 |         }
46 |         catch (std::exception const &e) {
47 |             std::cout << "Caught synchronous STL exception during GEMM: " << e.what() << std::endl;
48 |         }
49 |         // ensure any asynchronous exceptions caught are handled before proceeding
50 |         my_queue.wait_and_throw();
51 |     }
52 |     uint64_t operations_performed = n_laps * mat_size * mat_size * (2 * mat_size - 1);
53 |     std::cout << "Gflops : " << operations_performed / 1000000000 / c.stop() << std::endl;
54 | 
55 |     return 0;
56 | }


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/blake2b.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  blake2b.hpp
20 |  *
21 |  *  Description:
22 |  *    Blake2 hash function
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <internal/config.hpp>
27 | #include <tools/usm_smart_ptr.hpp>
28 | 
29 | constexpr dword BLAKE2B_ROUNDS = 12;
30 | constexpr dword BLAKE2B_BLOCK_LENGTH = 128;
31 | constexpr dword BLAKE2B_CHAIN_SIZE = 8;
32 | constexpr dword BLAKE2B_CHAIN_LENGTH = (BLAKE2B_CHAIN_SIZE * sizeof(qword));
33 | constexpr dword BLAKE2B_STATE_SIZE = 16;
34 | constexpr dword BLAKE2B_STATE_LENGTH = (BLAKE2B_STATE_SIZE * sizeof(qword));
35 | 
36 | struct blake2b_ctx {
37 |     int64_t digestlen{};
38 |     dword keylen{};
39 |     dword pos{};
40 |     qword t0{};
41 |     qword t1{};
42 |     qword f0{};
43 |     byte buff[BLAKE2B_BLOCK_LENGTH] = {0};
44 |     qword chain[BLAKE2B_CHAIN_SIZE] = {0};
45 |     qword state[BLAKE2B_STATE_SIZE] = {0};
46 | };
47 | 
48 | namespace hash::internal {
49 |     class blake2b_kernel;
50 | 
51 |     using namespace usm_smart_ptr;
52 | 
53 |     usm_shared_ptr<blake2b_ctx, alloc::device> get_blake2b_ctx(sycl::queue &q, const byte *key, dword keylen, dword n_outbit);
54 | 
55 | 
56 |     sycl::event
57 |     launch_blake2b_kernel(sycl::queue &item, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch, dword n_outbit, const byte *key,
58 |                           dword keylen);
59 | 
60 |     sycl::event
61 |     launch_blake2b_kernel(sycl::queue &item, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch, dword n_outbit, const byte *key,
62 |                           dword keylen, device_accessible_ptr<blake2b_ctx>);
63 | 
64 | }


--------------------------------------------------------------------------------
/examples/distrib_batch_gemm/vadd_sycl.cpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  vadd_sycl.cpp
20 |  *
21 |  *  Description:
22 |  *    Vector addition in SYCL
23 |  **************************************************************************/
24 | /* This example is a very small one designed to show how compact SYCL code
25 |  * can be. That said, it includes no error checking and is rather terse. */
26 | #include <CL/sycl.hpp>
27 | 
28 | #include <array>
29 | #include <iostream>
30 | 
31 | constexpr cl::sycl::access::mode sycl_read = cl::sycl::access::mode::read;
32 | constexpr cl::sycl::access::mode sycl_write = cl::sycl::access::mode::write;
33 | 
34 | /* This is the class used to name the kernel for the runtime.
35 |  * This must be done when the kernel is expressed as a lambda. */
36 | template <typename T>
37 | class SimpleVadd;
38 | 
39 | template <typename T, size_t N>
40 | void simple_vadd_sycl(const std::array<T, N>& VA, const std::array<T, N>& VB,
41 |                  std::array<T, N>& VC) {
42 |   cl::sycl::queue deviceQueue;
43 |   cl::sycl::range<1> numOfItems{N};
44 |   cl::sycl::buffer<T, 1> bufferA(VA.data(), numOfItems);
45 |   cl::sycl::buffer<T, 1> bufferB(VB.data(), numOfItems);
46 |   cl::sycl::buffer<T, 1> bufferC(VC.data(), numOfItems);
47 | 
48 |   deviceQueue.submit([&](cl::sycl::handler& cgh) {
49 |     auto accessorA = bufferA.template get_access<sycl_read>(cgh);
50 |     auto accessorB = bufferB.template get_access<sycl_read>(cgh);
51 |     auto accessorC = bufferC.template get_access<sycl_write>(cgh);
52 | 
53 |     auto kern = [=](cl::sycl::id<1> wiID) {
54 |       accessorC[wiID] = accessorA[wiID] + accessorB[wiID];
55 |     };
56 |     cgh.parallel_for<class SimpleVadd<T>>(numOfItems, kern);
57 |   });
58 | }
59 | 
60 | template void simple_vadd_sycl<float, 4>(const std::array<float, 4>& VA, const std::array<float, 4>& VB,
61 |                  std::array<float, 4>& VC);
62 | template void simple_vadd_sycl<int, 4>(const std::array<int, 4>& VA, const std::array<int, 4>& VB,
63 |                  std::array<int, 4>& VC);
64 |                  


--------------------------------------------------------------------------------
/examples/vector_addition/vector_addition_usm.cpp:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * SYCL FOR CUDA : Vector Addition Example
 3 |  *
 4 |  * Copyright 2020 Codeplay Software Ltd.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  *     Unless required by applicable law or agreed to in writing, software
13 |  *     distributed under the License is distributed on an "AS IS" BASIS,
14 |  *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  *     See the License for the specific language governing permissions and
16 |  *     limitations under the License.
17 |  *
18 |  * @File: vector_addition.cpp
19 |  */
20 | 
21 | #include <algorithm>
22 | #include <iostream>
23 | #include <vector>
24 | 
25 | #include <CL/sycl.hpp>
26 | 
27 | int main(int argc, char *argv[]) {
28 |   constexpr const size_t n = 100000;
29 | 
30 |   // Create a sycl queue with our CUDASelector
31 |   auto CUDASelector = [](sycl::device const &dev) {
32 |     if (dev.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda) {
33 |       std::cout << " CUDA device found " << std::endl;
34 |       return 1;
35 |     } else {
36 |       return -1;
37 |     }
38 |   };
39 |   sycl::queue myQueue{CUDASelector};
40 | 
41 |   // Host input vectors
42 |   double *h_a;
43 |   double *h_b;
44 |   // Host output vector
45 |   double *h_c;
46 | 
47 |   // Device input vectors
48 |   double *d_a;
49 |   double *d_b;
50 |   // Device output vector
51 |   double *d_c;
52 | 
53 |   // Size, in bytes, of each vector
54 |   size_t bytes = n * sizeof(double);
55 | 
56 |   // Allocate memory for each vector on host
57 |   h_a = (double *)malloc(bytes);
58 |   h_b = (double *)malloc(bytes);
59 |   h_c = (double *)malloc(bytes);
60 | 
61 |   // Allocate memory for each vector on GPU
62 |   d_a = sycl::malloc_device<double>(n, myQueue);
63 |   d_b = sycl::malloc_device<double>(n, myQueue);
64 |   d_c = sycl::malloc_device<double>(n, myQueue);
65 | 
66 |   // Initialize vectors on host
67 |   for (int i = 0; i < n; i++) {
68 |     h_a[i] = sin(i) * sin(i);
69 |     h_b[i] = cos(i) * cos(i);
70 |   }
71 | 
72 |   myQueue.memcpy(d_a, h_a, bytes).wait();
73 |   myQueue.memcpy(d_b, h_b, bytes).wait();
74 | 
75 |   // Command Group creation
76 |   auto cg = [&](sycl::handler &h) {
77 |     h.parallel_for(sycl::range(n),
78 |                    [=](sycl::id<1> i) {
79 |                      d_c[i] = d_a[i] + d_b[i];
80 |                    });
81 |   };
82 | 
83 |   // Run the kernel defined above
84 |   myQueue.submit(cg).wait();
85 | 
86 |   // Copy the result back to host
87 |   myQueue.memcpy(h_c, d_c, bytes).wait();
88 | 
89 |   double sum = 0.0f;
90 |   for (int i = 0; i < n; i++) {
91 |     sum += h_c[i];
92 |   }
93 |   std::cout << "Sum is : " << sum << std::endl;
94 | 
95 |   return 0;
96 | }
97 | 


--------------------------------------------------------------------------------
/examples/distrib_batch_gemm/vadd_cuda.cu:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  vadd_cuda.cu
20 |  *
21 |  *  Description:
22 |  *    Vector addition in CUDA
23 |  **************************************************************************/
24 | #include <array>
25 | 
26 | // CUDA kernel. Each thread takes care of one element of c
27 | template<class T>
28 | __global__ void vecAdd(T *a, T *b, T *c, int n)
29 | {
30 |     // Get our global thread ID
31 |     int id = blockIdx.x*blockDim.x+threadIdx.x;
32 |  
33 |     // Make sure we do not go out of bounds
34 |     if (id < n)
35 |         c[id] = a[id] + b[id];
36 | }
37 |  
38 | template <typename T, size_t N>
39 | void simple_vadd_cuda(const std::array<T, N>& VA, const std::array<T, N>& VB,
40 |                  std::array<T, N>& VC) {
41 |     // Device input vectors
42 |     T *d_a;
43 |     T *d_b;
44 |     //Device output vector
45 |     T *d_c;
46 |  
47 |     // Size, in bytes, of each vector
48 |     const size_t bytes = N*sizeof(T);
49 |  
50 |     // Allocate memory for each vector on GPU
51 |     cudaMalloc(&d_a, bytes);
52 |     cudaMalloc(&d_b, bytes);
53 |     cudaMalloc(&d_c, bytes);
54 |  
55 |     // Copy host vectors to device
56 |     cudaMemcpy( d_a, VA.data(), bytes, cudaMemcpyHostToDevice);
57 |     cudaMemcpy( d_b, VB.data(), bytes, cudaMemcpyHostToDevice);
58 |  
59 |     int blockSize, gridSize;
60 |  
61 |     // Number of threads in each thread block
62 |     blockSize = 1024;
63 |  
64 |     // Number of thread blocks in grid
65 |     gridSize = (int)ceil((float)N/blockSize);
66 |  
67 |     // Execute the kernel
68 |     vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, N);
69 |  
70 |     // Copy array back to host
71 |     cudaMemcpy( VC.data(), d_c, bytes, cudaMemcpyDeviceToHost );
72 |  
73 |     // Release device memory
74 |     cudaFree(d_a);
75 |     cudaFree(d_b);
76 |     cudaFree(d_c);
77 | 
78 | }
79 | 
80 | 
81 | template void simple_vadd_cuda<float, 4>(const std::array<float, 4>& VA, const std::array<float, 4>& VB,
82 |                  std::array<float, 4>& VC);
83 | template void simple_vadd_cuda<int, 4>(const std::array<int, 4>& VA, const std::array<int, 4>& VB,
84 |                  std::array<int, 4>& VC);
85 | 
86 | 


--------------------------------------------------------------------------------
/examples/sgemm_interop/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #/***************************************************************************
 2 | # *
 3 | # *  Copyright (C) Codeplay Software Ltd.
 4 | # *
 5 | # *  Licensed under the Apache License, Version 2.0 (the "License");
 6 | # *  you may not use this file except in compliance with the License.
 7 | # *  You may obtain a copy of the License at
 8 | # *
 9 | # *      http://www.apache.org/licenses/LICENSE-2.0
10 | # *
11 | # *  Unless required by applicable law or agreed to in writing, software
12 | # *  distributed under the License is distributed on an "AS IS" BASIS,
13 | # *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # *  See the License for the specific language governing permissions and
15 | # *  limitations under the License.
16 | # *
17 | # *  Codeplay's SYCL-For-CUDA-Examples
18 | # *
19 | # *  CMakeLists.txt
20 | # *
21 | # *  Description:
22 | # *    CMake for SGEMM
23 | # **************************************************************************/
24 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
25 | 
26 | # Don't complain about empty CMAKE_CUDA_ARCHITECTURES
27 | cmake_policy(SET CMP0104 OLD)
28 | 
29 | project(sycl_cuda_interop LANGUAGES CXX CUDA)
30 | 
31 | find_package(CUDAToolkit)
32 | 
33 | # SYCL installation
34 | if (NOT SYCL_ROOT) 
35 |   message(FATAL_ERROR "No SYCL installation detected")
36 | endif(NOT SYCL_ROOT)
37 | 
38 | set(SYCL_INCLUDE_DIR "${SYCL_ROOT}/lib/clang/14.0.0/include/")
39 | set(SYCL_LIB "${SYCL_ROOT}/lib/libsycl.so")
40 | set(SYCL_FLAGS "-fsycl" 
41 |       "-fsycl-targets=nvptx64-nvidia-cuda"
42 |       "-fsycl-unnamed-lambda"
43 |       "-Wno-linker-warnings")
44 | 
45 | 
46 | # Build the CUDA code
47 | add_executable(cuda_sgemm sgemm.cu)
48 | target_compile_features(cuda_sgemm PUBLIC cxx_std_11)
49 | set_target_properties(cuda_sgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
50 | set_property(TARGET cuda_sgemm PROPERTY BUILD_RPATH "${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}")
51 | target_link_libraries(cuda_sgemm CUDA::toolkit CUDA::cublas)
52 | 
53 | # Build the SYCL code
54 | add_executable (sycl_sgemm sycl_sgemm.cpp)
55 | target_compile_features(sycl_sgemm PUBLIC cxx_std_17)
56 | target_compile_options(sycl_sgemm PUBLIC ${SYCL_FLAGS})
57 | target_compile_definitions(sycl_sgemm PUBLIC CUDA_NO_HALF)
58 | target_link_libraries(sycl_sgemm PUBLIC ${SYCL_FLAGS})
59 | target_include_directories(sycl_sgemm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
60 | target_link_libraries(sycl_sgemm PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)
61 | 
62 | # Build the SYCL USM code
63 | add_executable (sycl_sgemm_usm sycl_sgemm_usm.cpp)
64 | target_compile_features(sycl_sgemm_usm PUBLIC cxx_std_17)
65 | target_compile_options(sycl_sgemm_usm PUBLIC ${SYCL_FLAGS})
66 | target_compile_definitions(sycl_sgemm_usm PUBLIC CUDA_NO_HALF)
67 | target_link_libraries(sycl_sgemm_usm PUBLIC ${SYCL_FLAGS})
68 | target_include_directories(sycl_sgemm_usm PUBLIC ${SYCL_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS})
69 | target_link_libraries(sycl_sgemm_usm PUBLIC CUDA::toolkit CUDA::cuda_driver CUDA::cublas)
70 | 


--------------------------------------------------------------------------------
/examples/hashing/include/tools/fill_rand.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  fill_rand.hpp
20 |  *
21 |  *  Description:
22 |  *    Random generation for containers
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <sycl/sycl.hpp>
27 | #include <random>
28 | #include <algorithm>
29 | #include <type_traits>
30 | #include "usm_smart_ptr.hpp"
31 | 
32 | using namespace usm_smart_ptr;
33 | 
34 | /**
35 |  * Fills a container/array with random numbers from positions first to last
36 |  */
37 | template<typename T, class ForwardIt>
38 | static inline void do_fill_rand_on_host(ForwardIt first, ForwardIt last) {
39 |     static std::random_device dev;
40 |     static std::mt19937 engine(dev());
41 |     auto generator = [&]() {
42 |         if constexpr (std::is_integral<T>::value) {
43 |             static std::uniform_int_distribution<T> distribution;
44 |             return distribution(engine);
45 |         } else if constexpr (std::is_floating_point<T>::value) {
46 |             static std::uniform_real_distribution<T> distribution;
47 |             return distribution(engine);
48 |         } else if constexpr (std::is_same_v<T, sycl::half>) {
49 |             static std::uniform_real_distribution<float> distribution;
50 |             return distribution(engine);
51 |         }
52 |     };
53 |     std::generate(first, last, generator);
54 | }
55 | 
56 | 
57 | /**
58 |  * This function accepts only memory that is accessible from the CPU
59 |  * To achive this it uses fantom types that wraps the pointer.
60 |  * This could be done by calling the runtime to check where is the
61 |  * usm memory allocated, but here we can avoid doing that.
62 |  */
63 | template<typename T>
64 | static inline void fill_rand(host_accessible_ptr<T> v, size_t count) {
65 |     do_fill_rand_on_host<T>((T *) v, (T *) v + count);
66 | }
67 | 
68 | /**
69 |  * This function would only accept device allocated memory
70 |  */
71 | /*template<typename T, sycl::usm::alloc location>
72 | typename std::enable_if<location == sycl::usm::alloc::device, void>::type
73 | fill_rand(const usm_ptr<T, location> &v, size_t count) {
74 |     do_fill_rand_on_device<T>(+v, v + count);
75 | }*/
76 | 
77 | template<typename T>
78 | static inline void fill_rand(std::vector<T> &v) {
79 |     do_fill_rand_on_host<T>(v.begin(), v.end());
80 | }
81 | 


--------------------------------------------------------------------------------
/examples/hashing/include/internal/determine_kernel_config.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  determine_kernel_config.hpp
20 |  *
21 |  *  Description:
22 |  *    Functions for SYCL kernel configuration
23 |  **************************************************************************/
24 | #pragma once
25 | 
26 | #include <sycl/sycl.hpp>
27 | #include <cstddef>
28 | #include <utility>
29 | 
30 | namespace hash::internal {
31 | 
32 | 
33 |     struct kernel_config {
34 |         size_t wg_size;
35 |         size_t block;
36 |     };
37 | 
38 | 
39 |     inline kernel_config get_kernel_sizes(const sycl::queue &q, size_t job_size) {
40 |         kernel_config config{.wg_size= 1, .block= job_size};
41 |         if (q.get_device().is_gpu()) {
42 |             /**
43 |              * If the device is a GPU we will try to have as many threads in each work group as possible.
44 |              * We need to bound the value of `max_work_group_size` as it can be ANY 64-bit integer
45 |              */
46 |             config.wg_size = std::min(std::max(1ul, 2 * q.get_device().get_info<sycl::info::device::max_work_group_size>()), job_size);
47 |             config.wg_size = std::min(config.wg_size, 64ul); //TODO Find a better alternative than a hardcoded 64 ?
48 |             config.block = (job_size / config.wg_size) + (job_size % config.wg_size != 0);
49 |         } else {
50 |             /**
51 |              * We need that case because on a CPU, one work group runs on one thread, and threads are expensive to launch
52 |              * We'll multiply the thread count by a factor in order to allow the scheduler to better balance the work load.
53 |              */
54 |             config.block = std::min((size_t) std::max(1u, 2 * q.get_device().get_info<sycl::info::device::max_compute_units>()), job_size);
55 |             config.wg_size = job_size / config.block + (job_size % config.block != 0);
56 | 
57 |             /* We check that the work groups are not too big */
58 |             size_t max_wg_size = std::min(std::max(1ul, q.get_device().get_info<sycl::info::device::max_work_group_size>()), job_size);
59 |             if (config.wg_size > max_wg_size) {
60 |                 config.wg_size = max_wg_size;
61 |                 config.block = (job_size / config.wg_size) + (job_size % config.wg_size != 0);
62 |             }
63 | 
64 |         }
65 |         assert(config.block * config.wg_size >= job_size);
66 |         return config;
67 |     }
68 | 
69 | }


--------------------------------------------------------------------------------
/setup-script/README.md:
--------------------------------------------------------------------------------
 1 | # oneAPI on CUDA setup script
 2 | 
 3 | This script allows you to build and setup the DPC++ compiler, oneMKL and oneDNN with Nvidia GPUs support.
 4 | 
 5 | ### Use
 6 | 
 7 | 1. If needed, set `$DPCPP_HOME` and `$CUDA_ROOT` in the script.
 8 | 2. Run the script with ```./build.sh```.
 9 | 
10 | Everything will be installed to `$DPCPP_HOME/deploy`.
11 | 
12 | To build with testing support and run the tests for DPC++, oneMKL, oneTBB, oneDNN and Lapack,
13 | run: ```DPCPP_TESTS=ON ./build.sh```
14 | 
15 | If you want to build libc++, use `CC=clang-X CXX=clang++-X` with another version of clang, gcc won't compile the libc as
16 | the `asm` syntax is not the same. Using this clang/dpc++ won't work either, there's a bug. Then link
17 | with `-stdlib=libc++`.
18 | 
19 | ### Environment variables
20 | 
21 | Once everything was built, add the first four exports of the script in your environment or add them in your shell's
22 | config file.
23 | 
24 | ### Using the CUDA Backend
25 | 
26 | 1. Use the following selector:
27 | 
28 | ```C++
29 | class CUDADeviceSelector : public sycl::device_selector {
30 | public:
31 |     int operator()(const sycl::device &device) const override {
32 |         return device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda ? 1 : -1;
33 |     }
34 | };
35 | ```
36 | 
37 | 2. Build with `-fsycl -fsycl-targets=nvptx64-nvidia-cuda`
38 | 3. For oneMKL, link with: `-lonemkl_blas_cublas -lonemkl`
39 | 
40 | ### Intel's openCL
41 | 
42 | You can also install Intel's openCL driver (`intel-oneapi-runtime-opencl`) so you can target your CPU
43 | with `-fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice`. For the setup
44 | see [here](https://software.intel.com/content/www/us/en/develop/documentation/installation-guide-for-intel-oneapi-toolkits-linux/top/installation/install-using-package-managers.html)
45 | .
46 | 
47 | ### Dependencies
48 | 
49 | * Even though the latest version of CUDA seems to work with DPC++, we need to use CUDA 10.2 as oneMKL uses cuBLAS 10.2.
50 |   On RHEL: `cuda-10-2 libcublas-devel-10-2`. For the setup
51 |   see: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#pre-installation-actions.
52 | * `libatomic`
53 | * to be able to target Intel's CPUs with openCL.
54 | * `ninja-build cmake libxml2-devel ...` and other "build essentials" required for building a compiler.
55 | * `libcudnn8 libcudnn8-devel` for oneDNN.
56 | * `gcc-gfortran` or another Fortran compiler for NETLIB Lapack.
57 | * `hwloc-devel` for oneTBB
58 | * `mpfr-devel` when building libcxx
59 | 
60 | ### Caveats
61 | 
62 | * CLion integration: you need to add `cidr.compiler.clang.fsycl=true` in the registry of CLion [see](https://www.jetbrains.com/help/clion/tuning-the-ide.html?keymap=secondary_macos#configure-platform-properties)
63 | * oneMKL does not support testing when building for both cuBLAS and cuRAND
64 | * for oneMKL with cuda you should use the namespace `oneapi::mkl::blas::column_major::` as cuBLAS is column_major.
65 | 
66 | ### Repositories used
67 | 
68 | - OpenCL Headers+Loaders: https://github.com/KhronosGroup/OpenCL-Headers.git
69 |   , https://github.com/KhronosGroup/OpenCL-ICD-Loader.git
70 | - DPC++ Compiler: https://github.com/intel/llvm.git
71 | - NETLIB Lapack: https://github.com/Reference-LAPACK/lapack.git
72 | - oneTBB: https://github.com/oneapi-src/oneTBB.git
73 | - oneMKL: https://github.com/oneapi-src/oneMKL.git
74 | - oneDNN: https://github.com/oneapi-src/oneDNN.git
75 | 


--------------------------------------------------------------------------------
/examples/hashing/cmake/Modules/ComputeCppCompilerChecks.cmake:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.4.3)
 2 | 
 3 | if (CMAKE_COMPILER_IS_GNUCXX)
 4 |     if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.8)
 5 |         message(FATAL_ERROR "host compiler - gcc version must be > 4.8")
 6 |     endif ()
 7 | elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
 8 |     if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.6)
 9 |         message(FATAL_ERROR "host compiler - clang version must be > 3.6")
10 |     endif ()
11 | endif ()
12 | 
13 | if (MSVC)
14 |     set(ComputeCpp_STL_CHECK_SRC __STL_check)
15 |     file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
16 |             "#include <CL/sycl.hpp>  \n"
17 |             "int main() { return 0; }\n")
18 |     set(_stl_test_command ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}
19 |             -sycl
20 |             ${COMPUTECPP_DEVICE_COMPILER_FLAGS}
21 |             -isystem ${ComputeCpp_INCLUDE_DIRS}
22 |             -isystem ${OpenCL_INCLUDE_DIRS}
23 |             -o ${ComputeCpp_STL_CHECK_SRC}.sycl
24 |             -c ${ComputeCpp_STL_CHECK_SRC}.cpp)
25 |     execute_process(
26 |             COMMAND ${_stl_test_command}
27 |             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
28 |             RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
29 |             ERROR_VARIABLE ComputeCpp_STL_CHECK_ERROR_OUTPUT
30 |             OUTPUT_QUIET)
31 |     if (NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
32 |         # Try disabling compiler version checks
33 |         execute_process(
34 |                 COMMAND ${_stl_test_command}
35 |                 -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
36 |                 WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
37 |                 RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
38 |                 ERROR_VARIABLE ComputeCpp_STL_CHECK_ERROR_OUTPUT
39 |                 OUTPUT_QUIET)
40 |         if (NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
41 |             # Try again with __CUDACC__ and _HAS_CONDITIONAL_EXPLICIT=0. This relaxes the restritions in the MSVC headers
42 |             execute_process(
43 |                     COMMAND ${_stl_test_command}
44 |                     -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
45 |                     -D_HAS_CONDITIONAL_EXPLICIT=0
46 |                     -D__CUDACC__
47 |                     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
48 |                     RESULT_VARIABLE ComputeCpp_STL_CHECK_RESULT
49 |                     ERROR_VARIABLE ComputeCpp_STL_CHECK_ERROR_OUTPUT
50 |                     OUTPUT_QUIET)
51 |             if (NOT ${ComputeCpp_STL_CHECK_RESULT} EQUAL 0)
52 |                 message(FATAL_ERROR "compute++ cannot consume hosted STL headers. This means that compute++ can't \
53 |                                compile a simple program in this platform and will fail when used in this system. \
54 |                                \n ${ComputeCpp_STL_CHECK_ERROR_OUTPUT}")
55 |             else ()
56 |                 list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH
57 |                         -D_HAS_CONDITIONAL_EXPLICIT=0
58 |                         -D__CUDACC__)
59 |             endif ()
60 |         else ()
61 |             list(APPEND COMPUTECPP_DEVICE_COMPILER_FLAGS -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH)
62 |         endif ()
63 |     endif ()
64 |     file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp
65 |             ${CMAKE_CURRENT_BINARY_DIR}/${ComputeCpp_STL_CHECK_SRC}.cpp.sycl)
66 | endif (MSVC)
67 | 


--------------------------------------------------------------------------------
/examples/hashing/include/tools/intrinsics.hpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  intrinsics.hpp
20 |  *
21 |  *  Description:
22 |  *    Intrinsic operations for hashing functions
23 |  **************************************************************************/
24 | /**
25 |     Copyright 2021 Codeplay Software Ltd.
26 | 
27 |     Licensed under the Apache License, Version 2.0 (the "License");
28 |     you may not use these files except in compliance with the License.
29 |     You may obtain a copy of the License at
30 | 
31 |     http://www.apache.org/licenses/LICENSE-2.0
32 | 
33 |     For your convenience, a copy of the License has been included in this
34 |     repository.
35 | 
36 |     Unless required by applicable law or agreed to in writing, software
37 |     distributed under the License is distributed on an "AS IS" BASIS,
38 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
39 |     See the License for the specific language governing permissions and
40 |     limitations under the License.
41 |  */
42 | 
43 | #pragma once
44 | 
45 | #include <sycl/sycl.hpp>
46 | 
47 | namespace sbb {
48 | 
49 |     template<typename T>
50 |     static inline std::enable_if_t<std::is_same_v<T, std::byte> || std::is_same_v<T, unsigned char>, uint32_t>
51 |     upsample(const T hi_hi, const T hi, const T lo, const T lo_lo) {
52 |         uint16_t hi_upsampled = (uint16_t(hi_hi) << 8) + uint16_t(hi);
53 |         uint16_t lo_upsampled = (uint16_t(lo) << 8) + uint16_t(lo_lo);
54 |         return (uint32_t(hi_upsampled) << 16) + uint32_t(lo_upsampled);
55 |     }
56 | 
57 | 
58 |     static inline sycl::event memcpy_with_dependency(sycl::queue &q, void *dest, const void *src, size_t numBytes, sycl::event depEvent) {
59 |         return q.submit([=](sycl::handler &cgh) {
60 |             cgh.depends_on(depEvent);
61 |             cgh.memcpy(dest, src, numBytes);
62 |         });
63 |     }
64 | 
65 |     static inline sycl::event memcpy_with_dependency(sycl::queue &q, void *dest, const void *src, size_t numBytes, const std::vector<sycl::event> &depEvent) {
66 |         return q.submit([&](sycl::handler &cgh) {
67 |             cgh.depends_on(depEvent);
68 |             cgh.memcpy(dest, src, numBytes);
69 |         });
70 |     }
71 | 
72 | 
73 |     template<typename T>
74 |     static inline constexpr uint8_t get_byte(const T &word, const uint &idx) {
75 |         static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
76 |         return (word >> (8 * idx)) & 0xFF;
77 |     }
78 | 
79 |     template<typename T>
80 |     static inline constexpr T set_byte(const T &word, const uint8_t &byte_in, const uint &idx) {
81 |         static_assert(std::is_integral_v<T> && std::is_unsigned_v<T>);
82 |         T select_mask = ~(T(0xFF) << (idx * 8));
83 |         T new_val = (T(byte_in) & 0xFF) << (idx * 8);
84 |         return (word & select_mask) + new_val;
85 |     }
86 | }
87 | 
88 | 
89 | 
90 | 


--------------------------------------------------------------------------------
/examples/sgemm_interop/sgemm.cu:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sgemm.cu
 20 |  *
 21 |  *  Description:
 22 |  *    SGEMM operation in CUDA
 23 |  **************************************************************************/
 24 | #include <algorithm>
 25 | #include <iostream>
 26 | #include <vector>
 27 | 
 28 | #include <cublas_v2.h>
 29 | #include <cuda.h>
 30 | 
 31 | #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
 32 | 
 33 | void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
 34 |   if (status != CUBLAS_STATUS_SUCCESS) {
 35 |     std::cout << msg << " - " << status << std::endl;
 36 |     exit(EXIT_FAILURE);
 37 |   }
 38 | }
 39 | 
 40 | void inline checkCudaErrorMsg(cudaError status, const char *msg) {
 41 |   if (status != cudaSuccess) {
 42 |     std::cout << msg << " - " << status << std::endl;
 43 |     exit(EXIT_FAILURE);
 44 |   }
 45 | }
 46 | 
 47 | int main() {
 48 |   constexpr size_t WIDTH = 1024;
 49 |   constexpr size_t HEIGHT = 1024;
 50 |   constexpr float ALPHA = 1.0f;
 51 |   constexpr float BETA = 0.0f;
 52 | 
 53 |   std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
 54 |       h_C(WIDTH * HEIGHT);
 55 | 
 56 |   std::cout << "Size: " << h_C.size() << std::endl;
 57 |   float *d_A, *d_B, *d_C;
 58 | 
 59 |   // A is an identity matrix
 60 |   std::fill(std::begin(h_A), std::end(h_A), 0.0f);
 61 |   for (size_t i = 0; i < WIDTH; i++) {
 62 |     h_A[i * WIDTH + i] = 1.0f;
 63 |   }
 64 | 
 65 |   // B is a matrix fill with 1
 66 |   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 67 | 
 68 |   const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
 69 | 
 70 |   CHECK_ERROR(cudaMalloc((void **)&d_A, numBytes));
 71 |   CHECK_ERROR(cudaMalloc((void **)&d_B, numBytes));
 72 |   CHECK_ERROR(cudaMalloc((void **)&d_C, numBytes));
 73 | 
 74 |   CHECK_ERROR(cudaMemcpy(d_A, h_A.data(), numBytes, cudaMemcpyHostToDevice));
 75 |   CHECK_ERROR(cudaMemcpy(d_B, h_B.data(), numBytes, cudaMemcpyHostToDevice));
 76 | 
 77 |   cublasHandle_t handle;
 78 |   CHECK_ERROR(cublasCreate(&handle));
 79 | 
 80 |   // C = A * B
 81 |   CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
 82 |                           WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA, d_C,
 83 |                           WIDTH));
 84 | 
 85 |   CHECK_ERROR(cudaMemcpy(h_C.data(), d_C, numBytes, cudaMemcpyDeviceToHost));
 86 | 
 87 |   // C must be all ones
 88 |   const bool allEqual = std::all_of(std::begin(h_C), std::end(h_C),
 89 |                                     [](float num) { return num == 1; });
 90 | 
 91 |   if (!allEqual) {
 92 |     std::cout << " Incorrect result " << std::endl;
 93 |   } else {
 94 |     std::cout << " Correct! " << std::endl;
 95 |   }
 96 | 
 97 |   CHECK_ERROR(cublasDestroy(handle));
 98 |   CHECK_ERROR(cudaFree(d_A));
 99 |   CHECK_ERROR(cudaFree(d_B));
100 |   CHECK_ERROR(cudaFree(d_C));
101 | 
102 |   return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
103 | }
104 | 


--------------------------------------------------------------------------------
/setup-script/sample/include/common.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <sycl/sycl.hpp>
 4 | #include <random>
 5 | #include <algorithm>
 6 | #include <type_traits>
 7 | #include <usm_smart_ptr.hpp>
 8 | 
 9 | using namespace usm_smart_ptr;
10 | 
11 | class cuda_selector : public sycl::device_selector {
12 | public:
13 |     int operator()(const sycl::device &device) const override {
14 |         return device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda;
15 |         //return device.is_gpu() && (device.get_info<sycl::info::device::driver_version>().find("CUDA") != std::string::npos);
16 |     }
17 | };
18 | 
19 | /**
20 |  * Tries to get a CUDA device else returns the host device
21 |  */
22 | sycl::queue try_get_queue(const sycl::device_selector &selector) {
23 |     sycl::device dev;
24 |     try {
25 |         dev = sycl::device(selector);
26 |     }
27 |     catch (...) {
28 |         dev = sycl::device(sycl::host_selector());
29 |         std::cout << "Warning: GPU device not found! Fall back on: " << dev.get_info<sycl::info::device::name>()
30 |                   << std::endl;
31 |     }
32 |     auto exception_handler = [](const sycl::exception_list &exceptions) {
33 |         for (std::exception_ptr const &e : exceptions) {
34 |             try {
35 |                 std::rethrow_exception(e);
36 |             }
37 |             catch (sycl::exception const &e) {
38 |                 std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl;
39 |             }
40 |             catch (std::exception const &e) {
41 |                 std::cout << "Caught asynchronous STL exception: " << e.what() << std::endl;
42 |             }
43 |         }
44 |     };
45 | 
46 |     return sycl::queue(dev, exception_handler);
47 | }
48 | 
49 | 
50 | /**
51 |  * Fills a container/array with random numbers from positions first to last
52 |  */
53 | template<typename T, class ForwardIt>
54 | void do_fill_rand_on_host(ForwardIt first, ForwardIt last) {
55 |     static std::random_device dev;
56 |     static std::mt19937 engine(dev());
57 |     auto generator = [&]() {
58 |         if constexpr (std::is_integral<T>::value) {
59 |             static std::uniform_int_distribution<T> distribution;
60 |             return distribution(engine);
61 |         } else if constexpr (std::is_floating_point<T>::value) {
62 |             static std::uniform_real_distribution<T> distribution;
63 |             return distribution(engine);
64 |         } else if constexpr (std::is_same_v<T, sycl::half>) {
65 |             static std::uniform_real_distribution<float> distribution;
66 |             return distribution(engine);
67 |         }
68 |     };
69 |     std::generate(first, last, generator);
70 | }
71 | 
72 | 
73 | /**
74 |  * This function accepts only memory that is accessible from the CPU
75 |  * To achive this it uses fantom types that wraps the pointer.
76 |  * This could be done by calling the runtime to check where is the
77 |  * usm memory allocated, but here we can avoid doing that.
78 |  */
79 | template<typename T, sycl::usm::alloc location>
80 | typename std::enable_if<location == sycl::usm::alloc::host || location == sycl::usm::alloc::shared, void>::type
81 | fill_rand(const usm_ptr<T, location> &v, size_t count) {
82 |     do_fill_rand_on_host<T>(+v, v + count);
83 | }
84 | 
85 | /**
86 |  * This function would only accept device allocated memory
87 |  */
88 | /*template<typename T, sycl::usm::alloc location>
89 | typename std::enable_if<location == sycl::usm::alloc::device, void>::type
90 | fill_rand(const usm_ptr<T, location> &v, size_t count) {
91 |     do_fill_rand_on_device<T>(+v, v + count);
92 | }*/
93 | 
94 | template<typename T>
95 | void fill_rand(std::vector<T> &v) {
96 |     do_fill_rand_on_host<T>(v.begin(), v.end());
97 | }
98 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | SYCL for CUDA examples
 2 | ==========================
 3 | 
 4 | [![OpenSSF Scorecard](https://api.scorecard.dev/projects/github.com/codeplaysoftware/SYCL-For-CUDA-Examples/badge)](https://scorecard.dev/viewer/?uri=github.com/codeplaysoftware/SYCL-For-CUDA-Examples)
 5 | 
 6 | This repository contains examples that demonstrate how to use the CUDA backend
 7 | in SYCL.
 8 | 
 9 | The examples are built and test in Linux with GCC 7.4, NVCC 10.1 and the
10 | experimental support for CUDA in the DPC++ SYCL implementation.
11 | 
12 | CUDA is a registered trademark of NVIDIA Corporation
13 | SYCL is a trademark of the Khronos Group Inc.
14 | 
15 | Prerequisites
16 | -------------
17 | 
18 | These examples are intended to be used with this [docker image](https://hub.docker.com/r/ruyman/dpcpp_cuda_examples). 
19 | It provides all the examples, libraries and the required environment variables. 
20 | 
21 | [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-docker) must be installed to run the image.
22 | 
23 | A useful guide for setting up docker and the NVIDIA Container Toolkit can be found [here](https://www.pugetsystems.com/labs/hpc/Workstation-Setup-for-Docker-with-the-New-NVIDIA-Container-Toolkit-nvidia-docker2-is-deprecated-1568).
24 | 
25 | Getting Started
26 | -------------
27 | 
28 | Once docker and the NVIDIA Container Toolkit are installed, we can create a new container and run the examples witin it.
29 | 
30 | ``` sh
31 | $ sudo docker run --gpus all -it ruyman/dpcpp_cuda_examples
32 | ```
33 | 
34 | Once inside the docker image, navigate to `/home/examples/` to find a local clone of this repo. Make sure to pull the latest changes:
35 | 
36 | ``` sh
37 | $ cd /home/examples/SYCL-For-CUDA-Examples
38 | $ git pull
39 | ```
40 | 
41 | Refer to each example and/or exercise for detailed instructions on how  to run it.
42 | 
43 | Examples
44 | =========
45 | 
46 | [Vector Addition](examples/vector_addition)
47 | --------------------------------------------
48 | 
49 | This trivial example can be used to compare a simple vector addition in CUDA to
50 | an equivalent implementation in SYCL for CUDA. The aim of the example is also 
51 | to highlight how to build an application with SYCL for CUDA using DPC++ support, 
52 | for which an example CMakefile is provided.
53 | 
54 | [Fortran Interface](examples/fortran_interface)
55 | --------------------------------------------
56 | 
57 | This demonstrates an example of how to call a SYCL function from a CUDA fortran code.
58 | 
59 | [MPI](examples/MPI)
60 | --------------------------------------------
61 | 
62 | This example shows how to integrate MPI calls within the SYCL DAG using Host Tasks for integration.
63 | 
64 | 
65 | [SGEMM Interop](examples/sgemm_interop)
66 | --------------------------
67 | 
68 | This demonstrates using SYCL's `host_task` for CUDA interoperability, calling CUBLAS's SGEMM routine for matrix multiplication.
69 | 
70 | [Distributed (MPI) GEMM](examples/distrib_batch_gemm)
71 | --------------------------------------------
72 | 
73 | This example combines the MPI and SGEMM Interop examples to distribute a matrix multiplication problem between MPI ranks.
74 | 
75 | [Kokkos](examples/kokkos)
76 | --------------------------------------------
77 | 
78 | [Kokkos](https://github.com/kokkos/kokkos) is a middle-layer for scientific computing which features a SYCL backend. This example 
79 | shows a small Kokkos test case (vector-matrix-vector multiplication), adapted from a test case in the Kokkos repo; 
80 | there is no SYCL code in the example, but it includes scripts to build Kokkos with SYCL support.
81 | 
82 | [Hashing Algorithms](examples/hashing)
83 | --------------------------------------------
84 | 
85 | This example is slightly different - it benchmarks a series of hashing algorithms.
86 | 
87 | 


--------------------------------------------------------------------------------
/examples/hashing/include/hash_functions/blake3.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  blake3.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Blake3 hash function
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <internal/config.hpp>
 27 | #include <tools/usm_smart_ptr.hpp>
 28 | 
 29 | 
 30 | #include <cstddef>
 31 | #include <cstdint>
 32 | 
 33 | #define BLAKE3_VERSION_STRING "0.3.7"
 34 | #define BLAKE3_KEY_LEN 32
 35 | #define BLAKE3_OUT_LEN 32
 36 | #define BLAKE3_BLOCK_LEN 64
 37 | #define BLAKE3_CHUNK_LEN 1024
 38 | #define BLAKE3_MAX_DEPTH 54
 39 | 
 40 | // This struct is a private implementation detail. It has to be here because
 41 | // it's part of blake3_hasher below.
 42 | struct blake3_chunk_state {
 43 |     uint32_t cv[8];
 44 |     uint64_t chunk_counter;
 45 |     uint8_t buf[BLAKE3_BLOCK_LEN];
 46 |     uint8_t buf_len;
 47 |     uint8_t blocks_compressed;
 48 |     uint8_t flags;
 49 | };
 50 | 
 51 | struct blake3_hasher {
 52 |     uint32_t key[8];
 53 |     blake3_chunk_state chunk;
 54 |     uint8_t cv_stack_len;
 55 |     // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
 56 |     // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
 57 |     // requires a 4th entry, rather than merging everything down to 1, because we
 58 |     // don't know whether more input is coming. This is different from how the
 59 |     // reference implementation does things.
 60 |     uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
 61 | };
 62 | 
 63 | //const char *blake3_version();
 64 | 
 65 | //void blake3_hasher_init(blake3_hasher *self);
 66 | 
 67 | //void blake3_hasher_init_keyed(blake3_hasher *self, const uint8_t key[BLAKE3_KEY_LEN]);
 68 | 
 69 | //void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
 70 | 
 71 | //void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, size_t context_len);
 72 | 
 73 | //void blake3_hasher_update(blake3_hasher *self, const void *input, size_t input_len);
 74 | 
 75 | //void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, size_t out_len);
 76 | 
 77 | //void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, uint8_t *out, size_t out_len);
 78 | 
 79 | 
 80 | #include <cassert>
 81 | 
 82 | #include <cstddef>
 83 | #include <cstdint>
 84 | #include <cstring>
 85 | 
 86 | // internal flags
 87 | enum blake3_flags {
 88 |     CHUNK_START = 1 << 0,
 89 |     CHUNK_END = 1 << 1,
 90 |     PARENT = 1 << 2,
 91 |     ROOT = 1 << 3,
 92 |     KEYED_HASH = 1 << 4,
 93 |     DERIVE_KEY_CONTEXT = 1 << 5,
 94 |     DERIVE_KEY_MATERIAL = 1 << 6,
 95 | };
 96 | 
 97 | 
 98 | // There are some places where we want a static size that's equal to the
 99 | // MAX_SIMD_DEGREE, but also at least 2.
100 | #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
101 | 
102 | 
103 | namespace hash::internal {
104 |     using namespace usm_smart_ptr;
105 | 
106 |     sycl::event
107 |     launch_blake3_kernel(sycl::queue &item, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch, dword n_outbit, const byte *key, dword keylen);
108 | 
109 |     int test_blake3();
110 | 
111 | }


--------------------------------------------------------------------------------
/examples/vector_addition/README.md:
--------------------------------------------------------------------------------
 1 | Vector addition
 2 | ===============================
 3 | 
 4 | This trivial example can be used to compare a simple vector addition in CUDA to
 5 | an equivalent implementation in SYCL for CUDA. The aim of the example is also 
 6 | to highlight how to build an application with SYCL for CUDA using DPC++ support, 
 7 | for which an example CMakefile is provided. For detailed documentation on how to
 8 | migrate from CUDA to SYCL, see [SYCL For CUDA Developers](https://developer.codeplay.com/products/computecpp/ce/guides/sycl-for-cuda-developers).
 9 | 
10 | Pre-requisites
11 | ---------------
12 | 
13 | These instructions assume that example [docker image](https://hub.docker.com/r/ruyman/dpcpp_cuda_examples/dockerfile) is being used. This image 
14 | simplifies accessing these examples as the environment is set up correctly.
15 | For details on how to get started with the example docker image, refer to the 
16 | root README file.
17 | 
18 | Building the example
19 | ---------------------
20 | 
21 | ``` sh
22 | $ mkdir build && cd build
23 | $ cmake ../ -DSYCL_ROOT=${SYCL_ROOT_DIR} -DCMAKE_CXX_COMPILER=${SYCL_ROOT_DIR}/bin/clang++
24 | $ make -j 8
25 | ```
26 | 
27 | This should produce two binaries, `vector_addition` and `sycl_vector_addition` .
28 | The former is the unmodified CUDA source and the second is the SYCL for CUDA
29 | version.
30 | 
31 | Running the example
32 | --------------------
33 | 
34 | ``` 
35 | $ ./sycl_vector_addition
36 | $ ./vector_addition
37 | ```
38 | 
39 | CMake Build script
40 | ------------------------
41 | 
42 | The provided CMake build script uses the native CUDA support to build the
43 | CUDA application. It also serves as a check that all CUDA requirements
44 | on the system are available (such as an installation of CUDA on the system).
45 | 
46 | Two flags are required: `-DSYCL_ROOT` , which must point to the place where the
47 | DPC++ compiler is installed, and `-DCMAKE_CXX_COMPILER` , which must point to
48 | the Clang compiler provided by DPC++. 
49 | 
50 | The CMake target `sycl_vector_addition` will build the SYCL version of
51 | the application.
52 | 
53 | Note the variable `SYCL_FLAGS` is used to store the Clang flags that enable
54 | the compilation of a SYCL application ( `-fsycl` ) but also the flag that specify
55 | which targets are built ( `-fsycl-targets` ). In this case, we will build the example 
56 | for both NVPTX and SPIR64. This means the kernel for the vector addition will be 
57 | compiled for both backends, and runtime selection to the right queue will 
58 | decide which variant to use.
59 | 
60 | Note the project is built with C++17 support, which enables the usage of
61 | [deduction guides](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/deduction_guides/SYCL_INTEL_deduction_guides.asciidoc) to reduce the number of template parameters used.
62 | 
63 | SYCL Vector Addition code
64 | --------------------------
65 | 
66 | The vector addition example uses a simple approach to implement with a plain
67 | kernel that performs the add. Vectors are stored directly in buffers. Data is
68 | initialized on the host using host accessors. This approach avoids creating
69 | unnecessary storage on the host, and facilitates the SYCL runtime to use
70 | optimized memory paths.
71 | 
72 | The SYCL queue created later on uses a custom `CUDASelector` to select a CUDA
73 | device, or bail out if its not there. The CUDA selector uses the
74 | `info::device::driver_version` to identify the device exported by the CUDA
75 | backend. If the NVIDIA OpenCL implementation is available on the system, it
76 | will be reported as another SYCL device. The driver version is the best way to
77 | differentiate between the two.
78 | 
79 | The command group is created as a lambda expression that takes the 
80 | `sycl::handler` parameter. Accessors are obtained from buffers using the
81 | `get_access` method. Finally the `parallel_for` with the SYCL kernel is invoked
82 | as usual.
83 | 
84 | The command group is subm$ itted to a queue which will convert all the operations
85 | into CUDA commands that will be executed once the host accessor is encountered
86 | later on.
87 | 
88 | The host accessor will trigger a copy of the data back to the host, and then
89 | the values are reduced into a single sum element.
90 | 


--------------------------------------------------------------------------------
/examples/hashing/include/internal/handle.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  handle.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Handler for objects for hashing functions
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <utility>
 27 | #include <iostream>
 28 | #include "config.hpp"
 29 | #include "../tools/usm_smart_ptr.hpp"
 30 | 
 31 | 
 32 | namespace hash {
 33 |     using namespace usm_smart_ptr;
 34 |     struct handle_item {
 35 |         usm_unique_ptr<byte, alloc::device> input_dev_data_;
 36 |         usm_unique_ptr<byte, alloc::device> output_dev_data_;
 37 |         sycl::event dev_e_;
 38 |     };
 39 | 
 40 |     /**
 41 |      * Holds unique pointers to the memory used by the different queues.
 42 |      * This object is thus not copyable.
 43 |      */
 44 |     class handle {
 45 |     private:
 46 |         std::vector<handle_item> items_{};
 47 |     public:
 48 |         /**
 49 |          * Move constructor.
 50 |          */
 51 |         explicit handle(std::vector<handle_item> &&input) noexcept:
 52 |                 items_(std::move(input)) {
 53 |         }
 54 | 
 55 |         handle() = default;
 56 | 
 57 |         /**
 58 |          * Rule of five, we need to redefine it.
 59 |          */
 60 |         handle &operator=(handle &&other) noexcept {
 61 |             std::swap(items_, other.items_);
 62 |             return *this;
 63 |         }
 64 | 
 65 |         /**
 66 |          * Waits on all the events, then clears the vector
 67 |          * which results in freeing the USM allocated memory
 68 |          */
 69 |         void wait() {
 70 |             for (auto &worker: items_) {
 71 |                 worker.dev_e_.wait();
 72 |             }
 73 |             items_.clear();
 74 |         }
 75 | 
 76 |         /**
 77 |          * Waits and throws on all the events, then clears the queue
 78 |          * which results in freeing the USM allocated memory
 79 |          */
 80 |         void wait_and_throw() {
 81 |             for (auto &worker: items_) {
 82 |                 worker.dev_e_.wait_and_throw();
 83 |             }
 84 |             items_.clear();
 85 |         }
 86 | 
 87 | 
 88 |         /**
 89 |          * No copy constructor.
 90 |          */
 91 |         handle(const handle &) = delete;
 92 | 
 93 |         /**
 94 |          * No assignement operator.
 95 |          */
 96 |         handle &operator=(const handle) = delete;
 97 | 
 98 |         /**
 99 |          * We need to join all the SYCL kernels/events before freeing the memory they use.
100 |          */
101 |         ~handle() noexcept {
102 |             if (!items_.empty()) {
103 |                 std::cerr << "Destroying handled that still holds data. Did you forget to call .wait()?\n";
104 |                 for (auto &e: items_) {
105 |                     try {
106 |                         e.dev_e_.wait_and_throw();
107 |                     }
108 |                     catch (sycl::exception const &e) {
109 |                         std::cerr << "Caught asynchronous SYCL exception at handle destruction: " << e.what() << std::endl;
110 |                     }
111 |                     catch (std::exception const &e) {
112 |                         std::cerr << "Caught asynchronous STL exception at handle destruction: " << e.what() << std::endl;
113 |                     }
114 |                 }
115 |                 items_.clear();
116 |             }
117 |         }
118 |     };
119 | }


--------------------------------------------------------------------------------
/examples/hashing/include/internal/async_api.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  async_api.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Asynchronous API foir hashing functions
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <utility>
 27 | #include "common.hpp"
 28 | #include "handle.hpp"
 29 | 
 30 | namespace hash {
 31 |     /**
 32 |      * Base class for hashing
 33 |      * @tparam M
 34 |      * @tparam n_outbit
 35 |      */
 36 |     template<hash::method M, int n_outbit = 0>
 37 |     class hasher {
 38 |     private:
 39 |         runners runners_;
 40 |     public:
 41 |         explicit hasher(runners v) : runners_(std::move(v)) {}
 42 | 
 43 |         handle hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, byte *key, dword keylen) {
 44 |             size_t size = runners_.size();
 45 |             std::vector<handle_item> handles;
 46 |             handles.reserve(size);
 47 |             auto items = internal::get_hash_queue_work_item<M, n_outbit>(runners_, indata, inlen, outdata, n_batch);
 48 |             for (size_t i = 0; i < size; ++i) {
 49 |                 handles.emplace_back(internal::hash_with_data_copy<M, n_outbit>(items[i], key, keylen));
 50 |             }
 51 |             return handle(std::move(handles));
 52 |         }
 53 | 
 54 |         handle hash(const byte *indata, dword inlen, byte *outdata, dword n_batch) {
 55 |             return hash(indata, inlen, outdata, n_batch, nullptr, 0);
 56 |         }
 57 | 
 58 | 
 59 |     };
 60 | 
 61 | 
 62 |     /**
 63 |      * Blake 2B
 64 |      * @tparam n_outbit
 65 |      */
 66 |     template<int n_outbit>
 67 |     class hasher<method::blake2b, n_outbit> {
 68 | 
 69 |     private:
 70 |         hash::runners runners_;
 71 |         std::vector<usm_shared_ptr < blake2b_ctx, alloc::device>> keyed_ctxts_{};
 72 |     public:
 73 |         explicit hasher(const hash::runners &v, const byte *key, dword keylen) : runners_(v) {
 74 |             size_t size = v.size();
 75 |             keyed_ctxts_.reserve(size);
 76 | 
 77 |             for (size_t i = 0; i < size; ++i) {
 78 |                 keyed_ctxts_.emplace_back(internal::get_blake2b_ctx(runners_[i].q, key, keylen, n_outbit));
 79 |             }
 80 | 
 81 |         }
 82 | 
 83 |         handle hash(const byte *indata, dword inlen, byte *outdata, dword n_batch) {
 84 |             size_t size = runners_.size();
 85 |             std::vector<handle_item> handles;
 86 |             handles.reserve(2 * size);
 87 |             auto items = internal::get_hash_queue_work_item<method::blake2b, n_outbit>(runners_, indata, inlen, outdata, n_batch);
 88 |             for (size_t i = 0; i < size; ++i) {
 89 |                 handles.emplace_back(internal::hash_with_data_copy<method::blake2b, n_outbit>(items[i], nullptr, 0, keyed_ctxts_[i].get()));
 90 |             }
 91 |             return handle(std::move(handles));
 92 |         }
 93 |     };
 94 | 
 95 | 
 96 |     using md2 = hasher<hash::method::md2>;
 97 |     using md5 = hasher<hash::method::md5>;
 98 |     using sha1 = hasher<hash::method::sha1>;
 99 |     using sha256 = hasher<hash::method::sha256>;
100 | 
101 |     template<int n_outbit>
102 |     using keccak = hasher<hash::method::keccak, n_outbit>;
103 | 
104 |     template<int n_outbit>
105 |     using sha3 = hasher<hash::method::sha3, n_outbit>;
106 | 
107 |     template<int n_outbit>
108 |     using blake2b = hasher<hash::method::blake2b, n_outbit>;
109 | }
110 | 


--------------------------------------------------------------------------------
/examples/hashing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | #/***************************************************************************
 2 | # *
 3 | # *  Copyright (C) Codeplay Software Ltd.
 4 | # *
 5 | # *  Licensed under the Apache License, Version 2.0 (the "License");
 6 | # *  you may not use this file except in compliance with the License.
 7 | # *  You may obtain a copy of the License at
 8 | # *
 9 | # *      http://www.apache.org/licenses/LICENSE-2.0
10 | # *
11 | # *  Unless required by applicable law or agreed to in writing, software
12 | # *  distributed under the License is distributed on an "AS IS" BASIS,
13 | # *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # *  See the License for the specific language governing permissions and
15 | # *  limitations under the License.
16 | # *
17 | # *  Codeplay's SYCL-For-CUDA-Examples
18 | # *
19 | # *  CMakeLists.txt
20 | # *
21 | # *  Description:
22 | # *    CMake for hash functions
23 | # **************************************************************************/
24 | cmake_minimum_required(VERSION 3.0)
25 | project(SYCL_HASH LANGUAGES CXX)
26 | set(CMAKE_CXX_STANDARD 17)
27 | 
28 | option(VERBOSE_LIB "Adds various prints in the code" OFF)
29 | if (VERBOSE_LIB)
30 |     #message(WARNING "Verbose mode on. Did you forget it?")
31 |     add_compile_definitions(VERBOSE_HASH_LIB)
32 | endif ()
33 | 
34 | # If you're using the DPCPP compiler, these flags will be used. Set here the devies you want to target
35 | set(DPCPP_FLAGS -fsycl -fsycl-targets=spir64_x86_64,nvptx64-nvidia-cuda -Xcuda-ptxas -v -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_75 -Wno-linker-warnings)
36 | include(cmake/FindSYCL.cmake)
37 | 
38 | # Default C++ Flags for warnings and optimisation
39 | set(WARNING_FLAGS "-Wall -Wextra -Wshadow -Wdouble-promotion -fno-common -Winit-self -Wuninitialized -Wmissing-declarations -Woverloaded-virtual")
40 | set(EXTRA_W_FLAGS "-pedantic -Wall -Wextra -Wcast-align -Wctor-dtor-privacy -Wdisabled-optimization -Wformat=2 -Winit-self -Wmissing-declarations -Wmissing-include-dirs  -Woverloaded-virtual -Wredundant-decls -Wshadow -Wsign-conversion -Wsign-promo -Wstrict-overflow=5") #-Wnoexcept -Wold-style-cast -Wstrict-null-sentinel -switch-default -Wlogical-op
41 | set(DISABLED_WARNINGS "-Wno-c++20-extensions -Wno-inline-namespace-reopened-noninline -Wno-undef -Wno-unused -Wno-unused-command-line-argument")
42 | set(OPT_FLAGS "-march=native -mtune=native -Ofast -fomit-frame-pointer")
43 | 
44 | # Adding the flags to the targets
45 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${OPT_FLAGS} ${DISABLED_WARNINGS}")
46 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${EXTRA_W_FLAGS} ${DISABLED_WARNINGS} -g -Og")
47 | 
48 | include_directories(${PROJECT_BINARY_DIR} ${PROJECT_SOURCE_DIR}/include)
49 | 
50 | 
51 | set(sycl_hash_all_kernels
52 |         src/hash_functions/sha256.cpp
53 |         src/hash_functions/blake2b.cpp
54 |         src/hash_functions/sha1.cpp
55 |         src/hash_functions/md5.cpp
56 |         src/hash_functions/keccak.cpp
57 |         src/hash_functions/md2.cpp
58 |         src/tools/queue_tester.cpp
59 |         )
60 | 
61 | set(sycl_hash_all_sources
62 |         src/benchmarks/misc.hpp
63 |         include/sycl_hash.hpp
64 |         include/internal/config.hpp
65 |         include/internal/handle.hpp
66 |         include/internal/common.hpp
67 |         include/internal/determine_kernel_config.hpp
68 |         include/internal/sync_api.hpp
69 |         include/internal/async_api.hpp
70 |         include/hash_functions/sha256.hpp
71 |         include/hash_functions/blake2b.hpp
72 |         include/hash_functions/sha1.hpp
73 |         include/hash_functions/md5.hpp
74 |         include/hash_functions/keccak.hpp
75 |         include/hash_functions/md2.hpp
76 |         include/tools/fill_rand.hpp
77 |         include/tools/sycl_queue_helpers.hpp
78 |         include/tools/usm_smart_ptr.hpp
79 |         include/tools/runtime_byte_array.hpp
80 |         include/tools/intrinsics.hpp
81 |         )
82 | 
83 | add_library(sycl_hash SHARED ${sycl_hash_all_sources} ${sycl_hash_all_kernels})
84 | add_sycl_to_target(TARGET sycl_hash SOURCES ${sycl_hash_all_kernels})
85 | 
86 | add_executable(demo demo_main.cpp src/benchmarks/misc.hpp)
87 | target_link_libraries(demo PUBLIC sycl_hash)
88 | add_sycl_to_target(TARGET demo SOURCES demo_main.cpp)
89 | 
90 | include(tests/CMakeLists.txt)
91 | 


--------------------------------------------------------------------------------
/examples/hashing/demo_main.cpp:
--------------------------------------------------------------------------------
 1 | /***************************************************************************
 2 |  *
 3 |  *  Copyright (C) Codeplay Software Ltd.
 4 |  *
 5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 6 |  *  you may not use this file except in compliance with the License.
 7 |  *  You may obtain a copy of the License at
 8 |  *
 9 |  *      http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  *  Unless required by applicable law or agreed to in writing, software
12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  *  See the License for the specific language governing permissions and
15 |  *  limitations under the License.
16 |  *
17 |  *  Codeplay's SYCL-For-CUDA-Examples
18 |  *
19 |  *  demo_main.cpp
20 |  *
21 |  *  Description:
22 |  *    Main function for hashing demo
23 |  **************************************************************************/
24 | /*Copyright 2021 Codeplay Software Ltd.
25 | 
26 | Licensed under the Apache License, Version 2.0 (the "License");
27 | you may not use these files except in compliance with the License.
28 | You may obtain a copy of the License at
29 | 
30 | http://www.apache.org/licenses/LICENSE-2.0
31 | 
32 | For your convenience, a copy of the License has been included in this
33 | repository.
34 | 
35 | Unless required by applicable law or agreed to in writing, software
36 | distributed under the License is distributed on an "AS IS" BASIS,
37 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
38 | See the License for the specific language governing permissions and
39 | limitations under the License. */
40 | 
41 | 
42 | #include <sycl_hash.hpp>
43 | #include <tools/sycl_queue_helpers.hpp>
44 | #include "src/benchmarks/misc.hpp"
45 | 
46 | int main() {
47 |     size_t input_block_size = 512 * 1024; //bytes
48 |     size_t n_blocs = 1024 * 6;
49 |     size_t n_iters = 40;
50 |     auto cpu_q = try_get_queue(sycl::cpu_selector{});
51 |     auto cuda_q = try_get_queue(cuda_selector{});
52 | 
53 | //    auto ptr = (byte *) malloc(input_block_size * 100 * sizeof(byte));
54 | //    auto out = (byte *) malloc(hash::get_block_size<hash::method::sha256>() * 100 * sizeof(byte));
55 | //    double cpu_speed = benchmark_one_queue<hash::method::sha256>(cpu_q, input_block_size, 80);
56 | //    double gpu_speed = benchmark_one_queue<hash::method::sha256>(cuda_q, input_block_size, n_blocs, 5);
57 | //    hash::sha256 hasher({{cpu_q,  cpu_speed}, {cuda_q, gpu_speed}});
58 | //    auto e = hasher.hash(ptr, input_block_size, out, 100);
59 | //    hash::compute_md2(cpu_q, ptr, input_block_size, out, n_blocs);
60 | //    hash::compute_sha3<512>(cpu_q, ptr, input_block_size, out, n_blocs);
61 | 
62 | 
63 |     //GPU
64 |     run_benchmark<hash::method::keccak, 128>(cuda_q, input_block_size, n_blocs, n_iters);
65 |     run_benchmark<hash::method::sha3, 256>(cuda_q, input_block_size, n_blocs, n_iters);
66 |     run_benchmark<hash::method::md5>(cuda_q, input_block_size, n_blocs, n_iters);
67 |     run_benchmark<hash::method::blake2b, 128>(cuda_q, input_block_size, n_blocs, n_iters);
68 |     run_benchmark<hash::method::sha1>(cuda_q, input_block_size, n_blocs, n_iters);
69 |     run_benchmark<hash::method::sha256>(cuda_q, input_block_size, n_blocs, n_iters);
70 |     run_benchmark<hash::method::md2>(cuda_q, input_block_size, n_blocs, n_iters);
71 | 
72 |     //CPU
73 |     run_benchmark<hash::method::keccak, 128>(cpu_q, input_block_size, n_blocs, n_iters);
74 |     run_benchmark<hash::method::md5>(cpu_q, input_block_size, n_blocs, n_iters);
75 |     run_benchmark<hash::method::blake2b, 128>(cpu_q, input_block_size, n_blocs, n_iters);
76 |     run_benchmark<hash::method::sha1>(cpu_q, input_block_size, n_blocs, n_iters);
77 |     run_benchmark<hash::method::sha256>(cpu_q, input_block_size, n_blocs, n_iters);
78 |     run_benchmark<hash::method::md2>(cpu_q, input_block_size, n_blocs, n_iters);
79 | 
80 | 
81 |     // CPU == GPU ??
82 |     compare_two_devices<hash::method::sha256>(cuda_q, cpu_q, 1024, 4096);
83 |     compare_two_devices<hash::method::keccak, 128>(cuda_q, cpu_q, 1024, 4096);
84 |     compare_two_devices<hash::method::md2>(cuda_q, cpu_q, 1024, 4096);
85 |     compare_two_devices<hash::method::md5>(cuda_q, cpu_q, 1024, 4096);
86 |     compare_two_devices<hash::method::sha1>(cuda_q, cpu_q, 1024, 4096);
87 |     compare_two_devices<hash::method::blake2b, 128>(cuda_q, cpu_q, 1024, 4096);
88 | }
89 | 
90 | 


--------------------------------------------------------------------------------
/examples/hashing/tests/tests_helpers.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  test_helpers.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Helper functions for tests
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <sycl_hash.hpp>
 27 | #include <iomanip>
 28 | #include <iostream>
 29 | #include <cstring>
 30 | #include <tools/sycl_queue_helpers.hpp>
 31 | 
 32 | template<bool strict = true>
 33 | static inline sycl::queue try_get_queue_with_device(const sycl::device &in_dev) {
 34 |     auto exception_handler = [](const sycl::exception_list &exceptions) {
 35 |         for (std::exception_ptr const &e: exceptions) {
 36 |             try {
 37 |                 std::rethrow_exception(e);
 38 |             }
 39 |             catch (sycl::exception const &e) {
 40 |                 std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl;
 41 |             }
 42 |             catch (std::exception const &e) {
 43 |                 std::cout << "Caught asynchronous STL exception: " << e.what() << std::endl;
 44 |             }
 45 |         }
 46 |     };
 47 | 
 48 |     sycl::device dev;
 49 |     sycl::queue q;
 50 |     try {
 51 |         dev = in_dev;
 52 |         q = sycl::queue(dev, exception_handler);
 53 |         if constexpr (strict) {
 54 |             if (dev.is_cpu() || dev.is_gpu()) { //Only CPU and GPU not host, dsp, fpga, ?...
 55 |                 queue_tester(q);
 56 |             }
 57 |         }
 58 |     }
 59 |     catch (...) {
 60 |         dev = sycl::device(sycl::host_selector());
 61 |         q = sycl::queue(dev, exception_handler);
 62 |         std::cout << "Warning: Expected device not found! Fall back on: " << dev.get_info<sycl::info::device::name>() << std::endl;
 63 |     }
 64 |     return q;
 65 | }
 66 | 
 67 | 
 68 | void print_hex(byte *ptr, dword len) {
 69 |     for (size_t i = 0; i < len; ++i) // only the first block
 70 |         std::cout << std::hex << std::setfill('0') << std::setw(2) << (int) (ptr[i]) << " ";
 71 |     std::cout << std::dec << std::endl << std::endl;
 72 | }
 73 | 
 74 | void duplicate(byte *in, byte *out, dword item_len, dword count) {
 75 |     for (size_t i = 0; i < count; ++i) {
 76 |         std::memcpy(out + item_len * i, in, item_len);
 77 |     }
 78 | }
 79 | 
 80 | 
 81 | std::vector<sycl::queue> get_all_queues_once() {
 82 |     std::vector<sycl::device> devices1 = sycl::device::get_devices();
 83 |     std::vector<sycl::queue> queues1;
 84 |     std::for_each(devices1.begin(), devices1.end(), [&](auto &d) { queues1.emplace_back(try_get_queue_with_device(d)); });
 85 |     return queues1;
 86 | }
 87 | 
 88 | 
 89 | std::vector<sycl::queue> get_all_queues() {
 90 |     static std::vector<sycl::queue> queues = get_all_queues_once();
 91 |     return queues;
 92 | }
 93 | 
 94 | 
 95 | template<typename Func>
 96 | void for_all_workers(Func f) {
 97 |     static auto queues = get_all_queues();
 98 |     {
 99 |         for (const auto &q: queues) {
100 |             std::cout << "Running on: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
101 |             f(hash::runners(1, hash::runner{q, 1}));
102 |         }
103 |     }
104 | }
105 | 
106 | 
107 | template<typename Func>
108 | void for_all_workers_pairs(Func f) {
109 |     auto queues = get_all_queues();
110 |     for (const auto &q1: queues) {
111 |         for (const auto &q2: queues) {
112 |             std::cout << "Running on: " << q1.get_device().get_info<sycl::info::device::name>() << " and: " << q2.get_device().get_info<sycl::info::device::name>() << std::endl;
113 |             f({{q1, 1},
114 |                {q2, 1}});
115 |         }
116 | 
117 |     }
118 | 
119 | }
120 | 


--------------------------------------------------------------------------------
/examples/hashing/doc/README.md:
--------------------------------------------------------------------------------
 1 | # usm_smart_ptr wrappers
 2 | 
 3 | We provide in `include/tools/usm_smart_ptr.hpp` overloads to `std::unique_ptr` and `std::shared_ptr` that manages SYCL's USM memory.
 4 | 
 5 | ```c++
 6 | using namespace usm_smart_ptr;
 7 | sbb::queue q;
 8 | size_t count;
 9 | auto usm_unique_ptr = make_unique_ptr<my_type, alloc::shared>(count, q); // could return a pointer to sbb::malloc_shared<my_type>(count, q);
10 | auto usm_shared_ptr = make_shared_ptr<my_type, alloc::shared>(count, q);
11 | ```
12 | 
13 | You can choose between `alloc::device`, `alloc::shared` and `alloc::host`. Calling `.get()` on the pointers will return a decorated pointer to the underlying memory which allow to keep track of where the memory was
14 | allocated. It's just compile-time type safety. These decorated types allow to further construct the types :
15 | 
16 | * `device_accessible_ptr<byte>`
17 | * `host_accessible_ptr<byte>`
18 | 
19 | This prevents the following (potentially incorrect) code from compiling:
20 | 
21 | ```c++
22 | // A function
23 | void fill_on_host(host_accessible_ptr<flloat> ptr, size_t size){
24 | float* raw_ptr = (float*) ptr;
25 |     //do your thing on the host
26 | }
27 | 
28 | // later
29 | auto device_memory = make_shared_ptr<flloat, alloc::device>(1024, q);
30 | 
31 | // attempt to fill the memory
32 | fill_on_host(device_memory.get(), 1024); // Won't compile, we're saved!
33 | ```
34 | 
35 | # API
36 | 
37 | ## Computing hashes
38 | 
39 | The number of inputs and template arguments to the hashing function depends on the type. Any missing or extra argument passed will result in compilation failure.
40 | 
41 | ```C++
42 | hash::compute<hash::method::blake2b, n_outbit>(queue, input_ptr, input_block_size, output_hashes, n_blocs, key_ptr, key_size);
43 | hash::compute<hash::method::keccak, n_outbit>(queue, input_ptr, input_block_size, output_hashes, n_blocs);
44 | hash::compute<hash::method::sha3, n_outbit>(queue, input_ptr, input_block_size, output_hashes, n_blocs;
45 | hash::compute<hash::method::sha1>(queue, input_ptr, input_block_size, output_hashes, n_blocs,);
46 | hash::compute<hash::method::sha256>(queue, input_ptr, input_block_size, output_hashes, n_blocs);
47 | hash::compute<hash::method::md2>(queue, input_ptr, input_block_size, output_hashes, n_blocs);
48 | hash::compute<hash::method::md5>(queue, input_ptr, input_block_size, output_hashes, n_blocs);
49 | ```
50 | 
51 | We'll consider the `blake2b` method in the rest. For each method we got two overloads :
52 | 
53 | ### 1. Implicit memory copy
54 | 
55 | ```
56 | hash::compute<hash::method::blake2b, n_outbit>(sbb::queue &q, const byte*, dword, byte*, dword, byte *, dword);
57 | ```
58 | 
59 | This is the overload you would call if you got C++ allocated pointers to your memory (array on the stack, malloc, new[], ...). When calling this function, the memory will be copied behind the scenes to the device as it's
60 | the safets behaviour.
61 | 
62 | ### 2. No memory copy
63 | 
64 | If you wrap your memory pointers in `device_accessible_ptr<byte>` (see `include/tools/usm_smart_ptr.hpp`), then the library will assume these points to a memory that is accessible by the `sbb::device` you build
65 | your `sbb::queue` on.
66 | 
67 | ```
68 | hash::compute<hash::method::blake2b, n_outbit>(sbb::queue &q, const device_accessible_ptr<byte>, dword, device_accessible_ptr<byte>, dword, const byte *, dword);
69 | ```
70 | 
71 | Best, this overload will be called if you use the previously described `usm_smart_ptr wrappers` with `alloc::device` or `alloc::shared`. We voluntarily excluded `alloc::host` as the remote memory accesses could
72 | potentially cause performanec issues.
73 | 
74 | ## Hash functions querying
75 | 
76 | The following `constexpr` function returns the length, in bytes of a hash produced by the method.
77 | 
78 | ```c++
79 | hash::get_block_size<hash::method::keccak, 128>();
80 | ```
81 | 
82 | You can also query the name with:
83 | 
84 | ```c++
85 | hash::get_name<hash::method::keccak, 128>()
86 | ```
87 | 
88 | r
89 | 
90 | # Kernel work group size formula
91 | 
92 | The nd_range sizes are computed in `include/determine_kernel_config.hpp`. When running on a CPU we'll try to make twice as much work groups as you've got execution threads on your system as, with OpenCL, each work group
93 | seems to be executed on one CPU thread. When running on the GPU we'll try to make work groups that contains 64 work items each. Going above 64 seems to decrease performance. This behaviour might not be optimal and should
94 | be customised to fit your SYCL implementation. 
95 | 


--------------------------------------------------------------------------------
/examples/sgemm_interop/sycl_sgemm_usm.cpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sycl_sgemm_usm.cpp
 20 |  *
 21 |  *  Description:
 22 |  *    SGEMM operation in SYCL with USM
 23 |  **************************************************************************/
 24 | #include <algorithm>
 25 | #include <iostream>
 26 | #include <vector>
 27 | 
 28 | #include <sycl/sycl.hpp>
 29 | 
 30 | #include <cublas_v2.h>
 31 | #include <cuda.h>
 32 | 
 33 | #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
 34 | 
 35 | void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
 36 |   if (status != CUBLAS_STATUS_SUCCESS) {
 37 |     std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
 38 |     exit(EXIT_FAILURE);
 39 |   }
 40 | }
 41 | 
 42 | void inline checkCudaErrorMsg(cudaError status, const char *msg) {
 43 |   if (status != cudaSuccess) {
 44 |     std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
 45 |     exit(EXIT_FAILURE);
 46 |   }
 47 | }
 48 | 
 49 | int main() {
 50 |   using namespace sycl;
 51 | 
 52 |   constexpr size_t WIDTH = 1024;
 53 |   constexpr size_t HEIGHT = 1024;
 54 |   constexpr float ALPHA = 1.0f;
 55 |   constexpr float BETA = 0.0f;
 56 | 
 57 |   std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
 58 |       h_C(WIDTH * HEIGHT);
 59 | 
 60 |   std::cout << "Size: " << h_C.size() << std::endl;
 61 | 
 62 |   // A is an identity matrix
 63 |   std::fill(std::begin(h_A), std::end(h_A), 0.0f);
 64 |   for (size_t i = 0; i < WIDTH; i++) {
 65 |     h_A[i * WIDTH + i] = 1.0f;
 66 |   }
 67 | 
 68 |   // B is a matrix fill with 1
 69 |   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 70 | 
 71 |   sycl::queue q{[](auto &d) {
 72 |     return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
 73 |   }};
 74 | 
 75 |   // Allocate memory on the device
 76 |   float *d_A = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
 77 |   float *d_B = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
 78 |   float *d_C = sycl::malloc_device<float>(WIDTH * HEIGHT, q);
 79 | 
 80 |   // Copy matrices A & B to device from host vectors
 81 |   const size_t numBytes = WIDTH * HEIGHT * sizeof(float);
 82 |   q.memcpy(d_A, h_A.data(), numBytes).wait();
 83 |   q.memcpy(d_B, h_B.data(), numBytes).wait();
 84 | 
 85 |   // Create cublas handle
 86 |   cublasHandle_t handle;
 87 |   CHECK_ERROR(cublasCreate(&handle));
 88 | 
 89 |   q.submit([&](handler &h) {
 90 |      h.host_task([=](sycl::interop_handle ih) {
 91 |        // Set the correct cuda context & stream
 92 |        cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
 93 |        auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
 94 |        cublasSetStream(handle, cuStream);
 95 | 
 96 |        // Call generalised matrix-matrix multiply
 97 |        CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
 98 |                                WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
 99 |                                d_C, WIDTH));
100 |        cuStreamSynchronize(cuStream);
101 |      });
102 |    }).wait();
103 | 
104 |   // Copy the result back to host
105 |   q.memcpy(h_C.data(), d_C, numBytes).wait();
106 | 
107 |   // C must be all ones
108 |   int i = 0;
109 |   const bool allEqual =
110 |       std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
111 |         ++i;
112 |         if (num != 1) {
113 |           std::cout << i << " Not one : " << num << std::endl;
114 |         }
115 |         return num == 1;
116 |       });
117 | 
118 |   if (!allEqual) {
119 |     std::cout << " Incorrect result " << std::endl;
120 |   } else {
121 |     std::cout << " Correct! " << std::endl;
122 |   }
123 | 
124 |   CHECK_ERROR(cublasDestroy(handle));
125 | 
126 |   return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
127 | }
128 | 


--------------------------------------------------------------------------------
/examples/sgemm_interop/sycl_sgemm.cpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sycl_sgemm.cpp
 20 |  *
 21 |  *  Description:
 22 |  *    SGEMM operation in SYCL with buffers
 23 |  **************************************************************************/
 24 | #include <algorithm>
 25 | #include <iostream>
 26 | #include <vector>
 27 | 
 28 | #include <sycl/sycl.hpp>
 29 | 
 30 | #include <cublas_v2.h>
 31 | #include <cuda.h>
 32 | 
 33 | #define CHECK_ERROR(FUNC) checkCudaErrorMsg(FUNC, " " #FUNC)
 34 | 
 35 | void inline checkCudaErrorMsg(cublasStatus_t status, const char *msg) {
 36 |   if (status != CUBLAS_STATUS_SUCCESS) {
 37 |     std::cout << "ERROR CUBLAS:" << msg << " - " << status << std::endl;
 38 |     exit(EXIT_FAILURE);
 39 |   }
 40 | }
 41 | 
 42 | void inline checkCudaErrorMsg(cudaError status, const char *msg) {
 43 |   if (status != cudaSuccess) {
 44 |     std::cout << "ERROR CUDA: " << msg << " - " << status << std::endl;
 45 |     exit(EXIT_FAILURE);
 46 |   }
 47 | }
 48 | 
 49 | int main() {
 50 |   using namespace sycl;
 51 | 
 52 |   constexpr size_t WIDTH = 1024;
 53 |   constexpr size_t HEIGHT = 1024;
 54 |   constexpr float ALPHA = 1.0f;
 55 |   constexpr float BETA = 0.0f;
 56 | 
 57 |   std::vector<float> h_A(WIDTH * HEIGHT), h_B(WIDTH * HEIGHT),
 58 |       h_C(WIDTH * HEIGHT);
 59 | 
 60 |   std::cout << "Size: " << h_C.size() << std::endl;
 61 | 
 62 |   // A is an identity matrix
 63 |   std::fill(std::begin(h_A), std::end(h_A), 0.0f);
 64 |   for (size_t i = 0; i < WIDTH; i++) {
 65 |     h_A[i * WIDTH + i] = 1.0f;
 66 |   }
 67 | 
 68 |   // B is a matrix fill with 1
 69 |   std::fill(std::begin(h_B), std::end(h_B), 1.0f);
 70 | 
 71 |   sycl::queue q{[](auto &d) {
 72 |     return (d.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda);
 73 |   }};
 74 | 
 75 |   cublasHandle_t handle;
 76 |   CHECK_ERROR(cublasCreate(&handle));
 77 | 
 78 |   {
 79 |     buffer<float, 2> b_A{h_A.data(), range<2>{WIDTH, HEIGHT}};
 80 |     buffer<float, 2> b_B{h_B.data(), range<2>{WIDTH, HEIGHT}};
 81 |     buffer<float, 2> b_C{h_C.data(), range<2>{WIDTH, HEIGHT}};
 82 | 
 83 |     q.submit([&](handler &h) {
 84 |       auto d_A = b_A.get_access<sycl::access::mode::read>(h);
 85 |       auto d_B = b_B.get_access<sycl::access::mode::read>(h);
 86 |       auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 87 | 
 88 |       h.host_task([=](sycl::interop_handle ih) {
 89 |         // Set the correct cuda context & stream
 90 | 	cuCtxSetCurrent(ih.get_native_context<backend::ext_oneapi_cuda>());
 91 |         auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
 92 |         cublasSetStream(handle, cuStream);
 93 |         auto cuA = reinterpret_cast<float *>(
 94 |             ih.get_native_mem<backend::ext_oneapi_cuda>(d_A));
 95 |         auto cuB = reinterpret_cast<float *>(
 96 |             ih.get_native_mem<backend::ext_oneapi_cuda>(d_B));
 97 |         auto cuC = reinterpret_cast<float *>(
 98 |             ih.get_native_mem<backend::ext_oneapi_cuda>(d_C));
 99 | 
100 |         CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
101 |                                 WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
102 |                                 cuC, WIDTH));
103 |         cuStreamSynchronize(cuStream);
104 |       });
105 |     });
106 |   }
107 | 
108 |   // C must be all ones
109 |   int i = 0;
110 |   const bool allEqual =
111 |       std::all_of(std::begin(h_C), std::end(h_C), [&i](float num) {
112 |         ++i;
113 |         if (num != 1) {
114 |           std::cout << i << " Not one : " << num << std::endl;
115 |         }
116 |         return num == 1;
117 |       });
118 | 
119 |   if (!allEqual) {
120 |     std::cout << " Incorrect result " << std::endl;
121 |   } else {
122 |     std::cout << " Correct! " << std::endl;
123 |   }
124 | 
125 |   CHECK_ERROR(cublasDestroy(handle));
126 | 
127 |   return allEqual ? EXIT_SUCCESS : EXIT_FAILURE;
128 | }
129 | 


--------------------------------------------------------------------------------
/examples/vector_addition/vector_addition.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2022 Tom Papatheodore
  2 | 
  3 | // Permission is hereby granted, free of charge, to any person obtaining a copy
  4 | // of this software and associated documentation files (the "Software"), to deal
  5 | // in the Software without restriction, including without limitation the rights
  6 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7 | // copies of the Software, and to permit persons to whom the Software is
  8 | // furnished to do so, subject to the following conditions:
  9 | 
 10 | // The above copyright notice and this permission notice shall be included in all
 11 | // copies or substantial portions of the Software.
 12 | 
 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 15 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 16 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 17 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 18 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 19 | // SOFTWARE.
 20 | 
 21 | #include <stdio.h>
 22 | 
 23 | // Macro for checking errors in GPU API calls
 24 | #define gpuErrorCheck(call)                                                                  \
 25 | do{                                                                                          \
 26 |     cudaError_t gpuErr = call;                                                               \
 27 |     if(cudaSuccess != gpuErr){                                                               \
 28 |         printf("GPU Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(gpuErr)); \
 29 |         exit(1);                                                                             \
 30 |     }                                                                                        \
 31 | }while(0)
 32 | 
 33 | // Size of array
 34 | #define N 1048576
 35 | 
 36 | // Kernel
 37 | __global__ void vector_addition(double *a, double *b, double *c)
 38 | {
 39 |     int id = blockDim.x * blockIdx.x + threadIdx.x;
 40 |     if(id < N) c[id] = a[id] + b[id];
 41 | }
 42 | 
 43 | // Main program
 44 | int main()
 45 | {
 46 |     // Number of bytes to allocate for N doubles
 47 |     size_t bytes = N*sizeof(double);
 48 | 
 49 |     // Allocate memory for arrays A, B, and C on host
 50 |     double *A = (double*)malloc(bytes);
 51 |     double *B = (double*)malloc(bytes);
 52 |     double *C = (double*)malloc(bytes);
 53 | 
 54 |     // Allocate memory for arrays d_A, d_B, and d_C on device
 55 |     double *d_A, *d_B, *d_C;
 56 |     gpuErrorCheck( cudaMalloc(&d_A, bytes) );	
 57 |     gpuErrorCheck( cudaMalloc(&d_B, bytes) );
 58 |     gpuErrorCheck( cudaMalloc(&d_C, bytes) );
 59 | 
 60 |     // Fill host arrays A, B, and C
 61 |     for(int i=0; i<N; i++)
 62 |     {
 63 |         A[i] = 1.0;
 64 |         B[i] = 2.0;
 65 |         C[i] = 0.0;
 66 |     }
 67 | 
 68 |     // Copy data from host arrays A and B to device arrays d_A and d_B
 69 |     gpuErrorCheck( cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice) );
 70 |     gpuErrorCheck( cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice) );
 71 | 
 72 |     // Set execution configuration parameters
 73 |     //      thr_per_blk: number of GPU threads per grid block
 74 |     //      blk_in_grid: number of blocks in grid
 75 |     int thr_per_blk = 128;
 76 |     int blk_in_grid = ceil( float(N) / thr_per_blk );
 77 | 
 78 |     // Launch kernel
 79 |     vector_addition<<<blk_in_grid, thr_per_blk>>>(d_A, d_B, d_C);
 80 | 
 81 |     // Check for synchronous errors during kernel launch (e.g. invalid execution configuration paramters)
 82 |     gpuErrorCheck( cudaGetLastError() );
 83 | 
 84 |     // Check for asynchronous errors during GPU execution (after control is returned to CPU)
 85 |     gpuErrorCheck( cudaDeviceSynchronize() );
 86 | 
 87 |     // Copy data from device array d_C to host array C
 88 |     gpuErrorCheck( cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost) );
 89 | 
 90 |     // Verify results
 91 |     double tolerance = 1.0e-14;
 92 |     for(int i=0; i<N; i++)
 93 |     {
 94 |         if( fabs(C[i] - 3.0) > tolerance )
 95 |         { 
 96 |             printf("Error: value of C[%d] = %f instead of 3.0\n", i, C[i]);
 97 |             exit(1);
 98 |         }
 99 |     }	
100 | 
101 |     // Free CPU memory
102 |     free(A);
103 |     free(B);
104 |     free(C);
105 | 
106 |     // Free GPU memory
107 |     gpuErrorCheck( cudaFree(d_A) );
108 |     gpuErrorCheck( cudaFree(d_B) );
109 |     gpuErrorCheck( cudaFree(d_C) );
110 | 
111 |     printf("\n---------------------------\n");
112 |     printf("__SUCCESS__\n");
113 |     printf("---------------------------\n");
114 |     printf("N                 = %d\n", N);
115 |     printf("Threads Per Block = %d\n", thr_per_blk);
116 |     printf("Blocks In Grid    = %d\n", blk_in_grid);
117 |     printf("---------------------------\n\n");
118 | 
119 |     return 0;
120 | }
121 | 


--------------------------------------------------------------------------------
/setup-script/sample/include/usm_smart_ptr.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <sycl/sycl.hpp>
  4 | #include <memory>
  5 | #include <utility>
  6 | #include <type_traits>
  7 | 
  8 | //#include <concepts>
  9 | 
 10 | namespace usm_smart_ptr {
 11 |     using namespace sycl::usm;
 12 | 
 13 |     template<class T, sycl::usm::alloc Tag>
 14 |     /* template <sycl::usm::alloc location>
 15 |     concept host_accessible = location == sycl::usm::alloc::host || location == sycl::usm::alloc::shared; */
 16 |     struct usm_ptr {
 17 |         explicit usm_ptr(T *t) : val_(t) {}
 18 | 
 19 |         operator T *() const noexcept { return val_; }
 20 | 
 21 |     private:
 22 |         T *val_;
 23 |     };
 24 | 
 25 | 
 26 | /**
 27 |  * SYCL USM Deleter. The std::unique_ptr deleter takes only the pointer
 28 |  * to delete as an argument so that's the only work-around.
 29 |  */
 30 |     template<typename T>
 31 |     struct usm_deleter {
 32 |         sycl::queue q_;
 33 | 
 34 |         explicit usm_deleter(sycl::queue q) : q_(std::move(q)) {}
 35 | 
 36 |         void operator()(T *ptr) const noexcept {
 37 |             if (ptr)
 38 |                 sycl::free(ptr, q_);
 39 |         }
 40 |     };
 41 | 
 42 | /**
 43 |  * Wrapper for a std::unique_ptr that calls the SYCL deleter (sycl::free).
 44 |  * Also holds the number of elements allocated.
 45 |  */
 46 |     template<typename T, sycl::usm::alloc location>
 47 |     class usm_unique_ptr : public std::unique_ptr<T, usm_deleter<T>> {
 48 |     private:
 49 |         size_t count_;
 50 |     public:
 51 |         usm_unique_ptr(T *ptr, usm_deleter<T> deleter, size_t count)
 52 |                 : std::unique_ptr<T, usm_deleter<T>>(ptr, deleter) { count_ = count; }
 53 | 
 54 |         [[nodiscard]] inline size_t size() const noexcept { return count_ * sizeof(T); }
 55 | 
 56 |         [[nodiscard]] inline size_t count() const noexcept { return count_; }
 57 | 
 58 |         [[nodiscard]] inline usm_ptr<T, location> get() const noexcept {
 59 |             return usm_ptr<T, location>(std::unique_ptr<T, usm_deleter<T>>::get());
 60 |         }
 61 | 
 62 |     };
 63 | 
 64 | /**
 65 |  * Builds a usm_unique_ptr pointer
 66 |  * @tparam location indicates where is the memory allocated (device, host, or shared)
 67 |  */
 68 |     template<typename T, sycl::usm::alloc location>
 69 |     usm_unique_ptr<T, location> make_unique_ptr(size_t count, sycl::queue &q) {
 70 |         //return usm_unique_ptr<T>(sycl::usm_allocator < T, location > {q}.allocate(count), usm_deleter<T>{q}, count);
 71 |         if constexpr(location == alloc::shared)
 72 |             return usm_unique_ptr<T, location>(sycl::malloc_shared<T>(count, q), usm_deleter<T>{q}, count);
 73 |         else if constexpr(location == alloc::host)
 74 |             return usm_unique_ptr<T, location>(sycl::malloc_host<T>(count, q), usm_deleter<T>{q}, count);
 75 |         else if constexpr(location == alloc::device)
 76 |             return usm_unique_ptr<T, location>(sycl::malloc_device<T>(count, q), usm_deleter<T>{q}, count);
 77 |         else static_assert(!std::is_same_v<T, T>, "Invalid template parameter.");
 78 |     }
 79 | 
 80 | 
 81 |     template<typename T, sycl::usm::alloc location>
 82 |     usm_unique_ptr<T, location> make_unique_ptr(sycl::queue &q) {
 83 |         return make_unique_ptr<T, location>(1, q);
 84 |     }
 85 | 
 86 | 
 87 | /**
 88 |  * Same interface as usm_unique_ptr
 89 |  * @tparam T
 90 |  */
 91 |     template<typename T, sycl::usm::alloc location>
 92 |     class usm_shared_ptr : public std::shared_ptr<T> {
 93 |     private:
 94 |         size_t count_;
 95 | 
 96 |     public:
 97 |         usm_shared_ptr(T *ptr, usm_deleter<T> deleter, size_t count) : std::shared_ptr<T>(ptr,
 98 |                                                                                           deleter) { count_ = count; }
 99 | 
100 |         [[nodiscard]] inline size_t size() const noexcept { return count_ * sizeof(T); }
101 | 
102 |         [[nodiscard]] inline size_t count() const noexcept { return count_; }
103 | 
104 |         [[nodiscard]] inline usm_ptr<T, location> get() const noexcept {
105 |             return usm_ptr<T, location>(std::shared_ptr<T>::get());
106 |         }
107 | 
108 |     };
109 | 
110 |     template<typename T, sycl::usm::alloc location>
111 |     usm_shared_ptr<T, location> make_shared_ptr(size_t count, sycl::queue &q) {
112 |         //return usm_shared_ptr<T>(sycl::usm_allocator < T, location > {q}.allocate(count), usm_deleter<T>{q}, count);
113 |         if constexpr(location == alloc::shared)
114 |             return usm_shared_ptr<T, location>(sycl::malloc_shared<T>(count, q), usm_deleter<T>{q}, count);
115 |         else if constexpr(location == alloc::host)
116 |             return usm_shared_ptr<T, location>(sycl::malloc_host<T>(count, q), usm_deleter<T>{q}, count);
117 |         else if constexpr(location == alloc::device)
118 |             return usm_shared_ptr<T, location>(sycl::malloc_device<T>(count, q), usm_deleter<T>{q}, count);
119 |         else static_assert(!std::is_same_v<T, T>, "Invalid template parameter.");
120 |     }
121 | 
122 |     template<typename T, sycl::usm::alloc location>
123 |     usm_shared_ptr<T, location> make_sycl_shared(sycl::queue &q) {
124 |         return make_shared_ptr<T, location>(1, q);
125 |     }
126 | }


--------------------------------------------------------------------------------
/setup-script/build_minimal.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | export DPCPP_HOME=~/sycl_workspace
  3 | export CUDA_ROOT=/usr/local/cuda-10.2
  4 | export LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib/:$DPCPP_HOME/deploy/lib64/:$DPCPP_HOME/lapack/install/lib64/:$DPCPP_HOME/OpenCL-ICD-Loader/install/lib64:$CUDA_ROOT/lib:$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
  5 | export PATH=$DPCPP_HOME/deploy/bin/:$CUDA_ROOT/bin:$PATH
  6 | 
  7 | mkdir -p $DPCPP_HOME
  8 | cd $DPCPP_HOME
  9 | mkdir -p deploy
 10 | 
 11 | #export LD_PRELOAD=/opt/intel/opencl/libOpenCL.so.1
 12 | 
 13 | run_test=false
 14 | cmake_test="OFF"
 15 | 
 16 | if [[ -z "$DPCPP_TESTS" ]]; then
 17 |   echo "Not testing"
 18 | else
 19 |   echo "testing"
 20 |   run_test=true
 21 |   cmake_test="ON"
 22 | fi
 23 | 
 24 | export CXXFLAGS="${CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=1"
 25 | 
 26 | # OpenCL headers+ICD
 27 | cd $DPCPP_HOME
 28 | (if cd OpenCL-Headers; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-Headers.git; fi)
 29 | (if cd OpenCL-ICD-Loader; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; fi)
 30 | cd OpenCL-ICD-Loader
 31 | mkdir -p build
 32 | cd build
 33 | cmake \
 34 |   -DOPENCL_ICD_LOADER_HEADERS_DIR=$DPCPP_HOME/OpenCL-Headers \
 35 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/OpenCL-ICD-Loader/install \
 36 |   ..
 37 | make install -j $(nproc)
 38 | 
 39 | #sycl compiler with cuda
 40 | cd $DPCPP_HOME
 41 | (if cd llvm; then git pull; else git clone https://github.com/intel/llvm.git -b sycl; fi)
 42 | cd llvm
 43 | python3 ./buildbot/configure.py \
 44 |   --cuda \
 45 |   -t release \
 46 |   --cmake-opt="-DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy" \
 47 |   --cmake-opt="-DCUDA_SDK_ROOT_DIR=$CUDA_ROOT" \
 48 |   --cmake-opt="-DLLVM_BINUTILS_INCDIR=/usr/local/include" \
 49 |   --cmake-opt="-DLLVM_ENABLE_PROJECTS=clang;sycl;llvm-spirv;opencl;libdevice;xpti;xptifw;libclc;openmp" \
 50 |   --cmake-opt="-DLLVM_BUILD_TESTS=$cmake_test" \
 51 |   --cmake-opt="-DCMAKE_CXX_STANDARD=17" \
 52 |   --cmake-opt="-DLLVM_ENABLE_LTO=off" \
 53 |   --cmake-opt="-DLLVM_ENABLE_LLD=ON" \
 54 |   --cmake-opt="-Wno-dev"
 55 | cd build
 56 | ninja install -j $(nproc)
 57 | if $run_test; then
 58 |   echo "testing llvm"
 59 |   ninja check -j $(nproc)
 60 | fi
 61 | 
 62 | #Lapack Reference
 63 | cd $DPCPP_HOME
 64 | (if cd lapack; then git pull; else git clone https://github.com/Reference-LAPACK/lapack.git; fi)
 65 | cd lapack/
 66 | mkdir -p build
 67 | cd build/
 68 | cmake \
 69 |   -DBUILD_SHARED_LIBS=ON \
 70 |   -DCBLAS=ON \
 71 |   -DLAPACKE=ON \
 72 |   -DBUILD_TESTING=$cmake_test \
 73 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/lapack/install \
 74 |   ..
 75 | cmake --build . -j $(nproc) --target install
 76 | if $run_test; then
 77 |   cmake --build . -j $(nproc) --target test
 78 | fi
 79 | 
 80 | #oneTBB
 81 | cd $DPCPP_HOME
 82 | (if cd oneTBB; then git pull; else git clone https://github.com/oneapi-src/oneTBB.git; fi)
 83 | cd oneTBB
 84 | mkdir -p build
 85 | cd build
 86 | cmake -GNinja \
 87 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
 88 |   -DCMAKE_BUILD_TYPE=Release \
 89 |   -DTBB_STRICT=OFF \
 90 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
 91 |   -DTBB_TEST=$cmake_test \
 92 |   ..
 93 | ninja install -j $(nproc)
 94 | if $run_test; then
 95 |   ninja test -j $(nproc)
 96 | fi
 97 | 
 98 | #oneMKL
 99 | cd $DPCPP_HOME
100 | (if cd oneMKL; then git pull; else git clone https://github.com/oneapi-src/oneMKL.git; fi)
101 | cd oneMKL
102 | mkdir -p build
103 | cd build
104 | cmake -GNinja \
105 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
106 |   -DCMAKE_BUILD_TYPE=Release \
107 |   -DCMAKE_CXX_STANDARD=17 \
108 |   -DTARGET_DOMAINS=blas \
109 |   -DENABLE_MKLGPU_BACKEND=OFF \
110 |   -DENABLE_CURAND_BACKEND=OFF \
111 |   -DENABLE_MKLCPU_BACKEND=OFF \
112 |   -DENABLE_CUBLAS_BACKEND=ON \
113 |   -DENABLE_NETLIB_BACKEND=ON \
114 |   -DREF_BLAS_ROOT=$DPCPP_HOME/lapack/install \
115 |   -DNETLIB_ROOT=$DPCPP_HOME/lapack/install \
116 |   -DOPENCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
117 |   -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT \
118 |   -DSYCL_LIBRARY=$DPCPP_HOME/deploy/lib/libsycl.so \
119 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
120 |   -DBUILD_FUNCTIONAL_TESTS=$cmake_test \
121 |   ..
122 | ninja install -j $(nproc)
123 | if $run_test; then
124 |   ninja test -j $(nproc)
125 | fi
126 | 
127 | #oneDNN
128 | cd $DPCPP_HOME
129 | (if cd oneDNN; then git pull; else git clone https://github.com/oneapi-src/oneDNN.git; fi)
130 | cd oneDNN
131 | mkdir -p build
132 | cd build
133 | cmake -GNinja \
134 |   -DCMAKE_C_COMPILER=$DPCPP_HOME/deploy/bin/clang \
135 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
136 |   -DCMAKE_BUILD_TYPE=Release \
137 |   -DCMAKE_CXX_STANDARD=17 \
138 |   -DDNNL_INSTALL_MODE=BUNDLE \
139 |   -DDNNL_CPU_RUNTIME=DPCPP \
140 |   -DDNNL_GPU_RUNTIME=DPCPP \
141 |   -DDNNL_GPU_VENDOR=NVIDIA \
142 |   -DTBBROOT=$DPCPP_HOME/deploy \
143 |   -DCUDA_SDK_ROOT_DIR=$CUDA_ROOT \
144 |   -DOPENCLROOT=$DPCPP_HOME/OpenCL-ICD-Loader/install \
145 |   -DOpenCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
146 |   -DCUBLAS_LIBRARY=$CUDA_ROOT/lib64/libcublas.so \
147 |   -DCUBLAS_INCLUDE_DIR=$CUDA_ROOT/include \
148 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy \
149 |   -DDNNL_BUILD_TESTS=$cmake_test \
150 |   ..
151 | ninja install -j $(nproc)
152 | if $run_test; then
153 |   LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib ninja test -j $(nproc)
154 | fi
155 | 


--------------------------------------------------------------------------------
/examples/hashing/include/tools/usm_smart_ptr.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  usm_smart_ptr.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    SYCL-compatible smart pointer implementation
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <sycl/sycl.hpp>
 27 | #include "../internal/config.hpp"
 28 | #include <memory>
 29 | #include <utility>
 30 | #include <type_traits>
 31 | 
 32 | #ifdef USING_COMPUTECPP
 33 | namespace cl::sycl::usm {
 34 |     using cl::sycl::experimental::usm::alloc;
 35 | }
 36 | #endif
 37 | 
 38 | 
 39 | #include "intrinsics.hpp"
 40 | 
 41 | namespace usm_smart_ptr {
 42 |     using namespace sycl::usm;
 43 | 
 44 |     template<class T, sycl::usm::alloc Tag>
 45 |     struct usm_ptr {
 46 |     private:
 47 |         T *val_;
 48 |     public:
 49 |         explicit usm_ptr(T *t) : val_(t) {}
 50 | 
 51 |         operator T *() const noexcept { return val_; }
 52 |     };
 53 | 
 54 | 
 55 |     template<typename T>
 56 |     struct device_accessible_ptr {
 57 |     private:
 58 |         const T *val_;
 59 |     public:
 60 |         explicit device_accessible_ptr(T *p) : val_(p) {};
 61 | 
 62 |         explicit device_accessible_ptr(const T *p) : val_(p) {};
 63 | 
 64 |         device_accessible_ptr(usm_ptr<T, alloc::shared> p) : val_((T *) p) {};
 65 | 
 66 |         device_accessible_ptr(usm_ptr<T, alloc::device> p) : val_((T *) p) {};
 67 | 
 68 |         operator T *() const noexcept { return (T *) val_; }
 69 | 
 70 | 
 71 |     };
 72 | 
 73 |     template<typename T>
 74 |     struct host_accessible_ptr {
 75 |     private:
 76 |         T *val_;
 77 |     public:
 78 |         host_accessible_ptr(usm_ptr<T, alloc::shared> p) : val_((T *) p) {};
 79 | 
 80 |         host_accessible_ptr(usm_ptr<T, alloc::host> p) : val_((T *) p) {};
 81 | 
 82 |         operator T *() const noexcept { return val_; }
 83 | 
 84 | 
 85 |     };
 86 | 
 87 | 
 88 | /**
 89 |  * SYCL USM Deleter. The std::unique_ptr deleter takes only the pointer
 90 |  * to delete as an argument so that's the only work-around.
 91 |  */
 92 |     template<typename T>
 93 |     struct usm_deleter {
 94 |         sycl::queue q_;
 95 | 
 96 |         explicit usm_deleter(const sycl::queue &q) : q_(q) {}
 97 | 
 98 |         void operator()(T *ptr) const noexcept {
 99 |             if (ptr)
100 |                 sycl::free(ptr, q_);
101 |         }
102 |     };
103 | 
104 | /**
105 |  * Wrapper for a std::unique_ptr that calls the SYCL deleter (sycl::free).
106 |  * Also holds the number of elements allocated.
107 |  */
108 |     template<typename T, sycl::usm::alloc location>
109 |     class usm_unique_ptr : public std::unique_ptr<T, usm_deleter<T>> {
110 |     private:
111 |         size_t count_;
112 |     public:
113 |         usm_unique_ptr(size_t count, sycl::queue q)
114 |                 : std::unique_ptr<T, usm_deleter<T>>(sycl::malloc<T>(count, q, location), usm_deleter<T>{q}) { count_ = count; }
115 | 
116 |         explicit usm_unique_ptr(sycl::queue q) :
117 |                 usm_unique_ptr(1, q) { count_ = 1; }
118 | 
119 | 
120 |         [[nodiscard]] inline size_t alloc_size() const noexcept { return count_ * sizeof(T); }
121 | 
122 |         [[nodiscard]] inline size_t alloc_count() const noexcept { return count_; }
123 | 
124 |         [[nodiscard]] inline usm_ptr<T, location> get() const noexcept {
125 |             return usm_ptr<T, location>(std::unique_ptr<T, usm_deleter<T>>::get());
126 |         }
127 | 
128 |         [[nodiscard]] inline T *raw() const noexcept {
129 |             return std::unique_ptr<T, usm_deleter<T>>::get();
130 |         }
131 | 
132 |     };
133 | 
134 | 
135 | /**
136 |  * Same interface as usm_unique_ptr
137 |  * @tparam T
138 |  */
139 |     template<typename T, sycl::usm::alloc location>
140 |     class usm_shared_ptr : public std::shared_ptr<T> {
141 |     private:
142 |         size_t count_;
143 | 
144 |     public:
145 |         usm_shared_ptr(size_t count, sycl::queue q) : std::shared_ptr<T>(sycl::malloc<T>(count, q, location), usm_deleter<T>{q}) { count_ = count; }
146 | 
147 |         explicit usm_shared_ptr(sycl::queue q) :
148 |                 usm_shared_ptr(1, q) { count_ = 1; }
149 | 
150 |         [[nodiscard]] inline size_t alloc_size() const noexcept { return count_ * sizeof(T); }
151 | 
152 |         [[nodiscard]] inline size_t alloc_count() const noexcept { return count_; }
153 | 
154 |         [[nodiscard]] inline usm_ptr<T, location> get() const noexcept {
155 |             return usm_ptr<T, location>(std::shared_ptr<T>::get());
156 |         }
157 | 
158 |         [[nodiscard]] inline T *raw() const noexcept {
159 |             return std::shared_ptr<T>::get();
160 |         }
161 | 
162 |     };
163 | 
164 | }


--------------------------------------------------------------------------------
/setup-script/build.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | export DPCPP_HOME=~/sycl_workspace
  3 | export CUDA_ROOT=/usr/local/cuda-10.2
  4 | export LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib/:$DPCPP_HOME/deploy/lib64/:$DPCPP_HOME/lapack/install/lib64/:$DPCPP_HOME/OpenCL-ICD-Loader/install/lib64:$CUDA_ROOT/lib:$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
  5 | export PATH=$DPCPP_HOME/deploy/bin/:$CUDA_ROOT/bin:$PATH
  6 | 
  7 | mkdir -p $DPCPP_HOME
  8 | cd $DPCPP_HOME
  9 | mkdir -p deploy
 10 | 
 11 | #export LD_PRELOAD=/opt/intel/opencl/libOpenCL.so.1
 12 | 
 13 | run_test=false
 14 | cmake_test="OFF"
 15 | 
 16 | if [[ -z "$DPCPP_TESTS" ]]; then
 17 |   echo "Not testing"
 18 | else
 19 |   echo "testing"
 20 |   run_test=true
 21 |   cmake_test="ON"
 22 | fi
 23 | 
 24 | export CXXFLAGS="${CXXFLAGS} -D_GLIBCXX_USE_CXX11_ABI=1"
 25 | 
 26 | # OpenCL headers+ICD
 27 | cd $DPCPP_HOME
 28 | (if cd OpenCL-Headers; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-Headers.git; fi)
 29 | (if cd OpenCL-ICD-Loader; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; fi)
 30 | cd OpenCL-ICD-Loader
 31 | mkdir -p build
 32 | cd build
 33 | cmake \
 34 |   -DOPENCL_ICD_LOADER_HEADERS_DIR=$DPCPP_HOME/OpenCL-Headers \
 35 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/OpenCL-ICD-Loader/install \
 36 |   ..
 37 | make install -j $(nproc)
 38 | 
 39 | #sycl compiler with cuda
 40 | source /opt/intel/opencl/env/compiler_rt_vars.sh
 41 | cd $DPCPP_HOME
 42 | (if cd llvm; then git pull; else git clone https://github.com/intel/llvm.git -b sycl; fi)
 43 | cd llvm
 44 | python3 ./buildbot/configure.py \
 45 |   --cuda \
 46 |   -t release \
 47 |   --cmake-opt="-DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy" \
 48 |   --cmake-opt="-DCUDA_SDK_ROOT_DIR=$CUDA_ROOT" \
 49 |   --cmake-opt="-DLLVM_BINUTILS_INCDIR=/usr/local/include" \
 50 |   --cmake-opt="-DLLVM_ENABLE_PROJECTS=clang;sycl;llvm-spirv;libunwind;opencl;libdevice;xpti;xptifw;libclc;lld;lldb;libc;libcxx;libcxxabi;openmp;clang-tools-extra;compiler-rt" \
 51 |   --cmake-opt="-DLLVM_BUILD_TESTS=$cmake_test" \
 52 |   --cmake-opt="-DCMAKE_CXX_STANDARD=17" \
 53 |   --cmake-opt="-DLLVM_ENABLE_LTO=off" \
 54 |   --cmake-opt="-DLLVM_ENABLE_LLD=ON" \
 55 |   --cmake-opt="-DSYCL_ENABLE_WERROR=OFF" \
 56 |   --cmake-opt="-Wno-dev"
 57 | cd build
 58 | ninja install -j $(nproc)
 59 | if $run_test; then
 60 |   echo "testing llvm"
 61 |   ninja check -j $(nproc)
 62 | fi
 63 | 
 64 | #Lapack Reference
 65 | cd $DPCPP_HOME
 66 | (if cd lapack; then git pull; else git clone https://github.com/Reference-LAPACK/lapack.git; fi)
 67 | cd lapack/
 68 | mkdir -p build
 69 | cd build/
 70 | cmake \
 71 |   -DBUILD_SHARED_LIBS=ON \
 72 |   -DCBLAS=ON \
 73 |   -DLAPACKE=ON \
 74 |   -DBUILD_TESTING=$cmake_test \
 75 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/lapack/install \
 76 |   ..
 77 | cmake --build . -j $(nproc) --target install
 78 | if $run_test; then
 79 |   cmake --build . -j $(nproc) --target test
 80 | fi
 81 | 
 82 | #oneTBB
 83 | cd $DPCPP_HOME
 84 | (if cd oneTBB; then git pull; else git clone https://github.com/oneapi-src/oneTBB.git; fi)
 85 | cd oneTBB
 86 | mkdir -p build
 87 | cd build
 88 | cmake -GNinja \
 89 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
 90 |   -DCMAKE_BUILD_TYPE=Release \
 91 |   -DTBB_STRICT=OFF \
 92 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
 93 |   -DTBB_TEST=$cmake_test \
 94 |   ..
 95 | ninja install -j $(nproc)
 96 | if $run_test; then
 97 |   ninja test -j $(nproc)
 98 | fi
 99 | 
100 | #oneMKL
101 | cd $DPCPP_HOME
102 | (if cd oneMKL; then git pull; else git clone https://github.com/oneapi-src/oneMKL.git; fi)
103 | cd oneMKL
104 | mkdir -p build
105 | cd build
106 | cmake -GNinja \
107 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
108 |   -DCMAKE_BUILD_TYPE=Release \
109 |   -DCMAKE_CXX_STANDARD=17 \
110 |   -DTARGET_DOMAINS=blas \
111 |   -DENABLE_MKLGPU_BACKEND=OFF \
112 |   -DENABLE_CURAND_BACKEND=OFF \
113 |   -DENABLE_MKLCPU_BACKEND=OFF \
114 |   -DENABLE_CUBLAS_BACKEND=ON \
115 |   -DENABLE_NETLIB_BACKEND=ON \
116 |   -DREF_BLAS_ROOT=$DPCPP_HOME/lapack/install \
117 |   -DNETLIB_ROOT=$DPCPP_HOME/lapack/install \
118 |   -DOPENCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
119 |   -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT \
120 |   -DSYCL_LIBRARY=$DPCPP_HOME/deploy/lib/libsycl.so \
121 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
122 |   -DBUILD_FUNCTIONAL_TESTS=$cmake_test \
123 |   ..
124 | ninja install -j $(nproc)
125 | if $run_test; then
126 |   ninja test -j $(nproc)
127 | fi
128 | 
129 | #oneDNN
130 | cd $DPCPP_HOME
131 | (if cd oneDNN; then git pull; else git clone https://github.com/oneapi-src/oneDNN.git; fi)
132 | cd oneDNN
133 | mkdir -p build
134 | cd build
135 | cmake -GNinja \
136 |   -DCMAKE_C_COMPILER=$DPCPP_HOME/deploy/bin/clang \
137 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
138 |   -DCMAKE_BUILD_TYPE=Release \
139 |   -DCMAKE_CXX_STANDARD=17 \
140 |   -DDNNL_INSTALL_MODE=BUNDLE \
141 |   -DDNNL_CPU_RUNTIME=DPCPP \
142 |   -DDNNL_GPU_RUNTIME=DPCPP \
143 |   -DDNNL_GPU_VENDOR=NVIDIA \
144 |   -DNNL_BUILD_EXAMPLES=OFF \
145 |   -DTBBROOT=$DPCPP_HOME/deploy \
146 |   -DCUDA_SDK_ROOT_DIR=$CUDA_ROOT \
147 |   -DOPENCLROOT=$DPCPP_HOME/OpenCL-ICD-Loader/install \
148 |   -DOpenCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
149 |   -DCUBLAS_LIBRARY=$CUDA_ROOT/lib64/libcublas.so \
150 |   -DCUBLAS_INCLUDE_DIR=$CUDA_ROOT/include \
151 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy \
152 |   -DDNNL_BUILD_TESTS=$cmake_test \
153 |   ..
154 | ninja install -j $(nproc)
155 | if $run_test; then
156 |   LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib ninja test -j $(nproc)
157 | fi
158 | 


--------------------------------------------------------------------------------
/examples/hashing/src/benchmarks/misc.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  misc.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Miscellaneous
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <cstring>
 27 | #include <sycl_hash.hpp>
 28 | #include <tools/fill_rand.hpp>
 29 | 
 30 | #include <iomanip>
 31 | 
 32 | using namespace usm_smart_ptr;
 33 | 
 34 | 
 35 | template<hash::method M, int ... args>
 36 | void compare_two_devices(sycl::queue q1, sycl::queue q2, size_t input_block_size, size_t n_blocs) {
 37 |     std::cout << "Comparing " << hash::get_name<M, args...>() << " on: " << q1.get_device().get_info<sycl::info::device::name>() << " and: " << q2.get_device().get_info<sycl::info::device::name>()
 38 |               << "   ...";
 39 |     size_t out_block_size = hash::get_block_size<M, args...>();
 40 |     auto input_data1 = usm_unique_ptr<byte, alloc::shared>(input_block_size * n_blocs, q1);
 41 |     auto output_hashes1 = usm_unique_ptr<byte, alloc::shared>(out_block_size * n_blocs, q1);
 42 |     auto input_data2 = usm_unique_ptr<byte, alloc::shared>(input_block_size * n_blocs, q2);
 43 |     auto output_hashes2 = usm_unique_ptr<byte, alloc::shared>(out_block_size * n_blocs, q2);
 44 | 
 45 |     fill_rand<byte>(input_data1.get(), input_data1.alloc_count());
 46 |     memcpy(input_data2.raw(), input_data1.raw(), input_data1.alloc_size());
 47 | 
 48 | 
 49 |     if constexpr (M == hash::method::blake2b) {
 50 |         byte key[64];
 51 |         std::memset(key, 1, 64);
 52 |         hash::compute<M, args...>(q1, input_data1.get(), input_block_size, output_hashes1.get(), n_blocs, key, 64);
 53 |         hash::compute<M, args...>(q2, input_data2.get(), input_block_size, output_hashes2.get(), n_blocs, key, 64);
 54 |     } else {
 55 |         hash::compute<M, args...>(q1, input_data1.get(), input_block_size, output_hashes1.get(), n_blocs);
 56 |         hash::compute<M, args...>(q2, input_data2.get(), input_block_size, output_hashes2.get(), n_blocs);
 57 |     }
 58 | 
 59 |     auto[idx1, idx2]= std::mismatch(output_hashes1.raw(), output_hashes1.raw() + output_hashes1.alloc_count(), output_hashes2.raw());
 60 |     if (idx1 != output_hashes1.raw() + output_hashes1.alloc_count()) {
 61 |         std::cout << "mismatch" << std::endl;
 62 |     } else {
 63 |         std::cout << "pass" << std::endl;
 64 |     }
 65 | }
 66 | 
 67 | 
 68 | template<hash::method M, int ... args>
 69 | double benchmark_one_queue(sycl::queue q, size_t input_block_size, size_t n_blocs, size_t n_iters = 1) {
 70 |     auto all_input_data = usm_unique_ptr<byte, alloc::device>(input_block_size * n_blocs, q);
 71 |     auto all_output_hashes = usm_unique_ptr<byte, alloc::device>(hash::get_block_size<M, args...>() * n_blocs, q);
 72 |     if constexpr (M == hash::method::blake2b) {
 73 |         byte key[64];
 74 |         std::memset(key, 1, 64);
 75 |         hash::compute<M, args...>(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs, key, 64);/* Preheat */
 76 |         auto before = std::chrono::steady_clock::now();
 77 |         for (size_t i = 0; i < n_iters; ++i) {
 78 | #ifdef VERBOSE_HASH_LIB
 79 |             std::cerr << i << "  ";
 80 | #endif
 81 |             hash::compute<M, args...>(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs, key, 64);
 82 |         }
 83 |         auto after = std::chrono::steady_clock::now();
 84 |         auto time = std::chrono::duration<double, std::milli>(after - before).count();
 85 |         return (double) n_iters / time * (double) (input_block_size * n_blocs) / 1e6;
 86 |     } else {
 87 |         hash::compute<M, args...>(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs);/* Preheat */
 88 |         auto before = std::chrono::steady_clock::now();
 89 |         for (size_t i = 0; i < n_iters; ++i) {
 90 | #ifdef VERBOSE_HASH_LIB
 91 |             std::cerr << i << "  ";
 92 | #endif
 93 |             hash::compute<M, args...>(q, all_input_data.get(), input_block_size, all_output_hashes.get(), n_blocs);
 94 |         }
 95 |         auto after = std::chrono::steady_clock::now();
 96 |         auto time = std::chrono::duration<double, std::milli>(after - before).count();
 97 |         return (double) n_iters / time * (double) (input_block_size * n_blocs) / 1e6;
 98 |     }
 99 | }
100 | 
101 | 
102 | template<hash::method M, int ... args>
103 | void run_benchmark(sycl::queue q, size_t input_block_size, size_t n_blocs, size_t n_iters) {
104 |     std::cout << "Running " << hash::get_name<M, args...>() << " on:" << q.get_device().get_info<sycl::info::device::name>() << ": ";
105 |     auto gflops = benchmark_one_queue<M, args...>(q, input_block_size, n_blocs, n_iters);
106 |     std::cout << "\nGB hashed per sec: " << gflops << "\n\n";
107 | }
108 | 


--------------------------------------------------------------------------------
/setup-script/build_with_libcxx.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/sh
  2 | export DPCPP_HOME=~/sycl_workspace
  3 | export CUDA_ROOT=/usr/local/cuda-10.2
  4 | export LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib/:$DPCPP_HOME/deploy/lib64/:$DPCPP_HOME/lapack/install/lib64/:$DPCPP_HOME/OpenCL-ICD-Loader/install/lib64:$CUDA_ROOT/lib:$CUDA_ROOT/lib64:$LD_LIBRARY_PATH
  5 | export PATH=$DPCPP_HOME/deploy/bin/:$CUDA_ROOT/bin:$PATH
  6 | 
  7 | mkdir -p $DPCPP_HOME
  8 | cd $DPCPP_HOME
  9 | mkdir -p deploy
 10 | 
 11 | export CXXFLAGS="${CXXFLAGS} -stdlib=libc++"
 12 | #export LD_PRELOAD=/opt/intel/opencl/libOpenCL.so.1
 13 | 
 14 | run_test=false
 15 | cmake_test="OFF"
 16 | 
 17 | if [[ -z "$DPCPP_TESTS" ]]; then
 18 |   echo "Not testing"
 19 | else
 20 |   echo "testing"
 21 |   run_test=true
 22 |   cmake_test="ON"
 23 | fi
 24 | 
 25 | # OpenCL headers+ICD
 26 | cd $DPCPP_HOME
 27 | (if cd OpenCL-Headers; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-Headers.git; fi)
 28 | (if cd OpenCL-ICD-Loader; then git pull; else git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader.git; fi)
 29 | cd OpenCL-ICD-Loader
 30 | mkdir -p build
 31 | cd build
 32 | cmake \
 33 |   -DOPENCL_ICD_LOADER_HEADERS_DIR=$DPCPP_HOME/OpenCL-Headers \
 34 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/OpenCL-ICD-Loader/install \
 35 |   ..
 36 | make install -j $(nproc)
 37 | 
 38 | #sycl compiler with cuda
 39 | source /opt/intel/opencl/env/compiler_rt_vars.sh
 40 | cd $DPCPP_HOME
 41 | (if cd llvm; then git pull; else git clone https://github.com/intel/llvm.git -b sycl; fi)
 42 | cd llvm
 43 | python3 ./buildbot/configure.py \
 44 |   --cuda \
 45 |   -t release \
 46 |   --cmake-opt="-DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy" \
 47 |   --cmake-opt="-DCUDA_SDK_ROOT_DIR=$CUDA_ROOT" \
 48 |   --cmake-opt="-DLLVM_ENABLE_PROJECTS=libcxxabi;libunwind;libcxx;clang;sycl;llvm-spirv;opencl;opencl-aot;libdevice;xpti;xptifw;libclc;openmp;clang-tools-extra;compiler-rt" \
 49 |   --cmake-opt="-DLLVM_BUILD_TESTS=$cmake_test" \
 50 |   --cmake-opt="-DLIBC_COMPILE_OPTIONS_DEFAULT=-march=native" \
 51 |   --cmake-opt="-DLLVM_LIBC_FULL_BUILD=ON" \
 52 |   --cmake-opt="-DLIBCXXABI_USE_LLVM_UNWINDER=ON" \
 53 |   --cmake-opt="-DLIBCXX_USE_COMPILER_RT=ON" \
 54 |   --cmake-opt="-DSYCL_ENABLE_WERROR=OFF" \
 55 |   --cmake-opt="-DCLANG_DEFAULT_CXX_STDLIB=libc++" \
 56 |   --cmake-opt="-DCMAKE_CXX_STANDARD=17" \
 57 |   --cmake-opt="-Wno-dev"
 58 | cd build
 59 | ninja install -j $(nproc)
 60 | if $run_test; then
 61 |   echo "testing llvm"
 62 |   ninja check -j $(nproc)
 63 | fi
 64 | 
 65 | #Lapack Reference
 66 | cd $DPCPP_HOME
 67 | (if cd lapack; then git pull; else git clone https://github.com/Reference-LAPACK/lapack.git; fi)
 68 | cd lapack/
 69 | mkdir -p build
 70 | cd build/
 71 | cmake \
 72 |   -DBUILD_SHARED_LIBS=ON \
 73 |   -DCBLAS=ON \
 74 |   -DCMAKE_CXX_FLAGS="-stdlib=libc++" \
 75 |   -DLAPACKE=ON \
 76 |   -DBUILD_TESTING=$cmake_test \
 77 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/lapack/install \
 78 |   ..
 79 | cmake --build . -j $(nproc) --target install
 80 | if $run_test; then
 81 |   cmake --build . -j $(nproc) --target test
 82 | fi
 83 | 
 84 | #oneTBB
 85 | cd $DPCPP_HOME
 86 | (if cd oneTBB; then git pull; else git clone https://github.com/oneapi-src/oneTBB.git; fi)
 87 | cd oneTBB
 88 | mkdir -p build
 89 | cd build
 90 | cmake -GNinja \
 91 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
 92 |   -DCMAKE_BUILD_TYPE=Release \
 93 |   -DCMAKE_CXX_FLAGS="-stdlib=libc++" \
 94 |   -DTBB_STRICT=OFF \
 95 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
 96 |   -DTBB_TEST=$cmake_test \
 97 |   ..
 98 | ninja install -j $(nproc)
 99 | if $run_test; then
100 |   ninja test -j $(nproc)
101 | fi
102 | 
103 | #oneMKL
104 | cd $DPCPP_HOME
105 | (if cd oneMKL; then git pull; else git clone https://github.com/oneapi-src/oneMKL.git; fi)
106 | cd oneMKL
107 | mkdir -p build
108 | cd build
109 | cmake -GNinja \
110 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
111 |   -DCMAKE_BUILD_TYPE=Release \
112 |   -DCMAKE_CXX_STANDARD=17 \
113 |   -DCMAKE_CXX_FLAGS="-stdlib=libc++ -lc++ -lc++abi" \
114 |   -DTARGET_DOMAINS=blas \
115 |   -DENABLE_MKLGPU_BACKEND=OFF \
116 |   -DENABLE_CURAND_BACKEND=OFF \
117 |   -DENABLE_MKLCPU_BACKEND=OFF \
118 |   -DENABLE_CUBLAS_BACKEND=ON \
119 |   -DENABLE_NETLIB_BACKEND=ON \
120 |   -DREF_BLAS_ROOT=$DPCPP_HOME/lapack/install \
121 |   -DNETLIB_ROOT=$DPCPP_HOME/lapack/install \
122 |   -DOPENCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
123 |   -DCUDA_TOOLKIT_ROOT_DIR=$CUDA_ROOT \
124 |   -DSYCL_LIBRARY=$DPCPP_HOME/deploy/lib/libsycl.so \
125 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy/ \
126 |   -DBUILD_FUNCTIONAL_TESTS=$cmake_test \
127 |   ..
128 | ninja install -j $(nproc)
129 | if $run_test; then
130 |   ninja test -j $(nproc)
131 | fi
132 | 
133 | #oneDNN
134 | cd $DPCPP_HOME
135 | (if cd oneDNN; then git pull; else git clone https://github.com/oneapi-src/oneDNN.git; fi)
136 | cd oneDNN
137 | mkdir -p build
138 | cd build
139 | cmake -GNinja \
140 |   -DCMAKE_C_COMPILER=$DPCPP_HOME/deploy/bin/clang \
141 |   -DCMAKE_CXX_COMPILER=$DPCPP_HOME/deploy/bin/clang++ \
142 |   -DCMAKE_BUILD_TYPE=Release \
143 |   -DCMAKE_CXX_STANDARD=17 \
144 |   -DCMAKE_CXX_FLAGS="-stdlib=libc++" \
145 |   -DDNNL_INSTALL_MODE=BUNDLE \
146 |   -DDNNL_CPU_RUNTIME=DPCPP \
147 |   -DDNNL_GPU_RUNTIME=DPCPP \
148 |   -DDNNL_GPU_VENDOR=NVIDIA \
149 |   -DTBBROOT=$DPCPP_HOME/deploy \
150 |   -DCUDA_SDK_ROOT_DIR=$CUDA_ROOT \
151 |   -DOPENCLROOT=$DPCPP_HOME/OpenCL-ICD-Loader/install \
152 |   -DOpenCL_INCLUDE_DIR=$DPCPP_HOME/OpenCL-Headers \
153 |   -DCUBLAS_LIBRARY=$CUDA_ROOT/lib64/libcublas.so \
154 |   -DCUBLAS_INCLUDE_DIR=$CUDA_ROOT/include \
155 |   -DCMAKE_INSTALL_PREFIX=$DPCPP_HOME/deploy \
156 |   -DDNNL_BUILD_TESTS=$cmake_test \
157 |   ..
158 | ninja install -j $(nproc)
159 | if $run_test; then
160 |   LD_LIBRARY_PATH=$DPCPP_HOME/deploy/lib ninja test -j $(nproc)
161 | fi
162 | 


--------------------------------------------------------------------------------
/examples/hashing/README.md:
--------------------------------------------------------------------------------
  1 | # SYCL Hashing Algorithms
  2 | 
  3 | This repository contains hashing algorithms implemented using [SYCL](https://www.khronos.org/sycl/) which is a heterogeneous programming model based on standard C++.
  4 | 
  5 | The following hashing methods are currently available:
  6 | 
  7 | - sha256
  8 | - sha1 (unsecure)
  9 | - md2 (unsecure)
 10 | - md5 (unsecure)
 11 | - keccak (128 224 256 288 384 512)
 12 | - sha3 (224 256 384 512)
 13 | - blake2b
 14 | 
 15 | ## Benchmarks
 16 | 
 17 | Some functions were ported from a CUDA implementation. The SYCL code was tested unchanged across the different implementations and hardware. Here's how they perform (the values are in GB/s):
 18 | 
 19 | | Function | Native CUDA | SYCL on DPC++ CUDA (optimised)              | SYCL on ComputeCPP CPU (spir64/spirv64) | SYCL on DPC++ CPU (spir64_x86_64) | SYCL on hipSYCL (omp/cuda) |
 20 | | -------- | ----------- | ------------------------------------------- | --------------------------------------- | --------------------------------- | -------------------------- |
 21 | | keccak   | 15.7        | 23.0                                        | 4.14 / 3.89                             | 4.98                              | 4.32 / 23.2                |
 22 | | md5      | 14.6        | 20.3                                        | 6.26 / 5.89                             | 9.93                              | 9.27 / 20.2                |
 23 | | blake2b  | 14.7        | 21.6                                        | 9.46 / 10.0                             | 12.4                              | 7.71 / 17.9                |
 24 | | sha1     | 14.7        | 19.34                                       | 3.61 / 3.35                             | 3.30                              | 4.39 / 19.2                |
 25 | | sha256   | 13.5        | 19.15                                       | 2.23 / 2.00                             | 2.91                              | 2.93 / 19.1                |
 26 | | md2      | 4.18        | 4.23                                        | 0.22 / 0.25                             | 0.176                             | 0.25 / 2.33                |
 27 | 
 28 | ### Note
 29 | 
 30 | Something broke the spir64 backend of DPC++ and it produces now very slow code
 31 | 
 32 | Benchmark configuration:
 33 | 
 34 | - block_size: 512 kiB
 35 | - n_blocks: 4\*1536
 36 | - n_outbit: 128
 37 | - GPU: GTX 1660 Ti
 38 | - OS: rhel8.4
 39 | - CPU: 2x E5-2670 v2
 40 | 
 41 | ### Remark
 42 | 
 43 | These are not the "best" settings as the optimum changes with the algorithm. The benchmarks measure the time to run 40 iterations, without copying the memory between the device and the host. In a real application, you
 44 | could be memory bound.
 45 | 
 46 | ## How to build
 47 | 
 48 | ```bash
 49 | git clone https://github.com/Michoumichmich/SYCL-Hashing-Algorithms.git ; cd SYCL-Hashing-Algorithms;
 50 | mkdir build; cd build
 51 | CXX=<sycl_compiler> cmake .. -DCMAKE_BUILD_TYPE=Release
 52 | make
 53 | ```
 54 | 
 55 | This will build the library, and a demo executable. Running it will perform a benchmark on your CPU and CUDA device (if available).
 56 | 
 57 | You do not necessarily need to pass the `<sycl_compiler>` to cmake, it depends on the implementation you're using and its toolchain.
 58 | 
 59 | ## How to use
 60 | 
 61 | Let's assume you used this [script](https://github.com/Michoumichmich/oneAPI-setup-script) to setup the toolchain with CUDA support.
 62 | 
 63 | Here's a minimal example:
 64 | 
 65 | ```C++
 66 | #include <sycl/sycl.hpp> // SYCL headers
 67 | #include "sycl_hash.hpp" // The headers
 68 | #include "tools/sycl_queue_helpers.hpp" // To make sycl queue
 69 | using namespace hash;
 70 | 
 71 | int main(){
 72 |     auto cuda_q = try_get_queue(cuda_selector{}); // create a queue on a cuda device and attach an exception handler
 73 | 
 74 |     constexpr int hash_size = get_block_size<method::sha256>();
 75 |     constexpr int n_blocks = 20; // amount of hash to do in parallel
 76 |     constexpr int item_size = 1024;
 77 | 
 78 |     byte input[n_blocks * item_size]; // get an array of 20 same-sized data items to hash;
 79 |     byte output[n_blocks * hash_size]; // reserve space for the output
 80 | 
 81 |     compute<method::sha256>(cuda_q, input, item_size, output, n_blocks); // do the computing
 82 |     compute_sha256(cuda_q, input, item_size, output, n_blocks); // identical
 83 | 
 84 |     /**
 85 |      * For SHA3 one could write:
 86 |      * compute_sha3<512>(cuda_q, input, item_size, output, n_blocks);
 87 |      */
 88 | 
 89 |     return 0;
 90 | }
 91 | ```
 92 | 
 93 | And, for clang build with
 94 | 
 95 | ```
 96 | -fsycl -fsycl-targets=spir64_x86_64,nvptx64-nvidia-cuda--sm_50 -I<include_dir> <build_dir>/libsycl_hash.a
 97 | ```
 98 | 
 99 | And your hash will run on the GPU.
100 | 
101 | # Sources
102 | 
103 | You may find [here](https://github.com/Michoumichmich/cuda-hashing-algos-with-benchmark) the fork of the original CUDA implementations with the benchmarks added.
104 | 
105 | # Tested implementations
106 | 
107 | - [Intel's clang](https://github.com/intel/llvm) with OpenCL on CPU (using Intel's driver) and [Codeplay's CUDA backend](https://www.codeplay.com/solutions/oneapi/for-cuda/)
108 | - [hipSYCL](https://github.com/illuhad/hipSYCL) on macOS with the OpenMP backend (set `hipSYCL_DIR` then `cmake .. -DHIPSYCL_TARGETS="..."`)
109 | - [ComputeCPP](https://developer.codeplay.com/products/computecpp/ce/home) you can build with `cmake .. -DComputeCpp_DIR=/path_to_computecpp -DCOMPUTECPP_BITCODE=spir64 -DCMAKE_BUILD_TYPE=Release`, Tested on the host
110 |   device, `spir64` and `spirv64`. See [ComputeCpp SDK](https://github.com/codeplaysoftware/computecpp-sdk)
111 | 
112 | # Acknowledgements
113 | 
114 | This repository contains code written by Matt Zweil & The Mochimo Core Contributor Team. Please see the [files](https://github.com/mochimodev/cuda-hashing-algos) for their respective licences.
115 | 


--------------------------------------------------------------------------------
/examples/hashing/include/tools/runtime_byte_array.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  runtime_byte_array.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Runtime byte array for hashing functions
 23 |  **************************************************************************/
 24 | /**
 25 |     Copyright 2021 Codeplay Software Ltd.
 26 | 
 27 |     Licensed under the Apache License, Version 2.0 (the "License");
 28 |     you may not use these files except in compliance with the License.
 29 |     You may obtain a copy of the License at
 30 | 
 31 |     http://www.apache.org/licenses/LICENSE-2.0
 32 | 
 33 |     For your convenience, a copy of the License has been included in this
 34 |     repository.
 35 | 
 36 |     Unless required by applicable law or agreed to in writing, software
 37 |     distributed under the License is distributed on an "AS IS" BASIS,
 38 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 39 |     See the License for the specific language governing permissions and
 40 |     limitations under the License.
 41 | 
 42 |     @author Michel Migdal.
 43 |  */
 44 | 
 45 | /**
 46 |  * Array of bytes accessible with runtimes indices and that is stored using larger types to reduce register look-up lacencies/
 47 |  */
 48 | 
 49 | #pragma once
 50 | 
 51 | #include <sycl/sycl.hpp>
 52 | #include <type_traits>
 53 | #include <array>
 54 | #include <tools/intrinsics.hpp>
 55 | 
 56 | namespace sbb {
 57 |     namespace runtime_idx_detail {
 58 | 
 59 |         template<typename T, typename array_t, int N, int idx_max = N - 1>
 60 |         static inline constexpr void runtime_index_wrapper_internal_store_byte(array_t &arr, const uint &word_idx, const uint8_t &byte_in, const uint &byte_idx) {
 61 |             static_assert(idx_max >= 0 && idx_max < N);
 62 | #pragma unroll
 63 |             for (uint i = 0; i < N; ++i) {
 64 |                 arr[i] = (word_idx == i) ? set_byte(arr[i], byte_in, byte_idx) : arr[i];
 65 |             }
 66 |         }
 67 | 
 68 | 
 69 |         template<typename T, typename array_t, int N, int idx_max = N - 1>
 70 |         [[nodiscard]] static inline constexpr T runtime_index_wrapper_internal_read_copy(const array_t &arr, const uint &i) {
 71 |             static_assert(idx_max >= 0 && idx_max < N);
 72 |             if constexpr (idx_max == 0 || N == 1) {
 73 |                 return arr[0];
 74 |             } else {
 75 |                 if (i == idx_max) {
 76 |                     return arr[idx_max];
 77 |                 } else {
 78 |                     return runtime_index_wrapper_internal_read_copy<T, array_t, N, idx_max - 1>(arr, i);
 79 |                 }
 80 |             }
 81 |         }
 82 | 
 83 |         template<typename T, size_t N>
 84 |         static inline constexpr uint8_t runtime_index_wrapper_store_byte(std::array<T, N> &array, const uint &i, const uint8_t &val, const uint &byte_idx) {
 85 |             runtime_index_wrapper_internal_store_byte<T, std::array<T, N>, N>(array, i, (T) val, byte_idx);
 86 |             return val;
 87 |         }
 88 | 
 89 | 
 90 |         template<typename T, size_t N>
 91 |         [[nodiscard]] static inline constexpr T runtime_index_wrapper(const std::array<T, N> &array, const uint &i) {
 92 |             return runtime_index_wrapper_internal_read_copy<T, std::array<T, N>, N>(array, i);
 93 |         }
 94 |     }
 95 | 
 96 | }
 97 | 
 98 | 
 99 | template<int N, typename storage_type = uint32_t>
100 | class runtime_byte_array {
101 | public:
102 | 
103 |     static_assert(std::is_unsigned_v<storage_type> && std::is_integral_v<storage_type>);
104 | 
105 |     /**
106 |      * Connstructor that takes a list of bytes
107 |      * @param init
108 |      */
109 |     constexpr runtime_byte_array(const std::initializer_list<uint8_t> &init) {
110 |         uint idx = 0;
111 |         for (auto b: init) {
112 |             write(idx, b);
113 |             ++idx;
114 |         }
115 |     }
116 | 
117 |     /**
118 |      * Reads the ith byte
119 |      * @param i index
120 |      * @return the byte
121 |      */
122 |     [[nodiscard]] constexpr uint8_t read(const uint &i) const {
123 |         storage_type word = sbb::runtime_idx_detail::runtime_index_wrapper(storage_array_, i / sizeof(storage_type));
124 |         return sbb::get_byte(word, i % sizeof(storage_type));
125 |     }
126 | 
127 |     /**
128 |      * Reads the ith byte
129 |      * @param i index
130 |      * @return the byte
131 |      */
132 |     [[nodiscard]] constexpr uint8_t operator[](const uint &i) const {
133 |         return read(i);
134 |     }
135 | 
136 |     /**
137 |      * Write the ith byte
138 |      * @param i index
139 |      * @return the byte written
140 |      */
141 |     constexpr uint8_t write(const uint &i, const uint8_t &write_byte) {
142 |         return sbb::runtime_idx_detail::runtime_index_wrapper_store_byte(storage_array_, i / sizeof(storage_type), write_byte, i % sizeof(storage_type));
143 |     }
144 | 
145 | private:
146 | 
147 |     static constexpr int get_storage_size() {
148 |         return (N + sizeof(storage_type) - 1) / sizeof(storage_type);
149 |     }
150 | 
151 |     std::array<storage_type, get_storage_size()> storage_array_{};
152 | 
153 | };


--------------------------------------------------------------------------------
/examples/hashing/src/hash_functions/md2.cpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  md2.cpp
 20 |  *
 21 |  *  Description:
 22 |  *    MD2 hash function
 23 |  **************************************************************************/
 24 | #include <hash_functions/md2.hpp>
 25 | #include <internal/determine_kernel_config.hpp>
 26 | 
 27 | #include <cstring>
 28 | #include <tools/runtime_byte_array.hpp>
 29 | 
 30 | using namespace usm_smart_ptr;
 31 | using namespace hash;
 32 | 
 33 | struct md2_ctx {
 34 |     int len = 0;
 35 |     runtime_byte_array<16> data{};
 36 |     byte state[48]{};
 37 |     byte checksum[16]{};
 38 | };
 39 | 
 40 | /**************************** VARIABLES *****************************/
 41 | 
 42 | 
 43 | /*********************** FUNCTION DEFINITIONS ***********************/
 44 | template<typename T>
 45 | static inline void md2_transform(md2_ctx *ctx, const T &data) {
 46 |     constexpr byte consts[256]
 47 |             {41, 46, 67, 201, 162, 216, 124, 1, 61, 54, 84, 161, 236, 240, 6,
 48 |              19, 98, 167, 5, 243, 192, 199, 115, 140, 152, 147, 43, 217, 188, 76,
 49 |              130, 202, 30, 155, 87, 60, 253, 212, 224, 22, 103, 66, 111, 24, 138,
 50 |              23, 229, 18, 190, 78, 196, 214, 218, 158, 222, 73, 160, 251, 245, 142,
 51 |              187, 47, 238, 122, 169, 104, 121, 145, 21, 178, 7, 63, 148, 194, 16,
 52 |              137, 11, 34, 95, 33, 128, 127, 93, 154, 90, 144, 50, 39, 53, 62,
 53 |              204, 231, 191, 247, 151, 3, 255, 25, 48, 179, 72, 165, 181, 209, 215,
 54 |              94, 146, 42, 172, 86, 170, 198, 79, 184, 56, 210, 150, 164, 125, 182,
 55 |              118, 252, 107, 226, 156, 116, 4, 241, 69, 157, 112, 89, 100, 113, 135,
 56 |              32, 134, 91, 207, 101, 230, 45, 168, 2, 27, 96, 37, 173, 174, 176,
 57 |              185, 246, 28, 70, 97, 105, 52, 64, 126, 15, 85, 71, 163, 35, 221,
 58 |              81, 175, 58, 195, 92, 249, 206, 186, 197, 234, 38, 44, 83, 13, 110,
 59 |              133, 40, 132, 9, 211, 223, 205, 244, 65, 129, 77, 82, 106, 220, 55,
 60 |              200, 108, 193, 171, 250, 36, 225, 123, 8, 12, 189, 177, 74, 120, 136,
 61 |              149, 139, 227, 99, 232, 109, 233, 203, 213, 254, 59, 0, 29, 57, 242,
 62 |              239, 183, 14, 102, 88, 208, 228, 166, 119, 114, 248, 235, 117, 75, 10,
 63 |              49, 68, 80, 180, 143, 237, 31, 26, 219, 153, 141, 51, 159, 17, 131,
 64 |              20};
 65 | 
 66 | #ifdef __NVPTX__
 67 | #pragma unroll
 68 | #endif
 69 |     for (int j = 0; j < 16; ++j) {
 70 |         ctx->state[j + 32] = (ctx->state[j + 16] = data[j]) ^ ctx->state[j];
 71 |     }
 72 | 
 73 |     dword t = 0;
 74 | 
 75 | #ifdef __NVPTX__
 76 | #pragma unroll
 77 | #endif
 78 |     for (dword j = 0; j < 18; ++j) {
 79 | 
 80 | #ifdef __NVPTX__
 81 | #pragma unroll
 82 | #endif
 83 |         for (unsigned char &k: ctx->state) {
 84 |             t = k ^= consts[t];
 85 |         }
 86 |         t = (t + j) & 0xFF;
 87 |     }
 88 | 
 89 |     t = ctx->checksum[15];
 90 | 
 91 | #ifdef __NVPTX__
 92 | #pragma unroll
 93 | #endif
 94 |     for (int j = 0; j < 16; ++j) {
 95 |         t = ctx->checksum[j] ^= consts[data[j] ^ t];
 96 |     }
 97 | }
 98 | 
 99 | static inline void md2_update(md2_ctx *ctx, const byte *data, size_t len) {
100 |     for (size_t i = 0; i < len; ++i) {
101 |         ctx->data.write(ctx->len, data[i]);
102 |         ctx->len++;
103 |         if (ctx->len == MD2_BLOCK_SIZE) {
104 |             md2_transform(ctx, ctx->data);
105 |             ctx->len = 0;
106 |         }
107 |     }
108 | }
109 | 
110 | static inline void md2_final(md2_ctx *ctx, byte *hash) {
111 |     int to_pad = (int) MD2_BLOCK_SIZE - ctx->len;
112 |     if (to_pad > 0) {
113 | #ifdef __NVPTX__
114 | #pragma unroll
115 | #endif
116 |         for (int i = ctx->len; i < MD2_BLOCK_SIZE; ++i) {
117 |             ctx->data.write(i, (byte) to_pad);
118 |         }
119 |     }
120 |     md2_transform(ctx, ctx->data);
121 |     md2_transform(ctx, ctx->checksum);
122 |     memcpy(hash, ctx->state, MD2_BLOCK_SIZE);
123 | }
124 | 
125 | static inline void kernel_md2_hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, dword thread) {
126 |     if (thread >= n_batch) {
127 |         return;
128 |     }
129 |     const byte *in = indata + thread * inlen;
130 |     byte *out = outdata + thread * MD2_BLOCK_SIZE;
131 |     md2_ctx ctx{};
132 |     md2_update(&ctx, in, inlen);
133 |     md2_final(&ctx, out);
134 | }
135 | 
136 | namespace hash::internal {
137 | 
138 |     sycl::event
139 |     launch_md2_kernel(sycl::queue &q, sycl::event e, device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch) {
140 |         auto config = get_kernel_sizes(q, n_batch);
141 |         return q.submit([&](sycl::handler &cgh) {
142 |             cgh.depends_on(e);
143 |             cgh.parallel_for<class md2_kernel>(
144 |                     sycl::nd_range<1>(sycl::range<1>(config.block) * sycl::range<1>(config.wg_size), sycl::range<1>(config.wg_size)),
145 |                     [=](sycl::nd_item<1> item) {
146 |                         kernel_md2_hash(indata, inlen, outdata, n_batch, item.get_global_linear_id());
147 |                     });
148 |         });
149 |     }
150 | 
151 | 
152 | }
153 | 
154 | 


--------------------------------------------------------------------------------
/examples/hashing/include/internal/sync_api.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sync_api.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Synchronous hashing API
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include "handle.hpp"
 27 | #include "common.hpp"
 28 | 
 29 | #include "../tools/intrinsics.hpp"
 30 | #include "../tools/sycl_queue_helpers.hpp"
 31 | 
 32 | #include <type_traits>
 33 | #include <vector>
 34 | 
 35 | namespace hash {
 36 |     using namespace usm_smart_ptr;
 37 | 
 38 | 
 39 |     /**
 40 |      * Computes synchronously a hash.
 41 |      * @tparam M Hash method
 42 |      * @param q Queue to run on
 43 |      * @param in Pointer to the input data in any memory accessible by the HOST. Contains an array of data.
 44 |      * @param inlen Size in bytes of one block to hash.
 45 |      * @param out Pointer to the output memory accessible by the HOST
 46 |      * @param n_batch Number of blocks to hash. In and Out pointers must have correct sizes.
 47 |      */
 48 |     template<method M, typename = std::enable_if_t<M != method::keccak && M != method::sha3 && M != method::blake2b> >
 49 |     inline void compute(sycl::queue &q, const byte *in, dword inlen, byte *out, dword n_batch) {
 50 |         if (is_ptr_usable(in, q) && is_ptr_usable(out, q)) {
 51 |             internal::dispatch_hash<M, 0>(q, sycl::event{}, device_accessible_ptr<byte>(in), device_accessible_ptr<byte>(out), inlen, n_batch, nullptr, 0).wait();
 52 |         } else {
 53 |             internal::hash_with_data_copy<M, 0>({q, in, out, n_batch, inlen}, nullptr, 0).dev_e_.wait();
 54 |         }
 55 |     }
 56 | 
 57 |     /**
 58 |      * Computes synchronously a hash.
 59 |      * @tparam M Hash method
 60 |      * @tparam n_outbit Number of bits to output
 61 |      * @param q Queue to run on
 62 |      * @param in Pointer to the input data in any memory accessible by the HOST. Contains an array of data.
 63 |      * @param inlen Size in bytes of one block to hash.
 64 |      * @param out Pointer to the output memory accessible by the HOST
 65 |      * @param n_batch Number of blocks to hash. In and Out pointers must have correct sizes.
 66 |      */
 67 |     template<method M, int n_outbit, typename = std::enable_if_t<M == method::keccak || M == method::sha3 >>
 68 |     inline void compute(sycl::queue &q, const byte *in, dword inlen, byte *out, dword n_batch) {
 69 |         if (is_ptr_usable(in, q) && is_ptr_usable(out, q)) {
 70 |             internal::dispatch_hash<M, n_outbit>(q, sycl::event{}, device_accessible_ptr<byte>(in), device_accessible_ptr<byte>(out), inlen, n_batch, nullptr, 0).wait();
 71 |         } else {
 72 |             internal::hash_with_data_copy<M, n_outbit>({q, in, out, n_batch, inlen}, nullptr, 0).dev_e_.wait();
 73 |         }
 74 |     }
 75 | 
 76 |     /**
 77 |      * Computes synchronously a hash.
 78 |      * @tparam M Hash method
 79 |      * @tparam n_outbit Number of bits to output
 80 |      * @param q Queue to run on
 81 |      * @param in Pointer to the input data in any memory accessible by the HOST. Contains an array of data.
 82 |      * @param inlen Size in bytes of one block to hash.
 83 |      * @param out Pointer to the output memory accessible by the HOST
 84 |      * @param n_batch Number of blocks to hash. In and Out pointers must have correct sizes.
 85 |      */
 86 |     template<method M, int n_outbit, typename = std::enable_if_t<M == method::blake2b>>
 87 |     inline void compute(sycl::queue &q, const byte *in, dword inlen, byte *out, dword n_batch, byte *key, dword keylen) {
 88 |         if (is_ptr_usable(in, q) && is_ptr_usable(out, q)) {
 89 |             internal::dispatch_hash<M, n_outbit>(q, sycl::event{}, device_accessible_ptr<byte>(in), device_accessible_ptr<byte>(out), inlen, n_batch, key, keylen).wait();
 90 |         } else {
 91 |             internal::hash_with_data_copy<M, n_outbit>({q, in, out, n_batch, inlen}, key, keylen).dev_e_.wait();
 92 |         }
 93 |     }
 94 | 
 95 | #define alias_sync_compute(alias_name, method)  \
 96 |     template <typename... Args> \
 97 |     auto alias_name(Args&&... args) -> decltype(compute<method>(std::forward<Args>(args)...)) { \
 98 |         return compute<method>(std::forward<Args>(args)...); \
 99 |     }
100 | 
101 | #define alias_sync_compute_with_n_outbit(alias_name, method)  \
102 |     template <int n_outbit, typename... Args> \
103 |     auto alias_name(Args&&... args) -> decltype(compute<method, n_outbit>(std::forward<Args>(args)...)) { \
104 |         return compute<method, n_outbit>(std::forward<Args>(args)...); \
105 |     }
106 | 
107 |     alias_sync_compute(compute_md2, hash::method::md2)
108 | 
109 |     alias_sync_compute(compute_md5, hash::method::md5)
110 | 
111 |     alias_sync_compute(compute_sha1, hash::method::sha1)
112 | 
113 |     alias_sync_compute(compute_sha256, hash::method::sha256)
114 | 
115 |     alias_sync_compute_with_n_outbit(compute_sha3, hash::method::sha3)
116 | 
117 |     alias_sync_compute_with_n_outbit(compute_blake2b, hash::method::blake2b)
118 | 
119 |     alias_sync_compute_with_n_outbit(compute_keccak, hash::method::keccak)
120 | 
121 | #undef alias_sync_compute
122 | #undef alias_sync_compute_with_n_outbit
123 | 
124 | 
125 | } //namespace hash::v_1
126 | 
127 | 
128 | 
129 | 
130 | 


--------------------------------------------------------------------------------
/examples/hashing/src/hash_functions/sha1.cpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sha1.cpp
 20 |  *
 21 |  *  Description:
 22 |  *    SHA1 hash function
 23 |  **************************************************************************/
 24 | #include <hash_functions/sha1.hpp>
 25 | #include <internal/determine_kernel_config.hpp>
 26 | #include <tools/intrinsics.hpp>
 27 | 
 28 | #include <cstring>
 29 | 
 30 | 
 31 | using namespace usm_smart_ptr;
 32 | 
 33 | struct sha1_ctx {
 34 |     byte data[64];
 35 |     dword datalen = 0;
 36 |     qword bitlen = 0;
 37 |     dword state[5]{0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xc3d2e1f0};
 38 |     dword k[4]{0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
 39 | 
 40 | };
 41 | 
 42 | /****************************** MACROS ******************************/
 43 | #ifndef ROTLEFT
 44 | #define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32-(b))))
 45 | #endif
 46 | 
 47 | /*********************** FUNCTION DEFINITIONS ***********************/
 48 | void sha1_transform(sha1_ctx *ctx, const byte *data) {
 49 |     dword a, b, c, d, e, t, m[80];
 50 | 
 51 | #ifdef __NVPTX__
 52 | #pragma unroll
 53 | #endif
 54 |     for (int i = 0, j = 0; i < 16; ++i, j += 4) {
 55 |         m[i] = sbb::upsample(data[j], data[j + 1], data[j + 2], data[j + 3]);
 56 |     }
 57 | 
 58 | 
 59 | #ifdef __NVPTX__
 60 | #pragma unroll
 61 | #endif
 62 |     for (qword i = 16; i < 80; ++i) {
 63 |         m[i] = (m[i - 3] ^ m[i - 8] ^ m[i - 14] ^ m[i - 16]);
 64 |         m[i] = (m[i] << 1) | (m[i] >> 31);
 65 |     }
 66 | 
 67 |     a = ctx->state[0];
 68 |     b = ctx->state[1];
 69 |     c = ctx->state[2];
 70 |     d = ctx->state[3];
 71 |     e = ctx->state[4];
 72 | 
 73 | #ifdef __NVPTX__
 74 | #pragma unroll
 75 | #endif
 76 |     for (dword i = 0; i < 20; ++i) {
 77 |         t = ROTLEFT(a, 5) + ((b & c) ^ (~b & d)) + e + ctx->k[0] + m[i];
 78 |         e = d;
 79 |         d = c;
 80 |         c = ROTLEFT(b, 30);
 81 |         b = a;
 82 |         a = t;
 83 |     }
 84 | #ifdef __NVPTX__
 85 | #pragma unroll
 86 | #endif
 87 |     for (dword i = 20; i < 40; ++i) {
 88 |         t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[1] + m[i];
 89 |         e = d;
 90 |         d = c;
 91 |         c = ROTLEFT(b, 30);
 92 |         b = a;
 93 |         a = t;
 94 |     }
 95 | 
 96 | #ifdef __NVPTX__
 97 | #pragma unroll
 98 | #endif
 99 |     for (dword i = 40; i < 60; ++i) {
100 |         t = ROTLEFT(a, 5) + ((b & c) ^ (b & d) ^ (c & d)) + e + ctx->k[2] + m[i];
101 |         e = d;
102 |         d = c;
103 |         c = ROTLEFT(b, 30);
104 |         b = a;
105 |         a = t;
106 |     }
107 | 
108 | #ifdef __NVPTX__
109 | #pragma unroll
110 | #endif
111 |     for (dword i = 60; i < 80; ++i) {
112 |         t = ROTLEFT(a, 5) + (b ^ c ^ d) + e + ctx->k[3] + m[i];
113 |         e = d;
114 |         d = c;
115 |         c = ROTLEFT(b, 30);
116 |         b = a;
117 |         a = t;
118 |     }
119 | 
120 |     ctx->state[0] += a;
121 |     ctx->state[1] += b;
122 |     ctx->state[2] += c;
123 |     ctx->state[3] += d;
124 |     ctx->state[4] += e;
125 | }
126 | 
127 | void sha1_update(sha1_ctx *ctx, const byte *data, size_t len) {
128 |     for (size_t i = 0; i < len; ++i) {
129 |         ctx->data[ctx->datalen] = data[i];
130 |         ctx->datalen++;
131 |         if (ctx->datalen == 64) {
132 |             sha1_transform(ctx, ctx->data);
133 |             ctx->bitlen += 512;
134 |             ctx->datalen = 0;
135 |         }
136 |     }
137 | }
138 | 
139 | void sha1_final(sha1_ctx *ctx, byte *hash) {
140 |     dword i = ctx->datalen;
141 | 
142 |     // Pad whatever data is left in the buffer.
143 |     if (ctx->datalen < 56) {
144 |         ctx->data[i++] = 0x80;
145 |         while (i < 56)
146 |             ctx->data[i++] = 0x00;
147 |     } else {
148 |         ctx->data[i++] = 0x80;
149 |         while (i < 64)
150 |             ctx->data[i++] = 0x00;
151 |         sha1_transform(ctx, ctx->data);
152 |         memset(ctx->data, 0, 56);
153 |     }
154 | 
155 |     // Append to the padding the total message's length in bits and transform.
156 |     ctx->bitlen += ctx->datalen * 8;
157 |     ctx->data[63] = ctx->bitlen;
158 |     ctx->data[62] = ctx->bitlen >> 8;
159 |     ctx->data[61] = ctx->bitlen >> 16;
160 |     ctx->data[60] = ctx->bitlen >> 24;
161 |     ctx->data[59] = ctx->bitlen >> 32;
162 |     ctx->data[58] = ctx->bitlen >> 40;
163 |     ctx->data[57] = ctx->bitlen >> 48;
164 |     ctx->data[56] = ctx->bitlen >> 56;
165 |     sha1_transform(ctx, ctx->data);
166 | 
167 |     // Since this implementation uses little endian byte ordering and MD uses big endian,
168 |     // reverse all the bytes when copying the final state to the output hash.
169 |     for (i = 0; i < 4; ++i) {
170 |         hash[i] = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
171 |         hash[i + 4] = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
172 |         hash[i + 8] = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
173 |         hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
174 |         hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
175 |     }
176 | }
177 | 
178 | void kernel_sha1_hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, dword thread) {
179 |     if (thread >= n_batch) {
180 |         return;
181 |     }
182 |     const byte *in = indata + thread * inlen;
183 |     byte *out = outdata + thread * SHA1_BLOCK_SIZE;
184 |     sha1_ctx ctx{};
185 |     sha1_update(&ctx, in, inlen);
186 |     sha1_final(&ctx, out);
187 | }
188 | 
189 | 
190 | namespace hash::internal {
191 |     sycl::event launch_sha1_kernel(sycl::queue &q, sycl::event e, const device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch) {
192 |         auto config = get_kernel_sizes(q, n_batch);
193 |         return q.submit([&](sycl::handler &cgh) {
194 |             cgh.depends_on(e);
195 |             cgh.parallel_for<class sha1_kernel>(
196 |                     sycl::nd_range<1>(sycl::range<1>(config.block) * sycl::range<1>(config.wg_size), sycl::range<1>(config.wg_size)),
197 |                     [=](sycl::nd_item<1> item) {
198 |                         kernel_sha1_hash(indata, inlen, outdata, n_batch, item.get_global_linear_id());
199 |                     });
200 |         });
201 |     }
202 | 
203 | }
204 | 


--------------------------------------------------------------------------------
/examples/hashing/include/tools/sycl_queue_helpers.hpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sycl_queue_helpers.hpp
 20 |  *
 21 |  *  Description:
 22 |  *    Helper functions relating to SYCL queues
 23 |  **************************************************************************/
 24 | #pragma once
 25 | 
 26 | #include <sycl/sycl.hpp>
 27 | #include <iostream>
 28 | #include "../internal/common.hpp"
 29 | 
 30 | #ifdef USING_COMPUTECPP
 31 | class queue_kernel_tester;
 32 | namespace cl::sycl::usm{
 33 |     using cl::sycl::experimental::usm::alloc;
 34 | }
 35 | #endif
 36 | 
 37 | /**
 38 |  * Selects a CUDA device (but returns sometimes an invalid one)
 39 |  */
 40 | class cuda_selector : public sycl::device_selector {
 41 | public:
 42 |     int operator()(const sycl::device &device) const override {
 43 | #if defined(SYCL_IMPLEMENTATION_ONEAPI) || defined(SYCL_IMPLEMENTATION_INTEL)
 44 |         return device.get_platform().get_backend() == sycl::backend::ext_oneapi_cuda && device.get_info<sycl::info::device::is_available>() ? 1 : -1;
 45 | #else
 46 |         return device.is_gpu() && (device.get_info<sycl::info::device::name>().find("NVIDIA") != std::string::npos) ? 1 : -1;
 47 | #endif
 48 |     }
 49 | };
 50 | 
 51 | 
 52 | void queue_tester(sycl::queue &q);
 53 | 
 54 | 
 55 | /**
 56 |  * Tries to get a queue from a selector else returns the host device
 57 |  * @tparam strict if true will check whether the queue can run a trivial task which implied
 58 |  * that the translation unit needs to be compiler with support for the device you're selecting.
 59 |  */
 60 | template<bool strict = true, typename T>
 61 | inline sycl::queue try_get_queue(const T &selector) {
 62 |     auto exception_handler = [](const sycl::exception_list &exceptions) {
 63 |         for (std::exception_ptr const &e: exceptions) {
 64 |             try {
 65 |                 std::rethrow_exception(e);
 66 |             }
 67 |             catch (sycl::exception const &e) {
 68 |                 std::cout << "Caught asynchronous SYCL exception: " << e.what() << std::endl;
 69 |             }
 70 |             catch (std::exception const &e) {
 71 |                 std::cout << "Caught asynchronous STL exception: " << e.what() << std::endl;
 72 |             }
 73 |         }
 74 |     };
 75 | 
 76 |     sycl::device dev;
 77 |     sycl::queue q;
 78 |     try {
 79 |         dev = sycl::device(selector);
 80 |         q = sycl::queue(dev, exception_handler);
 81 | 
 82 |         try {
 83 |             if constexpr (strict) {
 84 |                 if (dev.is_cpu() || dev.is_gpu()) { //Only CPU and GPU not host, dsp, fpga, ?...
 85 |                     queue_tester(q);
 86 |                 }
 87 |             }
 88 |         } catch (...) {
 89 |             std::cerr << "Warning: " << dev.get_info<sycl::info::device::name>() << " found but not working! Fall back on: ";
 90 |             dev = sycl::device(sycl::host_selector());
 91 |             q = sycl::queue(dev, exception_handler);
 92 |             std::cerr << dev.get_info<sycl::info::device::name>() << '\n';
 93 |             return q;
 94 |         }
 95 |     }
 96 |     catch (...) {
 97 | 
 98 |         dev = sycl::device(sycl::host_selector());
 99 |         q = sycl::queue(dev, exception_handler);
100 |         std::cerr << "Warning: Expected device not found! Fall back on: " << dev.get_info<sycl::info::device::name>() << '\n';
101 |     }
102 |     return q;
103 | }
104 | 
105 | #if defined(__linux__) || defined(__APPLE__) || defined(__LINUX__)
106 | 
107 | #include <sys/mman.h>
108 | #include <unistd.h>
109 | 
110 | /**
111 |  * Checks whether a pointer was allocated on the host device as the pointer query is not reliable on DPC++ on the host.
112 |  * @see http://si-head.nl/articles/msync
113 |  * @return Wether the memory was allocated on the host OS.
114 |  */
115 | template<typename T>
116 | inline bool valid_pointer(T *p) {
117 |     // Get page size and calculate page mask
118 |     auto pagesz = (size_t) sysconf(_SC_PAGESIZE);
119 |     size_t pagemask = ~(pagesz - 1);
120 |     // Calculate base address
121 |     void *base = (void *) (((size_t) p) & pagemask);
122 |     return msync(base, sizeof(T), MS_ASYNC) == 0;
123 | }
124 | 
125 | #else
126 | template<typename T>
127 | inline bool valid_pointer(T *p) {
128 |     return false;
129 | }
130 | #endif
131 | 
132 | 
133 | template<typename T, bool debug = false>
134 | inline bool is_ptr_usable([[maybe_unused]] const T *ptr, [[maybe_unused]] const sycl::queue &q) {
135 |     if (q.get_device().is_host()) {
136 |         return valid_pointer(ptr);
137 |     }
138 | 
139 |     try {
140 |         sycl::get_pointer_device(ptr, q.get_context());
141 |         sycl::usm::alloc alloc_type = sycl::get_pointer_type(ptr, q.get_context());
142 |         if constexpr(debug) {
143 |             std::cerr << "Allocated on:" << q.get_device().get_info<sycl::info::device::name>() << " USM type: ";
144 |             switch (alloc_type) {
145 |                 case sycl::usm::alloc::host:
146 |                     std::cerr << "alloc::host" << '\n';
147 |                     break;
148 |                 case sycl::usm::alloc::device:
149 |                     std::cerr << "alloc::device" << '\n';
150 |                     break;
151 |                 case sycl::usm::alloc::shared:
152 |                     std::cerr << "alloc::shared" << '\n';
153 |                     break;
154 |                 case sycl::usm::alloc::unknown:
155 |                     std::cerr << "alloc::unknown" << '\n';
156 |                     break;
157 |             }
158 |         }
159 |         return alloc_type == sycl::usm::alloc::shared // Shared memory is ok
160 |                || alloc_type == sycl::usm::alloc::device // Device memory is ok
161 |                || (alloc_type == sycl::usm::alloc::host && q.get_device().is_cpu()) // We discard host allocated memory because of poor performance unless on the CPU
162 |                 ;
163 |     } catch (...) {
164 |         if constexpr (debug) {
165 |             std::cerr << "Not allocated on:" << q.get_device().get_info<sycl::info::device::name>() << '\n';
166 |         }
167 |         return false;
168 |     }
169 | 
170 | }
171 | 
172 | 
173 | /**
174 |  * Usefull for memory bound computation.
175 |  * Returns CPU devices that represents different numa nodes.
176 |  * @return
177 |  */
178 | /* inline hash::runners get_cpu_runners_numa() {
179 |     try {
180 |         sycl::device d{sycl::cpu_selector{}};
181 |         auto numa_nodes = d.create_sub_devices<sycl::info::partition_property::partition_by_affinity_domain>(sycl::info::partition_affinity_domain::numa);
182 |         hash::runners runners_;
183 |         std::transform(numa_nodes.begin(), numa_nodes.end(), runners_.begin(), [](auto &dev) -> hash::runner { return {try_get_queue(dev), 1}; });
184 |         return runners_;
185 |     }
186 |     catch (...) {
187 |         return {{sycl::queue{sycl::host_selector{}}, 1}};
188 |     }
189 | } */
190 | 


--------------------------------------------------------------------------------
/examples/hashing/src/hash_functions/sha256.cpp:
--------------------------------------------------------------------------------
  1 | /***************************************************************************
  2 |  *
  3 |  *  Copyright (C) Codeplay Software Ltd.
  4 |  *
  5 |  *  Licensed under the Apache License, Version 2.0 (the "License");
  6 |  *  you may not use this file except in compliance with the License.
  7 |  *  You may obtain a copy of the License at
  8 |  *
  9 |  *      http://www.apache.org/licenses/LICENSE-2.0
 10 |  *
 11 |  *  Unless required by applicable law or agreed to in writing, software
 12 |  *  distributed under the License is distributed on an "AS IS" BASIS,
 13 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  *  See the License for the specific language governing permissions and
 15 |  *  limitations under the License.
 16 |  *
 17 |  *  Codeplay's SYCL-For-CUDA-Examples
 18 |  *
 19 |  *  sha256.cpp
 20 |  *
 21 |  *  Description:
 22 |  *    SHA256 hash function
 23 |  **************************************************************************/
 24 | #include <hash_functions/sha256.hpp>
 25 | #include <internal/determine_kernel_config.hpp>
 26 | 
 27 | #include <cstring>
 28 | #include <tools/intrinsics.hpp>
 29 | 
 30 | using namespace usm_smart_ptr;
 31 | 
 32 | /**************************** DATA TYPES ****************************/
 33 | struct sha256_ctx {
 34 |     byte data[64];
 35 |     qword bitlen = 0;
 36 |     dword datalen = 0;
 37 |     dword state[8]{};
 38 | 
 39 |     sha256_ctx() {
 40 |         state[0] = 0x6a09e667;
 41 |         state[1] = 0xbb67ae85;
 42 |         state[2] = 0x3c6ef372;
 43 |         state[3] = 0xa54ff53a;
 44 |         state[4] = 0x510e527f;
 45 |         state[5] = 0x9b05688c;
 46 |         state[6] = 0x1f83d9ab;
 47 |         state[7] = 0x5be0cd19;
 48 |     }
 49 | };
 50 | 
 51 | /****************************** MACROS ******************************/
 52 | #ifndef ROTLEFT
 53 | #define ROTLEFT(a, b) (((a) << (b)) | ((a) >> (32-(b))))
 54 | #endif
 55 | 
 56 | #define ROTRIGHT(a, b) (((a) >> (b)) | ((a) << (32-(b))))
 57 | 
 58 | #define CH(x, y, z) (((x) & (y)) ^ (~(x) & (z)))
 59 | #define MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
 60 | #define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
 61 | #define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
 62 | #define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
 63 | #define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))
 64 | 
 65 | /**************************** VARIABLES *****************************/
 66 | 
 67 | 
 68 | /*********************** FUNCTION DEFINITIONS ***********************/
 69 | static void sha256_transform(sha256_ctx *ctx, const byte *data) {
 70 |     dword a, b, c, d, e, f, g, h, t1, t2, m[64];
 71 | 
 72 |     static const dword consts[64] =
 73 |             {0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1,
 74 |              0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
 75 |              0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786,
 76 |              0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 77 |              0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147,
 78 |              0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
 79 |              0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b,
 80 |              0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 81 |              0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a,
 82 |              0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
 83 |              0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
 84 | 
 85 | #ifdef __NVPTX__
 86 | #pragma unroll
 87 | #endif
 88 |     for (int i = 0, j = 0; i < 16; ++i, j += 4) {
 89 |         m[i] = sbb::upsample(data[j], data[j + 1], data[j + 2], data[j + 3]);
 90 |     }
 91 | 
 92 | #ifdef __NVPTX__
 93 | #pragma unroll
 94 | #endif
 95 |     for (int i = 16; i < 64; ++i) {
 96 |         m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];
 97 |     }
 98 | 
 99 |     a = ctx->state[0];
100 |     b = ctx->state[1];
101 |     c = ctx->state[2];
102 |     d = ctx->state[3];
103 |     e = ctx->state[4];
104 |     f = ctx->state[5];
105 |     g = ctx->state[6];
106 |     h = ctx->state[7];
107 | 
108 | #ifdef __NVPTX__
109 | #pragma unroll
110 | #endif
111 |     for (int i = 0; i < 64; ++i) {
112 |         t1 = h + EP1(e) + CH(e, f, g) + consts[i] + m[i];
113 |         t2 = EP0(a) + MAJ(a, b, c);
114 |         h = g;
115 |         g = f;
116 |         f = e;
117 |         e = d + t1;
118 |         d = c;
119 |         c = b;
120 |         b = a;
121 |         a = t1 + t2;
122 |     }
123 | 
124 |     ctx->state[0] += a;
125 |     ctx->state[1] += b;
126 |     ctx->state[2] += c;
127 |     ctx->state[3] += d;
128 |     ctx->state[4] += e;
129 |     ctx->state[5] += f;
130 |     ctx->state[6] += g;
131 |     ctx->state[7] += h;
132 | }
133 | 
134 | 
135 | static void sha256_update(sha256_ctx *ctx, const byte *data, size_t len) {
136 |     for (dword i = 0; i < len; ++i) {
137 |         ctx->data[ctx->datalen] = data[i];
138 |         ctx->datalen++;
139 |         if (ctx->datalen == 64) {
140 |             sha256_transform(ctx, ctx->data);
141 |             ctx->bitlen += 512;
142 |             ctx->datalen = 0;
143 |         }
144 |     }
145 | }
146 | 
147 | static void sha256_final(sha256_ctx *ctx, byte *hash) {
148 |     dword i = ctx->datalen;
149 |     // Pad whatever data is left in the buffer.
150 |     if (ctx->datalen < 56) {
151 |         ctx->data[i++] = 0x80;
152 |         while (i < 56) {
153 |             ctx->data[i++] = 0x00;
154 |         }
155 | 
156 |     } else {
157 |         ctx->data[i++] = 0x80;
158 |         while (i < 64) {
159 |             ctx->data[i++] = 0x00;
160 |         }
161 |         sha256_transform(ctx, ctx->data);
162 |         std::memset(ctx->data, 0, 56);
163 |     }
164 | 
165 |     // Append to the padding the total message's length in bits and transform.
166 |     ctx->bitlen += ctx->datalen * 8;
167 |     ctx->data[63] = ctx->bitlen;
168 |     ctx->data[62] = ctx->bitlen >> 8;
169 |     ctx->data[61] = ctx->bitlen >> 16;
170 |     ctx->data[60] = ctx->bitlen >> 24;
171 |     ctx->data[59] = ctx->bitlen >> 32;
172 |     ctx->data[58] = ctx->bitlen >> 40;
173 |     ctx->data[57] = ctx->bitlen >> 48;
174 |     ctx->data[56] = ctx->bitlen >> 56;
175 |     sha256_transform(ctx, ctx->data);
176 | 
177 |     // Since this implementation uses little endian byte ordering and SHA uses big endian,
178 |     // reverse all the bytes when copying the final state to the output hash.
179 | #pragma unroll
180 |     for (i = 0; i < 4; ++i) {
181 |         hash[i] = (ctx->state[0] >> (24 - i * 8)) & 0x000000ff;
182 |         hash[i + 4] = (ctx->state[1] >> (24 - i * 8)) & 0x000000ff;
183 |         hash[i + 8] = (ctx->state[2] >> (24 - i * 8)) & 0x000000ff;
184 |         hash[i + 12] = (ctx->state[3] >> (24 - i * 8)) & 0x000000ff;
185 |         hash[i + 16] = (ctx->state[4] >> (24 - i * 8)) & 0x000000ff;
186 |         hash[i + 20] = (ctx->state[5] >> (24 - i * 8)) & 0x000000ff;
187 |         hash[i + 24] = (ctx->state[6] >> (24 - i * 8)) & 0x000000ff;
188 |         hash[i + 28] = (ctx->state[7] >> (24 - i * 8)) & 0x000000ff;
189 |     }
190 | }
191 | 
192 | static void kernel_sha256_hash(const byte *indata, dword inlen, byte *outdata, dword n_batch, dword thread) {
193 |     if (thread >= n_batch) {
194 |         return;
195 |     }
196 |     const byte *in = indata + thread * inlen;
197 |     byte *out = outdata + thread * SHA256_BLOCK_SIZE;
198 |     sha256_ctx ctx{};
199 |     sha256_update(&ctx, in, inlen);
200 |     sha256_final(&ctx, out);
201 | }
202 | 
203 | namespace hash::internal {
204 | 
205 |     sycl::event
206 |     launch_sha256_kernel(sycl::queue &q, sycl::event e, const device_accessible_ptr<byte> indata, device_accessible_ptr<byte> outdata, dword inlen, dword n_batch) {
207 |         auto config = get_kernel_sizes(q, n_batch);
208 |         return q.submit([&](sycl::handler &cgh) {
209 |             cgh.depends_on(e);
210 |             cgh.parallel_for<class sha256_kernel>(
211 |                     sycl::nd_range<1>(sycl::range<1>(config.block) * sycl::range<1>(config.wg_size), sycl::range<1>(config.wg_size)),
212 |                     [=](sycl::nd_item<1> item) {
213 |                         kernel_sha256_hash(indata, inlen, outdata, n_batch, item.get_global_linear_id());
214 |                     });
215 |         });
216 |     }
217 | 
218 | 
219 | }
220 | 


--------------------------------------------------------------------------------