├── .clang-format
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── README.md
├── build.sh
├── requirements.txt
├── run_sample.sh
├── sample
    ├── cuda
    │   ├── bandwidth_test.cu
    │   ├── include
    │   │   ├── exception.h
    │   │   ├── helper_cuda.h
    │   │   ├── helper_functions.h
    │   │   ├── helper_image.h
    │   │   ├── helper_string.h
    │   │   └── helper_timer.h
    │   ├── matrix_mul.cu
    │   └── vector_add.cu
    └── nvml
    │   ├── nvml_example.c
    │   └── supported_vgpus.c
├── src
    ├── common
    │   ├── hook.h
    │   ├── macro_common.h
    │   └── trace_profile.h
    ├── cublas
    │   ├── cublas_hook.cpp
    │   └── cublas_subset.h
    ├── cublasLt
    │   ├── cublasLt_hook.cpp
    │   └── cublasLt_subset.h
    ├── cuda
    │   ├── cuda_hook.cpp
    │   └── cuda_subset.h
    ├── cudart
    │   ├── cudart_hook.cpp
    │   └── cudart_subset.h
    ├── cudnn
    │   ├── cudnn_hook.cpp
    │   └── cudnn_subset.h
    ├── cufft
    │   ├── cufft_hook.cpp
    │   └── cufft_subset.h
    ├── curand
    │   ├── curand_hook.cpp
    │   └── curand_subset.h
    ├── cusolver
    │   ├── cusolver_hook.cpp
    │   └── cusolver_subset.h
    ├── cusparse
    │   ├── cusparse_hook.cpp
    │   └── cusparse_subset.h
    ├── nvblas
    │   └── nvblas_hook.cpp
    ├── nvjpeg
    │   ├── nvjpeg_hook.cpp
    │   └── nvjpeg_subset.h
    ├── nvml
    │   ├── nvml_hook.cpp
    │   └── nvml_subset.h
    ├── nvrtc
    │   ├── nvrtc_hook.cpp
    │   └── nvrtc_subset.h
    └── nvtx
    │   ├── nvtx_hook.cpp
    │   └── nvtx_subset.h
└── tools
    └── code_generate
        ├── code_generate.py
        ├── code_generate.sh
        └── include
            ├── cublas.h
            ├── cublasLt.h
            ├── cuda.h
            ├── cuda_runtime_api.h
            ├── cudnn.h
            ├── cufft.h
            ├── curand.h
            ├── cusolver_common.h
            ├── cusparse.h
            ├── nvToolsExt.h
            ├── nvblas.h
            ├── nvjpeg.h
            ├── nvml.h
            └── nvrtc.h


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: Google
 2 | 
 3 | IndentWidth: 4
 4 | 
 5 | TabWidth: 4
 6 | 
 7 | UseTab: Never
 8 | 
 9 | IndentCaseLabels: true
10 | 
11 | IndentWrappedFunctionNames: true
12 | 
13 | ColumnLimit: 120
14 | 
15 | AccessModifierOffset: -4
16 | 
17 | AllowShortFunctionsOnASingleLine: Empty
18 | 
19 | AllowShortIfStatementsOnASingleLine: false
20 | 
21 | AllowShortLoopsOnASingleLine: false
22 | 
23 | AllowShortBlocksOnASingleLine: false
24 | 
25 | AllowShortCaseLabelsOnASingleLine: false
26 | 
27 | KeepEmptyLinesAtTheStartOfBlocks: true
28 | 
29 | MaxEmptyLinesToKeep: 1
30 | 
31 | DerivePointerAlignment: false
32 | 
33 | PointerAlignment: Right
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | .VSCodeCounter/
 3 | .idea/
 4 | 
 5 | __pycache__/
 6 | go/
 7 | 
 8 | *~/
 9 | build/
10 | install/
11 | release/
12 | output/
13 | bin/
14 | log/
15 | model/
16 | ncu/
17 | nsys/
18 | roofline/
19 | ptx/
20 | sass/
21 | tmp/
22 | temp/
23 | 
24 | *.o
25 | *.so
26 | *.so.*
27 | *.out
28 | *.log
29 | *.bak
30 | *.pkz
31 | 
32 | setting.h
33 | .config*
34 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Copyright 2022. All Rights Reserved.
  2 | # Author: Bruce-Lee-LY
  3 | # Date: 16:19:40 on Sun, May 29, 2022
  4 | #
  5 | # Description: cmake for cuda hook
  6 | 
  7 | cmake_minimum_required (VERSION 3.12)
  8 | 
  9 | project (cuda_hook LANGUAGES C CXX)
 10 | 
 11 | set (CMAKE_VERBOSE_MAKEFILE ${HOOK_VERBOSE_MAKEFILE})  # HOOK_VERBOSE_MAKEFILE is expected from the command line (-D)
 12 | 
 13 | set (CMAKE_C_VISIBILITY_PRESET hidden)  # default for targets created below: symbols hidden unless exported
 14 | set (CMAKE_CXX_VISIBILITY_PRESET hidden)
 15 | set (CMAKE_POSITION_INDEPENDENT_CODE ON)  # default for targets created below, static library included
 16 | 
 17 | set (CMAKE_C_FLAGS "-std=c11")  # NOTE(review): replaces, rather than extends, any pre-set CMAKE_C_FLAGS
 18 | set (CMAKE_C_FLAGS_DEBUG "$ENV{CFLAGS} -O0 -g2 -ggdb -DHOOK_BUILD_DEBUG")  # CFLAGS is read at configure time
 19 | set (CMAKE_C_FLAGS_RELEASE "$ENV{CFLAGS} -O3")  # replaces CMake's default release flags
 20 | 
 21 | set (CMAKE_CXX_FLAGS "-std=c++11")  # NOTE(review): replaces, rather than extends, any pre-set CMAKE_CXX_FLAGS
 22 | set (CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb -DHOOK_BUILD_DEBUG")  # CXXFLAGS is read at configure time
 23 | set (CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3")  # replaces CMake's default release flags
 24 | 
 25 | set (CMAKE_SHARED_LINKER_FLAGS "-s -Wl,--exclude-libs,ALL")  # -s is applied to shared libraries in every build type
 26 | set (CMAKE_EXE_LINKER_FLAGS "-Wl,--as-needed")
 27 | 
 28 | add_compile_options (  # directory-scoped: applies to every target defined in this file
 29 |     -Wall
 30 |     -Werror
 31 |     -Wextra
 32 |     -Wswitch-default
 33 |     -Wfloat-equal
 34 |     -Wshadow
 35 |     -Wcast-qual
 36 | )
 37 | 
 38 | include_directories (
 39 |     ${PROJECT_SOURCE_DIR}/src/common
 40 |     ${PROJECT_SOURCE_DIR}/src/cuda
 41 |     ${PROJECT_SOURCE_DIR}/src/nvml
 42 |     ${PROJECT_SOURCE_DIR}/src/cudart
 43 |     ${PROJECT_SOURCE_DIR}/src/cudnn
 44 |     ${PROJECT_SOURCE_DIR}/src/cublas
 45 |     ${PROJECT_SOURCE_DIR}/src/cublasLt
 46 |     ${PROJECT_SOURCE_DIR}/src/cufft
 47 |     ${PROJECT_SOURCE_DIR}/src/nvtx
 48 |     ${PROJECT_SOURCE_DIR}/src/nvrtc
 49 |     ${PROJECT_SOURCE_DIR}/src/curand
 50 |     ${PROJECT_SOURCE_DIR}/src/cusparse
 51 |     ${PROJECT_SOURCE_DIR}/src/cusolver
 52 |     ${PROJECT_SOURCE_DIR}/src/nvjpeg
 53 |     ${PROJECT_SOURCE_DIR}/src/nvblas
 54 | )
 55 | # CONFIGURE_DEPENDS re-checks the glob at build time, so a hook source added under src/ is picked up automatically.
 56 | file (GLOB HOOK_SRCS CONFIGURE_DEPENDS
 57 |     ${PROJECT_SOURCE_DIR}/src/*/*.cpp
 58 | )
 59 | 
 60 | # libcuda_hook.so: shared hook library; CMAKE_DL_LIBS is the platform's dlopen/dlsym library (dl on Linux)
 61 | add_library (cuda_hook SHARED ${HOOK_SRCS})
 62 | target_link_libraries (cuda_hook PRIVATE ${CMAKE_DL_LIBS})
 63 | 
 64 | install (TARGETS cuda_hook LIBRARY DESTINATION lib64)
 65 | 
 66 | # libcuda_hook.a: static variant built from the same sources, installed under the same base name
 67 | add_library (cuda_hook_static STATIC ${HOOK_SRCS})
 68 | target_link_libraries (cuda_hook_static PRIVATE ${CMAKE_DL_LIBS})
 69 | set_target_properties (cuda_hook_static PROPERTIES OUTPUT_NAME cuda_hook)
 70 | 
 71 | install (TARGETS cuda_hook_static ARCHIVE DESTINATION lib64)
 72 | 
 73 | if (HOOK_WITH_SAMPLE)
 74 |     find_package (CUDA REQUIRED)
 75 |     unset (CUDA_USE_STATIC_CUDA_RUNTIME CACHE)
 76 |     option (CUDA_USE_STATIC_CUDA_RUNTIME "Link the samples against the static CUDA runtime" OFF)
 77 | 
 78 |     set (CUDA_VERBOSE_BUILD ${HOOK_VERBOSE_MAKEFILE})
 79 |     set (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
 80 |     if ("${CMAKE_BUILD_TYPE}" MATCHES "Debug")  # quoted: an empty build type must not leave if() without an operand
 81 |         set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CMAKE_CUDA_ARCHITECTURES} -g -lineinfo -O0")
 82 |     else ()
 83 |         set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES} --use_fast_math -O3")
 84 |     endif ()
 85 | 
 86 |     set (SYS_CUDART_PATH "/usr/local/cuda")
 87 |     set (SYS_CUDA_DRIVER_PATH "/usr/lib/x86_64-linux-gnu")
 88 | 
 89 |     include_directories (
 90 |         ${SYS_CUDART_PATH}/include
 91 |         ${PROJECT_SOURCE_DIR}/sample/cuda/include
 92 |     )
 93 | 
 94 |     link_directories (
 95 |         ${SYS_CUDART_PATH}/lib64
 96 |         ${SYS_CUDA_DRIVER_PATH}
 97 |     )
 98 | 
 99 |     # cuda samples, one executable per .cu file
100 |     cuda_add_executable (bandwidth_test ${PROJECT_SOURCE_DIR}/sample/cuda/bandwidth_test.cu)
101 |     cuda_add_executable (matrix_mul ${PROJECT_SOURCE_DIR}/sample/cuda/matrix_mul.cu)
102 |     cuda_add_executable (vector_add ${PROJECT_SOURCE_DIR}/sample/cuda/vector_add.cu)
103 | 
104 |     install (TARGETS bandwidth_test matrix_mul vector_add RUNTIME DESTINATION sample/cuda)
105 | 
106 |     # nvml samples, plain C linked against libnvidia-ml
107 |     add_executable (nvml_example ${PROJECT_SOURCE_DIR}/sample/nvml/nvml_example.c)
108 |     target_link_libraries (nvml_example -lnvidia-ml)
109 |     add_executable (supported_vgpus ${PROJECT_SOURCE_DIR}/sample/nvml/supported_vgpus.c)
110 |     target_link_libraries (supported_vgpus -lnvidia-ml)
111 | 
112 |     install (TARGETS nvml_example supported_vgpus RUNTIME DESTINATION sample/nvml)
113 | endif ()
114 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Bruce-Lee-LY
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CUDA Hook
 2 | Hooks CUDA-related dynamic libraries by using automated code generation tools. With it, you can easily trace the CUDA APIs called by a CUDA program, and you can also hijack those CUDA APIs to insert custom logic.
 3 | 
 4 | It implements an ingenious tool that automatically generates, from the native CUDA header files, the code that hooks the CUDA API, and it is extremely practical and extensible.
 5 | 
 6 | At present, the hooking of dynamic libraries such as cuda driver, nvml, cuda runtime, cudnn, cublas, cublasLt, cufft, nvtx, nvrtc, curand, cusparse, cusolver, nvjpeg and nvblas has been completed, and it can also be easily extended to the hooking of other cuda dynamic libraries.
 7 | 
 8 | # Supported Dynamic Libraries
 9 | - CUDA Driver: libcuda.so
10 | - NVML: libnvidia-ml.so
11 | - CUDA Runtime: libcudart.so
12 | - CUDNN: libcudnn.so
13 | - CUBLAS: libcublas.so
14 | - CUBLASLT: libcublasLt.so
15 | - CUFFT: libcufft.so
16 | - NVTX: libnvToolsExt.so
17 | - NVRTC: libnvrtc.so
18 | - CURAND: libcurand.so
19 | - CUSPARSE: libcusparse.so
20 | - CUSOLVER: libcusolver.so
21 | - NVJPEG: libnvjpeg.so
22 | - NVBLAS: libnvblas.so
23 | 
24 | # Compile
25 | ## Environment
26 | - OS: Linux
27 | - CMake Version: >= 3.12
28 | - GCC Version: >= 4.8
29 | - CUDA Version: 11.4 (best)
30 | - CUDA Driver Version: 470.129.06 (best)
31 | - CUDNN Version: 7.6.5 (best)
32 | 
33 | ## Clone
34 | ```
35 | git clone https://github.com/Bruce-Lee-LY/cuda_hook.git
36 | ```
37 | 
38 | ## Build
39 | ### GTX1080Ti
40 | ```
41 | cd cuda_hook
42 | ./build.sh -a 61 -t Release -s ON -b OFF
43 | ./build.sh -a 61 -t Debug -s OFF -b ON
44 | ```
45 | 
46 | ### Tesla V100
47 | ```
48 | cd cuda_hook
49 | ./build.sh -a 70 -t Release -s ON -b OFF
50 | ./build.sh -a 70 -t Debug -s OFF -b ON
51 | ```
52 | 
53 | ### RTX2080Ti
54 | ```
55 | cd cuda_hook
56 | ./build.sh -a 75 -t Release -s ON -b OFF
57 | ./build.sh -a 75 -t Debug -s OFF -b ON
58 | ```
59 | 
60 | ### NVIDIA A100
61 | ```
62 | cd cuda_hook
63 | ./build.sh -a 80 -t Release -s ON -b OFF
64 | ./build.sh -a 80 -t Debug -s OFF -b ON
65 | ```
66 | 
67 | ### RTX3080Ti / RTX3090 / RTX A6000
68 | ```
69 | cd cuda_hook
70 | ./build.sh -a 86 -t Release -s ON -b OFF
71 | ./build.sh -a 86 -t Debug -s OFF -b ON
72 | ```
73 | 
74 | # Run Sample
75 | ```
76 | ./run_sample.sh
77 | ```
78 | 
79 | # Tools
80 | ## Code Generate
81 | Use CUDA native header files to automatically generate code that hooks CUDA API.
82 | ```
83 | cd tools/code_generate
84 | ./code_generate.sh
85 | ```
86 | 


--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # Copyright 2022. All Rights Reserved.
  3 | # Author: Bruce-Lee-LY
  4 | # Date: 16:19:40 on Sun, May 29, 2022
  5 | #
  6 | # Description: compile script
  7 | 
  8 | # Abort on the first failing command, on use of an unset variable and on a failure anywhere in a pipeline.
  9 | set -euo pipefail
 10 | 
 11 | echo "========== build enter =========="
 12 | 
 13 | WORK_PATH=$(cd "$(dirname "$0")" && pwd) && cd "$WORK_PATH"
 14 | 
 15 | CUDA_ARCHITECTURE=86 # a: (Tesla P100: 60, GTX1080Ti: 61, Tesla V100: 70, RTX2080Ti: 75, NVIDIA A100: 80, RTX3080Ti / RTX3090 / RTX A6000: 86, RTX4090: 89, NVIDIA H100: 90)
 16 | BUILD_TYPE=Debug # t: (Debug, Release)
 17 | WITH_SAMPLE=ON # s: (ON, OFF)
 18 | VERBOSE_MAKEFILE=OFF # b: (ON, OFF)
 19 | 
 20 | while getopts ":a:t:s:b:" opt
 21 | do
 22 |     case $opt in
 23 |         a)
 24 |         CUDA_ARCHITECTURE=$OPTARG
 25 |         echo "CUDA_ARCHITECTURE: $CUDA_ARCHITECTURE"
 26 |         ;;
 27 |         t)
 28 |         BUILD_TYPE=$OPTARG
 29 |         echo "BUILD_TYPE: $BUILD_TYPE"
 30 |         ;;
 31 |         s)
 32 |         WITH_SAMPLE=$OPTARG
 33 |         echo "WITH_SAMPLE: $WITH_SAMPLE"
 34 |         ;;
 35 |         b)
 36 |         VERBOSE_MAKEFILE=$OPTARG
 37 |         echo "VERBOSE_MAKEFILE: $VERBOSE_MAKEFILE"
 38 |         ;;
 39 |         ?)
 40 |         echo "invalid param: $OPTARG"
 41 |         exit 1
 42 |         ;;
 43 |     esac
 44 | done
 45 | 
 46 | echo_cmd() {  # Print the command line given as $1, then run it in the current shell.
 47 |     echo $1  # unquoted: the string is word-split (and glob-expanded) before printing
 48 |     $1  # unquoted: split into words and run as a simple command, so "cd build" changes this script's directory
 49 | }
 50 | 
 51 | echo "========== build cuda_hook =========="
 52 | 
 53 | echo_cmd "rm -rf build output"  # always start from a clean build tree and install prefix
 54 | echo_cmd "mkdir build"
 55 | 
 56 | echo_cmd "cd build"  # every later command in this script runs from inside build/
 57 | echo_cmd "cmake -DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCHITECTURE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DHOOK_WITH_SAMPLE=$WITH_SAMPLE -DHOOK_VERBOSE_MAKEFILE=$VERBOSE_MAKEFILE -DCMAKE_INSTALL_PREFIX=$WORK_PATH/output -DCMAKE_SKIP_RPATH=ON .."
 58 | echo_cmd "make -j$(nproc --ignore=2)"  # nproc --ignore=2 leaves up to two processing units unused
 59 | echo_cmd "make install"  # installs into $WORK_PATH/output (CMAKE_INSTALL_PREFIX above)
 60 | 
 61 | echo "========== create soft link =========="
 62 | 
 63 | # cuda: each pair below links <versioned name> -> libcuda_hook.so and <unversioned name> -> <versioned name>
 64 | echo_cmd "ln -s libcuda_hook.so libcuda.so.1"
 65 | echo_cmd "ln -s libcuda.so.1 libcuda.so"
 66 | 
 67 | # nvml
 68 | echo_cmd "ln -s libcuda_hook.so libnvidia-ml.so.1"
 69 | echo_cmd "ln -s libnvidia-ml.so.1 libnvidia-ml.so"
 70 | 
 71 | # cudart
 72 | echo_cmd "ln -s libcuda_hook.so libcudart.so.11.0"
 73 | echo_cmd "ln -s libcudart.so.11.0 libcudart.so"
 74 | 
 75 | # cudnn
 76 | echo_cmd "ln -s libcuda_hook.so libcudnn.so.7"
 77 | echo_cmd "ln -s libcudnn.so.7 libcudnn.so"
 78 | 
 79 | # cublas
 80 | echo_cmd "ln -s libcuda_hook.so libcublas.so.11"
 81 | echo_cmd "ln -s libcublas.so.11 libcublas.so"
 82 | 
 83 | # cublasLt
 84 | echo_cmd "ln -s libcuda_hook.so libcublasLt.so.11"
 85 | echo_cmd "ln -s libcublasLt.so.11 libcublasLt.so"
 86 | 
 87 | # cufft
 88 | echo_cmd "ln -s libcuda_hook.so libcufft.so.10"
 89 | echo_cmd "ln -s libcufft.so.10 libcufft.so"
 90 | 
 91 | # nvtx
 92 | echo_cmd "ln -s libcuda_hook.so libnvToolsExt.so.1"
 93 | echo_cmd "ln -s libnvToolsExt.so.1 libnvToolsExt.so"
 94 | 
 95 | # nvrtc
 96 | echo_cmd "ln -s libcuda_hook.so libnvrtc.so.11.2"
 97 | echo_cmd "ln -s libnvrtc.so.11.2 libnvrtc.so"
 98 | 
 99 | # curand
100 | echo_cmd "ln -s libcuda_hook.so libcurand.so.10"
101 | echo_cmd "ln -s libcurand.so.10 libcurand.so"
102 | 
103 | # cusparse
104 | echo_cmd "ln -s libcuda_hook.so libcusparse.so.11"
105 | echo_cmd "ln -s libcusparse.so.11 libcusparse.so"
106 | 
107 | # cusolver
108 | echo_cmd "ln -s libcuda_hook.so libcusolver.so.11"
109 | echo_cmd "ln -s libcusolver.so.11 libcusolver.so"
110 | 
111 | # nvjpeg
112 | echo_cmd "ln -s libcuda_hook.so libnvjpeg.so.11"
113 | echo_cmd "ln -s libnvjpeg.so.11 libnvjpeg.so"
114 | 
115 | # nvblas
116 | echo_cmd "ln -s libcuda_hook.so libnvblas.so.11"
117 | echo_cmd "ln -s libnvblas.so.11 libnvblas.so"
118 | 
119 | echo_cmd "cp -d *.so *.so.* $WORK_PATH/output/lib64"  # -d copies the symlinks as symlinks instead of following them
120 | 
121 | echo "========== build info =========="
122 | 
123 | BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")  # fallback: a build without git metadata must not abort under set -e
124 | COMMIT=$(git rev-parse HEAD 2>/dev/null || echo "unknown")
125 | GCC_VERSION=$(gcc -dumpversion)
126 | COMPILE_TIME=$(date "+%H:%M:%S %Y-%m-%d")
127 | 
128 | echo "branch: $BRANCH" >> "$WORK_PATH/output/hook_version"
129 | echo "commit: $COMMIT" >> "$WORK_PATH/output/hook_version"
130 | echo "gcc_version: $GCC_VERSION" >> "$WORK_PATH/output/hook_version"
131 | echo "compile_time: $COMPILE_TIME" >> "$WORK_PATH/output/hook_version"
132 | 
133 | echo "========== build exit =========="
134 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | robotpy-cppheaderparser==5.0.16


--------------------------------------------------------------------------------
/run_sample.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2022. All Rights Reserved.
 3 | # Author: Bruce-Lee-LY
 4 | # Date: 22:25:53 on Sun, May 29, 2022
 5 | #
 6 | # Description: run sample script
 7 | 
 8 | # Abort on the first failing command, on use of an unset variable and on a failure anywhere in a pipeline.
 9 | set -euo pipefail
10 | 
11 | WORK_PATH=$(cd "$(dirname "$0")" && pwd) && cd "$WORK_PATH"
12 | export LD_LIBRARY_PATH="$WORK_PATH/output/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/lib/x86_64-linux-gnu"
13 | 
14 | rm -rf log && mkdir -p log/sample/cuda log/sample/nvml
15 | 
16 | # cuda/cudart: each sample is started in the background with its output captured under log/
17 | nohup "$WORK_PATH/output/sample/cuda/bandwidth_test" > log/sample/cuda/bandwidth_test.log 2>&1 &
18 | nohup "$WORK_PATH/output/sample/cuda/matrix_mul" > log/sample/cuda/matrix_mul.log 2>&1 &
19 | nohup "$WORK_PATH/output/sample/cuda/vector_add" > log/sample/cuda/vector_add.log 2>&1 &
20 | 
21 | # nvml
22 | nohup "$WORK_PATH/output/sample/nvml/nvml_example" > log/sample/nvml/nvml_example.log 2>&1 &
23 | nohup "$WORK_PATH/output/sample/nvml/supported_vgpus" > log/sample/nvml/supported_vgpus.log 2>&1 &
24 | 


--------------------------------------------------------------------------------
/sample/cuda/include/exception.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:17:08 on Sun, May 29, 2022
  4 | //
  5 | // Description: source file in /usr/local/cuda/samples/common/inc
  6 | 
  7 | /* CUda UTility Library */
  8 | #ifndef COMMON_EXCEPTION_H_
  9 | #define COMMON_EXCEPTION_H_
 10 | 
 11 | // includes, system
 12 | #include <stdlib.h>
 13 | 
 14 | #include <exception>
 15 | #include <iostream>
 16 | #include <stdexcept>
 17 | #include <string>
 18 | 
 19 | //! Exception wrapper that can only be created (and thrown) through throw_it().
 20 | //! @param Std_Exception Exception out of namespace std for easy typing.
 21 | template <class Std_Exception>
 22 | class Exception : public Std_Exception {
 23 | public:
 24 |     //! @brief Static construction interface
 25 |     //! @return Always throws (Exception<Std_Exception>), never returns
 26 |     //! @param file file in which the Exception occurs
 27 |     //! @param line line in which the Exception occurs
 28 |     //! @param detailed details on the code fragment causing the Exception
 29 |     static void throw_it(const char *file, const int line, const char *detailed = "-");
 30 | 
 31 |     //! @brief Static construction interface, std::string overload
 32 |     //! @return Always throws (Exception<Std_Exception>), never returns
 33 |     //! @param file file in which the Exception occurs
 34 |     //! @param line line in which the Exception occurs
 35 |     //! @param detailed details on the code fragment causing the Exception
 36 |     static void throw_it(const char *file, const int line, const std::string &detailed);
 37 | 
 38 |     //! Destructor
 39 |     virtual ~Exception() throw();
 40 | 
 41 | private:
 42 |     //! Constructor, default (private)
 43 |     Exception();
 44 | 
 45 |     //! Constructor, standard (private)
 46 |     //! @param str string returned by what()
 47 |     explicit Exception(const std::string &str);
 48 | };
 49 | 
 50 | ////////////////////////////////////////////////////////////////////////////////
 51 | //! Exception handler for arbitrary exceptions: prints ex.what() to std::cerr, then exits with EXIT_FAILURE
 52 | //! @param ex exception to handle; its type must provide what()
 53 | ////////////////////////////////////////////////////////////////////////////////
 54 | template <class Exception_Typ>
 55 | inline void handleException(const Exception_Typ &ex) {
 56 |     std::cerr << ex.what() << std::endl;
 57 | 
 58 |     exit(EXIT_FAILURE);
 59 | }
 60 | 
 61 | //! Convenience macros
 62 | 
 63 | //! Exception caused by dynamic program behavior, e.g. file does not exist
 64 | #define RUNTIME_EXCEPTION(msg) Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
 65 | 
 66 | //! Logic exception in program, e.g. an assert failed
 67 | #define LOGIC_EXCEPTION(msg) Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
 68 | 
 69 | //! Out of range exception
 70 | #define RANGE_EXCEPTION(msg) Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
 71 | 
 72 | ////////////////////////////////////////////////////////////////////////////////
 73 | //! Implementation
 74 | 
 75 | // includes, system
 76 | #include <sstream>
 77 | 
 78 | ////////////////////////////////////////////////////////////////////////////////
 79 | //! Static construction interface: formats file, line and the detailed
 80 | //! description into the what() message and throws; never returns normally.
 81 | ////////////////////////////////////////////////////////////////////////////////
 82 | /*static*/ template <class Std_Exception>
 83 | void Exception<Std_Exception>::throw_it(const char *file, const int line, const char *detailed) {
 84 |     std::stringstream s;
 85 | 
 86 |     // Quite heavy-weight but exceptions are not for
 87 |     // performance / release versions
 88 |     s << "Exception in file '" << file << "' in line " << line << "\n"
 89 |       << "Detailed description: " << detailed << "\n";
 90 | 
 91 |     throw Exception(s.str());
 92 | }
 93 | 
 94 | ////////////////////////////////////////////////////////////////////////////////
 95 | //! Static construction interface, std::string overload: forwards to the
 96 | //! const char * overload with msg.c_str(); never returns normally.
 97 | ////////////////////////////////////////////////////////////////////////////////
 98 | /*static*/ template <class Std_Exception>
 99 | void Exception<Std_Exception>::throw_it(const char *file, const int line, const std::string &msg) {
100 |     throw_it(file, line, msg.c_str());
101 | }
102 | 
103 | ////////////////////////////////////////////////////////////////////////////////
104 | //! Constructor, default (private): what() reports "Unknown Exception.\n".
105 | ////////////////////////////////////////////////////////////////////////////////
106 | template <class Std_Exception>
107 | Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
108 | 
109 | ////////////////////////////////////////////////////////////////////////////////
110 | //! Constructor, standard (private).
111 | //! @param s string handed to the Std_Exception base, returned by what().
112 | ////////////////////////////////////////////////////////////////////////////////
113 | template <class Std_Exception>
114 | Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
115 | 
116 | ////////////////////////////////////////////////////////////////////////////////
117 | //! Destructor (empty, declared non-throwing)
118 | ////////////////////////////////////////////////////////////////////////////////
119 | template <class Std_Exception>
120 | Exception<Std_Exception>::~Exception() throw() {}
121 | 
122 | // functions, exported
123 | 
124 | #endif  // COMMON_EXCEPTION_H_
125 | 


--------------------------------------------------------------------------------
/sample/cuda/include/helper_functions.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2022. All Rights Reserved.
 2 | // Author: Bruce-Lee-LY
 3 | // Date: 22:17:08 on Sun, May 29, 2022
 4 | //
 5 | // Description: source file in /usr/local/cuda/samples/common/inc
 6 | 
 7 | // These are helper functions for the SDK samples (string parsing,
 8 | // timers, image helpers, etc)
 9 | #ifndef COMMON_HELPER_FUNCTIONS_H_
10 | #define COMMON_HELPER_FUNCTIONS_H_
11 | 
12 | #ifdef WIN32
13 | #pragma warning(disable : 4996)
14 | #endif
15 | 
16 | // includes, project
17 | #include <assert.h>
18 | #include <exception.h>
19 | #include <math.h>
20 | #include <stdio.h>
21 | #include <stdlib.h>
22 | 
23 | #include <algorithm>
24 | #include <fstream>
25 | #include <iostream>
26 | #include <string>
27 | #include <vector>
28 | 
29 | // includes, timer, string parsing, image helpers
30 | #include <helper_image.h>   // helper functions for image compare, dump, data comparisons
31 | #include <helper_string.h>  // helper functions for string parsing
32 | #include <helper_timer.h>   // helper functions for timers
33 | 
34 | #ifndef EXIT_WAIVED
35 | #define EXIT_WAIVED 2
36 | #endif
37 | 
38 | #endif  // COMMON_HELPER_FUNCTIONS_H_
39 | 


--------------------------------------------------------------------------------
/sample/cuda/include/helper_timer.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:17:08 on Sun, May 29, 2022
  4 | //
  5 | // Description: source file in /usr/local/cuda/samples/common/inc
  6 | 
  7 | // Helper Timing Functions
  8 | #ifndef COMMON_HELPER_TIMER_H_
  9 | #define COMMON_HELPER_TIMER_H_
 10 | 
 11 | #ifndef EXIT_WAIVED
 12 | #define EXIT_WAIVED 2
 13 | #endif
 14 | 
 15 | // includes, system
 16 | #include <vector>
 17 | 
 18 | // includes, project
 19 | #include <exception.h>
 20 | 
 21 | // Definition of the StopWatch interface. It is used when a self-contained
 22 | // class interface is wanted instead of the CUT functions.
 23 | class StopWatchInterface {
 24 | public:
 25 |     StopWatchInterface() {}
 26 |     virtual ~StopWatchInterface() {}
 27 | 
 28 | public:
 29 |     //! Start time measurement
 30 |     virtual void start() = 0;
 31 | 
 32 |     //! Stop time measurement
 33 |     virtual void stop() = 0;
 34 | 
 35 |     //! Reset time counters to zero
 36 |     virtual void reset() = 0;
 37 | 
 38 |     //! Time in msec. after start. If the stop watch is still running (i.e. there
 39 |     //! was no call to stop()) then the elapsed time is returned, otherwise the
 40 |     //! time between the last start() and stop call is returned
 41 |     virtual float getTime() = 0;
 42 | 
 43 |     //! Mean time to date based on the number of times the stopwatch has been
 44 |     //! _stopped_ (i.e. finished sessions) and the current total time
 45 |     virtual float getAverageTime() = 0;
 46 | };
 47 | 
 48 | //////////////////////////////////////////////////////////////////
 49 | // Begin Stopwatch timer class definitions for all OS platforms //
 50 | //////////////////////////////////////////////////////////////////
 51 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 52 | // includes, system (WIN32_LEAN_AND_MEAN is the macro <windows.h> checks to skip rarely used APIs)
 53 | #define WIN32_LEAN_AND_MEAN
 54 | #include <windows.h>
 55 | #undef min
 56 | #undef max
 57 | 
 58 | //! Windows specific implementation of StopWatch, based on the performance counter
 59 | class StopWatchWin : public StopWatchInterface {
 60 | public:
 61 |     //! Constructor, default: zeroes all counters and queries the tick frequency
 62 |     StopWatchWin()
 63 |         : start_time(),
 64 |           end_time(),
 65 |           diff_time(0.0f),
 66 |           total_time(0.0f),
 67 |           running(false),
 68 |           clock_sessions(0),
 69 |           freq(0),
 70 |           freq_set(false) {
 71 |         if (!freq_set) {
 72 |             // helper variable
 73 |             LARGE_INTEGER temp;
 74 | 
 75 |             // get the tick frequency from the OS
 76 |             QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&temp));
 77 | 
 78 |             // convert to type in which it is needed (divided by 1000: ticks per millisecond)
 79 |             freq = (static_cast<double>(temp.QuadPart)) / 1000.0;
 80 | 
 81 |             // remember query
 82 |             freq_set = true;
 83 |         }
 84 |     }
 85 | 
 86 |     // Destructor
 87 |     ~StopWatchWin() {}
 88 | 
 89 | public:
 90 |     //! Start time measurement
 91 |     inline void start();
 92 | 
 93 |     //! Stop time measurement
 94 |     inline void stop();
 95 | 
 96 |     //! Reset time counters to zero
 97 |     inline void reset();
 98 | 
 99 |     //! Time in msec. after start. If the stop watch is still running (i.e. there
100 |     //! was no call to stop()) then the elapsed time is returned, otherwise the
101 |     //! time between the last start() and stop call is returned
102 |     inline float getTime();
103 | 
104 |     //! Mean time to date based on the number of times the stopwatch has been
105 |     //! _stopped_ (i.e. finished sessions) and the current total time
106 |     inline float getAverageTime();
107 | 
108 | private:
109 |     // member variables
110 | 
111 |     //! Start of measurement
112 |     LARGE_INTEGER start_time;
113 |     //! End of measurement
114 |     LARGE_INTEGER end_time;
115 | 
116 |     //! Time difference between the last start and stop
117 |     float diff_time;
118 | 
119 |     //! TOTAL time difference between starts and stops
120 |     float total_time;
121 | 
122 |     //! flag if the stop watch is running
123 |     bool running;
124 | 
125 |     //! Number of times clock has been started
126 |     //! and stopped to allow averaging
127 |     int clock_sessions;
128 | 
129 |     //! tick frequency
130 |     double freq;
131 | 
132 |     //! flag if the frequency has been set
133 |     bool freq_set;
134 | };
135 | 
136 | // functions, inlined
137 | 
138 | ////////////////////////////////////////////////////////////////////////////////
139 | //! Start time measurement: capture the current counter and mark as running
140 | ////////////////////////////////////////////////////////////////////////////////
141 | inline void StopWatchWin::start() {
142 |     QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
143 |     running = true;
144 | }
145 | 
146 | ////////////////////////////////////////////////////////////////////////////////
147 | //! Stop time measurement, store the elapsed time of this session in diff_time
148 | //! and add it to total_time. Also increment the number of completed sessions.
149 | ////////////////////////////////////////////////////////////////////////////////
150 | inline void StopWatchWin::stop() {
151 |     QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&end_time));
152 |     diff_time = static_cast<float>(
153 |         ((static_cast<double>(end_time.QuadPart) - static_cast<double>(start_time.QuadPart)) / freq));
154 | 
155 |     total_time += diff_time;
156 |     clock_sessions++;
157 |     running = false;
158 | }
159 | 
160 | ////////////////////////////////////////////////////////////////////////////////
161 | //! Reset diff_time, total_time and clock_sessions to 0. Does not change the
162 | //! running state but recaptures the current start time if it is running.
163 | ////////////////////////////////////////////////////////////////////////////////
164 | inline void StopWatchWin::reset() {
165 |     diff_time = 0;
166 |     total_time = 0;
167 |     clock_sessions = 0;
168 | 
169 |     if (running) {
170 |         QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&start_time));
171 |     }
172 | }
173 | 
174 | ////////////////////////////////////////////////////////////////////////////////
175 | //! Time in msec. If the stop watch is still running (i.e. there was no call
176 | //! to stop()) the time elapsed since the last start() is added to the
177 | //! accumulated total_time, otherwise the accumulated total_time alone
178 | //! is returned.
179 | ////////////////////////////////////////////////////////////////////////////////
180 | inline float StopWatchWin::getTime() {
181 |     // Return the TOTAL time to date
182 |     float retval = total_time;
183 | 
184 |     if (running) {
185 |         LARGE_INTEGER temp;
186 |         QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER *>(&temp));
187 |         retval += static_cast<float>(
188 |             ((static_cast<double>(temp.QuadPart) - static_cast<double>(start_time.QuadPart)) / freq));
189 |     }
190 | 
191 |     return retval;
192 | }
193 | 
194 | ////////////////////////////////////////////////////////////////////////////////
195 | //! Time in msec. for a single run based on the total number of COMPLETED runs
196 | //! and the total time.
197 | ////////////////////////////////////////////////////////////////////////////////
198 | inline float StopWatchWin::getAverageTime() {
199 |     return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
200 | }
201 | #else
202 | // Declarations for Stopwatch on Linux and Mac OSX
203 | // includes, system
204 | #include <sys/time.h>
205 | 
206 | #include <ctime>
207 | 
208 | //! Windows specific implementation of StopWatch
209 | class StopWatchLinux : public StopWatchInterface {
210 | public:
211 |     //! Constructor, default
212 |     StopWatchLinux() : start_time(), diff_time(0.0), total_time(0.0), running(false), clock_sessions(0) {}
213 | 
214 |     // Destructor
215 |     virtual ~StopWatchLinux() {}
216 | 
217 | public:
218 |     //! Start time measurement
219 |     inline void start();
220 | 
221 |     //! Stop time measurement
222 |     inline void stop();
223 | 
224 |     //! Reset time counters to zero
225 |     inline void reset();
226 | 
227 |     //! Time in msec. after start. If the stop watch is still running (i.e. there
228 |     //! was no call to stop()) then the elapsed time is returned, otherwise the
229 |     //! time between the last start() and stop call is returned
230 |     inline float getTime();
231 | 
232 |     //! Mean time to date based on the number of times the stopwatch has been
233 |     //! _stopped_ (ie finished sessions) and the current total time
234 |     inline float getAverageTime();
235 | 
236 | private:
237 |     // helper functions
238 | 
239 |     //! Get difference between start time and current time
240 |     inline float getDiffTime();
241 | 
242 | private:
243 |     // member variables
244 | 
245 |     //! Start of measurement
246 |     struct timeval start_time;
247 | 
248 |     //! Time difference between the last start and stop
249 |     float diff_time;
250 | 
251 |     //! TOTAL time difference between starts and stops
252 |     float total_time;
253 | 
254 |     //! flag if the stop watch is running
255 |     bool running;
256 | 
257 |     //! Number of times clock has been started
258 |     //! and stopped to allow averaging
259 |     int clock_sessions;
260 | };
261 | 
262 | // functions, inlined
263 | 
264 | ////////////////////////////////////////////////////////////////////////////////
265 | //! Start time measurement
266 | ////////////////////////////////////////////////////////////////////////////////
267 | inline void StopWatchLinux::start() {
268 |     gettimeofday(&start_time, 0);
269 |     running = true;
270 | }
271 | 
272 | ////////////////////////////////////////////////////////////////////////////////
273 | //! Stop time measurement and increment add to the current diff_time summation
274 | //! variable. Also increment the number of times this clock has been run.
275 | ////////////////////////////////////////////////////////////////////////////////
276 | inline void StopWatchLinux::stop() {
277 |     diff_time = getDiffTime();
278 |     total_time += diff_time;
279 |     running = false;
280 |     clock_sessions++;
281 | }
282 | 
283 | ////////////////////////////////////////////////////////////////////////////////
284 | //! Reset the timer to 0. Does not change the timer running state but does
285 | //! recapture this point in time as the current start time if it is running.
286 | ////////////////////////////////////////////////////////////////////////////////
287 | inline void StopWatchLinux::reset() {
288 |     diff_time = 0;
289 |     total_time = 0;
290 |     clock_sessions = 0;
291 | 
292 |     if (running) {
293 |         gettimeofday(&start_time, 0);
294 |     }
295 | }
296 | 
297 | ////////////////////////////////////////////////////////////////////////////////
298 | //! Time in msec. after start. If the stop watch is still running (i.e. there
299 | //! was no call to stop()) then the elapsed time is returned added to the
300 | //! current diff_time sum, otherwise the current summed time difference alone
301 | //! is returned.
302 | ////////////////////////////////////////////////////////////////////////////////
303 | inline float StopWatchLinux::getTime() {
304 |     // Return the TOTAL time to date
305 |     float retval = total_time;
306 | 
307 |     if (running) {
308 |         retval += getDiffTime();
309 |     }
310 | 
311 |     return retval;
312 | }
313 | 
314 | ////////////////////////////////////////////////////////////////////////////////
315 | //! Time in msec. for a single run based on the total number of COMPLETED runs
316 | //! and the total time.
317 | ////////////////////////////////////////////////////////////////////////////////
318 | inline float StopWatchLinux::getAverageTime() {
319 |     return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
320 | }
321 | ////////////////////////////////////////////////////////////////////////////////
322 | 
323 | ////////////////////////////////////////////////////////////////////////////////
324 | inline float StopWatchLinux::getDiffTime() {
325 |     struct timeval t_time;
326 |     gettimeofday(&t_time, 0);
327 | 
328 |     // time difference in milli-seconds
329 |     return static_cast<float>(1000.0 * (t_time.tv_sec - start_time.tv_sec) +
330 |                               (0.001 * (t_time.tv_usec - start_time.tv_usec)));
331 | }
332 | #endif  // WIN32
333 | 
334 | ////////////////////////////////////////////////////////////////////////////////
335 | //! Timer functionality exported
336 | 
337 | ////////////////////////////////////////////////////////////////////////////////
338 | //! Create a new timer
339 | //! @return true if a time has been created, otherwise false
340 | //! @param  name of the new timer, 0 if the creation failed
341 | ////////////////////////////////////////////////////////////////////////////////
342 | inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
343 | // printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
344 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
345 |     *timer_interface = reinterpret_cast<StopWatchInterface *>(new StopWatchWin());
346 | #else
347 |     *timer_interface = reinterpret_cast<StopWatchInterface *>(new StopWatchLinux());
348 | #endif
349 |     return (*timer_interface != NULL) ? true : false;
350 | }
351 | 
352 | ////////////////////////////////////////////////////////////////////////////////
353 | //! Delete a timer
354 | //! @return true if a time has been deleted, otherwise false
355 | //! @param  name of the timer to delete
356 | ////////////////////////////////////////////////////////////////////////////////
357 | inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
358 |     // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
359 |     if (*timer_interface) {
360 |         delete *timer_interface;
361 |         *timer_interface = NULL;
362 |     }
363 | 
364 |     return true;
365 | }
366 | 
367 | ////////////////////////////////////////////////////////////////////////////////
368 | //! Start the time with name \a name
369 | //! @param name  name of the timer to start
370 | ////////////////////////////////////////////////////////////////////////////////
371 | inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
372 |     // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
373 |     if (*timer_interface) {
374 |         (*timer_interface)->start();
375 |     }
376 | 
377 |     return true;
378 | }
379 | 
380 | ////////////////////////////////////////////////////////////////////////////////
381 | //! Stop the time with name \a name. Does not reset.
382 | //! @param name  name of the timer to stop
383 | ////////////////////////////////////////////////////////////////////////////////
384 | inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
385 |     // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
386 |     if (*timer_interface) {
387 |         (*timer_interface)->stop();
388 |     }
389 | 
390 |     return true;
391 | }
392 | 
393 | ////////////////////////////////////////////////////////////////////////////////
394 | //! Resets the timer's counter.
395 | //! @param name  name of the timer to reset.
396 | ////////////////////////////////////////////////////////////////////////////////
397 | inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
398 |     // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
399 |     if (*timer_interface) {
400 |         (*timer_interface)->reset();
401 |     }
402 | 
403 |     return true;
404 | }
405 | 
406 | ////////////////////////////////////////////////////////////////////////////////
407 | //! Return the average time for timer execution as the total time
408 | //! for the timer dividied by the number of completed (stopped) runs the timer
409 | //! has made.
410 | //! Excludes the current running time if the timer is currently running.
411 | //! @param name  name of the timer to return the time of
412 | ////////////////////////////////////////////////////////////////////////////////
413 | inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
414 |     //  printf("sdkGetAverageTimerValue called object %08x\n", (void
415 |     //  *)*timer_interface);
416 |     if (*timer_interface) {
417 |         return (*timer_interface)->getAverageTime();
418 |     } else {
419 |         return 0.0f;
420 |     }
421 | }
422 | 
423 | ////////////////////////////////////////////////////////////////////////////////
424 | //! Total execution time for the timer over all runs since the last reset
425 | //! or timer creation.
426 | //! @param name  name of the timer to obtain the value of.
427 | ////////////////////////////////////////////////////////////////////////////////
428 | inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
429 |     // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
430 |     if (*timer_interface) {
431 |         return (*timer_interface)->getTime();
432 |     } else {
433 |         return 0.0f;
434 |     }
435 | }
436 | 
437 | #endif  // COMMON_HELPER_TIMER_H_
438 | 


--------------------------------------------------------------------------------
/sample/cuda/matrix_mul.cu:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:16:00 on Sun, May 29, 2022
  4 | //
  5 | // Description: source file in cuda/samples/0_Simple/matrixMul/matrixMul.cu
  6 | 
  7 | /**
  8 |  * Matrix multiplication: C = A * B.
  9 |  * Host code.
 10 |  *
 11 |  * This sample implements matrix multiplication which makes use of shared memory
 12 |  * to ensure data reuse, the matrix multiplication is done using tiling
 13 |  * approach. It has been written for clarity of exposition to illustrate various
 14 |  * CUDA programming principles, not with the goal of providing the most
 15 |  * performant generic kernel for matrix multiplication. See also: V. Volkov and
 16 |  * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008
 17 |  * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008,
 18 |  * pp. Art. 31:1-11.
 19 |  */
 20 | 
 21 | // System includes
 22 | #include <assert.h>
 23 | #include <stdio.h>
 24 | 
 25 | // CUDA runtime
 26 | #include <cuda_runtime.h>
 27 | 
 28 | // Helper functions and utilities to work with CUDA
 29 | #include <helper_cuda.h>
 30 | #include <helper_functions.h>
 31 | 
 32 | /**
 33 |  * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 34 |  * wA is A's width and wB is B's width
 35 |  */
 36 | template <int BLOCK_SIZE>
 37 | __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
 38 |     // Block index
 39 |     int bx = blockIdx.x;
 40 |     int by = blockIdx.y;
 41 | 
 42 |     // Thread index
 43 |     int tx = threadIdx.x;
 44 |     int ty = threadIdx.y;
 45 | 
 46 |     // Index of the first sub-matrix of A processed by the block
 47 |     int aBegin = wA * BLOCK_SIZE * by;
 48 | 
 49 |     // Index of the last sub-matrix of A processed by the block
 50 |     int aEnd = aBegin + wA - 1;
 51 | 
 52 |     // Step size used to iterate through the sub-matrices of A
 53 |     int aStep = BLOCK_SIZE;
 54 | 
 55 |     // Index of the first sub-matrix of B processed by the block
 56 |     int bBegin = BLOCK_SIZE * bx;
 57 | 
 58 |     // Step size used to iterate through the sub-matrices of B
 59 |     int bStep = BLOCK_SIZE * wB;
 60 | 
 61 |     // Csub is used to store the element of the block sub-matrix
 62 |     // that is computed by the thread
 63 |     float Csub = 0;
 64 | 
 65 |     // Loop over all the sub-matrices of A and B
 66 |     // required to compute the block sub-matrix
 67 |     for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
 68 |         // Declaration of the shared memory array As used to
 69 |         // store the sub-matrix of A
 70 |         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
 71 | 
 72 |         // Declaration of the shared memory array Bs used to
 73 |         // store the sub-matrix of B
 74 |         __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
 75 | 
 76 |         // Load the matrices from device memory
 77 |         // to shared memory; each thread loads
 78 |         // one element of each matrix
 79 |         As[ty][tx] = A[a + wA * ty + tx];
 80 |         Bs[ty][tx] = B[b + wB * ty + tx];
 81 | 
 82 |         // Synchronize to make sure the matrices are loaded
 83 |         __syncthreads();
 84 | 
 85 |         // Multiply the two matrices together;
 86 |         // each thread computes one element
 87 |         // of the block sub-matrix
 88 | #pragma unroll
 89 | 
 90 |         for (int k = 0; k < BLOCK_SIZE; ++k) {
 91 |             Csub += As[ty][k] * Bs[k][tx];
 92 |         }
 93 | 
 94 |         // Synchronize to make sure that the preceding
 95 |         // computation is done before loading two new
 96 |         // sub-matrices of A and B in the next iteration
 97 |         __syncthreads();
 98 |     }
 99 | 
100 |     // Write the block sub-matrix to device memory;
101 |     // each thread writes one element
102 |     int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
103 |     C[c + wB * ty + tx] = Csub;
104 | }
105 | 
/**
 * Fill the first `size` elements of `data` with `val`.
 * Nothing is written when `size` is zero or negative.
 */
void ConstantInit(float *data, int size, float val) {
    float *cursor = data;

    for (int remaining = size; remaining > 0; --remaining) {
        *cursor++ = val;
    }
}
111 | 
/**
 * Run a simple test of matrix multiplication using CUDA.
 *
 * Fills A with 1.0 and B with 0.01, computes C = A * B on the device (one
 * warm-up launch followed by 300 timed launches), prints the measured
 * performance and compares every element of C with dimsA.x * 0.01.
 *
 * @param argc        unused, kept for interface compatibility
 * @param argv        unused, kept for interface compatibility
 * @param block_size  tile edge length; only 16 and 32 have a kernel instantiation
 * @param dimsA       dimensions of A (x = width, y = height)
 * @param dimsB       dimensions of B (x = width, y = height)
 * @return EXIT_SUCCESS if the arguments are valid and the result check passes,
 *         otherwise EXIT_FAILURE
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB) {
    (void)argc;
    (void)argv;

    // Only MatrixMulCUDA<16> and MatrixMulCUDA<32> are launched below; any other
    // value would run the 32x32 kernel with a mismatched thread block.
    if (block_size != 16 && block_size != 32) {
        fprintf(stderr, "Error: unsupported block size %d (expected 16 or 32)\n", block_size);
        return EXIT_FAILURE;
    }

    // The kernel walks dimsA.x elements along the rows of A and the columns of B
    if (dimsA.x != dimsB.y) {
        fprintf(stderr, "Error: width of A (%u) must equal height of B (%u)\n", dimsA.x, dimsB.y);
        return EXIT_FAILURE;
    }

    // The kernel has no bounds checks and the grid size is computed with an
    // integer division, so every dimension it tiles must be a whole number of tiles.
    const unsigned int tile = static_cast<unsigned int>(block_size);
    if (dimsA.x == 0 || dimsA.y == 0 || dimsB.x == 0 || dimsA.x % tile != 0 || dimsA.y % tile != 0 ||
        dimsB.x % tile != 0) {
        fprintf(stderr, "Error: matrix dimensions must be non-zero multiples of the block size %d\n", block_size);
        return EXIT_FAILURE;
    }

    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A;
    checkCudaErrors(cudaMallocHost((void **)&h_A, mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B;
    checkCudaErrors(cudaMallocHost((void **)&h_B, mem_size_B));
    cudaStream_t stream;

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Device buffers
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C;
    checkCudaErrors(cudaMallocHost((void **)&h_C, mem_size_C));
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    // Allocate device memory
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    // A kernel launch returns no status itself; pick up launch errors explicitly
    checkCudaErrors(cudaGetLastError());

    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        } else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }
    checkCudaErrors(cudaGetLastError());

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
        " WorkgroupSize= %u threads/block\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6;  // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;

        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));

    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    // The stream was created above and must be released as well
    checkCudaErrors(cudaStreamDestroy(stream));

    printf(
        "\nNOTE: The CUDA Samples are not meant for performance "
        "measurements. Results may vary when GPU Boost is enabled.\n");

    return correct ? EXIT_SUCCESS : EXIT_FAILURE;
}
256 | 
/**
 * Read one matrix dimension from the command line.
 *
 * If `flag` is present, its integer value replaces `*dim`. A value that is not
 * strictly positive is rejected with an error message and EXIT_FAILURE, because
 * the dimension is stored in an unsigned field. If `flag` is absent, `*dim` is
 * left unchanged.
 */
static void ReadDimensionFlag(int argc, const char **argv, const char *flag, unsigned int *dim) {
    if (!checkCmdLineFlag(argc, argv, flag)) {
        return;
    }

    const int value = getCmdLineArgumentInt(argc, argv, flag);
    if (value <= 0) {
        fprintf(stderr, "Error: -%s must be a positive integer (got %d)\n", flag, value);
        exit(EXIT_FAILURE);
    }

    *dim = static_cast<unsigned int>(value);
}

/**
 * Program main.
 *
 * Handles the help flags, selects the CUDA device, reads the optional matrix
 * dimensions (-wA, -hA, -wB, -hB), checks that A's width equals B's height and
 * runs MatrixMultiply. The process exit code is MatrixMultiply's return value.
 */
int main(int argc, char **argv) {
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    const char **args = (const char **)argv;

    if (checkCmdLineFlag(argc, args, "help") || checkCmdLineFlag(argc, args, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf(
            "  Note: Outer matrix dimensions of A & B matrices"
            " must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line.
    // The returned device ID is not needed afterwards.
    findCudaDevice(argc, args);

    int block_size = 32;

    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

    // Optional overrides: width/height of A, then width/height of B
    ReadDimensionFlag(argc, args, "wA", &dimsA.x);
    ReadDimensionFlag(argc, args, "hA", &dimsA.y);
    ReadDimensionFlag(argc, args, "wB", &dimsB.x);
    ReadDimensionFlag(argc, args, "hB", &dimsB.y);

    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%u != %u)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%u,%u), MatrixB(%u,%u)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

    exit(matrix_result);
}
314 | 


--------------------------------------------------------------------------------
/sample/cuda/vector_add.cu:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:15:42 on Sun, May 29, 2022
  4 | //
  5 | // Description: source file in cuda/samples/0_Simple/vectorAdd/vectorAdd.cu
  6 | 
  7 | /**
  8 |  * Vector addition: C = A + B.
  9 |  *
 10 |  * This sample is a very basic sample that implements element by element
 11 |  * vector addition. It is the same as the sample illustrating Chapter 2
 12 |  * of the programming guide with some additions like error checking.
 13 |  */
 14 | 
 15 | #include <stdio.h>
 16 | 
 17 | // For the CUDA runtime routines (prefixed with "cuda_")
 18 | #include <cuda_runtime.h>
 19 | #include <helper_cuda.h>
 20 | 
 21 | /**
 22 |  * CUDA Kernel Device code
 23 |  *
 24 |  * Computes the vector addition of A and B into C. The 3 vectors have the same
 25 |  * number of elements numElements.
 26 |  */
 27 | __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
 28 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
 29 | 
 30 |     if (i < numElements) {
 31 |         C[i] = A[i] + B[i];
 32 |     }
 33 | }
 34 | 
 35 | /**
 36 |  * Host main routine
 37 |  */
 38 | int main(void) {
 39 |     // Error code to check return values for CUDA calls
 40 |     cudaError_t err = cudaSuccess;
 41 | 
 42 |     // Print the vector length to be used, and compute its size
 43 |     int numElements = 50000;
 44 |     size_t size = numElements * sizeof(float);
 45 |     printf("[Vector addition of %d elements]\n", numElements);
 46 | 
 47 |     // Allocate the host input vector A
 48 |     float *h_A = (float *)malloc(size);
 49 | 
 50 |     // Allocate the host input vector B
 51 |     float *h_B = (float *)malloc(size);
 52 | 
 53 |     // Allocate the host output vector C
 54 |     float *h_C = (float *)malloc(size);
 55 | 
 56 |     // Verify that allocations succeeded
 57 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
 58 |         fprintf(stderr, "Failed to allocate host vectors!\n");
 59 |         exit(EXIT_FAILURE);
 60 |     }
 61 | 
 62 |     // Initialize the host input vectors
 63 |     for (int i = 0; i < numElements; ++i) {
 64 |         h_A[i] = rand() / (float)RAND_MAX;
 65 |         h_B[i] = rand() / (float)RAND_MAX;
 66 |     }
 67 | 
 68 |     // Allocate the device input vector A
 69 |     float *d_A = NULL;
 70 |     err = cudaMalloc((void **)&d_A, size);
 71 |     if (err != cudaSuccess) {
 72 |         fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
 73 |         exit(EXIT_FAILURE);
 74 |     }
 75 | 
 76 |     // Allocate the device input vector B
 77 |     float *d_B = NULL;
 78 |     err = cudaMalloc((void **)&d_B, size);
 79 |     if (err != cudaSuccess) {
 80 |         fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
 81 |         exit(EXIT_FAILURE);
 82 |     }
 83 | 
 84 |     // Allocate the device output vector C
 85 |     float *d_C = NULL;
 86 |     err = cudaMalloc((void **)&d_C, size);
 87 |     if (err != cudaSuccess) {
 88 |         fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
 89 |         exit(EXIT_FAILURE);
 90 |     }
 91 | 
 92 |     // Copy the host input vectors A and B in host memory to the device input vectors in
 93 |     // device memory
 94 |     printf("Copy input data from the host memory to the CUDA device\n");
 95 |     err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
 96 |     if (err != cudaSuccess) {
 97 |         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 98 |         exit(EXIT_FAILURE);
 99 |     }
100 | 
101 |     err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
102 |     if (err != cudaSuccess) {
103 |         fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
104 |         exit(EXIT_FAILURE);
105 |     }
106 | 
107 |     // Launch the Vector Add CUDA Kernel
108 |     int threadsPerBlock = 256;
109 |     int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
110 |     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
111 |     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
112 |     err = cudaGetLastError();
113 |     if (err != cudaSuccess) {
114 |         fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
115 |         exit(EXIT_FAILURE);
116 |     }
117 | 
118 |     // Copy the device result vector in device memory to the host result vector
119 |     // in host memory.
120 |     printf("Copy output data from the CUDA device to the host memory\n");
121 |     err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
122 |     if (err != cudaSuccess) {
123 |         fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
124 |         exit(EXIT_FAILURE);
125 |     }
126 | 
127 |     // Verify that the result vector is correct
128 |     for (int i = 0; i < numElements; ++i) {
129 |         if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
130 |             fprintf(stderr, "Result verification failed at element %d!\n", i);
131 |             exit(EXIT_FAILURE);
132 |         }
133 |     }
134 | 
135 |     printf("Test PASSED\n");
136 | 
137 |     // Free device global memory
138 |     err = cudaFree(d_A);
139 |     if (err != cudaSuccess) {
140 |         fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
141 |         exit(EXIT_FAILURE);
142 |     }
143 | 
144 |     err = cudaFree(d_B);
145 |     if (err != cudaSuccess) {
146 |         fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
147 |         exit(EXIT_FAILURE);
148 |     }
149 | 
150 |     err = cudaFree(d_C);
151 |     if (err != cudaSuccess) {
152 |         fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
153 |         exit(EXIT_FAILURE);
154 |     }
155 | 
156 |     // Free host memory
157 |     free(h_A);
158 |     free(h_B);
159 |     free(h_C);
160 | 
161 |     printf("Done\n");
162 |     return 0;
163 | }
164 | 


--------------------------------------------------------------------------------
/sample/nvml/nvml_example.c:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:34:28 on Sun, May 29, 2022
  4 | //
  5 | // Description: source file in cuda/nvml/example/example.c
  6 | 
  7 | /***************************************************************************\
  8 | |*                                                                           *|
  9 | |*      Copyright 2010-2016 NVIDIA Corporation.  All rights reserved.        *|
 10 | |*                                                                           *|
 11 | |*   NOTICE TO USER:                                                         *|
 12 | |*                                                                           *|
 13 | |*   This source code is subject to NVIDIA ownership rights under U.S.       *|
 14 | |*   and international Copyright laws.  Users and possessors of this         *|
 15 | |*   source code are hereby granted a nonexclusive, royalty-free             *|
 16 | |*   license to use this code in individual and commercial software.         *|
 17 | |*                                                                           *|
 18 | |*   NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE     *|
 19 | |*   CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR         *|
 20 | |*   IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH      *|
 21 | |*   REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF         *|
 22 | |*   MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR          *|
 23 | |*   PURPOSE. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL,            *|
 24 | |*   INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES          *|
 25 | |*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN      *|
 26 | |*   AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING     *|
 27 | |*   OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE      *|
 28 | |*   CODE.                                                                   *|
 29 | |*                                                                           *|
 30 | |*   U.S. Government End Users. This source code is a "commercial item"      *|
 31 | |*   as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting       *|
 32 | |*   of "commercial computer  software" and "commercial computer software    *|
 33 | |*   documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)   *|
 34 | |*   and is provided to the U.S. Government only as a commercial end item.   *|
 35 | |*   Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through        *|
 36 | |*   227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the       *|
 37 | |*   source code with only those rights set forth herein.                    *|
 38 | |*                                                                           *|
 39 | |*   Any use of this source code in individual and commercial software must  *|
 40 | |*   include, in the user documentation and internal comments to the code,   *|
 41 | |*   the above Disclaimer and U.S. Government End Users Notice.              *|
 42 | |*                                                                           *|
 43 | |*                                                                           *|
 44 | \***************************************************************************/
 45 | 
 46 | #include <nvml.h>
 47 | #include <stdio.h>
 48 | 
 49 | static const char *convertToComputeModeString(nvmlComputeMode_t mode) {
 50 |     switch (mode) {
 51 |         case NVML_COMPUTEMODE_DEFAULT:
 52 |             return "Default";
 53 |         case NVML_COMPUTEMODE_EXCLUSIVE_THREAD:
 54 |             return "Exclusive_Thread";
 55 |         case NVML_COMPUTEMODE_PROHIBITED:
 56 |             return "Prohibited";
 57 |         case NVML_COMPUTEMODE_EXCLUSIVE_PROCESS:
 58 |             return "Exclusive Process";
 59 |         default:
 60 |             return "Unknown";
 61 |     }
 62 | }
 63 | 
 64 | int main(void) {
 65 |     nvmlReturn_t result;
 66 |     unsigned int device_count, i;
 67 | 
 68 |     // First initialize NVML library
 69 |     result = nvmlInit();
 70 |     if (NVML_SUCCESS != result) {
 71 |         printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
 72 | 
 73 |         printf("Press ENTER to continue...\n");
 74 |         getchar();
 75 |         return 1;
 76 |     }
 77 | 
 78 |     result = nvmlDeviceGetCount(&device_count);
 79 |     if (NVML_SUCCESS != result) {
 80 |         printf("Failed to query device count: %s\n", nvmlErrorString(result));
 81 |         goto Error;
 82 |     }
 83 |     printf("Found %u device%s\n\n", device_count, device_count != 1 ? "s" : "");
 84 | 
 85 |     printf("Listing devices:\n");
 86 |     for (i = 0; i < device_count; i++) {
 87 |         nvmlDevice_t device;
 88 |         char name[NVML_DEVICE_NAME_BUFFER_SIZE];
 89 |         nvmlPciInfo_t pci;
 90 |         nvmlComputeMode_t compute_mode;
 91 | 
 92 |         // Query for device handle to perform operations on a device
 93 |         // You can also query device handle by other features like:
 94 |         // nvmlDeviceGetHandleBySerial
 95 |         // nvmlDeviceGetHandleByPciBusId
 96 |         result = nvmlDeviceGetHandleByIndex(i, &device);
 97 |         if (NVML_SUCCESS != result) {
 98 |             printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
 99 |             goto Error;
100 |         }
101 | 
102 |         result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
103 |         if (NVML_SUCCESS != result) {
104 |             printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
105 |             goto Error;
106 |         }
107 | 
108 |         // pci.busId is very useful to know which device physically you're talking to
109 |         // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
110 |         result = nvmlDeviceGetPciInfo(device, &pci);
111 |         if (NVML_SUCCESS != result) {
112 |             printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result));
113 |             goto Error;
114 |         }
115 | 
116 |         printf("%u. %s [%s]\n", i, name, pci.busId);
117 | 
118 |         // This is a simple example on how you can modify GPU's state
119 |         result = nvmlDeviceGetComputeMode(device, &compute_mode);
120 |         if (NVML_ERROR_NOT_SUPPORTED == result)
121 |             printf("\t This is not CUDA capable device\n");
122 |         else if (NVML_SUCCESS != result) {
123 |             printf("Failed to get compute mode for device %u: %s\n", i, nvmlErrorString(result));
124 |             goto Error;
125 |         } else {
126 |             // try to change compute mode
127 |             printf("\t Changing device's compute mode from '%s' to '%s'\n", convertToComputeModeString(compute_mode),
128 |                    convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED));
129 | 
130 |             result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED);
131 |             if (NVML_ERROR_NO_PERMISSION == result)
132 |                 printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result));
133 |             else if (NVML_ERROR_NOT_SUPPORTED == result)
134 |                 printf(
135 |                     "\t\t Compute mode prohibited not supported. You might be running on\n"
136 |                     "\t\t windows in WDDM driver model or on non-CUDA capable GPU\n");
137 |             else if (NVML_SUCCESS != result) {
138 |                 printf("\t\t Failed to set compute mode for device %u: %s\n", i, nvmlErrorString(result));
139 |                 goto Error;
140 |             } else {
141 |                 printf("\t Restoring device's compute mode back to '%s'\n", convertToComputeModeString(compute_mode));
142 |                 result = nvmlDeviceSetComputeMode(device, compute_mode);
143 |                 if (NVML_SUCCESS != result) {
144 |                     printf("\t\t Failed to restore compute mode for device %u: %s\n", i, nvmlErrorString(result));
145 |                     goto Error;
146 |                 }
147 |             }
148 |         }
149 |     }
150 | 
151 |     result = nvmlShutdown();
152 |     if (NVML_SUCCESS != result)
153 |         printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
154 | 
155 |     printf("All done.\n");
156 | 
157 |     return 0;
158 | 
159 | Error:
160 |     result = nvmlShutdown();
161 |     if (NVML_SUCCESS != result)
162 |         printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
163 | 
164 |     return 1;
165 | }
166 | 


--------------------------------------------------------------------------------
/sample/nvml/supported_vgpus.c:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:35:09 on Sun, May 29, 2022
  4 | //
  5 | // Description: source file in cuda/nvml/example/supportedVgpus.c
  6 | 
  7 | /***************************************************************************\
  8 | |*                                                                           *|
  9 | |*      Copyright 2010-2016 NVIDIA Corporation.  All rights reserved.        *|
 10 | |*                                                                           *|
 11 | |*   NOTICE TO USER:                                                         *|
 12 | |*                                                                           *|
 13 | |*   This source code is subject to NVIDIA ownership rights under U.S.       *|
 14 | |*   and international Copyright laws.  Users and possessors of this         *|
 15 | |*   source code are hereby granted a nonexclusive, royalty-free             *|
 16 | |*   license to use this code in individual and commercial software.         *|
 17 | |*                                                                           *|
 18 | |*   NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE     *|
 19 | |*   CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR         *|
 20 | |*   IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH      *|
 21 | |*   REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF         *|
 22 | |*   MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR          *|
 23 | |*   PURPOSE. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL,            *|
 24 | |*   INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES          *|
 25 | |*   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN      *|
 26 | |*   AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING     *|
 27 | |*   OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE      *|
 28 | |*   CODE.                                                                   *|
 29 | |*                                                                           *|
 30 | |*   U.S. Government End Users. This source code is a "commercial item"      *|
 31 | |*   as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting       *|
 32 | |*   of "commercial computer  software" and "commercial computer software    *|
 33 | |*   documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)   *|
 34 | |*   and is provided to the U.S. Government only as a commercial end item.   *|
 35 | |*   Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through        *|
 36 | |*   227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the       *|
 37 | |*   source code with only those rights set forth herein.                    *|
 38 | |*                                                                           *|
 39 | |*   Any use of this source code in individual and commercial software must  *|
 40 | |*   include, in the user documentation and internal comments to the code,   *|
 41 | |*   the above Disclaimer and U.S. Government End Users Notice.              *|
 42 | |*                                                                           *|
 43 | |*                                                                           *|
 44 | \***************************************************************************/
 45 | 
 46 | #include <nvml.h>
 47 | #include <stdio.h>
 48 | #include <stdlib.h>
 49 | 
 50 | int main(void) {
 51 |     nvmlReturn_t result;
 52 |     unsigned int device_count, i;
 53 | 
 54 |     // First initialize NVML library
 55 |     result = nvmlInit();
 56 |     if (NVML_SUCCESS != result) {
 57 |         printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
 58 |         return 1;
 59 |     }
 60 | 
 61 |     result = nvmlDeviceGetCount(&device_count);
 62 |     if (NVML_SUCCESS != result) {
 63 |         printf("Failed to query device count: %s\n", nvmlErrorString(result));
 64 |         goto Error;
 65 |     }
 66 | 
 67 |     printf("Found %u device%s\n", device_count, device_count != 1 ? "s" : "");
 68 |     printf("Listing devices:\n");
 69 | 
 70 |     for (i = 0; i < device_count; i++) {
 71 |         nvmlDevice_t device;
 72 |         char name[NVML_DEVICE_NAME_BUFFER_SIZE];
 73 |         nvmlPciInfo_t pci;
 74 | 
 75 |         // Query for device handle to perform operations on a device
 76 |         // You can also query device handle by other features like:
 77 |         // nvmlDeviceGetHandleBySerial
 78 |         // nvmlDeviceGetHandleByPciBusId
 79 |         result = nvmlDeviceGetHandleByIndex(i, &device);
 80 |         if (NVML_SUCCESS != result) {
 81 |             printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
 82 |             goto Error;
 83 |         }
 84 | 
 85 |         result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
 86 |         if (NVML_SUCCESS != result) {
 87 |             printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result));
 88 |             goto Error;
 89 |         }
 90 | 
 91 |         // pci.busId is very useful to know which device physically you're talking to
 92 |         // Using PCI identifier you can also match nvmlDevice handle to CUDA device.
 93 |         result = nvmlDeviceGetPciInfo(device, &pci);
 94 |         if (NVML_SUCCESS != result) {
 95 |             printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result));
 96 |             goto Error;
 97 |         }
 98 | 
 99 |         printf("%u. %s [%s]\n", i, name, pci.busId);
100 | 
101 |         // This is an example to get the supported vGPUs type names
102 |         unsigned int vgpuCount = 0;
103 |         nvmlVgpuTypeId_t *vgpuTypeIds = NULL;
104 |         unsigned int j;
105 | 
106 |         result = nvmlDeviceGetSupportedVgpus(device, &vgpuCount, NULL);
107 |         if (NVML_ERROR_INSUFFICIENT_SIZE != result)
108 |             goto Error;
109 | 
110 |         if (vgpuCount != 0) {
111 |             vgpuTypeIds = malloc(sizeof(nvmlVgpuTypeId_t) * vgpuCount);
112 |             if (!vgpuTypeIds) {
113 |                 printf("Memory allocation of %d bytes failed \n", (int)(sizeof(*vgpuTypeIds) * vgpuCount));
114 |                 goto Error;
115 |             }
116 | 
117 |             result = nvmlDeviceGetSupportedVgpus(device, &vgpuCount, vgpuTypeIds);
118 |             if (NVML_SUCCESS != result) {
119 |                 printf("Failed to get the supported vGPUs with status %d \n", (int)result);
120 |                 goto Error;
121 |             }
122 | 
123 |             printf("  Displaying vGPU type names: \n");
124 |             for (j = 0; j < vgpuCount; j++) {
125 |                 char vgpuTypeName[NVML_DEVICE_NAME_BUFFER_SIZE];
126 |                 unsigned int bufferSize = NVML_DEVICE_NAME_BUFFER_SIZE;
127 | 
128 |                 if (NVML_SUCCESS == (result = nvmlVgpuTypeGetName(vgpuTypeIds[j], vgpuTypeName, &bufferSize))) {
129 |                     printf("  %s\n", vgpuTypeName);
130 |                 } else {
131 |                     printf("Failed to query the vGPU type name with status %d \n", (int)result);
132 |                 }
133 |             }
134 |         }
135 |         if (vgpuTypeIds)
136 |             free(vgpuTypeIds);
137 |     }
138 | 
139 |     result = nvmlShutdown();
140 |     if (NVML_SUCCESS != result)
141 |         printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
142 | 
143 |     printf("All done.\n");
144 |     return 0;
145 | 
146 | Error:
147 |     result = nvmlShutdown();
148 |     if (NVML_SUCCESS != result)
149 |         printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result));
150 | 
151 |     return 1;
152 | }
153 | 


--------------------------------------------------------------------------------
/src/common/hook.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 15:04:22 on Sun, May 29, 2022
  4 | //
  5 | // Description: hook
  6 | 
  7 | #ifndef __CUDA_HOOK_HOOK_H__
  8 | #define __CUDA_HOOK_HOOK_H__
  9 | 
 10 | #include <dlfcn.h>
 11 | 
 12 | #include <string>
 13 | 
 14 | #include "macro_common.h"
 15 | 
 16 | #define HOOK_GET_SYMBOL(type, symbol_name)                                          \
 17 |     do {                                                                            \
 18 |         static void *type##_handle = dlopen(s_##type##_dso, RTLD_NOW | RTLD_LOCAL); \
 19 |         HOOK_CHECK(type##_handle);                                                  \
 20 |         return dlsym(type##_handle, symbol_name.c_str());                           \
 21 |     } while (0)
 22 | 
 23 | class Hook {
 24 | public:
 25 |     Hook() = default;
 26 |     ~Hook() = default;
 27 | 
 28 |     static void *GetCUDASymbol(const std::string &symbol_name) {
 29 |         HOOK_GET_SYMBOL(cuda, symbol_name);
 30 |     }
 31 | 
 32 |     static void *GetNVMLSymbol(const std::string &symbol_name) {
 33 |         HOOK_GET_SYMBOL(nvml, symbol_name);
 34 |     }
 35 | 
 36 |     static void *GetCUDARTSymbol(const std::string &symbol_name) {
 37 |         HOOK_GET_SYMBOL(cudart, symbol_name);
 38 |     }
 39 | 
 40 |     static void *GetCUDNNSymbol(const std::string &symbol_name) {
 41 |         HOOK_GET_SYMBOL(cudnn, symbol_name);
 42 |     }
 43 | 
 44 |     static void *GetCUBLASSymbol(const std::string &symbol_name) {
 45 |         HOOK_GET_SYMBOL(cublas, symbol_name);
 46 |     }
 47 | 
 48 |     static void *GetCUBLASLTSymbol(const std::string &symbol_name) {
 49 |         HOOK_GET_SYMBOL(cublasLt, symbol_name);
 50 |     }
 51 | 
 52 |     static void *GetCUFFTSymbol(const std::string &symbol_name) {
 53 |         HOOK_GET_SYMBOL(cufft, symbol_name);
 54 |     }
 55 | 
 56 |     static void *GetNVTXSymbol(const std::string &symbol_name) {
 57 |         HOOK_GET_SYMBOL(nvtx, symbol_name);
 58 |     }
 59 | 
 60 |     static void *GetNVRTCSymbol(const std::string &symbol_name) {
 61 |         HOOK_GET_SYMBOL(nvrtc, symbol_name);
 62 |     }
 63 | 
 64 |     static void *GetCURANDSymbol(const std::string &symbol_name) {
 65 |         HOOK_GET_SYMBOL(curand, symbol_name);
 66 |     }
 67 | 
 68 |     static void *GetCUSPARSESymbol(const std::string &symbol_name) {
 69 |         HOOK_GET_SYMBOL(cusparse, symbol_name);
 70 |     }
 71 | 
 72 |     static void *GetCUSOLVERSymbol(const std::string &symbol_name) {
 73 |         HOOK_GET_SYMBOL(cusolver, symbol_name);
 74 |     }
 75 | 
 76 |     static void *GetNVJPEGSymbol(const std::string &symbol_name) {
 77 |         HOOK_GET_SYMBOL(nvjpeg, symbol_name);
 78 |     }
 79 | 
 80 |     static void *GetNVBLASSymbol(const std::string &symbol_name) {
 81 |         HOOK_GET_SYMBOL(nvblas, symbol_name);
 82 |     }
 83 | 
 84 | private:
 85 |     // nvidia native cuda dynamic library can be modified to any other unambiguous name, or moved to any path
 86 |     static constexpr const char *s_cuda_dso = "/usr/lib/x86_64-linux-gnu/libcuda.so";
 87 |     static constexpr const char *s_nvml_dso = "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so";
 88 |     static constexpr const char *s_cudart_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so";
 89 |     static constexpr const char *s_cudnn_dso = "/usr/local/cudnn/lib64/libcudnn.so";
 90 |     static constexpr const char *s_cublas_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcublas.so";
 91 |     static constexpr const char *s_cublasLt_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so";
 92 |     static constexpr const char *s_cufft_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcufft.so";
 93 |     static constexpr const char *s_nvtx_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvToolsExt.so";
 94 |     static constexpr const char *s_nvrtc_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvrtc.so";
 95 |     static constexpr const char *s_curand_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcurand.so";
 96 |     static constexpr const char *s_cusparse_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcusparse.so";
 97 |     static constexpr const char *s_cusolver_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcusolver.so";
 98 |     static constexpr const char *s_nvjpeg_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg.so";
 99 |     static constexpr const char *s_nvblas_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvblas.so";
100 | 
101 |     HOOK_DISALLOW_COPY_AND_ASSIGN(Hook);
102 | };
103 | 
// Shorthand wrappers, one per library: each forwards symbol_name to the matching
// Hook::Get*Symbol accessor and evaluates to the void * that accessor returns.
#define HOOK_CUDA_SYMBOL(symbol_name) Hook::GetCUDASymbol(symbol_name)
#define HOOK_NVML_SYMBOL(symbol_name) Hook::GetNVMLSymbol(symbol_name)
#define HOOK_CUDART_SYMBOL(symbol_name) Hook::GetCUDARTSymbol(symbol_name)
#define HOOK_CUDNN_SYMBOL(symbol_name) Hook::GetCUDNNSymbol(symbol_name)
#define HOOK_CUBLAS_SYMBOL(symbol_name) Hook::GetCUBLASSymbol(symbol_name)
#define HOOK_CUBLASLT_SYMBOL(symbol_name) Hook::GetCUBLASLTSymbol(symbol_name)
#define HOOK_CUFFT_SYMBOL(symbol_name) Hook::GetCUFFTSymbol(symbol_name)
#define HOOK_NVTX_SYMBOL(symbol_name) Hook::GetNVTXSymbol(symbol_name)
#define HOOK_NVRTC_SYMBOL(symbol_name) Hook::GetNVRTCSymbol(symbol_name)
#define HOOK_CURAND_SYMBOL(symbol_name) Hook::GetCURANDSymbol(symbol_name)
#define HOOK_CUSPARSE_SYMBOL(symbol_name) Hook::GetCUSPARSESymbol(symbol_name)
#define HOOK_CUSOLVER_SYMBOL(symbol_name) Hook::GetCUSOLVERSymbol(symbol_name)
#define HOOK_NVJPEG_SYMBOL(symbol_name) Hook::GetNVJPEGSymbol(symbol_name)
#define HOOK_NVBLAS_SYMBOL(symbol_name) Hook::GetNVBLASSymbol(symbol_name)
118 | 
119 | #endif  // __CUDA_HOOK_HOOK_H__
120 | 


--------------------------------------------------------------------------------
/src/common/macro_common.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2022. All Rights Reserved.
 2 | // Author: Bruce-Lee-LY
 3 | // Date: 15:40:15 on Sun, May 29, 2022
 4 | //
 5 | // Description: common macro
 6 | 
 7 | #ifndef __CUDA_HOOK_MACRO_COMMON_H__
 8 | #define __CUDA_HOOK_MACRO_COMMON_H__
 9 | 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>
14 | 
// Prefix for declarations that need C linkage.
#define HOOK_C_API extern "C"
// Gives the annotated symbol default visibility.
#define HOOK_DECL_EXPORT __attribute__((visibility("default")))

// Branch hints: the condition is normalized to 0/1 with !! and passed to __builtin_expect
// with an expected value of 1 (LIKELY) or 0 (UNLIKELY).
#define HOOK_LIKELY(x) __builtin_expect(!!(x), 1)
#define HOOK_UNLIKELY(x) __builtin_expect(!!(x), 0)
20 | 
// Formats the current local time as "YYYY-MM-DD HH:MM:SS".
//
// Returns a pointer to a NUL-terminated, per-thread buffer that is overwritten by the next
// call on the same thread. The string is empty when the local time cannot be obtained or
// formatted.
//
// The buffer is thread_local and the conversion uses localtime_r because HLOG calls this
// function from whichever thread hits a hook; a shared static buffer together with
// localtime() would race between threads.
inline char *curr_time() {
    thread_local char now_time[64];

    const time_t raw_time = time(nullptr);
    struct tm time_info;
    size_t written = 0;

    // localtime_r reports failure with a null pointer; in that case skip strftime entirely.
    if (localtime_r(&raw_time, &time_info) != nullptr) {
        // strftime yields 0 when the result does not fit, which leaves the string empty below.
        written = strftime(now_time, sizeof(now_time), "%Y-%m-%d %H:%M:%S", &time_info);
    }
    now_time[written] = '\0';

    return now_time;
}
29 | 
// Returns the id of the calling process.
//
// The value is queried on every call instead of being cached in a static: a cached id would
// be inherited by a child created with fork(), which would then keep logging the parent's
// pid.
inline int get_pid() {
    return static_cast<int>(getpid());
}
35 | 
// Returns the kernel thread id of the calling thread.
// The SYS_gettid syscall is issued on a thread's first call; the result is kept in a
// thread_local and reused for that thread afterwards.
inline long int get_tid() {
    thread_local const long int cached_tid = syscall(SYS_gettid);
    return cached_tid;
}
41 | 
// Tag printed at the start of every log line.
#define HOOK_LOG_TAG "CUDA-HOOK"
// Evaluates to the part of path x that follows its last '/', or to x itself when x
// contains no '/'.
#define HOOK_LOG_FILE(x) (strrchr(x, '/') ? (strrchr(x, '/') + 1) : x)
// printf-style logging to stderr. Each call emits one line of the form
//   [tag time pid:tid file:line function] message
// `format` must be a string literal because it is concatenated with the prefix and the
// trailing newline; ##__VA_ARGS__ allows calls that pass no arguments after the format.
#define HLOG(format, ...)                                                                                        \
    do {                                                                                                         \
        fprintf(stderr, "[%s %s %d:%ld %s:%d %s] " format "\n", HOOK_LOG_TAG, curr_time(), get_pid(), get_tid(), \
                HOOK_LOG_FILE(__FILE__), __LINE__, __FUNCTION__, ##__VA_ARGS__);                                 \
    } while (0)
49 | 
// Fatal assertion: when x evaluates to false, logs "Check failed: <text of x>" through HLOG
// and terminates the process with exit(EXIT_FAILURE). Does nothing when x is true.
#define HOOK_CHECK(x)                     \
    do {                                  \
        if (HOOK_UNLIKELY(!(x))) {        \
            HLOG("Check failed: %s", #x); \
            exit(EXIT_FAILURE);           \
        }                                 \
    } while (0)
57 | 
// Place inside a class body to delete its copy constructor and copy assignment operator.
// The expansion already ends with a semicolon.
#define HOOK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
    TypeName(const TypeName &) = delete;        \
    void operator=(const TypeName &) = delete;
61 | 
62 | #endif  // __CUDA_HOOK_MACRO_COMMON_H__
63 | 


--------------------------------------------------------------------------------
/src/common/trace_profile.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2022. All Rights Reserved.
 2 | // Author: Bruce-Lee-LY
 3 | // Date: 14:54:28 on Sun, May 29, 2022
 4 | //
 5 | // Description: trace and profile
 6 | 
 7 | #ifndef __CUDA_HOOK_TRACE_PROFILE_H__
 8 | #define __CUDA_HOOK_TRACE_PROFILE_H__
 9 | 
10 | #include <chrono>
11 | #include <string>
12 | 
13 | #include "macro_common.h"
14 | 
15 | class TraceProfile {
16 | public:
17 |     TraceProfile(const std::string &name) : m_name(name), m_start(std::chrono::steady_clock::now()) {
18 |         HLOG("%s enter", m_name.c_str());
19 |     }
20 | 
21 |     ~TraceProfile() {
22 |         m_end = std::chrono::steady_clock::now();
23 |         m_duration = std::chrono::duration_cast<std::chrono::microseconds>(m_end - m_start);
24 |         HLOG("%s exit, taken %.3lf ms", m_name.c_str(), m_duration.count());
25 |     }
26 | 
27 | private:
28 |     const std::string m_name;
29 |     std::chrono::steady_clock::time_point m_start;
30 |     std::chrono::steady_clock::time_point m_end;
31 |     std::chrono::duration<double, std::milli> m_duration;
32 | 
33 |     HOOK_DISALLOW_COPY_AND_ASSIGN(TraceProfile);
34 | };
35 | 
// Two-step token pasting: the extra level lets arguments such as __LINE__ expand to their
// value before they are glued together.
#define HOOK_TRACE_PROFILE_CONCAT_IMPL(a, b) a##b
#define HOOK_TRACE_PROFILE_CONCAT(a, b) HOOK_TRACE_PROFILE_CONCAT_IMPL(a, b)

// HOOK_TRACE_PROFILE(name): in builds that define HOOK_BUILD_DEBUG, declares a TraceProfile
// constructed from `name` that lives until the end of the enclosing scope; otherwise expands
// to nothing.
//
// The variable name is derived from __LINE__. The previous spelling `_tp_##name_` pasted the
// literal token `name_` rather than the macro argument, so every expansion declared the same
// identifier and two uses in one scope could not compile.
#ifdef HOOK_BUILD_DEBUG
#define HOOK_TRACE_PROFILE(name) TraceProfile HOOK_TRACE_PROFILE_CONCAT(_tp_, __LINE__)(name)
#else
#define HOOK_TRACE_PROFILE(name)
#endif
41 | 
42 | #endif  // __CUDA_HOOK_TRACE_PROFILE_H__
43 | 


--------------------------------------------------------------------------------
/src/cublas/cublas_subset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: cublas subset
  6 | 
  7 | #ifndef __CUDA_HOOK_CUBLAS_SUBSET_H__
  8 | #define __CUDA_HOOK_CUBLAS_SUBSET_H__
  9 | 
 10 | #include "cudart_subset.h"
 11 | 
 12 | #ifdef __cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | typedef struct __half __half;
 17 | 
 18 | typedef enum cudaDataType_t {
 19 |     CUDA_R_16F = 2,  /* real as a half */
 20 |     CUDA_C_16F = 6,  /* complex as a pair of half numbers */
 21 |     CUDA_R_32F = 0,  /* real as a float */
 22 |     CUDA_C_32F = 4,  /* complex as a pair of float numbers */
 23 |     CUDA_R_64F = 1,  /* real as a double */
 24 |     CUDA_C_64F = 5,  /* complex as a pair of double numbers */
 25 |     CUDA_R_8I = 3,   /* real as a signed char */
 26 |     CUDA_C_8I = 7,   /* complex as a pair of signed char numbers */
 27 |     CUDA_R_8U = 8,   /* real as a unsigned char */
 28 |     CUDA_C_8U = 9,   /* complex as a pair of unsigned char numbers */
 29 |     CUDA_R_32I = 10, /* real as a signed int */
 30 |     CUDA_C_32I = 11, /* complex as a pair of signed int numbers */
 31 |     CUDA_R_32U = 12, /* real as a unsigned int */
 32 |     CUDA_C_32U = 13  /* complex as a pair of unsigned int numbers */
 33 | } cudaDataType;
 34 | 
 35 | typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;
 36 | 
 37 | struct float2 {
 38 |     float x, y;
 39 | };
 40 | 
 41 | typedef float2 cuFloatComplex;
 42 | /* aliases */
 43 | typedef cuFloatComplex cuComplex;
 44 | 
 45 | struct double2 {
 46 |     double x, y;
 47 | };
 48 | 
 49 | /* Double precision */
 50 | typedef double2 cuDoubleComplex;
 51 | 
 52 | /* CUBLAS data types */
 53 | #define cublasStatus cublasStatus_t
 54 | 
 55 | #define CUBLAS_VER_MAJOR 11
 56 | #define CUBLAS_VER_MINOR 6
 57 | #define CUBLAS_VER_PATCH 5
 58 | #define CUBLAS_VER_BUILD 2
 59 | #define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 1000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH)
 60 | 
 61 | /* CUBLAS status type returns */
 62 | typedef enum {
 63 |     CUBLAS_STATUS_SUCCESS = 0,
 64 |     CUBLAS_STATUS_NOT_INITIALIZED = 1,
 65 |     CUBLAS_STATUS_ALLOC_FAILED = 3,
 66 |     CUBLAS_STATUS_INVALID_VALUE = 7,
 67 |     CUBLAS_STATUS_ARCH_MISMATCH = 8,
 68 |     CUBLAS_STATUS_MAPPING_ERROR = 11,
 69 |     CUBLAS_STATUS_EXECUTION_FAILED = 13,
 70 |     CUBLAS_STATUS_INTERNAL_ERROR = 14,
 71 |     CUBLAS_STATUS_NOT_SUPPORTED = 15,
 72 |     CUBLAS_STATUS_LICENSE_ERROR = 16
 73 | } cublasStatus_t;
 74 | 
 75 | typedef enum { CUBLAS_FILL_MODE_LOWER = 0, CUBLAS_FILL_MODE_UPPER = 1, CUBLAS_FILL_MODE_FULL = 2 } cublasFillMode_t;
 76 | 
 77 | typedef enum { CUBLAS_DIAG_NON_UNIT = 0, CUBLAS_DIAG_UNIT = 1 } cublasDiagType_t;
 78 | 
 79 | typedef enum { CUBLAS_SIDE_LEFT = 0, CUBLAS_SIDE_RIGHT = 1 } cublasSideMode_t;
 80 | 
/* Operation selector values; CUBLAS_OP_C and CUBLAS_OP_HERMITAN share the value 2 */
typedef enum {
    CUBLAS_OP_N = 0,
    CUBLAS_OP_T = 1,
    CUBLAS_OP_C = 2,
    CUBLAS_OP_HERMITAN = 2, /* synonym of CUBLAS_OP_C */
    CUBLAS_OP_CONJG = 3     /* conjugate, placeholder - not supported in the current release */
} cublasOperation_t;
 88 | 
 89 | typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t;
 90 | 
 91 | typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t;
 92 | 
 93 | /*For different GEMM algorithm */
 94 | typedef enum {
 95 |     CUBLAS_GEMM_DFALT = -1,
 96 |     CUBLAS_GEMM_DEFAULT = -1,
 97 |     CUBLAS_GEMM_ALGO0 = 0,
 98 |     CUBLAS_GEMM_ALGO1 = 1,
 99 |     CUBLAS_GEMM_ALGO2 = 2,
100 |     CUBLAS_GEMM_ALGO3 = 3,
101 |     CUBLAS_GEMM_ALGO4 = 4,
102 |     CUBLAS_GEMM_ALGO5 = 5,
103 |     CUBLAS_GEMM_ALGO6 = 6,
104 |     CUBLAS_GEMM_ALGO7 = 7,
105 |     CUBLAS_GEMM_ALGO8 = 8,
106 |     CUBLAS_GEMM_ALGO9 = 9,
107 |     CUBLAS_GEMM_ALGO10 = 10,
108 |     CUBLAS_GEMM_ALGO11 = 11,
109 |     CUBLAS_GEMM_ALGO12 = 12,
110 |     CUBLAS_GEMM_ALGO13 = 13,
111 |     CUBLAS_GEMM_ALGO14 = 14,
112 |     CUBLAS_GEMM_ALGO15 = 15,
113 |     CUBLAS_GEMM_ALGO16 = 16,
114 |     CUBLAS_GEMM_ALGO17 = 17,
115 |     CUBLAS_GEMM_ALGO18 = 18,  // sliced 32x32
116 |     CUBLAS_GEMM_ALGO19 = 19,  // sliced 64x32
117 |     CUBLAS_GEMM_ALGO20 = 20,  // sliced 128x32
118 |     CUBLAS_GEMM_ALGO21 = 21,  // sliced 32x32  -splitK
119 |     CUBLAS_GEMM_ALGO22 = 22,  // sliced 64x32  -splitK
120 |     CUBLAS_GEMM_ALGO23 = 23,  // sliced 128x32 -splitK
121 |     CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99,
122 |     CUBLAS_GEMM_DFALT_TENSOR_OP = 99,
123 |     CUBLAS_GEMM_ALGO0_TENSOR_OP = 100,
124 |     CUBLAS_GEMM_ALGO1_TENSOR_OP = 101,
125 |     CUBLAS_GEMM_ALGO2_TENSOR_OP = 102,
126 |     CUBLAS_GEMM_ALGO3_TENSOR_OP = 103,
127 |     CUBLAS_GEMM_ALGO4_TENSOR_OP = 104,
128 |     CUBLAS_GEMM_ALGO5_TENSOR_OP = 105,
129 |     CUBLAS_GEMM_ALGO6_TENSOR_OP = 106,
130 |     CUBLAS_GEMM_ALGO7_TENSOR_OP = 107,
131 |     CUBLAS_GEMM_ALGO8_TENSOR_OP = 108,
132 |     CUBLAS_GEMM_ALGO9_TENSOR_OP = 109,
133 |     CUBLAS_GEMM_ALGO10_TENSOR_OP = 110,
134 |     CUBLAS_GEMM_ALGO11_TENSOR_OP = 111,
135 |     CUBLAS_GEMM_ALGO12_TENSOR_OP = 112,
136 |     CUBLAS_GEMM_ALGO13_TENSOR_OP = 113,
137 |     CUBLAS_GEMM_ALGO14_TENSOR_OP = 114,
138 |     CUBLAS_GEMM_ALGO15_TENSOR_OP = 115
139 | } cublasGemmAlgo_t;
140 | 
/* Enum for default math mode/tensor operation */
typedef enum {
    CUBLAS_DEFAULT_MATH = 0,

    /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */
    CUBLAS_TENSOR_OP_MATH = 1,

    /* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with
       cudaDataType as compute type */
    CUBLAS_PEDANTIC_MATH = 2,

    /* allow accelerating single precision routines using TF32 tensor cores */
    CUBLAS_TF32_TENSOR_OP_MATH = 3,

    /* flag to force any reductions to use the accumulator type and not the output type in case of mixed precision
       routines with lower size output type */
    CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16,
} cublasMath_t;
159 | 
160 | /* For backward compatibility purposes */
161 | typedef cudaDataType cublasDataType_t;
162 | 
163 | /* Enum for compute type
164 |  * Enumerators are grouped by data type rather than sorted by value (e.g. 64F = 70 follows 32F_FAST_TF32 = 77).
165 |  * - default types provide best available performance using all available hardware features
166 |  *   and guarantee internal storage precision with at least the same precision and range;
167 |  * - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format;
168 |  * - _FAST types allow for some loss of precision to enable higher throughput arithmetic.
169 |  */
170 | typedef enum {
171 |     CUBLAS_COMPUTE_16F = 64,           /* half - default */
172 |     CUBLAS_COMPUTE_16F_PEDANTIC = 65,  /* half - pedantic */
173 |     CUBLAS_COMPUTE_32F = 68,           /* float - default */
174 |     CUBLAS_COMPUTE_32F_PEDANTIC = 69,  /* float - pedantic */
175 |     CUBLAS_COMPUTE_32F_FAST_16F = 74,  /* float - fast, allows down-converting inputs to half or TF32 */
176 |     CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */
177 |     CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */
178 |     CUBLAS_COMPUTE_64F = 70,           /* double - default */
179 |     CUBLAS_COMPUTE_64F_PEDANTIC = 71,  /* double - pedantic */
180 |     CUBLAS_COMPUTE_32I = 72,           /* signed 32-bit int - default */
181 |     CUBLAS_COMPUTE_32I_PEDANTIC = 73,  /* signed 32-bit int - pedantic */
182 | } cublasComputeType_t;
183 | 
184 | /* Opaque structure holding CUBLAS library context; only forward-declared here and used via cublasHandle_t */
185 | struct cublasContext;
186 | typedef struct cublasContext *cublasHandle_t;
187 | 
188 | /* Cublas logging: pointer to a callback that takes the message text and returns nothing */
189 | typedef void (*cublasLogCallback)(const char *msg);
190 | 
191 | struct cublasXtContext;  /* forward declaration only; the layout is not defined in this header */
192 | typedef struct cublasXtContext *cublasXtHandle_t;  /* handle = pointer to the opaque cublasXt context */
193 | /* Two-state switch for cublasXt pinned-memory mode (disabled = 0, enabled = 1) */
194 | typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
195 | 
196 | /* This is to provide CPU BLAS routines, used for too-small sizes or hybrid computation */
197 | typedef enum {
198 |     CUBLASXT_FLOAT = 0,
199 |     CUBLASXT_DOUBLE = 1,
200 |     CUBLASXT_COMPLEX = 2,
201 |     CUBLASXT_DOUBLECOMPLEX = 3,
202 | } cublasXtOpType_t;
203 | 
204 | typedef enum { /* identifiers for cublasXt BLAS routines; values are explicit and contiguous */
205 |     CUBLASXT_GEMM = 0,
206 |     CUBLASXT_SYRK = 1,
207 |     CUBLASXT_HERK = 2,
208 |     CUBLASXT_SYMM = 3,
209 |     CUBLASXT_HEMM = 4,
210 |     CUBLASXT_TRSM = 5,
211 |     CUBLASXT_SYR2K = 6,
212 |     CUBLASXT_HER2K = 7,
213 | 
214 |     CUBLASXT_SPMM = 8,
215 |     CUBLASXT_SYRKX = 9,
216 |     CUBLASXT_HERKX = 10,
217 |     CUBLASXT_TRMM = 11,
218 |     CUBLASXT_ROUTINE_MAX = 12, /* one past the last routine id above (0..11) */
219 | } cublasXtBlasOp_t;
220 | 
221 | #ifdef __cplusplus
222 | }
223 | #endif
224 | 
225 | #endif  // __CUDA_HOOK_CUBLAS_SUBSET_H__
226 | 


--------------------------------------------------------------------------------
/src/cufft/cufft_subset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: cufft subset
  6 | 
  7 | #ifndef __CUDA_HOOK_CUFFT_SUBSET_H__
  8 | #define __CUDA_HOOK_CUFFT_SUBSET_H__
  9 | 
 10 | #include "cublas_subset.h"
 11 | 
 12 | #ifdef __cplusplus
 13 | extern "C" {
 14 | #endif
 15 | 
 16 | #define CUFFT_VER_MAJOR 10
 17 | #define CUFFT_VER_MINOR 5
 18 | #define CUFFT_VER_PATCH 2
 19 | #define CUFFT_VER_BUILD 100
 20 | 
 21 | // cuFFT library version
 22 | // Decoding (integer arithmetic):
 23 | // CUFFT_VERSION / 1000 - major version
 24 | // CUFFT_VERSION / 100 % 100 - minor version
 25 | // CUFFT_VERSION % 100 - patch level
 26 | #define CUFFT_VERSION 10502  // literal; equals MAJOR * 1000 + MINOR * 100 + PATCH for the values above
 27 | 
 28 | // CUFFT API function return values (status codes numbered contiguously from 0x0 to 0x10)
 29 | typedef enum cufftResult_t {
 30 |     CUFFT_SUCCESS = 0x0,
 31 |     CUFFT_INVALID_PLAN = 0x1,
 32 |     CUFFT_ALLOC_FAILED = 0x2,
 33 |     CUFFT_INVALID_TYPE = 0x3,
 34 |     CUFFT_INVALID_VALUE = 0x4,
 35 |     CUFFT_INTERNAL_ERROR = 0x5,
 36 |     CUFFT_EXEC_FAILED = 0x6,
 37 |     CUFFT_SETUP_FAILED = 0x7,
 38 |     CUFFT_INVALID_SIZE = 0x8,
 39 |     CUFFT_UNALIGNED_DATA = 0x9,
 40 |     CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
 41 |     CUFFT_INVALID_DEVICE = 0xB,
 42 |     CUFFT_PARSE_ERROR = 0xC,
 43 |     CUFFT_NO_WORKSPACE = 0xD,
 44 |     CUFFT_NOT_IMPLEMENTED = 0xE,
 45 |     CUFFT_LICENSE_ERROR = 0x0F,
 46 |     CUFFT_NOT_SUPPORTED = 0x10
 47 | 
 48 | } cufftResult;
 49 | 
 50 | #define MAX_CUFFT_ERROR 0x11  // one past the largest cufftResult value above (0x10)
 51 | 
 52 | // CUFFT defines and supports the following data types
 53 | 
 54 | // cufftReal is a single-precision, floating-point real data type (float).
 55 | // cufftDoubleReal is a double-precision, real data type (double).
 56 | typedef float cufftReal;
 57 | typedef double cufftDoubleReal;
 58 | 
 59 | // cufftComplex is a single-precision, floating-point complex data type that
 60 | // consists of interleaved real and imaginary components (alias of cuComplex).
 61 | // cufftDoubleComplex is the double-precision equivalent (alias of cuDoubleComplex).
 62 | typedef cuComplex cufftComplex;
 63 | typedef cuDoubleComplex cufftDoubleComplex;
 64 | 
 65 | // CUFFT transform directions (plain integer macros, not an enum)
 66 | #define CUFFT_FORWARD -1  // Forward FFT
 67 | #define CUFFT_INVERSE 1   // Inverse FFT
 68 | 
 69 | // CUFFT supports the following transform types (explicit, non-contiguous codes)
 70 | typedef enum cufftType_t {
 71 |     CUFFT_R2C = 0x2a,  // Real to Complex (interleaved)
 72 |     CUFFT_C2R = 0x2c,  // Complex (interleaved) to Real
 73 |     CUFFT_C2C = 0x29,  // Complex to Complex, interleaved
 74 |     CUFFT_D2Z = 0x6a,  // Double to Double-Complex
 75 |     CUFFT_Z2D = 0x6c,  // Double-Complex to Double
 76 |     CUFFT_Z2Z = 0x69   // Double-Complex to Double-Complex
 77 | } cufftType;
 78 | 
 79 | // CUFFT supports the following data layouts (this subset defines a single one)
 80 | typedef enum cufftCompatibility_t {
 81 |     CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01  // The default value
 82 | } cufftCompatibility;
 83 | 
 84 | #define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING  // alias for the enumerator above
 85 | 
 86 | //
 87 | // structure definition used by the shim between old and new APIs
 88 | //
 89 | #define MAX_SHIM_RANK 3  // only this constant is kept; no shim structure is defined in this subset header
 90 | 
 91 | // cufftHandle is a handle type used to store and access CUFFT plans (a plain int, not a pointer).
 92 | typedef int cufftHandle;
 93 | 
 94 | //
 95 | // cufftXtSubFormat identifies the data layout of
 96 | // a memory descriptor owned by cufft.
 97 | // note that multi GPU cufft does not yet support out-of-place transforms
 98 | //
 99 | 
100 | typedef enum cufftXtSubFormat_t {
101 |     CUFFT_XT_FORMAT_INPUT = 0x00,              // by default input is in linear order across GPUs
102 |     CUFFT_XT_FORMAT_OUTPUT = 0x01,             // by default output is in scrambled order depending on transform
103 |     CUFFT_XT_FORMAT_INPLACE = 0x02,            // by default inplace is input order, which is linear across GPUs
104 |     CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03,   // shuffled output order after execution of the transform
105 |     CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04,  // shuffled input order prior to execution of 1D transforms
106 |     CUFFT_FORMAT_UNDEFINED = 0x05              // note: named without the XT_ prefix used by the entries above
107 | } cufftXtSubFormat;
108 | 
109 | //
110 | // cufftXtCopyType specifies the type of copy (transfer direction) for cufftXtMemcpy
111 | //
112 | typedef enum cufftXtCopyType_t {
113 |     CUFFT_COPY_HOST_TO_DEVICE = 0x00,
114 |     CUFFT_COPY_DEVICE_TO_HOST = 0x01,
115 |     CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
116 |     CUFFT_COPY_UNDEFINED = 0x03
117 | } cufftXtCopyType;
118 | 
119 | //
120 | // cufftXtQueryType specifies the type of query for cufftXtQueryPlan; 1D factors is the only concrete kind here
121 | //
122 | typedef enum cufftXtQueryType_t { CUFFT_QUERY_1D_FACTORS = 0x00, CUFFT_QUERY_UNDEFINED = 0x01 } cufftXtQueryType;
123 | 
124 | typedef struct cufftXt1dFactors_t {  // plain-data record: ten long long fields, then four int shift fields
125 |     long long int size;
126 |     long long int stringCount;
127 |     long long int stringLength;
128 |     long long int substringLength;
129 |     long long int factor1;
130 |     long long int factor2;
131 |     long long int stringMask;
132 |     long long int substringMask;
133 |     long long int factor1Mask;
134 |     long long int factor2Mask;
135 |     int stringShift;
136 |     int substringShift;
137 |     int factor1Shift;
138 |     int factor2Shift;
139 | } cufftXt1dFactors;  // presumably filled by a CUFFT_QUERY_1D_FACTORS query - TODO confirm
140 | 
141 | //
142 | // cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy (explicit values 0..2)
143 | //
144 | typedef enum cufftXtWorkAreaPolicy_t {
145 |     CUFFT_WORKAREA_MINIMAL = 0,     /* maximum reduction */
146 |     CUFFT_WORKAREA_USER = 1,        /* use workSize parameter as limit */
147 |     CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
148 | } cufftXtWorkAreaPolicy;
149 | 
150 | // callbacks: presumably LD = load and ST = store, matching the cufftCallbackLoad*/Store* typedefs - confirm
151 | 
152 | typedef enum cufftXtCallbackType_t {
153 |     CUFFT_CB_LD_COMPLEX = 0x0,
154 |     CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
155 |     CUFFT_CB_LD_REAL = 0x2,
156 |     CUFFT_CB_LD_REAL_DOUBLE = 0x3,
157 |     CUFFT_CB_ST_COMPLEX = 0x4,
158 |     CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
159 |     CUFFT_CB_ST_REAL = 0x6,
160 |     CUFFT_CB_ST_REAL_DOUBLE = 0x7,
161 |     CUFFT_CB_UNDEFINED = 0x8
162 | 
163 | } cufftXtCallbackType;
164 | // Load callback types: each returns one element; the C/Z/R/D suffix matches the return type below.
165 | typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
166 | typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
167 | typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
168 | typedef cufftDoubleReal (*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
169 | // Store callback types: take the element to store as an extra parameter and return void.
170 | typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo,
171 |                                     void *sharedPointer);
172 | typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo,
173 |                                     void *sharedPointer);
174 | typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo,
175 |                                     void *sharedPointer);
176 | typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo,
177 |                                     void *sharedPointer);
178 | 
179 | #define CUDA_XT_DESCRIPTOR_VERSION 0x01000000  // This is added to CUDART_VERSION
180 | // Copy directions; no explicit values are given, so the enumerators are 0, 1, 2 in declaration order.
181 | enum cudaXtCopyType_t { LIB_XT_COPY_HOST_TO_DEVICE, LIB_XT_COPY_DEVICE_TO_HOST, LIB_XT_COPY_DEVICE_TO_DEVICE };
182 | typedef enum cudaXtCopyType_t cudaLibXtCopyType;
183 | // Identifies which library recognizes a descriptor's format (used by cudaLibXtDesc_t::library).
184 | enum libFormat_t { LIB_FORMAT_CUFFT = 0x0, LIB_FORMAT_UNDEFINED = 0x1 };
185 | 
186 | typedef enum libFormat_t libFormat;
187 | 
188 | #define MAX_CUDA_DESCRIPTOR_GPUS 64  // capacity of the per-GPU arrays in cudaXtDesc_t
189 | 
190 | struct cudaXtDesc_t {
191 |     int version;                            // descriptor version
192 |     int nGPUs;                              // number of GPUs
193 |     int GPUs[MAX_CUDA_DESCRIPTOR_GPUS];     // array of device IDs
194 |     void *data[MAX_CUDA_DESCRIPTOR_GPUS];   // array of pointers to data, one per GPU
195 |     size_t size[MAX_CUDA_DESCRIPTOR_GPUS];  // array of data sizes, one per GPU
196 |     void *cudaXtState;                      // opaque CUDA utility structure
197 | };
198 | typedef struct cudaXtDesc_t cudaXtDesc;
199 | 
200 | struct cudaLibXtDesc_t {  // pairs a cudaXtDesc pointer with library-specific format information
201 |     int version;             // descriptor version
202 |     cudaXtDesc *descriptor;  // multi-GPU memory descriptor
203 |     libFormat library;       // which library recognizes the format
204 |     int subFormat;           // library specific enumerator of sub formats
205 |     void *libDescriptor;     // library specific descriptor e.g. FFT transform plan object
206 | };
207 | typedef struct cudaLibXtDesc_t cudaLibXtDesc;
208 | 
209 | #ifdef __cplusplus
210 | }
211 | #endif
212 | 
213 | #endif  // __CUDA_HOOK_CUFFT_SUBSET_H__
214 | 


--------------------------------------------------------------------------------
/src/curand/curand_hook.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: auto generate 29 apis
  6 | 
  7 | #include "cublas_subset.h"
  8 | #include "curand_subset.h"
  9 | #include "hook.h"
 10 | #include "macro_common.h"
 11 | #include "trace_profile.h"
 12 | 
 13 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandCreateGenerator(curandGenerator_t *generator,
 14 |                                                                  curandRngType_t rng_type) {
 15 |     HOOK_TRACE_PROFILE("curandCreateGenerator");  // API name handed to the trace/profile macro
 16 |     typedef curandStatus_t (*fn_t)(curandGenerator_t *, curandRngType_t);
 17 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandCreateGenerator"));
 18 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 19 |     return func_entry(generator, rng_type);  // forward the caller's arguments unchanged
 20 | }
 21 | 
 22 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandCreateGeneratorHost(curandGenerator_t *generator,
 23 |                                                                      curandRngType_t rng_type) {
 24 |     HOOK_TRACE_PROFILE("curandCreateGeneratorHost");  // API name handed to the trace/profile macro
 25 |     typedef curandStatus_t (*fn_t)(curandGenerator_t *, curandRngType_t);
 26 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandCreateGeneratorHost"));
 27 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 28 |     return func_entry(generator, rng_type);  // forward the caller's arguments unchanged
 29 | }
 30 | 
 31 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandDestroyGenerator(curandGenerator_t generator) {
 32 |     HOOK_TRACE_PROFILE("curandDestroyGenerator");  // API name handed to the trace/profile macro
 33 |     typedef curandStatus_t (*fn_t)(curandGenerator_t);
 34 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandDestroyGenerator"));
 35 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 36 |     return func_entry(generator);  // forward the caller's argument unchanged
 37 | }
 38 | 
 39 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetVersion(int *version) {
 40 |     HOOK_TRACE_PROFILE("curandGetVersion");  // API name handed to the trace/profile macro
 41 |     typedef curandStatus_t (*fn_t)(int *);
 42 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGetVersion"));
 43 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 44 |     return func_entry(version);  // forward the caller's argument unchanged
 45 | }
 46 | 
 47 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetProperty(libraryPropertyType type, int *value) {
 48 |     HOOK_TRACE_PROFILE("curandGetProperty");  // API name handed to the trace/profile macro
 49 |     typedef curandStatus_t (*fn_t)(libraryPropertyType, int *);
 50 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGetProperty"));
 51 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 52 |     return func_entry(type, value);  // forward the caller's arguments unchanged
 53 | }
 54 | 
 55 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetStream(curandGenerator_t generator, cudaStream_t stream) {
 56 |     HOOK_TRACE_PROFILE("curandSetStream");  // API name handed to the trace/profile macro
 57 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, cudaStream_t);
 58 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandSetStream"));
 59 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 60 |     return func_entry(generator, stream);  // forward the caller's arguments unchanged
 61 | }
 62 | 
 63 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetPseudoRandomGeneratorSeed(curandGenerator_t generator,
 64 |                                                                               unsigned long long seed) {
 65 |     HOOK_TRACE_PROFILE("curandSetPseudoRandomGeneratorSeed");  // API name handed to the trace/profile macro
 66 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned long long);
 67 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandSetPseudoRandomGeneratorSeed"));
 68 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 69 |     return func_entry(generator, seed);  // forward the caller's arguments unchanged
 70 | }
 71 | 
 72 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetGeneratorOffset(curandGenerator_t generator,
 73 |                                                                     unsigned long long offset) {
 74 |     HOOK_TRACE_PROFILE("curandSetGeneratorOffset");  // API name handed to the trace/profile macro
 75 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned long long);
 76 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandSetGeneratorOffset"));
 77 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 78 |     return func_entry(generator, offset);  // forward the caller's arguments unchanged
 79 | }
 80 | 
 81 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetGeneratorOrdering(curandGenerator_t generator,
 82 |                                                                       curandOrdering_t order) {
 83 |     HOOK_TRACE_PROFILE("curandSetGeneratorOrdering");  // API name handed to the trace/profile macro
 84 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, curandOrdering_t);
 85 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandSetGeneratorOrdering"));
 86 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 87 |     return func_entry(generator, order);  // forward the caller's arguments unchanged
 88 | }
 89 | 
 90 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetQuasiRandomGeneratorDimensions(curandGenerator_t generator,
 91 |                                                                                    unsigned int num_dimensions) {
 92 |     HOOK_TRACE_PROFILE("curandSetQuasiRandomGeneratorDimensions");  // API name handed to the trace/profile macro
 93 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned int);
 94 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandSetQuasiRandomGeneratorDimensions"));
 95 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
 96 |     return func_entry(generator, num_dimensions);  // forward the caller's arguments unchanged
 97 | }
 98 | 
 99 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerate(curandGenerator_t generator, unsigned int *outputPtr,
100 |                                                           size_t num) {
101 |     HOOK_TRACE_PROFILE("curandGenerate");  // API name handed to the trace/profile macro
102 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned int *, size_t);
103 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerate"));
104 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
105 |     return func_entry(generator, outputPtr, num);  // forward the caller's arguments unchanged
106 | }
107 | 
108 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateLongLong(curandGenerator_t generator,
109 |                                                                   unsigned long long *outputPtr, size_t num) {
110 |     HOOK_TRACE_PROFILE("curandGenerateLongLong");  // API name handed to the trace/profile macro
111 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned long long *, size_t);
112 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateLongLong"));
113 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
114 |     return func_entry(generator, outputPtr, num);  // forward the caller's arguments unchanged
115 | }
116 | 
117 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateUniform(curandGenerator_t generator, float *outputPtr,
118 |                                                                  size_t num) {
119 |     HOOK_TRACE_PROFILE("curandGenerateUniform");  // API name handed to the trace/profile macro
120 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, float *, size_t);
121 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateUniform"));
122 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
123 |     return func_entry(generator, outputPtr, num);  // forward the caller's arguments unchanged
124 | }
125 | 
126 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateUniformDouble(curandGenerator_t generator, double *outputPtr,
127 |                                                                        size_t num) {
128 |     HOOK_TRACE_PROFILE("curandGenerateUniformDouble");  // API name handed to the trace/profile macro
129 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, double *, size_t);
130 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateUniformDouble"));
131 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
132 |     return func_entry(generator, outputPtr, num);  // forward the caller's arguments unchanged
133 | }
134 | 
135 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateNormal(curandGenerator_t generator, float *outputPtr, size_t n,
136 |                                                                 float mean, float stddev) {
137 |     HOOK_TRACE_PROFILE("curandGenerateNormal");  // API name handed to the trace/profile macro
138 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, float *, size_t, float, float);
139 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateNormal"));
140 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
141 |     return func_entry(generator, outputPtr, n, mean, stddev);  // forward the caller's arguments unchanged
142 | }
143 | 
144 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateNormalDouble(curandGenerator_t generator, double *outputPtr,
145 |                                                                       size_t n, double mean, double stddev) {
146 |     HOOK_TRACE_PROFILE("curandGenerateNormalDouble");  // API name handed to the trace/profile macro
147 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, double *, size_t, double, double);
148 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateNormalDouble"));
149 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
150 |     return func_entry(generator, outputPtr, n, mean, stddev);  // forward the caller's arguments unchanged
151 | }
152 | 
153 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateLogNormal(curandGenerator_t generator, float *outputPtr,
154 |                                                                    size_t n, float mean, float stddev) {
155 |     HOOK_TRACE_PROFILE("curandGenerateLogNormal");  // API name handed to the trace/profile macro
156 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, float *, size_t, float, float);
157 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateLogNormal"));
158 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
159 |     return func_entry(generator, outputPtr, n, mean, stddev);  // forward the caller's arguments unchanged
160 | }
161 | 
162 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
163 |                                                                          size_t n, double mean, double stddev) {
164 |     HOOK_TRACE_PROFILE("curandGenerateLogNormalDouble");  // API name handed to the trace/profile macro
165 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, double *, size_t, double, double);
166 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateLogNormalDouble"));
167 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
168 |     return func_entry(generator, outputPtr, n, mean, stddev);  // forward the caller's arguments unchanged
169 | }
170 | 
171 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t
172 |     curandCreatePoissonDistribution(double lambda, curandDiscreteDistribution_t *discrete_distribution) {
173 |     HOOK_TRACE_PROFILE("curandCreatePoissonDistribution");  // API name handed to the trace/profile macro
174 |     typedef curandStatus_t (*fn_t)(double, curandDiscreteDistribution_t *);
175 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandCreatePoissonDistribution"));
176 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
177 |     return func_entry(lambda, discrete_distribution);  // forward the caller's arguments unchanged
178 | }
179 | 
180 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t
181 |     curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) {
182 |     HOOK_TRACE_PROFILE("curandDestroyDistribution");  // API name handed to the trace/profile macro
183 |     typedef curandStatus_t (*fn_t)(curandDiscreteDistribution_t);
184 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandDestroyDistribution"));
185 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
186 |     return func_entry(discrete_distribution);  // forward the caller's argument unchanged
187 | }
188 | 
189 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGeneratePoisson(curandGenerator_t generator, unsigned int *outputPtr,
190 |                                                                  size_t n, double lambda) {
191 |     HOOK_TRACE_PROFILE("curandGeneratePoisson");  // API name handed to the trace/profile macro
192 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned int *, size_t, double);
193 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGeneratePoisson"));
194 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
195 |     return func_entry(generator, outputPtr, n, lambda);  // forward the caller's arguments unchanged
196 | }
197 | 
198 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGeneratePoissonMethod(curandGenerator_t generator,
199 |                                                                        unsigned int *outputPtr, size_t n, double lambda,
200 |                                                                        curandMethod_t method) {
201 |     HOOK_TRACE_PROFILE("curandGeneratePoissonMethod");  // API name handed to the trace/profile macro
202 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned int *, size_t, double, curandMethod_t);
203 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGeneratePoissonMethod"));
204 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
205 |     return func_entry(generator, outputPtr, n, lambda, method);  // forward the caller's arguments unchanged
206 | }
207 | 
208 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateBinomial(curandGenerator_t generator, unsigned int *outputPtr,
209 |                                                                   size_t num, unsigned int n, double p) {
210 |     HOOK_TRACE_PROFILE("curandGenerateBinomial");  // API name handed to the trace/profile macro
211 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned int *, size_t, unsigned int, double);
212 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateBinomial"));
213 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
214 |     return func_entry(generator, outputPtr, num, n, p);  // forward the caller's arguments unchanged
215 | }
216 | 
217 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateBinomialMethod(curandGenerator_t generator,
218 |                                                                         unsigned int *outputPtr, size_t num,
219 |                                                                         unsigned int n, double p,
220 |                                                                         curandMethod_t method) {
221 |     HOOK_TRACE_PROFILE("curandGenerateBinomialMethod");  // API name handed to the trace/profile macro
222 |     typedef curandStatus_t (*fn_t)(curandGenerator_t, unsigned int *, size_t, unsigned int, double,
223 |                                    curandMethod_t);
224 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateBinomialMethod"));
225 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
226 |     return func_entry(generator, outputPtr, num, n, p, method);  // forward the caller's arguments unchanged
227 | }
228 | 
229 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateSeeds(curandGenerator_t generator) {
230 |     HOOK_TRACE_PROFILE("curandGenerateSeeds");  // API name handed to the trace/profile macro
231 |     typedef curandStatus_t (*fn_t)(curandGenerator_t);
232 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGenerateSeeds"));
233 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
234 |     return func_entry(generator);  // forward the caller's argument unchanged
235 | }
236 | 
237 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetDirectionVectors32(curandDirectionVectors32_t *vectors,
238 |                                                                        curandDirectionVectorSet_t set) {
239 |     HOOK_TRACE_PROFILE("curandGetDirectionVectors32");  // API name handed to the trace/profile macro
240 |     typedef curandStatus_t (*fn_t)(curandDirectionVectors32_t *, curandDirectionVectorSet_t);
241 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGetDirectionVectors32"));
242 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
243 |     return func_entry(vectors, set);  // forward the caller's arguments unchanged
244 | }
245 | 
246 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetScrambleConstants32(unsigned int **constants) {
247 |     HOOK_TRACE_PROFILE("curandGetScrambleConstants32");  // API name handed to the trace/profile macro
248 |     typedef curandStatus_t (*fn_t)(unsigned int **);
249 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGetScrambleConstants32"));
250 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
251 |     return func_entry(constants);  // forward the caller's argument unchanged
252 | }
253 | 
254 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetDirectionVectors64(curandDirectionVectors64_t *vectors,
255 |                                                                        curandDirectionVectorSet_t set) {
256 |     HOOK_TRACE_PROFILE("curandGetDirectionVectors64");  // API name handed to the trace/profile macro
257 |     typedef curandStatus_t (*fn_t)(curandDirectionVectors64_t *, curandDirectionVectorSet_t);
258 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGetDirectionVectors64"));
259 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
260 |     return func_entry(vectors, set);  // forward the caller's arguments unchanged
261 | }
262 | 
263 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetScrambleConstants64(unsigned long long **constants) {
264 |     HOOK_TRACE_PROFILE("curandGetScrambleConstants64");  // API name handed to the trace/profile macro
265 |     typedef curandStatus_t (*fn_t)(unsigned long long **);
266 |     static fn_t func_entry = reinterpret_cast<fn_t>(HOOK_CURAND_SYMBOL("curandGetScrambleConstants64"));
267 |     HOOK_CHECK(func_entry);  // runs on every call; the static initializer above runs only once
268 |     return func_entry(constants);  // forward the caller's argument unchanged
269 | }
270 | 


--------------------------------------------------------------------------------
/src/curand/curand_subset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: curand subset
  6 | 
  7 | #ifndef __CUDA_HOOK_CURAND_SUBSET_H__
  8 | #define __CUDA_HOOK_CURAND_SUBSET_H__
  9 | 
 10 | #ifdef __cplusplus
 11 | extern "C" {
 12 | #endif
 13 | 
 14 | #define CURAND_VER_MAJOR 10
 15 | #define CURAND_VER_MINOR 2
 16 | #define CURAND_VER_PATCH 5
 17 | #define CURAND_VER_BUILD 120  // not part of the CURAND_VERSION formula below
 18 | #define CURAND_VERSION (CURAND_VER_MAJOR * 1000 + CURAND_VER_MINOR * 100 + CURAND_VER_PATCH)  // == 10205 here
 19 | /* CURAND Host API datatypes */
 20 | 
 21 | /**
 22 |  * @{
 23 |  */
 24 | 
 25 | /**
 26 |  * CURAND function call status types (0 = success, 1xx = usage errors, 2xx = runtime failures, 999 = internal)
 27 |  */
 28 | enum curandStatus {
 29 |     CURAND_STATUS_SUCCESS = 0,                      ///< No errors
 30 |     CURAND_STATUS_VERSION_MISMATCH = 100,           ///< Header file and linked library version do not match
 31 |     CURAND_STATUS_NOT_INITIALIZED = 101,            ///< Generator not initialized
 32 |     CURAND_STATUS_ALLOCATION_FAILED = 102,          ///< Memory allocation failed
 33 |     CURAND_STATUS_TYPE_ERROR = 103,                 ///< Generator is wrong type
 34 |     CURAND_STATUS_OUT_OF_RANGE = 104,               ///< Argument out of range
 35 |     CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105,        ///< Length requested is not a multiple of dimension
 36 |     CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106,  ///< GPU does not have double precision required by MRG32k3a
 37 |     CURAND_STATUS_LAUNCH_FAILURE = 201,             ///< Kernel launch failure
 38 |     CURAND_STATUS_PREEXISTING_FAILURE = 202,        ///< Preexisting failure on library entry
 39 |     CURAND_STATUS_INITIALIZATION_FAILED = 203,      ///< Initialization of CUDA failed
 40 |     CURAND_STATUS_ARCH_MISMATCH = 204,              ///< Architecture mismatch, GPU does not support requested feature
 41 |     CURAND_STATUS_INTERNAL_ERROR = 999              ///< Internal library error
 42 | };
 43 | 
 44 | /*
 45 |  * CURAND function call status types (typedef so the enum can be used without the enum keyword)
 46 |  */
 47 | /** \cond UNHIDE_TYPEDEFS */
 48 | typedef enum curandStatus curandStatus_t;
 49 | /** \endcond */
 50 | 
 51 | /**
 52 |  * CURAND generator types (pseudorandom ids are in the 100s, quasirandom ids are in the 200s)
 53 |  */
 54 | enum curandRngType {
 55 |     CURAND_RNG_TEST = 0,
 56 |     CURAND_RNG_PSEUDO_DEFAULT = 100,           ///< Default pseudorandom generator
 57 |     CURAND_RNG_PSEUDO_XORWOW = 101,            ///< XORWOW pseudorandom generator
 58 |     CURAND_RNG_PSEUDO_MRG32K3A = 121,          ///< MRG32k3a pseudorandom generator
 59 |     CURAND_RNG_PSEUDO_MTGP32 = 141,            ///< Mersenne Twister MTGP32 pseudorandom generator
 60 |     CURAND_RNG_PSEUDO_MT19937 = 142,           ///< Mersenne Twister MT19937 pseudorandom generator
 61 |     CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161,     ///< PHILOX-4x32-10 pseudorandom generator
 62 |     CURAND_RNG_QUASI_DEFAULT = 200,            ///< Default quasirandom generator
 63 |     CURAND_RNG_QUASI_SOBOL32 = 201,            ///< Sobol32 quasirandom generator
 64 |     CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202,  ///< Scrambled Sobol32 quasirandom generator
 65 |     CURAND_RNG_QUASI_SOBOL64 = 203,            ///< Sobol64 quasirandom generator
 66 |     CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204   ///< Scrambled Sobol64 quasirandom generator
 67 | };
 68 | 
 69 | /*
 70 |  * CURAND generator types (typedef so the enum can be used without the enum keyword)
 71 |  */
 72 | /** \cond UNHIDE_TYPEDEFS */
 73 | typedef enum curandRngType curandRngType_t;
 74 | /** \endcond */
 75 | 
 76 | /**
 77 |  * CURAND ordering of results in memory (pseudorandom orderings are 100-103, the quasirandom one is 201)
 78 |  */
 79 | enum curandOrdering {
 80 |     CURAND_ORDERING_PSEUDO_BEST = 100,  ///< Best ordering for pseudorandom results
 81 |     CURAND_ORDERING_PSEUDO_DEFAULT =
 82 |         101,  ///< Specific default thread sequence for pseudorandom results, same as CURAND_ORDERING_PSEUDO_BEST
 83 |     CURAND_ORDERING_PSEUDO_SEEDED = 102,  ///< Specific seeding pattern for fast lower quality pseudorandom results
 84 |     CURAND_ORDERING_PSEUDO_LEGACY = 103,  ///< Specific legacy sequence for pseudorandom results, guaranteed to remain
 85 |                                           ///< the same for all cuRAND releases
 86 |     CURAND_ORDERING_QUASI_DEFAULT = 201   ///< Specific n-dimensional ordering for quasirandom results
 87 | };
 88 | 
 89 | /*
 90 |  * CURAND ordering of results in memory (typedef so the enum can be used without the enum keyword)
 91 |  */
 92 | /** \cond UNHIDE_TYPEDEFS */
 93 | typedef enum curandOrdering curandOrdering_t;
 94 | /** \endcond */
 95 | 
 96 | /**
 97 |  * CURAND choice of direction vector set
 98 |  */
 99 | enum curandDirectionVectorSet {
100 |     CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101,  ///< Specific set of 32-bit direction vectors generated from polynomials
101 |                                                 ///< recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions
102 |     CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 =
103 |         102,  ///< Specific set of 32-bit direction vectors generated from polynomials recommended by S. Joe and F. Y.
104 |               ///< Kuo, for up to 20,000 dimensions, and scrambled
105 |     CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103,  ///< Specific set of 64-bit direction vectors generated from polynomials
106 |                                                 ///< recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions
107 |     CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 =
108 |         104  ///< Specific set of 64-bit direction vectors generated from polynomials recommended by S. Joe and F. Y.
109 |              ///< Kuo, for up to 20,000 dimensions, and scrambled
110 | };
111 | 
112 | /*
113 |  * CURAND choice of direction vector set
114 |  */
115 | /** \cond UNHIDE_TYPEDEFS */
116 | typedef enum curandDirectionVectorSet curandDirectionVectorSet_t;
117 | /** \endcond */
118 | 
119 | /**
120 |  * CURAND array of 32-bit direction vectors
121 |  */
122 | /** \cond UNHIDE_TYPEDEFS */
123 | typedef unsigned int curandDirectionVectors32_t[32];
124 | /** \endcond */
125 | 
126 | /**
127 |  * CURAND array of 64-bit direction vectors
128 |  */
129 | /** \cond UNHIDE_TYPEDEFS */
130 | typedef unsigned long long curandDirectionVectors64_t[64];
131 | /** \endcond */
132 | 
133 | /**
134 |  * CURAND generator (opaque)
135 |  */
136 | struct curandGenerator_st;
137 | 
138 | /**
139 |  * CURAND generator
140 |  */
141 | /** \cond UNHIDE_TYPEDEFS */
142 | typedef struct curandGenerator_st *curandGenerator_t;
143 | /** \endcond */
144 | 
145 | /**
146 |  * CURAND distribution
147 |  */
148 | /** \cond UNHIDE_TYPEDEFS */
149 | typedef double curandDistribution_st;
150 | typedef curandDistribution_st *curandDistribution_t;
151 | typedef struct curandDistributionShift_st *curandDistributionShift_t;
152 | /** \endcond */
153 | /**
154 |  * CURAND distribution M2
155 |  */
156 | /** \cond UNHIDE_TYPEDEFS */
157 | typedef struct curandDistributionM2Shift_st *curandDistributionM2Shift_t;
158 | typedef struct curandHistogramM2_st *curandHistogramM2_t;
159 | typedef unsigned int curandHistogramM2K_st;
160 | typedef curandHistogramM2K_st *curandHistogramM2K_t;
161 | typedef curandDistribution_st curandHistogramM2V_st;
162 | typedef curandHistogramM2V_st *curandHistogramM2V_t;
163 | 
164 | typedef struct curandDiscreteDistribution_st *curandDiscreteDistribution_t;
165 | /** \endcond */
166 | 
167 | /*
168 |  * CURAND METHOD
169 |  */
170 | /** \cond UNHIDE_ENUMS */
171 | enum curandMethod {
172 |     CURAND_CHOOSE_BEST = 0,  // choose the best method depending on the arguments
173 |     CURAND_ITR = 1,
174 |     CURAND_KNUTH = 2,
175 |     CURAND_HITR = 3,
176 |     CURAND_M1 = 4,
177 |     CURAND_M2 = 5,
178 |     CURAND_BINARY_SEARCH = 6,
179 |     CURAND_DISCRETE_GAUSS = 7,
180 |     CURAND_REJECTION = 8,
181 |     CURAND_DEVICE_API = 9,
182 |     CURAND_FAST_REJECTION = 10,
183 |     CURAND_3RD = 11,
184 |     CURAND_DEFINITION = 12,
185 |     CURAND_POISSON = 13
186 | };
187 | 
188 | typedef enum curandMethod curandMethod_t;
189 | /** \endcond */
190 | #ifdef __cplusplus
191 | }
192 | #endif
193 | 
194 | #endif  // __CUDA_HOOK_CURAND_SUBSET_H__
195 | 


--------------------------------------------------------------------------------
/src/cusolver/cusolver_subset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: cusolver subset
  6 | 
  7 | #ifndef __CUDA_HOOK_CUSOLVER_SUBSET_H__
  8 | #define __CUDA_HOOK_CUSOLVER_SUBSET_H__
  9 | 
 10 | #ifdef __cplusplus
 11 | extern "C" {
 12 | #endif
 13 | 
 14 | typedef int cusolver_int_t;
 15 | 
 16 | #define CUSOLVER_VER_MAJOR 11
 17 | #define CUSOLVER_VER_MINOR 2
 18 | #define CUSOLVER_VER_PATCH 0
 19 | #define CUSOLVER_VER_BUILD 120
 20 | #define CUSOLVER_VERSION (CUSOLVER_VER_MAJOR * 1000 + CUSOLVER_VER_MINOR * 100 + CUSOLVER_VER_PATCH)
 21 | 
 22 | typedef enum {
 23 |     CUSOLVER_STATUS_SUCCESS = 0,
 24 |     CUSOLVER_STATUS_NOT_INITIALIZED = 1,
 25 |     CUSOLVER_STATUS_ALLOC_FAILED = 2,
 26 |     CUSOLVER_STATUS_INVALID_VALUE = 3,
 27 |     CUSOLVER_STATUS_ARCH_MISMATCH = 4,
 28 |     CUSOLVER_STATUS_MAPPING_ERROR = 5,
 29 |     CUSOLVER_STATUS_EXECUTION_FAILED = 6,
 30 |     CUSOLVER_STATUS_INTERNAL_ERROR = 7,
 31 |     CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8,
 32 |     CUSOLVER_STATUS_NOT_SUPPORTED = 9,
 33 |     CUSOLVER_STATUS_ZERO_PIVOT = 10,
 34 |     CUSOLVER_STATUS_INVALID_LICENSE = 11,
 35 |     CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED = 12,
 36 |     CUSOLVER_STATUS_IRS_PARAMS_INVALID = 13,
 37 |     CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC = 14,
 38 |     CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE = 15,
 39 |     CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER = 16,
 40 |     CUSOLVER_STATUS_IRS_INTERNAL_ERROR = 20,
 41 |     CUSOLVER_STATUS_IRS_NOT_SUPPORTED = 21,
 42 |     CUSOLVER_STATUS_IRS_OUT_OF_RANGE = 22,
 43 |     CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES = 23,
 44 |     CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED = 25,
 45 |     CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED = 26,
 46 |     CUSOLVER_STATUS_IRS_MATRIX_SINGULAR = 30,
 47 |     CUSOLVER_STATUS_INVALID_WORKSPACE = 31
 48 | } cusolverStatus_t;
 49 | 
 50 | typedef enum { CUSOLVER_EIG_TYPE_1 = 1, CUSOLVER_EIG_TYPE_2 = 2, CUSOLVER_EIG_TYPE_3 = 3 } cusolverEigType_t;
 51 | 
 52 | typedef enum { CUSOLVER_EIG_MODE_NOVECTOR = 0, CUSOLVER_EIG_MODE_VECTOR = 1 } cusolverEigMode_t;
 53 | 
 54 | typedef enum {
 55 |     CUSOLVER_EIG_RANGE_ALL = 1001,
 56 |     CUSOLVER_EIG_RANGE_I = 1002,
 57 |     CUSOLVER_EIG_RANGE_V = 1003,
 58 | } cusolverEigRange_t;
 59 | 
 60 | typedef enum {
 61 |     CUSOLVER_INF_NORM = 104,
 62 |     CUSOLVER_MAX_NORM = 105,
 63 |     CUSOLVER_ONE_NORM = 106,
 64 |     CUSOLVER_FRO_NORM = 107,
 65 | } cusolverNorm_t;
 66 | 
 67 | typedef enum {
 68 |     CUSOLVER_IRS_REFINE_NOT_SET = 1100,
 69 |     CUSOLVER_IRS_REFINE_NONE = 1101,
 70 |     CUSOLVER_IRS_REFINE_CLASSICAL = 1102,
 71 |     CUSOLVER_IRS_REFINE_CLASSICAL_GMRES = 1103,
 72 |     CUSOLVER_IRS_REFINE_GMRES = 1104,
 73 |     CUSOLVER_IRS_REFINE_GMRES_GMRES = 1105,
 74 |     CUSOLVER_IRS_REFINE_GMRES_NOPCOND = 1106,
 75 | 
 76 |     CUSOLVER_PREC_DD = 1150,
 77 |     CUSOLVER_PREC_SS = 1151,
 78 |     CUSOLVER_PREC_SHT = 1152,
 79 | 
 80 | } cusolverIRSRefinement_t;
 81 | 
 82 | typedef enum {
 83 |     CUSOLVER_R_8I = 1201,
 84 |     CUSOLVER_R_8U = 1202,
 85 |     CUSOLVER_R_64F = 1203,
 86 |     CUSOLVER_R_32F = 1204,
 87 |     CUSOLVER_R_16F = 1205,
 88 |     CUSOLVER_R_16BF = 1206,
 89 |     CUSOLVER_R_TF32 = 1207,
 90 |     CUSOLVER_R_AP = 1208,
 91 |     CUSOLVER_C_8I = 1211,
 92 |     CUSOLVER_C_8U = 1212,
 93 |     CUSOLVER_C_64F = 1213,
 94 |     CUSOLVER_C_32F = 1214,
 95 |     CUSOLVER_C_16F = 1215,
 96 |     CUSOLVER_C_16BF = 1216,
 97 |     CUSOLVER_C_TF32 = 1217,
 98 |     CUSOLVER_C_AP = 1218,
 99 | } cusolverPrecType_t;
100 | 
101 | typedef enum {
102 |     CUSOLVER_ALG_0 = 0, /* default algorithm */
103 |     CUSOLVER_ALG_1 = 1
104 | } cusolverAlgMode_t;
105 | 
106 | typedef enum { CUBLAS_STOREV_COLUMNWISE = 0, CUBLAS_STOREV_ROWWISE = 1 } cusolverStorevMode_t;
107 | 
108 | typedef enum { CUBLAS_DIRECT_FORWARD = 0, CUBLAS_DIRECT_BACKWARD = 1 } cusolverDirectMode_t;
109 | 
110 | struct cusolverDnContext;
111 | typedef struct cusolverDnContext *cusolverDnHandle_t;
112 | 
113 | struct syevjInfo;
114 | typedef struct syevjInfo *syevjInfo_t;
115 | 
116 | struct gesvdjInfo;
117 | typedef struct gesvdjInfo *gesvdjInfo_t;
118 | 
119 | //------------------------------------------------------
120 | // opaque cusolverDnIRS structure for IRS solver
121 | struct cusolverDnIRSParams;
122 | typedef struct cusolverDnIRSParams *cusolverDnIRSParams_t;
123 | 
124 | struct cusolverDnIRSInfos;
125 | typedef struct cusolverDnIRSInfos *cusolverDnIRSInfos_t;
126 | //------------------------------------------------------
127 | 
128 | struct cusolverDnParams;
129 | typedef struct cusolverDnParams *cusolverDnParams_t;
130 | 
131 | typedef enum { CUSOLVERDN_GETRF = 0 } cusolverDnFunction_t;
132 | 
133 | struct cusolverMgContext;
134 | typedef struct cusolverMgContext *cusolverMgHandle_t;
135 | 
136 | /**
137 |  * \brief This enum decides how 1D device IDs (or process ranks) get mapped to a 2D grid.
138 |  */
139 | typedef enum {
140 | 
141 |     CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1,
142 |     CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0
143 | 
144 | } cusolverMgGridMapping_t;
145 | 
146 | /** \brief Opaque structure of the distributed grid */
147 | typedef void *cudaLibMgGrid_t;
148 | /** \brief Opaque structure of the distributed matrix descriptor */
149 | typedef void *cudaLibMgMatrixDesc_t;
150 | 
151 | /* CUSOLVERRF mode */
152 | typedef enum {
153 |     CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0,  // default
154 |     CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1
155 | } cusolverRfResetValuesFastMode_t;
156 | 
157 | /* CUSOLVERRF matrix format */
158 | typedef enum {
159 |     CUSOLVERRF_MATRIX_FORMAT_CSR = 0,  // default
160 |     CUSOLVERRF_MATRIX_FORMAT_CSC = 1
161 | } cusolverRfMatrixFormat_t;
162 | 
163 | /* CUSOLVERRF unit diagonal */
164 | typedef enum {
165 |     CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0,  // default
166 |     CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1,
167 |     CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2,
168 |     CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3
169 | } cusolverRfUnitDiagonal_t;
170 | 
171 | /* CUSOLVERRF factorization algorithm */
172 | typedef enum {
173 |     CUSOLVERRF_FACTORIZATION_ALG0 = 0,  // default
174 |     CUSOLVERRF_FACTORIZATION_ALG1 = 1,
175 |     CUSOLVERRF_FACTORIZATION_ALG2 = 2,
176 | } cusolverRfFactorization_t;
177 | 
178 | /* CUSOLVERRF triangular solve algorithm */
179 | typedef enum {
180 |     CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1,  // default
181 |     CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2,
182 |     CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3
183 | } cusolverRfTriangularSolve_t;
184 | 
185 | /* CUSOLVERRF numeric boost report */
186 | typedef enum {
187 |     CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0,  // default
188 |     CUSOLVERRF_NUMERIC_BOOST_USED = 1
189 | } cusolverRfNumericBoostReport_t;
190 | 
191 | /* Opaque structure holding CUSOLVERRF library common */
192 | struct cusolverRfCommon;
193 | typedef struct cusolverRfCommon *cusolverRfHandle_t;
194 | 
195 | struct cusolverSpContext;
196 | typedef struct cusolverSpContext *cusolverSpHandle_t;
197 | 
198 | struct csrqrInfo;
199 | typedef struct csrqrInfo *csrqrInfo_t;
200 | 
201 | struct csrluInfoHost;
202 | typedef struct csrluInfoHost *csrluInfoHost_t;
203 | 
204 | struct csrqrInfoHost;
205 | typedef struct csrqrInfoHost *csrqrInfoHost_t;
206 | 
207 | struct csrcholInfoHost;
208 | typedef struct csrcholInfoHost *csrcholInfoHost_t;
209 | 
210 | struct csrcholInfo;
211 | typedef struct csrcholInfo *csrcholInfo_t;
212 | 
213 | #ifdef __cplusplus
214 | }
215 | #endif
216 | 
217 | #endif  // __CUDA_HOOK_CUSOLVER_SUBSET_H__
218 | 


--------------------------------------------------------------------------------
/src/cusparse/cusparse_subset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: cusparse subset
  6 | 
  7 | #ifndef __CUDA_HOOK_CUSPARSE_SUBSET_H__
  8 | #define __CUDA_HOOK_CUSPARSE_SUBSET_H__
  9 | 
 10 | #ifdef __cplusplus
 11 | extern "C" {
 12 | #endif
 13 | 
 14 | //##############################################################################
 15 | //# CUSPARSE VERSION INFORMATION
 16 | //##############################################################################
 17 | 
 18 | #define CUSPARSE_VER_MAJOR 11
 19 | #define CUSPARSE_VER_MINOR 6
 20 | #define CUSPARSE_VER_PATCH 0
 21 | #define CUSPARSE_VER_BUILD 120
 22 | #define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + CUSPARSE_VER_MINOR * 100 + CUSPARSE_VER_PATCH)
 23 | 
 24 | //------------------------------------------------------------------------------
 25 | 
 26 | struct cusparseContext;
 27 | typedef struct cusparseContext *cusparseHandle_t;
 28 | 
 29 | struct cusparseMatDescr;
 30 | typedef struct cusparseMatDescr *cusparseMatDescr_t;
 31 | 
 32 | struct csrsv2Info;
 33 | typedef struct csrsv2Info *csrsv2Info_t;
 34 | 
 35 | struct csrsm2Info;
 36 | typedef struct csrsm2Info *csrsm2Info_t;
 37 | 
 38 | struct bsrsv2Info;
 39 | typedef struct bsrsv2Info *bsrsv2Info_t;
 40 | 
 41 | struct bsrsm2Info;
 42 | typedef struct bsrsm2Info *bsrsm2Info_t;
 43 | 
 44 | struct csric02Info;
 45 | typedef struct csric02Info *csric02Info_t;
 46 | 
 47 | struct bsric02Info;
 48 | typedef struct bsric02Info *bsric02Info_t;
 49 | 
 50 | struct csrilu02Info;
 51 | typedef struct csrilu02Info *csrilu02Info_t;
 52 | 
 53 | struct bsrilu02Info;
 54 | typedef struct bsrilu02Info *bsrilu02Info_t;
 55 | 
 56 | struct csrgemm2Info;
 57 | typedef struct csrgemm2Info *csrgemm2Info_t;
 58 | 
 59 | struct csru2csrInfo;
 60 | typedef struct csru2csrInfo *csru2csrInfo_t;
 61 | 
 62 | struct cusparseColorInfo;
 63 | typedef struct cusparseColorInfo *cusparseColorInfo_t;
 64 | 
 65 | struct pruneInfo;
 66 | typedef struct pruneInfo *pruneInfo_t;
 67 | 
 68 | //##############################################################################
 69 | //# ENUMERATORS
 70 | //##############################################################################
 71 | 
 72 | typedef enum {
 73 |     CUSPARSE_STATUS_SUCCESS = 0,
 74 |     CUSPARSE_STATUS_NOT_INITIALIZED = 1,
 75 |     CUSPARSE_STATUS_ALLOC_FAILED = 2,
 76 |     CUSPARSE_STATUS_INVALID_VALUE = 3,
 77 |     CUSPARSE_STATUS_ARCH_MISMATCH = 4,
 78 |     CUSPARSE_STATUS_MAPPING_ERROR = 5,
 79 |     CUSPARSE_STATUS_EXECUTION_FAILED = 6,
 80 |     CUSPARSE_STATUS_INTERNAL_ERROR = 7,
 81 |     CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8,
 82 |     CUSPARSE_STATUS_ZERO_PIVOT = 9,
 83 |     CUSPARSE_STATUS_NOT_SUPPORTED = 10,
 84 |     CUSPARSE_STATUS_INSUFFICIENT_RESOURCES = 11
 85 | } cusparseStatus_t;
 86 | 
 87 | typedef enum { CUSPARSE_POINTER_MODE_HOST = 0, CUSPARSE_POINTER_MODE_DEVICE = 1 } cusparsePointerMode_t;
 88 | 
 89 | typedef enum { CUSPARSE_ACTION_SYMBOLIC = 0, CUSPARSE_ACTION_NUMERIC = 1 } cusparseAction_t;
 90 | 
 91 | typedef enum {
 92 |     CUSPARSE_MATRIX_TYPE_GENERAL = 0,
 93 |     CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1,
 94 |     CUSPARSE_MATRIX_TYPE_HERMITIAN = 2,
 95 |     CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
 96 | } cusparseMatrixType_t;
 97 | 
 98 | typedef enum { CUSPARSE_FILL_MODE_LOWER = 0, CUSPARSE_FILL_MODE_UPPER = 1 } cusparseFillMode_t;
 99 | 
100 | typedef enum { CUSPARSE_DIAG_TYPE_NON_UNIT = 0, CUSPARSE_DIAG_TYPE_UNIT = 1 } cusparseDiagType_t;
101 | 
102 | typedef enum { CUSPARSE_INDEX_BASE_ZERO = 0, CUSPARSE_INDEX_BASE_ONE = 1 } cusparseIndexBase_t;
103 | 
104 | typedef enum {
105 |     CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
106 |     CUSPARSE_OPERATION_TRANSPOSE = 1,
107 |     CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
108 | } cusparseOperation_t;
109 | 
110 | typedef enum { CUSPARSE_DIRECTION_ROW = 0, CUSPARSE_DIRECTION_COLUMN = 1 } cusparseDirection_t;
111 | 
112 | typedef enum { CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0, CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 } cusparseSolvePolicy_t;
113 | 
114 | typedef enum { CUSPARSE_SIDE_LEFT = 0, CUSPARSE_SIDE_RIGHT = 1 } cusparseSideMode_t;
115 | 
116 | typedef enum {
117 |     CUSPARSE_COLOR_ALG0 = 0,  // default
118 |     CUSPARSE_COLOR_ALG1 = 1
119 | } cusparseColorAlg_t;
120 | 
121 | typedef enum {
122 |     CUSPARSE_ALG_MERGE_PATH  // merge path alias
123 | } cusparseAlgMode_t;
124 | 
125 | typedef enum {
126 |     CUSPARSE_CSR2CSC_ALG1 = 1,  // faster than V2 (in general), deterministic
127 |     CUSPARSE_CSR2CSC_ALG2 = 2   // low memory requirement, non-deterministic
128 | } cusparseCsr2CscAlg_t;
129 | 
130 | typedef enum {
131 |     CUSPARSE_FORMAT_CSR = 1,          ///< Compressed Sparse Row (CSR)
132 |     CUSPARSE_FORMAT_CSC = 2,          ///< Compressed Sparse Column (CSC)
133 |     CUSPARSE_FORMAT_COO = 3,          ///< Coordinate (COO) - Structure of Arrays
134 |     CUSPARSE_FORMAT_COO_AOS = 4,      ///< Coordinate (COO) - Array of Structures
135 |     CUSPARSE_FORMAT_BLOCKED_ELL = 5,  ///< Blocked ELL
136 | } cusparseFormat_t;
137 | 
138 | typedef enum {
139 |     CUSPARSE_ORDER_COL = 1,  ///< Column-Major Order - Matrix memory layout
140 |     CUSPARSE_ORDER_ROW = 2   ///< Row-Major Order - Matrix memory layout
141 | } cusparseOrder_t;
142 | 
143 | typedef enum {
144 |     CUSPARSE_INDEX_16U = 1,  ///< 16-bit unsigned integer for matrix/vector
145 |                              ///< indices
146 |     CUSPARSE_INDEX_32I = 2,  ///< 32-bit signed integer for matrix/vector indices
147 |     CUSPARSE_INDEX_64I = 3   ///< 64-bit signed integer for matrix/vector indices
148 | } cusparseIndexType_t;
149 | 
150 | //------------------------------------------------------------------------------
151 | 
152 | struct cusparseSpVecDescr;
153 | struct cusparseDnVecDescr;
154 | struct cusparseSpMatDescr;
155 | struct cusparseDnMatDescr;
156 | typedef struct cusparseSpVecDescr *cusparseSpVecDescr_t;
157 | typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t;
158 | typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t;
159 | typedef struct cusparseDnMatDescr *cusparseDnMatDescr_t;
160 | 
161 | typedef enum { CUSPARSE_SPMAT_FILL_MODE, CUSPARSE_SPMAT_DIAG_TYPE } cusparseSpMatAttribute_t;
162 | 
163 | typedef enum { CUSPARSE_SPARSETODENSE_ALG_DEFAULT = 0 } cusparseSparseToDenseAlg_t;
164 | 
165 | typedef enum { CUSPARSE_DENSETOSPARSE_ALG_DEFAULT = 0 } cusparseDenseToSparseAlg_t;
166 | 
167 | typedef enum {
168 |     // Algorithm identifiers for cusparseSpMVAlg_t, listed in ascending numeric order.
169 |     // Deprecated spellings that are commented out (not declared) in this subset:
170 |     //   CUSPARSE_COOMV_ALG  -> CUSPARSE_SPMV_COO_ALG1 (= 1)
171 |     //   CUSPARSE_CSRMV_ALG1 -> CUSPARSE_SPMV_CSR_ALG1 (= 2)
172 |     //   CUSPARSE_CSRMV_ALG2 -> CUSPARSE_SPMV_CSR_ALG2 (= 3)
173 |     CUSPARSE_MV_ALG_DEFAULT = 0,  // deprecated spelling of CUSPARSE_SPMV_ALG_DEFAULT
174 |     CUSPARSE_SPMV_ALG_DEFAULT = 0,
175 |     CUSPARSE_SPMV_COO_ALG1 = 1,
176 |     CUSPARSE_SPMV_CSR_ALG1 = 2,
177 |     CUSPARSE_SPMV_CSR_ALG2 = 3,
178 |     CUSPARSE_SPMV_COO_ALG2 = 4
179 | } cusparseSpMVAlg_t;
180 | 
181 | typedef enum {
182 |     CUSPARSE_SPSV_ALG_DEFAULT = 0,
183 | } cusparseSpSVAlg_t;
184 | 
185 | struct cusparseSpSVDescr;
186 | typedef struct cusparseSpSVDescr *cusparseSpSVDescr_t;
187 | 
188 | typedef enum {
189 |     CUSPARSE_SPSM_ALG_DEFAULT = 0,
190 | } cusparseSpSMAlg_t;
191 | 
192 | struct cusparseSpSMDescr;
193 | typedef struct cusparseSpSMDescr *cusparseSpSMDescr_t;
194 | 
195 | typedef enum {
196 |     // CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
197 |     // CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
198 |     // CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
199 |     // CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
200 |     // CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
201 |     CUSPARSE_SPMM_ALG_DEFAULT = 0,
202 |     CUSPARSE_SPMM_COO_ALG1 = 1,
203 |     CUSPARSE_SPMM_COO_ALG2 = 2,
204 |     CUSPARSE_SPMM_COO_ALG3 = 3,
205 |     CUSPARSE_SPMM_COO_ALG4 = 5,
206 |     CUSPARSE_SPMM_CSR_ALG1 = 4,
207 |     CUSPARSE_SPMM_CSR_ALG2 = 6,
208 |     CUSPARSE_SPMM_CSR_ALG3 = 12,
209 |     CUSPARSE_SPMM_BLOCKED_ELL_ALG1 = 13
210 | } cusparseSpMMAlg_t;
211 | 
212 | typedef enum {
213 |     CUSPARSE_SPGEMM_DEFAULT = 0,
214 |     CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC = 1,    // sic: misspelled name kept (presumably mirrors upstream - confirm)
215 |     CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2  // sic: renaming either enumerator would break code using it
216 | } cusparseSpGEMMAlg_t;
217 | 
218 | struct cusparseSpGEMMDescr;
219 | typedef struct cusparseSpGEMMDescr *cusparseSpGEMMDescr_t;
220 | 
221 | typedef enum { CUSPARSE_SDDMM_ALG_DEFAULT = 0 } cusparseSDDMMAlg_t;
222 | 
223 | #ifdef __cplusplus
224 | }
225 | #endif
226 | 
227 | #endif  // __CUDA_HOOK_CUSPARSE_SUBSET_H__
228 | 


--------------------------------------------------------------------------------
/src/nvjpeg/nvjpeg_subset.h:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 22:07:19 on Wed, Jul 20, 2022
  4 | //
  5 | // Description: nvjpeg subset
  6 | 
  7 | #ifndef __CUDA_HOOK_NVJPEG_SUBSET_H__
  8 | #define __CUDA_HOOK_NVJPEG_SUBSET_H__
  9 | #include <stddef.h>  // size_t: used below by nvjpegImage_t and the allocator prototypes
 10 | #ifdef __cplusplus
 11 | extern "C" {
 12 | #endif
 13 | 
 14 | // Maximum number of channels nvjpeg decoder supports
 15 | #define NVJPEG_MAX_COMPONENT 4
 16 | 
 17 | // nvjpeg version information
 18 | #define NVJPEG_VER_MAJOR 11
 19 | #define NVJPEG_VER_MINOR 5
 20 | #define NVJPEG_VER_PATCH 2
 21 | #define NVJPEG_VER_BUILD 120
 22 | 
 23 | /* nvJPEG status enums, returned by nvJPEG API */
 24 | typedef enum {
 25 |     NVJPEG_STATUS_SUCCESS = 0,
 26 |     NVJPEG_STATUS_NOT_INITIALIZED = 1,
 27 |     NVJPEG_STATUS_INVALID_PARAMETER = 2,
 28 |     NVJPEG_STATUS_BAD_JPEG = 3,
 29 |     NVJPEG_STATUS_JPEG_NOT_SUPPORTED = 4,
 30 |     NVJPEG_STATUS_ALLOCATOR_FAILURE = 5,
 31 |     NVJPEG_STATUS_EXECUTION_FAILED = 6,
 32 |     NVJPEG_STATUS_ARCH_MISMATCH = 7,
 33 |     NVJPEG_STATUS_INTERNAL_ERROR = 8,
 34 |     NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED = 9,
 35 | } nvjpegStatus_t;
 36 | 
 37 | // Enum identifies image chroma subsampling values stored inside JPEG input stream
 38 | // In the case of NVJPEG_CSS_GRAY only 1 luminance channel is encoded in JPEG input stream
 39 | // Otherwise both chroma planes are present
 40 | typedef enum {
 41 |     NVJPEG_CSS_444 = 0,
 42 |     NVJPEG_CSS_422 = 1,
 43 |     NVJPEG_CSS_420 = 2,
 44 |     NVJPEG_CSS_440 = 3,
 45 |     NVJPEG_CSS_411 = 4,
 46 |     NVJPEG_CSS_410 = 5,
 47 |     NVJPEG_CSS_GRAY = 6,
 48 |     NVJPEG_CSS_410V = 7,
 49 |     NVJPEG_CSS_UNKNOWN = -1
 50 | } nvjpegChromaSubsampling_t;
 51 | 
 52 | // Parameter of this type specifies what type of output user wants for image decoding
 53 | typedef enum {
 54 |     // return decompressed image as it is - write planar output
 55 |     NVJPEG_OUTPUT_UNCHANGED = 0,
 56 |     // return planar luma and chroma, assuming YCbCr colorspace
 57 |     NVJPEG_OUTPUT_YUV = 1,
 58 |     // return luma component only, if YCbCr colorspace,
 59 |     // or try to convert to grayscale,
 60 |     // writes to 1-st channel of nvjpegImage_t
 61 |     NVJPEG_OUTPUT_Y = 2,
 62 |     // convert to planar RGB
 63 |     NVJPEG_OUTPUT_RGB = 3,
 64 |     // convert to planar BGR
 65 |     NVJPEG_OUTPUT_BGR = 4,
 66 |     // convert to interleaved RGB and write to 1-st channel of nvjpegImage_t
 67 |     NVJPEG_OUTPUT_RGBI = 5,
 68 |     // convert to interleaved BGR and write to 1-st channel of nvjpegImage_t
 69 |     NVJPEG_OUTPUT_BGRI = 6,
 70 |     // maximum allowed value
 71 |     NVJPEG_OUTPUT_FORMAT_MAX = 6
 72 | } nvjpegOutputFormat_t;
 73 | 
 74 | // Parameter of this type specifies what type of input the user provides for encoding
 75 | typedef enum {
 76 |     NVJPEG_INPUT_RGB = 3,   // Input is RGB - will be converted to YCbCr before encoding
 77 |     NVJPEG_INPUT_BGR = 4,   // Input is BGR - will be converted to YCbCr before encoding
 78 |     NVJPEG_INPUT_RGBI = 5,  // Input is interleaved RGB - will be converted to YCbCr before encoding
 79 |     NVJPEG_INPUT_BGRI = 6   // Input is interleaved BGR - will be converted to YCbCr before encoding
 80 | } nvjpegInputFormat_t;
 81 | 
 82 | // Implementation
 83 | // NVJPEG_BACKEND_DEFAULT    : default value
 84 | // NVJPEG_BACKEND_HYBRID     : uses CPU for Huffman decode
 85 | // NVJPEG_BACKEND_GPU_HYBRID : uses GPU-assisted Huffman decode. nvjpegDecodeBatched will use GPU decoding
 86 | //                             for baseline JPEG bitstreams with interleaved scan when the batch size is
 87 | //                             bigger than 100
 88 | // NVJPEG_BACKEND_HARDWARE   : supports baseline JPEG bitstreams with a single scan. 410 and 411
 89 | //                             sub-samplings are not supported
 90 | typedef enum {
 91 |     NVJPEG_BACKEND_DEFAULT = 0,
 92 |     NVJPEG_BACKEND_HYBRID = 1,
 93 |     NVJPEG_BACKEND_GPU_HYBRID = 2,
 94 |     NVJPEG_BACKEND_HARDWARE = 3
 95 | } nvjpegBackend_t;
 96 | 
 97 | // Currently parseable JPEG encodings (SOF markers)
 98 | typedef enum {
 99 |     NVJPEG_ENCODING_UNKNOWN = 0x0,
100 | 
101 |     NVJPEG_ENCODING_BASELINE_DCT = 0xc0,
102 |     NVJPEG_ENCODING_EXTENDED_SEQUENTIAL_DCT_HUFFMAN = 0xc1,
103 |     NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN = 0xc2
104 | 
105 | } nvjpegJpegEncoding_t;
106 | 
107 | typedef enum {
108 |     NVJPEG_SCALE_NONE = 0,    // decoded output is not scaled
109 |     NVJPEG_SCALE_1_BY_2 = 1,  // decoded output width and height is scaled by a factor of 1/2
110 |     NVJPEG_SCALE_1_BY_4 = 2,  // decoded output width and height is scaled by a factor of 1/4
111 |     NVJPEG_SCALE_1_BY_8 = 3,  // decoded output width and height is scaled by a factor of 1/8
112 | } nvjpegScaleFactor_t;
113 | 
114 | #define NVJPEG_FLAGS_DEFAULT 0
115 | #define NVJPEG_FLAGS_HW_DECODE_NO_PIPELINE 1
116 | #define NVJPEG_FLAGS_ENABLE_MEMORY_POOLS (1 << 1)  // parenthesized so the shift survives use inside larger expressions
117 | #define NVJPEG_FLAGS_BITSTREAM_STRICT (1 << 2)     // e.g. FLAG * 2 or FLAG + 1 would otherwise bind to the shift count
118 | 
119 | // Output descriptor.
120 | // Data that is written to planes depends on output format
121 | typedef struct {
122 |     unsigned char *channel[NVJPEG_MAX_COMPONENT];
123 |     size_t pitch[NVJPEG_MAX_COMPONENT];
124 | } nvjpegImage_t;
125 | 
126 | // Prototype for device memory allocation, modelled after cudaMalloc()
127 | typedef int (*tDevMalloc)(void **, size_t);
128 | // Prototype for device memory release
129 | typedef int (*tDevFree)(void *);
130 | 
131 | // Prototype for pinned memory allocation, modelled after cudaHostAlloc()
132 | typedef int (*tPinnedMalloc)(void **, size_t, unsigned int flags);
133 | // Prototype for pinned memory release
134 | typedef int (*tPinnedFree)(void *);
135 | 
136 | // Device memory allocator built from the prototypes above, provided to nvjpegCreateEx.
137 | // This allocator will be used for all device memory allocations inside the library.
138 | // In any case, the library performs smart allocations (it reallocates memory only if needed).
139 | typedef struct {
140 |     tDevMalloc dev_malloc;
141 |     tDevFree dev_free;
142 | } nvjpegDevAllocator_t;
143 | 
144 | // Pinned memory allocator built from the prototypes above, provided to nvjpegCreate.
145 | // This allocator will be used for all pinned host memory allocations inside the library.
146 | // In any case, the library performs smart allocations (it reallocates memory only if needed).
147 | typedef struct {
148 |     tPinnedMalloc pinned_malloc;
149 |     tPinnedFree pinned_free;
150 | } nvjpegPinnedAllocator_t;
151 | 
152 | // Opaque library handle identifier.
153 | struct nvjpegHandle;
154 | typedef struct nvjpegHandle *nvjpegHandle_t;
155 | 
156 | // Opaque jpeg decoding state handle identifier - used to store intermediate information between decoding phases
157 | struct nvjpegJpegState;
158 | typedef struct nvjpegJpegState *nvjpegJpegState_t;
159 | 
160 | struct nvjpegEncoderState;
161 | typedef struct nvjpegEncoderState *nvjpegEncoderState_t;
162 | 
163 | struct nvjpegEncoderParams;
164 | typedef struct nvjpegEncoderParams *nvjpegEncoderParams_t;
165 | 
166 | struct nvjpegBufferPinned;
167 | typedef struct nvjpegBufferPinned *nvjpegBufferPinned_t;
168 | 
169 | struct nvjpegBufferDevice;
170 | typedef struct nvjpegBufferDevice *nvjpegBufferDevice_t;
171 | 
172 | struct nvjpegJpegStream;
173 | typedef struct nvjpegJpegStream *nvjpegJpegStream_t;
174 | 
175 | struct nvjpegDecodeParams;
176 | typedef struct nvjpegDecodeParams *nvjpegDecodeParams_t;
177 | 
178 | struct nvjpegJpegDecoder;
179 | typedef struct nvjpegJpegDecoder *nvjpegJpegDecoder_t;
180 | 
181 | #ifdef __cplusplus
182 | }
183 | #endif
184 | 
185 | #endif  // __CUDA_HOOK_NVJPEG_SUBSET_H__
186 | 


--------------------------------------------------------------------------------
/src/nvrtc/nvrtc_hook.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: auto generate 18 apis
  6 | 
  7 | #include "hook.h"
  8 | #include "macro_common.h"
  9 | #include "nvrtc_subset.h"
 10 | #include "trace_profile.h"
 11 | 
 12 | HOOK_C_API HOOK_DECL_EXPORT const char *nvrtcGetErrorString(nvrtcResult result) {
 13 |     HOOK_TRACE_PROFILE("nvrtcGetErrorString");  // presumably tracing/profiling instrumentation - TODO confirm
 14 |     using func_ptr = const char *(*)(nvrtcResult);  // same parameter and return types as this wrapper
 15 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetErrorString"));
 16 |     HOOK_CHECK(func_entry);  // func_entry is a function-local static: initialized on first call, then reused
 17 |     return func_entry(result);  // forward the argument unchanged and return the callee's result
 18 | }
 19 | 
 20 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcVersion(int *major, int *minor) {
 21 |     HOOK_TRACE_PROFILE("nvrtcVersion");  // presumably tracing/profiling instrumentation - TODO confirm
 22 |     using func_ptr = nvrtcResult (*)(int *, int *);  // same parameter and return types as this wrapper
 23 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcVersion"));
 24 |     HOOK_CHECK(func_entry);  // func_entry is a function-local static: initialized on first call, then reused
 25 |     return func_entry(major, minor);  // forward both pointers unchanged and return the callee's result
 26 | }
 27 | 
 28 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs) {
 29 |     HOOK_TRACE_PROFILE("nvrtcGetNumSupportedArchs");  // presumably tracing/profiling instrumentation - TODO confirm
 30 |     using func_ptr = nvrtcResult (*)(int *);  // same parameter and return types as this wrapper
 31 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetNumSupportedArchs"));
 32 |     HOOK_CHECK(func_entry);  // func_entry is a function-local static: initialized on first call, then reused
 33 |     return func_entry(numArchs);  // forward the pointer unchanged and return the callee's result
 34 | }
 35 | 
 36 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs) {
 37 |     HOOK_TRACE_PROFILE("nvrtcGetSupportedArchs");  // presumably tracing/profiling instrumentation - TODO confirm
 38 |     using func_ptr = nvrtcResult (*)(int *);  // same parameter and return types as this wrapper
 39 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetSupportedArchs"));
 40 |     HOOK_CHECK(func_entry);  // func_entry is a function-local static: initialized on first call, then reused
 41 |     return func_entry(supportedArchs);  // forward the pointer unchanged and return the callee's result
 42 | }
 43 | 
 44 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name,
 45 |                                                            int numHeaders, const char *const *headers,
 46 |                                                            const char *const *includeNames) {
 47 |     HOOK_TRACE_PROFILE("nvrtcCreateProgram");  // presumably tracing/profiling instrumentation - TODO confirm
 48 |     using func_ptr =  // same parameter and return types as this wrapper
 49 |         nvrtcResult (*)(nvrtcProgram *, const char *, const char *, int, const char *const *, const char *const *);
 50 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcCreateProgram"));
 51 |     HOOK_CHECK(func_entry);  // func_entry is a function-local static: initialized on first call, then reused
 52 |     return func_entry(prog, src, name, numHeaders, headers, includeNames);  // all six arguments forwarded unchanged
 53 | }
 54 | 
 55 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
 56 |     HOOK_TRACE_PROFILE("nvrtcDestroyProgram");
 57 |     using func_ptr = nvrtcResult (*)(nvrtcProgram *);
 58 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcDestroyProgram"));  // function-local static, so the lookup runs only on the first call
 59 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 60 |     return func_entry(prog);
 61 | }
 62 | 
 63 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions,
 64 |                                                             const char *const *options) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
 65 |     HOOK_TRACE_PROFILE("nvrtcCompileProgram");
 66 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, int, const char *const *);
 67 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcCompileProgram"));  // function-local static, so the lookup runs only on the first call
 68 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 69 |     return func_entry(prog, numOptions, options);
 70 | }
 71 | 
 72 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
 73 |     HOOK_TRACE_PROFILE("nvrtcGetPTXSize");
 74 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *);
 75 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetPTXSize"));  // function-local static, so the lookup runs only on the first call
 76 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 77 |     return func_entry(prog, ptxSizeRet);
 78 | }
 79 | 
 80 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
 81 |     HOOK_TRACE_PROFILE("nvrtcGetPTX");
 82 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, char *);
 83 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetPTX"));  // function-local static, so the lookup runs only on the first call
 84 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 85 |     return func_entry(prog, ptx);
 86 | }
 87 | 
 88 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
 89 |     HOOK_TRACE_PROFILE("nvrtcGetCUBINSize");
 90 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *);
 91 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetCUBINSize"));  // function-local static, so the lookup runs only on the first call
 92 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 93 |     return func_entry(prog, cubinSizeRet);
 94 | }
 95 | 
 96 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
 97 |     HOOK_TRACE_PROFILE("nvrtcGetCUBIN");
 98 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, char *);
 99 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetCUBIN"));  // function-local static, so the lookup runs only on the first call
100 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
101 |     return func_entry(prog, cubin);
102 | }
103 | 
104 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
105 |     HOOK_TRACE_PROFILE("nvrtcGetNVVMSize");
106 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *);
107 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetNVVMSize"));  // function-local static, so the lookup runs only on the first call
108 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
109 |     return func_entry(prog, nvvmSizeRet);
110 | }
111 | 
112 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
113 |     HOOK_TRACE_PROFILE("nvrtcGetNVVM");
114 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, char *);
115 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetNVVM"));  // function-local static, so the lookup runs only on the first call
116 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
117 |     return func_entry(prog, nvvm);
118 | }
119 | 
120 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
121 |     HOOK_TRACE_PROFILE("nvrtcGetProgramLogSize");
122 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *);
123 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetProgramLogSize"));  // function-local static, so the lookup runs only on the first call
124 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
125 |     return func_entry(prog, logSizeRet);
126 | }
127 | 
128 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
129 |     HOOK_TRACE_PROFILE("nvrtcGetProgramLog");
130 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, char *);
131 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetProgramLog"));  // function-local static, so the lookup runs only on the first call
132 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
133 |     return func_entry(prog, log);
134 | }
135 | 
136 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog, const char *const name_expression) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
137 |     HOOK_TRACE_PROFILE("nvrtcAddNameExpression");
138 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, const char *const);  // the top-level const on the parameter does not alter the function type
139 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcAddNameExpression"));  // function-local static, so the lookup runs only on the first call
140 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
141 |     return func_entry(prog, name_expression);
142 | }
143 | 
144 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog, const char *const name_expression,
145 |                                                             const char **lowered_name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVRTC_SYMBOL.
146 |     HOOK_TRACE_PROFILE("nvrtcGetLoweredName");
147 |     using func_ptr = nvrtcResult (*)(nvrtcProgram, const char *const, const char **);
148 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetLoweredName"));  // function-local static, so the lookup runs only on the first call
149 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
150 |     return func_entry(prog, name_expression, lowered_name);
151 | }
152 | 
153 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result) {  // Pass-through wrapper. NOTE(review): takes C++-only types (std::type_info, std::string) although it is declared with HOOK_C_API - confirm the intended linkage and that <typeinfo>/<string> are included.
154 |     HOOK_TRACE_PROFILE("nvrtcGetTypeName");
155 |     using func_ptr = nvrtcResult (*)(const std::type_info &, std::string *);
156 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVRTC_SYMBOL("nvrtcGetTypeName"));  // function-local static, so the lookup runs only on the first call. NOTE(review): this looks like a header-level C++ helper in NVIDIA's nvrtc.h rather than a library export - verify the lookup can succeed.
157 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
158 |     return func_entry(tinfo, result);
159 | }
160 | 


--------------------------------------------------------------------------------
/src/nvrtc/nvrtc_subset.h:
--------------------------------------------------------------------------------
 1 | // Copyright 2022. All Rights Reserved.
 2 | // Author: Bruce-Lee-LY
 3 | // Date: 17:19:12 on Sun, May 29, 2022
 4 | //
 5 | // Description: nvrtc subset
 6 | 
 7 | #ifndef __CUDA_HOOK_NVRTC_SUBSET_H__  // include guard: the body below is processed once per translation unit
 8 | #define __CUDA_HOOK_NVRTC_SUBSET_H__
 9 | 
10 | #ifdef __cplusplus  // give the declarations below C linkage when compiled as C++
11 | extern "C" {
12 | #endif
13 | 
14 | /**
15 |  * \ingroup error
16 |  * \brief   The enumerated type nvrtcResult defines API call result codes.
17 |  *          NVRTC API functions return nvrtcResult to indicate the call
18 |  *          result.
19 |  */
20 | typedef enum {  // every enumerator carries an explicit value (0-11)
21 |     NVRTC_SUCCESS = 0,
22 |     NVRTC_ERROR_OUT_OF_MEMORY = 1,
23 |     NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
24 |     NVRTC_ERROR_INVALID_INPUT = 3,
25 |     NVRTC_ERROR_INVALID_PROGRAM = 4,
26 |     NVRTC_ERROR_INVALID_OPTION = 5,
27 |     NVRTC_ERROR_COMPILATION = 6,
28 |     NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
29 |     NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
30 |     NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
31 |     NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
32 |     NVRTC_ERROR_INTERNAL_ERROR = 11
33 | } nvrtcResult;  // NOTE(review): presumably mirrors the enum in NVIDIA's nvrtc.h - confirm the values against the targeted CUDA version
34 | 
35 | /**
36 |  * \ingroup compilation
37 |  * \brief   nvrtcProgram is the unit of compilation, and an opaque handle for
38 |  *          a program.
39 |  *
40 |  * To compile a CUDA program string, an instance of nvrtcProgram must be
41 |  * created first with ::nvrtcCreateProgram, then compiled with
42 |  * ::nvrtcCompileProgram.
43 |  */
44 | typedef struct _nvrtcProgram *nvrtcProgram;  // _nvrtcProgram is never defined in this header, so the handle stays opaque here
45 | 
46 | #ifdef __cplusplus  // closes the extern "C" block opened above
47 | }
48 | #endif
49 | 
50 | #endif  // __CUDA_HOOK_NVRTC_SUBSET_H__
51 | 


--------------------------------------------------------------------------------
/src/nvtx/nvtx_hook.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright 2022. All Rights Reserved.
  2 | // Author: Bruce-Lee-LY
  3 | // Date: 17:19:12 on Sun, May 29, 2022
  4 | //
  5 | // Description: auto-generated hooks for 64 APIs
  6 | 
  7 | #include "cuda_subset.h"
  8 | #include "cudart_subset.h"
  9 | #include "hook.h"
 10 | #include "macro_common.h"
 11 | #include "nvtx_subset.h"
 12 | #include "trace_profile.h"
 13 | 
 14 | HOOK_C_API HOOK_DECL_EXPORT int nvtxInitialize(const nvtxInitializationAttributes_t *initAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 15 |     HOOK_TRACE_PROFILE("nvtxInitialize");
 16 |     using func_ptr = int (*)(const nvtxInitializationAttributes_t *);
 17 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxInitialize"));  // function-local static, so the lookup runs only on the first call
 18 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 19 |     return func_entry(initAttrib);
 20 | }
 21 | 
 22 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t *eventAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 23 |     HOOK_TRACE_PROFILE("nvtxDomainMarkEx");
 24 |     using func_ptr = void (*)(nvtxDomainHandle_t, const nvtxEventAttributes_t *);
 25 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainMarkEx"));  // function-local static, so the lookup runs only on the first call
 26 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 27 |     return func_entry(domain, eventAttrib);  // func_ptr returns void; returning a void expression is valid C++
 28 | }
 29 | 
 30 | HOOK_C_API HOOK_DECL_EXPORT void nvtxMarkEx(const nvtxEventAttributes_t *eventAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 31 |     HOOK_TRACE_PROFILE("nvtxMarkEx");
 32 |     using func_ptr = void (*)(const nvtxEventAttributes_t *);
 33 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxMarkEx"));  // function-local static, so the lookup runs only on the first call
 34 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 35 |     return func_entry(eventAttrib);  // func_ptr returns void; returning a void expression is valid C++
 36 | }
 37 | 
 38 | HOOK_C_API HOOK_DECL_EXPORT void nvtxMarkA(const char *message) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 39 |     HOOK_TRACE_PROFILE("nvtxMarkA");
 40 |     using func_ptr = void (*)(const char *);
 41 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxMarkA"));  // function-local static, so the lookup runs only on the first call
 42 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 43 |     return func_entry(message);  // func_ptr returns void; returning a void expression is valid C++
 44 | }
 45 | 
 46 | HOOK_C_API HOOK_DECL_EXPORT void nvtxMarkW(const wchar_t *message) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 47 |     HOOK_TRACE_PROFILE("nvtxMarkW");
 48 |     using func_ptr = void (*)(const wchar_t *);
 49 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxMarkW"));  // function-local static, so the lookup runs only on the first call
 50 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 51 |     return func_entry(message);  // func_ptr returns void; returning a void expression is valid C++
 52 | }
 53 | 
 54 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxDomainRangeStartEx(nvtxDomainHandle_t domain,
 55 |                                                                  const nvtxEventAttributes_t *eventAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 56 |     HOOK_TRACE_PROFILE("nvtxDomainRangeStartEx");
 57 |     using func_ptr = nvtxRangeId_t (*)(nvtxDomainHandle_t, const nvtxEventAttributes_t *);
 58 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainRangeStartEx"));  // function-local static, so the lookup runs only on the first call
 59 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 60 |     return func_entry(domain, eventAttrib);
 61 | }
 62 | 
 63 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxRangeStartEx(const nvtxEventAttributes_t *eventAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 64 |     HOOK_TRACE_PROFILE("nvtxRangeStartEx");
 65 |     using func_ptr = nvtxRangeId_t (*)(const nvtxEventAttributes_t *);
 66 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangeStartEx"));  // function-local static, so the lookup runs only on the first call
 67 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 68 |     return func_entry(eventAttrib);
 69 | }
 70 | 
 71 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxRangeStartA(const char *message) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 72 |     HOOK_TRACE_PROFILE("nvtxRangeStartA");
 73 |     using func_ptr = nvtxRangeId_t (*)(const char *);
 74 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangeStartA"));  // function-local static, so the lookup runs only on the first call
 75 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 76 |     return func_entry(message);
 77 | }
 78 | 
 79 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxRangeStartW(const wchar_t *message) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 80 |     HOOK_TRACE_PROFILE("nvtxRangeStartW");
 81 |     using func_ptr = nvtxRangeId_t (*)(const wchar_t *);
 82 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangeStartW"));  // function-local static, so the lookup runs only on the first call
 83 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 84 |     return func_entry(message);
 85 | }
 86 | 
 87 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 88 |     HOOK_TRACE_PROFILE("nvtxDomainRangeEnd");
 89 |     using func_ptr = void (*)(nvtxDomainHandle_t, nvtxRangeId_t);
 90 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainRangeEnd"));  // function-local static, so the lookup runs only on the first call
 91 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
 92 |     return func_entry(domain, id);  // func_ptr returns void; returning a void expression is valid C++
 93 | }
 94 | 
 95 | HOOK_C_API HOOK_DECL_EXPORT void nvtxRangeEnd(nvtxRangeId_t id) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
 96 |     HOOK_TRACE_PROFILE("nvtxRangeEnd");
 97 |     using func_ptr = void (*)(nvtxRangeId_t);
 98 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangeEnd"));  // function-local static, so the lookup runs only on the first call
 99 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
100 |     return func_entry(id);  // func_ptr returns void; returning a void expression is valid C++
101 | }
102 | 
103 | HOOK_C_API HOOK_DECL_EXPORT int nvtxDomainRangePushEx(nvtxDomainHandle_t domain,
104 |                                                       const nvtxEventAttributes_t *eventAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
105 |     HOOK_TRACE_PROFILE("nvtxDomainRangePushEx");
106 |     using func_ptr = int (*)(nvtxDomainHandle_t, const nvtxEventAttributes_t *);
107 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainRangePushEx"));  // function-local static, so the lookup runs only on the first call
108 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
109 |     return func_entry(domain, eventAttrib);
110 | }
111 | 
112 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePushEx(const nvtxEventAttributes_t *eventAttrib) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
113 |     HOOK_TRACE_PROFILE("nvtxRangePushEx");
114 |     using func_ptr = int (*)(const nvtxEventAttributes_t *);
115 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangePushEx"));  // function-local static, so the lookup runs only on the first call
116 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
117 |     return func_entry(eventAttrib);
118 | }
119 | 
120 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePushA(const char *message) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
121 |     HOOK_TRACE_PROFILE("nvtxRangePushA");
122 |     using func_ptr = int (*)(const char *);
123 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangePushA"));  // function-local static, so the lookup runs only on the first call
124 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
125 |     return func_entry(message);
126 | }
127 | 
128 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePushW(const wchar_t *message) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
129 |     HOOK_TRACE_PROFILE("nvtxRangePushW");
130 |     using func_ptr = int (*)(const wchar_t *);
131 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangePushW"));  // function-local static, so the lookup runs only on the first call
132 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
133 |     return func_entry(message);
134 | }
135 | 
136 | HOOK_C_API HOOK_DECL_EXPORT int nvtxDomainRangePop(nvtxDomainHandle_t domain) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
137 |     HOOK_TRACE_PROFILE("nvtxDomainRangePop");
138 |     using func_ptr = int (*)(nvtxDomainHandle_t);
139 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainRangePop"));  // function-local static, so the lookup runs only on the first call
140 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
141 |     return func_entry(domain);
142 | }
143 | 
144 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePop() {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
145 |     HOOK_TRACE_PROFILE("nvtxRangePop");
146 |     using func_ptr = int (*)();  // in C++ an empty parameter list means "takes no arguments"
147 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxRangePop"));  // function-local static, so the lookup runs only on the first call
148 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
149 |     return func_entry();
150 | }
151 | 
152 | HOOK_C_API HOOK_DECL_EXPORT nvtxResourceHandle_t nvtxDomainResourceCreate(nvtxDomainHandle_t domain,
153 |                                                                           nvtxResourceAttributes_t *attribs) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
154 |     HOOK_TRACE_PROFILE("nvtxDomainResourceCreate");
155 |     using func_ptr = nvtxResourceHandle_t (*)(nvtxDomainHandle_t, nvtxResourceAttributes_t *);
156 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainResourceCreate"));  // function-local static, so the lookup runs only on the first call
157 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
158 |     return func_entry(domain, attribs);
159 | }
160 | 
161 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainResourceDestroy(nvtxResourceHandle_t resource) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
162 |     HOOK_TRACE_PROFILE("nvtxDomainResourceDestroy");
163 |     using func_ptr = void (*)(nvtxResourceHandle_t);
164 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainResourceDestroy"));  // function-local static, so the lookup runs only on the first call
165 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
166 |     return func_entry(resource);  // func_ptr returns void; returning a void expression is valid C++
167 | }
168 | 
169 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category,
170 |                                                          const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
171 |     HOOK_TRACE_PROFILE("nvtxDomainNameCategoryA");
172 |     using func_ptr = void (*)(nvtxDomainHandle_t, uint32_t, const char *);
173 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainNameCategoryA"));  // function-local static, so the lookup runs only on the first call
174 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
175 |     return func_entry(domain, category, name);  // func_ptr returns void; returning a void expression is valid C++
176 | }
177 | 
178 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category,
179 |                                                          const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
180 |     HOOK_TRACE_PROFILE("nvtxDomainNameCategoryW");
181 |     using func_ptr = void (*)(nvtxDomainHandle_t, uint32_t, const wchar_t *);
182 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainNameCategoryW"));  // function-local static, so the lookup runs only on the first call
183 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
184 |     return func_entry(domain, category, name);  // func_ptr returns void; returning a void expression is valid C++
185 | }
186 | 
187 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCategoryA(uint32_t category, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
188 |     HOOK_TRACE_PROFILE("nvtxNameCategoryA");
189 |     using func_ptr = void (*)(uint32_t, const char *);
190 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCategoryA"));  // function-local static, so the lookup runs only on the first call
191 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
192 |     return func_entry(category, name);  // func_ptr returns void; returning a void expression is valid C++
193 | }
194 | 
195 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCategoryW(uint32_t category, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
196 |     HOOK_TRACE_PROFILE("nvtxNameCategoryW");
197 |     using func_ptr = void (*)(uint32_t, const wchar_t *);
198 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCategoryW"));  // function-local static, so the lookup runs only on the first call
199 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
200 |     return func_entry(category, name);  // func_ptr returns void; returning a void expression is valid C++
201 | }
202 | 
203 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameOsThreadA(uint32_t threadId, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
204 |     HOOK_TRACE_PROFILE("nvtxNameOsThreadA");
205 |     using func_ptr = void (*)(uint32_t, const char *);
206 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameOsThreadA"));  // function-local static, so the lookup runs only on the first call
207 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
208 |     return func_entry(threadId, name);  // func_ptr returns void; returning a void expression is valid C++
209 | }
210 | 
211 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameOsThreadW(uint32_t threadId, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
212 |     HOOK_TRACE_PROFILE("nvtxNameOsThreadW");
213 |     using func_ptr = void (*)(uint32_t, const wchar_t *);
214 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameOsThreadW"));  // function-local static, so the lookup runs only on the first call
215 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
216 |     return func_entry(threadId, name);  // func_ptr returns void; returning a void expression is valid C++
217 | }
218 | 
219 | HOOK_C_API HOOK_DECL_EXPORT nvtxStringHandle_t nvtxDomainRegisterStringA(nvtxDomainHandle_t domain,
220 |                                                                          const char *string) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
221 |     HOOK_TRACE_PROFILE("nvtxDomainRegisterStringA");
222 |     using func_ptr = nvtxStringHandle_t (*)(nvtxDomainHandle_t, const char *);
223 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainRegisterStringA"));  // function-local static, so the lookup runs only on the first call
224 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
225 |     return func_entry(domain, string);
226 | }
227 | 
228 | HOOK_C_API HOOK_DECL_EXPORT nvtxStringHandle_t nvtxDomainRegisterStringW(nvtxDomainHandle_t domain,
229 |                                                                          const wchar_t *string) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
230 |     HOOK_TRACE_PROFILE("nvtxDomainRegisterStringW");
231 |     using func_ptr = nvtxStringHandle_t (*)(nvtxDomainHandle_t, const wchar_t *);
232 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainRegisterStringW"));  // function-local static, so the lookup runs only on the first call
233 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
234 |     return func_entry(domain, string);
235 | }
236 | 
237 | HOOK_C_API HOOK_DECL_EXPORT nvtxDomainHandle_t nvtxDomainCreateA(const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
238 |     HOOK_TRACE_PROFILE("nvtxDomainCreateA");
239 |     using func_ptr = nvtxDomainHandle_t (*)(const char *);
240 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainCreateA"));  // function-local static, so the lookup runs only on the first call
241 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
242 |     return func_entry(name);
243 | }
244 | 
245 | HOOK_C_API HOOK_DECL_EXPORT nvtxDomainHandle_t nvtxDomainCreateW(const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
246 |     HOOK_TRACE_PROFILE("nvtxDomainCreateW");
247 |     using func_ptr = nvtxDomainHandle_t (*)(const wchar_t *);
248 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainCreateW"));  // function-local static, so the lookup runs only on the first call
249 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
250 |     return func_entry(name);
251 | }
252 | 
253 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainDestroy(nvtxDomainHandle_t domain) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
254 |     HOOK_TRACE_PROFILE("nvtxDomainDestroy");
255 |     using func_ptr = void (*)(nvtxDomainHandle_t);
256 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainDestroy"));  // function-local static, so the lookup runs only on the first call
257 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
258 |     return func_entry(domain);  // func_ptr returns void; returning a void expression is valid C++
259 | }
260 | 
261 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuDeviceA(CUdevice device, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
262 |     HOOK_TRACE_PROFILE("nvtxNameCuDeviceA");
263 |     using func_ptr = void (*)(CUdevice, const char *);
264 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuDeviceA"));  // function-local static, so the lookup runs only on the first call
265 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
266 |     return func_entry(device, name);  // func_ptr returns void; returning a void expression is valid C++
267 | }
268 | 
269 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuDeviceW(CUdevice device, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
270 |     HOOK_TRACE_PROFILE("nvtxNameCuDeviceW");
271 |     using func_ptr = void (*)(CUdevice, const wchar_t *);
272 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuDeviceW"));  // function-local static, so the lookup runs only on the first call
273 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
274 |     return func_entry(device, name);  // func_ptr returns void; returning a void expression is valid C++
275 | }
276 | 
277 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuContextA(CUcontext context, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
278 |     HOOK_TRACE_PROFILE("nvtxNameCuContextA");
279 |     using func_ptr = void (*)(CUcontext, const char *);
280 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuContextA"));  // function-local static, so the lookup runs only on the first call
281 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
282 |     return func_entry(context, name);  // func_ptr returns void; returning a void expression is valid C++
283 | }
284 | 
285 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuContextW(CUcontext context, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
286 |     HOOK_TRACE_PROFILE("nvtxNameCuContextW");
287 |     using func_ptr = void (*)(CUcontext, const wchar_t *);
288 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuContextW"));  // function-local static, so the lookup runs only on the first call
289 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
290 |     return func_entry(context, name);  // func_ptr returns void; returning a void expression is valid C++
291 | }
292 | 
293 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuStreamA(CUstream stream, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
294 |     HOOK_TRACE_PROFILE("nvtxNameCuStreamA");
295 |     using func_ptr = void (*)(CUstream, const char *);
296 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuStreamA"));  // function-local static, so the lookup runs only on the first call
297 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
298 |     return func_entry(stream, name);  // func_ptr returns void; returning a void expression is valid C++
299 | }
300 | 
301 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuStreamW(CUstream stream, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
302 |     HOOK_TRACE_PROFILE("nvtxNameCuStreamW");
303 |     using func_ptr = void (*)(CUstream, const wchar_t *);
304 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuStreamW"));  // function-local static, so the lookup runs only on the first call
305 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
306 |     return func_entry(stream, name);  // func_ptr returns void; returning a void expression is valid C++
307 | }
308 | 
309 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuEventA(CUevent event, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
310 |     HOOK_TRACE_PROFILE("nvtxNameCuEventA");
311 |     using func_ptr = void (*)(CUevent, const char *);
312 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuEventA"));  // function-local static, so the lookup runs only on the first call
313 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
314 |     return func_entry(event, name);  // func_ptr returns void; returning a void expression is valid C++
315 | }
316 | 
317 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuEventW(CUevent event, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
318 |     HOOK_TRACE_PROFILE("nvtxNameCuEventW");
319 |     using func_ptr = void (*)(CUevent, const wchar_t *);
320 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCuEventW"));  // function-local static, so the lookup runs only on the first call
321 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
322 |     return func_entry(event, name);  // func_ptr returns void; returning a void expression is valid C++
323 | }
324 | 
325 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaDeviceA(int device, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
326 |     HOOK_TRACE_PROFILE("nvtxNameCudaDeviceA");
327 |     using func_ptr = void (*)(int, const char *);
328 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCudaDeviceA"));  // function-local static, so the lookup runs only on the first call
329 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
330 |     return func_entry(device, name);  // func_ptr returns void; returning a void expression is valid C++
331 | }
332 | 
333 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaDeviceW(int device, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
334 |     HOOK_TRACE_PROFILE("nvtxNameCudaDeviceW");
335 |     using func_ptr = void (*)(int, const wchar_t *);
336 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCudaDeviceW"));  // function-local static, so the lookup runs only on the first call
337 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
338 |     return func_entry(device, name);  // func_ptr returns void; returning a void expression is valid C++
339 | }
340 | 
341 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaStreamA(cudaStream_t stream, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
342 |     HOOK_TRACE_PROFILE("nvtxNameCudaStreamA");
343 |     using func_ptr = void (*)(cudaStream_t, const char *);
344 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCudaStreamA"));  // function-local static, so the lookup runs only on the first call
345 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
346 |     return func_entry(stream, name);  // func_ptr returns void; returning a void expression is valid C++
347 | }
348 | 
349 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
350 |     HOOK_TRACE_PROFILE("nvtxNameCudaStreamW");
351 |     using func_ptr = void (*)(cudaStream_t, const wchar_t *);
352 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCudaStreamW"));  // function-local static, so the lookup runs only on the first call
353 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
354 |     return func_entry(stream, name);  // func_ptr returns void; returning a void expression is valid C++
355 | }
356 | 
357 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaEventA(cudaEvent_t event, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
358 |     HOOK_TRACE_PROFILE("nvtxNameCudaEventA");
359 |     using func_ptr = void (*)(cudaEvent_t, const char *);
360 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCudaEventA"));  // function-local static, so the lookup runs only on the first call
361 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
362 |     return func_entry(event, name);  // func_ptr returns void; returning a void expression is valid C++
363 | }
364 | 
365 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaEventW(cudaEvent_t event, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
366 |     HOOK_TRACE_PROFILE("nvtxNameCudaEventW");
367 |     using func_ptr = void (*)(cudaEvent_t, const wchar_t *);
368 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameCudaEventW"));  // function-local static, so the lookup runs only on the first call
369 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
370 |     return func_entry(event, name);  // func_ptr returns void; returning a void expression is valid C++
371 | }
372 | 
373 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClDeviceA(cl_device_id device, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
374 |     HOOK_TRACE_PROFILE("nvtxNameClDeviceA");
375 |     using func_ptr = void (*)(cl_device_id, const char *);
376 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClDeviceA"));  // function-local static, so the lookup runs only on the first call
377 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
378 |     return func_entry(device, name);  // func_ptr returns void; returning a void expression is valid C++
379 | }
380 | 
381 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClDeviceW(cl_device_id device, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
382 |     HOOK_TRACE_PROFILE("nvtxNameClDeviceW");
383 |     using func_ptr = void (*)(cl_device_id, const wchar_t *);
384 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClDeviceW"));  // function-local static, so the lookup runs only on the first call
385 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
386 |     return func_entry(device, name);  // func_ptr returns void; returning a void expression is valid C++
387 | }
388 | 
389 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClContextA(cl_context context, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
390 |     HOOK_TRACE_PROFILE("nvtxNameClContextA");
391 |     using func_ptr = void (*)(cl_context, const char *);
392 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClContextA"));  // function-local static, so the lookup runs only on the first call
393 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
394 |     return func_entry(context, name);  // func_ptr returns void; returning a void expression is valid C++
395 | }
396 | 
397 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClContextW(cl_context context, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
398 |     HOOK_TRACE_PROFILE("nvtxNameClContextW");
399 |     using func_ptr = void (*)(cl_context, const wchar_t *);
400 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClContextW"));  // function-local static, so the lookup runs only on the first call
401 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
402 |     return func_entry(context, name);  // func_ptr returns void; returning a void expression is valid C++
403 | }
404 | 
405 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClCommandQueueA(cl_command_queue command_queue, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
406 |     HOOK_TRACE_PROFILE("nvtxNameClCommandQueueA");
407 |     using func_ptr = void (*)(cl_command_queue, const char *);
408 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClCommandQueueA"));  // function-local static, so the lookup runs only on the first call
409 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
410 |     return func_entry(command_queue, name);  // func_ptr returns void; returning a void expression is valid C++
411 | }
412 | 
413 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
414 |     HOOK_TRACE_PROFILE("nvtxNameClCommandQueueW");
415 |     using func_ptr = void (*)(cl_command_queue, const wchar_t *);
416 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClCommandQueueW"));  // function-local static, so the lookup runs only on the first call
417 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
418 |     return func_entry(command_queue, name);  // func_ptr returns void; returning a void expression is valid C++
419 | }
420 | 
421 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClMemObjectA(cl_mem memobj, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
422 |     HOOK_TRACE_PROFILE("nvtxNameClMemObjectA");
423 |     using func_ptr = void (*)(cl_mem, const char *);
424 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClMemObjectA"));  // function-local static, so the lookup runs only on the first call
425 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
426 |     return func_entry(memobj, name);  // func_ptr returns void; returning a void expression is valid C++
427 | }
428 | 
429 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClMemObjectW(cl_mem memobj, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
430 |     HOOK_TRACE_PROFILE("nvtxNameClMemObjectW");
431 |     using func_ptr = void (*)(cl_mem, const wchar_t *);
432 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClMemObjectW"));  // function-local static, so the lookup runs only on the first call
433 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
434 |     return func_entry(memobj, name);  // func_ptr returns void; returning a void expression is valid C++
435 | }
436 | 
437 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClSamplerA(cl_sampler sampler, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
438 |     HOOK_TRACE_PROFILE("nvtxNameClSamplerA");
439 |     using func_ptr = void (*)(cl_sampler, const char *);
440 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClSamplerA"));  // function-local static, so the lookup runs only on the first call
441 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
442 |     return func_entry(sampler, name);  // func_ptr returns void; returning a void expression is valid C++
443 | }
444 | 
445 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClSamplerW(cl_sampler sampler, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
446 |     HOOK_TRACE_PROFILE("nvtxNameClSamplerW");
447 |     using func_ptr = void (*)(cl_sampler, const wchar_t *);
448 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClSamplerW"));  // function-local static, so the lookup runs only on the first call
449 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
450 |     return func_entry(sampler, name);  // func_ptr returns void; returning a void expression is valid C++
451 | }
452 | 
453 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClProgramA(cl_program program, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
454 |     HOOK_TRACE_PROFILE("nvtxNameClProgramA");
455 |     using func_ptr = void (*)(cl_program, const char *);
456 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClProgramA"));  // function-local static, so the lookup runs only on the first call
457 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
458 |     return func_entry(program, name);  // func_ptr returns void; returning a void expression is valid C++
459 | }
460 | 
461 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClProgramW(cl_program program, const wchar_t *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
462 |     HOOK_TRACE_PROFILE("nvtxNameClProgramW");
463 |     using func_ptr = void (*)(cl_program, const wchar_t *);
464 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClProgramW"));  // function-local static, so the lookup runs only on the first call
465 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
466 |     return func_entry(program, name);  // func_ptr returns void; returning a void expression is valid C++
467 | }
468 | 
469 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClEventA(cl_event evnt, const char *name) {  // Pass-through wrapper: hands the call to the pointer obtained via HOOK_NVTX_SYMBOL.
470 |     HOOK_TRACE_PROFILE("nvtxNameClEventA");
471 |     using func_ptr = void (*)(cl_event, const char *);
472 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClEventA"));  // function-local static, so the lookup runs only on the first call
473 |     HOOK_CHECK(func_entry);  // presumably guards against a failed lookup (macro defined elsewhere) - confirm
474 |     return func_entry(evnt, name);  // func_ptr returns void; returning a void expression is valid C++
475 | }
476 | 
477 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClEventW(cl_event evnt, const wchar_t *name) {
478 |     HOOK_TRACE_PROFILE("nvtxNameClEventW");
479 |     using func_ptr = void (*)(cl_event, const wchar_t *);
480 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxNameClEventW"));
481 |     HOOK_CHECK(func_entry);
482 |     return func_entry(evnt, name);
483 | }
484 | 
485 | HOOK_C_API HOOK_DECL_EXPORT nvtxSyncUser_t nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain,
486 |                                                                     const nvtxSyncUserAttributes_t *attribs) {
487 |     HOOK_TRACE_PROFILE("nvtxDomainSyncUserCreate");
488 |     using func_ptr = nvtxSyncUser_t (*)(nvtxDomainHandle_t, const nvtxSyncUserAttributes_t *);
489 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserCreate"));
490 |     HOOK_CHECK(func_entry);
491 |     return func_entry(domain, attribs);
492 | }
493 | 
494 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) {
495 |     HOOK_TRACE_PROFILE("nvtxDomainSyncUserDestroy");
496 |     using func_ptr = void (*)(nvtxSyncUser_t);
497 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserDestroy"));
498 |     HOOK_CHECK(func_entry);
499 |     return func_entry(handle);
500 | }
501 | 
502 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) {
503 |     HOOK_TRACE_PROFILE("nvtxDomainSyncUserAcquireStart");
504 |     using func_ptr = void (*)(nvtxSyncUser_t);
505 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserAcquireStart"));
506 |     HOOK_CHECK(func_entry);
507 |     return func_entry(handle);
508 | }
509 | 
510 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) {
511 |     HOOK_TRACE_PROFILE("nvtxDomainSyncUserAcquireFailed");
512 |     using func_ptr = void (*)(nvtxSyncUser_t);
513 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserAcquireFailed"));
514 |     HOOK_CHECK(func_entry);
515 |     return func_entry(handle);
516 | }
517 | 
518 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) {
519 |     HOOK_TRACE_PROFILE("nvtxDomainSyncUserAcquireSuccess");
520 |     using func_ptr = void (*)(nvtxSyncUser_t);
521 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserAcquireSuccess"));
522 |     HOOK_CHECK(func_entry);
523 |     return func_entry(handle);
524 | }
525 | 
526 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) {
527 |     HOOK_TRACE_PROFILE("nvtxDomainSyncUserReleasing");
528 |     using func_ptr = void (*)(nvtxSyncUser_t);
529 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserReleasing"));
530 |     HOOK_CHECK(func_entry);
531 |     return func_entry(handle);
532 | }
533 | 


--------------------------------------------------------------------------------
/tools/code_generate/code_generate.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2022. All Rights Reserved.
  2 | # Author: Bruce-Lee-LY
  3 | # Date: 23:47:56 on Sat, May 28, 2022
  4 | #
  5 | # Description: code generate for cuda-related dynamic libraries
  6 | 
  7 | #!/usr/bin/python3
  8 | # coding=utf-8
  9 | 
 10 | from __future__ import print_function
 11 | from __future__ import division
 12 | from __future__ import absolute_import
 13 | from __future__ import with_statement
 14 | 
 15 | import os
 16 | import optparse
 17 | from CppHeaderParser import CppHeader
 18 | 
 19 | 
 20 | class CodeGenerate():
 21 |     def __init__(self, type_, file_, output_):
 22 |         self.type = type_
 23 |         self.file = file_
 24 |         self.output = output_
 25 | 
 26 |         self.func_list = []
 27 | 
 28 |         self.hook_file = self.output + "/" + self.type + "_hook.cpp"
 29 |         self.hook_list = []
 30 |         self.hook_include = """
 31 | // auto generate $hook_num$ apis
 32 | 
 33 | #include "$type$_subset.h"
 34 | #include "hook.h"
 35 | #include "macro_common.h"
 36 | #include "trace_profile.h"
 37 | """
 38 |         self.hook_template = """
 39 | HOOK_C_API HOOK_DECL_EXPORT $ret$ $func_name$($func_param$) {
 40 |     HOOK_TRACE_PROFILE("$func_name$");
 41 |     using func_ptr = $ret$ (*)($param_type$);
 42 |     static auto func_entry = reinterpret_cast<func_ptr>(HOOK_$type$_SYMBOL("$func_name$"));
 43 |     HOOK_CHECK(func_entry);
 44 |     return func_entry($param_name$);
 45 | }
 46 | """
 47 | 
 48 |     def parsę_header(self):
 49 |         self.header = CppHeader(self.file)
 50 |         print(
 51 |             "{} total func num: {}".format(
 52 |                 self.type, len(
 53 |                     self.header.functions)))
 54 | 
 55 |     def generate_func(self):
 56 |         for func in self.header.functions:
 57 |             func_name = func["name"]
 58 |             if func_name in self.func_list:
 59 |                 continue
 60 |             else:
 61 |                 self.func_list.append(func_name)
 62 | 
 63 |             ret = func["rtnType"].replace(
 64 |                 "CUDAAPI", "").replace(
 65 |                 "__CUDA_DEPRECATED", "").replace(
 66 |                 "DECLDIR", "").replace(
 67 |                 "CUDARTAPI_CDECL", "").replace(
 68 |                 "CUDARTAPI", "").replace(
 69 |                 "__host__", "").replace(
 70 |                 "__cudart_builtin__", "").replace(
 71 |                 "CUDNNWINAPI", "").replace(
 72 |                 "CUBLASWINAPI", "").replace(
 73 |                 "CUBLASAPI", "").replace(
 74 |                 "CUFFTAPI", "").replace(
 75 |                 "NVTX_DECLSPEC", "").replace(
 76 |                 "NVTX_API", "").replace(
 77 |                 "CURANDAPI", "").replace(
 78 |                 "CUSPARSEAPI", "").replace(
 79 |                 "CUSOLVERAPI", "").replace(
 80 |                 "NVJPEGAPI", "").strip(' ')
 81 | 
 82 |             func_param = ""
 83 |             param_type = ""
 84 |             param_name = ""
 85 |             for param in func["parameters"]:
 86 |                 if len(func_param) > 0:
 87 |                     func_param += ", "
 88 |                     param_type += ", "
 89 |                     param_name += ", "
 90 |                 if param["array"] == 1:
 91 |                     param["type"] += "*"
 92 |                 func_param += (param["type"] + " " + param["name"])
 93 |                 param_type += param["type"]
 94 |                 param_name += param["name"]
 95 | 
 96 |             hook_func = self.hook_template
 97 |             self.hook_list.append(
 98 |                 hook_func.replace(
 99 |                     "$ret$",
100 |                     ret).replace(
101 |                     "$func_name$",
102 |                     func_name).replace(
103 |                     "$func_param$",
104 |                     func_param).replace(
105 |                     "$param_type$",
106 |                     param_type).replace(
107 |                         "$param_name$",
108 |                         param_name).replace(
109 |                             "$type$",
110 |                     self.type.upper()))
111 |         print("{} valid func num: {}".format(self.type, len(self.func_list)))
112 | 
113 |     def save_output(self):
114 |         if not os.path.exists(self.output):
115 |             os.makedirs(self.output)
116 | 
117 |         with open(self.hook_file, 'w') as fh:
118 |             hook_include = self.hook_include.replace("$hook_num$", str(
119 |                 len(self.hook_list))).replace("$type$", self.type)
120 |             fh.write(hook_include)
121 |             for hook in self.hook_list:
122 |                 fh.write(hook)
123 | 
124 | 
125 | def main():
126 |     usage = "python3 code_generate.py -t/--type cuda -f/--file include/cuda.h -o/--output output"
127 |     parser = optparse.OptionParser(usage)
128 |     parser.add_option(
129 |         '-t',
130 |         '--type',
131 |         dest='type',
132 |         type='string',
133 |         help='header type',
134 |         default='cuda')
135 |     parser.add_option(
136 |         '-f',
137 |         '--file',
138 |         dest='file',
139 |         type='string',
140 |         help='header file',
141 |         default='include/cuda.h')
142 |     parser.add_option(
143 |         '-o',
144 |         '--output',
145 |         dest='output',
146 |         type='string',
147 |         help='output path',
148 |         default='output')
149 | 
150 |     options, args = parser.parse_args()
151 |     type_ = options.type
152 |     file_ = options.file
153 |     output_ = options.output
154 | 
155 |     code_gen = CodeGenerate(type_, file_, output_)
156 |     code_gen.parsę_header()
157 |     code_gen.generate_func()
158 |     code_gen.save_output()
159 | 
160 | 
161 | if __name__ == '__main__':
162 |     main()
163 | 


--------------------------------------------------------------------------------
/tools/code_generate/code_generate.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2022. All Rights Reserved.
 2 | # Author: Bruce-Lee-LY
 3 | # Date: 23:56:07 on Sat, May 28, 2022
 4 | #
 5 | # Description: code generate script
 6 | 
 7 | #!/bin/bash
 8 | 
 9 | set -euo pipefail
10 | 
11 | WORK_PATH=$(cd $(dirname $0) && pwd) && cd $WORK_PATH
12 | 
13 | # python3 -m CppHeaderParser.tojson include/cuda.h > output/cuda.json
14 | python3 code_generate.py -t cuda -f include/cuda.h -o output
15 | # mkdir -p ../../src/cuda
16 | # cp output/cuda_hook.cpp ../../src/cuda
17 | 
18 | # python3 -m CppHeaderParser.tojson include/nvml.h > output/nvml.json
19 | python3 code_generate.py -t nvml -f include/nvml.h -o output
20 | # mkdir -p ../../src/nvml
21 | # cp output/nvml_hook.cpp ../../src/nvml
22 | 
23 | # python3 -m CppHeaderParser.tojson include/cuda_runtime_api.h > output/cuda_runtime_api.json
24 | python3 code_generate.py -t cudart -f include/cuda_runtime_api.h -o output
25 | # mkdir -p ../../src/cudart
26 | # cp output/cudart_hook.cpp ../../src/cudart
27 | 
28 | # python3 -m CppHeaderParser.tojson include/cudnn.h > output/cudnn.json
29 | python3 code_generate.py -t cudnn -f include/cudnn.h -o output
30 | # mkdir -p ../../src/cudnn
31 | # cp output/cudnn_hook.cpp ../../src/cudnn
32 | 
33 | # python3 -m CppHeaderParser.tojson include/cublas.h > output/cublas.json
34 | python3 code_generate.py -t cublas -f include/cublas.h -o output
35 | # mkdir -p ../../src/cublas
36 | # cp output/cublas_hook.cpp ../../src/cublas
37 | 
38 | # python3 -m CppHeaderParser.tojson include/cublasLt.h > output/cublasLt.json
39 | python3 code_generate.py -t cublasLt -f include/cublasLt.h -o output
40 | # mkdir -p ../../src/cublasLt
41 | # cp output/cublasLt_hook.cpp ../../src/cublasLt
42 | 
43 | # python3 -m CppHeaderParser.tojson include/cufft.h > output/cufft.json
44 | python3 code_generate.py -t cufft -f include/cufft.h -o output
45 | # mkdir -p ../../src/cufft
46 | # cp output/cufft_hook.cpp ../../src/cufft
47 | 
48 | # python3 -m CppHeaderParser.tojson include/nvToolsExt.h > output/nvToolsExt.json
49 | python3 code_generate.py -t nvtx -f include/nvToolsExt.h -o output
50 | # mkdir -p ../../src/nvtx
51 | # cp output/nvtx_hook.cpp ../../src/nvtx
52 | 
53 | # python3 -m CppHeaderParser.tojson include/nvrtc.h > output/nvrtc.json
54 | python3 code_generate.py -t nvrtc -f include/nvrtc.h -o output
55 | # mkdir -p ../../src/nvrtc
56 | # cp output/nvrtc_hook.cpp ../../src/nvrtc
57 | 
58 | # python3 -m CppHeaderParser.tojson include/curand.h > output/curand.json
59 | python3 code_generate.py -t curand -f include/curand.h -o output
60 | # mkdir -p ../../src/curand
61 | # cp output/curand_hook.cpp ../../src/curand
62 | 
63 | # python3 -m CppHeaderParser.tojson include/cusparse.h > output/cusparse.json
64 | python3 code_generate.py -t cusparse -f include/cusparse.h -o output
65 | # mkdir -p ../../src/cusparse
66 | # cp output/cusparse_hook.cpp ../../src/cusparse
67 | 
68 | # python3 -m CppHeaderParser.tojson include/cusolver_common.h > output/cusolver_common.json
69 | python3 code_generate.py -t cusolver -f include/cusolver_common.h -o output
70 | # mkdir -p ../../src/cusolver
71 | # cp output/cusolver_hook.cpp ../../src/cusolver
72 | 
73 | # python3 -m CppHeaderParser.tojson include/nvjpeg.h > output/nvjpeg.json
74 | python3 code_generate.py -t nvjpeg -f include/nvjpeg.h -o output
75 | # mkdir -p ../../src/nvjpeg
76 | # cp output/nvjpeg_hook.cpp ../../src/nvjpeg
77 | 
78 | # python3 -m CppHeaderParser.tojson include/nvblas.h > output/nvblas.json
79 | python3 code_generate.py -t nvblas -f include/nvblas.h -o output
80 | # mkdir -p ../../src/nvblas
81 | # cp output/nvblas_hook.cpp ../../src/nvblas
82 | 


--------------------------------------------------------------------------------
/tools/code_generate/include/cufft.h:
--------------------------------------------------------------------------------
  1 |  /* Copyright 2005-2021 NVIDIA Corporation.  All rights reserved.
  2 |   *
  3 |   * NOTICE TO LICENSEE:
  4 |   *
  5 |   * The source code and/or documentation ("Licensed Deliverables") are
  6 |   * subject to NVIDIA intellectual property rights under U.S. and
  7 |   * international Copyright laws.
  8 |   *
  9 |   * The Licensed Deliverables contained herein are PROPRIETARY and
 10 |   * CONFIDENTIAL to NVIDIA and are being provided under the terms and
 11 |   * conditions of a form of NVIDIA software license agreement by and
 12 |   * between NVIDIA and Licensee ("License Agreement") or electronically
 13 |   * accepted by Licensee.  Notwithstanding any terms or conditions to
 14 |   * the contrary in the License Agreement, reproduction or disclosure
 15 |   * of the Licensed Deliverables to any third party without the express
 16 |   * written consent of NVIDIA is prohibited.
 17 |   *
 18 |   * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 19 |   * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 20 |   * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
 21 |   * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 22 |   * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 23 |   * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 24 |   * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 25 |   * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 26 |   * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 27 |   * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 28 |   * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 29 |   * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 30 |   * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 31 |   * OF THESE LICENSED DELIVERABLES.
 32 |   *
 33 |   * U.S. Government End Users.  These Licensed Deliverables are a
 34 |   * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 35 |   * 1995), consisting of "commercial computer software" and "commercial
 36 |   * computer software documentation" as such terms are used in 48
 37 |   * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 38 |   * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
 39 |   * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 40 |   * U.S. Government End Users acquire the Licensed Deliverables with
 41 |   * only those rights set forth herein.
 42 |   *
 43 |   * Any use of the Licensed Deliverables in individual and commercial
 44 |   * software must include, in the user documentation and internal
 45 |   * comments to the code, the above Disclaimer and U.S. Government End
 46 |   * Users Notice.
 47 |   */
 48 | 
// The following modification has been made to this copy of the header:
// (1) the contents of cufftXt.h have been appended at the bottom
 51 | 
 52 | /*!
 53 | * \file cufft.h
 54 | * \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
 55 | */
 56 | 
 57 | #ifndef _CUFFT_H_
 58 | #define _CUFFT_H_
 59 | 
 60 | 
 61 | #include "cuComplex.h"
 62 | #include "driver_types.h"
 63 | #include "library_types.h"
 64 | 
 65 | #ifndef CUFFTAPI
 66 | #ifdef _WIN32
 67 | #define CUFFTAPI __stdcall
 68 | #elif __GNUC__ >= 4
 69 | #define CUFFTAPI __attribute__ ((visibility ("default")))
 70 | #else
 71 | #define CUFFTAPI
 72 | #endif
 73 | #endif
 74 | 
 75 | #ifdef __cplusplus
 76 | extern "C" {
 77 | #endif
 78 | 
 79 | #define CUFFT_VER_MAJOR 10
 80 | #define CUFFT_VER_MINOR 5
 81 | #define CUFFT_VER_PATCH 2
 82 | #define CUFFT_VER_BUILD 100
 83 | 
 84 | // cuFFT library version
 85 | //
 86 | // CUFFT_VERSION / 1000 - major version
 87 | // CUFFT_VERSION / 100 % 100 - minor version
 88 | // CUFFT_VERSION % 100 - patch level
 89 | #define CUFFT_VERSION 10502
 90 | 
 91 | // CUFFT API function return values
 92 | typedef enum cufftResult_t {
 93 |   CUFFT_SUCCESS        = 0x0,
 94 |   CUFFT_INVALID_PLAN   = 0x1,
 95 |   CUFFT_ALLOC_FAILED   = 0x2,
 96 |   CUFFT_INVALID_TYPE   = 0x3,
 97 |   CUFFT_INVALID_VALUE  = 0x4,
 98 |   CUFFT_INTERNAL_ERROR = 0x5,
 99 |   CUFFT_EXEC_FAILED    = 0x6,
100 |   CUFFT_SETUP_FAILED   = 0x7,
101 |   CUFFT_INVALID_SIZE   = 0x8,
102 |   CUFFT_UNALIGNED_DATA = 0x9,
103 |   CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
104 |   CUFFT_INVALID_DEVICE = 0xB,
105 |   CUFFT_PARSE_ERROR = 0xC,
106 |   CUFFT_NO_WORKSPACE = 0xD,
107 |   CUFFT_NOT_IMPLEMENTED = 0xE,
108 |   CUFFT_LICENSE_ERROR = 0x0F,
109 |   CUFFT_NOT_SUPPORTED = 0x10
110 | 
111 | } cufftResult;
112 | 
113 | #define MAX_CUFFT_ERROR 0x11
114 | 
115 | 
116 | // CUFFT defines and supports the following data types
117 | 
118 | 
119 | // cufftReal is a single-precision, floating-point real data type.
120 | // cufftDoubleReal is a double-precision, real data type.
121 | typedef float cufftReal;
122 | typedef double cufftDoubleReal;
123 | 
124 | // cufftComplex is a single-precision, floating-point complex data type that
125 | // consists of interleaved real and imaginary components.
126 | // cufftDoubleComplex is the double-precision equivalent.
127 | typedef cuComplex cufftComplex;
128 | typedef cuDoubleComplex cufftDoubleComplex;
129 | 
130 | // CUFFT transform directions
131 | #define CUFFT_FORWARD -1 // Forward FFT
132 | #define CUFFT_INVERSE  1 // Inverse FFT
133 | 
// Transform types supported by CUFFT (the `cufftType type` argument of the
// plan and size functions declared in this header).
typedef enum cufftType_t {
  CUFFT_R2C = 0x2A, /* Real to Complex (interleaved) */
  CUFFT_C2R = 0x2C, /* Complex (interleaved) to Real */
  CUFFT_C2C = 0x29, /* Complex to Complex, interleaved */
  CUFFT_D2Z = 0x6A, /* Double to Double-Complex */
  CUFFT_Z2D = 0x6C, /* Double-Complex to Double */
  CUFFT_Z2Z = 0x69  /* Double-Complex to Double-Complex */
} cufftType;
143 | 
// Data layouts supported by CUFFT; only one value is defined.
typedef enum cufftCompatibility_t {
  CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 /* The default value */
} cufftCompatibility;
148 | 
149 | #define CUFFT_COMPATIBILITY_DEFAULT   CUFFT_COMPATIBILITY_FFTW_PADDING
150 | 
151 | //
152 | // structure definition used by the shim between old and new APIs
153 | //
154 | #define MAX_SHIM_RANK 3
155 | 
156 | // cufftHandle is a handle type used to store and access CUFFT plans.
157 | typedef int cufftHandle;
158 | 
159 | 
160 | cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan,
161 |                                  int nx,
162 |                                  cufftType type,
163 |                                  int batch);
164 | 
165 | cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
166 |                                  int nx, int ny,
167 |                                  cufftType type);
168 | 
169 | cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
170 |                                  int nx, int ny, int nz,
171 |                                  cufftType type);
172 | 
173 | cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
174 |                                    int rank,
175 |                                    int *n,
176 |                                    int *inembed, int istride, int idist,
177 |                                    int *onembed, int ostride, int odist,
178 |                                    cufftType type,
179 |                                    int batch);
180 | 
181 | cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan,
182 |                                      int nx,
183 |                                      cufftType type,
184 |                                      int batch,
185 |                                      size_t *workSize);
186 | 
187 | cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
188 |                                      int nx, int ny,
189 |                                      cufftType type,
190 |                                      size_t *workSize);
191 | 
192 | cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
193 |                                      int nx, int ny, int nz,
194 |                                      cufftType type,
195 |                                      size_t *workSize);
196 | 
197 | cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
198 |                                        int rank,
199 |                                        int *n,
200 |                                        int *inembed, int istride, int idist,
201 |                                        int *onembed, int ostride, int odist,
202 |                                        cufftType type,
203 |                                        int batch,
204 |                                        size_t *workSize);
205 | 
206 | cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
207 |                                          int rank,
208 |                                          long long int *n,
209 |                                          long long int *inembed,
210 |                                          long long int istride,
211 |                                          long long int idist,
212 |                                          long long int *onembed,
213 |                                          long long int ostride, long long int odist,
214 |                                          cufftType type,
215 |                                          long long int batch,
216 |                                          size_t * workSize);
217 | 
218 | cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
219 |                                         int rank,
220 |                                         long long int *n,
221 |                                         long long int *inembed,
222 |                                         long long int istride, long long int idist,
223 |                                         long long int *onembed,
224 |                                         long long int ostride, long long int odist,
225 |                                         cufftType type,
226 |                                         long long int batch,
227 |                                         size_t *workSize);
228 | 
229 | 
230 | 
231 | 
232 | cufftResult CUFFTAPI cufftEstimate1d(int nx,
233 |                                      cufftType type,
234 |                                      int batch,
235 |                                      size_t *workSize);
236 | 
237 | cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
238 |                                      cufftType type,
239 |                                      size_t *workSize);
240 | 
241 | cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
242 |                                      cufftType type,
243 |                                      size_t *workSize);
244 | 
245 | cufftResult CUFFTAPI cufftEstimateMany(int rank,
246 |                                        int *n,
247 |                                        int *inembed, int istride, int idist,
248 |                                        int *onembed, int ostride, int odist,
249 |                                        cufftType type,
250 |                                        int batch,
251 |                                        size_t *workSize);
252 | 
253 | cufftResult CUFFTAPI cufftCreate(cufftHandle * handle);
254 | 
255 | cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
256 |                                     int nx,
257 |                                     cufftType type,
258 |                                     int batch,
259 |                                     size_t *workSize );
260 | 
261 | cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
262 |                                     int nx, int ny,
263 |                                     cufftType type,
264 |                                     size_t *workSize);
265 | 
266 | cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
267 |                                     int nx, int ny, int nz,
268 |                                     cufftType type,
269 |                                     size_t *workSize);
270 | 
271 | cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
272 |                                       int rank, int *n,
273 |                                       int *inembed, int istride, int idist,
274 |                                       int *onembed, int ostride, int odist,
275 |                                       cufftType type, int batch, size_t *workArea);
276 | 
277 | cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
278 | 
279 | cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
280 | 
281 | cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
282 | 
283 | cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
284 |                                   cufftComplex *idata,
285 |                                   cufftComplex *odata,
286 |                                   int direction);
287 | 
288 | cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
289 |                                   cufftReal *idata,
290 |                                   cufftComplex *odata);
291 | 
292 | cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
293 |                                   cufftComplex *idata,
294 |                                   cufftReal *odata);
295 | 
296 | cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
297 |                                   cufftDoubleComplex *idata,
298 |                                   cufftDoubleComplex *odata,
299 |                                   int direction);
300 | 
301 | cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
302 |                                   cufftDoubleReal *idata,
303 |                                   cufftDoubleComplex *odata);
304 | 
305 | cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
306 |                                   cufftDoubleComplex *idata,
307 |                                   cufftDoubleReal *odata);
308 | 
309 | 
310 | // utility functions
311 | cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
312 |                                     cudaStream_t stream);
313 | 
314 | cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
315 | 
316 | cufftResult CUFFTAPI cufftGetVersion(int *version);
317 | 
318 | cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
319 |                                       int *value);
320 | 
321 | // cuda/targets/x86_64-linux/include/cufftXt.h
322 | //
323 | // cufftXtSubFormat identifies the data layout of
324 | // a memory descriptor owned by cufft.
325 | // note that multi GPU cufft does not yet support out-of-place transforms
326 | //
327 | 
typedef enum cufftXtSubFormat_t {
  /* by default input is in linear order across GPUs */
  CUFFT_XT_FORMAT_INPUT             = 0x00,
  /* by default output is in scrambled order depending on transform */
  CUFFT_XT_FORMAT_OUTPUT            = 0x01,
  /* by default inplace is input order, which is linear across GPUs */
  CUFFT_XT_FORMAT_INPLACE           = 0x02,
  /* shuffled output order after execution of the transform */
  CUFFT_XT_FORMAT_INPLACE_SHUFFLED  = 0x03,
  /* shuffled input order prior to execution of 1D transforms */
  CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04,
  CUFFT_FORMAT_UNDEFINED            = 0x05
} cufftXtSubFormat;
336 | 
337 | //
338 | // cufftXtCopyType specifies the type of copy for cufftXtMemcpy
339 | //
typedef enum cufftXtCopyType_t {
  CUFFT_COPY_HOST_TO_DEVICE   = 0x00,
  CUFFT_COPY_DEVICE_TO_HOST   = 0x01,
  CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
  CUFFT_COPY_UNDEFINED        = 0x03
} cufftXtCopyType;
346 | 
347 | //
348 | // cufftXtQueryType specifies the type of query for cufftXtQueryPlan
349 | //
typedef enum cufftXtQueryType_t {
  CUFFT_QUERY_1D_FACTORS = 0x00,
  CUFFT_QUERY_UNDEFINED  = 0x01
} cufftXtQueryType;
354 | 
// Plain record of ten `long long int` members followed by four `int` members.
// NOTE(review): presumably the structure filled in by cufftXtQueryPlan for
// CUFFT_QUERY_1D_FACTORS -- confirm against the cuFFT documentation.
typedef struct cufftXt1dFactors_t {
  /* sizes, counts and lengths */
  long long int size, stringCount, stringLength, substringLength;
  /* factors */
  long long int factor1, factor2;
  /* masks */
  long long int stringMask, substringMask, factor1Mask, factor2Mask;
  /* shifts */
  int stringShift, substringShift, factor1Shift, factor2Shift;
} cufftXt1dFactors;
371 | 
372 | //
373 | // cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
374 | //
typedef enum cufftXtWorkAreaPolicy_t {
  /* maximum reduction */
  CUFFT_WORKAREA_MINIMAL     = 0,
  /* use workSize parameter as limit */
  CUFFT_WORKAREA_USER        = 1,
  /* default - 1x overhead or more, maximum performance */
  CUFFT_WORKAREA_PERFORMANCE = 2
} cufftXtWorkAreaPolicy;
380 | 
381 | // multi-GPU routines
382 | cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs);
383 | 
384 | cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan,
385 |                                  cudaLibXtDesc ** descriptor,
386 |                                  cufftXtSubFormat format);
387 | 
388 | cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan,
389 |                                  void *dstPointer,
390 |                                  void *srcPointer,
391 |                                  cufftXtCopyType type);
392 | 
393 | cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor);
394 | 
395 | cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea);
396 | /* Descriptor-based execution family: each takes a plan handle plus cudaLibXtDesc input/output descriptors and returns a cufftResult. Only the C2C and Z2Z variants take an int direction; R2C, C2R, D2Z and Z2D do not. */
397 | cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan,
398 |                                             cudaLibXtDesc *input,
399 |                                             cudaLibXtDesc *output,
400 |                                             int direction);
401 | 
402 | cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan,
403 |                                             cudaLibXtDesc *input,
404 |                                             cudaLibXtDesc *output);
405 | 
406 | cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan,
407 |                                             cudaLibXtDesc *input,
408 |                                             cudaLibXtDesc *output);
409 | 
410 | cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan,
411 |                                             cudaLibXtDesc *input,
412 |                                             cudaLibXtDesc *output,
413 |                                             int direction);
414 | 
415 | cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan,
416 |                                             cudaLibXtDesc *input,
417 |                                             cudaLibXtDesc *output);
418 | 
419 | cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan,
420 |                                             cudaLibXtDesc *input,
421 |                                             cudaLibXtDesc *output);
422 | 
423 | // Utility functions
424 | /* cufftXtQueryPlan: takes a plan handle, an untyped queryStruct pointer and a cufftXtQueryType; the concrete struct behind the void* presumably depends on queryType -- confirm. */
425 | cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType);
426 | 
427 | 
428 | // callbacks
429 | 
430 | /* Callback kind selector. Values 0x0-0x7 enumerate LD/ST x COMPLEX/REAL x single/DOUBLE; 0x8 is CUFFT_CB_UNDEFINED. LD/ST presumably map to the cufftCallbackLoad*/cufftCallbackStore* typedefs -- confirm. */
431 | typedef enum cufftXtCallbackType_t {
432 |   CUFFT_CB_LD_COMPLEX = 0x0,
433 |   CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
434 |   CUFFT_CB_LD_REAL = 0x2,
435 |   CUFFT_CB_LD_REAL_DOUBLE = 0x3,
436 |   CUFFT_CB_ST_COMPLEX = 0x4,
437 |   CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
438 |   CUFFT_CB_ST_REAL = 0x6,
439 |   CUFFT_CB_ST_REAL_DOUBLE = 0x7,
440 |   CUFFT_CB_UNDEFINED = 0x8
441 | 
442 | } cufftXtCallbackType;
443 | /* Load-callback function-pointer types: each takes (dataIn, offset, callerInfo, sharedPointer) and returns one element by value; the C/Z/R/D variants differ only in return type (cufftComplex, cufftDoubleComplex, cufftReal, cufftDoubleReal). */
444 | typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
445 | typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
446 | typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
447 | typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
448 | /* Store-callback function-pointer types: each takes (dataOut, offset, element, callerInfo, sharedPointer) and returns void; the C/Z/R/D variants differ only in the type of `element` (cufftComplex, cufftDoubleComplex, cufftReal, cufftDoubleReal). */
449 | typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer);
450 | typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer);
451 | typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer);
452 | typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer);
453 | 
454 | /* Callback management for a plan, keyed by cbType: set a callback, clear it, or set its shared size in bytes (size_t). NOTE(review): callback_routine and caller_info are void** -- presumably arrays with one entry per GPU; confirm. */
455 | cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info);
456 | cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType);
457 | cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize);
458 | /* cufftXtMakePlanMany: plan setup with long long sizes/strides/distances and separate cudaDataType values for input, output and execution. n/inembed/onembed are pointers (presumably arrays of length `rank`); workSize is a size_t* (presumably an output) -- confirm both. */
459 | cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan,
460 |                                        int rank,
461 |                                        long long int *n,
462 |                                        long long int *inembed,
463 |                                        long long int istride,
464 |                                        long long int idist,
465 |                                        cudaDataType inputtype,
466 |                                        long long int *onembed,
467 |                                        long long int ostride,
468 |                                        long long int odist,
469 |                                        cudaDataType outputtype,
470 |                                        long long int batch,
471 |                                        size_t *workSize,
472 |                                         cudaDataType executiontype);
473 | /* cufftXtGetSizeMany: parameter list is identical (names, types, order) to cufftXtMakePlanMany. workSize is a size_t* -- presumably receives a work-area size estimate; confirm. */
474 | cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan,
475 |                                       int rank,
476 |                                       long long int *n,
477 |                                       long long int *inembed,
478 |                                       long long int istride,
479 |                                       long long int idist,
480 |                                       cudaDataType inputtype,
481 |                                       long long int *onembed,
482 |                                       long long int ostride,
483 |                                       long long int odist,
484 |                                       cudaDataType outputtype,
485 |                                       long long int batch,
486 |                                       size_t *workSize,
487 |                                       cudaDataType executiontype);
488 | 
489 | /* cufftXtExec: type-agnostic execution entry point taking untyped input/output pointers and an int direction; element types are not expressed in the signature (presumably taken from the plan -- confirm). */
490 | cufftResult CUFFTAPI cufftXtExec(cufftHandle plan,
491 |                                void *input,
492 |                                void *output,
493 |                                int direction);
494 | /* cufftXtExecDescriptor: same shape as cufftXtExec but takes cudaLibXtDesc input/output descriptors instead of void pointers; also takes an int direction. */
495 | cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan,
496 |                                          cudaLibXtDesc *input,
497 |                                          cudaLibXtDesc *output,
498 |                                          int direction);
499 | /* cufftXtSetWorkAreaPolicy: selects a cufftXtWorkAreaPolicy for the plan. Per the enum's own comment, CUFFT_WORKAREA_USER uses the workSize parameter as a limit; how workSize is treated for other policies is not shown here. */
500 | cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize);
501 | 
502 | #ifdef __cplusplus
503 | }
504 | #endif
505 | 
506 | #endif /* _CUFFT_H_ */
507 | 


--------------------------------------------------------------------------------