├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── build.sh ├── requirements.txt ├── run_sample.sh ├── sample ├── cuda │ ├── bandwidth_test.cu │ ├── include │ │ ├── exception.h │ │ ├── helper_cuda.h │ │ ├── helper_functions.h │ │ ├── helper_image.h │ │ ├── helper_string.h │ │ └── helper_timer.h │ ├── matrix_mul.cu │ └── vector_add.cu └── nvml │ ├── nvml_example.c │ └── supported_vgpus.c ├── src ├── common │ ├── hook.h │ ├── macro_common.h │ └── trace_profile.h ├── cublas │ ├── cublas_hook.cpp │ └── cublas_subset.h ├── cublasLt │ ├── cublasLt_hook.cpp │ └── cublasLt_subset.h ├── cuda │ ├── cuda_hook.cpp │ └── cuda_subset.h ├── cudart │ ├── cudart_hook.cpp │ └── cudart_subset.h ├── cudnn │ ├── cudnn_hook.cpp │ └── cudnn_subset.h ├── cufft │ ├── cufft_hook.cpp │ └── cufft_subset.h ├── curand │ ├── curand_hook.cpp │ └── curand_subset.h ├── cusolver │ ├── cusolver_hook.cpp │ └── cusolver_subset.h ├── cusparse │ ├── cusparse_hook.cpp │ └── cusparse_subset.h ├── nvblas │ └── nvblas_hook.cpp ├── nvjpeg │ ├── nvjpeg_hook.cpp │ └── nvjpeg_subset.h ├── nvml │ ├── nvml_hook.cpp │ └── nvml_subset.h ├── nvrtc │ ├── nvrtc_hook.cpp │ └── nvrtc_subset.h └── nvtx │ ├── nvtx_hook.cpp │ └── nvtx_subset.h └── tools └── code_generate ├── code_generate.py ├── code_generate.sh └── include ├── cublas.h ├── cublasLt.h ├── cuda.h ├── cuda_runtime_api.h ├── cudnn.h ├── cufft.h ├── curand.h ├── cusolver_common.h ├── cusparse.h ├── nvToolsExt.h ├── nvblas.h ├── nvjpeg.h ├── nvml.h └── nvrtc.h /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | 3 | IndentWidth: 4 4 | 5 | TabWidth: 4 6 | 7 | UseTab: Never 8 | 9 | IndentCaseLabels: true 10 | 11 | IndentWrappedFunctionNames: true 12 | 13 | ColumnLimit: 120 14 | 15 | AccessModifierOffset: -4 16 | 17 | AllowShortFunctionsOnASingleLine: Empty 18 | 19 | AllowShortIfStatementsOnASingleLine: false 20 | 21 | AllowShortLoopsOnASingleLine: false 22 | 23 
| AllowShortBlocksOnASingleLine: false 24 | 25 | AllowShortCaseLabelsOnASingleLine: false 26 | 27 | KeepEmptyLinesAtTheStartOfBlocks: true 28 | 29 | MaxEmptyLinesToKeep: 1 30 | 31 | DerivePointerAlignment: false 32 | 33 | PointerAlignment: Right 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .VSCodeCounter/ 3 | .idea/ 4 | 5 | __pycache__/ 6 | go/ 7 | 8 | *~/ 9 | build/ 10 | install/ 11 | release/ 12 | output/ 13 | bin/ 14 | log/ 15 | model/ 16 | ncu/ 17 | nsys/ 18 | roofline/ 19 | ptx/ 20 | sass/ 21 | tmp/ 22 | temp/ 23 | 24 | *.o 25 | *.so 26 | *.so.* 27 | *.out 28 | *.log 29 | *.bak 30 | *.pkz 31 | 32 | setting.h 33 | .config* 34 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2022. All Rights Reserved. 
2 | # Author: Bruce-Lee-LY 3 | # Date: 16:19:40 on Sun, May 29, 2022 4 | # 5 | # Description: cmake for cuda hook 6 | 7 | cmake_minimum_required (VERSION 3.12) 8 | 9 | project (cuda_hook LANGUAGES C CXX) 10 | 11 | set (CMAKE_VERBOSE_MAKEFILE ${HOOK_VERBOSE_MAKEFILE}) 12 | 13 | set (CMAKE_C_VISIBILITY_PRESET hidden) 14 | set (CMAKE_CXX_VISIBILITY_PRESET hidden) 15 | set (CMAKE_POSITION_INDEPENDENT_CODE ON) 16 | 17 | set (CMAKE_C_FLAGS "-std=c11") 18 | set (CMAKE_C_FLAGS_DEBUG "$ENV{CFLAGS} -O0 -g2 -ggdb -DHOOK_BUILD_DEBUG") 19 | set (CMAKE_C_FLAGS_RELEASE "$ENV{CFLAGS} -O3") 20 | 21 | set (CMAKE_CXX_FLAGS "-std=c++11") 22 | set (CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O0 -g2 -ggdb -DHOOK_BUILD_DEBUG") 23 | set (CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} -O3") 24 | 25 | set (CMAKE_SHARED_LINKER_FLAGS "-s -Wl,--exclude-libs,ALL") 26 | set (CMAKE_EXE_LINKER_FLAGS "-Wl,--as-needed") 27 | 28 | add_compile_options ( 29 | -Wall 30 | -Werror 31 | -Wextra 32 | -Wswitch-default 33 | -Wfloat-equal 34 | -Wshadow 35 | -Wcast-qual 36 | ) 37 | 38 | include_directories ( 39 | ${PROJECT_SOURCE_DIR}/src/common 40 | ${PROJECT_SOURCE_DIR}/src/cuda 41 | ${PROJECT_SOURCE_DIR}/src/nvml 42 | ${PROJECT_SOURCE_DIR}/src/cudart 43 | ${PROJECT_SOURCE_DIR}/src/cudnn 44 | ${PROJECT_SOURCE_DIR}/src/cublas 45 | ${PROJECT_SOURCE_DIR}/src/cublasLt 46 | ${PROJECT_SOURCE_DIR}/src/cufft 47 | ${PROJECT_SOURCE_DIR}/src/nvtx 48 | ${PROJECT_SOURCE_DIR}/src/nvrtc 49 | ${PROJECT_SOURCE_DIR}/src/curand 50 | ${PROJECT_SOURCE_DIR}/src/cusparse 51 | ${PROJECT_SOURCE_DIR}/src/cusolver 52 | ${PROJECT_SOURCE_DIR}/src/nvjpeg 53 | ${PROJECT_SOURCE_DIR}/src/nvblas 54 | ) 55 | 56 | file (GLOB HOOK_SRCS 57 | ${PROJECT_SOURCE_DIR}/src/*/*.cpp 58 | ) 59 | 60 | # libcuda_hook.so 61 | add_library (cuda_hook SHARED ${HOOK_SRCS}) 62 | target_link_libraries (cuda_hook -ldl) 63 | 64 | install (TARGETS cuda_hook LIBRARY DESTINATION lib64) 65 | 66 | # libcuda_hook.a 67 | add_library (cuda_hook_static STATIC ${HOOK_SRCS}) 68 | 
target_link_libraries (cuda_hook_static -ldl) 69 | set_target_properties (cuda_hook_static PROPERTIES OUTPUT_NAME cuda_hook) 70 | 71 | install (TARGETS cuda_hook_static ARCHIVE DESTINATION lib64) 72 | 73 | if (HOOK_WITH_SAMPLE) 74 | find_package (CUDA REQUIRED) 75 | unset (CUDA_USE_STATIC_CUDA_RUNTIME CACHE) 76 | option (CUDA_USE_STATIC_CUDA_RUNTIME "Use the static version of the CUDA runtime library" OFF) # option() takes <variable> "<help_text>" [value] 77 | 78 | set (CUDA_VERBOSE_BUILD ${HOOK_VERBOSE_MAKEFILE}) 79 | set (CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11") 80 | if (CMAKE_BUILD_TYPE MATCHES "Debug") # pass the variable name: an unset/empty build type must not turn into a malformed if() 81 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch=sm_${CMAKE_CUDA_ARCHITECTURES} -g -lineinfo -O0") 82 | else () 83 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_${CMAKE_CUDA_ARCHITECTURES},code=sm_${CMAKE_CUDA_ARCHITECTURES} --use_fast_math -O3") 84 | endif () 85 | 86 | set (SYS_CUDART_PATH "/usr/local/cuda") 87 | set (SYS_CUDA_DRIVER_PATH "/usr/lib/x86_64-linux-gnu") 88 | 89 | include_directories ( 90 | ${SYS_CUDART_PATH}/include 91 | ${PROJECT_SOURCE_DIR}/sample/cuda/include 92 | ) 93 | 94 | link_directories ( 95 | ${SYS_CUDART_PATH}/lib64 96 | ${SYS_CUDA_DRIVER_PATH} 97 | ) 98 | 99 | # cuda 100 | cuda_add_executable (bandwidth_test ${PROJECT_SOURCE_DIR}/sample/cuda/bandwidth_test.cu) 101 | cuda_add_executable (matrix_mul ${PROJECT_SOURCE_DIR}/sample/cuda/matrix_mul.cu) 102 | cuda_add_executable (vector_add ${PROJECT_SOURCE_DIR}/sample/cuda/vector_add.cu) 103 | 104 | install (TARGETS bandwidth_test matrix_mul vector_add RUNTIME DESTINATION sample/cuda) 105 | 106 | # nvml 107 | add_executable (nvml_example ${PROJECT_SOURCE_DIR}/sample/nvml/nvml_example.c) 108 | target_link_libraries (nvml_example -lnvidia-ml) 109 | add_executable (supported_vgpus ${PROJECT_SOURCE_DIR}/sample/nvml/supported_vgpus.c) 110 | target_link_libraries (supported_vgpus -lnvidia-ml) 111 | 112 | install (TARGETS nvml_example supported_vgpus RUNTIME DESTINATION sample/nvml) 113 | endif () 114 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Bruce-Lee-LY 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA Hook 2 | Hooks CUDA-related dynamic libraries using automated code generation tools. With it, you can easily obtain the CUDA APIs called by a CUDA program, and you can also hijack CUDA APIs to insert custom logic. 3 | 4 | It implements an ingenious tool that automatically generates the code hooking the CUDA API from the native CUDA header files, and it is extremely practical and extensible. 
5 | 6 | At present, the hooking of dynamic libraries such as cuda driver, nvml, cuda runtime, cudnn, cublas, cublasLt, cufft, nvtx, nvrtc, curand, cusparse, cusolver, nvjpeg and nvblas has been completed, and it can also be easily extended to the hooking of other cuda dynamic libraries. 7 | 8 | # Support Dynamic Libraries 9 | - CUDA Driver: libcuda.so 10 | - NVML: libnvidia-ml.so 11 | - CUDA Runtime: libcudart.so 12 | - CUDNN: libcudnn.so 13 | - CUBLAS: libcublas.so 14 | - CUBLASLT: libcublasLt.so 15 | - CUFFT: libcufft.so 16 | - NVTX: libnvToolsExt.so 17 | - NVRTC: libnvrtc.so 18 | - CURAND: libcurand.so 19 | - CUSPARSE: libcusparse.so 20 | - CUSOLVER: libcusolver.so 21 | - NVJPEG: libnvjpeg.so 22 | - NVBLAS: libnvblas.so 23 | 24 | # Compile 25 | ## Environment 26 | - OS: Linux 27 | - Cmake Version: >= 3.12 28 | - GCC Version: >= 4.8 29 | - CUDA Version: 11.4 (best) 30 | - CUDA Driver Version: 470.129.06 (best) 31 | - CUDNN Version: 7.6.5 (best) 32 | 33 | ## Clone 34 | ``` 35 | git clone https://github.com/Bruce-Lee-LY/cuda_hook.git 36 | ``` 37 | 38 | ## Build 39 | ### GTX1080Ti 40 | ``` 41 | cd cuda_hook 42 | ./build.sh -a 61 -t Release -s ON -b OFF 43 | ./build.sh -a 61 -t Debug -s OFF -b ON 44 | ``` 45 | 46 | ### Tesla V100 47 | ``` 48 | cd cuda_hook 49 | ./build.sh -a 70 -t Release -s ON -b OFF 50 | ./build.sh -a 70 -t Debug -s OFF -b ON 51 | ``` 52 | 53 | ### RTX2080Ti 54 | ``` 55 | cd cuda_hook 56 | ./build.sh -a 75 -t Release -s ON -b OFF 57 | ./build.sh -a 75 -t Debug -s OFF -b ON 58 | ``` 59 | 60 | ### NVIDIA A100 61 | ``` 62 | cd cuda_hook 63 | ./build.sh -a 80 -t Release -s ON -b OFF 64 | ./build.sh -a 80 -t Debug -s OFF -b ON 65 | ``` 66 | 67 | ### RTX3080Ti / RTX3090 / RTX A6000 68 | ``` 69 | cd cuda_hook 70 | ./build.sh -a 86 -t Release -s ON -b OFF 71 | ./build.sh -a 86 -t Debug -s OFF -b ON 72 | ``` 73 | 74 | # Run Sample 75 | ``` 76 | ./run_sample.sh 77 | ``` 78 | 79 | # Tools 80 | ## Code Generate 81 | Use CUDA native header files to 
automatically generate code that hooks CUDA API. 82 | ``` 83 | cd tools/code_generate 84 | ./code_generate.sh 85 | ``` 86 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # NOTE: the shebang must stay on the first line, otherwise it is only a comment and the script runs under whatever shell invoked it 3 | # Copyright 2022. All Rights Reserved. 4 | # Author: Bruce-Lee-LY 5 | # Date: 16:19:40 on Sun, May 29, 2022 6 | # 7 | # Description: compile script 8 | 9 | set -euo pipefail 10 | 11 | echo "========== build enter ==========" 12 | 13 | WORK_PATH=$(cd $(dirname $0) && pwd) && cd $WORK_PATH 14 | 15 | CUDA_ARCHITECTURE=86 # a: (Tesla P100: 60, GTX1080Ti: 61, Tesla V100: 70, RTX2080Ti: 75, NVIDIA A100: 80, RTX3080Ti / RTX3090 / RTX A6000: 86, RTX4090: 89, NVIDIA H100: 90) 16 | BUILD_TYPE=Debug # t: (Debug, Release) 17 | WITH_SAMPLE=ON # s: (ON, OFF) 18 | VERBOSE_MAKEFILE=OFF # b: (ON, OFF) 19 | 20 | while getopts ":a:t:s:b:" opt 21 | do 22 | case $opt in 23 | a) 24 | CUDA_ARCHITECTURE=$OPTARG 25 | echo "CUDA_ARCHITECTURE: $CUDA_ARCHITECTURE" 26 | ;; 27 | t) 28 | BUILD_TYPE=$OPTARG 29 | echo "BUILD_TYPE: $BUILD_TYPE" 30 | ;; 31 | s) 32 | WITH_SAMPLE=$OPTARG 33 | echo "WITH_SAMPLE: $WITH_SAMPLE" 34 | ;; 35 | b) 36 | VERBOSE_MAKEFILE=$OPTARG 37 | echo "VERBOSE_MAKEFILE: $VERBOSE_MAKEFILE" 38 | ;; 39 | ?) 40 | echo "invalid param: $OPTARG" 41 | exit 1 42 | ;; 43 | esac 44 | done 45 | 46 | echo_cmd() { 47 | echo $1 48 | $1 49 | } 50 | 51 | echo "========== build cuda_hook ==========" 52 | 53 | echo_cmd "rm -rf build output" 54 | echo_cmd "mkdir build" 55 | 56 | echo_cmd "cd build" 57 | echo_cmd "cmake -DCMAKE_CUDA_ARCHITECTURES=$CUDA_ARCHITECTURE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DHOOK_WITH_SAMPLE=$WITH_SAMPLE -DHOOK_VERBOSE_MAKEFILE=$VERBOSE_MAKEFILE -DCMAKE_INSTALL_PREFIX=$WORK_PATH/output -DCMAKE_SKIP_RPATH=ON .." 
58 | echo_cmd "make -j$(nproc --ignore=2)" 59 | echo_cmd "make install" 60 | 61 | echo "========== create soft link ==========" 62 | 63 | # cuda 64 | echo_cmd "ln -s libcuda_hook.so libcuda.so.1" 65 | echo_cmd "ln -s libcuda.so.1 libcuda.so" 66 | 67 | # nvml 68 | echo_cmd "ln -s libcuda_hook.so libnvidia-ml.so.1" 69 | echo_cmd "ln -s libnvidia-ml.so.1 libnvidia-ml.so" 70 | 71 | # cudart 72 | echo_cmd "ln -s libcuda_hook.so libcudart.so.11.0" 73 | echo_cmd "ln -s libcudart.so.11.0 libcudart.so" 74 | 75 | # cudnn 76 | echo_cmd "ln -s libcuda_hook.so libcudnn.so.7" 77 | echo_cmd "ln -s libcudnn.so.7 libcudnn.so" 78 | 79 | # cublas 80 | echo_cmd "ln -s libcuda_hook.so libcublas.so.11" 81 | echo_cmd "ln -s libcublas.so.11 libcublas.so" 82 | 83 | # cublasLt 84 | echo_cmd "ln -s libcuda_hook.so libcublasLt.so.11" 85 | echo_cmd "ln -s libcublasLt.so.11 libcublasLt.so" 86 | 87 | # cufft 88 | echo_cmd "ln -s libcuda_hook.so libcufft.so.10" 89 | echo_cmd "ln -s libcufft.so.10 libcufft.so" 90 | 91 | # nvtx 92 | echo_cmd "ln -s libcuda_hook.so libnvToolsExt.so.1" 93 | echo_cmd "ln -s libnvToolsExt.so.1 libnvToolsExt.so" 94 | 95 | # nvrtc 96 | echo_cmd "ln -s libcuda_hook.so libnvrtc.so.11.2" 97 | echo_cmd "ln -s libnvrtc.so.11.2 libnvrtc.so" 98 | 99 | # curand 100 | echo_cmd "ln -s libcuda_hook.so libcurand.so.10" 101 | echo_cmd "ln -s libcurand.so.10 libcurand.so" 102 | 103 | # cusparse 104 | echo_cmd "ln -s libcuda_hook.so libcusparse.so.11" 105 | echo_cmd "ln -s libcusparse.so.11 libcusparse.so" 106 | 107 | # cusolver 108 | echo_cmd "ln -s libcuda_hook.so libcusolver.so.11" 109 | echo_cmd "ln -s libcusolver.so.11 libcusolver.so" 110 | 111 | # nvjpeg 112 | echo_cmd "ln -s libcuda_hook.so libnvjpeg.so.11" 113 | echo_cmd "ln -s libnvjpeg.so.11 libnvjpeg.so" 114 | 115 | # nvblas 116 | echo_cmd "ln -s libcuda_hook.so libnvblas.so.11" 117 | echo_cmd "ln -s libnvblas.so.11 libnvblas.so" 118 | 119 | echo_cmd "cp -d *.so *.so.* $WORK_PATH/output/lib64" 120 | 121 | echo "========== 
build info ==========" 122 | 123 | BRANCH=`git rev-parse --abbrev-ref HEAD` 124 | COMMIT=`git rev-parse HEAD` 125 | GCC_VERSION=`gcc -dumpversion` 126 | COMPILE_TIME=$(date "+%H:%M:%S %Y-%m-%d") 127 | 128 | echo "branch: $BRANCH" >> $WORK_PATH/output/hook_version 129 | echo "commit: $COMMIT" >> $WORK_PATH/output/hook_version 130 | echo "gcc_version: $GCC_VERSION" >> $WORK_PATH/output/hook_version 131 | echo "compile_time: $COMPILE_TIME" >> $WORK_PATH/output/hook_version 132 | 133 | echo "========== build exit ==========" 134 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | robotpy-cppheaderparser==5.0.16 -------------------------------------------------------------------------------- /run_sample.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # NOTE: the shebang must stay on the first line, otherwise it is only a comment and the script runs under whatever shell invoked it 3 | # Copyright 2022. All Rights Reserved. 4 | # Author: Bruce-Lee-LY 5 | # Date: 22:25:53 on Sun, May 29, 2022 6 | # 7 | # Description: run sample script 8 | 9 | set -euo pipefail 10 | 11 | WORK_PATH=$(cd $(dirname $0) && pwd) && cd $WORK_PATH 12 | export LD_LIBRARY_PATH=$WORK_PATH/output/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/lib/x86_64-linux-gnu 13 | 14 | rm -rf log && mkdir -p log/sample/cuda log/sample/nvml 15 | 16 | # cuda/cudart 17 | nohup $WORK_PATH/output/sample/cuda/bandwidth_test > log/sample/cuda/bandwidth_test.log 2>&1 & 18 | nohup $WORK_PATH/output/sample/cuda/matrix_mul > log/sample/cuda/matrix_mul.log 2>&1 & 19 | nohup $WORK_PATH/output/sample/cuda/vector_add > log/sample/cuda/vector_add.log 2>&1 & 20 | 21 | # nvml 22 | nohup $WORK_PATH/output/sample/nvml/nvml_example > log/sample/nvml/nvml_example.log 2>&1 & 23 | nohup $WORK_PATH/output/sample/nvml/supported_vgpus > log/sample/nvml/supported_vgpus.log 2>&1 & 24 | -------------------------------------------------------------------------------- 
/sample/cuda/include/exception.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 22:17:08 on Sun, May 29, 2022 4 | // 5 | // Description: source file in /usr/local/cuda/samples/common/inc 6 | 7 | /* CUda UTility Library */ 8 | #ifndef COMMON_EXCEPTION_H_ 9 | #define COMMON_EXCEPTION_H_ 10 | 11 | // includes, system 12 | #include 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | //! Exception wrapper. 20 | //! @param Std_Exception Exception out of namespace std for easy typing. 21 | template 22 | class Exception : public Std_Exception { 23 | public: 24 | //! @brief Static construction interface 25 | //! @return Always throws (Located_Exception) 26 | //! @param file file in which the Exception occurs 27 | //! @param line line in which the Exception occurs 28 | //! @param detailed details on the code fragment causing the Exception 29 | static void throw_it(const char *file, const int line, const char *detailed = "-"); 30 | 31 | //! Static construction interface 32 | //! @return Always throws (Located_Exception) 33 | //! @param file file in which the Exception occurs 34 | //! @param line line in which the Exception occurs 35 | //! @param detailed details on the code fragment causing the Exception 36 | static void throw_it(const char *file, const int line, const std::string &detailed); 37 | 38 | //! Destructor 39 | virtual ~Exception() throw(); 40 | 41 | private: 42 | //! Constructor, default (private) 43 | Exception(); 44 | 45 | //! Constructor, standard 46 | //! @param str string returned by what() 47 | explicit Exception(const std::string &str); 48 | }; 49 | 50 | //////////////////////////////////////////////////////////////////////////////// 51 | //! Exception handler function for arbitrary exceptions 52 | //! 
@param ex exception to handle 53 | //////////////////////////////////////////////////////////////////////////////// 54 | template 55 | inline void handleException(const Exception_Typ &ex) { 56 | std::cerr << ex.what() << std::endl; 57 | 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | //! Convenience macros 62 | 63 | //! Exception caused by dynamic program behavior, e.g. file does not exist 64 | #define RUNTIME_EXCEPTION(msg) Exception::throw_it(__FILE__, __LINE__, msg) 65 | 66 | //! Logic exception in program, e.g. an assert failed 67 | #define LOGIC_EXCEPTION(msg) Exception::throw_it(__FILE__, __LINE__, msg) 68 | 69 | //! Out of range exception 70 | #define RANGE_EXCEPTION(msg) Exception::throw_it(__FILE__, __LINE__, msg) 71 | 72 | //////////////////////////////////////////////////////////////////////////////// 73 | //! Implementation 74 | 75 | // includes, system 76 | #include 77 | 78 | //////////////////////////////////////////////////////////////////////////////// 79 | //! Static construction interface. 80 | //! @param Exception causing code fragment (file and line) and detailed infos. 81 | //////////////////////////////////////////////////////////////////////////////// 82 | /*static*/ template 83 | void Exception::throw_it(const char *file, const int line, const char *detailed) { 84 | std::stringstream s; 85 | 86 | // Quite heavy-weight, but exceptions are not for 87 | // performance / release versions 88 | s << "Exception in file '" << file << "' in line " << line << "\n" 89 | << "Detailed description: " << detailed << "\n"; 90 | 91 | throw Exception(s.str()); 92 | } 93 | 94 | //////////////////////////////////////////////////////////////////////////////// 95 | //! Static construction interface. 96 | //! @param Exception causing code fragment (file and line) and detailed infos. 
97 | //////////////////////////////////////////////////////////////////////////////// 98 | /*static*/ template 99 | void Exception::throw_it(const char *file, const int line, const std::string &msg) { 100 | throw_it(file, line, msg.c_str()); 101 | } 102 | 103 | //////////////////////////////////////////////////////////////////////////////// 104 | //! Constructor, default (private). 105 | //////////////////////////////////////////////////////////////////////////////// 106 | template 107 | Exception::Exception() : Std_Exception("Unknown Exception.\n") {} 108 | 109 | //////////////////////////////////////////////////////////////////////////////// 110 | //! Constructor, standard (private). 111 | //! String returned by what(). 112 | //////////////////////////////////////////////////////////////////////////////// 113 | template 114 | Exception::Exception(const std::string &s) : Std_Exception(s) {} 115 | 116 | //////////////////////////////////////////////////////////////////////////////// 117 | //! Destructor 118 | //////////////////////////////////////////////////////////////////////////////// 119 | template 120 | Exception::~Exception() throw() {} 121 | 122 | // functions, exported 123 | 124 | #endif // COMMON_EXCEPTION_H_ 125 | -------------------------------------------------------------------------------- /sample/cuda/include/helper_functions.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 
2 | // Author: Bruce-Lee-LY 3 | // Date: 22:17:08 on Sun, May 29, 2022 4 | // 5 | // Description: source file in /usr/local/cuda/samples/common/inc 6 | 7 | // These are helper functions for the SDK samples (string parsing, 8 | // timers, image helpers, etc) 9 | #ifndef COMMON_HELPER_FUNCTIONS_H_ 10 | #define COMMON_HELPER_FUNCTIONS_H_ 11 | 12 | #ifdef WIN32 13 | #pragma warning(disable : 4996) 14 | #endif 15 | 16 | // includes, project 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | // includes, timer, string parsing, image helpers 30 | #include // helper functions for image compare, dump, data comparisons 31 | #include // helper functions for string parsing 32 | #include // helper functions for timers 33 | 34 | #ifndef EXIT_WAIVED 35 | #define EXIT_WAIVED 2 36 | #endif 37 | 38 | #endif // COMMON_HELPER_FUNCTIONS_H_ 39 | -------------------------------------------------------------------------------- /sample/cuda/include/helper_timer.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 22:17:08 on Sun, May 29, 2022 4 | // 5 | // Description: source file in /usr/local/cuda/samples/common/inc 6 | 7 | // Helper Timing Functions 8 | #ifndef COMMON_HELPER_TIMER_H_ 9 | #define COMMON_HELPER_TIMER_H_ 10 | 11 | #ifndef EXIT_WAIVED 12 | #define EXIT_WAIVED 2 13 | #endif 14 | 15 | // includes, system 16 | #include 17 | 18 | // includes, project 19 | #include 20 | 21 | // Definition of the StopWatch Interface, this is used if we don't want to use 22 | // the CUT functions But rather in a self contained class interface 23 | class StopWatchInterface { 24 | public: 25 | StopWatchInterface() {} 26 | virtual ~StopWatchInterface() {} 27 | 28 | public: 29 | //! Start time measurement 30 | virtual void start() = 0; 31 | 32 | //! 
Stop time measurement 33 | virtual void stop() = 0; 34 | 35 | //! Reset time counters to zero 36 | virtual void reset() = 0; 37 | 38 | //! Time in msec. after start. If the stop watch is still running (i.e. there 39 | //! was no call to stop()) then the elapsed time is returned, otherwise the 40 | //! time between the last start() and stop call is returned 41 | virtual float getTime() = 0; 42 | 43 | //! Mean time to date based on the number of times the stopwatch has been 44 | //! _stopped_ (ie finished sessions) and the current total time 45 | virtual float getAverageTime() = 0; 46 | }; 47 | 48 | ////////////////////////////////////////////////////////////////// 49 | // Begin Stopwatch timer class definitions for all OS platforms // 50 | ////////////////////////////////////////////////////////////////// 51 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 52 | // includes, system -- NOTE(review): the Windows SDK macro is spelled WIN32_LEAN_AND_MEAN; confirm whether WINDOWS_LEAN_AND_MEAN below is intentional 53 | #define WINDOWS_LEAN_AND_MEAN 54 | #include 55 | #undef min 56 | #undef max 57 | 58 | //! Windows specific implementation of StopWatch 59 | class StopWatchWin : public StopWatchInterface { 60 | public: 61 | //! Constructor, default 62 | StopWatchWin() 63 | : start_time(), 64 | end_time(), 65 | diff_time(0.0f), 66 | total_time(0.0f), 67 | running(false), 68 | clock_sessions(0), 69 | freq(0), 70 | freq_set(false) { 71 | if (!freq_set) { 72 | // helper variable 73 | LARGE_INTEGER temp; 74 | 75 | // get the tick frequency from the OS 76 | QueryPerformanceFrequency(reinterpret_cast(&temp)); 77 | 78 | // convert to type in which it is needed 79 | freq = (static_cast(temp.QuadPart)) / 1000.0; 80 | 81 | // remember query 82 | freq_set = true; 83 | } 84 | } 85 | 86 | // Destructor 87 | ~StopWatchWin() {} 88 | 89 | public: 90 | //! Start time measurement 91 | inline void start(); 92 | 93 | //! Stop time measurement 94 | inline void stop(); 95 | 96 | //! Reset time counters to zero 97 | inline void reset(); 98 | 99 | //! Time in msec. after start. 
If the stop watch is still running (i.e. there 100 | //! was no call to stop()) then the elapsed time is returned, otherwise the 101 | //! time between the last start() and stop call is returned 102 | inline float getTime(); 103 | 104 | //! Mean time to date based on the number of times the stopwatch has been 105 | //! _stopped_ (ie finished sessions) and the current total time 106 | inline float getAverageTime(); 107 | 108 | private: 109 | // member variables 110 | 111 | //! Start of measurement 112 | LARGE_INTEGER start_time; 113 | //! End of measurement 114 | LARGE_INTEGER end_time; 115 | 116 | //! Time difference between the last start and stop 117 | float diff_time; 118 | 119 | //! TOTAL time difference between starts and stops 120 | float total_time; 121 | 122 | //! flag if the stop watch is running 123 | bool running; 124 | 125 | //! Number of times clock has been started 126 | //! and stopped to allow averaging 127 | int clock_sessions; 128 | 129 | //! tick frequency 130 | double freq; 131 | 132 | //! flag if the frequency has been set 133 | bool freq_set; 134 | }; 135 | 136 | // functions, inlined 137 | 138 | //////////////////////////////////////////////////////////////////////////////// 139 | //! Start time measurement 140 | //////////////////////////////////////////////////////////////////////////////// 141 | inline void StopWatchWin::start() { 142 | QueryPerformanceCounter(reinterpret_cast(&start_time)); 143 | running = true; 144 | } 145 | 146 | //////////////////////////////////////////////////////////////////////////////// 147 | //! Stop time measurement and increment add to the current diff_time summation 148 | //! variable. Also increment the number of times this clock has been run. 
149 | //////////////////////////////////////////////////////////////////////////////// 150 | inline void StopWatchWin::stop() { 151 | QueryPerformanceCounter(reinterpret_cast(&end_time)); 152 | diff_time = static_cast( 153 | ((static_cast(end_time.QuadPart) - static_cast(start_time.QuadPart)) / freq)); 154 | 155 | total_time += diff_time; 156 | clock_sessions++; 157 | running = false; 158 | } 159 | 160 | //////////////////////////////////////////////////////////////////////////////// 161 | //! Reset the timer to 0. Does not change the timer running state but does 162 | //! recapture this point in time as the current start time if it is running. 163 | //////////////////////////////////////////////////////////////////////////////// 164 | inline void StopWatchWin::reset() { 165 | diff_time = 0; 166 | total_time = 0; 167 | clock_sessions = 0; 168 | 169 | if (running) { 170 | QueryPerformanceCounter(reinterpret_cast(&start_time)); 171 | } 172 | } 173 | 174 | //////////////////////////////////////////////////////////////////////////////// 175 | //! Time in msec. after start. If the stop watch is still running (i.e. there 176 | //! was no call to stop()) then the elapsed time is returned added to the 177 | //! current diff_time sum, otherwise the current summed time difference alone 178 | //! is returned. 179 | //////////////////////////////////////////////////////////////////////////////// 180 | inline float StopWatchWin::getTime() { 181 | // Return the TOTAL time to date 182 | float retval = total_time; 183 | 184 | if (running) { 185 | LARGE_INTEGER temp; 186 | QueryPerformanceCounter(reinterpret_cast(&temp)); 187 | retval += static_cast( 188 | ((static_cast(temp.QuadPart) - static_cast(start_time.QuadPart)) / freq)); 189 | } 190 | 191 | return retval; 192 | } 193 | 194 | //////////////////////////////////////////////////////////////////////////////// 195 | //! Time in msec. for a single run based on the total number of COMPLETED runs 196 | //! and the total time. 
197 | //////////////////////////////////////////////////////////////////////////////// 198 | inline float StopWatchWin::getAverageTime() { 199 | return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; 200 | } 201 | #else 202 | // Declarations for Stopwatch on Linux and Mac OSX 203 | // includes, system 204 | #include 205 | 206 | #include 207 | 208 | //! Linux / Mac OSX specific implementation of StopWatch 209 | class StopWatchLinux : public StopWatchInterface { 210 | public: 211 | //! Constructor, default 212 | StopWatchLinux() : start_time(), diff_time(0.0), total_time(0.0), running(false), clock_sessions(0) {} 213 | 214 | // Destructor 215 | virtual ~StopWatchLinux() {} 216 | 217 | public: 218 | //! Start time measurement 219 | inline void start(); 220 | 221 | //! Stop time measurement 222 | inline void stop(); 223 | 224 | //! Reset time counters to zero 225 | inline void reset(); 226 | 227 | //! Total time in msec. accumulated over all start()/stop() sessions since 228 | //! the last reset. If the stop watch is still running (i.e. there was no 229 | //! call to stop()) the time elapsed since the last start() is added 230 | inline float getTime(); 231 | 232 | //! Mean time to date based on the number of times the stopwatch has been 233 | //! _stopped_ (ie finished sessions) and the current total time 234 | inline float getAverageTime(); 235 | 236 | private: 237 | // helper functions 238 | 239 | //! Get difference between start time and current time 240 | inline float getDiffTime(); 241 | 242 | private: 243 | // member variables 244 | 245 | //! Start of measurement 246 | struct timeval start_time; 247 | 248 | //! Time difference between the last start and stop 249 | float diff_time; 250 | 251 | //! TOTAL time difference between starts and stops 252 | float total_time; 253 | 254 | //! flag if the stop watch is running 255 | bool running; 256 | 257 | //! Number of times clock has been started 258 | //! 
and stopped to allow averaging 259 | int clock_sessions; 260 | }; 261 | 262 | // functions, inlined 263 | 264 | //////////////////////////////////////////////////////////////////////////////// 265 | //! Start time measurement 266 | //////////////////////////////////////////////////////////////////////////////// 267 | inline void StopWatchLinux::start() { 268 | gettimeofday(&start_time, 0); 269 | running = true; 270 | } 271 | 272 | //////////////////////////////////////////////////////////////////////////////// 273 | //! Stop time measurement and increment add to the current diff_time summation 274 | //! variable. Also increment the number of times this clock has been run. 275 | //////////////////////////////////////////////////////////////////////////////// 276 | inline void StopWatchLinux::stop() { 277 | diff_time = getDiffTime(); 278 | total_time += diff_time; 279 | running = false; 280 | clock_sessions++; 281 | } 282 | 283 | //////////////////////////////////////////////////////////////////////////////// 284 | //! Reset the timer to 0. Does not change the timer running state but does 285 | //! recapture this point in time as the current start time if it is running. 286 | //////////////////////////////////////////////////////////////////////////////// 287 | inline void StopWatchLinux::reset() { 288 | diff_time = 0; 289 | total_time = 0; 290 | clock_sessions = 0; 291 | 292 | if (running) { 293 | gettimeofday(&start_time, 0); 294 | } 295 | } 296 | 297 | //////////////////////////////////////////////////////////////////////////////// 298 | //! Time in msec. after start. If the stop watch is still running (i.e. there 299 | //! was no call to stop()) then the elapsed time is returned added to the 300 | //! current diff_time sum, otherwise the current summed time difference alone 301 | //! is returned. 
302 | //////////////////////////////////////////////////////////////////////////////// 303 | inline float StopWatchLinux::getTime() { 304 | // Return the TOTAL time to date 305 | float retval = total_time; 306 | 307 | if (running) { 308 | retval += getDiffTime(); 309 | } 310 | 311 | return retval; 312 | } 313 | 314 | //////////////////////////////////////////////////////////////////////////////// 315 | //! Time in msec. for a single run based on the total number of COMPLETED runs 316 | //! and the total time. 317 | //////////////////////////////////////////////////////////////////////////////// 318 | inline float StopWatchLinux::getAverageTime() { 319 | return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; 320 | } 321 | //////////////////////////////////////////////////////////////////////////////// 322 | 323 | //////////////////////////////////////////////////////////////////////////////// 324 | inline float StopWatchLinux::getDiffTime() { 325 | struct timeval t_time; 326 | gettimeofday(&t_time, 0); 327 | 328 | // time difference in milli-seconds 329 | return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + 330 | (0.001 * (t_time.tv_usec - start_time.tv_usec))); 331 | } 332 | #endif // WIN32 333 | 334 | //////////////////////////////////////////////////////////////////////////////// 335 | //! Timer functionality exported 336 | 337 | //////////////////////////////////////////////////////////////////////////////// 338 | //! Create a new timer 339 | //! @return true if a time has been created, otherwise false 340 | //! 
@param name of the new timer, 0 if the creation failed 341 | //////////////////////////////////////////////////////////////////////////////// 342 | inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { 343 | // printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); 344 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 345 | *timer_interface = reinterpret_cast(new StopWatchWin()); 346 | #else 347 | *timer_interface = reinterpret_cast(new StopWatchLinux()); 348 | #endif 349 | return (*timer_interface != NULL) ? true : false; 350 | } 351 | 352 | //////////////////////////////////////////////////////////////////////////////// 353 | //! Delete a timer 354 | //! @return true if a time has been deleted, otherwise false 355 | //! @param name of the timer to delete 356 | //////////////////////////////////////////////////////////////////////////////// 357 | inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { 358 | // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); 359 | if (*timer_interface) { 360 | delete *timer_interface; 361 | *timer_interface = NULL; 362 | } 363 | 364 | return true; 365 | } 366 | 367 | //////////////////////////////////////////////////////////////////////////////// 368 | //! Start the time with name \a name 369 | //! @param name name of the timer to start 370 | //////////////////////////////////////////////////////////////////////////////// 371 | inline bool sdkStartTimer(StopWatchInterface **timer_interface) { 372 | // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); 373 | if (*timer_interface) { 374 | (*timer_interface)->start(); 375 | } 376 | 377 | return true; 378 | } 379 | 380 | //////////////////////////////////////////////////////////////////////////////// 381 | //! Stop the time with name \a name. Does not reset. 382 | //! 
@param name name of the timer to stop 383 | //////////////////////////////////////////////////////////////////////////////// 384 | inline bool sdkStopTimer(StopWatchInterface **timer_interface) { 385 | // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); 386 | if (*timer_interface) { 387 | (*timer_interface)->stop(); 388 | } 389 | 390 | return true; 391 | } 392 | 393 | //////////////////////////////////////////////////////////////////////////////// 394 | //! Resets the timer's counter. 395 | //! @param name name of the timer to reset. 396 | //////////////////////////////////////////////////////////////////////////////// 397 | inline bool sdkResetTimer(StopWatchInterface **timer_interface) { 398 | // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); 399 | if (*timer_interface) { 400 | (*timer_interface)->reset(); 401 | } 402 | 403 | return true; 404 | } 405 | 406 | //////////////////////////////////////////////////////////////////////////////// 407 | //! Return the average time for timer execution as the total time 408 | //! for the timer dividied by the number of completed (stopped) runs the timer 409 | //! has made. 410 | //! Excludes the current running time if the timer is currently running. 411 | //! @param name name of the timer to return the time of 412 | //////////////////////////////////////////////////////////////////////////////// 413 | inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { 414 | // printf("sdkGetAverageTimerValue called object %08x\n", (void 415 | // *)*timer_interface); 416 | if (*timer_interface) { 417 | return (*timer_interface)->getAverageTime(); 418 | } else { 419 | return 0.0f; 420 | } 421 | } 422 | 423 | //////////////////////////////////////////////////////////////////////////////// 424 | //! Total execution time for the timer over all runs since the last reset 425 | //! or timer creation. 426 | //! @param name name of the timer to obtain the value of. 
427 | //////////////////////////////////////////////////////////////////////////////// 428 | inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { 429 | // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); 430 | if (*timer_interface) { 431 | return (*timer_interface)->getTime(); 432 | } else { 433 | return 0.0f; 434 | } 435 | } 436 | 437 | #endif // COMMON_HELPER_TIMER_H_ 438 | -------------------------------------------------------------------------------- /sample/cuda/matrix_mul.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 22:16:00 on Sun, May 29, 2022 4 | // 5 | // Description: source file in cuda/samples/0_Simple/matrixMul/matrixMul.cu 6 | 7 | /** 8 | * Matrix multiplication: C = A * B. 9 | * Host code. 10 | * 11 | * This sample implements matrix multiplication which makes use of shared memory 12 | * to ensure data reuse, the matrix multiplication is done using tiling 13 | * approach. It has been written for clarity of exposition to illustrate various 14 | * CUDA programming principles, not with the goal of providing the most 15 | * performant generic kernel for matrix multiplication. See also: V. Volkov and 16 | * J. Demmel, "Benchmarking GPUs to tune dense linear algebra," in Proc. 2008 17 | * ACM/IEEE Conf. on Supercomputing (SC '08), Piscataway, NJ: IEEE Press, 2008, 18 | * pp. Art. 31:1-11. 
*/

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

/**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 *
 * Launch contract (the kernel performs no bounds checks):
 *  - blockDim must be (BLOCK_SIZE, BLOCK_SIZE);
 *  - gridDim must be (wB / BLOCK_SIZE, hA / BLOCK_SIZE), where hA is A's height;
 *  - wA, wB and hA must be multiples of BLOCK_SIZE;
 *  - each block uses 2 * BLOCK_SIZE * BLOCK_SIZE floats of static shared memory.
 */
template <int BLOCK_SIZE>
__global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}

/**
 * Fill the first `size` elements of `data` with `val`.
 */
void ConstantInit(float *data, int size, float val) {
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
 * Enqueue one MatrixMulCUDA launch on `stream`, choosing the template
 * instantiation for block_size (16 -> 16x16 tiles, otherwise 32x32 tiles).
 */
static void LaunchMatrixMul(int block_size, const dim3 &grid, const dim3 &threads, cudaStream_t stream, float *d_C,
                            float *d_A, float *d_B, int wA, int wB) {
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, wA, wB);
    } else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, wA, wB);
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 *
 * A (dimsA.x wide, dimsA.y high) is filled with 1.0 and B with 0.01, so every
 * element of C must equal dimsA.x * 0.01. The kernel is run once as warm-up,
 * then timed over 300 iterations with CUDA events.
 *
 * @return EXIT_SUCCESS if the result verifies, EXIT_FAILURE otherwise (also
 *         for an unsupported block size or unsupported dimensions).
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB) {
    // Only the 16x16 and 32x32 kernels are instantiated; any other value would
    // launch a kernel whose tile size does not match the thread block.
    if (block_size != 16 && block_size != 32) {
        fprintf(stderr, "Error: unsupported block size %d (expected 16 or 32).\n", block_size);
        return EXIT_FAILURE;
    }

    // The kernel has no bounds checks and the grid is computed by integer
    // division, so partial tiles would read and write out of bounds.
    const unsigned int tile = static_cast<unsigned int>(block_size);
    if (dimsA.x == 0 || dimsA.y == 0 || dimsB.x == 0 || dimsA.x % tile != 0 || dimsA.y % tile != 0 ||
        dimsB.x % tile != 0) {
        fprintf(stderr, "Error: matrix dimensions must be non-zero multiples of the block size (%d).\n", block_size);
        return EXIT_FAILURE;
    }

    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A;
    checkCudaErrors(cudaMallocHost((void **)&h_A, mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B;
    checkCudaErrors(cudaMallocHost((void **)&h_B, mem_size_B));
    cudaStream_t stream;

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C;
    checkCudaErrors(cudaMallocHost((void **)&h_C, mem_size_C));
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));

    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    LaunchMatrixMul(block_size, grid, threads, stream, d_C, d_A, d_B, dimsA.x, dimsB.x);
    // A bad launch configuration is only reported through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaStreamSynchronize(stream));
    printf("done\n");

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        LaunchMatrixMul(block_size, grid, threads, stream, d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    // Checked once after the loop so the timed region stays free of extra API calls.
    checkCudaErrors(cudaGetLastError());

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf(
        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
        " WorkgroupSize= %u threads/block\n",
        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x, y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6;  // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;

        // Written as a negated <= so that a NaN result counts as a failure
        // (every ordered comparison with NaN is false).
        if (!(rel_err <= eps)) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));

    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    // The stream was created above and must be released as well.
    checkCudaErrors(cudaStreamDestroy(stream));
    printf(
        "\nNOTE: The CUDA Samples are not meant for performance "
        "measurements. Results may vary when GPU Boost is enabled.\n");

    if (correct) {
        return EXIT_SUCCESS;
    } else {
        return EXIT_FAILURE;
    }
}

/**
 * Program main
 *
 * Parses the optional -device, -wA, -hA, -wB and -hB flags, checks that the
 * inner dimensions agree and runs MatrixMultiply with a block size of 32.
 */
int main(int argc, char **argv) {
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf(
            "  Note: Outer matrix dimensions of A & B matrices"
            " must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    findCudaDevice(argc, (const char **)argv);

    int block_size = 32;

    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    // dim3 members are unsigned, hence %u
    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%u != %u)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%u,%u), MatrixB(%u,%u)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);

    exit(matrix_result);
}

--------------------------------------------------------------------------------
/sample/cuda/vector_add.cu:
--------------------------------------------------------------------------------
// Copyright 2022. All Rights Reserved.
// Author: Bruce-Lee-LY
// Date: 22:15:42 on Sun, May 29, 2022
//
// Description: source file in cuda/samples/0_Simple/vectorAdd/vectorAdd.cu

/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */

#include <stdio.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
#include <helper_cuda.h>

/**
 * CUDA Kernel Device code
 *
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
*/
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so trailing threads must do nothing.
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}

/**
 * Exit with a diagnostic if a CUDA call failed.
 *
 * @param err     status returned by the CUDA call
 * @param action  what was attempted; printed as "Failed to <action> (error code ...)!"
 */
static void ExitOnCudaError(cudaError_t err, const char *action) {
    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to %s (error code %s)!\n", action, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host main routine
 *
 * Adds two random 50000-element vectors on the device and checks the result
 * against a host-side sum.
 */
int main(void) {
    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vectors A and B and the host output vector C
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate the device input vectors A and B and the device output vector C
    float *d_A = NULL;
    ExitOnCudaError(cudaMalloc((void **)&d_A, size), "allocate device vector A");

    float *d_B = NULL;
    ExitOnCudaError(cudaMalloc((void **)&d_B, size), "allocate device vector B");

    float *d_C = NULL;
    ExitOnCudaError(cudaMalloc((void **)&d_C, size), "allocate device vector C");

    // Copy the host input vectors A and B in host memory to the device input vectors in
    // device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    ExitOnCudaError(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "copy vector A from host to device");
    ExitOnCudaError(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "copy vector B from host to device");

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // Launch-configuration errors are only visible through cudaGetLastError().
    ExitOnCudaError(cudaGetLastError(), "launch vectorAdd kernel");

    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    ExitOnCudaError(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "copy vector C from device to host");

    // Verify that the result vector is correct. The comparison is a negated <=
    // so that a NaN result fails instead of slipping through.
    for (int i = 0; i < numElements; ++i) {
        if (!(fabs(h_A[i] + h_B[i] - h_C[i]) <= 1e-5)) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory
    ExitOnCudaError(cudaFree(d_A), "free device vector A");
    ExitOnCudaError(cudaFree(d_B), "free device vector B");
    ExitOnCudaError(cudaFree(d_C), "free device vector C");

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    printf("Done\n");
    return 0;
}

--------------------------------------------------------------------------------
/sample/nvml/nvml_example.c:
--------------------------------------------------------------------------------
// Copyright 2022. All Rights Reserved.
// Author: Bruce-Lee-LY
// Date: 22:34:28 on Sun, May 29, 2022
//
// Description: source file in cuda/nvml/example/example.c

/***************************************************************************\
|* *|
|* Copyright 2010-2016 NVIDIA Corporation. All rights reserved. 
*| 10 | |* *| 11 | |* NOTICE TO USER: *| 12 | |* *| 13 | |* This source code is subject to NVIDIA ownership rights under U.S. *| 14 | |* and international Copyright laws. Users and possessors of this *| 15 | |* source code are hereby granted a nonexclusive, royalty-free *| 16 | |* license to use this code in individual and commercial software. *| 17 | |* *| 18 | |* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE *| 19 | |* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR *| 20 | |* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH *| 21 | |* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF *| 22 | |* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR *| 23 | |* PURPOSE. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, *| 24 | |* INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES *| 25 | |* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN *| 26 | |* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING *| 27 | |* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE *| 28 | |* CODE. *| 29 | |* *| 30 | |* U.S. Government End Users. This source code is a "commercial item" *| 31 | |* as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting *| 32 | |* of "commercial computer software" and "commercial computer software *| 33 | |* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) *| 34 | |* and is provided to the U.S. Government only as a commercial end item. *| 35 | |* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through *| 36 | |* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the *| 37 | |* source code with only those rights set forth herein. *| 38 | |* *| 39 | |* Any use of this source code in individual and commercial software must *| 40 | |* include, in the user documentation and internal comments to the code, *| 41 | |* the above Disclaimer and U.S. 
Government End Users Notice. *| 42 | |* *| 43 | |* *| 44 | \***************************************************************************/ 45 | 46 | #include 47 | #include 48 | 49 | static const char *convertToComputeModeString(nvmlComputeMode_t mode) { 50 | switch (mode) { 51 | case NVML_COMPUTEMODE_DEFAULT: 52 | return "Default"; 53 | case NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 54 | return "Exclusive_Thread"; 55 | case NVML_COMPUTEMODE_PROHIBITED: 56 | return "Prohibited"; 57 | case NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: 58 | return "Exclusive Process"; 59 | default: 60 | return "Unknown"; 61 | } 62 | } 63 | 64 | int main(void) { 65 | nvmlReturn_t result; 66 | unsigned int device_count, i; 67 | 68 | // First initialize NVML library 69 | result = nvmlInit(); 70 | if (NVML_SUCCESS != result) { 71 | printf("Failed to initialize NVML: %s\n", nvmlErrorString(result)); 72 | 73 | printf("Press ENTER to continue...\n"); 74 | getchar(); 75 | return 1; 76 | } 77 | 78 | result = nvmlDeviceGetCount(&device_count); 79 | if (NVML_SUCCESS != result) { 80 | printf("Failed to query device count: %s\n", nvmlErrorString(result)); 81 | goto Error; 82 | } 83 | printf("Found %u device%s\n\n", device_count, device_count != 1 ? 
"s" : ""); 84 | 85 | printf("Listing devices:\n"); 86 | for (i = 0; i < device_count; i++) { 87 | nvmlDevice_t device; 88 | char name[NVML_DEVICE_NAME_BUFFER_SIZE]; 89 | nvmlPciInfo_t pci; 90 | nvmlComputeMode_t compute_mode; 91 | 92 | // Query for device handle to perform operations on a device 93 | // You can also query device handle by other features like: 94 | // nvmlDeviceGetHandleBySerial 95 | // nvmlDeviceGetHandleByPciBusId 96 | result = nvmlDeviceGetHandleByIndex(i, &device); 97 | if (NVML_SUCCESS != result) { 98 | printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result)); 99 | goto Error; 100 | } 101 | 102 | result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); 103 | if (NVML_SUCCESS != result) { 104 | printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result)); 105 | goto Error; 106 | } 107 | 108 | // pci.busId is very useful to know which device physically you're talking to 109 | // Using PCI identifier you can also match nvmlDevice handle to CUDA device. 110 | result = nvmlDeviceGetPciInfo(device, &pci); 111 | if (NVML_SUCCESS != result) { 112 | printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result)); 113 | goto Error; 114 | } 115 | 116 | printf("%u. 
%s [%s]\n", i, name, pci.busId); 117 | 118 | // This is a simple example on how you can modify GPU's state 119 | result = nvmlDeviceGetComputeMode(device, &compute_mode); 120 | if (NVML_ERROR_NOT_SUPPORTED == result) 121 | printf("\t This is not CUDA capable device\n"); 122 | else if (NVML_SUCCESS != result) { 123 | printf("Failed to get compute mode for device %u: %s\n", i, nvmlErrorString(result)); 124 | goto Error; 125 | } else { 126 | // try to change compute mode 127 | printf("\t Changing device's compute mode from '%s' to '%s'\n", convertToComputeModeString(compute_mode), 128 | convertToComputeModeString(NVML_COMPUTEMODE_PROHIBITED)); 129 | 130 | result = nvmlDeviceSetComputeMode(device, NVML_COMPUTEMODE_PROHIBITED); 131 | if (NVML_ERROR_NO_PERMISSION == result) 132 | printf("\t\t Need root privileges to do that: %s\n", nvmlErrorString(result)); 133 | else if (NVML_ERROR_NOT_SUPPORTED == result) 134 | printf( 135 | "\t\t Compute mode prohibited not supported. You might be running on\n" 136 | "\t\t windows in WDDM driver model or on non-CUDA capable GPU\n"); 137 | else if (NVML_SUCCESS != result) { 138 | printf("\t\t Failed to set compute mode for device %u: %s\n", i, nvmlErrorString(result)); 139 | goto Error; 140 | } else { 141 | printf("\t Restoring device's compute mode back to '%s'\n", convertToComputeModeString(compute_mode)); 142 | result = nvmlDeviceSetComputeMode(device, compute_mode); 143 | if (NVML_SUCCESS != result) { 144 | printf("\t\t Failed to restore compute mode for device %u: %s\n", i, nvmlErrorString(result)); 145 | goto Error; 146 | } 147 | } 148 | } 149 | } 150 | 151 | result = nvmlShutdown(); 152 | if (NVML_SUCCESS != result) 153 | printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); 154 | 155 | printf("All done.\n"); 156 | 157 | return 0; 158 | 159 | Error: 160 | result = nvmlShutdown(); 161 | if (NVML_SUCCESS != result) 162 | printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); 163 | 164 | return 1; 165 | } 166 
| -------------------------------------------------------------------------------- /sample/nvml/supported_vgpus.c: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 22:35:09 on Sun, May 29, 2022 4 | // 5 | // Description: source file in cuda/nvml/example/supportedVgpus.c 6 | 7 | /***************************************************************************\ 8 | |* *| 9 | |* Copyright 2010-2016 NVIDIA Corporation. All rights reserved. *| 10 | |* *| 11 | |* NOTICE TO USER: *| 12 | |* *| 13 | |* This source code is subject to NVIDIA ownership rights under U.S. *| 14 | |* and international Copyright laws. Users and possessors of this *| 15 | |* source code are hereby granted a nonexclusive, royalty-free *| 16 | |* license to use this code in individual and commercial software. *| 17 | |* *| 18 | |* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE *| 19 | |* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR *| 20 | |* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH *| 21 | |* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF *| 22 | |* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR *| 23 | |* PURPOSE. IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, *| 24 | |* INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES *| 25 | |* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN *| 26 | |* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING *| 27 | |* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOURCE *| 28 | |* CODE. *| 29 | |* *| 30 | |* U.S. Government End Users. This source code is a "commercial item" *| 31 | |* as that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting *| 32 | |* of "commercial computer software" and "commercial computer software *| 33 | |* documentation" as such terms are used in 48 C.F.R. 
12.212 (SEPT 1995) *| 34 | |* and is provided to the U.S. Government only as a commercial end item. *| 35 | |* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through *| 36 | |* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the *| 37 | |* source code with only those rights set forth herein. *| 38 | |* *| 39 | |* Any use of this source code in individual and commercial software must *| 40 | |* include, in the user documentation and internal comments to the code, *| 41 | |* the above Disclaimer and U.S. Government End Users Notice. *| 42 | |* *| 43 | |* *| 44 | \***************************************************************************/ 45 | 46 | #include 47 | #include 48 | #include 49 | 50 | int main(void) { 51 | nvmlReturn_t result; 52 | unsigned int device_count, i; 53 | 54 | // First initialize NVML library 55 | result = nvmlInit(); 56 | if (NVML_SUCCESS != result) { 57 | printf("Failed to initialize NVML: %s\n", nvmlErrorString(result)); 58 | return 1; 59 | } 60 | 61 | result = nvmlDeviceGetCount(&device_count); 62 | if (NVML_SUCCESS != result) { 63 | printf("Failed to query device count: %s\n", nvmlErrorString(result)); 64 | goto Error; 65 | } 66 | 67 | printf("Found %u device%s\n", device_count, device_count != 1 ? 
"s" : ""); 68 | printf("Listing devices:\n"); 69 | 70 | for (i = 0; i < device_count; i++) { 71 | nvmlDevice_t device; 72 | char name[NVML_DEVICE_NAME_BUFFER_SIZE]; 73 | nvmlPciInfo_t pci; 74 | 75 | // Query for device handle to perform operations on a device 76 | // You can also query device handle by other features like: 77 | // nvmlDeviceGetHandleBySerial 78 | // nvmlDeviceGetHandleByPciBusId 79 | result = nvmlDeviceGetHandleByIndex(i, &device); 80 | if (NVML_SUCCESS != result) { 81 | printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result)); 82 | goto Error; 83 | } 84 | 85 | result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); 86 | if (NVML_SUCCESS != result) { 87 | printf("Failed to get name of device %u: %s\n", i, nvmlErrorString(result)); 88 | goto Error; 89 | } 90 | 91 | // pci.busId is very useful to know which device physically you're talking to 92 | // Using PCI identifier you can also match nvmlDevice handle to CUDA device. 93 | result = nvmlDeviceGetPciInfo(device, &pci); 94 | if (NVML_SUCCESS != result) { 95 | printf("Failed to get pci info for device %u: %s\n", i, nvmlErrorString(result)); 96 | goto Error; 97 | } 98 | 99 | printf("%u. 
%s [%s]\n", i, name, pci.busId); 100 | 101 | // This is an example to get the supported vGPUs type names 102 | unsigned int vgpuCount = 0; 103 | nvmlVgpuTypeId_t *vgpuTypeIds = NULL; 104 | unsigned int j; 105 | 106 | result = nvmlDeviceGetSupportedVgpus(device, &vgpuCount, NULL); 107 | if (NVML_ERROR_INSUFFICIENT_SIZE != result) 108 | goto Error; 109 | 110 | if (vgpuCount != 0) { 111 | vgpuTypeIds = malloc(sizeof(nvmlVgpuTypeId_t) * vgpuCount); 112 | if (!vgpuTypeIds) { 113 | printf("Memory allocation of %d bytes failed \n", (int)(sizeof(*vgpuTypeIds) * vgpuCount)); 114 | goto Error; 115 | } 116 | 117 | result = nvmlDeviceGetSupportedVgpus(device, &vgpuCount, vgpuTypeIds); 118 | if (NVML_SUCCESS != result) { 119 | printf("Failed to get the supported vGPUs with status %d \n", (int)result); 120 | goto Error; 121 | } 122 | 123 | printf(" Displaying vGPU type names: \n"); 124 | for (j = 0; j < vgpuCount; j++) { 125 | char vgpuTypeName[NVML_DEVICE_NAME_BUFFER_SIZE]; 126 | unsigned int bufferSize = NVML_DEVICE_NAME_BUFFER_SIZE; 127 | 128 | if (NVML_SUCCESS == (result = nvmlVgpuTypeGetName(vgpuTypeIds[j], vgpuTypeName, &bufferSize))) { 129 | printf(" %s\n", vgpuTypeName); 130 | } else { 131 | printf("Failed to query the vGPU type name with status %d \n", (int)result); 132 | } 133 | } 134 | } 135 | if (vgpuTypeIds) 136 | free(vgpuTypeIds); 137 | } 138 | 139 | result = nvmlShutdown(); 140 | if (NVML_SUCCESS != result) 141 | printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); 142 | 143 | printf("All done.\n"); 144 | return 0; 145 | 146 | Error: 147 | result = nvmlShutdown(); 148 | if (NVML_SUCCESS != result) 149 | printf("Failed to shutdown NVML: %s\n", nvmlErrorString(result)); 150 | 151 | return 1; 152 | } 153 | -------------------------------------------------------------------------------- /src/common/hook.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 
2 | // Author: Bruce-Lee-LY 3 | // Date: 15:04:22 on Sun, May 29, 2022 4 | // 5 | // Description: hook 6 | 7 | #ifndef __CUDA_HOOK_HOOK_H__ 8 | #define __CUDA_HOOK_HOOK_H__ 9 | 10 | #include 11 | 12 | #include 13 | 14 | #include "macro_common.h" 15 | 16 | #define HOOK_GET_SYMBOL(type, symbol_name) \ 17 | do { \ 18 | static void *type##_handle = dlopen(s_##type##_dso, RTLD_NOW | RTLD_LOCAL); \ 19 | HOOK_CHECK(type##_handle); \ 20 | return dlsym(type##_handle, symbol_name.c_str()); \ 21 | } while (0) 22 | 23 | class Hook { 24 | public: 25 | Hook() = default; 26 | ~Hook() = default; 27 | 28 | static void *GetCUDASymbol(const std::string &symbol_name) { 29 | HOOK_GET_SYMBOL(cuda, symbol_name); 30 | } 31 | 32 | static void *GetNVMLSymbol(const std::string &symbol_name) { 33 | HOOK_GET_SYMBOL(nvml, symbol_name); 34 | } 35 | 36 | static void *GetCUDARTSymbol(const std::string &symbol_name) { 37 | HOOK_GET_SYMBOL(cudart, symbol_name); 38 | } 39 | 40 | static void *GetCUDNNSymbol(const std::string &symbol_name) { 41 | HOOK_GET_SYMBOL(cudnn, symbol_name); 42 | } 43 | 44 | static void *GetCUBLASSymbol(const std::string &symbol_name) { 45 | HOOK_GET_SYMBOL(cublas, symbol_name); 46 | } 47 | 48 | static void *GetCUBLASLTSymbol(const std::string &symbol_name) { 49 | HOOK_GET_SYMBOL(cublasLt, symbol_name); 50 | } 51 | 52 | static void *GetCUFFTSymbol(const std::string &symbol_name) { 53 | HOOK_GET_SYMBOL(cufft, symbol_name); 54 | } 55 | 56 | static void *GetNVTXSymbol(const std::string &symbol_name) { 57 | HOOK_GET_SYMBOL(nvtx, symbol_name); 58 | } 59 | 60 | static void *GetNVRTCSymbol(const std::string &symbol_name) { 61 | HOOK_GET_SYMBOL(nvrtc, symbol_name); 62 | } 63 | 64 | static void *GetCURANDSymbol(const std::string &symbol_name) { 65 | HOOK_GET_SYMBOL(curand, symbol_name); 66 | } 67 | 68 | static void *GetCUSPARSESymbol(const std::string &symbol_name) { 69 | HOOK_GET_SYMBOL(cusparse, symbol_name); 70 | } 71 | 72 | static void *GetCUSOLVERSymbol(const std::string 
&symbol_name) { 73 | HOOK_GET_SYMBOL(cusolver, symbol_name); 74 | } 75 | 76 | static void *GetNVJPEGSymbol(const std::string &symbol_name) { 77 | HOOK_GET_SYMBOL(nvjpeg, symbol_name); 78 | } 79 | 80 | static void *GetNVBLASSymbol(const std::string &symbol_name) { 81 | HOOK_GET_SYMBOL(nvblas, symbol_name); 82 | } 83 | 84 | private: 85 | // nvidia native cuda dynamic library can be modified to any other unambiguous name, or moved to any path 86 | static constexpr const char *s_cuda_dso = "/usr/lib/x86_64-linux-gnu/libcuda.so"; 87 | static constexpr const char *s_nvml_dso = "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so"; 88 | static constexpr const char *s_cudart_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcudart.so"; 89 | static constexpr const char *s_cudnn_dso = "/usr/local/cudnn/lib64/libcudnn.so"; 90 | static constexpr const char *s_cublas_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcublas.so"; 91 | static constexpr const char *s_cublasLt_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so"; 92 | static constexpr const char *s_cufft_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcufft.so"; 93 | static constexpr const char *s_nvtx_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvToolsExt.so"; 94 | static constexpr const char *s_nvrtc_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvrtc.so"; 95 | static constexpr const char *s_curand_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcurand.so"; 96 | static constexpr const char *s_cusparse_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcusparse.so"; 97 | static constexpr const char *s_cusolver_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libcusolver.so"; 98 | static constexpr const char *s_nvjpeg_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg.so"; 99 | static constexpr const char *s_nvblas_dso = "/usr/local/cuda/targets/x86_64-linux/lib/libnvblas.so"; 100 | 101 | HOOK_DISALLOW_COPY_AND_ASSIGN(Hook); 102 | }; 103 | 104 | #define HOOK_CUDA_SYMBOL(symbol_name) 
Hook::GetCUDASymbol(symbol_name) 105 | #define HOOK_NVML_SYMBOL(symbol_name) Hook::GetNVMLSymbol(symbol_name) 106 | #define HOOK_CUDART_SYMBOL(symbol_name) Hook::GetCUDARTSymbol(symbol_name) 107 | #define HOOK_CUDNN_SYMBOL(symbol_name) Hook::GetCUDNNSymbol(symbol_name) 108 | #define HOOK_CUBLAS_SYMBOL(symbol_name) Hook::GetCUBLASSymbol(symbol_name) 109 | #define HOOK_CUBLASLT_SYMBOL(symbol_name) Hook::GetCUBLASLTSymbol(symbol_name) 110 | #define HOOK_CUFFT_SYMBOL(symbol_name) Hook::GetCUFFTSymbol(symbol_name) 111 | #define HOOK_NVTX_SYMBOL(symbol_name) Hook::GetNVTXSymbol(symbol_name) 112 | #define HOOK_NVRTC_SYMBOL(symbol_name) Hook::GetNVRTCSymbol(symbol_name) 113 | #define HOOK_CURAND_SYMBOL(symbol_name) Hook::GetCURANDSymbol(symbol_name) 114 | #define HOOK_CUSPARSE_SYMBOL(symbol_name) Hook::GetCUSPARSESymbol(symbol_name) 115 | #define HOOK_CUSOLVER_SYMBOL(symbol_name) Hook::GetCUSOLVERSymbol(symbol_name) 116 | #define HOOK_NVJPEG_SYMBOL(symbol_name) Hook::GetNVJPEGSymbol(symbol_name) 117 | #define HOOK_NVBLAS_SYMBOL(symbol_name) Hook::GetNVBLASSymbol(symbol_name) 118 | 119 | #endif // __CUDA_HOOK_HOOK_H__ 120 | -------------------------------------------------------------------------------- /src/common/macro_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 
// Linkage and symbol-visibility decorations for exported hook entry points.
#define HOOK_C_API extern "C"
#define HOOK_DECL_EXPORT __attribute__((visibility("default")))

// Branch-prediction hints for the compiler.
#define HOOK_LIKELY(x) __builtin_expect(!!(x), 1)
#define HOOK_UNLIKELY(x) __builtin_expect(!!(x), 0)

// Returns the current local time formatted as "YYYY-MM-DD HH:MM:SS".
// The returned pointer refers to a per-thread buffer that is overwritten by the next call
// on the same thread, so threads logging concurrently never share storage. localtime_r is
// used instead of localtime for the same reason. If the time cannot be converted, an empty
// string is returned.
inline char *curr_time() {
    thread_local char now_time[64];

    const time_t raw_time = time(nullptr);
    struct tm time_info;
    if (localtime_r(&raw_time, &time_info) == nullptr) {
        now_time[0] = '\0';
        return now_time;
    }

    // strftime returns the number of characters written (0 if the buffer were too small).
    now_time[strftime(now_time, sizeof(now_time), "%Y-%m-%d %H:%M:%S", &time_info)] = '\0';

    return now_time;
}

// Returns the process id; the value is read once, on the first call, and cached.
inline int get_pid() {
    static int pid = getpid();

    return pid;
}

// Returns the calling thread's kernel thread id; cached per thread on first use.
inline long int get_tid() {
    thread_local long int tid = syscall(SYS_gettid);

    return tid;
}

#define HOOK_LOG_TAG "CUDA-HOOK"
// Strips the directory part of a path: yields the text after the last '/', or x itself.
#define HOOK_LOG_FILE(x) (strrchr(x, '/') ? (strrchr(x, '/') + 1) : x)
// Writes one line to stderr: [tag time pid:tid file:line function] message
#define HLOG(format, ...)                                                                                        \
    do {                                                                                                         \
        fprintf(stderr, "[%s %s %d:%ld %s:%d %s] " format "\n", HOOK_LOG_TAG, curr_time(), get_pid(), get_tid(), \
                HOOK_LOG_FILE(__FILE__), __LINE__, __FUNCTION__, ##__VA_ARGS__);                                 \
    } while (0)

// Logs the failed expression and terminates the process when x evaluates to false.
#define HOOK_CHECK(x)                     \
    do {                                  \
        if (HOOK_UNLIKELY(!(x))) {        \
            HLOG("Check failed: %s", #x); \
            exit(EXIT_FAILURE);           \
        }                                 \
    } while (0)

// Deletes the copy constructor and copy assignment of TypeName; place inside the class body.
#define HOOK_DISALLOW_COPY_AND_ASSIGN(TypeName) \
    TypeName(const TypeName &) = delete;        \
    void operator=(const TypeName &) = delete;
2 | // Author: Bruce-Lee-LY 3 | // Date: 14:54:28 on Sun, May 29, 2022 4 | // 5 | // Description: trace and profile 6 | 7 | #ifndef __CUDA_HOOK_TRACE_PROFILE_H__ 8 | #define __CUDA_HOOK_TRACE_PROFILE_H__ 9 | 10 | #include 11 | #include 12 | 13 | #include "macro_common.h" 14 | 15 | class TraceProfile { 16 | public: 17 | TraceProfile(const std::string &name) : m_name(name), m_start(std::chrono::steady_clock::now()) { 18 | HLOG("%s enter", m_name.c_str()); 19 | } 20 | 21 | ~TraceProfile() { 22 | m_end = std::chrono::steady_clock::now(); 23 | m_duration = std::chrono::duration_cast(m_end - m_start); 24 | HLOG("%s exit, taken %.3lf ms", m_name.c_str(), m_duration.count()); 25 | } 26 | 27 | private: 28 | const std::string m_name; 29 | std::chrono::steady_clock::time_point m_start; 30 | std::chrono::steady_clock::time_point m_end; 31 | std::chrono::duration m_duration; 32 | 33 | HOOK_DISALLOW_COPY_AND_ASSIGN(TraceProfile); 34 | }; 35 | 36 | #ifdef HOOK_BUILD_DEBUG 37 | #define HOOK_TRACE_PROFILE(name) TraceProfile _tp_##name_(name) 38 | #else 39 | #define HOOK_TRACE_PROFILE(name) 40 | #endif 41 | 42 | #endif // __CUDA_HOOK_TRACE_PROFILE_H__ 43 | -------------------------------------------------------------------------------- /src/cublas/cublas_subset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 
#ifdef __cplusplus
extern "C" {
#endif

/* Opaque half-precision type; only ever handled through pointers here. */
typedef struct __half __half;

/* Element type tags shared by the CUDA math libraries. */
typedef enum cudaDataType_t {
    CUDA_R_16F = 2,  /* real as a half */
    CUDA_C_16F = 6,  /* complex as a pair of half numbers */
    CUDA_R_32F = 0,  /* real as a float */
    CUDA_C_32F = 4,  /* complex as a pair of float numbers */
    CUDA_R_64F = 1,  /* real as a double */
    CUDA_C_64F = 5,  /* complex as a pair of double numbers */
    CUDA_R_8I = 3,   /* real as a signed char */
    CUDA_C_8I = 7,   /* complex as a pair of signed char numbers */
    CUDA_R_8U = 8,   /* real as an unsigned char */
    CUDA_C_8U = 9,   /* complex as a pair of unsigned char numbers */
    CUDA_R_32I = 10, /* real as a signed int */
    CUDA_C_32I = 11, /* complex as a pair of signed int numbers */
    CUDA_R_32U = 12, /* real as an unsigned int */
    CUDA_C_32U = 13  /* complex as a pair of unsigned int numbers */
} cudaDataType;

typedef enum libraryPropertyType_t { MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL } libraryPropertyType;

/* Stand-ins for the CUDA built-in vector types. The alignment attributes reproduce the
 * 8-byte (float2) and 16-byte (double2) alignment of the real definitions, so that values
 * exchanged with the native libraries have the same layout requirements. */
struct __attribute__((aligned(8))) float2 {
    float x, y;
};

/* `struct` keyword kept so the typedef is valid when the header is compiled as C as well. */
typedef struct float2 cuFloatComplex;
/* aliases */
typedef cuFloatComplex cuComplex;

struct __attribute__((aligned(16))) double2 {
    double x, y;
};

/* Double precision */
typedef struct double2 cuDoubleComplex;

/* CUBLAS data types */
#define cublasStatus cublasStatus_t

#define CUBLAS_VER_MAJOR 11
#define CUBLAS_VER_MINOR 6
#define CUBLAS_VER_PATCH 5
#define CUBLAS_VER_BUILD 2
#define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 1000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH)

/* CUBLAS status type returns */
typedef enum {
    CUBLAS_STATUS_SUCCESS = 0,
    CUBLAS_STATUS_NOT_INITIALIZED = 1,
    CUBLAS_STATUS_ALLOC_FAILED = 3,
    CUBLAS_STATUS_INVALID_VALUE = 7,
    CUBLAS_STATUS_ARCH_MISMATCH = 8,
    CUBLAS_STATUS_MAPPING_ERROR = 11,
    CUBLAS_STATUS_EXECUTION_FAILED = 13,
    CUBLAS_STATUS_INTERNAL_ERROR = 14,
    CUBLAS_STATUS_NOT_SUPPORTED = 15,
    CUBLAS_STATUS_LICENSE_ERROR = 16
} cublasStatus_t;

typedef enum { CUBLAS_FILL_MODE_LOWER = 0, CUBLAS_FILL_MODE_UPPER = 1, CUBLAS_FILL_MODE_FULL = 2 } cublasFillMode_t;

typedef enum { CUBLAS_DIAG_NON_UNIT = 0, CUBLAS_DIAG_UNIT = 1 } cublasDiagType_t;

typedef enum { CUBLAS_SIDE_LEFT = 0, CUBLAS_SIDE_RIGHT = 1 } cublasSideMode_t;

typedef enum {
    CUBLAS_OP_N = 0,
    CUBLAS_OP_T = 1,
    CUBLAS_OP_C = 2,
    CUBLAS_OP_HERMITAN = 2, /* synonym of CUBLAS_OP_C */
    CUBLAS_OP_CONJG = 3     /* conjugate, placeholder - not supported in the current release */
} cublasOperation_t;

typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t;

typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t;

/* For different GEMM algorithms */
typedef enum {
    CUBLAS_GEMM_DFALT = -1,
    CUBLAS_GEMM_DEFAULT = -1,
    CUBLAS_GEMM_ALGO0 = 0,
    CUBLAS_GEMM_ALGO1 = 1,
    CUBLAS_GEMM_ALGO2 = 2,
    CUBLAS_GEMM_ALGO3 = 3,
    CUBLAS_GEMM_ALGO4 = 4,
    CUBLAS_GEMM_ALGO5 = 5,
    CUBLAS_GEMM_ALGO6 = 6,
    CUBLAS_GEMM_ALGO7 = 7,
    CUBLAS_GEMM_ALGO8 = 8,
    CUBLAS_GEMM_ALGO9 = 9,
    CUBLAS_GEMM_ALGO10 = 10,
    CUBLAS_GEMM_ALGO11 = 11,
    CUBLAS_GEMM_ALGO12 = 12,
    CUBLAS_GEMM_ALGO13 = 13,
    CUBLAS_GEMM_ALGO14 = 14,
    CUBLAS_GEMM_ALGO15 = 15,
    CUBLAS_GEMM_ALGO16 = 16,
    CUBLAS_GEMM_ALGO17 = 17,
    CUBLAS_GEMM_ALGO18 = 18,  // sliced 32x32
    CUBLAS_GEMM_ALGO19 = 19,  // sliced 64x32
    CUBLAS_GEMM_ALGO20 = 20,  // sliced 128x32
    CUBLAS_GEMM_ALGO21 = 21,  // sliced 32x32 -splitK
    CUBLAS_GEMM_ALGO22 = 22,  // sliced 64x32 -splitK
    CUBLAS_GEMM_ALGO23 = 23,  // sliced 128x32 -splitK
    CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99,
    CUBLAS_GEMM_DFALT_TENSOR_OP = 99,
    CUBLAS_GEMM_ALGO0_TENSOR_OP = 100,
    CUBLAS_GEMM_ALGO1_TENSOR_OP = 101,
    CUBLAS_GEMM_ALGO2_TENSOR_OP = 102,
    CUBLAS_GEMM_ALGO3_TENSOR_OP = 103,
    CUBLAS_GEMM_ALGO4_TENSOR_OP = 104,
    CUBLAS_GEMM_ALGO5_TENSOR_OP = 105,
    CUBLAS_GEMM_ALGO6_TENSOR_OP = 106,
    CUBLAS_GEMM_ALGO7_TENSOR_OP = 107,
    CUBLAS_GEMM_ALGO8_TENSOR_OP = 108,
    CUBLAS_GEMM_ALGO9_TENSOR_OP = 109,
    CUBLAS_GEMM_ALGO10_TENSOR_OP = 110,
    CUBLAS_GEMM_ALGO11_TENSOR_OP = 111,
    CUBLAS_GEMM_ALGO12_TENSOR_OP = 112,
    CUBLAS_GEMM_ALGO13_TENSOR_OP = 113,
    CUBLAS_GEMM_ALGO14_TENSOR_OP = 114,
    CUBLAS_GEMM_ALGO15_TENSOR_OP = 115
} cublasGemmAlgo_t;

/* Enum for default math mode/tensor operation */
typedef enum {
    CUBLAS_DEFAULT_MATH = 0,

    /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */
    CUBLAS_TENSOR_OP_MATH = 1,

    /* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with
       cudaDataType as compute type */
    CUBLAS_PEDANTIC_MATH = 2,

    /* allow accelerating single precision routines using TF32 tensor cores */
    CUBLAS_TF32_TENSOR_OP_MATH = 3,

    /* flag to force any reductions to use the accumulator type and not output type in case of mixed precision
       routines with lower size output type */
    CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16,
} cublasMath_t;

/* For backward compatibility purposes */
typedef cudaDataType cublasDataType_t;

/* Enum for compute type
 *
 * - default types provide best available performance using all available hardware features
 *   and guarantee internal storage precision with at least the same precision and range;
 * - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format;
 * - _FAST types allow for some loss of precision to enable higher throughput arithmetic.
 */
typedef enum {
    CUBLAS_COMPUTE_16F = 64,           /* half - default */
    CUBLAS_COMPUTE_16F_PEDANTIC = 65,  /* half - pedantic */
    CUBLAS_COMPUTE_32F = 68,           /* float - default */
    CUBLAS_COMPUTE_32F_PEDANTIC = 69,  /* float - pedantic */
    CUBLAS_COMPUTE_32F_FAST_16F = 74,  /* float - fast, allows down-converting inputs to half or TF32 */
    CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */
    CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */
    CUBLAS_COMPUTE_64F = 70,           /* double - default */
    CUBLAS_COMPUTE_64F_PEDANTIC = 71,  /* double - pedantic */
    CUBLAS_COMPUTE_32I = 72,           /* signed 32-bit int - default */
    CUBLAS_COMPUTE_32I_PEDANTIC = 73,  /* signed 32-bit int - pedantic */
} cublasComputeType_t;

/* Opaque structure holding CUBLAS library context */
struct cublasContext;
typedef struct cublasContext *cublasHandle_t;

/* Cublas logging */
typedef void (*cublasLogCallback)(const char *msg);

/* Opaque structure holding the cublasXt context */
struct cublasXtContext;
typedef struct cublasXtContext *cublasXtHandle_t;

typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;

/* These routines provide CPU BLAS routines, used for too small sizes or hybrid computation */
typedef enum {
    CUBLASXT_FLOAT = 0,
    CUBLASXT_DOUBLE = 1,
    CUBLASXT_COMPLEX = 2,
    CUBLASXT_DOUBLECOMPLEX = 3,
} cublasXtOpType_t;

typedef enum {
    CUBLASXT_GEMM = 0,
    CUBLASXT_SYRK = 1,
    CUBLASXT_HERK = 2,
    CUBLASXT_SYMM = 3,
    CUBLASXT_HEMM = 4,
    CUBLASXT_TRSM = 5,
    CUBLASXT_SYR2K = 6,
    CUBLASXT_HER2K = 7,

    CUBLASXT_SPMM = 8,
    CUBLASXT_SYRKX = 9,
    CUBLASXT_HERKX = 10,
    CUBLASXT_TRMM = 11,
    CUBLASXT_ROUTINE_MAX = 12,
} cublasXtBlasOp_t;

#ifdef __cplusplus
}
#endif
#ifdef __cplusplus
extern "C" {
#endif

// Version components of the cuFFT release these declarations mirror.
#define CUFFT_VER_MAJOR 10
#define CUFFT_VER_MINOR 5
#define CUFFT_VER_PATCH 2
#define CUFFT_VER_BUILD 100

// cuFFT library version
//
// CUFFT_VERSION / 1000 - major version
// CUFFT_VERSION / 100 % 100 - minor version
// CUFFT_VERSION % 100 - patch level
#define CUFFT_VERSION 10502

// CUFFT API function return values
typedef enum cufftResult_t {
    CUFFT_SUCCESS = 0x0,
    CUFFT_INVALID_PLAN = 0x1,
    CUFFT_ALLOC_FAILED = 0x2,
    CUFFT_INVALID_TYPE = 0x3,
    CUFFT_INVALID_VALUE = 0x4,
    CUFFT_INTERNAL_ERROR = 0x5,
    CUFFT_EXEC_FAILED = 0x6,
    CUFFT_SETUP_FAILED = 0x7,
    CUFFT_INVALID_SIZE = 0x8,
    CUFFT_UNALIGNED_DATA = 0x9,
    CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
    CUFFT_INVALID_DEVICE = 0xB,
    CUFFT_PARSE_ERROR = 0xC,
    CUFFT_NO_WORKSPACE = 0xD,
    CUFFT_NOT_IMPLEMENTED = 0xE,
    CUFFT_LICENSE_ERROR = 0x0F,
    CUFFT_NOT_SUPPORTED = 0x10

} cufftResult;

// One past the largest cufftResult value declared above.
#define MAX_CUFFT_ERROR 0x11

// CUFFT defines and supports the following data types

// cufftReal is a single-precision, floating-point real data type.
// cufftDoubleReal is a double-precision, real data type.
typedef float cufftReal;
typedef double cufftDoubleReal;

// cufftComplex is a single-precision, floating-point complex data type that
// consists of interleaved real and imaginary components.
// cufftDoubleComplex is the double-precision equivalent.
// Both are aliases of the complex types declared in cublas_subset.h.
typedef cuComplex cufftComplex;
typedef cuDoubleComplex cufftDoubleComplex;

// CUFFT transform directions
#define CUFFT_FORWARD -1  // Forward FFT
#define CUFFT_INVERSE 1   // Inverse FFT

// CUFFT supports the following transform types
typedef enum cufftType_t {
    CUFFT_R2C = 0x2a,  // Real to Complex (interleaved)
    CUFFT_C2R = 0x2c,  // Complex (interleaved) to Real
    CUFFT_C2C = 0x29,  // Complex to Complex, interleaved
    CUFFT_D2Z = 0x6a,  // Double to Double-Complex
    CUFFT_Z2D = 0x6c,  // Double-Complex to Double
    CUFFT_Z2Z = 0x69   // Double-Complex to Double-Complex
} cufftType;

// CUFFT supports the following data layouts
typedef enum cufftCompatibility_t {
    CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01  // The default value
} cufftCompatibility;

#define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING

//
// structure definition used by the shim between old and new APIs
//
#define MAX_SHIM_RANK 3

// cufftHandle is a handle type used to store and access CUFFT plans.
typedef int cufftHandle;

//
// cufftXtSubFormat identifies the data layout of
// a memory descriptor owned by cufft.
// note that multi GPU cufft does not yet support out-of-place transforms
//

typedef enum cufftXtSubFormat_t {
    CUFFT_XT_FORMAT_INPUT = 0x00,              // by default input is in linear order across GPUs
    CUFFT_XT_FORMAT_OUTPUT = 0x01,             // by default output is in scrambled order depending on transform
    CUFFT_XT_FORMAT_INPLACE = 0x02,            // by default inplace is input order, which is linear across GPUs
    CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03,   // shuffled output order after execution of the transform
    CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04,  // shuffled input order prior to execution of 1D transforms
    CUFFT_FORMAT_UNDEFINED = 0x05
} cufftXtSubFormat;

//
// cufftXtCopyType specifies the type of copy for cufftXtMemcpy
//
typedef enum cufftXtCopyType_t {
    CUFFT_COPY_HOST_TO_DEVICE = 0x00,
    CUFFT_COPY_DEVICE_TO_HOST = 0x01,
    CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
    CUFFT_COPY_UNDEFINED = 0x03
} cufftXtCopyType;

//
// cufftXtQueryType specifies the type of query for cufftXtQueryPlan
//
typedef enum cufftXtQueryType_t { CUFFT_QUERY_1D_FACTORS = 0x00, CUFFT_QUERY_UNDEFINED = 0x01 } cufftXtQueryType;

// Plain data record of 1D factorization parameters.
// NOTE(review): presumably the result filled in for a CUFFT_QUERY_1D_FACTORS query - confirm against the cuFFT docs.
typedef struct cufftXt1dFactors_t {
    long long int size;
    long long int stringCount;
    long long int stringLength;
    long long int substringLength;
    long long int factor1;
    long long int factor2;
    long long int stringMask;
    long long int substringMask;
    long long int factor1Mask;
    long long int factor2Mask;
    int stringShift;
    int substringShift;
    int factor1Shift;
    int factor2Shift;
} cufftXt1dFactors;

//
// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
//
typedef enum cufftXtWorkAreaPolicy_t {
    CUFFT_WORKAREA_MINIMAL = 0,     /* maximum reduction */
    CUFFT_WORKAREA_USER = 1,        /* use workSize parameter as limit */
    CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
} cufftXtWorkAreaPolicy;

// callbacks

// Selects the callback kind: LD = load, ST = store, for each of the four element types.
typedef enum cufftXtCallbackType_t {
    CUFFT_CB_LD_COMPLEX = 0x0,
    CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
    CUFFT_CB_LD_REAL = 0x2,
    CUFFT_CB_LD_REAL_DOUBLE = 0x3,
    CUFFT_CB_ST_COMPLEX = 0x4,
    CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
    CUFFT_CB_ST_REAL = 0x6,
    CUFFT_CB_ST_REAL_DOUBLE = 0x7,
    CUFFT_CB_UNDEFINED = 0x8

} cufftXtCallbackType;

// Load callbacks: return one element of the given type read from dataIn at offset.
typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
typedef cufftDoubleReal (*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);

// Store callbacks: receive one element of the given type to be written to dataOut at offset.
typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo,
                                    void *sharedPointer);
typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo,
                                    void *sharedPointer);
typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo,
                                    void *sharedPointer);
typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo,
                                    void *sharedPointer);

#define CUDA_XT_DESCRIPTOR_VERSION 0x01000000  // This is added to CUDART_VERSION

// Copy directions for the library-level Xt descriptors.
enum cudaXtCopyType_t { LIB_XT_COPY_HOST_TO_DEVICE, LIB_XT_COPY_DEVICE_TO_HOST, LIB_XT_COPY_DEVICE_TO_DEVICE };
typedef enum cudaXtCopyType_t cudaLibXtCopyType;

// Identifies which library understands a cudaLibXtDesc (see its `library` field).
enum libFormat_t { LIB_FORMAT_CUFFT = 0x0, LIB_FORMAT_UNDEFINED = 0x1 };

typedef enum libFormat_t libFormat;

// Capacity of the per-GPU arrays in cudaXtDesc_t.
#define MAX_CUDA_DESCRIPTOR_GPUS 64

struct cudaXtDesc_t {
    int version;                            // descriptor version
    int nGPUs;                              // number of GPUs
    int GPUs[MAX_CUDA_DESCRIPTOR_GPUS];     // array of device IDs
    void *data[MAX_CUDA_DESCRIPTOR_GPUS];   // array of pointers to data, one per GPU
    size_t size[MAX_CUDA_DESCRIPTOR_GPUS];  // array of data sizes, one per GPU
    void *cudaXtState;                      // opaque CUDA utility structure
};
typedef struct cudaXtDesc_t cudaXtDesc;

struct cudaLibXtDesc_t {
    int version;             // descriptor version
    cudaXtDesc *descriptor;  // multi-GPU memory descriptor
    libFormat library;       // which library recognizes the format
    int subFormat;           // library specific enumerator of sub formats
    void *libDescriptor;     // library specific descriptor e.g. FFT transform plan object
};
typedef struct cudaLibXtDesc_t cudaLibXtDesc;

#ifdef __cplusplus
}
#endif
2 | // Author: Bruce-Lee-LY 3 | // Date: 17:19:12 on Sun, May 29, 2022 4 | // 5 | // Description: auto generate 29 apis 6 | 7 | #include "cublas_subset.h" 8 | #include "curand_subset.h" 9 | #include "hook.h" 10 | #include "macro_common.h" 11 | #include "trace_profile.h" 12 | 13 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandCreateGenerator(curandGenerator_t *generator, 14 | curandRngType_t rng_type) { 15 | HOOK_TRACE_PROFILE("curandCreateGenerator"); 16 | using func_ptr = curandStatus_t (*)(curandGenerator_t *, curandRngType_t); 17 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandCreateGenerator")); 18 | HOOK_CHECK(func_entry); 19 | return func_entry(generator, rng_type); 20 | } 21 | 22 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandCreateGeneratorHost(curandGenerator_t *generator, 23 | curandRngType_t rng_type) { 24 | HOOK_TRACE_PROFILE("curandCreateGeneratorHost"); 25 | using func_ptr = curandStatus_t (*)(curandGenerator_t *, curandRngType_t); 26 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandCreateGeneratorHost")); 27 | HOOK_CHECK(func_entry); 28 | return func_entry(generator, rng_type); 29 | } 30 | 31 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandDestroyGenerator(curandGenerator_t generator) { 32 | HOOK_TRACE_PROFILE("curandDestroyGenerator"); 33 | using func_ptr = curandStatus_t (*)(curandGenerator_t); 34 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandDestroyGenerator")); 35 | HOOK_CHECK(func_entry); 36 | return func_entry(generator); 37 | } 38 | 39 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetVersion(int *version) { 40 | HOOK_TRACE_PROFILE("curandGetVersion"); 41 | using func_ptr = curandStatus_t (*)(int *); 42 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGetVersion")); 43 | HOOK_CHECK(func_entry); 44 | return func_entry(version); 45 | } 46 | 47 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetProperty(libraryPropertyType type, int *value) { 
48 | HOOK_TRACE_PROFILE("curandGetProperty"); 49 | using func_ptr = curandStatus_t (*)(libraryPropertyType, int *); 50 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGetProperty")); 51 | HOOK_CHECK(func_entry); 52 | return func_entry(type, value); 53 | } 54 | 55 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetStream(curandGenerator_t generator, cudaStream_t stream) { 56 | HOOK_TRACE_PROFILE("curandSetStream"); 57 | using func_ptr = curandStatus_t (*)(curandGenerator_t, cudaStream_t); 58 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandSetStream")); 59 | HOOK_CHECK(func_entry); 60 | return func_entry(generator, stream); 61 | } 62 | 63 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetPseudoRandomGeneratorSeed(curandGenerator_t generator, 64 | unsigned long long seed) { 65 | HOOK_TRACE_PROFILE("curandSetPseudoRandomGeneratorSeed"); 66 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned long long); 67 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandSetPseudoRandomGeneratorSeed")); 68 | HOOK_CHECK(func_entry); 69 | return func_entry(generator, seed); 70 | } 71 | 72 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetGeneratorOffset(curandGenerator_t generator, 73 | unsigned long long offset) { 74 | HOOK_TRACE_PROFILE("curandSetGeneratorOffset"); 75 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned long long); 76 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandSetGeneratorOffset")); 77 | HOOK_CHECK(func_entry); 78 | return func_entry(generator, offset); 79 | } 80 | 81 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetGeneratorOrdering(curandGenerator_t generator, 82 | curandOrdering_t order) { 83 | HOOK_TRACE_PROFILE("curandSetGeneratorOrdering"); 84 | using func_ptr = curandStatus_t (*)(curandGenerator_t, curandOrdering_t); 85 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandSetGeneratorOrdering")); 86 | 
HOOK_CHECK(func_entry); 87 | return func_entry(generator, order); 88 | } 89 | 90 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandSetQuasiRandomGeneratorDimensions(curandGenerator_t generator, 91 | unsigned int num_dimensions) { 92 | HOOK_TRACE_PROFILE("curandSetQuasiRandomGeneratorDimensions"); 93 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned int); 94 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandSetQuasiRandomGeneratorDimensions")); 95 | HOOK_CHECK(func_entry); 96 | return func_entry(generator, num_dimensions); 97 | } 98 | 99 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerate(curandGenerator_t generator, unsigned int *outputPtr, 100 | size_t num) { 101 | HOOK_TRACE_PROFILE("curandGenerate"); 102 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned int *, size_t); 103 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerate")); 104 | HOOK_CHECK(func_entry); 105 | return func_entry(generator, outputPtr, num); 106 | } 107 | 108 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateLongLong(curandGenerator_t generator, 109 | unsigned long long *outputPtr, size_t num) { 110 | HOOK_TRACE_PROFILE("curandGenerateLongLong"); 111 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned long long *, size_t); 112 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateLongLong")); 113 | HOOK_CHECK(func_entry); 114 | return func_entry(generator, outputPtr, num); 115 | } 116 | 117 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateUniform(curandGenerator_t generator, float *outputPtr, 118 | size_t num) { 119 | HOOK_TRACE_PROFILE("curandGenerateUniform"); 120 | using func_ptr = curandStatus_t (*)(curandGenerator_t, float *, size_t); 121 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateUniform")); 122 | HOOK_CHECK(func_entry); 123 | return func_entry(generator, outputPtr, num); 124 | } 125 | 126 | HOOK_C_API 
HOOK_DECL_EXPORT curandStatus_t curandGenerateUniformDouble(curandGenerator_t generator, double *outputPtr, 127 | size_t num) { 128 | HOOK_TRACE_PROFILE("curandGenerateUniformDouble"); 129 | using func_ptr = curandStatus_t (*)(curandGenerator_t, double *, size_t); 130 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateUniformDouble")); 131 | HOOK_CHECK(func_entry); 132 | return func_entry(generator, outputPtr, num); 133 | } 134 | 135 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateNormal(curandGenerator_t generator, float *outputPtr, size_t n, 136 | float mean, float stddev) { 137 | HOOK_TRACE_PROFILE("curandGenerateNormal"); 138 | using func_ptr = curandStatus_t (*)(curandGenerator_t, float *, size_t, float, float); 139 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateNormal")); 140 | HOOK_CHECK(func_entry); 141 | return func_entry(generator, outputPtr, n, mean, stddev); 142 | } 143 | 144 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateNormalDouble(curandGenerator_t generator, double *outputPtr, 145 | size_t n, double mean, double stddev) { 146 | HOOK_TRACE_PROFILE("curandGenerateNormalDouble"); 147 | using func_ptr = curandStatus_t (*)(curandGenerator_t, double *, size_t, double, double); 148 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateNormalDouble")); 149 | HOOK_CHECK(func_entry); 150 | return func_entry(generator, outputPtr, n, mean, stddev); 151 | } 152 | 153 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateLogNormal(curandGenerator_t generator, float *outputPtr, 154 | size_t n, float mean, float stddev) { 155 | HOOK_TRACE_PROFILE("curandGenerateLogNormal"); 156 | using func_ptr = curandStatus_t (*)(curandGenerator_t, float *, size_t, float, float); 157 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateLogNormal")); 158 | HOOK_CHECK(func_entry); 159 | return func_entry(generator, outputPtr, n, mean, stddev); 160 
| } 161 | 162 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr, 163 | size_t n, double mean, double stddev) { 164 | HOOK_TRACE_PROFILE("curandGenerateLogNormalDouble"); 165 | using func_ptr = curandStatus_t (*)(curandGenerator_t, double *, size_t, double, double); 166 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateLogNormalDouble")); 167 | HOOK_CHECK(func_entry); 168 | return func_entry(generator, outputPtr, n, mean, stddev); 169 | } 170 | 171 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t 172 | curandCreatePoissonDistribution(double lambda, curandDiscreteDistribution_t *discrete_distribution) { 173 | HOOK_TRACE_PROFILE("curandCreatePoissonDistribution"); 174 | using func_ptr = curandStatus_t (*)(double, curandDiscreteDistribution_t *); 175 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandCreatePoissonDistribution")); 176 | HOOK_CHECK(func_entry); 177 | return func_entry(lambda, discrete_distribution); 178 | } 179 | 180 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t 181 | curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution) { 182 | HOOK_TRACE_PROFILE("curandDestroyDistribution"); 183 | using func_ptr = curandStatus_t (*)(curandDiscreteDistribution_t); 184 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandDestroyDistribution")); 185 | HOOK_CHECK(func_entry); 186 | return func_entry(discrete_distribution); 187 | } 188 | 189 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGeneratePoisson(curandGenerator_t generator, unsigned int *outputPtr, 190 | size_t n, double lambda) { 191 | HOOK_TRACE_PROFILE("curandGeneratePoisson"); 192 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned int *, size_t, double); 193 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGeneratePoisson")); 194 | HOOK_CHECK(func_entry); 195 | return func_entry(generator, outputPtr, n, lambda); 196 | } 
197 | 198 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGeneratePoissonMethod(curandGenerator_t generator, 199 | unsigned int *outputPtr, size_t n, double lambda, 200 | curandMethod_t method) { 201 | HOOK_TRACE_PROFILE("curandGeneratePoissonMethod"); 202 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned int *, size_t, double, curandMethod_t); 203 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGeneratePoissonMethod")); 204 | HOOK_CHECK(func_entry); 205 | return func_entry(generator, outputPtr, n, lambda, method); 206 | } 207 | 208 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateBinomial(curandGenerator_t generator, unsigned int *outputPtr, 209 | size_t num, unsigned int n, double p) { 210 | HOOK_TRACE_PROFILE("curandGenerateBinomial"); 211 | using func_ptr = curandStatus_t (*)(curandGenerator_t, unsigned int *, size_t, unsigned int, double); 212 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateBinomial")); 213 | HOOK_CHECK(func_entry); 214 | return func_entry(generator, outputPtr, num, n, p); 215 | } 216 | 217 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateBinomialMethod(curandGenerator_t generator, 218 | unsigned int *outputPtr, size_t num, 219 | unsigned int n, double p, 220 | curandMethod_t method) { 221 | HOOK_TRACE_PROFILE("curandGenerateBinomialMethod"); 222 | using func_ptr = 223 | curandStatus_t (*)(curandGenerator_t, unsigned int *, size_t, unsigned int, double, curandMethod_t); 224 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateBinomialMethod")); 225 | HOOK_CHECK(func_entry); 226 | return func_entry(generator, outputPtr, num, n, p, method); 227 | } 228 | 229 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGenerateSeeds(curandGenerator_t generator) { 230 | HOOK_TRACE_PROFILE("curandGenerateSeeds"); 231 | using func_ptr = curandStatus_t (*)(curandGenerator_t); 232 | static auto func_entry = 
reinterpret_cast(HOOK_CURAND_SYMBOL("curandGenerateSeeds")); 233 | HOOK_CHECK(func_entry); 234 | return func_entry(generator); 235 | } 236 | 237 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetDirectionVectors32(curandDirectionVectors32_t *vectors, 238 | curandDirectionVectorSet_t set) { 239 | HOOK_TRACE_PROFILE("curandGetDirectionVectors32"); 240 | using func_ptr = curandStatus_t (*)(curandDirectionVectors32_t *, curandDirectionVectorSet_t); 241 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGetDirectionVectors32")); 242 | HOOK_CHECK(func_entry); 243 | return func_entry(vectors, set); 244 | } 245 | 246 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetScrambleConstants32(unsigned int **constants) { 247 | HOOK_TRACE_PROFILE("curandGetScrambleConstants32"); 248 | using func_ptr = curandStatus_t (*)(unsigned int **); 249 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGetScrambleConstants32")); 250 | HOOK_CHECK(func_entry); 251 | return func_entry(constants); 252 | } 253 | 254 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetDirectionVectors64(curandDirectionVectors64_t *vectors, 255 | curandDirectionVectorSet_t set) { 256 | HOOK_TRACE_PROFILE("curandGetDirectionVectors64"); 257 | using func_ptr = curandStatus_t (*)(curandDirectionVectors64_t *, curandDirectionVectorSet_t); 258 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGetDirectionVectors64")); 259 | HOOK_CHECK(func_entry); 260 | return func_entry(vectors, set); 261 | } 262 | 263 | HOOK_C_API HOOK_DECL_EXPORT curandStatus_t curandGetScrambleConstants64(unsigned long long **constants) { 264 | HOOK_TRACE_PROFILE("curandGetScrambleConstants64"); 265 | using func_ptr = curandStatus_t (*)(unsigned long long **); 266 | static auto func_entry = reinterpret_cast(HOOK_CURAND_SYMBOL("curandGetScrambleConstants64")); 267 | HOOK_CHECK(func_entry); 268 | return func_entry(constants); 269 | } 270 | 
-------------------------------------------------------------------------------- /src/curand/curand_subset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 17:19:12 on Sun, May 29, 2022 4 | // 5 | // Description: curand subset 6 | 7 | #ifndef __CUDA_HOOK_CURAND_SUBSET_H__ 8 | #define __CUDA_HOOK_CURAND_SUBSET_H__ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #define CURAND_VER_MAJOR 10 15 | #define CURAND_VER_MINOR 2 16 | #define CURAND_VER_PATCH 5 17 | #define CURAND_VER_BUILD 120 18 | #define CURAND_VERSION (CURAND_VER_MAJOR * 1000 + CURAND_VER_MINOR * 100 + CURAND_VER_PATCH) 19 | /* CURAND Host API datatypes */ 20 | 21 | /** 22 | * @{ 23 | */ 24 | 25 | /** 26 | * CURAND function call status types 27 | */ 28 | enum curandStatus { 29 | CURAND_STATUS_SUCCESS = 0, ///< No errors 30 | CURAND_STATUS_VERSION_MISMATCH = 100, ///< Header file and linked library version do not match 31 | CURAND_STATUS_NOT_INITIALIZED = 101, ///< Generator not initialized 32 | CURAND_STATUS_ALLOCATION_FAILED = 102, ///< Memory allocation failed 33 | CURAND_STATUS_TYPE_ERROR = 103, ///< Generator is wrong type 34 | CURAND_STATUS_OUT_OF_RANGE = 104, ///< Argument out of range 35 | CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105, ///< Length requested is not a multple of dimension 36 | CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106, ///< GPU does not have double precision required by MRG32k3a 37 | CURAND_STATUS_LAUNCH_FAILURE = 201, ///< Kernel launch failure 38 | CURAND_STATUS_PREEXISTING_FAILURE = 202, ///< Preexisting failure on library entry 39 | CURAND_STATUS_INITIALIZATION_FAILED = 203, ///< Initialization of CUDA failed 40 | CURAND_STATUS_ARCH_MISMATCH = 204, ///< Architecture mismatch, GPU does not support requested feature 41 | CURAND_STATUS_INTERNAL_ERROR = 999 ///< Internal library error 42 | }; 43 | 44 | /* 45 | * CURAND function call status types 46 
| */ 47 | /** \cond UNHIDE_TYPEDEFS */ 48 | typedef enum curandStatus curandStatus_t; 49 | /** \endcond */ 50 | 51 | /** 52 | * CURAND generator types 53 | */ 54 | enum curandRngType { 55 | CURAND_RNG_TEST = 0, 56 | CURAND_RNG_PSEUDO_DEFAULT = 100, ///< Default pseudorandom generator 57 | CURAND_RNG_PSEUDO_XORWOW = 101, ///< XORWOW pseudorandom generator 58 | CURAND_RNG_PSEUDO_MRG32K3A = 121, ///< MRG32k3a pseudorandom generator 59 | CURAND_RNG_PSEUDO_MTGP32 = 141, ///< Mersenne Twister MTGP32 pseudorandom generator 60 | CURAND_RNG_PSEUDO_MT19937 = 142, ///< Mersenne Twister MT19937 pseudorandom generator 61 | CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161, ///< PHILOX-4x32-10 pseudorandom generator 62 | CURAND_RNG_QUASI_DEFAULT = 200, ///< Default quasirandom generator 63 | CURAND_RNG_QUASI_SOBOL32 = 201, ///< Sobol32 quasirandom generator 64 | CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202, ///< Scrambled Sobol32 quasirandom generator 65 | CURAND_RNG_QUASI_SOBOL64 = 203, ///< Sobol64 quasirandom generator 66 | CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 ///< Scrambled Sobol64 quasirandom generator 67 | }; 68 | 69 | /* 70 | * CURAND generator types 71 | */ 72 | /** \cond UNHIDE_TYPEDEFS */ 73 | typedef enum curandRngType curandRngType_t; 74 | /** \endcond */ 75 | 76 | /** 77 | * CURAND ordering of results in memory 78 | */ 79 | enum curandOrdering { 80 | CURAND_ORDERING_PSEUDO_BEST = 100, ///< Best ordering for pseudorandom results 81 | CURAND_ORDERING_PSEUDO_DEFAULT = 82 | 101, ///< Specific default thread sequence for pseudorandom results, same as CURAND_ORDERING_PSEUDO_BEST 83 | CURAND_ORDERING_PSEUDO_SEEDED = 102, ///< Specific seeding pattern for fast lower quality pseudorandom results 84 | CURAND_ORDERING_PSEUDO_LEGACY = 103, ///< Specific legacy sequence for pseudorandom results, guaranteed to remain 85 | ///< the same for all cuRAND release 86 | CURAND_ORDERING_QUASI_DEFAULT = 201 ///< Specific n-dimensional ordering for quasirandom results 87 | }; 88 | 89 | /* 90 | * 
CURAND ordering of results in memory 91 | */ 92 | /** \cond UNHIDE_TYPEDEFS */ 93 | typedef enum curandOrdering curandOrdering_t; 94 | /** \endcond */ 95 | 96 | /** 97 | * CURAND choice of direction vector set 98 | */ 99 | enum curandDirectionVectorSet { 100 | CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101, ///< Specific set of 32-bit direction vectors generated from polynomials 101 | ///< recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions 102 | CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 103 | 102, ///< Specific set of 32-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. 104 | ///< Kuo, for up to 20,000 dimensions, and scrambled 105 | CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103, ///< Specific set of 64-bit direction vectors generated from polynomials 106 | ///< recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions 107 | CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 108 | 104 ///< Specific set of 64-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. 
109 | ///< Kuo, for up to 20,000 dimensions, and scrambled 110 | }; 111 | 112 | /* 113 | * CURAND choice of direction vector set 114 | */ 115 | /** \cond UNHIDE_TYPEDEFS */ 116 | typedef enum curandDirectionVectorSet curandDirectionVectorSet_t; 117 | /** \endcond */ 118 | 119 | /** 120 | * CURAND array of 32-bit direction vectors 121 | */ 122 | /** \cond UNHIDE_TYPEDEFS */ 123 | typedef unsigned int curandDirectionVectors32_t[32]; 124 | /** \endcond */ 125 | 126 | /** 127 | * CURAND array of 64-bit direction vectors 128 | */ 129 | /** \cond UNHIDE_TYPEDEFS */ 130 | typedef unsigned long long curandDirectionVectors64_t[64]; 131 | /** \endcond **/ 132 | 133 | /** 134 | * CURAND generator (opaque) 135 | */ 136 | struct curandGenerator_st; 137 | 138 | /** 139 | * CURAND generator 140 | */ 141 | /** \cond UNHIDE_TYPEDEFS */ 142 | typedef struct curandGenerator_st *curandGenerator_t; 143 | /** \endcond */ 144 | 145 | /** 146 | * CURAND distribution 147 | */ 148 | /** \cond UNHIDE_TYPEDEFS */ 149 | typedef double curandDistribution_st; 150 | typedef curandDistribution_st *curandDistribution_t; 151 | typedef struct curandDistributionShift_st *curandDistributionShift_t; 152 | /** \endcond */ 153 | /** 154 | * CURAND distribution M2 155 | */ 156 | /** \cond UNHIDE_TYPEDEFS */ 157 | typedef struct curandDistributionM2Shift_st *curandDistributionM2Shift_t; 158 | typedef struct curandHistogramM2_st *curandHistogramM2_t; 159 | typedef unsigned int curandHistogramM2K_st; 160 | typedef curandHistogramM2K_st *curandHistogramM2K_t; 161 | typedef curandDistribution_st curandHistogramM2V_st; 162 | typedef curandHistogramM2V_st *curandHistogramM2V_t; 163 | 164 | typedef struct curandDiscreteDistribution_st *curandDiscreteDistribution_t; 165 | /** \endcond */ 166 | 167 | /* 168 | * CURAND METHOD 169 | */ 170 | /** \cond UNHIDE_ENUMS */ 171 | enum curandMethod { 172 | CURAND_CHOOSE_BEST = 0, // choose best depends on args 173 | CURAND_ITR = 1, 174 | CURAND_KNUTH = 2, 175 | CURAND_HITR = 
3, 176 | CURAND_M1 = 4, 177 | CURAND_M2 = 5, 178 | CURAND_BINARY_SEARCH = 6, 179 | CURAND_DISCRETE_GAUSS = 7, 180 | CURAND_REJECTION = 8, 181 | CURAND_DEVICE_API = 9, 182 | CURAND_FAST_REJECTION = 10, 183 | CURAND_3RD = 11, 184 | CURAND_DEFINITION = 12, 185 | CURAND_POISSON = 13 186 | }; 187 | 188 | typedef enum curandMethod curandMethod_t; 189 | 190 | #ifdef __cplusplus 191 | } 192 | #endif 193 | 194 | #endif // __CUDA_HOOK_CURAND_SUBSET_H__ 195 | -------------------------------------------------------------------------------- /src/cusolver/cusolver_subset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 17:19:12 on Sun, May 29, 2022 4 | // 5 | // Description: cusolver subset 6 | 7 | #ifndef __CUDA_HOOK_CUSOLVER_SUBSET_H__ 8 | #define __CUDA_HOOK_CUSOLVER_SUBSET_H__ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | typedef int cusolver_int_t; 15 | 16 | #define CUSOLVER_VER_MAJOR 11 17 | #define CUSOLVER_VER_MINOR 2 18 | #define CUSOLVER_VER_PATCH 0 19 | #define CUSOLVER_VER_BUILD 120 20 | #define CUSOLVER_VERSION (CUSOLVER_VER_MAJOR * 1000 + CUSOLVER_VER_MINOR * 100 + CUSOLVER_VER_PATCH) 21 | 22 | typedef enum { 23 | CUSOLVER_STATUS_SUCCESS = 0, 24 | CUSOLVER_STATUS_NOT_INITIALIZED = 1, 25 | CUSOLVER_STATUS_ALLOC_FAILED = 2, 26 | CUSOLVER_STATUS_INVALID_VALUE = 3, 27 | CUSOLVER_STATUS_ARCH_MISMATCH = 4, 28 | CUSOLVER_STATUS_MAPPING_ERROR = 5, 29 | CUSOLVER_STATUS_EXECUTION_FAILED = 6, 30 | CUSOLVER_STATUS_INTERNAL_ERROR = 7, 31 | CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8, 32 | CUSOLVER_STATUS_NOT_SUPPORTED = 9, 33 | CUSOLVER_STATUS_ZERO_PIVOT = 10, 34 | CUSOLVER_STATUS_INVALID_LICENSE = 11, 35 | CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED = 12, 36 | CUSOLVER_STATUS_IRS_PARAMS_INVALID = 13, 37 | CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC = 14, 38 | CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE = 15, 39 | 
CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER = 16, 40 | CUSOLVER_STATUS_IRS_INTERNAL_ERROR = 20, 41 | CUSOLVER_STATUS_IRS_NOT_SUPPORTED = 21, 42 | CUSOLVER_STATUS_IRS_OUT_OF_RANGE = 22, 43 | CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES = 23, 44 | CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED = 25, 45 | CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED = 26, 46 | CUSOLVER_STATUS_IRS_MATRIX_SINGULAR = 30, 47 | CUSOLVER_STATUS_INVALID_WORKSPACE = 31 48 | } cusolverStatus_t; 49 | 50 | typedef enum { CUSOLVER_EIG_TYPE_1 = 1, CUSOLVER_EIG_TYPE_2 = 2, CUSOLVER_EIG_TYPE_3 = 3 } cusolverEigType_t; 51 | 52 | typedef enum { CUSOLVER_EIG_MODE_NOVECTOR = 0, CUSOLVER_EIG_MODE_VECTOR = 1 } cusolverEigMode_t; 53 | 54 | typedef enum { 55 | CUSOLVER_EIG_RANGE_ALL = 1001, 56 | CUSOLVER_EIG_RANGE_I = 1002, 57 | CUSOLVER_EIG_RANGE_V = 1003, 58 | } cusolverEigRange_t; 59 | 60 | typedef enum { 61 | CUSOLVER_INF_NORM = 104, 62 | CUSOLVER_MAX_NORM = 105, 63 | CUSOLVER_ONE_NORM = 106, 64 | CUSOLVER_FRO_NORM = 107, 65 | } cusolverNorm_t; 66 | 67 | typedef enum { 68 | CUSOLVER_IRS_REFINE_NOT_SET = 1100, 69 | CUSOLVER_IRS_REFINE_NONE = 1101, 70 | CUSOLVER_IRS_REFINE_CLASSICAL = 1102, 71 | CUSOLVER_IRS_REFINE_CLASSICAL_GMRES = 1103, 72 | CUSOLVER_IRS_REFINE_GMRES = 1104, 73 | CUSOLVER_IRS_REFINE_GMRES_GMRES = 1105, 74 | CUSOLVER_IRS_REFINE_GMRES_NOPCOND = 1106, 75 | 76 | CUSOLVER_PREC_DD = 1150, 77 | CUSOLVER_PREC_SS = 1151, 78 | CUSOLVER_PREC_SHT = 1152, 79 | 80 | } cusolverIRSRefinement_t; 81 | 82 | typedef enum { 83 | CUSOLVER_R_8I = 1201, 84 | CUSOLVER_R_8U = 1202, 85 | CUSOLVER_R_64F = 1203, 86 | CUSOLVER_R_32F = 1204, 87 | CUSOLVER_R_16F = 1205, 88 | CUSOLVER_R_16BF = 1206, 89 | CUSOLVER_R_TF32 = 1207, 90 | CUSOLVER_R_AP = 1208, 91 | CUSOLVER_C_8I = 1211, 92 | CUSOLVER_C_8U = 1212, 93 | CUSOLVER_C_64F = 1213, 94 | CUSOLVER_C_32F = 1214, 95 | CUSOLVER_C_16F = 1215, 96 | CUSOLVER_C_16BF = 1216, 97 | CUSOLVER_C_TF32 = 1217, 98 | CUSOLVER_C_AP = 1218, 99 | } cusolverPrecType_t; 100 | 101 | typedef 
enum { 102 | CUSOLVER_ALG_0 = 0, /* default algorithm */ 103 | CUSOLVER_ALG_1 = 1 104 | } cusolverAlgMode_t; 105 | 106 | typedef enum { CUBLAS_STOREV_COLUMNWISE = 0, CUBLAS_STOREV_ROWWISE = 1 } cusolverStorevMode_t; 107 | 108 | typedef enum { CUBLAS_DIRECT_FORWARD = 0, CUBLAS_DIRECT_BACKWARD = 1 } cusolverDirectMode_t; 109 | 110 | struct cusolverDnContext; 111 | typedef struct cusolverDnContext *cusolverDnHandle_t; 112 | 113 | struct syevjInfo; 114 | typedef struct syevjInfo *syevjInfo_t; 115 | 116 | struct gesvdjInfo; 117 | typedef struct gesvdjInfo *gesvdjInfo_t; 118 | 119 | //------------------------------------------------------ 120 | // opaque cusolverDnIRS structure for IRS solver 121 | struct cusolverDnIRSParams; 122 | typedef struct cusolverDnIRSParams *cusolverDnIRSParams_t; 123 | 124 | struct cusolverDnIRSInfos; 125 | typedef struct cusolverDnIRSInfos *cusolverDnIRSInfos_t; 126 | //------------------------------------------------------ 127 | 128 | struct cusolverDnParams; 129 | typedef struct cusolverDnParams *cusolverDnParams_t; 130 | 131 | typedef enum { CUSOLVERDN_GETRF = 0 } cusolverDnFunction_t; 132 | 133 | struct cusolverMgContext; 134 | typedef struct cusolverMgContext *cusolverMgHandle_t; 135 | 136 | /** 137 | * \beief This enum decides how 1D device Ids (or process ranks) get mapped to a 2D grid. 
138 | */ 139 | typedef enum { 140 | 141 | CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1, 142 | CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0 143 | 144 | } cusolverMgGridMapping_t; 145 | 146 | /** \brief Opaque structure of the distributed grid */ 147 | typedef void *cudaLibMgGrid_t; 148 | /** \brief Opaque structure of the distributed matrix descriptor */ 149 | typedef void *cudaLibMgMatrixDesc_t; 150 | 151 | /* CUSOLVERRF mode */ 152 | typedef enum { 153 | CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0, // default 154 | CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1 155 | } cusolverRfResetValuesFastMode_t; 156 | 157 | /* CUSOLVERRF matrix format */ 158 | typedef enum { 159 | CUSOLVERRF_MATRIX_FORMAT_CSR = 0, // default 160 | CUSOLVERRF_MATRIX_FORMAT_CSC = 1 161 | } cusolverRfMatrixFormat_t; 162 | 163 | /* CUSOLVERRF unit diagonal */ 164 | typedef enum { 165 | CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0, // default 166 | CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1, 167 | CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2, 168 | CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3 169 | } cusolverRfUnitDiagonal_t; 170 | 171 | /* CUSOLVERRF factorization algorithm */ 172 | typedef enum { 173 | CUSOLVERRF_FACTORIZATION_ALG0 = 0, // default 174 | CUSOLVERRF_FACTORIZATION_ALG1 = 1, 175 | CUSOLVERRF_FACTORIZATION_ALG2 = 2, 176 | } cusolverRfFactorization_t; 177 | 178 | /* CUSOLVERRF triangular solve algorithm */ 179 | typedef enum { 180 | CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1, // default 181 | CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2, 182 | CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3 183 | } cusolverRfTriangularSolve_t; 184 | 185 | /* CUSOLVERRF numeric boost report */ 186 | typedef enum { 187 | CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0, // default 188 | CUSOLVERRF_NUMERIC_BOOST_USED = 1 189 | } cusolverRfNumericBoostReport_t; 190 | 191 | /* Opaque structure holding CUSOLVERRF library common */ 192 | struct cusolverRfCommon; 193 | typedef struct cusolverRfCommon *cusolverRfHandle_t; 194 | 195 | struct cusolverSpContext; 196 | typedef struct 
cusolverSpContext *cusolverSpHandle_t; 197 | 198 | struct csrqrInfo; 199 | typedef struct csrqrInfo *csrqrInfo_t; 200 | 201 | struct csrluInfoHost; 202 | typedef struct csrluInfoHost *csrluInfoHost_t; 203 | 204 | struct csrqrInfoHost; 205 | typedef struct csrqrInfoHost *csrqrInfoHost_t; 206 | 207 | struct csrcholInfoHost; 208 | typedef struct csrcholInfoHost *csrcholInfoHost_t; 209 | 210 | struct csrcholInfo; 211 | typedef struct csrcholInfo *csrcholInfo_t; 212 | 213 | #ifdef __cplusplus 214 | } 215 | #endif 216 | 217 | #endif // __CUDA_HOOK_CUSOLVER_SUBSET_H__ 218 | -------------------------------------------------------------------------------- /src/cusparse/cusparse_subset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 2 | // Author: Bruce-Lee-LY 3 | // Date: 17:19:12 on Sun, May 29, 2022 4 | // 5 | // Description: cusparse subset 6 | 7 | #ifndef __CUDA_HOOK_CUSPARSE_SUBSET_H__ 8 | #define __CUDA_HOOK_CUSPARSE_SUBSET_H__ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | //############################################################################## 15 | //# CUSPARSE VERSION INFORMATION 16 | //############################################################################## 17 | 18 | #define CUSPARSE_VER_MAJOR 11 19 | #define CUSPARSE_VER_MINOR 6 20 | #define CUSPARSE_VER_PATCH 0 21 | #define CUSPARSE_VER_BUILD 120 22 | #define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + CUSPARSE_VER_MINOR * 100 + CUSPARSE_VER_PATCH) 23 | 24 | //------------------------------------------------------------------------------ 25 | 26 | struct cusparseContext; 27 | typedef struct cusparseContext *cusparseHandle_t; 28 | 29 | struct cusparseMatDescr; 30 | typedef struct cusparseMatDescr *cusparseMatDescr_t; 31 | 32 | struct csrsv2Info; 33 | typedef struct csrsv2Info *csrsv2Info_t; 34 | 35 | struct csrsm2Info; 36 | typedef struct csrsm2Info *csrsm2Info_t; 37 | 38 | struct bsrsv2Info; 39 | 
typedef struct bsrsv2Info *bsrsv2Info_t; 40 | 41 | struct bsrsm2Info; 42 | typedef struct bsrsm2Info *bsrsm2Info_t; 43 | 44 | struct csric02Info; 45 | typedef struct csric02Info *csric02Info_t; 46 | 47 | struct bsric02Info; 48 | typedef struct bsric02Info *bsric02Info_t; 49 | 50 | struct csrilu02Info; 51 | typedef struct csrilu02Info *csrilu02Info_t; 52 | 53 | struct bsrilu02Info; 54 | typedef struct bsrilu02Info *bsrilu02Info_t; 55 | 56 | struct csrgemm2Info; 57 | typedef struct csrgemm2Info *csrgemm2Info_t; 58 | 59 | struct csru2csrInfo; 60 | typedef struct csru2csrInfo *csru2csrInfo_t; 61 | 62 | struct cusparseColorInfo; 63 | typedef struct cusparseColorInfo *cusparseColorInfo_t; 64 | 65 | struct pruneInfo; 66 | typedef struct pruneInfo *pruneInfo_t; 67 | 68 | //############################################################################## 69 | //# ENUMERATORS 70 | //############################################################################## 71 | 72 | typedef enum { 73 | CUSPARSE_STATUS_SUCCESS = 0, 74 | CUSPARSE_STATUS_NOT_INITIALIZED = 1, 75 | CUSPARSE_STATUS_ALLOC_FAILED = 2, 76 | CUSPARSE_STATUS_INVALID_VALUE = 3, 77 | CUSPARSE_STATUS_ARCH_MISMATCH = 4, 78 | CUSPARSE_STATUS_MAPPING_ERROR = 5, 79 | CUSPARSE_STATUS_EXECUTION_FAILED = 6, 80 | CUSPARSE_STATUS_INTERNAL_ERROR = 7, 81 | CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8, 82 | CUSPARSE_STATUS_ZERO_PIVOT = 9, 83 | CUSPARSE_STATUS_NOT_SUPPORTED = 10, 84 | CUSPARSE_STATUS_INSUFFICIENT_RESOURCES = 11 85 | } cusparseStatus_t; 86 | 87 | typedef enum { CUSPARSE_POINTER_MODE_HOST = 0, CUSPARSE_POINTER_MODE_DEVICE = 1 } cusparsePointerMode_t; 88 | 89 | typedef enum { CUSPARSE_ACTION_SYMBOLIC = 0, CUSPARSE_ACTION_NUMERIC = 1 } cusparseAction_t; 90 | 91 | typedef enum { 92 | CUSPARSE_MATRIX_TYPE_GENERAL = 0, 93 | CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, 94 | CUSPARSE_MATRIX_TYPE_HERMITIAN = 2, 95 | CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 96 | } cusparseMatrixType_t; 97 | 98 | typedef enum { CUSPARSE_FILL_MODE_LOWER = 0, 
CUSPARSE_FILL_MODE_UPPER = 1 } cusparseFillMode_t; 99 | 100 | typedef enum { CUSPARSE_DIAG_TYPE_NON_UNIT = 0, CUSPARSE_DIAG_TYPE_UNIT = 1 } cusparseDiagType_t; 101 | 102 | typedef enum { CUSPARSE_INDEX_BASE_ZERO = 0, CUSPARSE_INDEX_BASE_ONE = 1 } cusparseIndexBase_t; 103 | 104 | typedef enum { 105 | CUSPARSE_OPERATION_NON_TRANSPOSE = 0, 106 | CUSPARSE_OPERATION_TRANSPOSE = 1, 107 | CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 108 | } cusparseOperation_t; 109 | 110 | typedef enum { CUSPARSE_DIRECTION_ROW = 0, CUSPARSE_DIRECTION_COLUMN = 1 } cusparseDirection_t; 111 | 112 | typedef enum { CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0, CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 } cusparseSolvePolicy_t; 113 | 114 | typedef enum { CUSPARSE_SIDE_LEFT = 0, CUSPARSE_SIDE_RIGHT = 1 } cusparseSideMode_t; 115 | 116 | typedef enum { 117 | CUSPARSE_COLOR_ALG0 = 0, // default 118 | CUSPARSE_COLOR_ALG1 = 1 119 | } cusparseColorAlg_t; 120 | 121 | typedef enum { 122 | CUSPARSE_ALG_MERGE_PATH // merge path alias 123 | } cusparseAlgMode_t; 124 | 125 | typedef enum { 126 | CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc 127 | CUSPARSE_CSR2CSC_ALG2 = 2 // low memory requirement, non-deterministc 128 | } cusparseCsr2CscAlg_t; 129 | 130 | typedef enum { 131 | CUSPARSE_FORMAT_CSR = 1, ///< Compressed Sparse Row (CSR) 132 | CUSPARSE_FORMAT_CSC = 2, ///< Compressed Sparse Column (CSC) 133 | CUSPARSE_FORMAT_COO = 3, ///< Coordinate (COO) - Structure of Arrays 134 | CUSPARSE_FORMAT_COO_AOS = 4, ///< Coordinate (COO) - Array of Structures 135 | CUSPARSE_FORMAT_BLOCKED_ELL = 5, ///< Blocked ELL 136 | } cusparseFormat_t; 137 | 138 | typedef enum { 139 | CUSPARSE_ORDER_COL = 1, ///< Column-Major Order - Matrix memory layout 140 | CUSPARSE_ORDER_ROW = 2 ///< Row-Major Order - Matrix memory layout 141 | } cusparseOrder_t; 142 | 143 | typedef enum { 144 | CUSPARSE_INDEX_16U = 1, ///< 16-bit unsigned integer for matrix/vector 145 | ///< indices 146 | CUSPARSE_INDEX_32I = 2, ///< 32-bit signed 
integer for matrix/vector indices 147 | CUSPARSE_INDEX_64I = 3 ///< 64-bit signed integer for matrix/vector indices 148 | } cusparseIndexType_t; 149 | 150 | //------------------------------------------------------------------------------ 151 | 152 | struct cusparseSpVecDescr; 153 | struct cusparseDnVecDescr; 154 | struct cusparseSpMatDescr; 155 | struct cusparseDnMatDescr; 156 | typedef struct cusparseSpVecDescr *cusparseSpVecDescr_t; 157 | typedef struct cusparseDnVecDescr *cusparseDnVecDescr_t; 158 | typedef struct cusparseSpMatDescr *cusparseSpMatDescr_t; 159 | typedef struct cusparseDnMatDescr *cusparseDnMatDescr_t; 160 | 161 | typedef enum { CUSPARSE_SPMAT_FILL_MODE, CUSPARSE_SPMAT_DIAG_TYPE } cusparseSpMatAttribute_t; 162 | 163 | typedef enum { CUSPARSE_SPARSETODENSE_ALG_DEFAULT = 0 } cusparseSparseToDenseAlg_t; 164 | 165 | typedef enum { CUSPARSE_DENSETOSPARSE_ALG_DEFAULT = 0 } cusparseDenseToSparseAlg_t; 166 | 167 | typedef enum { 168 | CUSPARSE_MV_ALG_DEFAULT 169 | /*CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_ALG_DEFAULT)*/ 170 | = 0, 171 | // CUSPARSE_COOMV_ALG CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_COO_ALG1) = 1, 172 | // CUSPARSE_CSRMV_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_CSR_ALG1) = 2, 173 | // CUSPARSE_CSRMV_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_CSR_ALG2) = 3, 174 | CUSPARSE_SPMV_ALG_DEFAULT = 0, 175 | CUSPARSE_SPMV_CSR_ALG1 = 2, 176 | CUSPARSE_SPMV_CSR_ALG2 = 3, 177 | CUSPARSE_SPMV_COO_ALG1 = 1, 178 | CUSPARSE_SPMV_COO_ALG2 = 4 179 | } cusparseSpMVAlg_t; 180 | 181 | typedef enum { 182 | CUSPARSE_SPSV_ALG_DEFAULT = 0, 183 | } cusparseSpSVAlg_t; 184 | 185 | struct cusparseSpSVDescr; 186 | typedef struct cusparseSpSVDescr *cusparseSpSVDescr_t; 187 | 188 | typedef enum { 189 | CUSPARSE_SPSM_ALG_DEFAULT = 0, 190 | } cusparseSpSMAlg_t; 191 | 192 | struct cusparseSpSMDescr; 193 | typedef struct cusparseSpSMDescr *cusparseSpSMDescr_t; 194 | 195 | typedef enum { 196 | // CUSPARSE_MM_ALG_DEFAULT CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0, 
197 | // CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1, 198 | // CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2, 199 | // CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3, 200 | // CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4, 201 | CUSPARSE_SPMM_ALG_DEFAULT = 0, 202 | CUSPARSE_SPMM_COO_ALG1 = 1, 203 | CUSPARSE_SPMM_COO_ALG2 = 2, 204 | CUSPARSE_SPMM_COO_ALG3 = 3, 205 | CUSPARSE_SPMM_COO_ALG4 = 5, 206 | CUSPARSE_SPMM_CSR_ALG1 = 4, 207 | CUSPARSE_SPMM_CSR_ALG2 = 6, 208 | CUSPARSE_SPMM_CSR_ALG3 = 12, 209 | CUSPARSE_SPMM_BLOCKED_ELL_ALG1 = 13 210 | } cusparseSpMMAlg_t; 211 | 212 | typedef enum { 213 | CUSPARSE_SPGEMM_DEFAULT = 0, 214 | CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC = 1, 215 | CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2 216 | } cusparseSpGEMMAlg_t; 217 | 218 | struct cusparseSpGEMMDescr; 219 | typedef struct cusparseSpGEMMDescr *cusparseSpGEMMDescr_t; 220 | 221 | typedef enum { CUSPARSE_SDDMM_ALG_DEFAULT = 0 } cusparseSDDMMAlg_t; 222 | 223 | #ifdef __cplusplus 224 | } 225 | #endif 226 | 227 | #endif // __CUDA_HOOK_CUSPARSE_SUBSET_H__ 228 | -------------------------------------------------------------------------------- /src/nvjpeg/nvjpeg_subset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 
// Author: Bruce-Lee-LY
// Date: 22:07:19 on Wed, Jul 20, 2022
//
// Description: nvjpeg subset
//
// Declares the nvjpeg enums, flag macros, plain structs, callback prototypes and opaque
// handle types that are visible in this header; no functions are declared here.

#ifndef __CUDA_HOOK_NVJPEG_SUBSET_H__
#define __CUDA_HOOK_NVJPEG_SUBSET_H__

// size_t is used by nvjpegImage_t and by the allocator prototypes below; include its
// definition here so the header does not depend on what the includer pulled in first.
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

// Maximum number of channels nvjpeg decoder supports
#define NVJPEG_MAX_COMPONENT 4

// nvjpeg version information
#define NVJPEG_VER_MAJOR 11
#define NVJPEG_VER_MINOR 5
#define NVJPEG_VER_PATCH 2
#define NVJPEG_VER_BUILD 120

/* nvJPEG status enums, returned by nvJPEG API */
typedef enum {
    NVJPEG_STATUS_SUCCESS = 0,
    NVJPEG_STATUS_NOT_INITIALIZED = 1,
    NVJPEG_STATUS_INVALID_PARAMETER = 2,
    NVJPEG_STATUS_BAD_JPEG = 3,
    NVJPEG_STATUS_JPEG_NOT_SUPPORTED = 4,
    NVJPEG_STATUS_ALLOCATOR_FAILURE = 5,
    NVJPEG_STATUS_EXECUTION_FAILED = 6,
    NVJPEG_STATUS_ARCH_MISMATCH = 7,
    NVJPEG_STATUS_INTERNAL_ERROR = 8,
    NVJPEG_STATUS_IMPLEMENTATION_NOT_SUPPORTED = 9,
} nvjpegStatus_t;

// Enum identifies image chroma subsampling values stored inside JPEG input stream
// In the case of NVJPEG_CSS_GRAY only 1 luminance channel is encoded in JPEG input stream
// Otherwise both chroma planes are present
typedef enum {
    NVJPEG_CSS_444 = 0,
    NVJPEG_CSS_422 = 1,
    NVJPEG_CSS_420 = 2,
    NVJPEG_CSS_440 = 3,
    NVJPEG_CSS_411 = 4,
    NVJPEG_CSS_410 = 5,
    NVJPEG_CSS_GRAY = 6,
    NVJPEG_CSS_410V = 7,
    NVJPEG_CSS_UNKNOWN = -1
} nvjpegChromaSubsampling_t;

// Parameter of this type specifies what type of output user wants for image decoding
typedef enum {
    // return decompressed image as it is - write planar output
    NVJPEG_OUTPUT_UNCHANGED = 0,
    // return planar luma and chroma, assuming YCbCr colorspace
    NVJPEG_OUTPUT_YUV = 1,
    // return luma component only, if YCbCr colorspace,
    // or try to convert to grayscale,
    // writes to 1-st channel of nvjpegImage_t
    NVJPEG_OUTPUT_Y = 2,
    // convert to planar RGB
    NVJPEG_OUTPUT_RGB = 3,
    // convert to planar BGR
    NVJPEG_OUTPUT_BGR = 4,
    // convert to interleaved RGB and write to 1-st channel of nvjpegImage_t
    NVJPEG_OUTPUT_RGBI = 5,
    // convert to interleaved BGR and write to 1-st channel of nvjpegImage_t
    NVJPEG_OUTPUT_BGRI = 6,
    // maximum allowed value
    NVJPEG_OUTPUT_FORMAT_MAX = 6
} nvjpegOutputFormat_t;

// Parameter of this type specifies what type of input user provides for encoding.
// The numeric values match the same-named NVJPEG_OUTPUT_* enumerators above.
typedef enum {
    NVJPEG_INPUT_RGB = 3,   // Input is RGB - will be converted to YCbCr before encoding
    NVJPEG_INPUT_BGR = 4,   // Input is BGR - will be converted to YCbCr before encoding
    NVJPEG_INPUT_RGBI = 5,  // Input is interleaved RGB - will be converted to YCbCr before encoding
    NVJPEG_INPUT_BGRI = 6   // Input is interleaved BGR - will be converted to YCbCr before encoding
} nvjpegInputFormat_t;

// Implementation
// NVJPEG_BACKEND_DEFAULT    : default value
// NVJPEG_BACKEND_HYBRID     : uses CPU for Huffman decode
// NVJPEG_BACKEND_GPU_HYBRID : uses GPU assisted Huffman decode. nvjpegDecodeBatched will use GPU decoding for
//                             baseline JPEG bitstreams with interleaved scan when batch size is bigger than 100
// NVJPEG_BACKEND_HARDWARE   : supports baseline JPEG bitstream with single scan. 410 and 411 sub-samplings are
//                             not supported
typedef enum {
    NVJPEG_BACKEND_DEFAULT = 0,
    NVJPEG_BACKEND_HYBRID = 1,
    NVJPEG_BACKEND_GPU_HYBRID = 2,
    NVJPEG_BACKEND_HARDWARE = 3
} nvjpegBackend_t;

// Currently parseable JPEG encodings (SOF markers)
typedef enum {
    NVJPEG_ENCODING_UNKNOWN = 0x0,

    NVJPEG_ENCODING_BASELINE_DCT = 0xc0,
    NVJPEG_ENCODING_EXTENDED_SEQUENTIAL_DCT_HUFFMAN = 0xc1,
    NVJPEG_ENCODING_PROGRESSIVE_DCT_HUFFMAN = 0xc2

} nvjpegJpegEncoding_t;

typedef enum {
    NVJPEG_SCALE_NONE = 0,    // decoded output is not scaled
    NVJPEG_SCALE_1_BY_2 = 1,  // decoded output width and height is scaled by a factor of 1/2
    NVJPEG_SCALE_1_BY_4 = 2,  // decoded output width and height is scaled by a factor of 1/4
    NVJPEG_SCALE_1_BY_8 = 3,  // decoded output width and height is scaled by a factor of 1/8
} nvjpegScaleFactor_t;

// Flag values. The shifted ones are parenthesized so that each macro expands to a single
// expression: without the parentheses `~FLAG` or `FLAG + 1` would bind to the `1` alone.
#define NVJPEG_FLAGS_DEFAULT 0
#define NVJPEG_FLAGS_HW_DECODE_NO_PIPELINE 1
#define NVJPEG_FLAGS_ENABLE_MEMORY_POOLS (1 << 1)
#define NVJPEG_FLAGS_BITSTREAM_STRICT (1 << 2)

// Output descriptor.
// Data that is written to planes depends on output format
typedef struct {
    unsigned char *channel[NVJPEG_MAX_COMPONENT];
    size_t pitch[NVJPEG_MAX_COMPONENT];
} nvjpegImage_t;

// Prototype for device memory allocation, modelled after cudaMalloc()
typedef int (*tDevMalloc)(void **, size_t);
// Prototype for device memory release
typedef int (*tDevFree)(void *);

// Prototype for pinned memory allocation, modelled after cudaHostAlloc()
typedef int (*tPinnedMalloc)(void **, size_t, unsigned int flags);
// Prototype for pinned memory release
typedef int (*tPinnedFree)(void *);

// Memory allocator using mentioned prototypes, provided to nvjpegCreateEx
// This allocator will be used for all device memory allocations inside library
// In any case the library does smart allocations (reallocates memory only if needed)
typedef struct {
    tDevMalloc dev_malloc;
    tDevFree dev_free;
} nvjpegDevAllocator_t;

// Pinned memory allocator using mentioned prototypes, provided to nvjpegCreate
// This allocator will be used for all pinned host memory allocations inside library
// In any case the library does smart allocations (reallocates memory only if needed)
typedef struct {
    tPinnedMalloc pinned_malloc;
    tPinnedFree pinned_free;
} nvjpegPinnedAllocator_t;

// Opaque library handle identifier.
struct nvjpegHandle;
typedef struct nvjpegHandle *nvjpegHandle_t;

// Opaque jpeg decoding state handle identifier - used to store intermediate information between decoding phases
struct nvjpegJpegState;
typedef struct nvjpegJpegState *nvjpegJpegState_t;

// The remaining handles are likewise pointers to structs that are only forward-declared here.
struct nvjpegEncoderState;
typedef struct nvjpegEncoderState *nvjpegEncoderState_t;

struct nvjpegEncoderParams;
typedef struct nvjpegEncoderParams *nvjpegEncoderParams_t;

struct nvjpegBufferPinned;
typedef struct nvjpegBufferPinned *nvjpegBufferPinned_t;

struct nvjpegBufferDevice;
typedef struct nvjpegBufferDevice *nvjpegBufferDevice_t;

struct nvjpegJpegStream;
typedef struct nvjpegJpegStream *nvjpegJpegStream_t;

struct nvjpegDecodeParams;
typedef struct nvjpegDecodeParams *nvjpegDecodeParams_t;

struct nvjpegJpegDecoder;
typedef struct nvjpegJpegDecoder *nvjpegJpegDecoder_t;

#ifdef __cplusplus
}
#endif

#endif  // __CUDA_HOOK_NVJPEG_SUBSET_H__

// --------------------------------------------------------------------------------
// /src/nvrtc/nvrtc_hook.cpp:
// --------------------------------------------------------------------------------
// Copyright 2022. All Rights Reserved.
2 | // Author: Bruce-Lee-LY 3 | // Date: 17:19:12 on Sun, May 29, 2022 4 | // 5 | // Description: auto generate 18 apis 6 | 7 | #include "hook.h" 8 | #include "macro_common.h" 9 | #include "nvrtc_subset.h" 10 | #include "trace_profile.h" 11 | 12 | HOOK_C_API HOOK_DECL_EXPORT const char *nvrtcGetErrorString(nvrtcResult result) { 13 | HOOK_TRACE_PROFILE("nvrtcGetErrorString"); 14 | using func_ptr = const char *(*)(nvrtcResult); 15 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetErrorString")); 16 | HOOK_CHECK(func_entry); 17 | return func_entry(result); 18 | } 19 | 20 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcVersion(int *major, int *minor) { 21 | HOOK_TRACE_PROFILE("nvrtcVersion"); 22 | using func_ptr = nvrtcResult (*)(int *, int *); 23 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcVersion")); 24 | HOOK_CHECK(func_entry); 25 | return func_entry(major, minor); 26 | } 27 | 28 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs) { 29 | HOOK_TRACE_PROFILE("nvrtcGetNumSupportedArchs"); 30 | using func_ptr = nvrtcResult (*)(int *); 31 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetNumSupportedArchs")); 32 | HOOK_CHECK(func_entry); 33 | return func_entry(numArchs); 34 | } 35 | 36 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs) { 37 | HOOK_TRACE_PROFILE("nvrtcGetSupportedArchs"); 38 | using func_ptr = nvrtcResult (*)(int *); 39 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetSupportedArchs")); 40 | HOOK_CHECK(func_entry); 41 | return func_entry(supportedArchs); 42 | } 43 | 44 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, 45 | int numHeaders, const char *const *headers, 46 | const char *const *includeNames) { 47 | HOOK_TRACE_PROFILE("nvrtcCreateProgram"); 48 | using func_ptr = 49 | nvrtcResult (*)(nvrtcProgram *, const char *, const char *, 
int, const char *const *, const char *const *); 50 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcCreateProgram")); 51 | HOOK_CHECK(func_entry); 52 | return func_entry(prog, src, name, numHeaders, headers, includeNames); 53 | } 54 | 55 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog) { 56 | HOOK_TRACE_PROFILE("nvrtcDestroyProgram"); 57 | using func_ptr = nvrtcResult (*)(nvrtcProgram *); 58 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcDestroyProgram")); 59 | HOOK_CHECK(func_entry); 60 | return func_entry(prog); 61 | } 62 | 63 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, 64 | const char *const *options) { 65 | HOOK_TRACE_PROFILE("nvrtcCompileProgram"); 66 | using func_ptr = nvrtcResult (*)(nvrtcProgram, int, const char *const *); 67 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcCompileProgram")); 68 | HOOK_CHECK(func_entry); 69 | return func_entry(prog, numOptions, options); 70 | } 71 | 72 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet) { 73 | HOOK_TRACE_PROFILE("nvrtcGetPTXSize"); 74 | using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *); 75 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetPTXSize")); 76 | HOOK_CHECK(func_entry); 77 | return func_entry(prog, ptxSizeRet); 78 | } 79 | 80 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx) { 81 | HOOK_TRACE_PROFILE("nvrtcGetPTX"); 82 | using func_ptr = nvrtcResult (*)(nvrtcProgram, char *); 83 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetPTX")); 84 | HOOK_CHECK(func_entry); 85 | return func_entry(prog, ptx); 86 | } 87 | 88 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet) { 89 | HOOK_TRACE_PROFILE("nvrtcGetCUBINSize"); 90 | using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *); 91 | 
static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetCUBINSize")); 92 | HOOK_CHECK(func_entry); 93 | return func_entry(prog, cubinSizeRet); 94 | } 95 | 96 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin) { 97 | HOOK_TRACE_PROFILE("nvrtcGetCUBIN"); 98 | using func_ptr = nvrtcResult (*)(nvrtcProgram, char *); 99 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetCUBIN")); 100 | HOOK_CHECK(func_entry); 101 | return func_entry(prog, cubin); 102 | } 103 | 104 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet) { 105 | HOOK_TRACE_PROFILE("nvrtcGetNVVMSize"); 106 | using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *); 107 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetNVVMSize")); 108 | HOOK_CHECK(func_entry); 109 | return func_entry(prog, nvvmSizeRet); 110 | } 111 | 112 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm) { 113 | HOOK_TRACE_PROFILE("nvrtcGetNVVM"); 114 | using func_ptr = nvrtcResult (*)(nvrtcProgram, char *); 115 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetNVVM")); 116 | HOOK_CHECK(func_entry); 117 | return func_entry(prog, nvvm); 118 | } 119 | 120 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet) { 121 | HOOK_TRACE_PROFILE("nvrtcGetProgramLogSize"); 122 | using func_ptr = nvrtcResult (*)(nvrtcProgram, size_t *); 123 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetProgramLogSize")); 124 | HOOK_CHECK(func_entry); 125 | return func_entry(prog, logSizeRet); 126 | } 127 | 128 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log) { 129 | HOOK_TRACE_PROFILE("nvrtcGetProgramLog"); 130 | using func_ptr = nvrtcResult (*)(nvrtcProgram, char *); 131 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetProgramLog")); 132 | 
HOOK_CHECK(func_entry); 133 | return func_entry(prog, log); 134 | } 135 | 136 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog, const char *const name_expression) { 137 | HOOK_TRACE_PROFILE("nvrtcAddNameExpression"); 138 | using func_ptr = nvrtcResult (*)(nvrtcProgram, const char *const); 139 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcAddNameExpression")); 140 | HOOK_CHECK(func_entry); 141 | return func_entry(prog, name_expression); 142 | } 143 | 144 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog, const char *const name_expression, 145 | const char **lowered_name) { 146 | HOOK_TRACE_PROFILE("nvrtcGetLoweredName"); 147 | using func_ptr = nvrtcResult (*)(nvrtcProgram, const char *const, const char **); 148 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetLoweredName")); 149 | HOOK_CHECK(func_entry); 150 | return func_entry(prog, name_expression, lowered_name); 151 | } 152 | 153 | HOOK_C_API HOOK_DECL_EXPORT nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result) { 154 | HOOK_TRACE_PROFILE("nvrtcGetTypeName"); 155 | using func_ptr = nvrtcResult (*)(const std::type_info &, std::string *); 156 | static auto func_entry = reinterpret_cast(HOOK_NVRTC_SYMBOL("nvrtcGetTypeName")); 157 | HOOK_CHECK(func_entry); 158 | return func_entry(tinfo, result); 159 | } 160 | -------------------------------------------------------------------------------- /src/nvrtc/nvrtc_subset.h: -------------------------------------------------------------------------------- 1 | // Copyright 2022. All Rights Reserved. 
// Author: Bruce-Lee-LY
// Date: 17:19:12 on Sun, May 29, 2022
//
// Description: nvrtc subset

#ifndef __CUDA_HOOK_NVRTC_SUBSET_H__
#define __CUDA_HOOK_NVRTC_SUBSET_H__

#ifdef __cplusplus
extern "C" {
#endif

/**
 * \ingroup error
 * \brief nvrtcResult enumerates the result codes of NVRTC API calls.
 *        NVRTC API functions return one of these values to report the
 *        outcome of the call.
 */
typedef enum {
    NVRTC_SUCCESS                                     = 0,
    NVRTC_ERROR_OUT_OF_MEMORY                         = 1,
    NVRTC_ERROR_PROGRAM_CREATION_FAILURE              = 2,
    NVRTC_ERROR_INVALID_INPUT                         = 3,
    NVRTC_ERROR_INVALID_PROGRAM                       = 4,
    NVRTC_ERROR_INVALID_OPTION                        = 5,
    NVRTC_ERROR_COMPILATION                           = 6,
    NVRTC_ERROR_BUILTIN_OPERATION_FAILURE             = 7,
    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION   = 9,
    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID             = 10,
    NVRTC_ERROR_INTERNAL_ERROR                        = 11
} nvrtcResult;

/**
 * \ingroup compilation
 * \brief nvrtcProgram is an opaque handle for a program, the unit of
 *        compilation.
 *
 * Compiling a CUDA program string requires creating an nvrtcProgram with
 * ::nvrtcCreateProgram first and then compiling it with
 * ::nvrtcCompileProgram.
 */
typedef struct _nvrtcProgram *nvrtcProgram;

#ifdef __cplusplus
}
#endif

#endif  // __CUDA_HOOK_NVRTC_SUBSET_H__

// --------------------------------------------------------------------------------
// /src/nvtx/nvtx_hook.cpp:
// --------------------------------------------------------------------------------
// Copyright 2022. All Rights Reserved.
2 | // Author: Bruce-Lee-LY 3 | // Date: 17:19:12 on Sun, May 29, 2022 4 | // 5 | // Description: auto generate 64 apis 6 | 7 | #include "cuda_subset.h" 8 | #include "cudart_subset.h" 9 | #include "hook.h" 10 | #include "macro_common.h" 11 | #include "nvtx_subset.h" 12 | #include "trace_profile.h" 13 | 14 | HOOK_C_API HOOK_DECL_EXPORT int nvtxInitialize(const nvtxInitializationAttributes_t *initAttrib) { 15 | HOOK_TRACE_PROFILE("nvtxInitialize"); 16 | using func_ptr = int (*)(const nvtxInitializationAttributes_t *); 17 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxInitialize")); 18 | HOOK_CHECK(func_entry); 19 | return func_entry(initAttrib); 20 | } 21 | 22 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t *eventAttrib) { 23 | HOOK_TRACE_PROFILE("nvtxDomainMarkEx"); 24 | using func_ptr = void (*)(nvtxDomainHandle_t, const nvtxEventAttributes_t *); 25 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainMarkEx")); 26 | HOOK_CHECK(func_entry); 27 | return func_entry(domain, eventAttrib); 28 | } 29 | 30 | HOOK_C_API HOOK_DECL_EXPORT void nvtxMarkEx(const nvtxEventAttributes_t *eventAttrib) { 31 | HOOK_TRACE_PROFILE("nvtxMarkEx"); 32 | using func_ptr = void (*)(const nvtxEventAttributes_t *); 33 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxMarkEx")); 34 | HOOK_CHECK(func_entry); 35 | return func_entry(eventAttrib); 36 | } 37 | 38 | HOOK_C_API HOOK_DECL_EXPORT void nvtxMarkA(const char *message) { 39 | HOOK_TRACE_PROFILE("nvtxMarkA"); 40 | using func_ptr = void (*)(const char *); 41 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxMarkA")); 42 | HOOK_CHECK(func_entry); 43 | return func_entry(message); 44 | } 45 | 46 | HOOK_C_API HOOK_DECL_EXPORT void nvtxMarkW(const wchar_t *message) { 47 | HOOK_TRACE_PROFILE("nvtxMarkW"); 48 | using func_ptr = void (*)(const wchar_t *); 49 | static auto func_entry = 
reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxMarkW")); 50 | HOOK_CHECK(func_entry); 51 | return func_entry(message); 52 | } 53 | 54 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, 55 | const nvtxEventAttributes_t *eventAttrib) { 56 | HOOK_TRACE_PROFILE("nvtxDomainRangeStartEx"); 57 | using func_ptr = nvtxRangeId_t (*)(nvtxDomainHandle_t, const nvtxEventAttributes_t *); 58 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainRangeStartEx")); 59 | HOOK_CHECK(func_entry); 60 | return func_entry(domain, eventAttrib); 61 | } 62 | 63 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxRangeStartEx(const nvtxEventAttributes_t *eventAttrib) { 64 | HOOK_TRACE_PROFILE("nvtxRangeStartEx"); 65 | using func_ptr = nvtxRangeId_t (*)(const nvtxEventAttributes_t *); 66 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangeStartEx")); 67 | HOOK_CHECK(func_entry); 68 | return func_entry(eventAttrib); 69 | } 70 | 71 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxRangeStartA(const char *message) { 72 | HOOK_TRACE_PROFILE("nvtxRangeStartA"); 73 | using func_ptr = nvtxRangeId_t (*)(const char *); 74 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangeStartA")); 75 | HOOK_CHECK(func_entry); 76 | return func_entry(message); 77 | } 78 | 79 | HOOK_C_API HOOK_DECL_EXPORT nvtxRangeId_t nvtxRangeStartW(const wchar_t *message) { 80 | HOOK_TRACE_PROFILE("nvtxRangeStartW"); 81 | using func_ptr = nvtxRangeId_t (*)(const wchar_t *); 82 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangeStartW")); 83 | HOOK_CHECK(func_entry); 84 | return func_entry(message); 85 | } 86 | 87 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id) { 88 | HOOK_TRACE_PROFILE("nvtxDomainRangeEnd"); 89 | using func_ptr = void (*)(nvtxDomainHandle_t, nvtxRangeId_t); 90 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainRangeEnd")); 91 | 
HOOK_CHECK(func_entry); 92 | return func_entry(domain, id); 93 | } 94 | 95 | HOOK_C_API HOOK_DECL_EXPORT void nvtxRangeEnd(nvtxRangeId_t id) { 96 | HOOK_TRACE_PROFILE("nvtxRangeEnd"); 97 | using func_ptr = void (*)(nvtxRangeId_t); 98 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangeEnd")); 99 | HOOK_CHECK(func_entry); 100 | return func_entry(id); 101 | } 102 | 103 | HOOK_C_API HOOK_DECL_EXPORT int nvtxDomainRangePushEx(nvtxDomainHandle_t domain, 104 | const nvtxEventAttributes_t *eventAttrib) { 105 | HOOK_TRACE_PROFILE("nvtxDomainRangePushEx"); 106 | using func_ptr = int (*)(nvtxDomainHandle_t, const nvtxEventAttributes_t *); 107 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainRangePushEx")); 108 | HOOK_CHECK(func_entry); 109 | return func_entry(domain, eventAttrib); 110 | } 111 | 112 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePushEx(const nvtxEventAttributes_t *eventAttrib) { 113 | HOOK_TRACE_PROFILE("nvtxRangePushEx"); 114 | using func_ptr = int (*)(const nvtxEventAttributes_t *); 115 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangePushEx")); 116 | HOOK_CHECK(func_entry); 117 | return func_entry(eventAttrib); 118 | } 119 | 120 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePushA(const char *message) { 121 | HOOK_TRACE_PROFILE("nvtxRangePushA"); 122 | using func_ptr = int (*)(const char *); 123 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangePushA")); 124 | HOOK_CHECK(func_entry); 125 | return func_entry(message); 126 | } 127 | 128 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePushW(const wchar_t *message) { 129 | HOOK_TRACE_PROFILE("nvtxRangePushW"); 130 | using func_ptr = int (*)(const wchar_t *); 131 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangePushW")); 132 | HOOK_CHECK(func_entry); 133 | return func_entry(message); 134 | } 135 | 136 | HOOK_C_API HOOK_DECL_EXPORT int nvtxDomainRangePop(nvtxDomainHandle_t domain) { 137 | 
HOOK_TRACE_PROFILE("nvtxDomainRangePop"); 138 | using func_ptr = int (*)(nvtxDomainHandle_t); 139 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainRangePop")); 140 | HOOK_CHECK(func_entry); 141 | return func_entry(domain); 142 | } 143 | 144 | HOOK_C_API HOOK_DECL_EXPORT int nvtxRangePop() { 145 | HOOK_TRACE_PROFILE("nvtxRangePop"); 146 | using func_ptr = int (*)(); 147 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxRangePop")); 148 | HOOK_CHECK(func_entry); 149 | return func_entry(); 150 | } 151 | 152 | HOOK_C_API HOOK_DECL_EXPORT nvtxResourceHandle_t nvtxDomainResourceCreate(nvtxDomainHandle_t domain, 153 | nvtxResourceAttributes_t *attribs) { 154 | HOOK_TRACE_PROFILE("nvtxDomainResourceCreate"); 155 | using func_ptr = nvtxResourceHandle_t (*)(nvtxDomainHandle_t, nvtxResourceAttributes_t *); 156 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainResourceCreate")); 157 | HOOK_CHECK(func_entry); 158 | return func_entry(domain, attribs); 159 | } 160 | 161 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainResourceDestroy(nvtxResourceHandle_t resource) { 162 | HOOK_TRACE_PROFILE("nvtxDomainResourceDestroy"); 163 | using func_ptr = void (*)(nvtxResourceHandle_t); 164 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainResourceDestroy")); 165 | HOOK_CHECK(func_entry); 166 | return func_entry(resource); 167 | } 168 | 169 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, 170 | const char *name) { 171 | HOOK_TRACE_PROFILE("nvtxDomainNameCategoryA"); 172 | using func_ptr = void (*)(nvtxDomainHandle_t, uint32_t, const char *); 173 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainNameCategoryA")); 174 | HOOK_CHECK(func_entry); 175 | return func_entry(domain, category, name); 176 | } 177 | 178 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, 179 | const wchar_t *name) { 
180 | HOOK_TRACE_PROFILE("nvtxDomainNameCategoryW"); 181 | using func_ptr = void (*)(nvtxDomainHandle_t, uint32_t, const wchar_t *); 182 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainNameCategoryW")); 183 | HOOK_CHECK(func_entry); 184 | return func_entry(domain, category, name); 185 | } 186 | 187 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCategoryA(uint32_t category, const char *name) { 188 | HOOK_TRACE_PROFILE("nvtxNameCategoryA"); 189 | using func_ptr = void (*)(uint32_t, const char *); 190 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCategoryA")); 191 | HOOK_CHECK(func_entry); 192 | return func_entry(category, name); 193 | } 194 | 195 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCategoryW(uint32_t category, const wchar_t *name) { 196 | HOOK_TRACE_PROFILE("nvtxNameCategoryW"); 197 | using func_ptr = void (*)(uint32_t, const wchar_t *); 198 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCategoryW")); 199 | HOOK_CHECK(func_entry); 200 | return func_entry(category, name); 201 | } 202 | 203 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameOsThreadA(uint32_t threadId, const char *name) { 204 | HOOK_TRACE_PROFILE("nvtxNameOsThreadA"); 205 | using func_ptr = void (*)(uint32_t, const char *); 206 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameOsThreadA")); 207 | HOOK_CHECK(func_entry); 208 | return func_entry(threadId, name); 209 | } 210 | 211 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameOsThreadW(uint32_t threadId, const wchar_t *name) { 212 | HOOK_TRACE_PROFILE("nvtxNameOsThreadW"); 213 | using func_ptr = void (*)(uint32_t, const wchar_t *); 214 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameOsThreadW")); 215 | HOOK_CHECK(func_entry); 216 | return func_entry(threadId, name); 217 | } 218 | 219 | HOOK_C_API HOOK_DECL_EXPORT nvtxStringHandle_t nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, 220 | const char *string) { 221 | 
HOOK_TRACE_PROFILE("nvtxDomainRegisterStringA"); 222 | using func_ptr = nvtxStringHandle_t (*)(nvtxDomainHandle_t, const char *); 223 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainRegisterStringA")); 224 | HOOK_CHECK(func_entry); 225 | return func_entry(domain, string); 226 | } 227 | 228 | HOOK_C_API HOOK_DECL_EXPORT nvtxStringHandle_t nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, 229 | const wchar_t *string) { 230 | HOOK_TRACE_PROFILE("nvtxDomainRegisterStringW"); 231 | using func_ptr = nvtxStringHandle_t (*)(nvtxDomainHandle_t, const wchar_t *); 232 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainRegisterStringW")); 233 | HOOK_CHECK(func_entry); 234 | return func_entry(domain, string); 235 | } 236 | 237 | HOOK_C_API HOOK_DECL_EXPORT nvtxDomainHandle_t nvtxDomainCreateA(const char *name) { 238 | HOOK_TRACE_PROFILE("nvtxDomainCreateA"); 239 | using func_ptr = nvtxDomainHandle_t (*)(const char *); 240 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainCreateA")); 241 | HOOK_CHECK(func_entry); 242 | return func_entry(name); 243 | } 244 | 245 | HOOK_C_API HOOK_DECL_EXPORT nvtxDomainHandle_t nvtxDomainCreateW(const wchar_t *name) { 246 | HOOK_TRACE_PROFILE("nvtxDomainCreateW"); 247 | using func_ptr = nvtxDomainHandle_t (*)(const wchar_t *); 248 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainCreateW")); 249 | HOOK_CHECK(func_entry); 250 | return func_entry(name); 251 | } 252 | 253 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainDestroy(nvtxDomainHandle_t domain) { 254 | HOOK_TRACE_PROFILE("nvtxDomainDestroy"); 255 | using func_ptr = void (*)(nvtxDomainHandle_t); 256 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainDestroy")); 257 | HOOK_CHECK(func_entry); 258 | return func_entry(domain); 259 | } 260 | 261 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuDeviceA(CUdevice device, const char *name) { 262 | HOOK_TRACE_PROFILE("nvtxNameCuDeviceA"); 263 | 
using func_ptr = void (*)(CUdevice, const char *); 264 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuDeviceA")); 265 | HOOK_CHECK(func_entry); 266 | return func_entry(device, name); 267 | } 268 | 269 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuDeviceW(CUdevice device, const wchar_t *name) { 270 | HOOK_TRACE_PROFILE("nvtxNameCuDeviceW"); 271 | using func_ptr = void (*)(CUdevice, const wchar_t *); 272 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuDeviceW")); 273 | HOOK_CHECK(func_entry); 274 | return func_entry(device, name); 275 | } 276 | 277 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuContextA(CUcontext context, const char *name) { 278 | HOOK_TRACE_PROFILE("nvtxNameCuContextA"); 279 | using func_ptr = void (*)(CUcontext, const char *); 280 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuContextA")); 281 | HOOK_CHECK(func_entry); 282 | return func_entry(context, name); 283 | } 284 | 285 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuContextW(CUcontext context, const wchar_t *name) { 286 | HOOK_TRACE_PROFILE("nvtxNameCuContextW"); 287 | using func_ptr = void (*)(CUcontext, const wchar_t *); 288 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuContextW")); 289 | HOOK_CHECK(func_entry); 290 | return func_entry(context, name); 291 | } 292 | 293 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuStreamA(CUstream stream, const char *name) { 294 | HOOK_TRACE_PROFILE("nvtxNameCuStreamA"); 295 | using func_ptr = void (*)(CUstream, const char *); 296 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuStreamA")); 297 | HOOK_CHECK(func_entry); 298 | return func_entry(stream, name); 299 | } 300 | 301 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuStreamW(CUstream stream, const wchar_t *name) { 302 | HOOK_TRACE_PROFILE("nvtxNameCuStreamW"); 303 | using func_ptr = void (*)(CUstream, const wchar_t *); 304 | static auto func_entry = 
reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuStreamW")); 305 | HOOK_CHECK(func_entry); 306 | return func_entry(stream, name); 307 | } 308 | 309 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuEventA(CUevent event, const char *name) { 310 | HOOK_TRACE_PROFILE("nvtxNameCuEventA"); 311 | using func_ptr = void (*)(CUevent, const char *); 312 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuEventA")); 313 | HOOK_CHECK(func_entry); 314 | return func_entry(event, name); 315 | } 316 | 317 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCuEventW(CUevent event, const wchar_t *name) { 318 | HOOK_TRACE_PROFILE("nvtxNameCuEventW"); 319 | using func_ptr = void (*)(CUevent, const wchar_t *); 320 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCuEventW")); 321 | HOOK_CHECK(func_entry); 322 | return func_entry(event, name); 323 | } 324 | 325 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaDeviceA(int device, const char *name) { 326 | HOOK_TRACE_PROFILE("nvtxNameCudaDeviceA"); 327 | using func_ptr = void (*)(int, const char *); 328 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCudaDeviceA")); 329 | HOOK_CHECK(func_entry); 330 | return func_entry(device, name); 331 | } 332 | 333 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaDeviceW(int device, const wchar_t *name) { 334 | HOOK_TRACE_PROFILE("nvtxNameCudaDeviceW"); 335 | using func_ptr = void (*)(int, const wchar_t *); 336 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCudaDeviceW")); 337 | HOOK_CHECK(func_entry); 338 | return func_entry(device, name); 339 | } 340 | 341 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaStreamA(cudaStream_t stream, const char *name) { 342 | HOOK_TRACE_PROFILE("nvtxNameCudaStreamA"); 343 | using func_ptr = void (*)(cudaStream_t, const char *); 344 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCudaStreamA")); 345 | HOOK_CHECK(func_entry); 346 | return func_entry(stream, name); 347 | } 348 | 349 | 
HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t *name) { 350 | HOOK_TRACE_PROFILE("nvtxNameCudaStreamW"); 351 | using func_ptr = void (*)(cudaStream_t, const wchar_t *); 352 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCudaStreamW")); 353 | HOOK_CHECK(func_entry); 354 | return func_entry(stream, name); 355 | } 356 | 357 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaEventA(cudaEvent_t event, const char *name) { 358 | HOOK_TRACE_PROFILE("nvtxNameCudaEventA"); 359 | using func_ptr = void (*)(cudaEvent_t, const char *); 360 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCudaEventA")); 361 | HOOK_CHECK(func_entry); 362 | return func_entry(event, name); 363 | } 364 | 365 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameCudaEventW(cudaEvent_t event, const wchar_t *name) { 366 | HOOK_TRACE_PROFILE("nvtxNameCudaEventW"); 367 | using func_ptr = void (*)(cudaEvent_t, const wchar_t *); 368 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameCudaEventW")); 369 | HOOK_CHECK(func_entry); 370 | return func_entry(event, name); 371 | } 372 | 373 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClDeviceA(cl_device_id device, const char *name) { 374 | HOOK_TRACE_PROFILE("nvtxNameClDeviceA"); 375 | using func_ptr = void (*)(cl_device_id, const char *); 376 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClDeviceA")); 377 | HOOK_CHECK(func_entry); 378 | return func_entry(device, name); 379 | } 380 | 381 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClDeviceW(cl_device_id device, const wchar_t *name) { 382 | HOOK_TRACE_PROFILE("nvtxNameClDeviceW"); 383 | using func_ptr = void (*)(cl_device_id, const wchar_t *); 384 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClDeviceW")); 385 | HOOK_CHECK(func_entry); 386 | return func_entry(device, name); 387 | } 388 | 389 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClContextA(cl_context context, const char *name) { 
390 | HOOK_TRACE_PROFILE("nvtxNameClContextA"); 391 | using func_ptr = void (*)(cl_context, const char *); 392 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClContextA")); 393 | HOOK_CHECK(func_entry); 394 | return func_entry(context, name); 395 | } 396 | 397 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClContextW(cl_context context, const wchar_t *name) { 398 | HOOK_TRACE_PROFILE("nvtxNameClContextW"); 399 | using func_ptr = void (*)(cl_context, const wchar_t *); 400 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClContextW")); 401 | HOOK_CHECK(func_entry); 402 | return func_entry(context, name); 403 | } 404 | 405 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClCommandQueueA(cl_command_queue command_queue, const char *name) { 406 | HOOK_TRACE_PROFILE("nvtxNameClCommandQueueA"); 407 | using func_ptr = void (*)(cl_command_queue, const char *); 408 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClCommandQueueA")); 409 | HOOK_CHECK(func_entry); 410 | return func_entry(command_queue, name); 411 | } 412 | 413 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t *name) { 414 | HOOK_TRACE_PROFILE("nvtxNameClCommandQueueW"); 415 | using func_ptr = void (*)(cl_command_queue, const wchar_t *); 416 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClCommandQueueW")); 417 | HOOK_CHECK(func_entry); 418 | return func_entry(command_queue, name); 419 | } 420 | 421 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClMemObjectA(cl_mem memobj, const char *name) { 422 | HOOK_TRACE_PROFILE("nvtxNameClMemObjectA"); 423 | using func_ptr = void (*)(cl_mem, const char *); 424 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClMemObjectA")); 425 | HOOK_CHECK(func_entry); 426 | return func_entry(memobj, name); 427 | } 428 | 429 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClMemObjectW(cl_mem memobj, const wchar_t *name) { 430 | 
HOOK_TRACE_PROFILE("nvtxNameClMemObjectW"); 431 | using func_ptr = void (*)(cl_mem, const wchar_t *); 432 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClMemObjectW")); 433 | HOOK_CHECK(func_entry); 434 | return func_entry(memobj, name); 435 | } 436 | 437 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClSamplerA(cl_sampler sampler, const char *name) { 438 | HOOK_TRACE_PROFILE("nvtxNameClSamplerA"); 439 | using func_ptr = void (*)(cl_sampler, const char *); 440 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClSamplerA")); 441 | HOOK_CHECK(func_entry); 442 | return func_entry(sampler, name); 443 | } 444 | 445 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClSamplerW(cl_sampler sampler, const wchar_t *name) { 446 | HOOK_TRACE_PROFILE("nvtxNameClSamplerW"); 447 | using func_ptr = void (*)(cl_sampler, const wchar_t *); 448 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClSamplerW")); 449 | HOOK_CHECK(func_entry); 450 | return func_entry(sampler, name); 451 | } 452 | 453 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClProgramA(cl_program program, const char *name) { 454 | HOOK_TRACE_PROFILE("nvtxNameClProgramA"); 455 | using func_ptr = void (*)(cl_program, const char *); 456 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClProgramA")); 457 | HOOK_CHECK(func_entry); 458 | return func_entry(program, name); 459 | } 460 | 461 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClProgramW(cl_program program, const wchar_t *name) { 462 | HOOK_TRACE_PROFILE("nvtxNameClProgramW"); 463 | using func_ptr = void (*)(cl_program, const wchar_t *); 464 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClProgramW")); 465 | HOOK_CHECK(func_entry); 466 | return func_entry(program, name); 467 | } 468 | 469 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClEventA(cl_event evnt, const char *name) { 470 | HOOK_TRACE_PROFILE("nvtxNameClEventA"); 471 | using func_ptr = void (*)(cl_event, const char *); 472 | 
static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClEventA")); 473 | HOOK_CHECK(func_entry); 474 | return func_entry(evnt, name); 475 | } 476 | 477 | HOOK_C_API HOOK_DECL_EXPORT void nvtxNameClEventW(cl_event evnt, const wchar_t *name) { 478 | HOOK_TRACE_PROFILE("nvtxNameClEventW"); 479 | using func_ptr = void (*)(cl_event, const wchar_t *); 480 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxNameClEventW")); 481 | HOOK_CHECK(func_entry); 482 | return func_entry(evnt, name); 483 | } 484 | 485 | HOOK_C_API HOOK_DECL_EXPORT nvtxSyncUser_t nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, 486 | const nvtxSyncUserAttributes_t *attribs) { 487 | HOOK_TRACE_PROFILE("nvtxDomainSyncUserCreate"); 488 | using func_ptr = nvtxSyncUser_t (*)(nvtxDomainHandle_t, const nvtxSyncUserAttributes_t *); 489 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserCreate")); 490 | HOOK_CHECK(func_entry); 491 | return func_entry(domain, attribs); 492 | } 493 | 494 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle) { 495 | HOOK_TRACE_PROFILE("nvtxDomainSyncUserDestroy"); 496 | using func_ptr = void (*)(nvtxSyncUser_t); 497 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserDestroy")); 498 | HOOK_CHECK(func_entry); 499 | return func_entry(handle); 500 | } 501 | 502 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle) { 503 | HOOK_TRACE_PROFILE("nvtxDomainSyncUserAcquireStart"); 504 | using func_ptr = void (*)(nvtxSyncUser_t); 505 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserAcquireStart")); 506 | HOOK_CHECK(func_entry); 507 | return func_entry(handle); 508 | } 509 | 510 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle) { 511 | HOOK_TRACE_PROFILE("nvtxDomainSyncUserAcquireFailed"); 512 | using func_ptr = void (*)(nvtxSyncUser_t); 513 | static auto func_entry = 
reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserAcquireFailed")); 514 | HOOK_CHECK(func_entry); 515 | return func_entry(handle); 516 | } 517 | 518 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle) { 519 | HOOK_TRACE_PROFILE("nvtxDomainSyncUserAcquireSuccess"); 520 | using func_ptr = void (*)(nvtxSyncUser_t); 521 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserAcquireSuccess")); 522 | HOOK_CHECK(func_entry); 523 | return func_entry(handle); 524 | } 525 | 526 | HOOK_C_API HOOK_DECL_EXPORT void nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle) { 527 | HOOK_TRACE_PROFILE("nvtxDomainSyncUserReleasing"); 528 | using func_ptr = void (*)(nvtxSyncUser_t); 529 | static auto func_entry = reinterpret_cast(HOOK_NVTX_SYMBOL("nvtxDomainSyncUserReleasing")); 530 | HOOK_CHECK(func_entry); 531 | return func_entry(handle); 532 | } 533 | -------------------------------------------------------------------------------- /tools/code_generate/code_generate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022. All Rights Reserved. 
2 | # Author: Bruce-Lee-LY 3 | # Date: 23:47:56 on Sat, May 28, 2022 4 | # 5 | # Description: code generate for cuda-related dynamic libraries 6 | 7 | #!/usr/bin/python3 8 | # coding=utf-8 9 | 10 | from __future__ import print_function 11 | from __future__ import division 12 | from __future__ import absolute_import 13 | from __future__ import with_statement 14 | 15 | import os 16 | import optparse 17 | from CppHeaderParser import CppHeader 18 | 19 | 20 | class CodeGenerate(): 21 | def __init__(self, type_, file_, output_): 22 | self.type = type_ 23 | self.file = file_ 24 | self.output = output_ 25 | 26 | self.func_list = [] 27 | 28 | self.hook_file = self.output + "/" + self.type + "_hook.cpp" 29 | self.hook_list = [] 30 | self.hook_include = """ 31 | // auto generate $hook_num$ apis 32 | 33 | #include "$type$_subset.h" 34 | #include "hook.h" 35 | #include "macro_common.h" 36 | #include "trace_profile.h" 37 | """ 38 | self.hook_template = """ 39 | HOOK_C_API HOOK_DECL_EXPORT $ret$ $func_name$($func_param$) { 40 | HOOK_TRACE_PROFILE("$func_name$"); 41 | using func_ptr = $ret$ (*)($param_type$); 42 | static auto func_entry = reinterpret_cast(HOOK_$type$_SYMBOL("$func_name$")); 43 | HOOK_CHECK(func_entry); 44 | return func_entry($param_name$); 45 | } 46 | """ 47 | 48 | def parsę_header(self): 49 | self.header = CppHeader(self.file) 50 | print( 51 | "{} total func num: {}".format( 52 | self.type, len( 53 | self.header.functions))) 54 | 55 | def generate_func(self): 56 | for func in self.header.functions: 57 | func_name = func["name"] 58 | if func_name in self.func_list: 59 | continue 60 | else: 61 | self.func_list.append(func_name) 62 | 63 | ret = func["rtnType"].replace( 64 | "CUDAAPI", "").replace( 65 | "__CUDA_DEPRECATED", "").replace( 66 | "DECLDIR", "").replace( 67 | "CUDARTAPI_CDECL", "").replace( 68 | "CUDARTAPI", "").replace( 69 | "__host__", "").replace( 70 | "__cudart_builtin__", "").replace( 71 | "CUDNNWINAPI", "").replace( 72 | "CUBLASWINAPI", "").replace( 
73 | "CUBLASAPI", "").replace( 74 | "CUFFTAPI", "").replace( 75 | "NVTX_DECLSPEC", "").replace( 76 | "NVTX_API", "").replace( 77 | "CURANDAPI", "").replace( 78 | "CUSPARSEAPI", "").replace( 79 | "CUSOLVERAPI", "").replace( 80 | "NVJPEGAPI", "").strip(' ') 81 | 82 | func_param = "" 83 | param_type = "" 84 | param_name = "" 85 | for param in func["parameters"]: 86 | if len(func_param) > 0: 87 | func_param += ", " 88 | param_type += ", " 89 | param_name += ", " 90 | if param["array"] == 1: 91 | param["type"] += "*" 92 | func_param += (param["type"] + " " + param["name"]) 93 | param_type += param["type"] 94 | param_name += param["name"] 95 | 96 | hook_func = self.hook_template 97 | self.hook_list.append( 98 | hook_func.replace( 99 | "$ret$", 100 | ret).replace( 101 | "$func_name$", 102 | func_name).replace( 103 | "$func_param$", 104 | func_param).replace( 105 | "$param_type$", 106 | param_type).replace( 107 | "$param_name$", 108 | param_name).replace( 109 | "$type$", 110 | self.type.upper())) 111 | print("{} valid func num: {}".format(self.type, len(self.func_list))) 112 | 113 | def save_output(self): 114 | if not os.path.exists(self.output): 115 | os.makedirs(self.output) 116 | 117 | with open(self.hook_file, 'w') as fh: 118 | hook_include = self.hook_include.replace("$hook_num$", str( 119 | len(self.hook_list))).replace("$type$", self.type) 120 | fh.write(hook_include) 121 | for hook in self.hook_list: 122 | fh.write(hook) 123 | 124 | 125 | def main(): 126 | usage = "python3 code_generate.py -t/--type cuda -f/--file include/cuda.h -o/--output output" 127 | parser = optparse.OptionParser(usage) 128 | parser.add_option( 129 | '-t', 130 | '--type', 131 | dest='type', 132 | type='string', 133 | help='header type', 134 | default='cuda') 135 | parser.add_option( 136 | '-f', 137 | '--file', 138 | dest='file', 139 | type='string', 140 | help='header file', 141 | default='include/cuda.h') 142 | parser.add_option( 143 | '-o', 144 | '--output', 145 | dest='output', 146 | 
type='string', 147 | help='output path', 148 | default='output') 149 | 150 | options, args = parser.parse_args() 151 | type_ = options.type 152 | file_ = options.file 153 | output_ = options.output 154 | 155 | code_gen = CodeGenerate(type_, file_, output_) 156 | code_gen.parsę_header() 157 | code_gen.generate_func() 158 | code_gen.save_output() 159 | 160 | 161 | if __name__ == '__main__': 162 | main() 163 | -------------------------------------------------------------------------------- /tools/code_generate/code_generate.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2022. All Rights Reserved. 2 | # Author: Bruce-Lee-LY 3 | # Date: 23:56:07 on Sat, May 28, 2022 4 | # 5 | # Description: code generate script 6 | 7 | #!/bin/bash 8 | 9 | set -euo pipefail 10 | 11 | WORK_PATH=$(cd $(dirname $0) && pwd) && cd $WORK_PATH 12 | 13 | # python3 -m CppHeaderParser.tojson include/cuda.h > output/cuda.json 14 | python3 code_generate.py -t cuda -f include/cuda.h -o output 15 | # mkdir -p ../../src/cuda 16 | # cp output/cuda_hook.cpp ../../src/cuda 17 | 18 | # python3 -m CppHeaderParser.tojson include/nvml.h > output/nvml.json 19 | python3 code_generate.py -t nvml -f include/nvml.h -o output 20 | # mkdir -p ../../src/nvml 21 | # cp output/nvml_hook.cpp ../../src/nvml 22 | 23 | # python3 -m CppHeaderParser.tojson include/cuda_runtime_api.h > output/cuda_runtime_api.json 24 | python3 code_generate.py -t cudart -f include/cuda_runtime_api.h -o output 25 | # mkdir -p ../../src/cudart 26 | # cp output/cudart_hook.cpp ../../src/cudart 27 | 28 | # python3 -m CppHeaderParser.tojson include/cudnn.h > output/cudnn.json 29 | python3 code_generate.py -t cudnn -f include/cudnn.h -o output 30 | # mkdir -p ../../src/cudnn 31 | # cp output/cudnn_hook.cpp ../../src/cudnn 32 | 33 | # python3 -m CppHeaderParser.tojson include/cublas.h > output/cublas.json 34 | python3 code_generate.py -t cublas -f include/cublas.h -o output 35 | # mkdir -p 
../../src/cublas 36 | # cp output/cublas_hook.cpp ../../src/cublas 37 | 38 | # python3 -m CppHeaderParser.tojson include/cublasLt.h > output/cublasLt.json 39 | python3 code_generate.py -t cublasLt -f include/cublasLt.h -o output 40 | # mkdir -p ../../src/cublasLt 41 | # cp output/cublasLt_hook.cpp ../../src/cublasLt 42 | 43 | # python3 -m CppHeaderParser.tojson include/cufft.h > output/cufft.json 44 | python3 code_generate.py -t cufft -f include/cufft.h -o output 45 | # mkdir -p ../../src/cufft 46 | # cp output/cufft_hook.cpp ../../src/cufft 47 | 48 | # python3 -m CppHeaderParser.tojson include/nvToolsExt.h > output/nvToolsExt.json 49 | python3 code_generate.py -t nvtx -f include/nvToolsExt.h -o output 50 | # mkdir -p ../../src/nvtx 51 | # cp output/nvtx_hook.cpp ../../src/nvtx 52 | 53 | # python3 -m CppHeaderParser.tojson include/nvrtc.h > output/nvrtc.json 54 | python3 code_generate.py -t nvrtc -f include/nvrtc.h -o output 55 | # mkdir -p ../../src/nvrtc 56 | # cp output/nvrtc_hook.cpp ../../src/nvrtc 57 | 58 | # python3 -m CppHeaderParser.tojson include/curand.h > output/curand.json 59 | python3 code_generate.py -t curand -f include/curand.h -o output 60 | # mkdir -p ../../src/curand 61 | # cp output/curand_hook.cpp ../../src/curand 62 | 63 | # python3 -m CppHeaderParser.tojson include/cusparse.h > output/cusparse.json 64 | python3 code_generate.py -t cusparse -f include/cusparse.h -o output 65 | # mkdir -p ../../src/cusparse 66 | # cp output/cusparse_hook.cpp ../../src/cusparse 67 | 68 | # python3 -m CppHeaderParser.tojson include/cusolver_common.h > output/cusolver_common.json 69 | python3 code_generate.py -t cusolver -f include/cusolver_common.h -o output 70 | # mkdir -p ../../src/cusolver 71 | # cp output/cusolver_hook.cpp ../../src/cusolver 72 | 73 | # python3 -m CppHeaderParser.tojson include/nvjpeg.h > output/nvjpeg.json 74 | python3 code_generate.py -t nvjpeg -f include/nvjpeg.h -o output 75 | # mkdir -p ../../src/nvjpeg 76 | # cp output/nvjpeg_hook.cpp 
../../src/nvjpeg 77 | 78 | # python3 -m CppHeaderParser.tojson include/nvblas.h > output/nvblas.json 79 | python3 code_generate.py -t nvblas -f include/nvblas.h -o output 80 | # mkdir -p ../../src/nvblas 81 | # cp output/nvblas_hook.cpp ../../src/nvblas 82 | -------------------------------------------------------------------------------- /tools/code_generate/include/cufft.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2005-2021 NVIDIA Corporation. All rights reserved. 2 | * 3 | * NOTICE TO LICENSEE: 4 | * 5 | * The source code and/or documentation ("Licensed Deliverables") are 6 | * subject to NVIDIA intellectual property rights under U.S. and 7 | * international Copyright laws. 8 | * 9 | * The Licensed Deliverables contained herein are PROPRIETARY and 10 | * CONFIDENTIAL to NVIDIA and are being provided under the terms and 11 | * conditions of a form of NVIDIA software license agreement by and 12 | * between NVIDIA and Licensee ("License Agreement") or electronically 13 | * accepted by Licensee. Notwithstanding any terms or conditions to 14 | * the contrary in the License Agreement, reproduction or disclosure 15 | * of the Licensed Deliverables to any third party without the express 16 | * written consent of NVIDIA is prohibited. 17 | * 18 | * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE 19 | * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE 20 | * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE 21 | * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 22 | * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED 23 | * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, 24 | * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

// The following modification has been made to this copy of the header:
// (1) the content of cufftXt.h has been appended at the bottom

/*!
53 | * \file cufft.h 54 | * \brief Public header file for the NVIDIA CUDA FFT library (CUFFT) 55 | */ 56 | 57 | #ifndef _CUFFT_H_ 58 | #define _CUFFT_H_ 59 | 60 | 61 | #include "cuComplex.h" 62 | #include "driver_types.h" 63 | #include "library_types.h" 64 | 65 | #ifndef CUFFTAPI 66 | #ifdef _WIN32 67 | #define CUFFTAPI __stdcall 68 | #elif __GNUC__ >= 4 69 | #define CUFFTAPI __attribute__ ((visibility ("default"))) 70 | #else 71 | #define CUFFTAPI 72 | #endif 73 | #endif 74 | 75 | #ifdef __cplusplus 76 | extern "C" { 77 | #endif 78 | 79 | #define CUFFT_VER_MAJOR 10 80 | #define CUFFT_VER_MINOR 5 81 | #define CUFFT_VER_PATCH 2 82 | #define CUFFT_VER_BUILD 100 83 | 84 | // cuFFT library version 85 | // 86 | // CUFFT_VERSION / 1000 - major version 87 | // CUFFT_VERSION / 100 % 100 - minor version 88 | // CUFFT_VERSION % 100 - patch level 89 | #define CUFFT_VERSION 10502 90 | 91 | // CUFFT API function return values 92 | typedef enum cufftResult_t { 93 | CUFFT_SUCCESS = 0x0, 94 | CUFFT_INVALID_PLAN = 0x1, 95 | CUFFT_ALLOC_FAILED = 0x2, 96 | CUFFT_INVALID_TYPE = 0x3, 97 | CUFFT_INVALID_VALUE = 0x4, 98 | CUFFT_INTERNAL_ERROR = 0x5, 99 | CUFFT_EXEC_FAILED = 0x6, 100 | CUFFT_SETUP_FAILED = 0x7, 101 | CUFFT_INVALID_SIZE = 0x8, 102 | CUFFT_UNALIGNED_DATA = 0x9, 103 | CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA, 104 | CUFFT_INVALID_DEVICE = 0xB, 105 | CUFFT_PARSE_ERROR = 0xC, 106 | CUFFT_NO_WORKSPACE = 0xD, 107 | CUFFT_NOT_IMPLEMENTED = 0xE, 108 | CUFFT_LICENSE_ERROR = 0x0F, 109 | CUFFT_NOT_SUPPORTED = 0x10 110 | 111 | } cufftResult; 112 | 113 | #define MAX_CUFFT_ERROR 0x11 114 | 115 | 116 | // CUFFT defines and supports the following data types 117 | 118 | 119 | // cufftReal is a single-precision, floating-point real data type. 120 | // cufftDoubleReal is a double-precision, real data type. 
121 | typedef float cufftReal; 122 | typedef double cufftDoubleReal; 123 | 124 | // cufftComplex is a single-precision, floating-point complex data type that 125 | // consists of interleaved real and imaginary components. 126 | // cufftDoubleComplex is the double-precision equivalent. 127 | typedef cuComplex cufftComplex; 128 | typedef cuDoubleComplex cufftDoubleComplex; 129 | 130 | // CUFFT transform directions 131 | #define CUFFT_FORWARD -1 // Forward FFT 132 | #define CUFFT_INVERSE 1 // Inverse FFT 133 | 134 | // CUFFT supports the following transform types 135 | typedef enum cufftType_t { 136 | CUFFT_R2C = 0x2a, // Real to Complex (interleaved) 137 | CUFFT_C2R = 0x2c, // Complex (interleaved) to Real 138 | CUFFT_C2C = 0x29, // Complex to Complex, interleaved 139 | CUFFT_D2Z = 0x6a, // Double to Double-Complex 140 | CUFFT_Z2D = 0x6c, // Double-Complex to Double 141 | CUFFT_Z2Z = 0x69 // Double-Complex to Double-Complex 142 | } cufftType; 143 | 144 | // CUFFT supports the following data layouts 145 | typedef enum cufftCompatibility_t { 146 | CUFFT_COMPATIBILITY_FFTW_PADDING = 0x01 // The default value 147 | } cufftCompatibility; 148 | 149 | #define CUFFT_COMPATIBILITY_DEFAULT CUFFT_COMPATIBILITY_FFTW_PADDING 150 | 151 | // 152 | // structure definition used by the shim between old and new APIs 153 | // 154 | #define MAX_SHIM_RANK 3 155 | 156 | // cufftHandle is a handle type used to store and access CUFFT plans. 
157 | typedef int cufftHandle; 158 | 159 | 160 | cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, 161 | int nx, 162 | cufftType type, 163 | int batch); 164 | 165 | cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan, 166 | int nx, int ny, 167 | cufftType type); 168 | 169 | cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan, 170 | int nx, int ny, int nz, 171 | cufftType type); 172 | 173 | cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan, 174 | int rank, 175 | int *n, 176 | int *inembed, int istride, int idist, 177 | int *onembed, int ostride, int odist, 178 | cufftType type, 179 | int batch); 180 | 181 | cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan, 182 | int nx, 183 | cufftType type, 184 | int batch, 185 | size_t *workSize); 186 | 187 | cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan, 188 | int nx, int ny, 189 | cufftType type, 190 | size_t *workSize); 191 | 192 | cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan, 193 | int nx, int ny, int nz, 194 | cufftType type, 195 | size_t *workSize); 196 | 197 | cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan, 198 | int rank, 199 | int *n, 200 | int *inembed, int istride, int idist, 201 | int *onembed, int ostride, int odist, 202 | cufftType type, 203 | int batch, 204 | size_t *workSize); 205 | 206 | cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan, 207 | int rank, 208 | long long int *n, 209 | long long int *inembed, 210 | long long int istride, 211 | long long int idist, 212 | long long int *onembed, 213 | long long int ostride, long long int odist, 214 | cufftType type, 215 | long long int batch, 216 | size_t * workSize); 217 | 218 | cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan, 219 | int rank, 220 | long long int *n, 221 | long long int *inembed, 222 | long long int istride, long long int idist, 223 | long long int *onembed, 224 | long long int ostride, long long int odist, 225 | cufftType type, 226 | long long int batch, 227 | size_t *workSize); 228 | 229 | 230 | 231 
| 232 | cufftResult CUFFTAPI cufftEstimate1d(int nx, 233 | cufftType type, 234 | int batch, 235 | size_t *workSize); 236 | 237 | cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny, 238 | cufftType type, 239 | size_t *workSize); 240 | 241 | cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz, 242 | cufftType type, 243 | size_t *workSize); 244 | 245 | cufftResult CUFFTAPI cufftEstimateMany(int rank, 246 | int *n, 247 | int *inembed, int istride, int idist, 248 | int *onembed, int ostride, int odist, 249 | cufftType type, 250 | int batch, 251 | size_t *workSize); 252 | 253 | cufftResult CUFFTAPI cufftCreate(cufftHandle * handle); 254 | 255 | cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle, 256 | int nx, 257 | cufftType type, 258 | int batch, 259 | size_t *workSize ); 260 | 261 | cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle, 262 | int nx, int ny, 263 | cufftType type, 264 | size_t *workSize); 265 | 266 | cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle, 267 | int nx, int ny, int nz, 268 | cufftType type, 269 | size_t *workSize); 270 | 271 | cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle, 272 | int rank, int *n, 273 | int *inembed, int istride, int idist, 274 | int *onembed, int ostride, int odist, 275 | cufftType type, int batch, size_t *workArea); 276 | 277 | cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize); 278 | 279 | cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea); 280 | 281 | cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate); 282 | 283 | cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan, 284 | cufftComplex *idata, 285 | cufftComplex *odata, 286 | int direction); 287 | 288 | cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan, 289 | cufftReal *idata, 290 | cufftComplex *odata); 291 | 292 | cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan, 293 | cufftComplex *idata, 294 | cufftReal *odata); 295 | 296 | cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle 
plan, 297 | cufftDoubleComplex *idata, 298 | cufftDoubleComplex *odata, 299 | int direction); 300 | 301 | cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan, 302 | cufftDoubleReal *idata, 303 | cufftDoubleComplex *odata); 304 | 305 | cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan, 306 | cufftDoubleComplex *idata, 307 | cufftDoubleReal *odata); 308 | 309 | 310 | // utility functions 311 | cufftResult CUFFTAPI cufftSetStream(cufftHandle plan, 312 | cudaStream_t stream); 313 | 314 | cufftResult CUFFTAPI cufftDestroy(cufftHandle plan); 315 | 316 | cufftResult CUFFTAPI cufftGetVersion(int *version); 317 | 318 | cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type, 319 | int *value); 320 | 321 | // cuda/targets/x86_64-linux/include/cufftXt.h 322 | // 323 | // cufftXtSubFormat identifies the data layout of 324 | // a memory descriptor owned by cufft. 325 | // note that multi GPU cufft does not yet support out-of-place transforms 326 | // 327 | 328 | typedef enum cufftXtSubFormat_t { 329 | CUFFT_XT_FORMAT_INPUT = 0x00, //by default input is in linear order across GPUs 330 | CUFFT_XT_FORMAT_OUTPUT = 0x01, //by default output is in scrambled order depending on transform 331 | CUFFT_XT_FORMAT_INPLACE = 0x02, //by default inplace is input order, which is linear across GPUs 332 | CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03, //shuffled output order after execution of the transform 333 | CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04, //shuffled input order prior to execution of 1D transforms 334 | CUFFT_FORMAT_UNDEFINED = 0x05 335 | } cufftXtSubFormat; 336 | 337 | // 338 | // cufftXtCopyType specifies the type of copy for cufftXtMemcpy 339 | // 340 | typedef enum cufftXtCopyType_t { 341 | CUFFT_COPY_HOST_TO_DEVICE = 0x00, 342 | CUFFT_COPY_DEVICE_TO_HOST = 0x01, 343 | CUFFT_COPY_DEVICE_TO_DEVICE = 0x02, 344 | CUFFT_COPY_UNDEFINED = 0x03 345 | } cufftXtCopyType; 346 | 347 | // 348 | // cufftXtQueryType specifies the type of query for cufftXtQueryPlan 349 | // 350 | typedef 
enum cufftXtQueryType_t { 351 | CUFFT_QUERY_1D_FACTORS = 0x00, 352 | CUFFT_QUERY_UNDEFINED = 0x01 353 | } cufftXtQueryType; 354 | 355 | typedef struct cufftXt1dFactors_t { 356 | long long int size; 357 | long long int stringCount; 358 | long long int stringLength; 359 | long long int substringLength; 360 | long long int factor1; 361 | long long int factor2; 362 | long long int stringMask; 363 | long long int substringMask; 364 | long long int factor1Mask; 365 | long long int factor2Mask; 366 | int stringShift; 367 | int substringShift; 368 | int factor1Shift; 369 | int factor2Shift; 370 | } cufftXt1dFactors; 371 | 372 | // 373 | // cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy 374 | // 375 | typedef enum cufftXtWorkAreaPolicy_t { 376 | CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */ 377 | CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */ 378 | CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */ 379 | } cufftXtWorkAreaPolicy; 380 | 381 | // multi-GPU routines 382 | cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs); 383 | 384 | cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan, 385 | cudaLibXtDesc ** descriptor, 386 | cufftXtSubFormat format); 387 | 388 | cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan, 389 | void *dstPointer, 390 | void *srcPointer, 391 | cufftXtCopyType type); 392 | 393 | cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor); 394 | 395 | cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea); 396 | 397 | cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan, 398 | cudaLibXtDesc *input, 399 | cudaLibXtDesc *output, 400 | int direction); 401 | 402 | cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan, 403 | cudaLibXtDesc *input, 404 | cudaLibXtDesc *output); 405 | 406 | cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan, 407 | cudaLibXtDesc *input, 408 | cudaLibXtDesc *output); 
409 | 410 | cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan, 411 | cudaLibXtDesc *input, 412 | cudaLibXtDesc *output, 413 | int direction); 414 | 415 | cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan, 416 | cudaLibXtDesc *input, 417 | cudaLibXtDesc *output); 418 | 419 | cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan, 420 | cudaLibXtDesc *input, 421 | cudaLibXtDesc *output); 422 | 423 | // Utility functions 424 | 425 | cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType); 426 | 427 | 428 | // callbacks 429 | 430 | 431 | typedef enum cufftXtCallbackType_t { 432 | CUFFT_CB_LD_COMPLEX = 0x0, 433 | CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1, 434 | CUFFT_CB_LD_REAL = 0x2, 435 | CUFFT_CB_LD_REAL_DOUBLE = 0x3, 436 | CUFFT_CB_ST_COMPLEX = 0x4, 437 | CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5, 438 | CUFFT_CB_ST_REAL = 0x6, 439 | CUFFT_CB_ST_REAL_DOUBLE = 0x7, 440 | CUFFT_CB_UNDEFINED = 0x8 441 | 442 | } cufftXtCallbackType; 443 | 444 | typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); 445 | typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); 446 | typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); 447 | typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); 448 | 449 | typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer); 450 | typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer); 451 | typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer); 452 | typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void 
*callerInfo, void *sharedPointer); 453 | 454 | 455 | cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info); 456 | cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType); 457 | cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize); 458 | 459 | cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan, 460 | int rank, 461 | long long int *n, 462 | long long int *inembed, 463 | long long int istride, 464 | long long int idist, 465 | cudaDataType inputtype, 466 | long long int *onembed, 467 | long long int ostride, 468 | long long int odist, 469 | cudaDataType outputtype, 470 | long long int batch, 471 | size_t *workSize, 472 | cudaDataType executiontype); 473 | 474 | cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan, 475 | int rank, 476 | long long int *n, 477 | long long int *inembed, 478 | long long int istride, 479 | long long int idist, 480 | cudaDataType inputtype, 481 | long long int *onembed, 482 | long long int ostride, 483 | long long int odist, 484 | cudaDataType outputtype, 485 | long long int batch, 486 | size_t *workSize, 487 | cudaDataType executiontype); 488 | 489 | 490 | cufftResult CUFFTAPI cufftXtExec(cufftHandle plan, 491 | void *input, 492 | void *output, 493 | int direction); 494 | 495 | cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan, 496 | cudaLibXtDesc *input, 497 | cudaLibXtDesc *output, 498 | int direction); 499 | 500 | cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize); 501 | 502 | #ifdef __cplusplus 503 | } 504 | #endif 505 | 506 | #endif /* _CUFFT_H_ */ 507 | --------------------------------------------------------------------------------