├── .github └── workflows │ └── cmake-single-platform.yml ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.MD ├── driverapi ├── CMakeLists.txt ├── include │ ├── librecuda.h │ ├── librecuda_all_statuses.h │ └── librecuda_status.h ├── internal │ ├── cmdqueue.h │ ├── librecuda_internal.h │ ├── librecuda_status_internal.h │ ├── memcopy.h │ ├── memcopy_kernels.h │ └── nvidia │ │ ├── alloc_channel.h │ │ ├── cc_drv.h │ │ ├── cl0040.h │ │ ├── cl0080.h │ │ ├── cl0080_notification.h │ │ ├── cl2080.h │ │ ├── cl2080_notification.h │ │ ├── cl9010.h │ │ ├── cl9010_callback.h │ │ ├── cl9067.h │ │ ├── cl90f1.h │ │ ├── cla06c.h │ │ ├── cla0c0qmd.h │ │ ├── clc3c0qmd.h │ │ ├── clc461.h │ │ ├── clc56f.h │ │ ├── clc5c0qmd.h │ │ ├── clc6b5.h │ │ ├── clc6c0.h │ │ ├── clc6c0qmd.h │ │ ├── cpuopsys.h │ │ ├── ctrl0000base.h │ │ ├── ctrl0000gpu.h │ │ ├── ctrl0000system.h │ │ ├── ctrl0073base.h │ │ ├── ctrl0073system.h │ │ ├── ctrl0080base.h │ │ ├── ctrl0080bsp.h │ │ ├── ctrl0080gpu.h │ │ ├── ctrl0080gr.h │ │ ├── ctrl0080msenc.h │ │ ├── ctrl2080base.h │ │ ├── ctrl2080fifo.h │ │ ├── ctrl2080gpu.h │ │ ├── ctrl2080gr.h │ │ ├── ctrl2080internal.h │ │ ├── ctrl2080mc.1.h │ │ ├── ctrl2080mc.h │ │ ├── ctrl30f1.h │ │ ├── ctrl906f.h │ │ ├── ctrl90f1.h │ │ ├── ctrla06c.h │ │ ├── ctrla06f.h │ │ ├── ctrla06fbase.h │ │ ├── ctrla06fevent.h │ │ ├── ctrla06fgpfifo.h │ │ ├── ctrla06finternal.h │ │ ├── ctrlc36f.h │ │ ├── ctrlxxxx.h │ │ ├── dev_mmu.h │ │ ├── g_allclasses.h │ │ ├── mmu_fmt_types.h │ │ ├── nv-ioctl-numbers.h │ │ ├── nv-ioctl.h │ │ ├── nv-unix-nvos-params-wrappers.h │ │ ├── nvCpuUuid.h │ │ ├── nv_escape.h │ │ ├── nvcfg_sdk.h │ │ ├── nvgputypes.h │ │ ├── nvimpshared.h │ │ ├── nvlimits.h │ │ ├── nvmisc.h │ │ ├── nvos.h │ │ ├── nvstatus.h │ │ ├── nvstatuscodes.h │ │ ├── nvtypes.h │ │ ├── rs_access.h │ │ ├── uvm_ioctl.h │ │ ├── uvm_linux_ioctl.h │ │ └── uvm_types.h ├── kernels │ └── memcpy │ │ ├── compile_memcpy.sh │ │ ├── generate_header.py │ │ ├── memcopy_kernels.h │ │ ├── 
memcpy.cu │ │ └── output │ │ ├── memcpy_sm_50.cubin │ │ ├── memcpy_sm_50.ptx │ │ ├── memcpy_sm_52.cubin │ │ ├── memcpy_sm_52.ptx │ │ ├── memcpy_sm_53.cubin │ │ ├── memcpy_sm_53.ptx │ │ ├── memcpy_sm_60.cubin │ │ ├── memcpy_sm_60.ptx │ │ ├── memcpy_sm_61.cubin │ │ ├── memcpy_sm_61.ptx │ │ ├── memcpy_sm_62.cubin │ │ ├── memcpy_sm_62.ptx │ │ ├── memcpy_sm_70.cubin │ │ ├── memcpy_sm_70.ptx │ │ ├── memcpy_sm_72.cubin │ │ ├── memcpy_sm_72.ptx │ │ ├── memcpy_sm_75.cubin │ │ ├── memcpy_sm_75.ptx │ │ ├── memcpy_sm_80.cubin │ │ ├── memcpy_sm_80.ptx │ │ ├── memcpy_sm_86.cubin │ │ ├── memcpy_sm_86.ptx │ │ ├── memcpy_sm_87.cubin │ │ ├── memcpy_sm_87.ptx │ │ ├── memcpy_sm_89.cubin │ │ ├── memcpy_sm_89.ptx │ │ ├── memcpy_sm_90.cubin │ │ └── memcpy_sm_90.ptx └── src │ ├── cmdqueue.cpp │ ├── librecuda.cpp │ ├── librecuda_status.cpp │ └── memcopy.cpp └── tests ├── CMakeLists.txt ├── async_kernels ├── CMakeLists.txt ├── main.cpp ├── write_float.asm ├── write_float.cu ├── write_float.cu.asm ├── write_float.cubin └── write_float.ptx ├── compile_cubin.sh ├── complex ├── CMakeLists.txt └── complex.ptx ├── compute_chronological_consistency ├── CMakeLists.txt ├── main.cpp ├── write_float.asm ├── write_float.cu ├── write_float.cu.asm ├── write_float.cubin └── write_float.ptx ├── dma_chronological_consistency ├── CMakeLists.txt └── main.cpp ├── dynamic_shared_mem ├── CMakeLists.txt ├── main.cpp ├── write_float.asm ├── write_float.cu ├── write_float.cubin └── write_float.ptx ├── indexing ├── CMakeLists.txt ├── main.cpp ├── write_float.asm ├── write_float.cu ├── write_float.cubin └── write_float.ptx ├── kernel_struct_param ├── CMakeLists.txt ├── main.cpp ├── read_from_struct.asm ├── read_from_struct.cu ├── read_from_struct.cubin └── read_from_struct.ptx ├── many_kernels_launch ├── CMakeLists.txt ├── empty_kernel.asm ├── empty_kernel.cu ├── empty_kernel.cubin ├── empty_kernel.ptx └── main.cpp ├── memcopy ├── CMakeLists.txt └── main.cpp ├── stream_events ├── CMakeLists.txt ├── main.cpp ├── 
write_float.asm ├── write_float.cu ├── write_float.cu.asm ├── write_float.cubin └── write_float.ptx └── write_float ├── CMakeLists.txt ├── main.cpp ├── write_float.asm ├── write_float.cu ├── write_float.cubin └── write_float.ptx /.github/workflows/cmake-single-platform.yml: -------------------------------------------------------------------------------- 1 | # This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage. 2 | # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml 3 | name: CMake on a single platform 4 | 5 | on: 6 | push: 7 | branches: [ "master" ] 8 | pull_request: 9 | branches: [ "master" ] 10 | 11 | env: 12 | # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) 13 | BUILD_TYPE: Release 14 | 15 | jobs: 16 | build: 17 | # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac. 18 | # You can convert this to a matrix build if you need cross-platform coverage. 19 | # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Configure CMake 26 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 
27 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 28 | run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} 29 | 30 | - name: Build 31 | # Build your program with the given configuration 32 | run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}} 33 | 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | 9 | # Precompiled Headers 10 | *.gch 11 | *.pch 12 | 13 | # Compiled Dynamic libraries 14 | *.so 15 | *.dll 16 | 17 | # Compiled Static libraries 18 | *.lai 19 | *.la 20 | *.a 21 | *.lib 22 | 23 | # Executables 24 | *.exe 25 | *.out 26 | 27 | # Visual Studio files 28 | *.sdf 29 | *.suo 30 | *.sln 31 | *.vcxproj 32 | 33 | # Cmake folders 34 | cmake-build-*/ 35 | build/ 36 | debug/ 37 | 38 | # IntelliJ 39 | .idea/ 40 | 41 | # Vscode 42 | .vscode/ -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/ELFIO"] 2 | path = third_party/ELFIO 3 | url = https://github.com/serge1/ELFIO 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(LibreCuda CXX) 3 | 4 | option(BUILD_LIBRECUDA_DRIVER_API_STATIC_LIB "Build the LibreCUDA driverapi as a static library" ON) 5 | include(CheckCXXCompilerFlag) 6 | CHECK_CXX_COMPILER_FLAG("-std=c++17" COMPILER_SUPPORTS_CXX17) 7 | if (COMPILER_SUPPORTS_CXX17) 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") 9 | add_definitions(-DCOMPILEDWITHC17) 10 | message(STATUS "Using flag -std=c++17.") 11 | 
else () 12 | message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++17 support. Please use a different C++ compiler.") 13 | endif () 14 | 15 | if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/third_party/ELFIO/elfio") 16 | message("-- ELFIO not found, fetching ELFIO...") 17 | execute_process(COMMAND git submodule update --init) 18 | add_subdirectory(third_party/ELFIO) 19 | else () 20 | message("-- ELFIO found.") 21 | add_subdirectory(third_party/ELFIO) 22 | endif () 23 | 24 | add_subdirectory(driverapi) 25 | 26 | add_subdirectory(tests) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 Michael Keiblinger 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining 5 | * a copy of this software and associated documentation files 6 | * (the "Software"), to deal in the Software without restriction, 7 | * including without limitation the rights to use, copy, modify, merge, 8 | * publish, distribute, sublicense, and/or sell copies of the Software, 9 | * and to permit persons to whom the Software is furnished to do so, 10 | * subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be 13 | * included in all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 21 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
22 | */ 23 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # LibreCUDA 2 | 3 | LibreCUDA is a project aimed at replacing the CUDA driver API to enable launching CUDA code on Nvidia GPUs without 4 | relying on the proprietary CUDA runtime. It achieves this by communicating directly with the hardware via ioctls 5 | (specifically what Nvidia's open-gpu-kernel-modules refer to as the rmapi), as well as QMD, Nvidia's MMIO command queue 6 | structure. LibreCUDA is capable of uploading CUDA ELF binaries onto the GPU and launching them via the command queue. 7 | 8 | ## Current features 9 | 10 | - Allocate and free GPU memory & map the memory to be accessible by the CPU 11 | - Upload CUDA kernels (CUDA ELF binaries) 12 | - Set dynamic shared memory for CUDA functions 13 | - Launch CUDA kernels 14 | - Supports cheap async kernel launches on a single stream 15 | - Host to device (DMA), device to device (Compute), device to host memcpy (DMA) 16 | - Supports cheap async memcpys on a single stream 17 | 18 | ## Example 19 | 20 | Below is an example demonstrating the usage of LibreCUDA: 21 | 22 | ```cpp 23 | int main() { 24 | libreCuInit(0); 25 | 26 | int device_count{}; 27 | libreCuDeviceGetCount(&device_count); 28 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 29 | 30 | LibreCUdevice device{}; 31 | libreCuDeviceGet(&device, 0); 32 | 33 | LibreCUcontext ctx{}; 34 | libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device); 35 | 36 | LibreCUmodule module{}; 37 | 38 | uint8_t *image; 39 | size_t n_bytes; 40 | { 41 | std::ifstream input("write_float.cubin", std::ios::binary); 42 | std::vector<uint8_t> bytes( 43 | (std::istreambuf_iterator<char>(input)), 44 | (std::istreambuf_iterator<char>())); 45 | input.close(); 46 | image = new uint8_t[bytes.size()]; 47 | doMemcpy(image, bytes.data(), bytes.size()); 48 | n_bytes = bytes.size(); 49 | } 50 | 
libreCuModuleLoadData(&module, image, n_bytes); 51 | 52 | uint32_t num_funcs{}; 53 | libreCuModuleGetFunctionCount(&num_funcs, module); 54 | std::cout << "Num functions: " << num_funcs << std::endl; 55 | 56 | auto *functions = new LibreCUFunction[num_funcs]; 57 | libreCuModuleEnumerateFunctions(functions, num_funcs, module); 58 | 59 | for (size_t i = 0; i < num_funcs; i++) { 60 | LibreCUFunction func = functions[i]; 61 | const char *func_name{}; 62 | libreCuFuncGetName(&func_name, func); 63 | std::cout << " function \"" << func_name << "\"" << std::endl; 64 | } 65 | 66 | delete[] functions; 67 | 68 | LibreCUFunction func{}; 69 | libreCuModuleGetFunction(&func, module, "write_float"); 70 | 71 | LibreCUstream stream{}; 72 | libreCuStreamCreate(&stream, 0); 73 | 74 | void *float_dst_va{}; 75 | libreCuMemAlloc(&float_dst_va, sizeof(float), true); 76 | 77 | float float_value = 3.1415f; 78 | void *float_src_va{}; 79 | libreCuMemAlloc(&float_src_va, sizeof(float), true); 80 | *(float *) (float_src_va) = float_value; 81 | 82 | std::cout << "Src value: " << float_value << std::endl; 83 | std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl; 84 | 85 | void *params[] = { 86 | &float_dst_va, // dst 87 | &float_src_va // src 88 | }; 89 | libreCuLaunchKernel(func, 90 | 1, 1, 1, 91 | 1, 1, 1, 92 | 0, 93 | stream, 94 | params, sizeof(params) / sizeof(void *), 95 | nullptr 96 | ); 97 | 98 | libreCuStreamCommence(stream); 99 | 100 | libreCuStreamAwait(stream); 101 | std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl; 102 | 103 | libreCuMemFree(float_dst_va); 104 | libreCuStreamDestroy(stream); 105 | libreCuModuleUnload(module); 106 | libreCuCtxDestroy(ctx); 107 | return 0; 108 | } 109 | ``` 110 | 111 | ### Outputs 112 | 113 | ```console 114 | Device count: 1 115 | Num functions: 1 116 | function "write_float" 117 | Src value: 3.1415 118 | Dst value (pre exec): 0 119 | Dst value (post exec): 3.1415 120 | ``` 121 | 122 | ### How 
to Use 123 | 124 | The recommended way to use LibreCUDA is to clone the LibreCUDA repository and link against the `driverapi` library in 125 | CMake: 126 | 127 | ``` 128 | git clone --recurse-submodules https://github.com/mikex86/LibreCuda.git 129 | ``` 130 | 131 | #### Add the repository as a CMake directory 132 | 133 | ```cmake 134 | add_subdirectory(LibreCuda) 135 | ``` 136 | 137 | #### Link against the driver api library 138 | 139 | ```cmake 140 | target_link_libraries(YourTarget PRIVATE driverapi) 141 | ``` 142 | 143 | #### Include headers 144 | 145 | ```c 146 | #include <librecuda.h> 147 | ``` 148 | 149 | ## Project Status 150 | 151 | The project is in its early stages and currently implements only rudimentary CUDA functions. It is not yet ready for 152 | production use. 153 | 154 | ## Contributing 155 | 156 | Contributions are welcome! Please submit issues and pull requests to help improve LibreCUDA. 157 | 158 | ## License 159 | 160 | This project is licensed under the MIT License. 161 | -------------------------------------------------------------------------------- /driverapi/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set( 2 | DRIVERAPI_SOURCES 3 | src/librecuda.cpp 4 | src/librecuda_status.cpp 5 | src/cmdqueue.cpp 6 | src/memcopy.cpp 7 | internal/memcopy.h 8 | internal/memcopy_kernels.h 9 | ) 10 | if (BUILD_LIBRECUDA_DRIVER_API_STATIC_LIB) 11 | add_library(driverapi STATIC ${DRIVERAPI_SOURCES}) 12 | else() 13 | add_library(driverapi SHARED ${DRIVERAPI_SOURCES}) 14 | endif() 15 | 16 | target_include_directories(driverapi PRIVATE "internal/") 17 | target_include_directories(driverapi PUBLIC "include/") 18 | target_link_libraries(driverapi PRIVATE elfio) 19 | -------------------------------------------------------------------------------- /driverapi/include/librecuda_all_statuses.h: -------------------------------------------------------------------------------- 1 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_SUCCESS, 0) 2 | 
LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_VALUE, 1) 3 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_OUT_OF_MEMORY, 2) 4 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_INITIALIZED, 3) 5 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_DEVICE, 101) 6 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_IMAGE, 200) 7 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_CONTEXT, 201) 8 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_FOUND, 500) 9 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_READY, 600) 10 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, 701) 11 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, 804) 12 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_UNKNOWN, 999) -------------------------------------------------------------------------------- /driverapi/include/librecuda_status.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBRECUDA_VALIDATE_H 2 | #define LIBRECUDA_VALIDATE_H 3 | 4 | #include 5 | 6 | typedef int libreCudaStatus_t; 7 | 8 | #define LIBRECUDA_DEBUG(msg) std::cerr << "[LibreCuda Debug]: " << msg << std::endl 9 | #define __LIBRECUDA_STRINGIFY(x) #x 10 | #define __LIBRECUDA_TOSTRING(x) __LIBRECUDA_STRINGIFY(x) 11 | 12 | #define LIBRECUDA_VALIDATE(condition, err) { \ 13 | if (!(condition)) { \ 14 | LIBRECUDA_DEBUG(__FILE__ ":" __LIBRECUDA_TOSTRING(__LINE__) ": " #condition); \ 15 | return err; \ 16 | } \ 17 | } 18 | 19 | // declare all cuda error codes 20 | #define LIBRECUDA_DECLARE_STATUS(status, code) extern "C" libreCudaStatus_t status; 21 | #include "librecuda_all_statuses.h" 22 | #undef LIBRECUDA_DECLARE_STATUS 23 | 24 | #define LIBRECUDA_SUCCEED() { return LIBRECUDA_SUCCESS; } 25 | #define LIBRECUDA_FAIL(status) { return status; } 26 | 27 | #define LIBRECUDA_ERR_PROPAGATE(status) { libreCudaStatus_t status_val = status; if (status_val != LIBRECUDA_SUCCESS) { LIBRECUDA_FAIL(status_val); } } 28 | 29 | #endif //LIBRECUDA_VALIDATE_H 30 | 
-------------------------------------------------------------------------------- /driverapi/internal/librecuda_status_internal.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBRECUDA_LIBRECUDA_STATUS_INTERNAL_H 2 | #define LIBRECUDA_LIBRECUDA_STATUS_INTERNAL_H 3 | 4 | void internalLibreCuInitStatusNames(); 5 | 6 | const char *internalLibreCuGetStatusName(int code); 7 | 8 | bool internalLibreCuInitStatusNamesInitialized(); 9 | 10 | #endif //LIBRECUDA_LIBRECUDA_STATUS_INTERNAL_H 11 | -------------------------------------------------------------------------------- /driverapi/internal/memcopy.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | libreCudaStatus_t loadMemcpyKernelsIfNeeded(LibreCUdevice device); 7 | 8 | libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async); -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cc_drv.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 
14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | #include "nvtypes.h" 27 | 28 | // 29 | // This file was generated with FINN, an NVIDIA coding tool. 30 | // Source file: cc_drv.finn 31 | // 32 | 33 | 34 | 35 | #include "nvtypes.h" 36 | #include "nvcfg_sdk.h" 37 | 38 | // CLASS NV_CONF_COMPUTE 39 | #define CC_AES_256_GCM_IV_SIZE_BYTES (0xcU) /* finn: Evaluated from "(96 / 8)" */ 40 | #define CC_AES_256_GCM_IV_SIZE_DWORD (0x3U) /* finn: Evaluated from "(CC_AES_256_GCM_IV_SIZE_BYTES / 4)" */ 41 | #define CC_AES_256_GCM_KEY_SIZE_BYTES (0x20U) /* finn: Evaluated from "(256 / 8)" */ 42 | #define CC_AES_256_GCM_KEY_SIZE_DWORD (0x8U) /* finn: Evaluated from "(CC_AES_256_GCM_KEY_SIZE_BYTES / 4)" */ 43 | 44 | #define CC_HMAC_NONCE_SIZE_BYTES (0x20U) /* finn: Evaluated from "(256 / 8)" */ 45 | #define CC_HMAC_NONCE_SIZE_DWORD (0x8U) /* finn: Evaluated from "(CC_HMAC_NONCE_SIZE_BYTES / 4)" */ 46 | #define CC_HMAC_KEY_SIZE_BYTES (0x20U) /* finn: Evaluated from "(256 / 8)" */ 47 | #define CC_HMAC_KEY_SIZE_DWORD (0x8U) /* finn: Evaluated from "(CC_HMAC_KEY_SIZE_BYTES / 4)" */ 48 | 49 | 50 | // Type is shared between CC control calls and RMKeyStore 51 | typedef enum ROTATE_IV_TYPE { 52 | ROTATE_IV_ENCRYPT = 0, // Rotate the IV for encryptBundle 53 | ROTATE_IV_DECRYPT = 1, // Rotate the IV for decryptBundle 54 | ROTATE_IV_HMAC = 2, // Rotate the IV for hmacBundle 55 | ROTATE_IV_ALL_VALID = 3, // Rotate the IV for all valid bundles in the KMB 56 | } ROTATE_IV_TYPE; 57 | 58 | // 
Status value written into NvNotification.Info16 59 | typedef enum KEY_ROTATION_STATUS { 60 | KEY_ROTATION_STATUS_IDLE = 0, // Key rotation complete/not in progress 61 | KEY_ROTATION_STATUS_PENDING = 1, // RM is waiting for clients to report their channels are idle for key rotation 62 | KEY_ROTATION_STATUS_IN_PROGRESS = 2, // Key rotation is in progress 63 | KEY_ROTATION_STATUS_FAILED_TIMEOUT = 3, // Key rotation timeout failure, RM will RC non-idle channels 64 | KEY_ROTATION_STATUS_FAILED_THRESHOLD = 4, // Key rotation failed because upper threshold was crossed, RM will RC non-idle channels 65 | KEY_ROTATION_STATUS_FAILED_ROTATION = 5, // Internal RM failure while rotating keys for a certain channel, RM will RC the channel 66 | KEY_ROTATION_STATUS_PENDING_TIMER_SUSPENDED = 6, // Key rotation timer suspended waiting for kernel key rotation to complete 67 | KEY_ROTATION_STATUS_MAX_COUNT = 7, 68 | } KEY_ROTATION_STATUS; 69 | 70 | typedef struct CC_AES_CRYPTOBUNDLE { 71 | NvU32 iv[CC_AES_256_GCM_IV_SIZE_DWORD]; 72 | NvU32 key[CC_AES_256_GCM_KEY_SIZE_DWORD]; 73 | NvU32 ivMask[CC_AES_256_GCM_IV_SIZE_DWORD]; 74 | } CC_AES_CRYPTOBUNDLE; 75 | typedef struct CC_AES_CRYPTOBUNDLE *PCC_AES_CRYPTOBUNDLE; 76 | 77 | typedef struct CC_HMAC_CRYPTOBUNDLE { 78 | NvU32 nonce[CC_HMAC_NONCE_SIZE_DWORD]; 79 | NvU32 key[CC_HMAC_KEY_SIZE_DWORD]; 80 | } CC_HMAC_CRYPTOBUNDLE; 81 | typedef struct CC_HMAC_CRYPTOBUNDLE *PCC_HMAC_CRYPTOBUNDLE; 82 | 83 | typedef struct CC_KMB { 84 | CC_AES_CRYPTOBUNDLE encryptBundle; // Bundle of encryption material 85 | 86 | union { 87 | CC_HMAC_CRYPTOBUNDLE hmacBundle; // HMAC bundle used for method stream authenticity 88 | CC_AES_CRYPTOBUNDLE decryptBundle; // Bundle of decryption material 89 | }; 90 | NvBool bIsWorkLaunch; // False if decryption parameters are valid 91 | } CC_KMB; 92 | typedef struct CC_KMB *PCC_KMB; 93 | 94 | typedef struct CC_CRYPTOBUNDLE_STATS { 95 | NV_DECLARE_ALIGNED(NvU64 numEncryptionsH2D, 8); 96 | NV_DECLARE_ALIGNED(NvU64 
numEncryptionsD2H, 8); 97 | NV_DECLARE_ALIGNED(NvU64 bytesEncryptedH2D, 8); 98 | NV_DECLARE_ALIGNED(NvU64 bytesEncryptedD2H, 8); 99 | } CC_CRYPTOBUNDLE_STATS; 100 | typedef struct CC_CRYPTOBUNDLE_STATS *PCC_CRYPTOBUNDLE_STATS; 101 | 102 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl0040.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2001-2001 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | 25 | #ifndef _cl0040_h_ 26 | #define _cl0040_h_ 27 | 28 | #ifdef __cplusplus 29 | extern "C" { 30 | #endif 31 | 32 | #include "nvtypes.h" 33 | 34 | #define NV01_MEMORY_LOCAL_USER (0x00000040) 35 | /* NvNotification[] fields and values */ 36 | #define NV040_NOTIFICATION_STATUS_ERROR_PROTECTION_FAULT (0x4000) 37 | /* pio method data structure */ 38 | typedef volatile struct _cl0040_tag0 { 39 | NvV32 Reserved00[0x7c0]; 40 | } Nv040Typedef, Nv01MemoryLocalUser; 41 | #define NV040_TYPEDEF Nv01MemoryLocalUser 42 | /* obsolete stuff */ 43 | #define NV01_MEMORY_USER (0x00000040) 44 | #define NV1_MEMORY_USER (0x00000040) 45 | #define Nv01MemoryUser Nv01MemoryLocalUser 46 | #define nv01MemoryUser Nv01MemoryLocalUser 47 | #define Nv1MemoryUser Nv01MemoryLocalUser 48 | #define nv1MemoryUser Nv01MemoryLocalUser 49 | #define nv01MemoryLocalUser Nv01MemoryLocalUser 50 | 51 | #ifdef __cplusplus 52 | }; /* extern "C" */ 53 | #endif 54 | 55 | #endif /* _cl0040_h_ */ 56 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl0080.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2001-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 
14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | #include "nvtypes.h" 27 | 28 | // 29 | // This file was generated with FINN, an NVIDIA coding tool. 30 | // Source file: class/cl0080.finn 31 | // 32 | 33 | #include "nvlimits.h" 34 | #include "cl0080_notification.h" 35 | 36 | #define NV01_DEVICE_0 (0x80U) /* finn: Evaluated from "NV0080_ALLOC_PARAMETERS_MESSAGE_ID" */ 37 | 38 | /* NvAlloc parameters */ 39 | #define NV0080_MAX_DEVICES NV_MAX_DEVICES 40 | 41 | /** 42 | * @brief Alloc param 43 | * 44 | * @param vaMode mode for virtual address space allocation 45 | * Three modes: 46 | * NV_DEVICE_ALLOCATION_VAMODE_OPTIONAL_MULTIPLE_VASPACES 47 | * NV_DEVICE_ALLOCATION_VAMODE_SINGLE_VASPACE 48 | * NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES 49 | * Detailed description of these modes is in nvos.h 50 | **/ 51 | 52 | #define NV0080_ALLOC_PARAMETERS_MESSAGE_ID (0x0080U) 53 | 54 | typedef struct NV0080_ALLOC_PARAMETERS { 55 | NvU32 deviceId; 56 | NvHandle hClientShare; 57 | NvHandle hTargetClient; 58 | NvHandle hTargetDevice; 59 | NvV32 flags; 60 | NV_DECLARE_ALIGNED(NvU64 vaSpaceSize, 8); 61 | NV_DECLARE_ALIGNED(NvU64 vaStartInternal, 8); 62 | NV_DECLARE_ALIGNED(NvU64 vaLimitInternal, 8); 63 | NvV32 vaMode; 64 | } NV0080_ALLOC_PARAMETERS; 65 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl0080_notification.h: -------------------------------------------------------------------------------- 1 | /* 2 | * 
SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | #ifndef _cl0080_notification_h_ 25 | #define _cl0080_notification_h_ 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | /* NvNotification[] fields and values */ 32 | #define NV080_NOTIFICATION_STATUS_ERROR_PROTECTION_FAULT (0x4000) 33 | 34 | /* pio method data structure */ 35 | typedef volatile struct _cl0080_tag0 { 36 | NvV32 Reserved00[0x7c0]; 37 | } Nv080Typedef, Nv01Device0; 38 | 39 | #define NV080_TYPEDEF Nv01Device0 40 | 41 | #ifdef __cplusplus 42 | }; /* extern "C" */ 43 | #endif 44 | 45 | #endif /* _cl0080_notification_h_ */ 46 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl2080.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2002-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | #include "nvtypes.h" 27 | 28 | // 29 | // This file was generated with FINN, an NVIDIA coding tool. 30 | // Source file: class/cl2080.finn 31 | // 32 | 33 | #include "nvlimits.h" 34 | #include "cl2080_notification.h" 35 | 36 | #define NV20_SUBDEVICE_0 (0x2080U) /* finn: Evaluated from "NV2080_ALLOC_PARAMETERS_MESSAGE_ID" */ 37 | 38 | /* NvAlloc parameters */ 39 | #define NV2080_MAX_SUBDEVICES NV_MAX_SUBDEVICES 40 | 41 | #define NV2080_ALLOC_PARAMETERS_MESSAGE_ID (0x2080U) 42 | 43 | typedef struct NV2080_ALLOC_PARAMETERS { 44 | NvU32 subDeviceId; 45 | } NV2080_ALLOC_PARAMETERS; 46 | 47 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl9010.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #pragma once 24 | 25 | #include "nvtypes.h" 26 | 27 | // 28 | // This file was generated with FINN, an NVIDIA coding tool. 29 | // Source file: class/cl9010.finn 30 | // 31 | 32 | #include "cl9010_callback.h" 33 | 34 | #define NV9010_VBLANK_CALLBACK (0x9010U) /* finn: Evaluated from "NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS_MESSAGE_ID" */ 35 | 36 | #define NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS_MESSAGE_ID (0x9010U) 37 | 38 | typedef struct NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS { 39 | NV_DECLARE_ALIGNED(NvP64 pProc, 8); // Routine to call at vblank time 40 | // A function pointer of OSVBLANKCALLBACKPROC 41 | NvV32 LogicalHead; // Logical Head 42 | NV_DECLARE_ALIGNED(NvP64 pParm1, 8); // pParm1 43 | NV_DECLARE_ALIGNED(NvP64 pParm2, 8); // pParm2 44 | } NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS; 45 | 46 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl9010_callback.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a 5 | * copy of this software and associated documentation files (the "Software"), 6 | * to deal in the Software without restriction, including without limitation 7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | * and/or sell copies of the Software, and to permit persons to whom the 9 | * Software is furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | * DEALINGS IN THE SOFTWARE. 21 | */ 22 | #ifndef SDK_CL9010_CALLBACK_H 23 | #define SDK_CL9010_CALLBACK_H 24 | 25 | typedef void (*OSVBLANKCALLBACKPROC)(NvP64 pParm1, NvP64 pParm2); 26 | 27 | #endif // SDK_CL9010_CALLBACK_H 28 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl9067.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2010-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include "nvtypes.h" 25 | 26 | #ifndef _cl9067_h_ 27 | #define _cl9067_h_ 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #define FERMI_CONTEXT_SHARE_A (0x00009067) 34 | 35 | #ifdef __cplusplus 36 | }; /* extern "C" */ 37 | #endif 38 | #endif // _cl9067_h_ 39 | 40 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cl90f1.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2011 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include "nvtypes.h" 25 | 26 | #ifndef _cl90f1_h_ 27 | #define _cl90f1_h_ 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #define FERMI_VASPACE_A (0x000090f1) 34 | 35 | #ifdef __cplusplus 36 | }; /* extern "C" */ 37 | #endif 38 | #endif // _cl90f1_h_ 39 | 40 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/cla06c.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2010-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include "nvtypes.h" 25 | 26 | #ifndef _cla06c_h_ 27 | #define _cla06c_h_ 28 | 29 | #ifdef __cplusplus 30 | extern "C" { 31 | #endif 32 | 33 | #define KEPLER_CHANNEL_GROUP_A (0x0000A06C) 34 | 35 | #ifdef __cplusplus 36 | }; /* extern "C" */ 37 | #endif 38 | #endif // _cla06c_h_ 39 | 40 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/clc461.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #ifndef _clc461_h_ 25 | #define _clc461_h_ 26 | 27 | #define TURING_USERMODE_A (0xc461) 28 | 29 | #endif // _clc461_h_ 30 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrl0000base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2005-2015 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 
28 | // Source file: ctrl/ctrl0000/ctrl0000base.finn 29 | // 30 | 31 | #include "ctrlxxxx.h" 32 | /* NV01_ROOT (client) control commands and parameters */ 33 | 34 | #define NV0000_CTRL_CMD(cat,idx) NVXXXX_CTRL_CMD(0x0000,NV0000_CTRL_##cat,idx) 35 | 36 | /* Client command categories (6bits) */ 37 | #define NV0000_CTRL_RESERVED (0x00) 38 | #define NV0000_CTRL_SYSTEM (0x01) 39 | #define NV0000_CTRL_GPU (0x02) 40 | #define NV0000_CTRL_GSYNC (0x03) 41 | #define NV0000_CTRL_DIAG (0x04) 42 | #define NV0000_CTRL_EVENT (0x05) 43 | #define NV0000_CTRL_NVD (0x06) 44 | #define NV0000_CTRL_SWINSTR (0x07) 45 | #define NV0000_CTRL_PROC (0x09) 46 | #define NV0000_CTRL_SYNC_GPU_BOOST (0x0A) 47 | #define NV0000_CTRL_GPUACCT (0x0B) 48 | #define NV0000_CTRL_VGPU (0x0C) 49 | #define NV0000_CTRL_CLIENT (0x0D) 50 | 51 | // per-OS categories start at highest category and work backwards 52 | #define NV0000_CTRL_OS_WINDOWS (0x3F) 53 | #define NV0000_CTRL_OS_MACOS (0x3E) 54 | #define NV0000_CTRL_OS_UNIX (0x3D) 55 | 56 | 57 | /* 58 | * NV0000_CTRL_CMD_NULL 59 | * 60 | * This command does nothing. 61 | * This command does not take any parameters. 62 | * 63 | * Possible status values returned are: 64 | * NV_OK 65 | */ 66 | #define NV0000_CTRL_CMD_NULL (0x0) /* finn: Evaluated from "(FINN_NV01_ROOT_RESERVED_INTERFACE_ID << 8) | 0x0" */ 67 | 68 | /* _ctrl0000_base_h_ */ 69 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrl0073base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 
28 | // Source file: ctrl/ctrl0073/ctrl0073base.finn 29 | // 30 | 31 | #include "ctrlxxxx.h" 32 | /* NV04_DISPLAY_COMMON control commands and parameters */ 33 | 34 | #define NV0073_CTRL_CMD(cat,idx) NVXXXX_CTRL_CMD(0x0073, NV0073_CTRL_##cat, idx) 35 | 36 | /* NV04_DISPLAY_COMMON command categories (6bits) */ 37 | #define NV0073_CTRL_RESERVED (0x00U) 38 | #define NV0073_CTRL_SYSTEM (0x01U) 39 | #define NV0073_CTRL_SPECIFIC (0x02U) 40 | #define NV0073_CTRL_EVENT (0x03U) 41 | #define NV0073_CTRL_INTERNAL (0x04U) 42 | #define NV0073_CTRL_COMMON (0x05U) 43 | #define NV0073_CTRL_DFP (0x11U) 44 | #define NV0073_CTRL_DP (0x13U) 45 | #define NV0073_CTRL_SVP (0x14U) 46 | #define NV0073_CTRL_DPU (0x15U) 47 | #define NV0073_CTRL_PSR (0x16U) 48 | #define NV0073_CTRL_STEREO (0x17U) 49 | 50 | /* 51 | * NV0073_CTRL_CMD_NULL 52 | * 53 | * This command does nothing. 54 | * This command does not take any parameters. 55 | * 56 | * Possible status values returned are: 57 | * NV_OK 58 | */ 59 | #define NV0073_CTRL_CMD_NULL (0x730000U) /* finn: Evaluated from "(FINN_NV04_DISPLAY_COMMON_RESERVED_INTERFACE_ID << 8) | 0x0" */ 60 | 61 | /* _ctrl0073base_h_ */ 62 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrl0080base.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 
28 | // Source file: ctrl/ctrl0080/ctrl0080base.finn 29 | // 30 | 31 | #include "ctrlxxxx.h" 32 | /* NV01_DEVICE_XX/NV03_DEVICE control commands and parameters */ 33 | 34 | #define NV0080_CTRL_CMD(cat,idx) NVXXXX_CTRL_CMD(0x0080, NV0080_CTRL_##cat, idx) 35 | 36 | /* GPU device command categories (6bits) */ 37 | #define NV0080_CTRL_RESERVED (0x00) 38 | #define NV0080_CTRL_BIF (0x01) 39 | #define NV0080_CTRL_GPU (0x02) 40 | #define NV0080_CTRL_CLK (0x10) 41 | #define NV0080_CTRL_GR (0x11) 42 | #define NV0080_CTRL_CIPHER (0x12) 43 | #define NV0080_CTRL_FB (0x13) 44 | #define NV0080_CTRL_HOST (0x14) 45 | #define NV0080_CTRL_VIDEO (0x15) 46 | #define NV0080_CTRL_FIFO (0x17) 47 | #define NV0080_CTRL_DMA (0x18) 48 | #define NV0080_CTRL_PERF (0x19) 49 | #define NV0080_CTRL_PERF_LEGACY_NON_PRIVILEGED (0x99) /* finn: Evaluated from "(NV0080_CTRL_PERF | NVxxxx_CTRL_LEGACY_NON_PRIVILEGED)" */ 50 | #define NV0080_CTRL_MSENC (0x1B) 51 | #define NV0080_CTRL_BSP (0x1C) 52 | #define NV0080_CTRL_RC (0x1D) 53 | #define NV0080_CTRL_OS_UNIX (0x1E) 54 | #define NV0080_CTRL_NVJPG (0x1F) 55 | #define NV0080_CTRL_INTERNAL (0x20) 56 | #define NV0080_CTRL_NVLINK (0x21) 57 | 58 | /* 59 | * NV0080_CTRL_CMD_NULL 60 | * 61 | * This command does nothing. 62 | * This command does not take any parameters. 
63 | * 64 | * Possible status values returned are: 65 | * NV_OK 66 | */ 67 | #define NV0080_CTRL_CMD_NULL (0x800000) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_RESERVED_INTERFACE_ID << 8) | 0x0" */ 68 | 69 | /* _ctrl0080base_h_ */ 70 | 71 | /* extract device cap setting from specified category-specific caps table */ 72 | #define NV0080_CTRL_GET_CAP(cat,tbl,c) \ 73 | NV0080_CTRL_##cat##_GET_CAP(tbl, NV0080_CTRL_##cat##_CAPS_##c) 74 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrl0080bsp.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2014-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | #pragma once 24 | 25 | #include "nvtypes.h" 26 | 27 | // 28 | // This file was generated with FINN, an NVIDIA coding tool. 29 | // Source file: ctrl/ctrl0080/ctrl0080bsp.finn 30 | // 31 | 32 | #include "ctrl0080base.h" 33 | 34 | /* NV01_DEVICE_XX/NV03_DEVICE bit stream processor control commands and parameters */ 35 | 36 | /* 37 | * NV0080_CTRL_CMD_BSP_GET_CAPS 38 | * 39 | * This command returns the set of BSP capabilities for the device 40 | * in the form of an array of unsigned bytes. BSP capabilities 41 | * include supported features and required workarounds for the decoder 42 | * within the device, each represented by a byte offset into the 43 | * table and a bit position within that byte. 44 | * 45 | * capsTblSize 46 | * This parameter specifies the size in bytes of the caps table. 47 | * This value should be set to NV0080_CTRL_BSP_CAPS_TBL_SIZE. 48 | * capsTbl 49 | * This parameter specifies a pointer to the client's caps table buffer 50 | * into which the BSP caps bits will be transferred by the RM. 51 | * The caps table is an array of unsigned bytes. 52 | * instanceId 53 | * This parameter specifies the instance Id of NVDEC for which 54 | * cap bits are requested. 55 | * 56 | * Possible status values returned are: 57 | * NV_OK 58 | * NV_ERR_INVALID_PARAM_STRUCT 59 | * NV_ERR_INVALID_ARGUMENT 60 | */ 61 | 62 | #define NV0080_CTRL_CMD_BSP_GET_CAPS (0x801c01) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_BSP_INTERFACE_ID << 8) | NV0080_CTRL_BSP_GET_CAPS_PARAMS_MESSAGE_ID" */ 63 | 64 | #define NV0080_CTRL_BSP_GET_CAPS_PARAMS_MESSAGE_ID (0x1U) 65 | 66 | typedef struct NV0080_CTRL_BSP_GET_CAPS_PARAMS { 67 | NvU32 capsTblSize; 68 | NV_DECLARE_ALIGNED(NvP64 capsTbl, 8); 69 | NvU32 instanceId; 70 | } NV0080_CTRL_BSP_GET_CAPS_PARAMS; 71 | 72 | 73 | 74 | /* 75 | * Size in bytes of bsp caps table. This value should be one greater 76 | * than the largest byte_index value above. 
77 | */ 78 | #define NV0080_CTRL_BSP_CAPS_TBL_SIZE 8 79 | 80 | /* 81 | * NV0080_CTRL_CMD_BSP_GET_CAPS_V2 82 | * 83 | * This command returns the set of BSP capabilities for the device 84 | * in the form of an array of unsigned bytes. BSP capabilities 85 | * include supported features and required workarounds for the decoder 86 | * within the device, each represented by a byte offset into the 87 | * table and a bit position within that byte. 88 | * (The V2 version flattens the capsTbl array pointer). 89 | * 90 | * capsTbl 91 | * This parameter is an array of unsigned bytes where the BSP caps bits 92 | * will be transferred by the RM. 93 | * instanceId 94 | * This parameter specifies the instance Id of NVDEC for which 95 | * cap bits are requested. 96 | * 97 | * Possible status values returned are: 98 | * NV_OK 99 | * NV_ERR_INVALID_PARAM_STRUCT 100 | * NV_ERR_INVALID_ARGUMENT 101 | */ 102 | 103 | #define NV0080_CTRL_CMD_BSP_GET_CAPS_V2 (0x801c02) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_BSP_INTERFACE_ID << 8) | NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2_MESSAGE_ID" */ 104 | 105 | #define NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2_MESSAGE_ID (0x2U) 106 | 107 | typedef struct NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2 { 108 | NvU8 capsTbl[NV0080_CTRL_BSP_CAPS_TBL_SIZE]; 109 | NvU32 instanceId; 110 | } NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2; 111 | 112 | /* _ctrl0080bsp_h_ */ 113 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrl0080msenc.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2004-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | #include "nvtypes.h" 27 | 28 | // 29 | // This file was generated with FINN, an NVIDIA coding tool. 30 | // Source file: ctrl/ctrl0080/ctrl0080msenc.finn 31 | // 32 | 33 | #include "ctrl0080base.h" 34 | 35 | /* NV01_DEVICE_XX/NV03_DEVICE MSENC control commands and parameters */ 36 | 37 | /* 38 | * NV0080_CTRL_CMD_MSENC_GET_CAPS 39 | * 40 | * This command returns the set of MSENC capabilities for the device 41 | * in the form of an array of unsigned bytes. MSENC capabilities 42 | * include supported features and required workarounds for the MSENC-related 43 | * engine(s) within the device, each represented by a byte offset into 44 | * the table and a bit position within that byte. 
45 | * 46 | * capsTblSize 47 | * This parameter specifies the size in bytes of the caps table. 48 | * This value should be set to NV0080_CTRL_MSENC_CAPS_TBL_SIZE. 49 | * capsTbl 50 | * This parameter specifies a pointer to the client's caps table buffer 51 | * into which the MSENC caps bits will be transferred by the RM. 52 | * The caps table is an array of unsigned bytes. 53 | * 54 | * Possible status values returned are: 55 | * NV_OK 56 | * NV_ERR_INVALID_PARAM_STRUCT 57 | * NV_ERR_INVALID_ARGUMENT 58 | */ 59 | #define NV0080_CTRL_CMD_MSENC_GET_CAPS (0x801b01) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_MSENC_INTERFACE_ID << 8) | NV0080_CTRL_MSENC_GET_CAPS_PARAMS_MESSAGE_ID" */ 60 | 61 | #define NV0080_CTRL_MSENC_GET_CAPS_PARAMS_MESSAGE_ID (0x1U) 62 | 63 | typedef struct NV0080_CTRL_MSENC_GET_CAPS_PARAMS { 64 | NvU32 capsTblSize; 65 | NV_DECLARE_ALIGNED(NvP64 capsTbl, 8); 66 | } NV0080_CTRL_MSENC_GET_CAPS_PARAMS; 67 | 68 | 69 | 70 | /* size in bytes of MSENC caps table */ 71 | #define NV0080_CTRL_MSENC_CAPS_TBL_SIZE 4 72 | 73 | /* 74 | * NV0080_CTRL_CMD_MSENC_GET_CAPS_V2 75 | * 76 | * This command is a version of NV0080_CTRL_CMD_MSENC_GET_CAPS with caps passed inline in capsTbl. 77 | * 78 | * For consistency with other video caps controls, it adds `instanceId` parameter. Currently it is 79 | * ignored. 
80 | */ 81 | #define NV0080_CTRL_CMD_MSENC_GET_CAPS_V2 (0x801b02) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_MSENC_INTERFACE_ID << 8) | NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS_MESSAGE_ID" */ 82 | 83 | #define NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS_MESSAGE_ID (0x2U) 84 | 85 | typedef struct NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS { 86 | NvU8 capsTbl[NV0080_CTRL_MSENC_CAPS_TBL_SIZE]; 87 | NvU32 instanceId; // ignored 88 | } NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS; 89 | 90 | /* _ctrl0080msenc_h_ */ 91 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrl2080mc.1.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2013-2023 NVIDIA Corporation 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to 6 | deal in the Software without restriction, including without limitation the 7 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | sell copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 
21 | 22 | *******************************************************************************/ 23 | 24 | #ifndef _ctrl2080mc_h_ 25 | #define _ctrl2080mc_h_ 26 | 27 | /* valid architecture values */ 28 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_T13X (0xE0000013) 29 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM000 (0x00000110) 30 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM200 (0x00000120) 31 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GP100 (0x00000130) 32 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GV100 (0x00000140) 33 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_TU100 (0x00000160) 34 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100 (0x00000170) 35 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100 (0x00000180) 36 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100 (0x00000190) 37 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100 (0x000001A0) 38 | 39 | /* valid ARCHITECTURE_GP10x implementation values */ 40 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GP100 (0x00000000) 41 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GP000 (0x00000001) 42 | 43 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GA100 (0x00000000) 44 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GA000 (0x00000001) 45 | #endif /* _ctrl2080mc_h_ */ 46 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrla06f.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2007-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 28 | // Source file: ctrl/ctrla06f.finn 29 | // 30 | 31 | 32 | 33 | #include "ctrlxxxx.h" 34 | #include "ctrla06fbase.h" 35 | #include "ctrla06fgpfifo.h" 36 | #include "ctrla06fevent.h" 37 | #include "ctrla06finternal.h" 38 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrla06fbase.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2007-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 28 | // Source file: ctrl/ctrla06f/ctrla06fbase.finn 29 | // 30 | 31 | 32 | 33 | 34 | /* GK100_GPFIFO control commands and parameters */ 35 | 36 | #include "ctrlxxxx.h" 37 | #include "ctrl906f.h" /* A06F is partially derived from 906F */ 38 | 39 | #define NVA06F_CTRL_CMD(cat,idx) \ 40 | NVXXXX_CTRL_CMD(0xA06F, NVA06F_CTRL_##cat, idx) 41 | 42 | /* GK100_GPFIFO command categories (6bits) */ 43 | #define NVA06F_CTRL_RESERVED (0x00) 44 | #define NVA06F_CTRL_GPFIFO (0x01) 45 | #define NVA06F_CTRL_EVENT (0x02) 46 | #define NVA06F_CTRL_INTERNAL (0x03) 47 | 48 | /* 49 | * NVA06F_CTRL_CMD_NULL 50 | * 51 | * This command does nothing. 52 | * This command does not take any parameters. 
53 | * 54 | * Possible status values returned are: 55 | * NV_OK 56 | * 57 | */ 58 | #define NVA06F_CTRL_CMD_NULL (0xa06f0000) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_RESERVED_INTERFACE_ID << 8) | 0x0" */ 59 | 60 | /* _ctrla06fbase_h_ */ 61 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrla06fevent.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2007-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | #include "nvtypes.h" 27 | 28 | // 29 | // This file was generated with FINN, an NVIDIA coding tool. 
30 | // Source file: ctrl/ctrla06f/ctrla06fevent.finn 31 | // 32 | 33 | #include "ctrla06fbase.h" 34 | 35 | /* 36 | * NVA06F_CTRL_CMD_EVENT_SET_NOTIFICATION 37 | * 38 | * This command sets event notification state for the associated channel. 39 | * This command requires that an instance of NV01_EVENT has been previously 40 | * bound to the associated channel object. 41 | * 42 | * event 43 | * This parameter specifies the type of event to which the specified 44 | * action is to be applied. This parameter must specify a valid 45 | * NVA06F_NOTIFIERS value (see cla06f.h for more details) and should 46 | * not exceed one less than NVA06F_NOTIFIERS_MAXCOUNT. 47 | * action 48 | * This parameter specifies the desired event notification action. 49 | * Valid notification actions include: 50 | * NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE 51 | * This action disables event notification for the specified 52 | * event for the associated channel object. 53 | * NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_SINGLE 54 | * This action enables single-shot event notification for the 55 | * specified event for the associated channel object. 56 | * NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT 57 | * This action enables repeated event notification for the specified 58 | * event for the associated channel object.
59 | * 60 | * Possible status values returned are: 61 | * NV_OK 62 | * NV_ERR_INVALID_PARAM_STRUCT 63 | * NV_ERR_INVALID_ARGUMENT 64 | * NV_ERR_INVALID_STATE 65 | */ 66 | #define NVA06F_CTRL_CMD_EVENT_SET_NOTIFICATION (0xa06f0205) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_EVENT_INTERFACE_ID << 8) | NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS_MESSAGE_ID" */ 67 | 68 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS_MESSAGE_ID (0x5U) 69 | 70 | typedef struct NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS { 71 | NvU32 event; 72 | NvU32 action; 73 | } NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS; 74 | 75 | /* valid action values */ 76 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE (0x00000000) 77 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_SINGLE (0x00000001) 78 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT (0x00000002) 79 | 80 | /* 81 | * NVA06F_CTRL_CMD_EVENT_SET_TRIGGER 82 | * 83 | * This command triggers a software event for the associated channel. 84 | * This command accepts no parameters. 85 | * 86 | * Possible status values returned are: 87 | * NV_OK 88 | */ 89 | #define NVA06F_CTRL_CMD_EVENT_SET_TRIGGER (0xa06f0206) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_EVENT_INTERFACE_ID << 8) | 0x6" */ 90 | 91 | 92 | /* _ctrla06fevent_h_ */ 93 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrla06finternal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2007-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 28 | // Source file: ctrl/ctrla06f/ctrla06finternal.finn 29 | // 30 | 31 | #include "ctrla06fbase.h" 32 | #include "ctrla06fgpfifo.h" 33 | 34 | /* 35 | * NVA06F_CTRL_CMD_INTERNAL_STOP_CHANNEL 36 | * 37 | * This command is an internal command sent from Kernel RM to Physical RM 38 | * to stop the channel in hardware 39 | * 40 | * Please see description of NVA06F_CTRL_CMD_STOP_CHANNEL for more information. 
41 | * 42 | */ 43 | #define NVA06F_CTRL_CMD_INTERNAL_STOP_CHANNEL (0xa06f0301) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_INTERNAL_INTERFACE_ID << 8) | NVA06F_CTRL_INTERNAL_STOP_CHANNEL_PARAMS_MESSAGE_ID" */ 44 | 45 | #define NVA06F_CTRL_INTERNAL_STOP_CHANNEL_PARAMS_MESSAGE_ID (0x1U) 46 | 47 | typedef NVA06F_CTRL_STOP_CHANNEL_PARAMS NVA06F_CTRL_INTERNAL_STOP_CHANNEL_PARAMS; 48 | 49 | /* 50 | * NVA06F_CTRL_CMD_INTERNAL_GPFIFO_SCHEDULE 51 | * 52 | * This command is an internal command sent from Kernel RM to Physical RM 53 | * to schedule the channel in hardware 54 | * 55 | * Please see description of NVA06F_CTRL_CMD_GPFIFO_SCHEDULE for more information. 56 | * 57 | */ 58 | #define NVA06F_CTRL_CMD_INTERNAL_GPFIFO_SCHEDULE (0xa06f0303) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_INTERNAL_INTERFACE_ID << 8) | NVA06F_CTRL_INTERNAL_GPFIFO_SCHEDULE_PARAMS_MESSAGE_ID" */ 59 | 60 | #define NVA06F_CTRL_INTERNAL_GPFIFO_SCHEDULE_PARAMS_MESSAGE_ID (0x3U) 61 | 62 | typedef NVA06F_CTRL_GPFIFO_SCHEDULE_PARAMS NVA06F_CTRL_INTERNAL_GPFIFO_SCHEDULE_PARAMS; 63 | 64 | /* ctrla06finternal_h */ 65 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/ctrlxxxx.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2005-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | #include "nvtypes.h" 27 | 28 | // 29 | // This file was generated with FINN, an NVIDIA coding tool. 
30 | // Source file: ctrl/ctrlxxxx.finn 31 | // 32 | 33 | 34 | 35 | #include "nvtypes.h" 36 | 37 | /* definitions shared by all CTRL interfaces */ 38 | 39 | /* Basic command format: 40 | * cmd_class [31:16], 41 | * cmd_reserved [15:15], 42 | * cmd_reserved [14:14], 43 | * cmd_category [13:8], 44 | * cmd_index [7:0] 45 | */ 46 | 47 | #define NVXXXX_CTRL_CMD_CLASS 31:16 48 | 49 | #define NVXXXX_CTRL_CMD_CATEGORY 13:8 50 | #define NVXXXX_CTRL_CMD_INDEX 7:0 51 | 52 | /* don't use DRF_NUM - not always available */ 53 | # define NVXXXX_CTRL_CMD(cls,cat,idx) \ 54 | (((cls) << 16) | ((0) << 15) | ((0) << 14) \ 55 | | ((cat) << 8) | ((idx) & 0xFF)) 56 | /* 57 | * NVXXXX_CTRL_CMD_NULL 58 | * 59 | * This command does nothing. 60 | * This command does not take any parameters. 61 | * This command is valid for all classes. 62 | * 63 | * Possible status values returned are: 64 | * NV_OK 65 | */ 66 | #define NVXXXX_CTRL_CMD_NULL (0x00000000) 67 | 68 | #define NVxxxx_CTRL_LEGACY_PRIVILEGED (0xC0) 69 | #define NVxxxx_CTRL_LEGACY_NON_PRIVILEGED (0x80) 70 | 71 | typedef struct NVXXXX_CTRL_XXX_INFO { 72 | NvU32 index; 73 | NvU32 data; 74 | } NVXXXX_CTRL_XXX_INFO; 75 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nv-ioctl-numbers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | 25 | #ifndef NV_IOCTL_NUMBERS_H 26 | #define NV_IOCTL_NUMBERS_H 27 | 28 | /* NOTE: using an ioctl() number > 55 will overflow! 
*/ 29 | #define NV_IOCTL_MAGIC 'F' 30 | #define NV_IOCTL_BASE 200 31 | #define NV_ESC_CARD_INFO (NV_IOCTL_BASE + 0) 32 | #define NV_ESC_REGISTER_FD (NV_IOCTL_BASE + 1) 33 | #define NV_ESC_ALLOC_OS_EVENT (NV_IOCTL_BASE + 6) 34 | #define NV_ESC_FREE_OS_EVENT (NV_IOCTL_BASE + 7) 35 | #define NV_ESC_STATUS_CODE (NV_IOCTL_BASE + 9) 36 | #define NV_ESC_CHECK_VERSION_STR (NV_IOCTL_BASE + 10) 37 | #define NV_ESC_IOCTL_XFER_CMD (NV_IOCTL_BASE + 11) 38 | #define NV_ESC_ATTACH_GPUS_TO_FD (NV_IOCTL_BASE + 12) 39 | #define NV_ESC_QUERY_DEVICE_INTR (NV_IOCTL_BASE + 13) 40 | #define NV_ESC_SYS_PARAMS (NV_IOCTL_BASE + 14) 41 | #define NV_ESC_EXPORT_TO_DMABUF_FD (NV_IOCTL_BASE + 17) 42 | #define NV_ESC_WAIT_OPEN_COMPLETE (NV_IOCTL_BASE + 18) 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nv-unix-nvos-params-wrappers.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #ifndef _NV_UNIX_NVOS_PARAMS_WRAPPERS_H_ 25 | #define _NV_UNIX_NVOS_PARAMS_WRAPPERS_H_ 26 | 27 | #include "nvos.h" 28 | 29 | /* 30 | * This is a wrapper for NVOS02_PARAMETERS with file descriptor 31 | */ 32 | 33 | typedef struct 34 | { 35 | NVOS02_PARAMETERS params; 36 | int fd; 37 | } nv_ioctl_nvos02_parameters_with_fd; 38 | 39 | /* 40 | * This is a wrapper for NVOS33_PARAMETERS with file descriptor 41 | */ 42 | typedef struct 43 | { 44 | NVOS33_PARAMETERS params; 45 | int fd; 46 | } nv_ioctl_nvos33_parameters_with_fd; 47 | 48 | #endif // _NV_UNIX_NVOS_PARAMS_WRAPPERS_H_ 49 | 50 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nvCpuUuid.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2015-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 
14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #ifndef _NV_CPU_UUID_H_ 25 | #define _NV_CPU_UUID_H_ 26 | 27 | #define NV_UUID_LEN 16 28 | 29 | typedef struct nv_uuid 30 | { 31 | NvU8 uuid[NV_UUID_LEN]; 32 | } NvUuid; 33 | 34 | #define NV_UUID_HI(pUuid) (*((NvU64*)((pUuid)->uuid + (NV_UUID_LEN >> 1)))) 35 | #define NV_UUID_LO(pUuid) (*((NvU64*)((pUuid)->uuid + 0))) 36 | 37 | typedef NvUuid NvSystemUuid; 38 | 39 | typedef NvUuid NvProcessorUuid; 40 | 41 | extern const NvProcessorUuid NV_PROCESSOR_UUID_CPU_DEFAULT; 42 | 43 | #endif // _NV_CPU_UUID_H_ 44 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nv_escape.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 1999-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 
22 | */ 23 | 24 | #ifndef NV_ESCAPE_H_INCLUDED 25 | #define NV_ESCAPE_H_INCLUDED 26 | 27 | #define NV_ESC_RM_ALLOC_MEMORY 0x27 28 | #define NV_ESC_RM_ALLOC_OBJECT 0x28 29 | #define NV_ESC_RM_FREE 0x29 30 | #define NV_ESC_RM_CONTROL 0x2A 31 | #define NV_ESC_RM_ALLOC 0x2B 32 | #define NV_ESC_RM_CONFIG_GET 0x32 33 | #define NV_ESC_RM_CONFIG_SET 0x33 34 | #define NV_ESC_RM_DUP_OBJECT 0x34 35 | #define NV_ESC_RM_SHARE 0x35 36 | #define NV_ESC_RM_CONFIG_GET_EX 0x37 37 | #define NV_ESC_RM_CONFIG_SET_EX 0x38 38 | #define NV_ESC_RM_I2C_ACCESS 0x39 39 | #define NV_ESC_RM_IDLE_CHANNELS 0x41 40 | #define NV_ESC_RM_VID_HEAP_CONTROL 0x4A 41 | #define NV_ESC_RM_ACCESS_REGISTRY 0x4D 42 | #define NV_ESC_RM_MAP_MEMORY 0x4E 43 | #define NV_ESC_RM_UNMAP_MEMORY 0x4F 44 | #define NV_ESC_RM_GET_EVENT_DATA 0x52 45 | #define NV_ESC_RM_ALLOC_CONTEXT_DMA2 0x54 46 | #define NV_ESC_RM_ADD_VBLANK_CALLBACK 0x56 47 | #define NV_ESC_RM_MAP_MEMORY_DMA 0x57 48 | #define NV_ESC_RM_UNMAP_MEMORY_DMA 0x58 49 | #define NV_ESC_RM_BIND_CONTEXT_DMA 0x59 50 | #define NV_ESC_RM_EXPORT_OBJECT_TO_FD 0x5C 51 | #define NV_ESC_RM_IMPORT_OBJECT_FROM_FD 0x5D 52 | #define NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO 0x5E 53 | #define NV_ESC_RM_LOCKLESS_DIAGNOSTIC 0x5F 54 | 55 | #endif // NV_ESCAPE_H_INCLUDED 56 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nvcfg_sdk.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | 25 | #ifndef NV_CFG_SDK_INCLUDED 26 | #define NV_CFG_SDK_INCLUDED 27 | 28 | 29 | #endif // NV_CFG_SDK_INCLUDED 30 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nvimpshared.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | /******************************************************************************\ 25 | * * 26 | * Description: * 27 | * Accommodates sharing of IMP-related structures between kernel interface * 28 | * files and core RM. * 29 | * * 30 | \******************************************************************************/ 31 | 32 | #pragma once 33 | 34 | #include "nvtypes.h" 35 | 36 | // 37 | // This file was generated with FINN, an NVIDIA coding tool. 38 | // Source file: nvimpshared.finn 39 | // 40 | 41 | 42 | 43 | 44 | // 45 | // There are only a small number of discrete dramclk frequencies available on 46 | // the system. This structure contains IMP-relevant information associated 47 | // with a specific dramclk frequency. 
48 | // 49 | typedef struct DRAM_CLK_INSTANCE { 50 | NvU32 dram_clk_freq_khz; 51 | 52 | NvU32 mchub_clk_khz; 53 | 54 | NvU32 mc_clk_khz; 55 | 56 | NvU32 max_iso_bw_kbps; 57 | 58 | // 59 | // switch_latency_ns is the maximum time required to switch the dramclk 60 | // frequency to the frequency specified in dram_clk_freq_khz. 61 | // 62 | NvU32 switch_latency_ns; 63 | } DRAM_CLK_INSTANCE; 64 | 65 | // 66 | // This table is used to collect information from other modules that is needed 67 | // for RM IMP calculations. (Used on Tegra only.) 68 | // 69 | typedef struct TEGRA_IMP_IMPORT_DATA { 70 | // 71 | // max_iso_bw_kbps stores the maximum possible ISO bandwidth available to 72 | // display, assuming display is the only active ISO client. (Note that ISO 73 | // bandwidth will typically be allocated to multiple clients, so display 74 | // will generally not have access to the maximum possible bandwidth.) 75 | // 76 | NvU32 max_iso_bw_kbps; 77 | 78 | // On Orin, each dram channel is 16 bits wide. 79 | NvU32 num_dram_channels; 80 | 81 | // 82 | // dram_clk_instance stores entries for all possible dramclk frequencies, 83 | // sorted by dramclk frequency in increasing order. 84 | // 85 | // "24" is expected to be larger than the actual number of required entries 86 | // (which is provided by a BPMP API), but it can be increased if necessary. 87 | // 88 | // num_dram_clk_entries is filled in with the actual number of distinct 89 | // dramclk entries. 90 | // 91 | NvU32 num_dram_clk_entries; 92 | DRAM_CLK_INSTANCE dram_clk_instance[24]; 93 | } TEGRA_IMP_IMPORT_DATA; 94 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nvlimits.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #pragma once 25 | 26 | // 27 | // This file was generated with FINN, an NVIDIA coding tool. 28 | // Source file: nvlimits.finn 29 | // 30 | 31 | 32 | 33 | 34 | /* 35 | * This is the maximum number of GPUs supported in a single system. 36 | */ 37 | #define NV_MAX_DEVICES 32 38 | 39 | /* 40 | * This is the maximum number of subdevices within a single device. 41 | */ 42 | #define NV_MAX_SUBDEVICES 8 43 | 44 | /* 45 | * This is the maximum length of the process name string. 46 | */ 47 | #define NV_PROC_NAME_MAX_LENGTH 100U 48 | 49 | /* 50 | * This is the maximum number of heads per GPU. 
51 | */ 52 | #define NV_MAX_HEADS 4 53 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/nvstatus.h: -------------------------------------------------------------------------------- 1 | /* 2 | * SPDX-FileCopyrightText: Copyright (c) 2014-2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 3 | * SPDX-License-Identifier: MIT 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #ifndef SDK_NVSTATUS_H 25 | #define SDK_NVSTATUS_H 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | #include "nvtypes.h" 32 | 33 | typedef NvU32 NV_STATUS; 34 | 35 | #define NV_STATUS_CODE( name, code, string ) name = (code), 36 | 37 | enum 38 | { 39 | #include "nvstatuscodes.h" 40 | }; 41 | 42 | #undef NV_STATUS_CODE 43 | 44 | /*! 
45 | * @def NV_STATUS_LEVEL_OK 46 | * @see NV_STATUS_LEVEL 47 | * @brief Success: No error or special condition 48 | */ 49 | #define NV_STATUS_LEVEL_OK 0 50 | 51 | /*! 52 | * @def NV_STATUS_LEVEL_WARN 53 | * @see NV_STATUS_LEVEL 54 | * @brief Success, but there is a special condition 55 | * 56 | * @details In general, NV_STATUS_LEVEL_WARN status codes are handled the 57 | * same as NV_STATUS_LEVEL_OK, but are useful to indicate that 58 | * there is a condition that may be specially handled. 59 | * 60 | * Therefore, in most cases, client function should test for 61 | * status <= NV_STATUS_LEVEL_WARN or status > NV_STATUS_LEVEL_WARN 62 | * to determine success v. failure of a call. 63 | */ 64 | #define NV_STATUS_LEVEL_WARN 1 65 | 66 | /*! 67 | * @def NV_STATUS_LEVEL_ERR 68 | * @see NV_STATUS_LEVEL 69 | * @brief Unrecoverable error condition 70 | */ 71 | #define NV_STATUS_LEVEL_ERR 3 72 | 73 | /*! 74 | * @def NV_STATUS_LEVEL 75 | * @see NV_STATUS_LEVEL_OK 76 | * @see NV_STATUS_LEVEL_WARN 77 | * @see NV_STATUS_LEVEL_ERR 78 | * @brief Level of the status code 79 | * 80 | * @warning IMPORTANT: When comparing NV_STATUS_LEVEL(_S) against one of 81 | * these constants, it is important to use '<=' or '>' (rather 82 | * than '<' or '>='). 83 | * 84 | * For example, do: 85 | * if (NV_STATUS_LEVEL(status) <= NV_STATUS_LEVEL_WARN) 86 | * rather than: 87 | * if (NV_STATUS_LEVEL(status) < NV_STATUS_LEVEL_ERR) 88 | * 89 | * By being consistent in this manner, it is easier to systematically 90 | * add additional level constants. New levels are likely to lower 91 | * (rather than raise) the severity of _ERR codes. For example, 92 | * if we were to add NV_STATUS_LEVEL_RETRY to indicate hardware 93 | * failures that may be recoverable (e.g. RM_ERR_TIMEOUT_RETRY 94 | * or RM_ERR_BUSY_RETRY), it would be less severe than 95 | * NV_STATUS_LEVEL_ERR, the level to which these status codes now 96 | * belong. Using '<=' and '>' ensures your code is not broken in 97 | * cases like this. 
98 | */ 99 | #define NV_STATUS_LEVEL(_S) \ 100 | ((_S) == NV_OK? NV_STATUS_LEVEL_OK: \ 101 | ((_S) != NV_ERR_GENERIC && (_S) & 0x00010000? NV_STATUS_LEVEL_WARN: \ 102 | NV_STATUS_LEVEL_ERR)) 103 | 104 | /*! 105 | * @def NV_STATUS_LEVEL_CHAR 106 | * @see NV_STATUS_LEVEL_OK 107 | * @see NV_STATUS_LEVEL_WARN 108 | * @see NV_STATUS_LEVEL_ERR 109 | * @brief Character representing status code level 110 | */ 111 | #define NV_STATUS_LEVEL_CHAR(_S) \ 112 | ((_S) == NV_OK? '0': \ 113 | ((_S) != NV_ERR_GENERIC && (_S) & 0x00010000? 'W': \ 114 | 'E')) 115 | 116 | // Function definitions 117 | const char *nvstatusToString(NV_STATUS nvStatusIn); 118 | 119 | #ifdef __cplusplus 120 | } 121 | #endif 122 | 123 | #endif /* SDK_NVSTATUS_H */ 124 | -------------------------------------------------------------------------------- /driverapi/internal/nvidia/uvm_linux_ioctl.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | Copyright (c) 2013 NVidia Corporation 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to 6 | deal in the Software without restriction, including without limitation the 7 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | sell copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 17 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | *******************************************************************************/ 22 | 23 | #ifndef _UVM_LINUX_IOCTL_H 24 | #define _UVM_LINUX_IOCTL_H 25 | 26 | #include "uvm_ioctl.h" 27 | 28 | // This ioctl must be the first operation performed on the UVM file descriptor 29 | // after opening it. Until this ioctl is made, the UVM file descriptor is 30 | // inoperable: all other ioctls will return NV_ERR_ILLEGAL_ACTION and mmap will 31 | // return EBADFD. 32 | #define UVM_INITIALIZE 0x30000001 33 | 34 | typedef struct 35 | { 36 | NvU64 flags NV_ALIGN_BYTES(8); // IN 37 | NV_STATUS rmStatus; // OUT 38 | } UVM_INITIALIZE_PARAMS; 39 | 40 | #define UVM_DEINITIALIZE 0x30000002 41 | 42 | #endif // _UVM_LINUX_IOCTL_H 43 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/compile_memcpy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # NOTE: THIS FILE IS NOT PART OF CI 4 | # Why? 
Because it would require having nvcc & ptxas installed, which we don't want to assume 5 | 6 | # Define the list of compute capabilities and corresponding architecture 7 | declare -A compute_capabilities=( 8 | [5.0]="sm_50" 9 | [5.2]="sm_52" 10 | [5.3]="sm_53" 11 | [6.0]="sm_60" 12 | [6.1]="sm_61" 13 | [6.2]="sm_62" 14 | [7.0]="sm_70" 15 | [7.2]="sm_72" 16 | [7.5]="sm_75" 17 | [8.0]="sm_80" 18 | [8.6]="sm_86" 19 | [8.7]="sm_87" 20 | [8.9]="sm_89" 21 | [9.0]="sm_90" 22 | ) 23 | 24 | # Define the CUDA source file and the output directory 25 | source_file="memcpy.cu" 26 | output_dir="output" 27 | 28 | # Create the output directory if it does not exist 29 | mkdir -p "$output_dir" 30 | 31 | # Loop through each compute capability and run nvcc and ptxas 32 | for capability in "${!compute_capabilities[@]}"; do 33 | arch="${compute_capabilities[$capability]}" 34 | ptx_file="$output_dir/memcpy_${arch}.ptx" 35 | ptxas_file="$output_dir/memcpy_${arch}.cubin" 36 | 37 | # Run nvcc to generate the PTX file 38 | nvcc -ptx -arch="$arch" "$source_file" -o "$ptx_file" 39 | 40 | # Run ptxas to compile the PTX file to SASS 41 | ptxas -arch="$arch" "$ptx_file" -o "$ptxas_file" 42 | 43 | echo "Processed compute capability $capability ($arch)" 44 | done 45 | 46 | echo "Processing complete." 
47 | 48 | python3 generate_header.py -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/generate_header.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import binascii 4 | 5 | def generate_header_from_cubin(cubin_dir, header_file): 6 | # Open the header file for writing 7 | with open(header_file, 'w') as header: 8 | # Write header guards 9 | header.write('#pragma once\n\n#include \n\n') 10 | 11 | # Iterate over all .cubin files in the directory 12 | for file_name in os.listdir(cubin_dir): 13 | if file_name.endswith('.cubin'): 14 | # Determine the array name from the file name 15 | array_name = file_name.replace('.cubin', '').replace('-', '_').replace(' ', '_').upper() 16 | cubin_path = os.path.join(cubin_dir, file_name) 17 | 18 | # Read the contents of the .cubin file 19 | with open(cubin_path, 'rb') as cubin_file: 20 | cubin_data = cubin_file.read() 21 | 22 | # Convert binary data to hex and format as uint8_t array 23 | hex_data = binascii.hexlify(cubin_data).decode('ascii') 24 | hex_data_lines = [hex_data[i:i+64] for i in range(0, len(hex_data), 64)] # Split into 64-char lines 25 | 26 | # Write array declaration to header file 27 | header.write(f'const uint8_t {array_name}[] = {{\n') 28 | 29 | for line in hex_data_lines: 30 | header.write(' ' + ', '.join(f'0x{line[i:i+2]}' for i in range(0, len(line), 2)) + ',\n') 31 | 32 | header.write('};\n\n') 33 | 34 | if __name__ == '__main__': 35 | generate_header_from_cubin("output", "memcopy_kernels.h") 36 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/memcpy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define MEMCPY_BLOCK_SIZE 256UL 4 | 5 | extern "C" __global__ void memcpyKernelHighBW(uint32_t *dst, const uint32_t *src) { 6 | size_t idx = ((MEMCPY_BLOCK_SIZE * (blockIdx.y * 
gridDim.x + blockIdx.x)) << 2) + threadIdx.x; 7 | 8 | #pragma unroll 9 | for (int i = 0; i < 4; i++) { 10 | dst[idx] = src[idx]; 11 | idx += MEMCPY_BLOCK_SIZE; 12 | } 13 | } 14 | 15 | extern "C" __global__ void memcpyKernelLowLatency(uint32_t *dst, const uint32_t *src, size_t n) { 16 | size_t tid = MEMCPY_BLOCK_SIZE * blockIdx.x + threadIdx.x; 17 | if (tid < n) { 18 | dst[tid] = src[tid]; 19 | } 20 | } 21 | 22 | extern "C" __global__ void memcpyKernelTrailing(uint8_t *dst, const uint8_t *src, size_t n) { 23 | size_t tid = MEMCPY_BLOCK_SIZE * blockIdx.x + threadIdx.x; 24 | if (tid < n) { 25 | dst[tid] = src[tid]; 26 | } 27 | } -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_50.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_50.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_50.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_50 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | 
mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, 
[memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_52.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_52.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_52.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_52 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 
36 | shl.b64 %rd8, %rd7, 2; 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 
%rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_53.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_53.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_53.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_53 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 
[%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, 
%rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_60.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_60.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_60.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_60 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | 
st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, 
%rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_61.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_61.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_61.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_61 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl 
memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | 
-------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_62.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_62.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_62.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_62 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 
memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- 
/driverapi/kernels/memcpy/output/memcpy_sm_70.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_70.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_70.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_70 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param
.u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 | ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_72.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_72.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_72.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_72 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_75.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_75.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_75.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_75 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_80.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_80.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_80.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_86.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_86.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_86.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_86 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_87.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_87.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_87.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_87 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_89.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_89.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_89.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_89 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_90.cubin: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_90.cubin -------------------------------------------------------------------------------- /driverapi/kernels/memcpy/output/memcpy_sm_90.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_90 11 | .address_size 64 12 | 13 | // .globl memcpyKernelHighBW 14 | /* memcpyKernelHighBW: param_0 is the store (destination) address and param_1 the load (source) address. Each thread copies four 32-bit words; no bounds check is performed. */ 15 | .visible .entry memcpyKernelHighBW( 16 | .param .u64 memcpyKernelHighBW_param_0, 17 | .param .u64 memcpyKernelHighBW_param_1 18 | ) 19 | { 20 | .reg .b32 %r<10>; 21 | .reg .b64 %rd<11>; 22 | 23 | 24 | ld.param.u64 %rd1, [memcpyKernelHighBW_param_0]; 25 | ld.param.u64 %rd2, [memcpyKernelHighBW_param_1]; 26 | cvta.to.global.u64 %rd3, %rd1; 27 | cvta.to.global.u64 %rd4, %rd2; 28 | mov.u32 %r1, %ctaid.y; 29 | mov.u32 %r2, %nctaid.x; 30 | mov.u32 %r3, %ctaid.x; 31 | mad.lo.s32 %r4, %r1, %r2, %r3; /* %r4 = %ctaid.y * %nctaid.x + %ctaid.x */ 32 | mul.wide.u32 %rd5, %r4, 1024; 33 | mov.u32 %r5, %tid.x; 34 | cvt.u64.u32 %rd6, %r5; 35 | add.s64 %rd7, %rd5, %rd6; 36 | shl.b64 %rd8, %rd7, 2; /* word index -> byte offset (x4) */ 37 | add.s64 %rd9, %rd4, %rd8; 38 | ld.global.u32 %r6, [%rd9]; 39 | add.s64 %rd10, %rd3, %rd8; 40 | st.global.u32 [%rd10], %r6; 41 | ld.global.u32 %r7, [%rd9+1024]; /* the other three words are 1024 bytes (256 words) apart */ 42 | st.global.u32 [%rd10+1024], %r7; 43 | ld.global.u32 %r8, [%rd9+2048]; 44 | st.global.u32 [%rd10+2048], %r8; 45 | ld.global.u32 %r9, [%rd9+3072]; 46 | st.global.u32 [%rd10+3072], %r9; 47 | ret; 48 | 49 | } 50 | // .globl memcpyKernelLowLatency 51 | .visible .entry memcpyKernelLowLatency( 52 | .param .u64 memcpyKernelLowLatency_param_0, 53 | .param .u64 memcpyKernelLowLatency_param_1, 54 | .param .u64 memcpyKernelLowLatency_param_2 55 | ) 56 | { 57 | .reg .pred %p<2>; 58 | .reg .b32 %r<4>; 59 | .reg .b64 %rd<12>; 60 | /* memcpyKernelLowLatency: param_0 = destination, param_1 = source, param_2 = number of 32-bit words. The thread with index %ctaid.x * 256 + %tid.x copies one word if that index is below param_2. */ 61 | 62 |
ld.param.u64 %rd2, [memcpyKernelLowLatency_param_0]; 63 | ld.param.u64 %rd3, [memcpyKernelLowLatency_param_1]; 64 | ld.param.u64 %rd4, [memcpyKernelLowLatency_param_2]; 65 | mov.u32 %r1, %ctaid.x; 66 | mul.wide.u32 %rd5, %r1, 256; 67 | mov.u32 %r2, %tid.x; 68 | cvt.u64.u32 %rd6, %r2; 69 | add.s64 %rd1, %rd5, %rd6; 70 | setp.ge.u64 %p1, %rd1, %rd4; 71 | @%p1 bra $L__BB1_2; 72 | 73 | cvta.to.global.u64 %rd7, %rd3; 74 | shl.b64 %rd8, %rd1, 2; 75 | add.s64 %rd9, %rd7, %rd8; 76 | ld.global.u32 %r3, [%rd9]; 77 | cvta.to.global.u64 %rd10, %rd2; 78 | add.s64 %rd11, %rd10, %rd8; 79 | st.global.u32 [%rd11], %r3; 80 | 81 | $L__BB1_2: 82 | ret; 83 | 84 | } 85 | // .globl memcpyKernelTrailing 86 | .visible .entry memcpyKernelTrailing( 87 | .param .u64 memcpyKernelTrailing_param_0, 88 | .param .u64 memcpyKernelTrailing_param_1, 89 | .param .u64 memcpyKernelTrailing_param_2 90 | ) 91 | { 92 | .reg .pred %p<2>; 93 | .reg .b16 %rs<2>; 94 | .reg .b32 %r<3>; 95 | .reg .b64 %rd<11>; 96 | /* memcpyKernelTrailing: param_0 = destination, param_1 = source, param_2 = byte count. The thread with index %ctaid.x * 256 + %tid.x copies one byte if that index is below param_2. */ 97 | 98 | ld.param.u64 %rd2, [memcpyKernelTrailing_param_0]; 99 | ld.param.u64 %rd3, [memcpyKernelTrailing_param_1]; 100 | ld.param.u64 %rd4, [memcpyKernelTrailing_param_2]; 101 | mov.u32 %r1, %ctaid.x; 102 | mul.wide.u32 %rd5, %r1, 256; 103 | mov.u32 %r2, %tid.x; 104 | cvt.u64.u32 %rd6, %r2; 105 | add.s64 %rd1, %rd5, %rd6; 106 | setp.ge.u64 %p1, %rd1, %rd4; 107 | @%p1 bra $L__BB2_2; 108 | 109 | cvta.to.global.u64 %rd7, %rd3; 110 | add.s64 %rd8, %rd7, %rd1; 111 | ld.global.u8 %rs1, [%rd8]; 112 | cvta.to.global.u64 %rd9, %rd2; 113 | add.s64 %rd10, %rd9, %rd1; 114 | st.global.u8 [%rd10], %rs1; 115 | 116 | $L__BB2_2: 117 | ret; 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /driverapi/src/librecuda_status.cpp: -------------------------------------------------------------------------------- 1 | #include "librecuda_status.h" 2 | #include "librecuda_status_internal.h" 3 | 4 | #include <unordered_map> 5 | #include <string> 6 | /* First expansion of the status list: defines one libreCudaStatus_t variable per declared status. */ 7 | #define LIBRECUDA_DECLARE_STATUS(status,
code) libreCudaStatus_t status = code; 8 | 9 | #include "librecuda_all_statuses.h" 10 | 11 | #undef LIBRECUDA_DECLARE_STATUS 12 | 13 | /* Lookup table from numeric status code to the status identifier; filled by internalLibreCuInitStatusNames(). NOTE(review): key type int is taken from the find(int) call below - confirm against libreCudaStatus_t. */ 14 | std::unordered_map<int, std::string> status_to_name = {}; 15 | static bool initialized = false; 16 | /* Populates status_to_name by re-including the status list with a macro that stringifies each status identifier, then marks the table as initialized. */ 17 | void internalLibreCuInitStatusNames() { 18 | 19 | #define LIBRECUDA_DECLARE_STATUS(status, code) status_to_name[code] = #status; 20 | 21 | #include "librecuda_all_statuses.h" 22 | 23 | #undef LIBRECUDA_DECLARE_STATUS 24 | 25 | initialized = true; 26 | } 27 | /* Returns the name stored for code, or nullptr when the code is not in the table. The returned pointer refers to the string held by the table entry. */ 28 | const char *internalLibreCuGetStatusName(int code) { 29 | auto it = status_to_name.find(code); 30 | if (it == status_to_name.end()) { 31 | return nullptr; 32 | } 33 | return it->second.c_str(); 34 | } 35 | /* Reports whether internalLibreCuInitStatusNames() has completed. */ 36 | bool internalLibreCuInitStatusNamesInitialized() { 37 | return initialized; 38 | } -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(write_float) 2 | add_subdirectory(memcopy) 3 | add_subdirectory(dynamic_shared_mem) 4 | add_subdirectory(compute_chronological_consistency) 5 | add_subdirectory(async_kernels) 6 | add_subdirectory(dma_chronological_consistency) 7 | add_subdirectory(kernel_struct_param) 8 | add_subdirectory(indexing) 9 | add_subdirectory(stream_events) 10 | add_subdirectory(many_kernels_launch) -------------------------------------------------------------------------------- /tests/async_kernels/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_async_kernels 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_async_kernels 7 | PRIVATE 8 | driverapi 9 | ) 10 | 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/async_kernels COPYONLY) -------------------------------------------------------------------------------- /tests/async_kernels/write_float.cu:
-------------------------------------------------------------------------------- 1 | extern "C" __global__ void write_float(float *dst, float *input) { 2 | double x = 0; 3 | int n = 100000000; 4 | for (int i = 0; i < n; i++) { 5 | x += 1.0; 6 | } 7 | x /= n; 8 | *dst = (float) x + (*input); 9 | } -------------------------------------------------------------------------------- /tests/async_kernels/write_float.cu.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/async_kernels/write_float.cu.asm -------------------------------------------------------------------------------- /tests/async_kernels/write_float.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/async_kernels/write_float.cubin -------------------------------------------------------------------------------- /tests/async_kernels/write_float.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl write_float 14 | 15 | .visible .entry write_float( 16 | .param .u64 write_float_param_0, 17 | .param .u64 write_float_param_1 18 | ) 19 | { 20 | .reg .pred %p<2>; 21 | .reg .f32 %f<4>; 22 | .reg .b32 %r<5>; 23 | .reg .f64 %fd<69>; 24 | .reg .b64 %rd<5>; 25 | 26 | 27 | ld.param.u64 %rd3, [write_float_param_0]; 28 | ld.param.u64 %rd2, [write_float_param_1]; 29 | cvta.to.global.u64 %rd1, %rd3; 30 | mov.f64 %fd68, 0d0000000000000000; 31 | mov.u32 %r4, 0; 32 | 33 | $L__BB0_1: 34 | add.f64 %fd4, %fd68, 0d3FF0000000000000; 35 | add.f64 %fd5, %fd4, 0d3FF0000000000000; 36 
| add.f64 %fd6, %fd5, 0d3FF0000000000000; 37 | add.f64 %fd7, %fd6, 0d3FF0000000000000; 38 | add.f64 %fd8, %fd7, 0d3FF0000000000000; 39 | add.f64 %fd9, %fd8, 0d3FF0000000000000; 40 | add.f64 %fd10, %fd9, 0d3FF0000000000000; 41 | add.f64 %fd11, %fd10, 0d3FF0000000000000; 42 | add.f64 %fd12, %fd11, 0d3FF0000000000000; 43 | add.f64 %fd13, %fd12, 0d3FF0000000000000; 44 | add.f64 %fd14, %fd13, 0d3FF0000000000000; 45 | add.f64 %fd15, %fd14, 0d3FF0000000000000; 46 | add.f64 %fd16, %fd15, 0d3FF0000000000000; 47 | add.f64 %fd17, %fd16, 0d3FF0000000000000; 48 | add.f64 %fd18, %fd17, 0d3FF0000000000000; 49 | add.f64 %fd19, %fd18, 0d3FF0000000000000; 50 | add.f64 %fd20, %fd19, 0d3FF0000000000000; 51 | add.f64 %fd21, %fd20, 0d3FF0000000000000; 52 | add.f64 %fd22, %fd21, 0d3FF0000000000000; 53 | add.f64 %fd23, %fd22, 0d3FF0000000000000; 54 | add.f64 %fd24, %fd23, 0d3FF0000000000000; 55 | add.f64 %fd25, %fd24, 0d3FF0000000000000; 56 | add.f64 %fd26, %fd25, 0d3FF0000000000000; 57 | add.f64 %fd27, %fd26, 0d3FF0000000000000; 58 | add.f64 %fd28, %fd27, 0d3FF0000000000000; 59 | add.f64 %fd29, %fd28, 0d3FF0000000000000; 60 | add.f64 %fd30, %fd29, 0d3FF0000000000000; 61 | add.f64 %fd31, %fd30, 0d3FF0000000000000; 62 | add.f64 %fd32, %fd31, 0d3FF0000000000000; 63 | add.f64 %fd33, %fd32, 0d3FF0000000000000; 64 | add.f64 %fd34, %fd33, 0d3FF0000000000000; 65 | add.f64 %fd35, %fd34, 0d3FF0000000000000; 66 | add.f64 %fd36, %fd35, 0d3FF0000000000000; 67 | add.f64 %fd37, %fd36, 0d3FF0000000000000; 68 | add.f64 %fd38, %fd37, 0d3FF0000000000000; 69 | add.f64 %fd39, %fd38, 0d3FF0000000000000; 70 | add.f64 %fd40, %fd39, 0d3FF0000000000000; 71 | add.f64 %fd41, %fd40, 0d3FF0000000000000; 72 | add.f64 %fd42, %fd41, 0d3FF0000000000000; 73 | add.f64 %fd43, %fd42, 0d3FF0000000000000; 74 | add.f64 %fd44, %fd43, 0d3FF0000000000000; 75 | add.f64 %fd45, %fd44, 0d3FF0000000000000; 76 | add.f64 %fd46, %fd45, 0d3FF0000000000000; 77 | add.f64 %fd47, %fd46, 0d3FF0000000000000; 78 | add.f64 %fd48, %fd47, 
0d3FF0000000000000; 79 | add.f64 %fd49, %fd48, 0d3FF0000000000000; 80 | add.f64 %fd50, %fd49, 0d3FF0000000000000; 81 | add.f64 %fd51, %fd50, 0d3FF0000000000000; 82 | add.f64 %fd52, %fd51, 0d3FF0000000000000; 83 | add.f64 %fd53, %fd52, 0d3FF0000000000000; 84 | add.f64 %fd54, %fd53, 0d3FF0000000000000; 85 | add.f64 %fd55, %fd54, 0d3FF0000000000000; 86 | add.f64 %fd56, %fd55, 0d3FF0000000000000; 87 | add.f64 %fd57, %fd56, 0d3FF0000000000000; 88 | add.f64 %fd58, %fd57, 0d3FF0000000000000; 89 | add.f64 %fd59, %fd58, 0d3FF0000000000000; 90 | add.f64 %fd60, %fd59, 0d3FF0000000000000; 91 | add.f64 %fd61, %fd60, 0d3FF0000000000000; 92 | add.f64 %fd62, %fd61, 0d3FF0000000000000; 93 | add.f64 %fd63, %fd62, 0d3FF0000000000000; 94 | add.f64 %fd64, %fd63, 0d3FF0000000000000; 95 | add.f64 %fd65, %fd64, 0d3FF0000000000000; 96 | add.f64 %fd66, %fd65, 0d3FF0000000000000; 97 | add.f64 %fd68, %fd66, 0d3FF0000000000000; 98 | add.s32 %r4, %r4, 64; 99 | setp.ne.s32 %p1, %r4, 100000000; 100 | @%p1 bra $L__BB0_1; 101 | 102 | cvta.to.global.u64 %rd4, %rd2; 103 | div.rn.f64 %fd67, %fd68, 0d4197D78400000000; 104 | cvt.rn.f32.f64 %f1, %fd67; 105 | ld.global.f32 %f2, [%rd4]; 106 | add.f32 %f3, %f2, %f1; 107 | st.global.f32 [%rd1], %f3; 108 | ret; 109 | 110 | } 111 | 112 | -------------------------------------------------------------------------------- /tests/compile_cubin.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | compile_cubin() { 3 | if [ -z "$1" ]; then 4 | echo "Usage: ./compile_cubin.sh . 
(e.g ./compile_cubin.sh write_float/write_float)" 5 | return 1 6 | fi 7 | 8 | filename="$1" 9 | 10 | nvcc -ptx -std=c++11 -arch=sm_80 "${filename}.cu" -o "${filename}.ptx" 11 | ptxas -arch=sm_80 "${filename}.ptx" -o "${filename}.cubin" 12 | nvdisasm "${filename}.cubin" > "${filename}.asm" 13 | 14 | echo "Successfully compiled and disassembled ${filename}.cu" 15 | } 16 | 17 | compile_cubin "$1" 18 | -------------------------------------------------------------------------------- /tests/complex/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | message("Todo: add complex test.") -------------------------------------------------------------------------------- /tests/compute_chronological_consistency/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_chronological_consistency 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_chronological_consistency 7 | PRIVATE 8 | driverapi 9 | ) 10 | 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/compute_chronological_consistency COPYONLY) -------------------------------------------------------------------------------- /tests/compute_chronological_consistency/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 9 | if (error != LIBRECUDA_SUCCESS) { 10 | const char *error_string; 11 | libreCuGetErrorString(error, &error_string); 12 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 13 | exit(EXIT_FAILURE); 14 | } 15 | }; 16 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 17 | 18 | int main() { 19 | CUDA_CHECK(libreCuInit(0)); 20 | 21 | int device_count{}; 22 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 23 | std::cout << "Device 
count: " + std::to_string(device_count) << std::endl; 24 | 25 | LibreCUdevice device{}; 26 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 27 | 28 | LibreCUcontext ctx{}; 29 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 30 | 31 | char name_buffer[256] = {}; 32 | libreCuDeviceGetName(name_buffer, 256, device); 33 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 34 | 35 | LibreCUmodule module{}; 36 | 37 | // read cubin file 38 | uint8_t *image; 39 | size_t n_bytes; 40 | { 41 | std::ifstream input("write_float.cubin", std::ios::binary); 42 | std::vector bytes( 43 | (std::istreambuf_iterator(input)), 44 | (std::istreambuf_iterator())); 45 | input.close(); 46 | image = new uint8_t[bytes.size()]; 47 | memcpy(image, bytes.data(), bytes.size()); 48 | n_bytes = bytes.size(); 49 | } 50 | CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); 51 | 52 | // read functions 53 | uint32_t num_funcs{}; 54 | CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); 55 | std::cout << "Num functions: " << num_funcs << std::endl; 56 | 57 | auto *functions = new LibreCUFunction[num_funcs]; 58 | CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); 59 | 60 | for (size_t i = 0; i < num_funcs; i++) { 61 | LibreCUFunction func = functions[i]; 62 | const char *func_name{}; 63 | CUDA_CHECK(libreCuFuncGetName(&func_name, func)); 64 | std::cout << " function \"" << func_name << "\"" << std::endl; 65 | } 66 | 67 | delete[] functions; 68 | 69 | // find function 70 | LibreCUFunction func{}; 71 | CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float")); 72 | 73 | // create stream 74 | LibreCUstream stream{}; 75 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 76 | 77 | void *float_dst_compute_va{}; 78 | void *float_dst_dma_va{}; 79 | CUDA_CHECK(libreCuMemAlloc(&float_dst_compute_va, sizeof(float), true)); 80 | CUDA_CHECK(libreCuMemAlloc(&float_dst_dma_va, sizeof(float), true)); 81 | *(float *) float_dst_compute_va = 
0.0f; 82 | *(float *) float_dst_dma_va = 0.0f; 83 | 84 | { 85 | void *params[] = { 86 | &float_dst_compute_va, &float_dst_dma_va 87 | }; 88 | CUDA_CHECK( 89 | libreCuLaunchKernel(func, 90 | 1, 1, 1, 91 | 1, 1, 1, 92 | 0, 93 | stream, 94 | params, sizeof(params) / sizeof(void *), 95 | nullptr 96 | ) 97 | ); 98 | } 99 | CUDA_CHECK(libreCuMemCpy(float_dst_dma_va, float_dst_compute_va, sizeof(float), stream)); 100 | { 101 | void *params[] = { 102 | &float_dst_compute_va, &float_dst_dma_va 103 | }; 104 | CUDA_CHECK( 105 | libreCuLaunchKernel(func, 106 | 1, 1, 1, 107 | 1, 1, 1, 108 | 0, 109 | stream, 110 | params, sizeof(params) / sizeof(void *), 111 | nullptr 112 | ) 113 | ); 114 | } 115 | CUDA_CHECK(libreCuStreamCommence(stream)); 116 | CUDA_CHECK(libreCuStreamAwait(stream)); 117 | 118 | std::cout << "Dst compute value (post exec): " << *(float *) (float_dst_compute_va) << std::endl; 119 | std::cout << "Dst dma value (post exec): " << *(float *) (float_dst_dma_va) << std::endl; 120 | 121 | // free memory 122 | CUDA_CHECK(libreCuMemFree(float_dst_compute_va)); 123 | CUDA_CHECK(libreCuMemFree(float_dst_dma_va)); 124 | 125 | // destroy stream 126 | CUDA_CHECK(libreCuStreamDestroy(stream)); 127 | 128 | // unload module 129 | CUDA_CHECK(libreCuModuleUnload(module)); 130 | 131 | // destroy ctx 132 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 133 | return 0; 134 | } -------------------------------------------------------------------------------- /tests/compute_chronological_consistency/write_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void write_float(float *dst, float *input) { 2 | double x = 0; 3 | int n = 100000000; 4 | for (int i = 0; i < n; i++) { 5 | x += 1.0; 6 | } 7 | x /= n; 8 | *dst = (float) x + (*input); 9 | } -------------------------------------------------------------------------------- /tests/compute_chronological_consistency/write_float.cu.asm: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/compute_chronological_consistency/write_float.cu.asm -------------------------------------------------------------------------------- /tests/compute_chronological_consistency/write_float.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/compute_chronological_consistency/write_float.cubin -------------------------------------------------------------------------------- /tests/compute_chronological_consistency/write_float.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl write_float 14 | 15 | .visible .entry write_float( 16 | .param .u64 write_float_param_0, 17 | .param .u64 write_float_param_1 18 | ) 19 | { 20 | .reg .pred %p<2>; 21 | .reg .f32 %f<4>; 22 | .reg .b32 %r<5>; 23 | .reg .f64 %fd<69>; 24 | .reg .b64 %rd<5>; 25 | 26 | 27 | ld.param.u64 %rd3, [write_float_param_0]; 28 | ld.param.u64 %rd2, [write_float_param_1]; 29 | cvta.to.global.u64 %rd1, %rd3; 30 | mov.f64 %fd68, 0d0000000000000000; 31 | mov.u32 %r4, 0; 32 | 33 | $L__BB0_1: 34 | add.f64 %fd4, %fd68, 0d3FF0000000000000; 35 | add.f64 %fd5, %fd4, 0d3FF0000000000000; 36 | add.f64 %fd6, %fd5, 0d3FF0000000000000; 37 | add.f64 %fd7, %fd6, 0d3FF0000000000000; 38 | add.f64 %fd8, %fd7, 0d3FF0000000000000; 39 | add.f64 %fd9, %fd8, 0d3FF0000000000000; 40 | add.f64 %fd10, %fd9, 0d3FF0000000000000; 41 | add.f64 %fd11, %fd10, 0d3FF0000000000000; 42 | add.f64 %fd12, %fd11, 0d3FF0000000000000; 43 | add.f64 %fd13, 
%fd12, 0d3FF0000000000000; 44 | add.f64 %fd14, %fd13, 0d3FF0000000000000; 45 | add.f64 %fd15, %fd14, 0d3FF0000000000000; 46 | add.f64 %fd16, %fd15, 0d3FF0000000000000; 47 | add.f64 %fd17, %fd16, 0d3FF0000000000000; 48 | add.f64 %fd18, %fd17, 0d3FF0000000000000; 49 | add.f64 %fd19, %fd18, 0d3FF0000000000000; 50 | add.f64 %fd20, %fd19, 0d3FF0000000000000; 51 | add.f64 %fd21, %fd20, 0d3FF0000000000000; 52 | add.f64 %fd22, %fd21, 0d3FF0000000000000; 53 | add.f64 %fd23, %fd22, 0d3FF0000000000000; 54 | add.f64 %fd24, %fd23, 0d3FF0000000000000; 55 | add.f64 %fd25, %fd24, 0d3FF0000000000000; 56 | add.f64 %fd26, %fd25, 0d3FF0000000000000; 57 | add.f64 %fd27, %fd26, 0d3FF0000000000000; 58 | add.f64 %fd28, %fd27, 0d3FF0000000000000; 59 | add.f64 %fd29, %fd28, 0d3FF0000000000000; 60 | add.f64 %fd30, %fd29, 0d3FF0000000000000; 61 | add.f64 %fd31, %fd30, 0d3FF0000000000000; 62 | add.f64 %fd32, %fd31, 0d3FF0000000000000; 63 | add.f64 %fd33, %fd32, 0d3FF0000000000000; 64 | add.f64 %fd34, %fd33, 0d3FF0000000000000; 65 | add.f64 %fd35, %fd34, 0d3FF0000000000000; 66 | add.f64 %fd36, %fd35, 0d3FF0000000000000; 67 | add.f64 %fd37, %fd36, 0d3FF0000000000000; 68 | add.f64 %fd38, %fd37, 0d3FF0000000000000; 69 | add.f64 %fd39, %fd38, 0d3FF0000000000000; 70 | add.f64 %fd40, %fd39, 0d3FF0000000000000; 71 | add.f64 %fd41, %fd40, 0d3FF0000000000000; 72 | add.f64 %fd42, %fd41, 0d3FF0000000000000; 73 | add.f64 %fd43, %fd42, 0d3FF0000000000000; 74 | add.f64 %fd44, %fd43, 0d3FF0000000000000; 75 | add.f64 %fd45, %fd44, 0d3FF0000000000000; 76 | add.f64 %fd46, %fd45, 0d3FF0000000000000; 77 | add.f64 %fd47, %fd46, 0d3FF0000000000000; 78 | add.f64 %fd48, %fd47, 0d3FF0000000000000; 79 | add.f64 %fd49, %fd48, 0d3FF0000000000000; 80 | add.f64 %fd50, %fd49, 0d3FF0000000000000; 81 | add.f64 %fd51, %fd50, 0d3FF0000000000000; 82 | add.f64 %fd52, %fd51, 0d3FF0000000000000; 83 | add.f64 %fd53, %fd52, 0d3FF0000000000000; 84 | add.f64 %fd54, %fd53, 0d3FF0000000000000; 85 | add.f64 %fd55, %fd54, 
0d3FF0000000000000; 86 | add.f64 %fd56, %fd55, 0d3FF0000000000000; 87 | add.f64 %fd57, %fd56, 0d3FF0000000000000; 88 | add.f64 %fd58, %fd57, 0d3FF0000000000000; 89 | add.f64 %fd59, %fd58, 0d3FF0000000000000; 90 | add.f64 %fd60, %fd59, 0d3FF0000000000000; 91 | add.f64 %fd61, %fd60, 0d3FF0000000000000; 92 | add.f64 %fd62, %fd61, 0d3FF0000000000000; 93 | add.f64 %fd63, %fd62, 0d3FF0000000000000; 94 | add.f64 %fd64, %fd63, 0d3FF0000000000000; 95 | add.f64 %fd65, %fd64, 0d3FF0000000000000; 96 | add.f64 %fd66, %fd65, 0d3FF0000000000000; 97 | add.f64 %fd68, %fd66, 0d3FF0000000000000; 98 | add.s32 %r4, %r4, 64; 99 | setp.ne.s32 %p1, %r4, 100000000; 100 | @%p1 bra $L__BB0_1; 101 | 102 | cvta.to.global.u64 %rd4, %rd2; 103 | div.rn.f64 %fd67, %fd68, 0d4197D78400000000; 104 | cvt.rn.f32.f64 %f1, %fd67; 105 | ld.global.f32 %f2, [%rd4]; 106 | add.f32 %f3, %f2, %f1; 107 | st.global.f32 [%rd1], %f3; 108 | ret; 109 | 110 | } 111 | 112 | -------------------------------------------------------------------------------- /tests/dma_chronological_consistency/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_dma_chronological_consistency 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_dma_chronological_consistency 7 | PRIVATE 8 | driverapi 9 | ) -------------------------------------------------------------------------------- /tests/dma_chronological_consistency/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 6 | if (error != LIBRECUDA_SUCCESS) { 7 | const char *error_string; 8 | libreCuGetErrorString(error, &error_string); 9 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 10 | exit(EXIT_FAILURE); 11 | } 12 | }; 13 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 14 | 15 | int main() { 16 | 
CUDA_CHECK(libreCuInit(0)); 17 | 18 | int device_count{}; 19 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 20 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 21 | 22 | LibreCUdevice device{}; 23 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 24 | 25 | LibreCUcontext ctx{}; 26 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 27 | 28 | // create stream 29 | LibreCUstream stream{}; 30 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 31 | 32 | // declare host array 33 | uint8_t host_array[1024 * 1024 + 128 + 3]{}; // size chosen to require all 3 memcpy hierarchy kernels to be launched 34 | for (size_t i = 0; i < sizeof(host_array); i++) { 35 | host_array[i] = i % 256; 36 | } 37 | 38 | // declare host array 39 | uint8_t dst_host_array[sizeof(host_array)] = {}; 40 | 41 | // allocate memory 42 | uint8_t *device_array_1{}; 43 | uint8_t *device_array_2{}; 44 | uint8_t *device_array_3{}; 45 | uint8_t *device_array_4{}; 46 | uint8_t *device_array_5{}; 47 | uint8_t *device_array_6{}; 48 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_1), sizeof(host_array))); 49 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_2), sizeof(host_array))); 50 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_3), sizeof(host_array))); 51 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_4), sizeof(host_array))); 52 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_5), sizeof(host_array))); 53 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_6), sizeof(host_array))); 54 | 55 | // copy to gpu 56 | CUDA_CHECK(libreCuMemCpy(device_array_1, host_array, sizeof(host_array), stream)); 57 | 58 | // copy d2d 59 | CUDA_CHECK(libreCuMemCpy(device_array_2, device_array_1, sizeof(host_array), stream)); 60 | CUDA_CHECK(libreCuMemCpy(device_array_3, device_array_2, sizeof(host_array), stream)); 61 | CUDA_CHECK(libreCuMemCpy(device_array_4, device_array_3, sizeof(host_array), stream)); 62 | 
CUDA_CHECK(libreCuMemCpy(device_array_5, device_array_4, sizeof(host_array), stream)); 63 | CUDA_CHECK(libreCuMemCpy(device_array_6, device_array_5, sizeof(host_array), stream)); 64 | 65 | // copy back to host 66 | CUDA_CHECK(libreCuMemCpy(dst_host_array, device_array_6, sizeof(host_array), stream)); 67 | 68 | // commence stream 69 | CUDA_CHECK(libreCuStreamCommence(stream)); 70 | CUDA_CHECK(libreCuStreamAwait(stream)); 71 | 72 | // print device array 73 | bool is_equal = true; 74 | size_t i; 75 | for (i = 0; i < sizeof(host_array); i++) { 76 | if (host_array[i] != dst_host_array[i]) { 77 | is_equal = false; 78 | break; 79 | } 80 | } 81 | if (!is_equal) { 82 | std::cerr << "Mismatch at " + std::to_string(i) + ": memcpy screwed something up!" << std::endl; 83 | } else { 84 | std::cout << "Memory is equal!" << std::endl; 85 | } 86 | 87 | // destroy stream 88 | CUDA_CHECK(libreCuStreamDestroy(stream)); 89 | 90 | // free memory 91 | CUDA_CHECK(libreCuMemFree(device_array_1)); 92 | CUDA_CHECK(libreCuMemFree(device_array_2)); 93 | 94 | // destroy ctx 95 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 96 | return 0; 97 | } -------------------------------------------------------------------------------- /tests/dynamic_shared_mem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_dynamic_shared_mem 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_dynamic_shared_mem 7 | PRIVATE 8 | driverapi 9 | ) 10 | 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/dynamic_shared_mem COPYONLY) -------------------------------------------------------------------------------- /tests/dynamic_shared_mem/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 10 | if (error != 
LIBRECUDA_SUCCESS) { 11 | const char *error_string; 12 | libreCuGetErrorString(error, &error_string); 13 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 14 | exit(EXIT_FAILURE); 15 | } 16 | }; 17 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 18 | 19 | int main() { 20 | CUDA_CHECK(libreCuInit(0)); 21 | 22 | int device_count{}; 23 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 24 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 25 | 26 | LibreCUdevice device{}; 27 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 28 | 29 | LibreCUcontext ctx{}; 30 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 31 | 32 | char name_buffer[256] = {}; 33 | libreCuDeviceGetName(name_buffer, 256, device); 34 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 35 | 36 | int maxSharedMemoryPerBlock{}; 37 | CUDA_CHECK(libreCuDeviceGetAttribute(&maxSharedMemoryPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device)); 38 | 39 | int maxSharedMemoryPerBlockOptIn{}; 40 | CUDA_CHECK(libreCuDeviceGetAttribute(&maxSharedMemoryPerBlockOptIn, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device)); 41 | 42 | std::cout << "Maximum shared memory per block: " << maxSharedMemoryPerBlock << " bytes" << std::endl; 43 | std::cout << "Maximum shared memory per block (opt-in): " << maxSharedMemoryPerBlockOptIn << " bytes" << std::endl; 44 | 45 | LibreCUmodule module{}; 46 | 47 | // read cubin file 48 | uint8_t *image; 49 | size_t n_bytes; 50 | { 51 | std::ifstream input("write_float.cubin", std::ios::binary); 52 | std::vector bytes( 53 | (std::istreambuf_iterator(input)), 54 | (std::istreambuf_iterator())); 55 | input.close(); 56 | image = new uint8_t[bytes.size()]; 57 | memcpy(image, bytes.data(), bytes.size()); 58 | n_bytes = bytes.size(); 59 | } 60 | CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); 61 | 62 | // read functions 63 | uint32_t num_funcs{}; 64 | 
CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); 65 | std::cout << "Num functions: " << num_funcs << std::endl; 66 | 67 | auto *functions = new LibreCUFunction[num_funcs]; 68 | CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); 69 | 70 | for (size_t i = 0; i < num_funcs; i++) { 71 | LibreCUFunction func = functions[i]; 72 | const char *func_name{}; 73 | CUDA_CHECK(libreCuFuncGetName(&func_name, func)); 74 | std::cout << " function \"" << func_name << "\"" << std::endl; 75 | } 76 | 77 | delete[] functions; 78 | 79 | // find function 80 | LibreCUFunction func{}; 81 | CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float_sum")); 82 | 83 | // set dynamic shared memory 84 | CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192)); 85 | 86 | // create stream 87 | LibreCUstream stream{}; 88 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 89 | 90 | void *float_dst_va{}; 91 | CUDA_CHECK(libreCuMemAlloc(&float_dst_va, sizeof(float), true)); 92 | 93 | float float_value = 0.31415f; 94 | short short_value = 314; 95 | 96 | std::cout << std::fixed; 97 | std::cout << std::setprecision(5); 98 | 99 | std::cout << "A value: " << short_value << std::endl; 100 | std::cout << "B value: " << float_value << std::endl; 101 | std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl; 102 | 103 | void *params[] = { 104 | &float_dst_va, // dst 105 | &short_value, // a 106 | &float_value // b 107 | }; 108 | CUDA_CHECK( 109 | libreCuLaunchKernel(func, 110 | 1, 1, 1, 111 | 1, 1, 1, 112 | 8192, 113 | stream, 114 | params, sizeof(params) / sizeof(void *), 115 | nullptr 116 | ) 117 | ); 118 | 119 | // dispatch built up command buffer to GPU 120 | CUDA_CHECK(libreCuStreamCommence(stream)); 121 | 122 | // wait for work to complete 123 | CUDA_CHECK(libreCuStreamAwait(stream)); 124 | std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl; 125 | 126 | // free memory 127 
| CUDA_CHECK(libreCuMemFree(float_dst_va)); 128 | 129 | // destroy stream 130 | CUDA_CHECK(libreCuStreamDestroy(stream)); 131 | 132 | // unload module 133 | CUDA_CHECK(libreCuModuleUnload(module)); 134 | 135 | // destroy ctx 136 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 137 | return 0; 138 | } -------------------------------------------------------------------------------- /tests/dynamic_shared_mem/write_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void write_float_sum(float *dst, short a, float b) { 2 | extern __shared__ float sharedData[]; 3 | 4 | sharedData[1024] = (float) a; 5 | sharedData[1025] = (float) b; 6 | 7 | *dst = (sharedData[1024] + sharedData[1025]); 8 | } -------------------------------------------------------------------------------- /tests/dynamic_shared_mem/write_float.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/dynamic_shared_mem/write_float.cubin -------------------------------------------------------------------------------- /tests/dynamic_shared_mem/write_float.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl write_float_sum 14 | .extern .shared .align 16 .b8 sharedData[]; 15 | 16 | .visible .entry write_float_sum( 17 | .param .u64 write_float_sum_param_0, 18 | .param .u16 write_float_sum_param_1, 19 | .param .f32 write_float_sum_param_2 20 | ) 21 | { 22 | .reg .b16 %rs<2>; 23 | .reg .f32 %f<4>; 24 | .reg .b64 %rd<3>; 25 | 26 | 27 | ld.param.u64 %rd1, [write_float_sum_param_0]; 28 | ld.param.u16 %rs1, [write_float_sum_param_1]; 29 | 
cvta.to.global.u64 %rd2, %rd1; 30 | cvt.rn.f32.s16 %f1, %rs1; 31 | ld.param.f32 %f2, [write_float_sum_param_2]; 32 | st.shared.v2.f32 [sharedData+4096], {%f1, %f2}; 33 | add.f32 %f3, %f1, %f2; 34 | st.global.f32 [%rd2], %f3; 35 | ret; 36 | 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tests/indexing/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_indexing 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_indexing 7 | PRIVATE 8 | driverapi 9 | ) 10 | # copy the cubin into this test's build directory (tests/indexing), matching the sibling tests 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/indexing COPYONLY) -------------------------------------------------------------------------------- /tests/indexing/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 10 | if (error != LIBRECUDA_SUCCESS) { 11 | const char *error_string; 12 | libreCuGetErrorString(error, &error_string); 13 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 14 | exit(EXIT_FAILURE); 15 | } 16 | }; 17 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 18 | 19 | int main() { 20 | CUDA_CHECK(libreCuInit(0)); 21 | 22 | int device_count{}; 23 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 24 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 25 | 26 | LibreCUdevice device{}; 27 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 28 | 29 | LibreCUcontext ctx{}; 30 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 31 | 32 | char name_buffer[256] = {}; 33 | libreCuDeviceGetName(name_buffer, 256, device); 34 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 35 | 36 | LibreCUmodule module{}; 37 | 38 | // read cubin file
39 | uint8_t *image; 40 | size_t n_bytes; 41 | { 42 | std::ifstream input("write_float.cubin", std::ios::binary); 43 | std::vector bytes( 44 | (std::istreambuf_iterator(input)), 45 | (std::istreambuf_iterator())); 46 | input.close(); 47 | image = new uint8_t[bytes.size()]; 48 | memcpy(image, bytes.data(), bytes.size()); 49 | n_bytes = bytes.size(); 50 | } 51 | CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); 52 | 53 | // read functions 54 | uint32_t num_funcs{}; 55 | CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); 56 | std::cout << "Num functions: " << num_funcs << std::endl; 57 | 58 | auto *functions = new LibreCUFunction[num_funcs]; 59 | CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); 60 | 61 | for (size_t i = 0; i < num_funcs; i++) { 62 | LibreCUFunction func = functions[i]; 63 | const char *func_name{}; 64 | CUDA_CHECK(libreCuFuncGetName(&func_name, func)); 65 | std::cout << " function \"" << func_name << "\"" << std::endl; 66 | } 67 | 68 | delete[] functions; 69 | 70 | // find function 71 | LibreCUFunction func{}; 72 | CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float")); 73 | 74 | // set dynamic shared memory 75 | CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192)); 76 | 77 | // create stream 78 | LibreCUstream stream{}; 79 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 80 | 81 | void *float_dst_va{}; 82 | size_t n_elements = 50256 * 768; 83 | CUDA_CHECK(libreCuMemAlloc(&float_dst_va, n_elements * sizeof(float), true)); 84 | 85 | auto *host_dst = new float[n_elements]; 86 | 87 | void *params[] = { 88 | &float_dst_va, // dst 89 | &n_elements 90 | }; 91 | CUDA_CHECK( 92 | libreCuLaunchKernel(func, 93 | n_elements/256, 1, 1, 94 | 256, 1, 1, 95 | 8192, 96 | stream, 97 | params, sizeof(params) / sizeof(void *), 98 | nullptr 99 | ) 100 | ); 101 | CUDA_CHECK(libreCuMemCpy(host_dst, float_dst_va, n_elements * sizeof(float), stream, false)); 102 | 103 | // 
dispatch built up command buffer to GPU 104 | CUDA_CHECK(libreCuStreamCommence(stream)); 105 | 106 | // wait for work to complete 107 | CUDA_CHECK(libreCuStreamAwait(stream)); 108 | 109 | for (size_t i = 0; i < n_elements; i++) { 110 | if (host_dst[i] != 1.0) { 111 | std::cerr << "Not all values were filled!" << std::endl; 112 | break; 113 | } 114 | } 115 | 116 | // free memory 117 | CUDA_CHECK(libreCuMemFree(float_dst_va)); 118 | 119 | delete[] host_dst; 120 | 121 | // destroy stream 122 | CUDA_CHECK(libreCuStreamDestroy(stream)); 123 | 124 | // unload module 125 | CUDA_CHECK(libreCuModuleUnload(module)); 126 | 127 | // destroy ctx 128 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 129 | return 0; 130 | } -------------------------------------------------------------------------------- /tests/indexing/write_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void write_float(float *dst, size_t n) { 2 | size_t tid = blockDim.x * blockIdx.x + threadIdx.x; 3 | dst[tid] = 1.0f; 4 | } -------------------------------------------------------------------------------- /tests/indexing/write_float.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/indexing/write_float.cubin -------------------------------------------------------------------------------- /tests/indexing/write_float.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl write_float 14 | 15 | .visible .entry write_float( 16 | .param .u64 write_float_param_0, 17 | .param .u64 write_float_param_1 18 | ) 19 | { 20 | .reg .b32 %r<6>; 21 | 
.reg .b64 %rd<5>; 22 | 23 | 24 | ld.param.u64 %rd1, [write_float_param_0]; 25 | cvta.to.global.u64 %rd2, %rd1; 26 | mov.u32 %r1, %ntid.x; 27 | mov.u32 %r2, %ctaid.x; 28 | mov.u32 %r3, %tid.x; 29 | mad.lo.s32 %r4, %r1, %r2, %r3; 30 | mul.wide.u32 %rd3, %r4, 4; 31 | add.s64 %rd4, %rd2, %rd3; 32 | mov.u32 %r5, 1065353216; 33 | st.global.u32 [%rd4], %r5; 34 | ret; 35 | 36 | } 37 | 38 | -------------------------------------------------------------------------------- /tests/kernel_struct_param/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_kernel_struct_param 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_kernel_struct_param 7 | PRIVATE 8 | driverapi 9 | ) 10 | 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/read_from_struct.cubin" ${CMAKE_BINARY_DIR}/tests/kernel_struct_param COPYONLY) -------------------------------------------------------------------------------- /tests/kernel_struct_param/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 10 | if (error != LIBRECUDA_SUCCESS) { 11 | const char *error_string; 12 | libreCuGetErrorString(error, &error_string); 13 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 14 | exit(EXIT_FAILURE); 15 | } 16 | }; 17 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 18 | 19 | struct struct_t { 20 | int x, y, z; 21 | int w, h, d; 22 | char str[32]; 23 | char me_ugly; 24 | }; 25 | static_assert(sizeof(struct_t) == 60); 26 | 27 | int main() { 28 | CUDA_CHECK(libreCuInit(0)); 29 | 30 | int device_count{}; 31 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 32 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 33 | 34 | LibreCUdevice device{}; 35 | CUDA_CHECK(libreCuDeviceGet(&device, 
0)); 36 | 37 | LibreCUcontext ctx{}; 38 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 39 | 40 | char name_buffer[256] = {}; 41 | libreCuDeviceGetName(name_buffer, 256, device); 42 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 43 | LibreCUmodule module{}; 44 | 45 | // read cubin file 46 | uint8_t *image; 47 | size_t n_bytes; 48 | { 49 | std::ifstream input("read_from_struct.cubin", std::ios::binary); 50 | std::vector bytes( 51 | (std::istreambuf_iterator(input)), 52 | (std::istreambuf_iterator())); 53 | input.close(); 54 | image = new uint8_t[bytes.size()]; 55 | memcpy(image, bytes.data(), bytes.size()); 56 | n_bytes = bytes.size(); 57 | } 58 | CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); 59 | 60 | // read functions 61 | uint32_t num_funcs{}; 62 | CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); 63 | std::cout << "Num functions: " << num_funcs << std::endl; 64 | 65 | auto *functions = new LibreCUFunction[num_funcs]; 66 | CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); 67 | 68 | for (size_t i = 0; i < num_funcs; i++) { 69 | LibreCUFunction func = functions[i]; 70 | const char *func_name{}; 71 | CUDA_CHECK(libreCuFuncGetName(&func_name, func)); 72 | std::cout << " function \"" << func_name << "\"" << std::endl; 73 | } 74 | 75 | delete[] functions; 76 | 77 | // find function 78 | LibreCUFunction func{}; 79 | CUDA_CHECK(libreCuModuleGetFunction(&func, module, "read_from_struct")); 80 | // create stream 81 | LibreCUstream stream{}; 82 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 83 | 84 | void *w_dst_va{}; 85 | CUDA_CHECK(libreCuMemAlloc(&w_dst_va, sizeof(int), true)); 86 | 87 | struct_t s = { 88 | .w=64, 89 | }; 90 | 91 | void *params[] = { 92 | &s, // struct 93 | &w_dst_va, // dst 94 | }; 95 | 96 | CUDA_CHECK( 97 | libreCuLaunchKernel(func, 98 | 1, 1, 1, 99 | 1, 1, 1, 100 | 8192, 101 | stream, 102 | params, sizeof(params) / sizeof(void *), 103 | nullptr 104 | ) 
105 | ); 106 | 107 | // dispatch built up command buffer to GPU 108 | CUDA_CHECK(libreCuStreamCommence(stream)); 109 | 110 | // wait for work to complete 111 | CUDA_CHECK(libreCuStreamAwait(stream)); 112 | std::cout << "Dst value (post exec): " << *(int *) (w_dst_va) << std::endl; 113 | 114 | // free memory 115 | CUDA_CHECK(libreCuMemFree(w_dst_va)); 116 | 117 | // destroy stream 118 | CUDA_CHECK(libreCuStreamDestroy(stream)); 119 | 120 | // unload module 121 | CUDA_CHECK(libreCuModuleUnload(module)); 122 | 123 | // destroy ctx 124 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 125 | return 0; 126 | } -------------------------------------------------------------------------------- /tests/kernel_struct_param/read_from_struct.cu: -------------------------------------------------------------------------------- 1 | struct struct_t { 2 | int x, y, z; 3 | int w, h, d; 4 | char str[32]; // must match the host-side struct_t in main.cpp (str[32], sizeof == 60) 5 | char me_ugly; 6 | }; 7 | // NOTE(review): the checked-in PTX declares a 56-byte parameter; regenerate the cubin/PTX after changing this struct 8 | extern "C" __global__ void read_from_struct(struct_t s, int *pWout) { 9 | *pWout = s.w; 10 | } -------------------------------------------------------------------------------- /tests/kernel_struct_param/read_from_struct.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/kernel_struct_param/read_from_struct.cubin -------------------------------------------------------------------------------- /tests/kernel_struct_param/read_from_struct.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl read_from_struct 14 | 15 | .visible .entry read_from_struct( 16 | .param .align 4 .b8 read_from_struct_param_0[56], 17 | .param .u64 read_from_struct_param_1 
18 | ) 19 | { 20 | .reg .b32 %r<2>; 21 | .reg .b64 %rd<3>; 22 | 23 | 24 | ld.param.u64 %rd1, [read_from_struct_param_1]; 25 | ld.param.u32 %r1, [read_from_struct_param_0+12]; 26 | cvta.to.global.u64 %rd2, %rd1; 27 | st.global.u32 [%rd2], %r1; 28 | ret; 29 | 30 | } 31 | 32 | -------------------------------------------------------------------------------- /tests/many_kernels_launch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | many_kernels_launch 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | many_kernels_launch 7 | PRIVATE 8 | driverapi 9 | ) 10 | 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/empty_kernel.cubin" ${CMAKE_BINARY_DIR}/tests/many_kernels_launch COPYONLY) -------------------------------------------------------------------------------- /tests/many_kernels_launch/empty_kernel.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void emtpy_kernel() { 2 | } -------------------------------------------------------------------------------- /tests/many_kernels_launch/empty_kernel.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/many_kernels_launch/empty_kernel.cubin -------------------------------------------------------------------------------- /tests/many_kernels_launch/empty_kernel.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl emtpy_kernel 14 | 15 | .visible .entry emtpy_kernel() 16 | { 17 | 18 | 19 | 20 | ret; 21 | 22 | } 23 | 24 | 
-------------------------------------------------------------------------------- /tests/many_kernels_launch/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 8 | if (error != LIBRECUDA_SUCCESS) { 9 | const char *error_string; 10 | libreCuGetErrorString(error, &error_string); 11 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 12 | exit(EXIT_FAILURE); 13 | } 14 | }; 15 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 16 | 17 | 18 | int main() { 19 | CUDA_CHECK(libreCuInit(0)); 20 | 21 | int device_count{}; 22 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 23 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 24 | 25 | LibreCUdevice device{}; 26 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 27 | 28 | LibreCUcontext ctx{}; 29 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 30 | 31 | char name_buffer[256] = {}; 32 | libreCuDeviceGetName(name_buffer, 256, device); 33 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 34 | 35 | 36 | // read cubin file 37 | uint8_t *image; 38 | size_t n_bytes; { 39 | std::ifstream input("empty_kernel.cubin", std::ios::binary); 40 | std::vector bytes( 41 | (std::istreambuf_iterator(input)), 42 | (std::istreambuf_iterator())); 43 | input.close(); 44 | image = new uint8_t[bytes.size()]; 45 | std::memcpy(image, bytes.data(), bytes.size()); 46 | n_bytes = bytes.size(); 47 | } 48 | 49 | size_t num_kernels = 1025; 50 | LibreCUmodule modules[num_kernels]; 51 | for (int i = 0; i < num_kernels; i++) { 52 | CUDA_CHECK(libreCuModuleLoadData(modules + i, image, n_bytes)); 53 | } 54 | 55 | // find functions 56 | LibreCUFunction funcs[num_kernels]; 57 | for (int i = 0; i < num_kernels; i++) { 58 | CUDA_CHECK(libreCuModuleGetFunction(funcs + i, modules[i], 
"emtpy_kernel")); 59 | } 60 | 61 | // create stream 62 | LibreCUstream stream{}; 63 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 64 | 65 | void *params[] = {}; 66 | 67 | auto start = std::chrono::high_resolution_clock::now(); 68 | for (int i = 0; i < num_kernels; ++i) { 69 | CUDA_CHECK(libreCuLaunchKernel(funcs[i], 70 | 1, 1, 1, 71 | 1, 1, 1, 72 | 0, 73 | stream, 74 | params, sizeof(params) / sizeof(void *), 75 | nullptr 76 | )); 77 | } 78 | // dispatch built up command buffer to GPU 79 | CUDA_CHECK(libreCuStreamCommence(stream)); 80 | auto end = std::chrono::high_resolution_clock::now(); 81 | 82 | // wait for work to complete 83 | CUDA_CHECK(libreCuStreamAwait(stream)); 84 | 85 | // Calculate the duration in seconds as a double 86 | std::chrono::duration elapsed = end - start; 87 | double elapsedSeconds = elapsed.count(); 88 | 89 | // Print the elapsed time 90 | std::cout << "Average time: " << elapsedSeconds / num_kernels << ", Total: " << elapsedSeconds; 91 | 92 | // destroy stream 93 | CUDA_CHECK(libreCuStreamDestroy(stream)); 94 | 95 | // unload module 96 | for (int i = 0; i < num_kernels; ++i) { 97 | CUDA_CHECK(libreCuModuleUnload(modules[i])); 98 | } 99 | 100 | // destroy ctx 101 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 102 | return 0; 103 | } 104 | -------------------------------------------------------------------------------- /tests/memcopy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_memcopy 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_memcopy 7 | PRIVATE 8 | driverapi 9 | ) -------------------------------------------------------------------------------- /tests/memcopy/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 6 | if (error != LIBRECUDA_SUCCESS) { 7 | const char *error_string; 8 | 
libreCuGetErrorString(error, &error_string); 9 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 10 | exit(EXIT_FAILURE); 11 | } 12 | }; 13 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 14 | 15 | int main() { 16 | CUDA_CHECK(libreCuInit(0)); 17 | 18 | int device_count{}; 19 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 20 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 21 | 22 | LibreCUdevice device{}; 23 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 24 | 25 | LibreCUcontext ctx{}; 26 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 27 | 28 | // create stream 29 | LibreCUstream stream{}; 30 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 31 | 32 | // declare host array 33 | float host_array[] = { 34 | 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f 35 | }; 36 | 37 | // declare host array 38 | float dst_host_array[10] = {}; 39 | 40 | // allocate memory 41 | float *device_array_1{}; 42 | float *device_array_2{}; 43 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_1), sizeof(host_array))); 44 | CUDA_CHECK(libreCuMemAlloc(reinterpret_cast(&device_array_2), sizeof(host_array))); 45 | 46 | // copy to gpu 47 | CUDA_CHECK(libreCuMemCpy(device_array_1, host_array, sizeof(host_array), stream)); 48 | 49 | // copy d2d 50 | CUDA_CHECK(libreCuMemCpy(device_array_2, device_array_1, sizeof(host_array), stream)); 51 | 52 | // copy back to host 53 | CUDA_CHECK(libreCuMemCpy(dst_host_array, device_array_2, sizeof(host_array), stream)); 54 | 55 | // commence stream 56 | CUDA_CHECK(libreCuStreamCommence(stream)); 57 | CUDA_CHECK(libreCuStreamAwait(stream)); 58 | 59 | // print device array 60 | for (int i = 0; i < 10; i++) { 61 | std::cout << dst_host_array[i] << ", "; 62 | } 63 | 64 | // destroy stream 65 | CUDA_CHECK(libreCuStreamDestroy(stream)); 66 | 67 | // free memory 68 | CUDA_CHECK(libreCuMemFree(device_array_1)); 69 | CUDA_CHECK(libreCuMemFree(device_array_2)); 70 | 
71 | // destroy ctx 72 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 73 | return 0; 74 | } -------------------------------------------------------------------------------- /tests/stream_events/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_stream_events 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_stream_events 7 | PRIVATE 8 | driverapi 9 | ) 10 | # copy the cubin into this test's build directory (tests/stream_events), matching the other tests; main.cpp opens it by relative path 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/stream_events COPYONLY) -------------------------------------------------------------------------------- /tests/stream_events/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 9 | if (error != LIBRECUDA_SUCCESS) { 10 | const char *error_string; 11 | libreCuGetErrorString(error, &error_string); 12 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 13 | exit(EXIT_FAILURE); 14 | } 15 | }; 16 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 17 | 18 | int main() { 19 | CUDA_CHECK(libreCuInit(0)); 20 | 21 | int device_count{}; 22 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 23 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 24 | 25 | LibreCUdevice device{}; 26 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 27 | 28 | LibreCUcontext ctx{}; 29 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 30 | 31 | char name_buffer[256] = {}; 32 | libreCuDeviceGetName(name_buffer, 256, device); 33 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 34 | 35 | LibreCUmodule module{}; 36 | 37 | // read cubin file 38 | uint8_t *image; 39 | size_t n_bytes; 40 | { 41 | std::ifstream input("write_float.cubin", std::ios::binary); 42 | std::vector bytes( 43 | 
(std::istreambuf_iterator(input)), 44 | (std::istreambuf_iterator())); 45 | input.close(); 46 | image = new uint8_t[bytes.size()]; 47 | memcpy(image, bytes.data(), bytes.size()); 48 | n_bytes = bytes.size(); 49 | } 50 | CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); 51 | 52 | // read functions 53 | uint32_t num_funcs{}; 54 | CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); 55 | std::cout << "Num functions: " << num_funcs << std::endl; 56 | 57 | auto *functions = new LibreCUFunction[num_funcs]; 58 | CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); 59 | 60 | for (size_t i = 0; i < num_funcs; i++) { 61 | LibreCUFunction func = functions[i]; 62 | const char *func_name{}; 63 | CUDA_CHECK(libreCuFuncGetName(&func_name, func)); 64 | std::cout << " function \"" << func_name << "\"" << std::endl; 65 | } 66 | 67 | delete[] functions; 68 | 69 | // find function 70 | LibreCUFunction func{}; 71 | CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float")); 72 | 73 | // create stream 74 | LibreCUstream stream{}; 75 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 76 | 77 | void *float_dst_va{}; 78 | void *float_src_va{}; 79 | CUDA_CHECK(libreCuMemAlloc(&float_dst_va, sizeof(float), true)); 80 | CUDA_CHECK(libreCuMemAlloc(&float_src_va, sizeof(float), true)); 81 | *(float *) float_dst_va = 0.0f; 82 | *(float *) float_src_va = 1.0f; 83 | 84 | LibreCUEvent start{}, end{}; 85 | CUDA_CHECK(libreCuEventCreate(&start, 0)); 86 | CUDA_CHECK(libreCuEventCreate(&end, 0)); 87 | 88 | CUDA_CHECK(libreCuEventRecord(start, stream)); 89 | { 90 | void *params[] = { 91 | &float_dst_va, &float_src_va 92 | }; 93 | CUDA_CHECK( 94 | libreCuLaunchKernel(func, 95 | 1, 1, 1, 96 | 1, 1, 1, 97 | 0, 98 | stream, 99 | params, sizeof(params) / sizeof(void *), 100 | nullptr 101 | ) 102 | ); 103 | } 104 | CUDA_CHECK(libreCuEventRecord(end, stream)); 105 | CUDA_CHECK(libreCuStreamCommence(stream)); 106 | CUDA_CHECK(libreCuEventSynchronize(end)); 107 | 108 
| float elapsed{}; 109 | CUDA_CHECK(libreCuEventElapsedTime(&elapsed, start, end)); 110 | std::cout << "Elapsed: " << elapsed << "ms" << std::endl; 111 | 112 | CUDA_CHECK(libreCuStreamAwait(stream)); 113 | 114 | std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl; 115 | 116 | // free memory 117 | CUDA_CHECK(libreCuMemFree(float_dst_va)); 118 | CUDA_CHECK(libreCuMemFree(float_src_va)); 119 | 120 | // destroy stream 121 | CUDA_CHECK(libreCuStreamDestroy(stream)); 122 | 123 | // unload module 124 | CUDA_CHECK(libreCuModuleUnload(module)); 125 | 126 | // destroy ctx 127 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 128 | return 0; 129 | } -------------------------------------------------------------------------------- /tests/stream_events/write_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void write_float(float *dst, float *input) { 2 | double x = 0; 3 | int n = 100000000; 4 | for (int i = 0; i < n; i++) { 5 | x += 1.0; 6 | } 7 | x /= n; 8 | *dst = (float) x + (*input); 9 | } -------------------------------------------------------------------------------- /tests/stream_events/write_float.cu.asm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/stream_events/write_float.cu.asm -------------------------------------------------------------------------------- /tests/stream_events/write_float.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/stream_events/write_float.cubin -------------------------------------------------------------------------------- /tests/stream_events/write_float.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | 
// 4 | // Compiler Build ID: CL-34097967 5 | // Cuda compilation tools, release 12.4, V12.4.131 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl write_float 14 | 15 | .visible .entry write_float( 16 | .param .u64 write_float_param_0, 17 | .param .u64 write_float_param_1 18 | ) 19 | { 20 | .reg .pred %p<2>; 21 | .reg .f32 %f<4>; 22 | .reg .b32 %r<5>; 23 | .reg .f64 %fd<69>; 24 | .reg .b64 %rd<5>; 25 | 26 | 27 | ld.param.u64 %rd3, [write_float_param_0]; 28 | ld.param.u64 %rd2, [write_float_param_1]; 29 | cvta.to.global.u64 %rd1, %rd3; 30 | mov.f64 %fd68, 0d0000000000000000; 31 | mov.u32 %r4, 0; 32 | 33 | $L__BB0_1: 34 | add.f64 %fd4, %fd68, 0d3FF0000000000000; 35 | add.f64 %fd5, %fd4, 0d3FF0000000000000; 36 | add.f64 %fd6, %fd5, 0d3FF0000000000000; 37 | add.f64 %fd7, %fd6, 0d3FF0000000000000; 38 | add.f64 %fd8, %fd7, 0d3FF0000000000000; 39 | add.f64 %fd9, %fd8, 0d3FF0000000000000; 40 | add.f64 %fd10, %fd9, 0d3FF0000000000000; 41 | add.f64 %fd11, %fd10, 0d3FF0000000000000; 42 | add.f64 %fd12, %fd11, 0d3FF0000000000000; 43 | add.f64 %fd13, %fd12, 0d3FF0000000000000; 44 | add.f64 %fd14, %fd13, 0d3FF0000000000000; 45 | add.f64 %fd15, %fd14, 0d3FF0000000000000; 46 | add.f64 %fd16, %fd15, 0d3FF0000000000000; 47 | add.f64 %fd17, %fd16, 0d3FF0000000000000; 48 | add.f64 %fd18, %fd17, 0d3FF0000000000000; 49 | add.f64 %fd19, %fd18, 0d3FF0000000000000; 50 | add.f64 %fd20, %fd19, 0d3FF0000000000000; 51 | add.f64 %fd21, %fd20, 0d3FF0000000000000; 52 | add.f64 %fd22, %fd21, 0d3FF0000000000000; 53 | add.f64 %fd23, %fd22, 0d3FF0000000000000; 54 | add.f64 %fd24, %fd23, 0d3FF0000000000000; 55 | add.f64 %fd25, %fd24, 0d3FF0000000000000; 56 | add.f64 %fd26, %fd25, 0d3FF0000000000000; 57 | add.f64 %fd27, %fd26, 0d3FF0000000000000; 58 | add.f64 %fd28, %fd27, 0d3FF0000000000000; 59 | add.f64 %fd29, %fd28, 0d3FF0000000000000; 60 | add.f64 %fd30, %fd29, 0d3FF0000000000000; 61 | add.f64 %fd31, %fd30, 0d3FF0000000000000; 62 | 
add.f64 %fd32, %fd31, 0d3FF0000000000000; 63 | add.f64 %fd33, %fd32, 0d3FF0000000000000; 64 | add.f64 %fd34, %fd33, 0d3FF0000000000000; 65 | add.f64 %fd35, %fd34, 0d3FF0000000000000; 66 | add.f64 %fd36, %fd35, 0d3FF0000000000000; 67 | add.f64 %fd37, %fd36, 0d3FF0000000000000; 68 | add.f64 %fd38, %fd37, 0d3FF0000000000000; 69 | add.f64 %fd39, %fd38, 0d3FF0000000000000; 70 | add.f64 %fd40, %fd39, 0d3FF0000000000000; 71 | add.f64 %fd41, %fd40, 0d3FF0000000000000; 72 | add.f64 %fd42, %fd41, 0d3FF0000000000000; 73 | add.f64 %fd43, %fd42, 0d3FF0000000000000; 74 | add.f64 %fd44, %fd43, 0d3FF0000000000000; 75 | add.f64 %fd45, %fd44, 0d3FF0000000000000; 76 | add.f64 %fd46, %fd45, 0d3FF0000000000000; 77 | add.f64 %fd47, %fd46, 0d3FF0000000000000; 78 | add.f64 %fd48, %fd47, 0d3FF0000000000000; 79 | add.f64 %fd49, %fd48, 0d3FF0000000000000; 80 | add.f64 %fd50, %fd49, 0d3FF0000000000000; 81 | add.f64 %fd51, %fd50, 0d3FF0000000000000; 82 | add.f64 %fd52, %fd51, 0d3FF0000000000000; 83 | add.f64 %fd53, %fd52, 0d3FF0000000000000; 84 | add.f64 %fd54, %fd53, 0d3FF0000000000000; 85 | add.f64 %fd55, %fd54, 0d3FF0000000000000; 86 | add.f64 %fd56, %fd55, 0d3FF0000000000000; 87 | add.f64 %fd57, %fd56, 0d3FF0000000000000; 88 | add.f64 %fd58, %fd57, 0d3FF0000000000000; 89 | add.f64 %fd59, %fd58, 0d3FF0000000000000; 90 | add.f64 %fd60, %fd59, 0d3FF0000000000000; 91 | add.f64 %fd61, %fd60, 0d3FF0000000000000; 92 | add.f64 %fd62, %fd61, 0d3FF0000000000000; 93 | add.f64 %fd63, %fd62, 0d3FF0000000000000; 94 | add.f64 %fd64, %fd63, 0d3FF0000000000000; 95 | add.f64 %fd65, %fd64, 0d3FF0000000000000; 96 | add.f64 %fd66, %fd65, 0d3FF0000000000000; 97 | add.f64 %fd68, %fd66, 0d3FF0000000000000; 98 | add.s32 %r4, %r4, 64; 99 | setp.ne.s32 %p1, %r4, 100000000; 100 | @%p1 bra $L__BB0_1; 101 | 102 | cvta.to.global.u64 %rd4, %rd2; 103 | div.rn.f64 %fd67, %fd68, 0d4197D78400000000; 104 | cvt.rn.f32.f64 %f1, %fd67; 105 | ld.global.f32 %f2, [%rd4]; 106 | add.f32 %f3, %f2, %f1; 107 | st.global.f32 [%rd1], %f3; 
108 | ret; 109 | 110 | } 111 | 112 | -------------------------------------------------------------------------------- /tests/write_float/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | test_write_float 3 | main.cpp 4 | ) 5 | target_link_libraries( 6 | test_write_float 7 | PRIVATE 8 | driverapi 9 | ) 10 | 11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY) -------------------------------------------------------------------------------- /tests/write_float/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) { 11 | if (error != LIBRECUDA_SUCCESS) { 12 | const char *error_string; 13 | libreCuGetErrorString(error, &error_string); 14 | printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string); 15 | exit(EXIT_FAILURE); 16 | } 17 | }; 18 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__)) 19 | 20 | int main() { 21 | CUDA_CHECK(libreCuInit(0)); 22 | 23 | int device_count{}; 24 | CUDA_CHECK(libreCuDeviceGetCount(&device_count)); 25 | std::cout << "Device count: " + std::to_string(device_count) << std::endl; 26 | 27 | LibreCUdevice device{}; 28 | CUDA_CHECK(libreCuDeviceGet(&device, 0)); 29 | 30 | LibreCUcontext ctx{}; 31 | CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device)); 32 | 33 | char name_buffer[256] = {}; 34 | libreCuDeviceGetName(name_buffer, 256, device); 35 | std::cout << "Device Name: " + std::string(name_buffer) << std::endl; 36 | 37 | LibreCUmodule module{}; 38 | 39 | // read cubin file 40 | uint8_t *image; 41 | size_t n_bytes; 42 | { 43 | std::ifstream input("write_float.cubin", std::ios::binary); 44 | std::vector bytes( 45 | (std::istreambuf_iterator(input)), 46 | 
(std::istreambuf_iterator())); 47 | input.close(); 48 | image = new uint8_t[bytes.size()]; 49 | memcpy(image, bytes.data(), bytes.size()); 50 | n_bytes = bytes.size(); 51 | } 52 | CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes)); 53 | 54 | // read functions 55 | uint32_t num_funcs{}; 56 | CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module)); 57 | std::cout << "Num functions: " << num_funcs << std::endl; 58 | 59 | auto *functions = new LibreCUFunction[num_funcs]; 60 | CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module)); 61 | 62 | for (size_t i = 0; i < num_funcs; i++) { 63 | LibreCUFunction func = functions[i]; 64 | const char *func_name{}; 65 | CUDA_CHECK(libreCuFuncGetName(&func_name, func)); 66 | std::cout << " function \"" << func_name << "\"" << std::endl; 67 | } 68 | 69 | delete[] functions; 70 | 71 | // find function 72 | LibreCUFunction func{}; 73 | CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float_sum")); 74 | 75 | // create stream 76 | LibreCUstream stream{}; 77 | CUDA_CHECK(libreCuStreamCreate(&stream, 0)); 78 | 79 | void *float_dst_va{}; 80 | CUDA_CHECK(libreCuMemAlloc(&float_dst_va, sizeof(float), true)); 81 | 82 | float float_value = 0.31415f; 83 | short short_value = 314; 84 | 85 | std::cout << std::fixed; 86 | std::cout << std::setprecision(5); 87 | 88 | std::cout << "A value: " << short_value << std::endl; 89 | std::cout << "B value: " << float_value << std::endl; 90 | std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl; 91 | 92 | void *params[] = { 93 | &float_dst_va, // dst 94 | &short_value, // a 95 | &float_value // b 96 | }; 97 | CUDA_CHECK( 98 | libreCuLaunchKernel(func, 99 | 1, 1, 1, 100 | 1, 1, 1, 101 | 0, 102 | stream, 103 | params, sizeof(params) / sizeof(void *), 104 | nullptr 105 | ) 106 | ); 107 | 108 | // dispatch built up command buffer to GPU 109 | CUDA_CHECK(libreCuStreamCommence(stream)); 110 | 111 | // wait for work to complete 112 | 
CUDA_CHECK(libreCuStreamAwait(stream)); 113 | std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl; 114 | 115 | // free memory 116 | CUDA_CHECK(libreCuMemFree(float_dst_va)); 117 | 118 | // destroy stream 119 | CUDA_CHECK(libreCuStreamDestroy(stream)); 120 | 121 | // unload module 122 | CUDA_CHECK(libreCuModuleUnload(module)); 123 | 124 | // destroy ctx 125 | CUDA_CHECK(libreCuCtxDestroy(ctx)); 126 | return 0; 127 | } -------------------------------------------------------------------------------- /tests/write_float/write_float.cu: -------------------------------------------------------------------------------- 1 | extern "C" __global__ void write_float_ptr(float *dst, float *src) { 2 | *dst = *src; 3 | } 4 | 5 | extern "C" __global__ void write_float_value(float *dst, float value) { 6 | *dst = value; 7 | } 8 | 9 | __device__ int global_int = 0x69; 10 | 11 | extern "C" __global__ void write_float_sum(float *dst, short a, float b) { 12 | *dst = (a + b + global_int); 13 | } -------------------------------------------------------------------------------- /tests/write_float/write_float.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/write_float/write_float.cubin -------------------------------------------------------------------------------- /tests/write_float/write_float.ptx: -------------------------------------------------------------------------------- 1 | // 2 | // Generated by NVIDIA NVVM Compiler 3 | // 4 | // Compiler Build ID: CL-33961263 5 | // Cuda compilation tools, release 12.4, V12.4.99 6 | // Based on NVVM 7.0.1 7 | // 8 | 9 | .version 8.4 10 | .target sm_80 11 | .address_size 64 12 | 13 | // .globl write_float_ptr 14 | .global .align 4 .u32 global_int = 105; 15 | 16 | .visible .entry write_float_ptr( 17 | .param .u64 write_float_ptr_param_0, 18 | .param .u64 write_float_ptr_param_1 19 | ) 
20 | { 21 | .reg .f32 %f<2>; 22 | .reg .b64 %rd<5>; 23 | 24 | 25 | ld.param.u64 %rd1, [write_float_ptr_param_0]; 26 | ld.param.u64 %rd2, [write_float_ptr_param_1]; 27 | cvta.to.global.u64 %rd3, %rd1; 28 | cvta.to.global.u64 %rd4, %rd2; 29 | ld.global.f32 %f1, [%rd4]; 30 | st.global.f32 [%rd3], %f1; 31 | ret; 32 | 33 | } 34 | // .globl write_float_value 35 | .visible .entry write_float_value( 36 | .param .u64 write_float_value_param_0, 37 | .param .f32 write_float_value_param_1 38 | ) 39 | { 40 | .reg .f32 %f<2>; 41 | .reg .b64 %rd<3>; 42 | 43 | 44 | ld.param.u64 %rd1, [write_float_value_param_0]; 45 | ld.param.f32 %f1, [write_float_value_param_1]; 46 | cvta.to.global.u64 %rd2, %rd1; 47 | st.global.f32 [%rd2], %f1; 48 | ret; 49 | 50 | } 51 | // .globl write_float_sum 52 | .visible .entry write_float_sum( 53 | .param .u64 write_float_sum_param_0, 54 | .param .u16 write_float_sum_param_1, 55 | .param .f32 write_float_sum_param_2 56 | ) 57 | { 58 | .reg .b16 %rs<2>; 59 | .reg .f32 %f<6>; 60 | .reg .b32 %r<2>; 61 | .reg .b64 %rd<3>; 62 | 63 | 64 | ld.param.u64 %rd1, [write_float_sum_param_0]; 65 | ld.param.u16 %rs1, [write_float_sum_param_1]; 66 | ld.param.f32 %f1, [write_float_sum_param_2]; 67 | cvta.to.global.u64 %rd2, %rd1; 68 | cvt.rn.f32.s16 %f2, %rs1; 69 | add.f32 %f3, %f2, %f1; 70 | ld.global.u32 %r1, [global_int]; 71 | cvt.rn.f32.s32 %f4, %r1; 72 | add.f32 %f5, %f3, %f4; 73 | st.global.f32 [%rd2], %f5; 74 | ret; 75 | 76 | } 77 | 78 | --------------------------------------------------------------------------------