├── .github
    └── workflows
    │   └── cmake-single-platform.yml
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.MD
├── driverapi
    ├── CMakeLists.txt
    ├── include
    │   ├── librecuda.h
    │   ├── librecuda_all_statuses.h
    │   └── librecuda_status.h
    ├── internal
    │   ├── cmdqueue.h
    │   ├── librecuda_internal.h
    │   ├── librecuda_status_internal.h
    │   ├── memcopy.h
    │   ├── memcopy_kernels.h
    │   └── nvidia
    │   │   ├── alloc_channel.h
    │   │   ├── cc_drv.h
    │   │   ├── cl0040.h
    │   │   ├── cl0080.h
    │   │   ├── cl0080_notification.h
    │   │   ├── cl2080.h
    │   │   ├── cl2080_notification.h
    │   │   ├── cl9010.h
    │   │   ├── cl9010_callback.h
    │   │   ├── cl9067.h
    │   │   ├── cl90f1.h
    │   │   ├── cla06c.h
    │   │   ├── cla0c0qmd.h
    │   │   ├── clc3c0qmd.h
    │   │   ├── clc461.h
    │   │   ├── clc56f.h
    │   │   ├── clc5c0qmd.h
    │   │   ├── clc6b5.h
    │   │   ├── clc6c0.h
    │   │   ├── clc6c0qmd.h
    │   │   ├── cpuopsys.h
    │   │   ├── ctrl0000base.h
    │   │   ├── ctrl0000gpu.h
    │   │   ├── ctrl0000system.h
    │   │   ├── ctrl0073base.h
    │   │   ├── ctrl0073system.h
    │   │   ├── ctrl0080base.h
    │   │   ├── ctrl0080bsp.h
    │   │   ├── ctrl0080gpu.h
    │   │   ├── ctrl0080gr.h
    │   │   ├── ctrl0080msenc.h
    │   │   ├── ctrl2080base.h
    │   │   ├── ctrl2080fifo.h
    │   │   ├── ctrl2080gpu.h
    │   │   ├── ctrl2080gr.h
    │   │   ├── ctrl2080internal.h
    │   │   ├── ctrl2080mc.1.h
    │   │   ├── ctrl2080mc.h
    │   │   ├── ctrl30f1.h
    │   │   ├── ctrl906f.h
    │   │   ├── ctrl90f1.h
    │   │   ├── ctrla06c.h
    │   │   ├── ctrla06f.h
    │   │   ├── ctrla06fbase.h
    │   │   ├── ctrla06fevent.h
    │   │   ├── ctrla06fgpfifo.h
    │   │   ├── ctrla06finternal.h
    │   │   ├── ctrlc36f.h
    │   │   ├── ctrlxxxx.h
    │   │   ├── dev_mmu.h
    │   │   ├── g_allclasses.h
    │   │   ├── mmu_fmt_types.h
    │   │   ├── nv-ioctl-numbers.h
    │   │   ├── nv-ioctl.h
    │   │   ├── nv-unix-nvos-params-wrappers.h
    │   │   ├── nvCpuUuid.h
    │   │   ├── nv_escape.h
    │   │   ├── nvcfg_sdk.h
    │   │   ├── nvgputypes.h
    │   │   ├── nvimpshared.h
    │   │   ├── nvlimits.h
    │   │   ├── nvmisc.h
    │   │   ├── nvos.h
    │   │   ├── nvstatus.h
    │   │   ├── nvstatuscodes.h
    │   │   ├── nvtypes.h
    │   │   ├── rs_access.h
    │   │   ├── uvm_ioctl.h
    │   │   ├── uvm_linux_ioctl.h
    │   │   └── uvm_types.h
    ├── kernels
    │   └── memcpy
    │   │   ├── compile_memcpy.sh
    │   │   ├── generate_header.py
    │   │   ├── memcopy_kernels.h
    │   │   ├── memcpy.cu
    │   │   └── output
    │   │       ├── memcpy_sm_50.cubin
    │   │       ├── memcpy_sm_50.ptx
    │   │       ├── memcpy_sm_52.cubin
    │   │       ├── memcpy_sm_52.ptx
    │   │       ├── memcpy_sm_53.cubin
    │   │       ├── memcpy_sm_53.ptx
    │   │       ├── memcpy_sm_60.cubin
    │   │       ├── memcpy_sm_60.ptx
    │   │       ├── memcpy_sm_61.cubin
    │   │       ├── memcpy_sm_61.ptx
    │   │       ├── memcpy_sm_62.cubin
    │   │       ├── memcpy_sm_62.ptx
    │   │       ├── memcpy_sm_70.cubin
    │   │       ├── memcpy_sm_70.ptx
    │   │       ├── memcpy_sm_72.cubin
    │   │       ├── memcpy_sm_72.ptx
    │   │       ├── memcpy_sm_75.cubin
    │   │       ├── memcpy_sm_75.ptx
    │   │       ├── memcpy_sm_80.cubin
    │   │       ├── memcpy_sm_80.ptx
    │   │       ├── memcpy_sm_86.cubin
    │   │       ├── memcpy_sm_86.ptx
    │   │       ├── memcpy_sm_87.cubin
    │   │       ├── memcpy_sm_87.ptx
    │   │       ├── memcpy_sm_89.cubin
    │   │       ├── memcpy_sm_89.ptx
    │   │       ├── memcpy_sm_90.cubin
    │   │       └── memcpy_sm_90.ptx
    └── src
    │   ├── cmdqueue.cpp
    │   ├── librecuda.cpp
    │   ├── librecuda_status.cpp
    │   └── memcopy.cpp
└── tests
    ├── CMakeLists.txt
    ├── async_kernels
        ├── CMakeLists.txt
        ├── main.cpp
        ├── write_float.asm
        ├── write_float.cu
        ├── write_float.cu.asm
        ├── write_float.cubin
        └── write_float.ptx
    ├── compile_cubin.sh
    ├── complex
        ├── CMakeLists.txt
        └── complex.ptx
    ├── compute_chronological_consistency
        ├── CMakeLists.txt
        ├── main.cpp
        ├── write_float.asm
        ├── write_float.cu
        ├── write_float.cu.asm
        ├── write_float.cubin
        └── write_float.ptx
    ├── dma_chronological_consistency
        ├── CMakeLists.txt
        └── main.cpp
    ├── dynamic_shared_mem
        ├── CMakeLists.txt
        ├── main.cpp
        ├── write_float.asm
        ├── write_float.cu
        ├── write_float.cubin
        └── write_float.ptx
    ├── indexing
        ├── CMakeLists.txt
        ├── main.cpp
        ├── write_float.asm
        ├── write_float.cu
        ├── write_float.cubin
        └── write_float.ptx
    ├── kernel_struct_param
        ├── CMakeLists.txt
        ├── main.cpp
        ├── read_from_struct.asm
        ├── read_from_struct.cu
        ├── read_from_struct.cubin
        └── read_from_struct.ptx
    ├── many_kernels_launch
        ├── CMakeLists.txt
        ├── empty_kernel.asm
        ├── empty_kernel.cu
        ├── empty_kernel.cubin
        ├── empty_kernel.ptx
        └── main.cpp
    ├── memcopy
        ├── CMakeLists.txt
        └── main.cpp
    ├── stream_events
        ├── CMakeLists.txt
        ├── main.cpp
        ├── write_float.asm
        ├── write_float.cu
        ├── write_float.cu.asm
        ├── write_float.cubin
        └── write_float.ptx
    └── write_float
        ├── CMakeLists.txt
        ├── main.cpp
        ├── write_float.asm
        ├── write_float.cu
        ├── write_float.cubin
        └── write_float.ptx


/.github/workflows/cmake-single-platform.yml:
--------------------------------------------------------------------------------
 1 | # This starter workflow is for a CMake project running on a single platform. There is a different starter workflow if you need cross-platform coverage.
 2 | # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-multi-platform.yml
 3 | name: CMake on a single platform
 4 | 
 5 | on:
 6 |   push:
 7 |     branches: [ "master" ]
 8 |   pull_request:
 9 |     branches: [ "master" ]
10 | 
11 | env:
12 |   # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.)
13 |   BUILD_TYPE: Release
14 | 
15 | jobs:
16 |   build:
17 |     # The CMake configure and build commands are platform agnostic and should work equally well on Windows or Mac.
18 |     # You can convert this to a matrix build if you need cross-platform coverage.
19 |     # See: https://docs.github.com/en/free-pro-team@latest/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix
20 |     runs-on: ubuntu-latest
21 | 
22 |     steps:
23 |     - uses: actions/checkout@v4
24 | 
25 |     - name: Configure CMake
26 |       # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
27 |       # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
28 |       run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
29 | 
30 |     - name: Build
31 |       # Build your program with the given configuration
32 |       run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
33 | 
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | 
 9 | # Precompiled Headers
10 | *.gch
11 | *.pch
12 | 
13 | # Compiled Dynamic libraries
14 | *.so
15 | *.dll
16 | 
17 | # Compiled Static libraries
18 | *.lai
19 | *.la
20 | *.a
21 | *.lib
22 | 
23 | # Executables
24 | *.exe
25 | *.out
26 | 
27 | # Visual Studio files
28 | *.sdf
29 | *.suo
30 | *.sln
31 | *.vcxproj
32 | 
33 | # Cmake folders
34 | cmake-build-*/
35 | build/
36 | debug/
37 | 
38 | # IntelliJ
39 | .idea/
40 | 
41 | # Vscode
42 | .vscode/


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "third_party/ELFIO"]
2 | 	path = third_party/ELFIO
3 | 	url = https://github.com/serge1/ELFIO
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.5)
 2 | project(LibreCuda CXX)
 3 | 
 4 | option(BUILD_LIBRECUDA_DRIVER_API_STATIC_LIB "Build the LibreCUDA driverapi as a static library" ON)
 5 | include(CheckCXXCompilerFlag)
 6 | CHECK_CXX_COMPILER_FLAG("-std=c++17" COMPILER_SUPPORTS_CXX17)
 7 | if (COMPILER_SUPPORTS_CXX17)
 8 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
 9 |     add_definitions(-DCOMPILEDWITHC17)
10 |     message(STATUS "Using flag -std=c++17.")
11 | else ()
12 |     message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++17 support. Please use a different C++ compiler.")
13 | endif ()
14 | 
15 | # Fetch the ELFIO submodule on demand. git is run from this source directory so out-of-tree build directories work too.
16 | if (NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/third_party/ELFIO/elfio")
17 |     message("-- ELFIO not found, fetching ELFIO...")
18 |     execute_process(COMMAND git submodule update --init WORKING_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}")
19 | else ()
20 |     message("-- ELFIO found.")
21 | endif ()
22 | add_subdirectory(third_party/ELFIO)
23 | 
24 | add_subdirectory(driverapi)
25 | 
26 | add_subdirectory(tests)
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | /*
 2 | * Copyright 2024 Michael Keiblinger
 3 | *
 4 | * Permission is hereby granted, free of charge, to any person obtaining
 5 | * a copy of this software and associated documentation files
 6 | * (the "Software"), to deal in the Software without restriction,
 7 | * including without limitation the rights to use, copy, modify, merge,
 8 | * publish, distribute, sublicense, and/or sell copies of the Software,
 9 | * and to permit persons to whom the Software is furnished to do so,
10 | * subject to the following conditions:
11 | *
12 | * The above copyright notice and this permission notice shall be
13 | * included in all copies or substantial portions of the Software.
14 | *
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
18 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
20 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
21 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22 | */
23 | 


--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
  1 | # LibreCUDA
  2 | 
  3 | LibreCUDA is a project aimed at replacing the CUDA driver API to enable launching CUDA code on Nvidia GPUs without
  4 | relying on the proprietary CUDA runtime. It achieves this by communicating directly with the hardware via ioctls
  5 | (specifically what Nvidia's open-gpu-kernel-modules refer to as the rmapi), as well as QMD, Nvidia's MMIO command queue
  6 | structure. LibreCUDA is capable of uploading CUDA ELF binaries onto the GPU and launching them via the command queue.
  7 | 
  8 | ## Current features
  9 | 
 10 | - Allocate and free GPU memory and map the memory to be accessible by the CPU
 11 | - Upload CUDA kernels (CUDA ELF binaries)
 12 | - Set dynamic shared memory for CUDA functions
 13 | - Launch CUDA kernels
 14 | - Supports cheap async kernel launches on a single stream
 15 | - Host-to-device (DMA), device-to-device (compute), and device-to-host (DMA) memcpy
 16 | - Supports cheap async memcpys on a single stream
 17 | 
 18 | ## Example
 19 | 
 20 | Below is an example demonstrating the usage of LibreCUDA:
 21 | 
 22 | ```cpp
 23 | int main() {
 24 |     libreCuInit(0);
 25 | 
 26 |     int device_count{};
 27 |     libreCuDeviceGetCount(&device_count);
 28 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 29 | 
 30 |     LibreCUdevice device{};
 31 |     libreCuDeviceGet(&device, 0);
 32 | 
 33 |     LibreCUcontext ctx{};
 34 |     libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device);
 35 | 
 36 |     LibreCUmodule module{};
 37 | 
 38 |     uint8_t *image;
 39 |     size_t n_bytes;
 40 |     {
 41 |         std::ifstream input("write_float.cubin", std::ios::binary);
 42 |         std::vector<uint8_t> bytes(
 43 |                 (std::istreambuf_iterator<char>(input)),
 44 |                 (std::istreambuf_iterator<char>()));
 45 |         input.close();
 46 |         image = new uint8_t[bytes.size()];
 47 |         doMemcpy(image, bytes.data(), bytes.size());
 48 |         n_bytes = bytes.size();
 49 |     }
 50 |     libreCuModuleLoadData(&module, image, n_bytes);
 51 | 
 52 |     uint32_t num_funcs{};
 53 |     libreCuModuleGetFunctionCount(&num_funcs, module);
 54 |     std::cout << "Num functions: " << num_funcs << std::endl;
 55 | 
 56 |     auto *functions = new LibreCUFunction[num_funcs];
 57 |     libreCuModuleEnumerateFunctions(functions, num_funcs, module);
 58 | 
 59 |     for (size_t i = 0; i < num_funcs; i++) {
 60 |         LibreCUFunction func = functions[i];
 61 |         const char *func_name{};
 62 |         libreCuFuncGetName(&func_name, func);
 63 |         std::cout << "  function \"" << func_name << "\"" << std::endl;
 64 |     }
 65 | 
 66 |     delete[] functions;
 67 | 
 68 |     LibreCUFunction func{};
 69 |     libreCuModuleGetFunction(&func, module, "write_float");
 70 | 
 71 |     LibreCUstream stream{};
 72 |     libreCuStreamCreate(&stream, 0);
 73 | 
 74 |     void *float_dst_va{};
 75 |     libreCuMemAlloc(&float_dst_va, sizeof(float), true);
 76 | 
 77 |     float float_value = 3.1415f;
 78 |     void *float_src_va{};
 79 |     libreCuMemAlloc(&float_src_va, sizeof(float), true);
 80 |     *(float *) (float_src_va) = float_value;
 81 | 
 82 |     std::cout << "Src value: " << float_value << std::endl;
 83 |     std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl;
 84 | 
 85 |     void *params[] = {
 86 |             &float_dst_va, // dst
 87 |             &float_src_va // src
 88 |     };
 89 |     libreCuLaunchKernel(func,
 90 |                         1, 1, 1,
 91 |                         1, 1, 1,
 92 |                         0,
 93 |                         stream,
 94 |                         params, sizeof(params) / sizeof(void *),
 95 |                         nullptr
 96 |     );
 97 | 
 98 |     libreCuStreamCommence(stream);
 99 |     
100 |     libreCuStreamAwait(stream);
101 |     std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl;
102 | 
103 |     libreCuMemFree(float_dst_va);
104 |     libreCuStreamDestroy(stream);
105 |     libreCuModuleUnload(module);
106 |     libreCuCtxDestroy(ctx);
107 |     return 0;
108 | }
109 | ```
110 | 
111 | ### Outputs
112 | 
113 | ```console
114 | Device count: 1
115 | Num functions: 1
116 |   function "write_float"
117 | Src value: 3.1415
118 | Dst value (pre exec): 0
119 | Dst value (post exec): 3.1415
120 | ```
121 | 
122 | ### How to Use
123 | 
124 | The recommended way to use LibreCUDA is to clone the LibreCUDA repository and link against the `driverapi` library in
125 | CMake:
126 | 
127 | ```
128 | git clone --recurse https://github.com/mikex86/LibreCuda.git
129 | ```
130 | 
131 | #### Add the repository as a CMake directory
132 | 
133 | ```cmake
134 | add_subdirectory(LibreCuda)
135 | ```
136 | 
137 | #### Link against the driver api library
138 | 
139 | ```cmake
140 | target_link_libraries(YourTarget PRIVATE driverapi)
141 | ```
142 | 
143 | #### Include headers
144 | 
145 | ```c
146 | #include <librecuda.h>
147 | ```
148 | 
149 | ## Project Status
150 | 
151 | The project is in its early stages and currently implements only rudimentary CUDA functions. It is not yet ready for
152 | production use.
153 | 
154 | ## Contributing
155 | 
156 | Contributions are welcome! Please submit issues and pull requests to help improve LibreCUDA.
157 | 
158 | ## License
159 | 
160 | This project is licensed under the MIT License.
161 | 


--------------------------------------------------------------------------------
/driverapi/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(
 2 |         DRIVERAPI_SOURCES
 3 |         src/librecuda.cpp
 4 |         src/librecuda_status.cpp
 5 |         src/cmdqueue.cpp
 6 |         src/memcopy.cpp
 7 |         internal/memcopy.h
 8 |         internal/memcopy_kernels.h
 9 | )
10 | if (BUILD_LIBRECUDA_DRIVER_API_STATIC_LIB)
11 |     add_library(driverapi STATIC ${DRIVERAPI_SOURCES})
12 | else()
13 |     add_library(driverapi SHARED ${DRIVERAPI_SOURCES})
14 | endif()
15 | 
16 | target_include_directories(driverapi PRIVATE "internal/")
17 | target_include_directories(driverapi PUBLIC "include/")
18 | target_link_libraries(driverapi PRIVATE elfio)
19 | 


--------------------------------------------------------------------------------
/driverapi/include/librecuda_all_statuses.h:
--------------------------------------------------------------------------------
 1 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_SUCCESS, 0)
 2 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_VALUE, 1)
 3 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_OUT_OF_MEMORY, 2)
 4 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_INITIALIZED, 3)
 5 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_DEVICE, 101)
 6 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_IMAGE, 200)
 7 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_INVALID_CONTEXT, 201)
 8 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_FOUND, 500)
 9 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_NOT_READY, 600)
10 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, 701)
11 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE, 804)
12 | LIBRECUDA_DECLARE_STATUS(LIBRECUDA_ERROR_UNKNOWN, 999)


--------------------------------------------------------------------------------
/driverapi/include/librecuda_status.h:
--------------------------------------------------------------------------------
 1 | #ifndef LIBRECUDA_VALIDATE_H
 2 | #define LIBRECUDA_VALIDATE_H
 3 | 
 4 | #include <iostream>
 5 | // Integer status code type; the named status values are declared further below from librecuda_all_statuses.h.
 6 | typedef int libreCudaStatus_t;
 7 | // LIBRECUDA_DEBUG streams `msg` to std::cerr with a fixed prefix; the two-step stringify helpers expand their argument (e.g. __LINE__) before quoting it.
 8 | #define LIBRECUDA_DEBUG(msg) std::cerr << "[LibreCuda Debug]: " << msg << std::endl
 9 | #define __LIBRECUDA_STRINGIFY(x) #x
10 | #define __LIBRECUDA_TOSTRING(x) __LIBRECUDA_STRINGIFY(x)
11 | // If `condition` is false: log file:line plus the condition text, then return `err` from the enclosing function.
12 | #define LIBRECUDA_VALIDATE(condition, err) { \
13 |     if (!(condition)) {                      \
14 |         LIBRECUDA_DEBUG(__FILE__ ":" __LIBRECUDA_TOSTRING(__LINE__) ": " #condition);         \
15 |         return err;                          \
16 |     }                                        \
17 | }
18 | 
19 | // Declare every status listed in librecuda_all_statuses.h as an extern "C" variable (the `code` argument is unused by this expansion).
20 | #define LIBRECUDA_DECLARE_STATUS(status, code) extern "C" libreCudaStatus_t status;
21 | #include "librecuda_all_statuses.h"
22 | #undef LIBRECUDA_DECLARE_STATUS
23 | // Return helpers: both expand to a braced `return` statement in the enclosing function.
24 | #define LIBRECUDA_SUCCEED() { return LIBRECUDA_SUCCESS; }
25 | #define LIBRECUDA_FAIL(status) { return status; }
26 | // Evaluate `status` once; if the result is not LIBRECUDA_SUCCESS, return it from the enclosing function.
27 | #define LIBRECUDA_ERR_PROPAGATE(status) { libreCudaStatus_t status_val = status; if (status_val != LIBRECUDA_SUCCESS) { LIBRECUDA_FAIL(status_val); } }
28 | 
29 | #endif //LIBRECUDA_VALIDATE_H
30 | 


--------------------------------------------------------------------------------
/driverapi/internal/librecuda_status_internal.h:
--------------------------------------------------------------------------------
 1 | #ifndef LIBRECUDA_LIBRECUDA_STATUS_INTERNAL_H
 2 | #define LIBRECUDA_LIBRECUDA_STATUS_INTERNAL_H
 3 | 
 4 | void internalLibreCuInitStatusNames();
 5 | 
 6 | const char *internalLibreCuGetStatusName(int code);
 7 | 
 8 | bool internalLibreCuInitStatusNamesInitialized();
 9 | 
10 | #endif //LIBRECUDA_LIBRECUDA_STATUS_INTERNAL_H
11 | 


--------------------------------------------------------------------------------
/driverapi/internal/memcopy.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <librecuda.h>
4 | #include <cstddef>
5 | 
6 | libreCudaStatus_t loadMemcpyKernelsIfNeeded(LibreCUdevice device);
7 | 
8 | libreCudaStatus_t memcpyD2D(void *dst, void *src, size_t size, LibreCUstream stream, bool async);


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cc_drv.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2021-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | 
 24 | #pragma once
 25 | 
 26 | #include "nvtypes.h"
 27 | 
 28 | //
 29 | // This file was generated with FINN, an NVIDIA coding tool.
 30 | // Source file:      cc_drv.finn
 31 | //
 32 | 
 33 | 
 34 | 
 35 | #include "nvtypes.h"
 36 | #include "nvcfg_sdk.h"
 37 | 
 38 | // CLASS NV_CONF_COMPUTE
 39 | #define CC_AES_256_GCM_IV_SIZE_BYTES  (0xcU) /* finn: Evaluated from "(96 / 8)" */
 40 | #define CC_AES_256_GCM_IV_SIZE_DWORD  (0x3U) /* finn: Evaluated from "(CC_AES_256_GCM_IV_SIZE_BYTES / 4)" */
 41 | #define CC_AES_256_GCM_KEY_SIZE_BYTES (0x20U) /* finn: Evaluated from "(256 / 8)" */
 42 | #define CC_AES_256_GCM_KEY_SIZE_DWORD (0x8U) /* finn: Evaluated from "(CC_AES_256_GCM_KEY_SIZE_BYTES / 4)" */
 43 | 
 44 | #define CC_HMAC_NONCE_SIZE_BYTES      (0x20U) /* finn: Evaluated from "(256 / 8)" */
 45 | #define CC_HMAC_NONCE_SIZE_DWORD      (0x8U) /* finn: Evaluated from "(CC_HMAC_NONCE_SIZE_BYTES / 4)" */
 46 | #define CC_HMAC_KEY_SIZE_BYTES        (0x20U) /* finn: Evaluated from "(256 / 8)" */
 47 | #define CC_HMAC_KEY_SIZE_DWORD        (0x8U) /* finn: Evaluated from "(CC_HMAC_KEY_SIZE_BYTES / 4)" */
 48 | 
 49 | 
 50 | // Type is shared between CC control calls and RMKeyStore
 51 | typedef enum ROTATE_IV_TYPE {
 52 |     ROTATE_IV_ENCRYPT = 0,  // Rotate the IV for encryptBundle
 53 |     ROTATE_IV_DECRYPT = 1,  // Rotate the IV for decryptBundle
 54 |     ROTATE_IV_HMAC = 2,     // Rotate the IV for hmacBundle
 55 |     ROTATE_IV_ALL_VALID = 3, // Rotate the IV for all valid bundles in the KMB
 56 | } ROTATE_IV_TYPE;
 57 | 
 58 | // Status value written into NvNotification.Info16
 59 | typedef enum KEY_ROTATION_STATUS {
 60 |     KEY_ROTATION_STATUS_IDLE = 0,                    // Key rotation complete/not in progress
 61 |     KEY_ROTATION_STATUS_PENDING = 1,                 // RM is waiting for clients to report their channels are idle for key rotation
 62 |     KEY_ROTATION_STATUS_IN_PROGRESS = 2,             // Key rotation is in progress
 63 |     KEY_ROTATION_STATUS_FAILED_TIMEOUT = 3,          // Key rotation timeout failure, RM will RC non-idle channels
 64 |     KEY_ROTATION_STATUS_FAILED_THRESHOLD = 4,        // Key rotation failed because upper threshold was crossed, RM will RC non-idle channels
 65 |     KEY_ROTATION_STATUS_FAILED_ROTATION = 5,         // Internal RM failure while rotating keys for a certain channel, RM will RC the channel
 66 |     KEY_ROTATION_STATUS_PENDING_TIMER_SUSPENDED = 6, // Key rotation timer suspended waiting for kernel key rotation to complete
 67 |     KEY_ROTATION_STATUS_MAX_COUNT = 7,
 68 | } KEY_ROTATION_STATUS;
 69 | 
 70 | typedef struct CC_AES_CRYPTOBUNDLE {
 71 |     NvU32 iv[CC_AES_256_GCM_IV_SIZE_DWORD];
 72 |     NvU32 key[CC_AES_256_GCM_KEY_SIZE_DWORD];
 73 |     NvU32 ivMask[CC_AES_256_GCM_IV_SIZE_DWORD];
 74 | } CC_AES_CRYPTOBUNDLE;
 75 | typedef struct CC_AES_CRYPTOBUNDLE *PCC_AES_CRYPTOBUNDLE;
 76 | 
 77 | typedef struct CC_HMAC_CRYPTOBUNDLE {
 78 |     NvU32 nonce[CC_HMAC_NONCE_SIZE_DWORD];
 79 |     NvU32 key[CC_HMAC_KEY_SIZE_DWORD];
 80 | } CC_HMAC_CRYPTOBUNDLE;
 81 | typedef struct CC_HMAC_CRYPTOBUNDLE *PCC_HMAC_CRYPTOBUNDLE;
 82 | 
 83 | typedef struct CC_KMB {
 84 |     CC_AES_CRYPTOBUNDLE encryptBundle;           // Bundle of encryption material
 85 |     // The two bundles below share storage (anonymous union).
 86 |     union {
 87 |         CC_HMAC_CRYPTOBUNDLE hmacBundle;  // HMAC bundle used for method stream authenticity
 88 |         CC_AES_CRYPTOBUNDLE  decryptBundle;   // Bundle of decryption material
 89 |     };
 90 |     NvBool bIsWorkLaunch;                        // False if decryption parameters are valid
 91 | } CC_KMB;
 92 | typedef struct CC_KMB *PCC_KMB;
 93 | 
 94 | typedef struct CC_CRYPTOBUNDLE_STATS {
 95 |     NV_DECLARE_ALIGNED(NvU64 numEncryptionsH2D, 8);
 96 |     NV_DECLARE_ALIGNED(NvU64 numEncryptionsD2H, 8);
 97 |     NV_DECLARE_ALIGNED(NvU64 bytesEncryptedH2D, 8);
 98 |     NV_DECLARE_ALIGNED(NvU64 bytesEncryptedD2H, 8);
 99 | } CC_CRYPTOBUNDLE_STATS;
100 | typedef struct CC_CRYPTOBUNDLE_STATS *PCC_CRYPTOBUNDLE_STATS;
101 | 
102 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl0040.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2001-2001 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | 
25 | #ifndef _cl0040_h_
26 | #define _cl0040_h_
27 | 
28 | #ifdef __cplusplus
29 | extern "C" {
30 | #endif
31 | 
32 | #include "nvtypes.h"
33 | 
34 | #define  NV01_MEMORY_LOCAL_USER                                    (0x00000040)
35 | /* NvNotification[] fields and values */
36 | #define NV040_NOTIFICATION_STATUS_ERROR_PROTECTION_FAULT           (0x4000)
37 | /* pio method data structure */
38 | typedef volatile struct _cl0040_tag0 {
39 |  NvV32 Reserved00[0x7c0];
40 | } Nv040Typedef, Nv01MemoryLocalUser;
41 | #define NV040_TYPEDEF                                       Nv01MemoryLocalUser
42 | /* obsolete stuff */
43 | #define NV01_MEMORY_USER                                           (0x00000040)
44 | #define NV1_MEMORY_USER                                            (0x00000040)
45 | #define Nv01MemoryUser                                      Nv01MemoryLocalUser
46 | #define nv01MemoryUser                                      Nv01MemoryLocalUser
47 | #define Nv1MemoryUser                                       Nv01MemoryLocalUser
48 | #define nv1MemoryUser                                       Nv01MemoryLocalUser
49 | #define nv01MemoryLocalUser                                 Nv01MemoryLocalUser
50 | 
51 | #ifdef __cplusplus
52 | };     /* extern "C" */
53 | #endif
54 | 
55 | #endif /* _cl0040_h_ */
56 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl0080.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2001-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | #include "nvtypes.h"
27 | 
28 | //
29 | // This file was generated with FINN, an NVIDIA coding tool.
30 | // Source file:      class/cl0080.finn
31 | //
32 | 
33 | #include "nvlimits.h"
34 | #include "cl0080_notification.h"
35 | 
36 | #define NV01_DEVICE_0      (0x80U) /* finn: Evaluated from "NV0080_ALLOC_PARAMETERS_MESSAGE_ID" */
37 | 
38 | /* NvAlloc parameters */
39 | #define NV0080_MAX_DEVICES NV_MAX_DEVICES
40 | 
41 | /**
42 |  * @brief Allocation parameters for the NV01_DEVICE_0 class (struct NV0080_ALLOC_PARAMETERS below).
43 |  *
44 |  * @param vaMode mode for virtual address space allocation
45 |  *  Three modes:
46 |  *  NV_DEVICE_ALLOCATION_VAMODE_OPTIONAL_MULTIPLE_VASPACES
47 |  *  NV_DEVICE_ALLOCATION_VAMODE_SINGLE_VASPACE
48 |  *  NV_DEVICE_ALLOCATION_VAMODE_MULTIPLE_VASPACES
49 |  *  Detailed description of these modes is in nvos.h
50 |  **/
51 | 
52 | #define NV0080_ALLOC_PARAMETERS_MESSAGE_ID (0x0080U) /* same numeric value as NV01_DEVICE_0 (0x80U) */
53 | 
54 | typedef struct NV0080_ALLOC_PARAMETERS {
55 |     NvU32    deviceId;      /* NOTE(review): presumably a device index bounded by NV0080_MAX_DEVICES -- confirm */
56 |     NvHandle hClientShare;  /* handle; semantics are not described in this header -- TODO confirm in nvos.h */
57 |     NvHandle hTargetClient; /* handle; semantics are not described in this header -- TODO confirm in nvos.h */
58 |     NvHandle hTargetDevice; /* handle; semantics are not described in this header -- TODO confirm in nvos.h */
59 |     NvV32    flags;         /* NOTE(review): valid flag bits are not defined in this header -- confirm against nvos.h */
60 |     NV_DECLARE_ALIGNED(NvU64 vaSpaceSize, 8);     /* 64-bit field declared via NV_DECLARE_ALIGNED with alignment argument 8 */
61 |     NV_DECLARE_ALIGNED(NvU64 vaStartInternal, 8); /* 64-bit field declared via NV_DECLARE_ALIGNED with alignment argument 8 */
62 |     NV_DECLARE_ALIGNED(NvU64 vaLimitInternal, 8); /* 64-bit field declared via NV_DECLARE_ALIGNED with alignment argument 8 */
63 |     NvV32    vaMode;        /* one of the NV_DEVICE_ALLOCATION_VAMODE_* modes listed in the comment above */
64 | } NV0080_ALLOC_PARAMETERS;
65 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl0080_notification.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #ifndef _cl0080_notification_h_
25 | #define _cl0080_notification_h_
26 | 
27 | #ifdef __cplusplus
28 | extern "C" {
29 | #endif
30 | 
31 | /* NvNotification[] fields and values */
32 | #define NV080_NOTIFICATION_STATUS_ERROR_PROTECTION_FAULT           (0x4000)
33 | 
34 | /* pio method data structure: a volatile struct whose only member is a reserved array of 0x7c0 NvV32 entries. NvV32 is not declared in this header, so the including file must declare it first. */
35 | typedef volatile struct _cl0080_tag0 {
36 |  NvV32 Reserved00[0x7c0]; /* reserved area; no named method fields are exposed by this struct */
37 | } Nv080Typedef, Nv01Device0; /* two typedef names for the same volatile struct type */
38 | 
39 | #define  NV080_TYPEDEF                                             Nv01Device0
40 | 
41 | #ifdef __cplusplus
42 | };     /* extern "C" */
43 | #endif
44 | 
45 | #endif /* _cl0080_notification_h_ */
46 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl2080.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2002-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | #include "nvtypes.h"
27 | 
28 | //
29 | // This file was generated with FINN, an NVIDIA coding tool.
30 | // Source file:      class/cl2080.finn
31 | //
32 | 
33 | #include "nvlimits.h"
34 | #include "cl2080_notification.h"
35 | 
36 | #define NV20_SUBDEVICE_0      (0x2080U) /* finn: Evaluated from "NV2080_ALLOC_PARAMETERS_MESSAGE_ID" */
37 | 
38 | /* NvAlloc parameters */
39 | #define NV2080_MAX_SUBDEVICES NV_MAX_SUBDEVICES
40 | 
41 | #define NV2080_ALLOC_PARAMETERS_MESSAGE_ID (0x2080U) /* same numeric value as NV20_SUBDEVICE_0 (0x2080U) */
42 | 
43 | typedef struct NV2080_ALLOC_PARAMETERS {
44 |     NvU32 subDeviceId; /* NOTE(review): presumably a subdevice index bounded by NV2080_MAX_SUBDEVICES -- confirm */
45 | } NV2080_ALLOC_PARAMETERS;
46 | 
47 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl9010.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in 
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | 
23 | #pragma once
24 | 
25 | #include "nvtypes.h"
26 | 
27 | //
28 | // This file was generated with FINN, an NVIDIA coding tool.
29 | // Source file:      class/cl9010.finn
30 | //
31 | 
32 | #include "cl9010_callback.h"
33 | 
34 | #define NV9010_VBLANK_CALLBACK (0x9010U) /* finn: Evaluated from "NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS_MESSAGE_ID" */
35 | 
36 | #define NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS_MESSAGE_ID (0x9010U) /* same numeric value as NV9010_VBLANK_CALLBACK (0x9010U) */
37 | 
38 | typedef struct NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS {
39 |     NV_DECLARE_ALIGNED(NvP64 pProc, 8);      // Routine to call at vblank time;
40 |                                              // carried as an NvP64 that holds a function pointer of type OSVBLANKCALLBACKPROC
41 |     NvV32 LogicalHead;                       // Logical head -- NOTE(review): presumably the head whose vblank triggers pProc; confirm
42 |     NV_DECLARE_ALIGNED(NvP64 pParm1, 8);     // NOTE(review): presumably forwarded to pProc as its pParm1 argument -- confirm
43 |     NV_DECLARE_ALIGNED(NvP64 pParm2, 8);     // NOTE(review): presumably forwarded to pProc as its pParm2 argument -- confirm
44 | } NV_VBLANK_CALLBACK_ALLOCATION_PARAMETERS;
45 | 
46 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl9010_callback.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 3 |  *
 4 |  * Permission is hereby granted, free of charge, to any person obtaining a
 5 |  * copy of this software and associated documentation files (the "Software"),
 6 |  * to deal in the Software without restriction, including without limitation
 7 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 |  * and/or sell copies of the Software, and to permit persons to whom the
 9 |  * Software is furnished to do so, subject to the following conditions:
10 |  *
11 |  * The above copyright notice and this permission notice shall be included in 
12 |  * all copies or substantial portions of the Software.
13 |  *
14 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |  * DEALINGS IN THE SOFTWARE.
21 |  */
22 | #ifndef SDK_CL9010_CALLBACK_H
23 | #define SDK_CL9010_CALLBACK_H
24 | 
25 | typedef void (*OSVBLANKCALLBACKPROC)(NvP64 pParm1, NvP64 pParm2); // Callback pointer type: returns nothing, takes two NvP64 arguments. NvP64 is not declared in this header; the includer must declare it first.
26 | 
27 | #endif // SDK_CL9010_CALLBACK_H
28 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl9067.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2010-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #include "nvtypes.h"
25 | 
26 | #ifndef _cl9067_h_
27 | #define _cl9067_h_
28 | 
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | 
33 | #define FERMI_CONTEXT_SHARE_A                                     (0x00009067)
34 | 
35 | #ifdef __cplusplus
36 | };     /* extern "C" */
37 | #endif
38 | #endif // _cl9067_h_
39 | 
40 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cl90f1.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2011 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #include "nvtypes.h"
25 | 
26 | #ifndef _cl90f1_h_
27 | #define _cl90f1_h_
28 | 
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | 
33 | #define FERMI_VASPACE_A                                     (0x000090f1)
34 | 
35 | #ifdef __cplusplus
36 | };     /* extern "C" */
37 | #endif
38 | #endif // _cl90f1_h_
39 | 
40 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/cla06c.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2010-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #include "nvtypes.h"
25 | 
26 | #ifndef _cla06c_h_
27 | #define _cla06c_h_
28 | 
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 | 
33 | #define KEPLER_CHANNEL_GROUP_A                                     (0x0000A06C)
34 | 
35 | #ifdef __cplusplus
36 | };     /* extern "C" */
37 | #endif
38 | #endif // _cla06c_h_
39 | 
40 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/clc461.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #ifndef _clc461_h_
25 | #define _clc461_h_
26 | 
27 | #define TURING_USERMODE_A (0xc461)
28 | 
29 | #endif // _clc461_h_
30 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrl0000base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2005-2015 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      ctrl/ctrl0000/ctrl0000base.finn
29 | //
30 | 
31 | #include "ctrlxxxx.h"
32 | /* NV01_ROOT (client) control commands and parameters */
33 | 
34 | #define NV0000_CTRL_CMD(cat,idx)          NVXXXX_CTRL_CMD(0x0000,NV0000_CTRL_##cat,idx) /* pastes cat onto NV0000_CTRL_ to select a category constant, then forwards 0x0000, that category and idx to NVXXXX_CTRL_CMD */
35 | 
36 | /* Client command categories (6 bits); values are not contiguous -- 0x08 is unassigned in this list */
37 | #define NV0000_CTRL_RESERVED       (0x00)
38 | #define NV0000_CTRL_SYSTEM         (0x01)
39 | #define NV0000_CTRL_GPU            (0x02)
40 | #define NV0000_CTRL_GSYNC          (0x03)
41 | #define NV0000_CTRL_DIAG           (0x04)
42 | #define NV0000_CTRL_EVENT          (0x05)
43 | #define NV0000_CTRL_NVD            (0x06)
44 | #define NV0000_CTRL_SWINSTR        (0x07)
45 | #define NV0000_CTRL_PROC           (0x09)
46 | #define NV0000_CTRL_SYNC_GPU_BOOST (0x0A)
47 | #define NV0000_CTRL_GPUACCT        (0x0B)
48 | #define NV0000_CTRL_VGPU           (0x0C)
49 | #define NV0000_CTRL_CLIENT         (0x0D)
50 | 
51 | // per-OS categories start at highest category and work backwards
52 | #define NV0000_CTRL_OS_WINDOWS     (0x3F)
53 | #define NV0000_CTRL_OS_MACOS       (0x3E)
54 | #define NV0000_CTRL_OS_UNIX        (0x3D)
55 | 
56 | 
57 | /*
58 |  * NV0000_CTRL_CMD_NULL
59 |  *
60 |  * This command does nothing.
61 |  * This command does not take any parameters.
62 |  *
63 |  * Possible status values returned are:
64 |  *   NV_OK
65 |  */
66 | #define NV0000_CTRL_CMD_NULL       (0x0) /* finn: Evaluated from "(FINN_NV01_ROOT_RESERVED_INTERFACE_ID << 8) | 0x0" */
67 | 
68 | /* _ctrl0000_base_h_ */
69 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrl0073base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2019-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      ctrl/ctrl0073/ctrl0073base.finn
29 | //
30 | 
31 | #include "ctrlxxxx.h"
32 | /* NV04_DISPLAY_COMMON control commands and parameters */
33 | 
34 | #define NV0073_CTRL_CMD(cat,idx)                NVXXXX_CTRL_CMD(0x0073, NV0073_CTRL_##cat, idx) /* pastes cat onto NV0073_CTRL_ to select a category constant, then forwards 0x0073, that category and idx to NVXXXX_CTRL_CMD */
35 | 
36 | /* NV04_DISPLAY_COMMON command categories (6 bits); values are not contiguous (nothing between 0x05 and 0x11, and 0x12 is unassigned in this list) */
37 | #define NV0073_CTRL_RESERVED (0x00U)
38 | #define NV0073_CTRL_SYSTEM   (0x01U)
39 | #define NV0073_CTRL_SPECIFIC (0x02U)
40 | #define NV0073_CTRL_EVENT    (0x03U)
41 | #define NV0073_CTRL_INTERNAL (0x04U)
42 | #define NV0073_CTRL_COMMON   (0x05U)
43 | #define NV0073_CTRL_DFP      (0x11U)
44 | #define NV0073_CTRL_DP       (0x13U)
45 | #define NV0073_CTRL_SVP      (0x14U)
46 | #define NV0073_CTRL_DPU      (0x15U)
47 | #define NV0073_CTRL_PSR      (0x16U)
48 | #define NV0073_CTRL_STEREO   (0x17U)
49 | 
50 | /*
51 |  * NV0073_CTRL_CMD_NULL
52 |  *
53 |  * This command does nothing.
54 |  * This command does not take any parameters.
55 |  *
56 |  * Possible status values returned are:
57 |  *   NV_OK
58 |  */
59 | #define NV0073_CTRL_CMD_NULL (0x730000U) /* finn: Evaluated from "(FINN_NV04_DISPLAY_COMMON_RESERVED_INTERFACE_ID << 8) | 0x0" */
60 | 
61 | /* _ctrl0073base_h_ */
62 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrl0080base.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2019-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      ctrl/ctrl0080/ctrl0080base.finn
29 | //
30 | 
31 | #include "ctrlxxxx.h"
32 | /* NV01_DEVICE_XX/NV03_DEVICE control commands and parameters */
33 | 
34 | #define NV0080_CTRL_CMD(cat,idx)                NVXXXX_CTRL_CMD(0x0080, NV0080_CTRL_##cat, idx) /* pastes cat onto NV0080_CTRL_ to select a category constant, then forwards 0x0080, that category and idx to NVXXXX_CTRL_CMD */
35 | 
36 | /* GPU device command categories (6 bits). Exception: NV0080_CTRL_PERF_LEGACY_NON_PRIVILEGED (0x99) does not fit in 6 bits; it equals NV0080_CTRL_PERF (0x19) with bit 0x80 also set. */
37 | #define NV0080_CTRL_RESERVED                   (0x00)
38 | #define NV0080_CTRL_BIF                        (0x01)
39 | #define NV0080_CTRL_GPU                        (0x02)
40 | #define NV0080_CTRL_CLK                        (0x10)
41 | #define NV0080_CTRL_GR                         (0x11)
42 | #define NV0080_CTRL_CIPHER                     (0x12)
43 | #define NV0080_CTRL_FB                         (0x13)
44 | #define NV0080_CTRL_HOST                       (0x14)
45 | #define NV0080_CTRL_VIDEO                      (0x15)
46 | #define NV0080_CTRL_FIFO                       (0x17)
47 | #define NV0080_CTRL_DMA                        (0x18)
48 | #define NV0080_CTRL_PERF                       (0x19)
49 | #define NV0080_CTRL_PERF_LEGACY_NON_PRIVILEGED (0x99) /* finn: Evaluated from "(NV0080_CTRL_PERF | NVxxxx_CTRL_LEGACY_NON_PRIVILEGED)" */
50 | #define NV0080_CTRL_MSENC                      (0x1B)
51 | #define NV0080_CTRL_BSP                        (0x1C)
52 | #define NV0080_CTRL_RC                         (0x1D)
53 | #define NV0080_CTRL_OS_UNIX                    (0x1E)
54 | #define NV0080_CTRL_NVJPG                      (0x1F)
55 | #define NV0080_CTRL_INTERNAL                   (0x20)
56 | #define NV0080_CTRL_NVLINK                     (0x21)
57 | 
58 | /*
59 |  * NV0080_CTRL_CMD_NULL
60 |  *
61 |  * This command does nothing.
62 |  * This command does not take any parameters.
63 |  *
64 |  * Possible status values returned are:
65 |  *   NV_OK
66 |  */
67 | #define NV0080_CTRL_CMD_NULL                   (0x800000) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_RESERVED_INTERFACE_ID << 8) | 0x0" */
68 | 
69 | /* _ctrl0080base_h_ */
70 | 
71 | /* Extract a device cap setting from a category-specific caps table: expands to NV0080_CTRL_<cat>_GET_CAP(tbl, NV0080_CTRL_<cat>_CAPS_<c>). Those per-category macros are not defined in this header. */
72 | #define NV0080_CTRL_GET_CAP(cat,tbl,c)    \
73 |     NV0080_CTRL_##cat##_GET_CAP(tbl, NV0080_CTRL_##cat##_CAPS_##c)
74 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrl0080bsp.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2014-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | #pragma once
 24 | 
 25 | #include "nvtypes.h"
 26 | 
 27 | //
 28 | // This file was generated with FINN, an NVIDIA coding tool.
 29 | // Source file:      ctrl/ctrl0080/ctrl0080bsp.finn
 30 | //
 31 | 
 32 | #include "ctrl0080base.h"
 33 | 
 34 | /* NV01_DEVICE_XX/NV03_DEVICE bit stream processor control commands and parameters */
 35 | 
 36 | /*
 37 |  * NV0080_CTRL_CMD_BSP_GET_CAPS
 38 |  *
 39 |  * This command returns the set of BSP capabilities for the device
 40 |  * in the form of an array of unsigned bytes.  BSP capabilities
 41 |  * include supported features and required workarounds for the decoder
 42 |  * within the device, each represented by a byte offset into the
 43 |  * table and a bit position within that byte.
 44 |  *
 45 |  *   capsTblSize
 46 |  *     This parameter specifies the size in bytes of the caps table.
 47 |  *     This value should be set to NV0080_CTRL_BSP_CAPS_TBL_SIZE.
 48 |  *   capsTbl
 49 |  *     This parameter specifies a pointer to the client's caps table buffer
 50 |  *     into which the BSP caps bits will be transferred by the RM.
 51 |  *     The caps table is an array of unsigned bytes.
 52 |  *   instanceId
 53 |  *     This parameter specifies the instance Id of NVDEC for which
 54 |  *     cap bits are requested. 
 55 |  *
 56 |  * Possible status values returned are:
 57 |  *   NV_OK
 58 |  *   NV_ERR_INVALID_PARAM_STRUCT
 59 |  *   NV_ERR_INVALID_ARGUMENT
 60 |  */
 61 | 
 62 | #define NV0080_CTRL_CMD_BSP_GET_CAPS (0x801c01) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_BSP_INTERFACE_ID << 8) | NV0080_CTRL_BSP_GET_CAPS_PARAMS_MESSAGE_ID" */
 63 | 
 64 | #define NV0080_CTRL_BSP_GET_CAPS_PARAMS_MESSAGE_ID (0x1U)
 65 | 
 66 | typedef struct NV0080_CTRL_BSP_GET_CAPS_PARAMS {
 67 |     NvU32 capsTblSize;                    /* size in bytes of the caps table; should be set to NV0080_CTRL_BSP_CAPS_TBL_SIZE */
 68 |     NV_DECLARE_ALIGNED(NvP64 capsTbl, 8); /* pointer to the client's caps table buffer (array of unsigned bytes) that the RM fills in */
 69 |     NvU32 instanceId;                     /* instance Id of the NVDEC whose cap bits are requested */
 70 | } NV0080_CTRL_BSP_GET_CAPS_PARAMS;
 71 | 
 72 | 
 73 | 
 74 | /*
 75 |  * Size in bytes of bsp caps table. This value should be one greater
 76 |  * than the largest byte_index value above.
 77 |  */
 78 | #define NV0080_CTRL_BSP_CAPS_TBL_SIZE   8
 79 | 
 80 | /*
 81 |  * NV0080_CTRL_CMD_BSP_GET_CAPS_V2
 82 |  *
 83 |  * This command returns the set of BSP capabilities for the device
 84 |  * in the form of an array of unsigned bytes.  BSP capabilities
 85 |  * include supported features and required workarounds for the decoder
 86 |  * within the device, each represented by a byte offset into the
 87 |  * table and a bit position within that byte.
 88 |  * (The V2 version flattens the capsTbl array pointer).
 89 |  *
 90 |  *   capsTbl
 91 |  *     This parameter is an array of unsigned bytes where the BSP caps bits
 92 |  *     will be transferred by the RM.
 93 |  *   instanceId
 94 |  *     This parameter specifies the instance Id of NVDEC for which
 95 |  *     cap bits are requested. 
 96 |  *
 97 |  * Possible status values returned are:
 98 |  *   NV_OK
 99 |  *   NV_ERR_INVALID_PARAM_STRUCT
100 |  *   NV_ERR_INVALID_ARGUMENT
101 |  */
102 | 
103 | #define NV0080_CTRL_CMD_BSP_GET_CAPS_V2 (0x801c02) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_BSP_INTERFACE_ID << 8) | NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2_MESSAGE_ID" */
104 | 
105 | #define NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2_MESSAGE_ID (0x2U)
106 | 
107 | typedef struct NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2 {
108 |     NvU8  capsTbl[NV0080_CTRL_BSP_CAPS_TBL_SIZE]; /* inline caps table of NV0080_CTRL_BSP_CAPS_TBL_SIZE bytes filled in by the RM (V2 replaces the pointer with an array) */
109 |     NvU32 instanceId;                             /* instance Id of the NVDEC whose cap bits are requested */
110 | } NV0080_CTRL_BSP_GET_CAPS_PARAMS_V2;
111 | 
112 | /* _ctrl0080bsp_h_ */
113 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrl0080msenc.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2004-2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | #include "nvtypes.h"
27 | 
28 | //
29 | // This file was generated with FINN, an NVIDIA coding tool.
30 | // Source file:      ctrl/ctrl0080/ctrl0080msenc.finn
31 | //
32 | 
33 | #include "ctrl0080base.h"
34 | 
35 | /* NV01_DEVICE_XX/NV03_DEVICE MSENC control commands and parameters */
36 | 
37 | /*
38 |  * NV0080_CTRL_CMD_MSENC_GET_CAPS
39 |  *
40 |  * This command returns the set of MSENC capabilities for the device
41 |  * in the form of an array of unsigned bytes. MSENC capabilities
42 |  * include supported features and required workarounds for the MSENC-related
43 |  * engine(s) within the device, each represented by a byte offset into
44 |  * the table and a bit position within that byte.
45 |  *
46 |  *   capsTblSize
47 |  *     This parameter specifies the size in bytes of the caps table.
48 |  *     This value should be set to NV0080_CTRL_MSENC_CAPS_TBL_SIZE.
49 |  *   capsTbl
50 |  *     This parameter specifies a pointer to the client's caps table buffer
51 |  *     into which the MSENC caps bits will be transferred by the RM.
52 |  *     The caps table is an array of unsigned bytes.
53 |  *
54 |  * Possible status values returned are:
55 |  *   NV_OK
56 |  *   NV_ERR_INVALID_PARAM_STRUCT
57 |  *   NV_ERR_INVALID_ARGUMENT
58 |  */
59 | #define NV0080_CTRL_CMD_MSENC_GET_CAPS (0x801b01) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_MSENC_INTERFACE_ID << 8) | NV0080_CTRL_MSENC_GET_CAPS_PARAMS_MESSAGE_ID" */
60 | /* Message id: the low byte (0x01) of the command value above. */
61 | #define NV0080_CTRL_MSENC_GET_CAPS_PARAMS_MESSAGE_ID (0x1U)
62 | /* Parameter block for NV0080_CTRL_CMD_MSENC_GET_CAPS (caps table passed by pointer). */
63 | typedef struct NV0080_CTRL_MSENC_GET_CAPS_PARAMS {
64 |     NvU32 capsTblSize;                    /* size in bytes of the caps table; should be NV0080_CTRL_MSENC_CAPS_TBL_SIZE */
65 |     NV_DECLARE_ALIGNED(NvP64 capsTbl, 8); /* client's caps table buffer (array of unsigned bytes) filled by the RM; presumably 8-byte aligned via the macro - confirm in nvtypes.h */
66 | } NV0080_CTRL_MSENC_GET_CAPS_PARAMS;
67 | 
68 | 
69 | 
70 | /* size in bytes of MSENC caps table; also the length of the inline capsTbl array in the V2 parameter struct below */
71 | #define NV0080_CTRL_MSENC_CAPS_TBL_SIZE   4
72 | 
73 | /*
74 |  * NV0080_CTRL_CMD_MSENC_GET_CAPS_V2
75 |  *
76 |  * This command is a version of NV0080_CTRL_CMD_MSENC_GET_CAPS with caps passed inline in capsTbl.
77 |  *
78 |  * For consistency with other video caps controls, it adds `instanceId` parameter. Currently it is
79 |  * ignored.
80 |  */
81 | #define NV0080_CTRL_CMD_MSENC_GET_CAPS_V2 (0x801b02) /* finn: Evaluated from "(FINN_NV01_DEVICE_0_MSENC_INTERFACE_ID << 8) | NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS_MESSAGE_ID" */
82 | /* Message id: the low byte (0x02) of the command value above. */
83 | #define NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS_MESSAGE_ID (0x2U)
84 | /* Parameter block for NV0080_CTRL_CMD_MSENC_GET_CAPS_V2 (caps table passed inline). */
85 | typedef struct NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS {
86 |     NvU8  capsTbl[NV0080_CTRL_MSENC_CAPS_TBL_SIZE]; /* MSENC caps bytes, carried inline instead of via a pointer */
87 |     NvU32 instanceId; // currently ignored; present for consistency with other video caps controls
88 | } NV0080_CTRL_MSENC_GET_CAPS_V2_PARAMS;
89 | 
90 | /* _ctrl0080msenc_h_ */
91 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrl2080mc.1.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 |     Copyright (c) 2013-2023 NVIDIA Corporation
 3 | 
 4 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 5 |     of this software and associated documentation files (the "Software"), to
 6 |     deal in the Software without restriction, including without limitation the
 7 |     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 8 |     sell copies of the Software, and to permit persons to whom the Software is
 9 |     furnished to do so, subject to the following conditions:
10 | 
11 |         The above copyright notice and this permission notice shall be
12 |         included in all copies or substantial portions of the Software.
13 | 
14 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 |     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |     DEALINGS IN THE SOFTWARE.
21 | 
22 | *******************************************************************************/
23 | 
24 | #ifndef _ctrl2080mc_h_
25 | #define _ctrl2080mc_h_
26 | /* NOTE(review): this file is ctrl2080mc.1.h but the guard is named after ctrl2080mc.h; if that header uses the same guard, whichever is included second is skipped - confirm. */
27 | /* valid architecture values */
28 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_T13X                 (0xE0000013)
29 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM000                (0x00000110)
30 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GM200                (0x00000120)
31 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GP100                (0x00000130)
32 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GV100                (0x00000140)
33 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_TU100                (0x00000160)
34 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GA100                (0x00000170)
35 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GH100                (0x00000180)
36 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_AD100                (0x00000190)
37 | #define NV2080_CTRL_MC_ARCH_INFO_ARCHITECTURE_GB100                (0x000001A0)
38 | 
39 | /* valid ARCHITECTURE_GP10x implementation values */
40 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GP100              (0x00000000)
41 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GP000              (0x00000001)
42 | /* implementation values for the GA100/GA000 names (presumably ARCHITECTURE_GA10x, by analogy with the GP10x group above - confirm); they reuse the values 0 and 1 */
43 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GA100              (0x00000000)
44 | #define NV2080_CTRL_MC_ARCH_INFO_IMPLEMENTATION_GA000              (0x00000001)
45 | #endif /* _ctrl2080mc_h_ */
46 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrla06f.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2007-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  * 
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  * 
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  * 
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      ctrl/ctrla06f.finn
29 | //
30 | 
31 | 
32 | 
33 | #include "ctrlxxxx.h"
34 | #include "ctrla06fbase.h"
35 | #include "ctrla06fgpfifo.h"
36 | #include "ctrla06fevent.h"
37 | #include "ctrla06finternal.h"
38 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrla06fbase.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2007-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  * 
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  * 
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  * 
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      ctrl/ctrla06f/ctrla06fbase.finn
29 | //
30 | 
31 | 
32 | 
33 | 
34 | /* GK100_GPFIFO control commands and parameters */
35 | 
36 | #include "ctrlxxxx.h"
37 | #include "ctrl906f.h"          /* A06F is partially derived from 906F */
38 | 
39 | #define NVA06F_CTRL_CMD(cat,idx)  \
40 |     NVXXXX_CTRL_CMD(0xA06F, NVA06F_CTRL_##cat, idx)
41 | /* NVA06F_CTRL_CMD builds a class-0xA06F control command: cat is token-pasted onto NVA06F_CTRL_ (e.g. EVENT selects NVA06F_CTRL_EVENT) and idx is the command index. */
42 | /* GK100_GPFIFO command categories (6bits) */
43 | #define NVA06F_CTRL_RESERVED (0x00) /* category byte of NVA06F_CTRL_CMD_NULL (0xa06f0000) */
44 | #define NVA06F_CTRL_GPFIFO   (0x01)
45 | #define NVA06F_CTRL_EVENT    (0x02) /* category byte of the 0xa06f02xx event commands */
46 | #define NVA06F_CTRL_INTERNAL (0x03) /* category byte of the 0xa06f03xx internal commands */
47 | 
48 | /*
49 |  * NVA06F_CTRL_CMD_NULL
50 |  *
51 |  * This command does nothing.
52 |  * This command does not take any parameters.
53 |  *
54 |  * Possible status values returned are:
55 |  *   NV_OK
56 |  *
57 |  */
58 | #define NVA06F_CTRL_CMD_NULL (0xa06f0000) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_RESERVED_INTERFACE_ID << 8) | 0x0" */
59 | 
60 | /* _ctrla06fbase_h_ */
61 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrla06fevent.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2007-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  * 
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  * 
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  * 
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | #include "nvtypes.h"
27 | 
28 | //
29 | // This file was generated with FINN, an NVIDIA coding tool.
30 | // Source file:      ctrl/ctrla06f/ctrla06fevent.finn
31 | //
32 | 
33 | #include "ctrla06fbase.h"
34 | 
35 | /*
36 |  * NVA06F_CTRL_CMD_EVENT_SET_NOTIFICATION
37 |  *
38 |  * This command sets event notification state for the associated channel.
39 |  * This command requires that an instance of NV01_EVENT has been previously
40 |  * bound to the associated channel object.
41 |  *
42 |  *   event
43 |  *     This parameter specifies the type of event to which the specified
44 |  *     action is to be applied.  This parameter must specify a valid
45 |  *     NVA06F_NOTIFIERS value (see cla06f.h for more details) and should
46 |  *     not exceed one less than NVA06F_NOTIFIERS_MAXCOUNT.
47 |  *   action
48 |  *     This parameter specifies the desired event notification action.
49 |  *     Valid notification actions include:
50 |  *       NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE
51 |  *         This action disables event notification for the specified
52 |  *         event for the associated channel object.
53 |  *       NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_SINGLE
54 |  *         This action enables single-shot event notification for the
55 |  *         specified event for the associated channel object.
56 |  *       NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT
57 |  *         This action enables repeated event notification for the specified
58 |  *         event for the associated channel object.
59 |  *
60 |  * Possible status values returned are:
61 |  *   NV_OK
62 |  *   NV_ERR_INVALID_PARAM_STRUCT
63 |  *   NV_ERR_INVALID_ARGUMENT
64 |  *   NV_ERR_INVALID_STATE
65 |  */
66 | #define NVA06F_CTRL_CMD_EVENT_SET_NOTIFICATION (0xa06f0205) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_EVENT_INTERFACE_ID << 8) | NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS_MESSAGE_ID" */
67 | /* Message id: the low byte (0x05) of the command value above. */
68 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS_MESSAGE_ID (0x5U)
69 | /* Parameter block for NVA06F_CTRL_CMD_EVENT_SET_NOTIFICATION. */
70 | typedef struct NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS {
71 |     NvU32 event;  /* type of event (an NVA06F_NOTIFIERS value) the action applies to */
72 |     NvU32 action; /* one of the NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_* values below */
73 | } NVA06F_CTRL_EVENT_SET_NOTIFICATION_PARAMS;
74 | 
75 | /* valid action values */
76 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_DISABLE (0x00000000) /* disable notification for the event */
77 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_SINGLE  (0x00000001) /* enable single-shot notification */
78 | #define NVA06F_CTRL_EVENT_SET_NOTIFICATION_ACTION_REPEAT  (0x00000002) /* enable repeated notification */
79 | 
80 | /*
81 |  * NVA06F_CTRL_CMD_EVENT_SET_TRIGGER
82 |  *
83 |  * This command triggers a software event for the associated channel.
84 |  * This command accepts no parameters.
85 |  *
86 |  * Possible status values returned are:
87 |  *   NV_OK
88 |  */
89 | #define NVA06F_CTRL_CMD_EVENT_SET_TRIGGER                 (0xa06f0206) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_EVENT_INTERFACE_ID << 8) | 0x6" */
90 | 
91 | 
92 | /* _ctrla06fevent_h_ */
93 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrla06finternal.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2007-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      ctrl/ctrla06f/ctrla06finternal.finn
29 | //
30 | 
31 | #include "ctrla06fbase.h"
32 | #include "ctrla06fgpfifo.h"
33 | 
34 | /*
35 |  * NVA06F_CTRL_CMD_INTERNAL_STOP_CHANNEL
36 |  *
37 |  * This command is an internal command sent from Kernel RM to Physical RM
38 |  * to stop the channel in hardware
39 |  *
40 |  * Please see description of NVA06F_CTRL_CMD_STOP_CHANNEL for more information.
41 |  *
42 |  */
43 | #define NVA06F_CTRL_CMD_INTERNAL_STOP_CHANNEL (0xa06f0301) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_INTERNAL_INTERFACE_ID << 8) | NVA06F_CTRL_INTERNAL_STOP_CHANNEL_PARAMS_MESSAGE_ID" */
44 | /* Message id: the low byte (0x01) of the command value above. */
45 | #define NVA06F_CTRL_INTERNAL_STOP_CHANNEL_PARAMS_MESSAGE_ID (0x1U)
46 | /* The internal command reuses the parameter type of NVA06F_CTRL_CMD_STOP_CHANNEL unchanged (type declared outside this header; presumably in ctrla06fgpfifo.h - confirm). */
47 | typedef NVA06F_CTRL_STOP_CHANNEL_PARAMS NVA06F_CTRL_INTERNAL_STOP_CHANNEL_PARAMS;
48 | 
49 | /*
50 |  * NVA06F_CTRL_CMD_INTERNAL_GPFIFO_SCHEDULE
51 |  *
52 |  * This command is an internal command sent from Kernel RM to Physical RM
53 |  * to schedule the channel in hardware
54 |  *
55 |  * Please see description of NVA06F_CTRL_CMD_GPFIFO_SCHEDULE for more information.
56 |  *
57 |  */
58 | #define NVA06F_CTRL_CMD_INTERNAL_GPFIFO_SCHEDULE (0xa06f0303) /* finn: Evaluated from "(FINN_KEPLER_CHANNEL_GPFIFO_A_INTERNAL_INTERFACE_ID << 8) | NVA06F_CTRL_INTERNAL_GPFIFO_SCHEDULE_PARAMS_MESSAGE_ID" */
59 | /* Message id: the low byte (0x03) of the command value above. */
60 | #define NVA06F_CTRL_INTERNAL_GPFIFO_SCHEDULE_PARAMS_MESSAGE_ID (0x3U)
61 | /* The internal command reuses the parameter type of NVA06F_CTRL_CMD_GPFIFO_SCHEDULE unchanged (type declared outside this header; presumably in ctrla06fgpfifo.h - confirm). */
62 | typedef NVA06F_CTRL_GPFIFO_SCHEDULE_PARAMS NVA06F_CTRL_INTERNAL_GPFIFO_SCHEDULE_PARAMS;
63 | 
64 | /* ctrla06finternal_h */
65 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/ctrlxxxx.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2005-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | #include "nvtypes.h"
27 | 
28 | //
29 | // This file was generated with FINN, an NVIDIA coding tool.
30 | // Source file:      ctrl/ctrlxxxx.finn
31 | //
32 | 
33 | 
34 | 
35 | #include "nvtypes.h"
36 | 
37 | /* definitions shared by all CTRL interfaces */
38 | 
39 | /* Basic command format:
40 | *   cmd_class       [31:16],
41 | *   cmd_reserved    [15:15],
42 | *   cmd_reserved    [14:14],
43 | *   cmd_category    [13:8],
44 | *   cmd_index       [7:0]
45 | */
46 | 
47 | #define NVXXXX_CTRL_CMD_CLASS                                             31:16
48 | /* The three macros in this group expand to high:low bit ranges matching the cmd_class, cmd_category and cmd_index fields of the command format; they are not plain integers. */
49 | #define NVXXXX_CTRL_CMD_CATEGORY                                           13:8
50 | #define NVXXXX_CTRL_CMD_INDEX                                               7:0
51 | 
52 | /* don't use DRF_NUM - not always available */
53 | #  define NVXXXX_CTRL_CMD(cls,cat,idx)     \
54 |                                (((cls) << 16) | ((0) << 15) | ((0) << 14) \
55 |                                | ((cat) << 8) | ((idx) & 0xFF))
56 | /*
57 |  * NVXXXX_CTRL_CMD_NULL
58 |  *
59 |  * This command does nothing.
60 |  * This command does not take any parameters.
61 |  * This command is valid for all classes.
62 |  *
63 |  * Possible status values returned are:
64 |  *   NV_OK
65 |  */
66 | #define NVXXXX_CTRL_CMD_NULL              (0x00000000)
67 | /* NOTE(review): 0xC0 sets bits 7 and 6, 0x80 sets bit 7 only; presumably these are OR-ed into a category so they land in the reserved bits [15:14] of a command - confirm against callers. */
68 | #define NVxxxx_CTRL_LEGACY_PRIVILEGED     (0xC0)
69 | #define NVxxxx_CTRL_LEGACY_NON_PRIVILEGED (0x80)
70 | /* Generic pair of 32-bit values: an index and its associated data word. */
71 | typedef struct NVXXXX_CTRL_XXX_INFO {
72 |     NvU32 index; /* selects which item the entry refers to (meaning defined by the using control call) */
73 |     NvU32 data;  /* value associated with index */
74 | } NVXXXX_CTRL_XXX_INFO;
75 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nv-ioctl-numbers.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | 
25 | #ifndef NV_IOCTL_NUMBERS_H
26 | #define NV_IOCTL_NUMBERS_H
27 | 
28 | /* NOTE: using an ioctl() number > 55 will overflow! (55 is the largest offset from NV_IOCTL_BASE: 200 + 55 == 255; presumably the limit is an 8-bit ioctl number field - TODO confirm) */
29 | #define NV_IOCTL_MAGIC      'F'
30 | #define NV_IOCTL_BASE       200 /* every NV_ESC_* number below is NV_IOCTL_BASE plus a small offset */
31 | #define NV_ESC_CARD_INFO             (NV_IOCTL_BASE + 0)
32 | #define NV_ESC_REGISTER_FD           (NV_IOCTL_BASE + 1)
33 | #define NV_ESC_ALLOC_OS_EVENT        (NV_IOCTL_BASE + 6)
34 | #define NV_ESC_FREE_OS_EVENT         (NV_IOCTL_BASE + 7)
35 | #define NV_ESC_STATUS_CODE           (NV_IOCTL_BASE + 9)
36 | #define NV_ESC_CHECK_VERSION_STR     (NV_IOCTL_BASE + 10)
37 | #define NV_ESC_IOCTL_XFER_CMD        (NV_IOCTL_BASE + 11)
38 | #define NV_ESC_ATTACH_GPUS_TO_FD     (NV_IOCTL_BASE + 12)
39 | #define NV_ESC_QUERY_DEVICE_INTR     (NV_IOCTL_BASE + 13)
40 | #define NV_ESC_SYS_PARAMS            (NV_IOCTL_BASE + 14)
41 | #define NV_ESC_EXPORT_TO_DMABUF_FD   (NV_IOCTL_BASE + 17)
42 | #define NV_ESC_WAIT_OPEN_COMPLETE    (NV_IOCTL_BASE + 18)
43 | 
44 | #endif
45 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nv-unix-nvos-params-wrappers.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #ifndef _NV_UNIX_NVOS_PARAMS_WRAPPERS_H_
25 | #define _NV_UNIX_NVOS_PARAMS_WRAPPERS_H_
26 | 
27 | #include "nvos.h"
28 | 
29 | /*
30 |  * This is a wrapper for NVOS02_PARAMETERS with file descriptor
31 |  */
32 | 
33 | typedef struct
34 | {
35 |     NVOS02_PARAMETERS params; /* the wrapped NVOS02 parameter block (type presumably declared in nvos.h - confirm) */
36 |     int fd;                   /* file descriptor carried alongside params */
37 | } nv_ioctl_nvos02_parameters_with_fd;
38 | 
39 | /*
40 |  * This is a wrapper for NVOS33_PARAMETERS with file descriptor
41 |  */
42 | typedef struct
43 | {
44 |     NVOS33_PARAMETERS params; /* the wrapped NVOS33 parameter block (type presumably declared in nvos.h - confirm) */
45 |     int fd;                   /* file descriptor carried alongside params */
46 | } nv_ioctl_nvos33_parameters_with_fd;
47 | 
48 | #endif // _NV_UNIX_NVOS_PARAMS_WRAPPERS_H_
49 | 
50 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nvCpuUuid.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2015-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #ifndef _NV_CPU_UUID_H_
25 | #define _NV_CPU_UUID_H_
26 | 
27 | #define NV_UUID_LEN 16 /* number of bytes in a UUID */
28 | /* NOTE(review): NvU8 and NvU64 are used below but this header includes nothing itself; presumably the includer provides nvtypes.h first - confirm. */
29 | typedef struct nv_uuid
30 | {
31 |     NvU8 uuid[NV_UUID_LEN]; /* raw UUID bytes */
32 | } NvUuid;
33 | /* NV_UUID_LO / NV_UUID_HI expose bytes [0,8) / [8,16) of uuid as an NvU64 lvalue through a pointer cast. NOTE(review): uuid is a byte array, so this assumes the storage is suitably aligned for NvU64 - confirm at call sites. */
34 | #define NV_UUID_HI(pUuid) (*((NvU64*)((pUuid)->uuid + (NV_UUID_LEN >> 1))))
35 | #define NV_UUID_LO(pUuid) (*((NvU64*)((pUuid)->uuid + 0)))
36 | /* NvSystemUuid and NvProcessorUuid are plain aliases of NvUuid (same layout). */
37 | typedef NvUuid NvSystemUuid;
38 | 
39 | typedef NvUuid NvProcessorUuid;
40 | /* Declared here only; the definition lives in some other translation unit. */
41 | extern const NvProcessorUuid NV_PROCESSOR_UUID_CPU_DEFAULT;
42 | 
43 | #endif // _NV_CPU_UUID_H_
44 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nv_escape.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 1999-2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #ifndef NV_ESCAPE_H_INCLUDED
25 | #define NV_ESCAPE_H_INCLUDED
26 | /* NV_ESC_RM_* escape codes. The values are not contiguous (e.g. nothing between 0x2B and 0x32). NOTE(review): presumably each code selects one RM operation through the driver's ioctl interface - confirm. */
27 | #define NV_ESC_RM_ALLOC_MEMORY                      0x27
28 | #define NV_ESC_RM_ALLOC_OBJECT                      0x28
29 | #define NV_ESC_RM_FREE                              0x29
30 | #define NV_ESC_RM_CONTROL                           0x2A
31 | #define NV_ESC_RM_ALLOC                             0x2B
32 | #define NV_ESC_RM_CONFIG_GET                        0x32
33 | #define NV_ESC_RM_CONFIG_SET                        0x33
34 | #define NV_ESC_RM_DUP_OBJECT                        0x34
35 | #define NV_ESC_RM_SHARE                             0x35
36 | #define NV_ESC_RM_CONFIG_GET_EX                     0x37
37 | #define NV_ESC_RM_CONFIG_SET_EX                     0x38
38 | #define NV_ESC_RM_I2C_ACCESS                        0x39
39 | #define NV_ESC_RM_IDLE_CHANNELS                     0x41
40 | #define NV_ESC_RM_VID_HEAP_CONTROL                  0x4A
41 | #define NV_ESC_RM_ACCESS_REGISTRY                   0x4D
42 | #define NV_ESC_RM_MAP_MEMORY                        0x4E
43 | #define NV_ESC_RM_UNMAP_MEMORY                      0x4F
44 | #define NV_ESC_RM_GET_EVENT_DATA                    0x52
45 | #define NV_ESC_RM_ALLOC_CONTEXT_DMA2                0x54
46 | #define NV_ESC_RM_ADD_VBLANK_CALLBACK               0x56
47 | #define NV_ESC_RM_MAP_MEMORY_DMA                    0x57
48 | #define NV_ESC_RM_UNMAP_MEMORY_DMA                  0x58
49 | #define NV_ESC_RM_BIND_CONTEXT_DMA                  0x59
50 | #define NV_ESC_RM_EXPORT_OBJECT_TO_FD               0x5C
51 | #define NV_ESC_RM_IMPORT_OBJECT_FROM_FD             0x5D
52 | #define NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO        0x5E
53 | #define NV_ESC_RM_LOCKLESS_DIAGNOSTIC               0x5F
54 | 
55 | #endif // NV_ESCAPE_H_INCLUDED
56 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nvcfg_sdk.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | 
25 | #ifndef NV_CFG_SDK_INCLUDED
26 | #define NV_CFG_SDK_INCLUDED
27 | 
28 | 
29 | #endif // NV_CFG_SDK_INCLUDED
30 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nvimpshared.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | /******************************************************************************\
25 | *                                                                              *
26 | *  Description:                                                                *
27 | *    Accommodates sharing of IMP-related structures between kernel interface   *
28 | *    files and core RM.                                                        *
29 | *                                                                              *
30 | \******************************************************************************/
31 | 
32 | #pragma once
33 | 
34 | #include "nvtypes.h"
35 | 
36 | //
37 | // This file was generated with FINN, an NVIDIA coding tool.
38 | // Source file:      nvimpshared.finn
39 | //
40 | 
41 | 
42 | 
43 | 
44 | //
45 | // There are only a small number of discrete dramclk frequencies available on
46 | // the system.  This structure contains IMP-relevant information associated
47 | // with a specific dramclk frequency.
48 | //
49 | typedef struct DRAM_CLK_INSTANCE {
50 |     NvU32 dram_clk_freq_khz;
51 | 
52 |     NvU32 mchub_clk_khz;
53 | 
54 |     NvU32 mc_clk_khz;
55 | 
56 |     NvU32 max_iso_bw_kbps;
57 | 
58 |     //
59 |     // switch_latency_ns is the maximum time required to switch the dramclk
60 |     // frequency to the frequency specified in dram_clk_freq_khz.
61 |     //
62 |     NvU32 switch_latency_ns;
63 | } DRAM_CLK_INSTANCE;
64 | 
65 | //
66 | // This table is used to collect information from other modules that is needed
67 | // for RM IMP calculations.  (Used on Tegra only.)
68 | //
69 | typedef struct TEGRA_IMP_IMPORT_DATA {
70 |     //
71 |     // max_iso_bw_kbps stores the maximum possible ISO bandwidth available to
72 |     // display, assuming display is the only active ISO client.  (Note that ISO
73 |     // bandwidth will typically be allocated to multiple clients, so display
74 |     // will generally not have access to the maximum possible bandwidth.)
75 |     //
76 |     NvU32             max_iso_bw_kbps;
77 | 
78 |     // On Orin, each dram channel is 16 bits wide.
79 |     NvU32             num_dram_channels;
80 | 
81 |     //
82 |     // dram_clk_instance stores entries for all possible dramclk frequencies,
83 |     // sorted by dramclk frequency in increasing order.
84 |     //
85 |     // "24" is expected to be larger than the actual number of required entries
86 |     // (which is provided by a BPMP API), but it can be increased if necessary.
87 |     //
88 |     // num_dram_clk_entries is filled in with the actual number of distinct
89 |     // dramclk entries.
90 |     //
91 |     NvU32             num_dram_clk_entries;
92 |     DRAM_CLK_INSTANCE dram_clk_instance[24];
93 | } TEGRA_IMP_IMPORT_DATA;
94 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nvlimits.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 3 |  * SPDX-License-Identifier: MIT
 4 |  *
 5 |  * Permission is hereby granted, free of charge, to any person obtaining a
 6 |  * copy of this software and associated documentation files (the "Software"),
 7 |  * to deal in the Software without restriction, including without limitation
 8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 9 |  * and/or sell copies of the Software, and to permit persons to whom the
10 |  * Software is furnished to do so, subject to the following conditions:
11 |  *
12 |  * The above copyright notice and this permission notice shall be included in
13 |  * all copies or substantial portions of the Software.
14 |  *
15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 |  * DEALINGS IN THE SOFTWARE.
22 |  */
23 | 
24 | #pragma once
25 | 
26 | //
27 | // This file was generated with FINN, an NVIDIA coding tool.
28 | // Source file:      nvlimits.finn
29 | //
30 | 
31 | 
32 | 
33 | 
34 | /*
35 |  * This is the maximum number of GPUs supported in a single system.
36 |  */
37 | #define NV_MAX_DEVICES          32
38 | 
39 | /*
40 |  * This is the maximum number of subdevices within a single device.
41 |  */
42 | #define NV_MAX_SUBDEVICES       8
43 | 
44 | /*
45 |  * This is the maximum length of the process name string.
46 |  */
47 | #define NV_PROC_NAME_MAX_LENGTH 100U
48 | 
49 | /*
50 |  * This is the maximum number of heads per GPU.
51 |  */
52 | #define NV_MAX_HEADS            4
53 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/nvstatus.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2014-2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | 
 24 | #ifndef SDK_NVSTATUS_H
 25 | #define SDK_NVSTATUS_H
 26 | 
 27 | #ifdef __cplusplus
 28 | extern "C" {
 29 | #endif
 30 | 
 31 | #include "nvtypes.h"
 32 | 
 33 | typedef NvU32 NV_STATUS;
 34 | 
 35 | #define NV_STATUS_CODE( name, code, string ) name = (code),
 36 | 
 37 | enum 
 38 | {
 39 |     #include "nvstatuscodes.h"
 40 | };
 41 | 
 42 | #undef NV_STATUS_CODE
 43 | 
 44 | /*!
 45 |  * @def         NV_STATUS_LEVEL_OK
 46 |  * @see         NV_STATUS_LEVEL
 47 |  * @brief       Success: No error or special condition
 48 |  */
 49 | #define NV_STATUS_LEVEL_OK              0
 50 | 
 51 | /*!
 52 |  * @def         NV_STATUS_LEVEL_WARN
 53 |  * @see         NV_STATUS_LEVEL
 54 |  * @brief       Success, but there is a special condition
 55 |  *
 56 |  * @details     In general, NV_STATUS_LEVEL_WARN status codes are handled the
 57 |  *              same as NV_STATUS_LEVEL_OK, but are useful to indicate that
 58 |  *              there is a condition that may be specially handled.
 59 |  *
 60 |  *              Therefore, in most cases, client function should test for
 61 |  *              status <= NV_STATUS_LEVEL_WARN or status > NV_STATUS_LEVEL_WARN
 62 |  *              to determine success v. failure of a call.
 63 |  */
 64 | #define NV_STATUS_LEVEL_WARN            1
 65 | 
 66 | /*!
 67 |  * @def         NV_STATUS_LEVEL_ERR
 68 |  * @see         NV_STATUS_LEVEL
 69 |  * @brief       Unrecoverable error condition
 70 |  */
 71 | #define NV_STATUS_LEVEL_ERR             3
 72 | 
 73 | /*!
 74 |  * @def         NV_STATUS_LEVEL
 75 |  * @see         NV_STATUS_LEVEL_OK
 76 |  * @see         NV_STATUS_LEVEL_WARN
 77 |  * @see         NV_STATUS_LEVEL_ERR
 78 |  * @brief       Level of the status code
 79 |  *
 80 |  * @warning     IMPORTANT: When comparing NV_STATUS_LEVEL(_S) against one of
 81 |  *              these constants, it is important to use '<=' or '>' (rather
 82 |  *              than '<' or '>=').
 83 |  *
 84 |  *              For example, do:
 85 |  *                  if (NV_STATUS_LEVEL(status) <= NV_STATUS_LEVEL_WARN)
 86 |  *              rather than:
 87 |  *                  if (NV_STATUS_LEVEL(status) < NV_STATUS_LEVEL_ERR)
 88 |  *
 89 |  *              By being consistent in this manner, it is easier to systematically
 90 |  *              add additional level constants.  New levels are likely to lower
 91 |  *              (rather than raise) the severity of _ERR codes.  For example,
 92 |  *              if we were to add NV_STATUS_LEVEL_RETRY to indicate hardware
 93 |  *              failures that may be recoverable (e.g. RM_ERR_TIMEOUT_RETRY
 94 |  *              or RM_ERR_BUSY_RETRY), it would be less severe than
 95 |  *              NV_STATUS_LEVEL_ERR the level to which these status codes now
 96 |  *              belong.  Using '<=' and '>' ensures your code is not broken in
 97 |  *              cases like this.
 98 |  */
 99 | #define NV_STATUS_LEVEL(_S)                                               \
100 |     ((_S) == NV_OK?                               NV_STATUS_LEVEL_OK:     \
101 |     ((_S) != NV_ERR_GENERIC && (_S) & 0x00010000? NV_STATUS_LEVEL_WARN:   \
102 |                                                   NV_STATUS_LEVEL_ERR))
103 | 
104 | /*!
105 |  * @def         NV_STATUS_LEVEL_CHAR
106 |  * @see         NV_STATUS_LEVEL_OK
107 |  * @see         NV_STATUS_LEVEL_WARN
108 |  * @see         NV_STATUS_LEVEL_ERR
109 |  * @brief       Character representing status code level
110 |  */
111 | #define NV_STATUS_LEVEL_CHAR(_S)                           \
112 |     ((_S) == NV_OK?                                '0':    \
113 |     ((_S) != NV_ERR_GENERIC && (_S) & 0x00010000?  'W':    \
114 |                                                    'E'))
115 | 
116 | // Function definitions
117 | const char *nvstatusToString(NV_STATUS nvStatusIn);
118 | 
119 | #ifdef __cplusplus
120 | }
121 | #endif
122 | 
123 | #endif /* SDK_NVSTATUS_H */
124 | 


--------------------------------------------------------------------------------
/driverapi/internal/nvidia/uvm_linux_ioctl.h:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 |     Copyright (c) 2013 NVidia Corporation
 3 | 
 4 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 5 |     of this software and associated documentation files (the "Software"), to
 6 |     deal in the Software without restriction, including without limitation the
 7 |     rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 8 |     sell copies of the Software, and to permit persons to whom the Software is
 9 |     furnished to do so, subject to the following conditions:
10 | 
11 |         The above copyright notice and this permission notice shall be
12 |         included in all copies or substantial portions of the Software.
13 | 
14 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 |     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 |     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 |     DEALINGS IN THE SOFTWARE.
21 | *******************************************************************************/
22 | 
23 | #ifndef _UVM_LINUX_IOCTL_H
24 | #define _UVM_LINUX_IOCTL_H
25 | 
26 | #include "uvm_ioctl.h"
27 | 
28 | // This ioctl must be the first operation performed on the UVM file descriptor
29 | // after opening it. Until this ioctl is made, the UVM file descriptor is
30 | // inoperable: all other ioctls will return NV_ERR_ILLEGAL_ACTION and mmap will
31 | // return EBADFD.
32 | #define UVM_INITIALIZE                                                0x30000001
33 | 
34 | typedef struct
35 | {
36 |     NvU64     flags     NV_ALIGN_BYTES(8); // IN
37 |     NV_STATUS rmStatus;                    // OUT
38 | } UVM_INITIALIZE_PARAMS;
39 | 
40 | #define UVM_DEINITIALIZE                                              0x30000002
41 | 
42 | #endif // _UVM_LINUX_IOCTL_H
43 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/compile_memcpy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # NOTE: THIS FILE IS NOT PART OF CI
 4 | # Why? Because it would require having nvcc & ptxas installed, which we don't want to assume
 5 | set -euo pipefail  # stop at the first failing command so a broken nvcc/ptxas run never reaches generate_header.py
 6 | # Define the list of compute capabilities and corresponding architecture
 7 | declare -A compute_capabilities=(
 8 |     [5.0]="sm_50"
 9 |     [5.2]="sm_52"
10 |     [5.3]="sm_53"
11 |     [6.0]="sm_60"
12 |     [6.1]="sm_61"
13 |     [6.2]="sm_62"
14 |     [7.0]="sm_70"
15 |     [7.2]="sm_72"
16 |     [7.5]="sm_75"
17 |     [8.0]="sm_80"
18 |     [8.6]="sm_86"
19 |     [8.7]="sm_87"
20 |     [8.9]="sm_89"
21 |     [9.0]="sm_90"
22 | )
23 | 
24 | # Define the CUDA source file and the output directory (both relative to the current working directory)
25 | source_file="memcpy.cu"
26 | output_dir="output"
27 | 
28 | # Create the output directory if it does not exist
29 | mkdir -p "$output_dir"
30 | 
31 | # Loop through each compute capability and run nvcc and ptxas (associative-array key order is unspecified)
32 | for capability in "${!compute_capabilities[@]}"; do
33 |     arch="${compute_capabilities[$capability]}"
34 |     ptx_file="$output_dir/memcpy_${arch}.ptx"
35 |     ptxas_file="$output_dir/memcpy_${arch}.cubin"
36 | 
37 |     # Run nvcc to generate the PTX file
38 |     nvcc -ptx -arch="$arch" "$source_file" -o "$ptx_file"
39 | 
40 |     # Run ptxas to compile the PTX file to SASS
41 |     ptxas -arch="$arch" "$ptx_file" -o "$ptxas_file"
42 | 
43 |     echo "Processed compute capability $capability ($arch)"
44 | done
45 | 
46 | echo "Processing complete."
47 | 
48 | python3 generate_header.py


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/generate_header.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import binascii
 4 | 
 5 | def generate_header_from_cubin(cubin_dir, header_file):
 6 |     # Open the header file for writing
 7 |     with open(header_file, 'w') as header:
 8 |         # Write header guards
 9 |         header.write('#pragma once\n\n#include <cstdint>\n\n')
10 | 
11 |         # Iterate over all .cubin files in the directory
12 |         for file_name in os.listdir(cubin_dir):
13 |             if file_name.endswith('.cubin'):
14 |                 # Determine the array name from the file name
15 |                 array_name = file_name.replace('.cubin', '').replace('-', '_').replace(' ', '_').upper()
16 |                 cubin_path = os.path.join(cubin_dir, file_name)
17 | 
18 |                 # Read the contents of the .cubin file
19 |                 with open(cubin_path, 'rb') as cubin_file:
20 |                     cubin_data = cubin_file.read()
21 | 
22 |                 # Convert binary data to hex and format as uint8_t array
23 |                 hex_data = binascii.hexlify(cubin_data).decode('ascii')
24 |                 hex_data_lines = [hex_data[i:i+64] for i in range(0, len(hex_data), 64)]  # Split into 64-char lines
25 | 
26 |                 # Write array declaration to header file
27 |                 header.write(f'const uint8_t {array_name}[] = {{\n')
28 | 
29 |                 for line in hex_data_lines:
30 |                     header.write('    ' + ', '.join(f'0x{line[i:i+2]}' for i in range(0, len(line), 2)) + ',\n')
31 | 
32 |                 header.write('};\n\n')
33 | # Script entry point: converts the cubins under "output" into "memcopy_kernels.h"; both paths are relative to the working directory
34 | if __name__ == '__main__':
35 |     generate_header_from_cubin("output", "memcopy_kernels.h")
36 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/memcpy.cu:
--------------------------------------------------------------------------------
 1 | #include <cstdint>
 2 | 
 3 | #define MEMCPY_BLOCK_SIZE 256UL
 4 | // Unguarded word copy: block b = blockIdx.y * gridDim.x + blockIdx.x copies elements 1024*b + threadIdx.x + 256*i for i = 0..3.
 5 | extern "C" __global__ void memcpyKernelHighBW(uint32_t *dst, const uint32_t *src) {
 6 |     size_t idx = ((MEMCPY_BLOCK_SIZE * (blockIdx.y * gridDim.x + blockIdx.x)) << 2) + threadIdx.x;
 7 |     // No bounds check: all four indices must be valid in dst and src. NOTE(review): presumably launched with blockDim.x == MEMCPY_BLOCK_SIZE - confirm.
 8 |     #pragma unroll
 9 |     for (int i = 0; i < 4; i++) {
10 |         dst[idx] = src[idx];
11 |         idx += MEMCPY_BLOCK_SIZE;
12 |     }
13 | }
14 | // Guarded word copy: each thread copies at most one uint32_t; n is compared against the element index, so it counts uint32_t values.
15 | extern "C" __global__ void memcpyKernelLowLatency(uint32_t *dst, const uint32_t *src, size_t n) {
16 |     size_t tid = MEMCPY_BLOCK_SIZE * blockIdx.x + threadIdx.x;
17 |     if (tid < n) {
18 |         dst[tid] = src[tid];
19 |     }
20 | }
21 | // Guarded byte copy: each thread copies at most one uint8_t; n counts bytes. NOTE(review): name suggests it handles the tail left by the word kernels - confirm with caller.
22 | extern "C" __global__ void memcpyKernelTrailing(uint8_t *dst, const uint8_t *src, size_t n) {
23 |     size_t tid = MEMCPY_BLOCK_SIZE * blockIdx.x + threadIdx.x;
24 |     if (tid < n) {
25 |         dst[tid] = src[tid];
26 |     }
27 | }


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_50.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_50.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_50.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_50
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(
 16 | 	.param .u64 memcpyKernelHighBW_param_0,
 17 | 	.param .u64 memcpyKernelHighBW_param_1
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;
 27 | 	cvta.to.global.u64 	%rd4, %rd2;
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;
 36 | 	shl.b64 	%rd8, %rd7, 2;
 37 | 	add.s64 	%rd9, %rd4, %rd8;
 38 | 	ld.global.u32 	%r6, [%rd9];
 39 | 	add.s64 	%rd10, %rd3, %rd8;
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,
 54 | 	.param .u64 memcpyKernelLowLatency_param_2
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;
 71 | 	@%p1 bra 	$L__BB1_2;
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;
 74 | 	shl.b64 	%rd8, %rd1, 2;
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];
 77 | 	cvta.to.global.u64 	%rd10, %rd2;
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;
 80 | 
 81 | $L__BB1_2:
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(
 87 | 	.param .u64 memcpyKernelTrailing_param_0,
 88 | 	.param .u64 memcpyKernelTrailing_param_1,
 89 | 	.param .u64 memcpyKernelTrailing_param_2
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;
107 | 	@%p1 bra 	$L__BB2_2;
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;
110 | 	add.s64 	%rd8, %rd7, %rd1;
111 | 	ld.global.u8 	%rs1, [%rd8];
112 | 	cvta.to.global.u64 	%rd9, %rd2;
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;
115 | 
116 | $L__BB2_2:
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_52.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_52.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_52.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_52
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(
 16 | 	.param .u64 memcpyKernelHighBW_param_0,
 17 | 	.param .u64 memcpyKernelHighBW_param_1
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;
 27 | 	cvta.to.global.u64 	%rd4, %rd2;
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;
 36 | 	shl.b64 	%rd8, %rd7, 2;
 37 | 	add.s64 	%rd9, %rd4, %rd8;
 38 | 	ld.global.u32 	%r6, [%rd9];
 39 | 	add.s64 	%rd10, %rd3, %rd8;
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,
 54 | 	.param .u64 memcpyKernelLowLatency_param_2
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;
 71 | 	@%p1 bra 	$L__BB1_2;
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;
 74 | 	shl.b64 	%rd8, %rd1, 2;
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];
 77 | 	cvta.to.global.u64 	%rd10, %rd2;
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;
 80 | 
 81 | $L__BB1_2:
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(
 87 | 	.param .u64 memcpyKernelTrailing_param_0,
 88 | 	.param .u64 memcpyKernelTrailing_param_1,
 89 | 	.param .u64 memcpyKernelTrailing_param_2
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;
107 | 	@%p1 bra 	$L__BB2_2;
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;
110 | 	add.s64 	%rd8, %rd7, %rd1;
111 | 	ld.global.u8 	%rs1, [%rd8];
112 | 	cvta.to.global.u64 	%rd9, %rd2;
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;
115 | 
116 | $L__BB2_2:
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_53.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_53.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_53.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_53
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(
 16 | 	.param .u64 memcpyKernelHighBW_param_0,
 17 | 	.param .u64 memcpyKernelHighBW_param_1
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;
 27 | 	cvta.to.global.u64 	%rd4, %rd2;
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;
 36 | 	shl.b64 	%rd8, %rd7, 2;
 37 | 	add.s64 	%rd9, %rd4, %rd8;
 38 | 	ld.global.u32 	%r6, [%rd9];
 39 | 	add.s64 	%rd10, %rd3, %rd8;
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,
 54 | 	.param .u64 memcpyKernelLowLatency_param_2
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;
 71 | 	@%p1 bra 	$L__BB1_2;
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;
 74 | 	shl.b64 	%rd8, %rd1, 2;
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];
 77 | 	cvta.to.global.u64 	%rd10, %rd2;
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;
 80 | 
 81 | $L__BB1_2:
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_60.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_60.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_60.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_60
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_61.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_61.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_61.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_61
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_62.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_62.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_62.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_62
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_70.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_70.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_70.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_70
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_72.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_72.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_72.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_72
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_75.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_75.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_75.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_75
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_80.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_80.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_80.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_80
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
 86 | .visible .entry memcpyKernelTrailing(	// Each thread copies at most one byte from param_1 to param_0, guarded by an index bound.
 87 | 	.param .u64 memcpyKernelTrailing_param_0,	// base address the st.global store writes to (destination)
 88 | 	.param .u64 memcpyKernelTrailing_param_1,	// base address the ld.global load reads from (source)
 89 | 	.param .u64 memcpyKernelTrailing_param_2	// exclusive upper bound on the byte index (compared unsigned below)
 90 | )
 91 | {
 92 | 	.reg .pred 	%p<2>;
 93 | 	.reg .b16 	%rs<2>;
 94 | 	.reg .b32 	%r<3>;
 95 | 	.reg .b64 	%rd<11>;
 96 | 
 97 | 
 98 | 	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
 99 | 	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
100 | 	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
101 | 	mov.u32 	%r1, %ctaid.x;
102 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
103 | 	mov.u32 	%r2, %tid.x;
104 | 	cvt.u64.u32 	%rd6, %r2;
105 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = byte index handled by this thread
106 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
107 | 	@%p1 bra 	$L__BB2_2;	// out-of-range threads skip the copy
108 | 
109 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
110 | 	add.s64 	%rd8, %rd7, %rd1;	// index is used directly as the byte offset (no scaling)
111 | 	ld.global.u8 	%rs1, [%rd8];	// load one byte from the source
112 | 	cvta.to.global.u64 	%rd9, %rd2;	// destination as a global-space address
113 | 	add.s64 	%rd10, %rd9, %rd1;
114 | 	st.global.u8 	[%rd10], %rs1;	// store it at the same offset in the destination
115 | 
116 | $L__BB2_2:	// common exit for in-range and out-of-range threads
117 | 	ret;
118 | 
119 | }
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_86.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_86.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_86.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_86
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
 15 | .visible .entry memcpyKernelHighBW(	// Each thread copies four 32-bit words from param_1 to param_0; there is no bounds check.
 16 | 	.param .u64 memcpyKernelHighBW_param_0,	// base address the st.global stores write to (destination)
 17 | 	.param .u64 memcpyKernelHighBW_param_1	// base address the ld.global loads read from (source)
 18 | )
 19 | {
 20 | 	.reg .b32 	%r<10>;
 21 | 	.reg .b64 	%rd<11>;
 22 | 
 23 | 
 24 | 	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
 25 | 	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
 26 | 	cvta.to.global.u64 	%rd3, %rd1;	// destination as a global-space address
 27 | 	cvta.to.global.u64 	%rd4, %rd2;	// source as a global-space address
 28 | 	mov.u32 	%r1, %ctaid.y;
 29 | 	mov.u32 	%r2, %nctaid.x;
 30 | 	mov.u32 	%r3, %ctaid.x;
 31 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;	// %r4 = ctaid.y * nctaid.x + ctaid.x (low 32 bits): linearized 2D block index
 32 | 	mul.wide.u32 	%rd5, %r4, 1024;	// block index * 1024 words, widened to 64 bits; NOTE(review): presumably 256 threads x 4 words per block - confirm against the launch code
 33 | 	mov.u32 	%r5, %tid.x;
 34 | 	cvt.u64.u32 	%rd6, %r5;
 35 | 	add.s64 	%rd7, %rd5, %rd6;	// %rd7 = index of the first word this thread copies
 36 | 	shl.b64 	%rd8, %rd7, 2;	// byte offset = index * 4
 37 | 	add.s64 	%rd9, %rd4, %rd8;	// %rd9 = source address for this thread
 38 | 	ld.global.u32 	%r6, [%rd9];	// word 1 of 4
 39 | 	add.s64 	%rd10, %rd3, %rd8;	// %rd10 = destination address for this thread
 40 | 	st.global.u32 	[%rd10], %r6;
 41 | 	ld.global.u32 	%r7, [%rd9+1024];	// word 2 of 4: 1024 bytes (256 words) further on
 42 | 	st.global.u32 	[%rd10+1024], %r7;
 43 | 	ld.global.u32 	%r8, [%rd9+2048];	// word 3 of 4: 2048 bytes (512 words) further on
 44 | 	st.global.u32 	[%rd10+2048], %r8;
 45 | 	ld.global.u32 	%r9, [%rd9+3072];	// word 4 of 4: 3072 bytes (768 words) further on
 46 | 	st.global.u32 	[%rd10+3072], %r9;
 47 | 	ret;
 48 | 
 49 | }
 50 | 	// .globl	memcpyKernelLowLatency
 51 | .visible .entry memcpyKernelLowLatency(	// Each thread copies at most one 32-bit word from param_1 to param_0, guarded by an index bound.
 52 | 	.param .u64 memcpyKernelLowLatency_param_0,	// base address the st.global store writes to (destination)
 53 | 	.param .u64 memcpyKernelLowLatency_param_1,	// base address the ld.global load reads from (source)
 54 | 	.param .u64 memcpyKernelLowLatency_param_2	// exclusive upper bound on the word index (compared unsigned below)
 55 | )
 56 | {
 57 | 	.reg .pred 	%p<2>;
 58 | 	.reg .b32 	%r<4>;
 59 | 	.reg .b64 	%rd<12>;
 60 | 
 61 | 
 62 | 	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
 63 | 	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
 64 | 	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
 65 | 	mov.u32 	%r1, %ctaid.x;
 66 | 	mul.wide.u32 	%rd5, %r1, 256;	// ctaid.x * 256, widened to 64 bits; NOTE(review): presumably 256 is the launch block size - confirm against the launch code
 67 | 	mov.u32 	%r2, %tid.x;
 68 | 	cvt.u64.u32 	%rd6, %r2;
 69 | 	add.s64 	%rd1, %rd5, %rd6;	// %rd1 = word index handled by this thread
 70 | 	setp.ge.u64 	%p1, %rd1, %rd4;	// %p1 = (index >= param_2), unsigned compare
 71 | 	@%p1 bra 	$L__BB1_2;	// out-of-range threads skip the copy
 72 | 
 73 | 	cvta.to.global.u64 	%rd7, %rd3;	// source as a global-space address
 74 | 	shl.b64 	%rd8, %rd1, 2;	// byte offset = index * 4
 75 | 	add.s64 	%rd9, %rd7, %rd8;
 76 | 	ld.global.u32 	%r3, [%rd9];	// load one 32-bit word from the source
 77 | 	cvta.to.global.u64 	%rd10, %rd2;	// destination as a global-space address
 78 | 	add.s64 	%rd11, %rd10, %rd8;
 79 | 	st.global.u32 	[%rd11], %r3;	// store it at the same offset in the destination
 80 | 
 81 | $L__BB1_2:	// common exit for in-range and out-of-range threads
 82 | 	ret;
 83 | 
 84 | }
 85 | 	// .globl	memcpyKernelTrailing
// memcpyKernelTrailing(param_0 = destination, param_1 = source, param_2 = byte limit)
//
// Copies one byte per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies byte i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the name suggests it handles the bytes left over after a
// word-wise copy -- confirm against the host-side launch code.
.visible .entry memcpyKernelTrailing(
	.param .u64 memcpyKernelTrailing_param_0,
	.param .u64 memcpyKernelTrailing_param_1,
	.param .u64 memcpyKernelTrailing_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b16 	%rs<2>;
	.reg .b32 	%r<3>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
	// %rd1 = byte index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB2_2;

	// the byte index is used directly as the address offset
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd8, %rd7, %rd1;
	ld.global.u8 	%rs1, [%rd8];
	cvta.to.global.u64 	%rd9, %rd2;
	add.s64 	%rd10, %rd9, %rd1;
	st.global.u8 	[%rd10], %rs1;

$L__BB2_2:
	ret;

}
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_87.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_87.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_87.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_87
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
// memcpyKernelHighBW(param_0 = destination, param_1 = source)
//
// Copies 32-bit words from the source buffer to the destination buffer.
// Each block covers 1024 consecutive words starting at word index
// (ctaid.y * nctaid.x + ctaid.x) * 1024; each thread copies the four words at
// tid.x, tid.x + 256, tid.x + 512 and tid.x + 768 inside that range.
// The kernel takes no length parameter and performs no bounds check.
// NOTE(review): the 256-word stride suggests a launch with 256 threads per
// block -- confirm against the host-side launch code.
.visible .entry memcpyKernelHighBW(
	.param .u64 memcpyKernelHighBW_param_0,
	.param .u64 memcpyKernelHighBW_param_1
)
{
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
	cvta.to.global.u64 	%rd3, %rd1;
	cvta.to.global.u64 	%rd4, %rd2;
	// %r4 = linear block index: ctaid.y * nctaid.x + ctaid.x
	mov.u32 	%r1, %ctaid.y;
	mov.u32 	%r2, %nctaid.x;
	mov.u32 	%r3, %ctaid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	// %rd5 = first word index of this block (1024 words per block), widened to 64 bit
	mul.wide.u32 	%rd5, %r4, 1024;
	mov.u32 	%r5, %tid.x;
	cvt.u64.u32 	%rd6, %r5;
	add.s64 	%rd7, %rd5, %rd6;
	// word index -> byte offset (4 bytes per word)
	shl.b64 	%rd8, %rd7, 2;
	// %rd9 = source address, %rd10 = destination address for this thread
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.u32 	%r6, [%rd9];
	add.s64 	%rd10, %rd3, %rd8;
	st.global.u32 	[%rd10], %r6;
	// three more words, each 1024 bytes (256 words) further on
	ld.global.u32 	%r7, [%rd9+1024];
	st.global.u32 	[%rd10+1024], %r7;
	ld.global.u32 	%r8, [%rd9+2048];
	st.global.u32 	[%rd10+2048], %r8;
	ld.global.u32 	%r9, [%rd9+3072];
	st.global.u32 	[%rd10+3072], %r9;
	ret;

}
 50 | 	// .globl	memcpyKernelLowLatency
// memcpyKernelLowLatency(param_0 = destination, param_1 = source, param_2 = word limit)
//
// Copies one 32-bit word per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies word i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the factor 256 suggests a launch with 256 threads per block --
// confirm against the host-side launch code.
.visible .entry memcpyKernelLowLatency(
	.param .u64 memcpyKernelLowLatency_param_0,
	.param .u64 memcpyKernelLowLatency_param_1,
	.param .u64 memcpyKernelLowLatency_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
	// %rd1 = word index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB1_2;

	cvta.to.global.u64 	%rd7, %rd3;
	// word index -> byte offset (4 bytes per word)
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.u32 	%r3, [%rd9];
	cvta.to.global.u64 	%rd10, %rd2;
	add.s64 	%rd11, %rd10, %rd8;
	st.global.u32 	[%rd11], %r3;

$L__BB1_2:
	ret;

}
 85 | 	// .globl	memcpyKernelTrailing
// memcpyKernelTrailing(param_0 = destination, param_1 = source, param_2 = byte limit)
//
// Copies one byte per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies byte i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the name suggests it handles the bytes left over after a
// word-wise copy -- confirm against the host-side launch code.
.visible .entry memcpyKernelTrailing(
	.param .u64 memcpyKernelTrailing_param_0,
	.param .u64 memcpyKernelTrailing_param_1,
	.param .u64 memcpyKernelTrailing_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b16 	%rs<2>;
	.reg .b32 	%r<3>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
	// %rd1 = byte index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB2_2;

	// the byte index is used directly as the address offset
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd8, %rd7, %rd1;
	ld.global.u8 	%rs1, [%rd8];
	cvta.to.global.u64 	%rd9, %rd2;
	add.s64 	%rd10, %rd9, %rd1;
	st.global.u8 	[%rd10], %rs1;

$L__BB2_2:
	ret;

}
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_89.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_89.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_89.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_89
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
// memcpyKernelHighBW(param_0 = destination, param_1 = source)
//
// Copies 32-bit words from the source buffer to the destination buffer.
// Each block covers 1024 consecutive words starting at word index
// (ctaid.y * nctaid.x + ctaid.x) * 1024; each thread copies the four words at
// tid.x, tid.x + 256, tid.x + 512 and tid.x + 768 inside that range.
// The kernel takes no length parameter and performs no bounds check.
// NOTE(review): the 256-word stride suggests a launch with 256 threads per
// block -- confirm against the host-side launch code.
.visible .entry memcpyKernelHighBW(
	.param .u64 memcpyKernelHighBW_param_0,
	.param .u64 memcpyKernelHighBW_param_1
)
{
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
	cvta.to.global.u64 	%rd3, %rd1;
	cvta.to.global.u64 	%rd4, %rd2;
	// %r4 = linear block index: ctaid.y * nctaid.x + ctaid.x
	mov.u32 	%r1, %ctaid.y;
	mov.u32 	%r2, %nctaid.x;
	mov.u32 	%r3, %ctaid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	// %rd5 = first word index of this block (1024 words per block), widened to 64 bit
	mul.wide.u32 	%rd5, %r4, 1024;
	mov.u32 	%r5, %tid.x;
	cvt.u64.u32 	%rd6, %r5;
	add.s64 	%rd7, %rd5, %rd6;
	// word index -> byte offset (4 bytes per word)
	shl.b64 	%rd8, %rd7, 2;
	// %rd9 = source address, %rd10 = destination address for this thread
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.u32 	%r6, [%rd9];
	add.s64 	%rd10, %rd3, %rd8;
	st.global.u32 	[%rd10], %r6;
	// three more words, each 1024 bytes (256 words) further on
	ld.global.u32 	%r7, [%rd9+1024];
	st.global.u32 	[%rd10+1024], %r7;
	ld.global.u32 	%r8, [%rd9+2048];
	st.global.u32 	[%rd10+2048], %r8;
	ld.global.u32 	%r9, [%rd9+3072];
	st.global.u32 	[%rd10+3072], %r9;
	ret;

}
 50 | 	// .globl	memcpyKernelLowLatency
// memcpyKernelLowLatency(param_0 = destination, param_1 = source, param_2 = word limit)
//
// Copies one 32-bit word per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies word i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the factor 256 suggests a launch with 256 threads per block --
// confirm against the host-side launch code.
.visible .entry memcpyKernelLowLatency(
	.param .u64 memcpyKernelLowLatency_param_0,
	.param .u64 memcpyKernelLowLatency_param_1,
	.param .u64 memcpyKernelLowLatency_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
	// %rd1 = word index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB1_2;

	cvta.to.global.u64 	%rd7, %rd3;
	// word index -> byte offset (4 bytes per word)
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.u32 	%r3, [%rd9];
	cvta.to.global.u64 	%rd10, %rd2;
	add.s64 	%rd11, %rd10, %rd8;
	st.global.u32 	[%rd11], %r3;

$L__BB1_2:
	ret;

}
 85 | 	// .globl	memcpyKernelTrailing
// memcpyKernelTrailing(param_0 = destination, param_1 = source, param_2 = byte limit)
//
// Copies one byte per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies byte i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the name suggests it handles the bytes left over after a
// word-wise copy -- confirm against the host-side launch code.
.visible .entry memcpyKernelTrailing(
	.param .u64 memcpyKernelTrailing_param_0,
	.param .u64 memcpyKernelTrailing_param_1,
	.param .u64 memcpyKernelTrailing_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b16 	%rs<2>;
	.reg .b32 	%r<3>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
	// %rd1 = byte index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB2_2;

	// the byte index is used directly as the address offset
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd8, %rd7, %rd1;
	ld.global.u8 	%rs1, [%rd8];
	cvta.to.global.u64 	%rd9, %rd2;
	add.s64 	%rd10, %rd9, %rd1;
	st.global.u8 	[%rd10], %rs1;

$L__BB2_2:
	ret;

}
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_90.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/driverapi/kernels/memcpy/output/memcpy_sm_90.cubin


--------------------------------------------------------------------------------
/driverapi/kernels/memcpy/output/memcpy_sm_90.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-33961263
  5 | // Cuda compilation tools, release 12.4, V12.4.99
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_90
 11 | .address_size 64
 12 | 
 13 | 	// .globl	memcpyKernelHighBW
 14 | 
// memcpyKernelHighBW(param_0 = destination, param_1 = source)
//
// Copies 32-bit words from the source buffer to the destination buffer.
// Each block covers 1024 consecutive words starting at word index
// (ctaid.y * nctaid.x + ctaid.x) * 1024; each thread copies the four words at
// tid.x, tid.x + 256, tid.x + 512 and tid.x + 768 inside that range.
// The kernel takes no length parameter and performs no bounds check.
// NOTE(review): the 256-word stride suggests a launch with 256 threads per
// block -- confirm against the host-side launch code.
.visible .entry memcpyKernelHighBW(
	.param .u64 memcpyKernelHighBW_param_0,
	.param .u64 memcpyKernelHighBW_param_1
)
{
	.reg .b32 	%r<10>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd1, [memcpyKernelHighBW_param_0];
	ld.param.u64 	%rd2, [memcpyKernelHighBW_param_1];
	cvta.to.global.u64 	%rd3, %rd1;
	cvta.to.global.u64 	%rd4, %rd2;
	// %r4 = linear block index: ctaid.y * nctaid.x + ctaid.x
	mov.u32 	%r1, %ctaid.y;
	mov.u32 	%r2, %nctaid.x;
	mov.u32 	%r3, %ctaid.x;
	mad.lo.s32 	%r4, %r1, %r2, %r3;
	// %rd5 = first word index of this block (1024 words per block), widened to 64 bit
	mul.wide.u32 	%rd5, %r4, 1024;
	mov.u32 	%r5, %tid.x;
	cvt.u64.u32 	%rd6, %r5;
	add.s64 	%rd7, %rd5, %rd6;
	// word index -> byte offset (4 bytes per word)
	shl.b64 	%rd8, %rd7, 2;
	// %rd9 = source address, %rd10 = destination address for this thread
	add.s64 	%rd9, %rd4, %rd8;
	ld.global.u32 	%r6, [%rd9];
	add.s64 	%rd10, %rd3, %rd8;
	st.global.u32 	[%rd10], %r6;
	// three more words, each 1024 bytes (256 words) further on
	ld.global.u32 	%r7, [%rd9+1024];
	st.global.u32 	[%rd10+1024], %r7;
	ld.global.u32 	%r8, [%rd9+2048];
	st.global.u32 	[%rd10+2048], %r8;
	ld.global.u32 	%r9, [%rd9+3072];
	st.global.u32 	[%rd10+3072], %r9;
	ret;

}
 50 | 	// .globl	memcpyKernelLowLatency
// memcpyKernelLowLatency(param_0 = destination, param_1 = source, param_2 = word limit)
//
// Copies one 32-bit word per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies word i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the factor 256 suggests a launch with 256 threads per block --
// confirm against the host-side launch code.
.visible .entry memcpyKernelLowLatency(
	.param .u64 memcpyKernelLowLatency_param_0,
	.param .u64 memcpyKernelLowLatency_param_1,
	.param .u64 memcpyKernelLowLatency_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b32 	%r<4>;
	.reg .b64 	%rd<12>;


	ld.param.u64 	%rd2, [memcpyKernelLowLatency_param_0];
	ld.param.u64 	%rd3, [memcpyKernelLowLatency_param_1];
	ld.param.u64 	%rd4, [memcpyKernelLowLatency_param_2];
	// %rd1 = word index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB1_2;

	cvta.to.global.u64 	%rd7, %rd3;
	// word index -> byte offset (4 bytes per word)
	shl.b64 	%rd8, %rd1, 2;
	add.s64 	%rd9, %rd7, %rd8;
	ld.global.u32 	%r3, [%rd9];
	cvta.to.global.u64 	%rd10, %rd2;
	add.s64 	%rd11, %rd10, %rd8;
	st.global.u32 	[%rd11], %r3;

$L__BB1_2:
	ret;

}
 85 | 	// .globl	memcpyKernelTrailing
// memcpyKernelTrailing(param_0 = destination, param_1 = source, param_2 = byte limit)
//
// Copies one byte per thread. The thread with index
// i = ctaid.x * 256 + tid.x copies byte i from source to destination, unless
// i >= param_2 (unsigned 64-bit compare), in which case it does nothing.
// NOTE(review): the name suggests it handles the bytes left over after a
// word-wise copy -- confirm against the host-side launch code.
.visible .entry memcpyKernelTrailing(
	.param .u64 memcpyKernelTrailing_param_0,
	.param .u64 memcpyKernelTrailing_param_1,
	.param .u64 memcpyKernelTrailing_param_2
)
{
	.reg .pred 	%p<2>;
	.reg .b16 	%rs<2>;
	.reg .b32 	%r<3>;
	.reg .b64 	%rd<11>;


	ld.param.u64 	%rd2, [memcpyKernelTrailing_param_0];
	ld.param.u64 	%rd3, [memcpyKernelTrailing_param_1];
	ld.param.u64 	%rd4, [memcpyKernelTrailing_param_2];
	// %rd1 = byte index handled by this thread: ctaid.x * 256 + tid.x
	mov.u32 	%r1, %ctaid.x;
	mul.wide.u32 	%rd5, %r1, 256;
	mov.u32 	%r2, %tid.x;
	cvt.u64.u32 	%rd6, %r2;
	add.s64 	%rd1, %rd5, %rd6;
	// skip the copy when the index is at or past param_2
	setp.ge.u64 	%p1, %rd1, %rd4;
	@%p1 bra 	$L__BB2_2;

	// the byte index is used directly as the address offset
	cvta.to.global.u64 	%rd7, %rd3;
	add.s64 	%rd8, %rd7, %rd1;
	ld.global.u8 	%rs1, [%rd8];
	cvta.to.global.u64 	%rd9, %rd2;
	add.s64 	%rd10, %rd9, %rd1;
	st.global.u8 	[%rd10], %rs1;

$L__BB2_2:
	ret;

}
120 | 
121 | 


--------------------------------------------------------------------------------
/driverapi/src/librecuda_status.cpp:
--------------------------------------------------------------------------------
 1 | #include "librecuda_status.h"
 2 | #include "librecuda_status_internal.h"
 3 | 
 4 | #include <unordered_map>
 5 | #include <string>
 6 | 
 7 | #define LIBRECUDA_DECLARE_STATUS(status, code) libreCudaStatus_t status = code;
 8 | 
 9 | #include "librecuda_all_statuses.h"
10 | 
11 | #undef LIBRECUDA_DECLARE_STATUS
12 | 
13 | 
14 | std::unordered_map<int, std::string> status_to_name = {};
15 | static bool initialized = false;
16 | 
// Populates status_to_name and then sets the `initialized` flag.
//
// Uses the X-macro pattern: LIBRECUDA_DECLARE_STATUS is temporarily defined so
// that every LIBRECUDA_DECLARE_STATUS(status, code) invocation pulled in by the
// include below expands to `status_to_name[code] = "<status>";`, i.e. the map
// value is the stringified status identifier.
// NOTE(review): presumably librecuda_all_statuses.h consists solely of such
// invocations -- confirm against the header.
void internalLibreCuInitStatusNames() {

#define LIBRECUDA_DECLARE_STATUS(status, code) status_to_name[code] = #status;

#include "librecuda_all_statuses.h"

// Undefine again so the macro does not leak past this function.
#undef LIBRECUDA_DECLARE_STATUS

    initialized = true;
}
27 | 
28 | const char *internalLibreCuGetStatusName(int code) {
29 |     auto it = status_to_name.find(code);
30 |     if (it == status_to_name.end()) {
31 |         return nullptr;
32 |     }
33 |     return it->second.c_str();
34 | }
35 | 
// Reports whether internalLibreCuInitStatusNames() has run, i.e. whether the
// file-local `initialized` flag has been set.
bool internalLibreCuInitStatusNamesInitialized() {
    return initialized;
}


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_subdirectory(write_float)
 2 | add_subdirectory(memcopy)
 3 | add_subdirectory(dynamic_shared_mem)
 4 | add_subdirectory(compute_chronological_consistency)
 5 | add_subdirectory(async_kernels)
 6 | add_subdirectory(dma_chronological_consistency)
 7 | add_subdirectory(kernel_struct_param)
 8 | add_subdirectory(indexing)
 9 | add_subdirectory(stream_events)
10 | add_subdirectory(many_kernels_launch)


--------------------------------------------------------------------------------
/tests/async_kernels/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         test_async_kernels
 3 |         main.cpp
 4 | )
 5 | target_link_libraries(
 6 |         test_async_kernels
 7 |         PRIVATE
 8 |         driverapi
 9 | )
10 | 
11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/async_kernels COPYONLY)


--------------------------------------------------------------------------------
/tests/async_kernels/write_float.cu:
--------------------------------------------------------------------------------
// Adds 1.0 to a double accumulator n = 100000000 times, divides the sum by n
// (which yields 1.0) and stores that value plus *input into *dst.
//
// dst:   location that receives the result (a single float).
// input: location of the float that is added to the computed value.
//
// There is no per-thread indexing: every thread that executes the kernel
// writes the same single element at dst.
// NOTE(review): the long loop presumably exists only to make the kernel run
// for a noticeable amount of time -- confirm against the test that launches it.
extern "C" __global__ void write_float(float *dst, float *input) {
    double x = 0;
    int n = 100000000;  // number of loop iterations
    for (int i = 0; i < n; i++) {
        x += 1.0;
    }
    x /= n;  // n / n == 1.0
    *dst = (float) x + (*input);
}


--------------------------------------------------------------------------------
/tests/async_kernels/write_float.cu.asm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/async_kernels/write_float.cu.asm


--------------------------------------------------------------------------------
/tests/async_kernels/write_float.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/async_kernels/write_float.cubin


--------------------------------------------------------------------------------
/tests/async_kernels/write_float.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-34097967
  5 | // Cuda compilation tools, release 12.4, V12.4.131
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_80
 11 | .address_size 64
 12 | 
 13 | 	// .globl	write_float
 14 | 
// write_float(param_0 = dst, param_1 = input)
//
// Accumulates 1.0 (0d3FF0000000000000) into %fd68 a total of 100000000 times,
// divides the sum by 100000000.0 (0d4197D78400000000), converts the quotient
// to f32, adds the float loaded from param_1 and stores the result to param_0.
// The loop body holds 64 additions and the counter %r4 advances by 64 per
// iteration.
.visible .entry write_float(
	.param .u64 write_float_param_0,
	.param .u64 write_float_param_1
)
{
	.reg .pred 	%p<2>;
	.reg .f32 	%f<4>;
	.reg .b32 	%r<5>;
	.reg .f64 	%fd<69>;
	.reg .b64 	%rd<5>;


	ld.param.u64 	%rd3, [write_float_param_0];
	ld.param.u64 	%rd2, [write_float_param_1];
	cvta.to.global.u64 	%rd1, %rd3;
	// accumulator = 0.0, loop counter = 0
	mov.f64 	%fd68, 0d0000000000000000;
	mov.u32 	%r4, 0;

$L__BB0_1:
	// 64 chained additions of 1.0; the chain starts from and ends in %fd68
	add.f64 	%fd4, %fd68, 0d3FF0000000000000;
	add.f64 	%fd5, %fd4, 0d3FF0000000000000;
	add.f64 	%fd6, %fd5, 0d3FF0000000000000;
	add.f64 	%fd7, %fd6, 0d3FF0000000000000;
	add.f64 	%fd8, %fd7, 0d3FF0000000000000;
	add.f64 	%fd9, %fd8, 0d3FF0000000000000;
	add.f64 	%fd10, %fd9, 0d3FF0000000000000;
	add.f64 	%fd11, %fd10, 0d3FF0000000000000;
	add.f64 	%fd12, %fd11, 0d3FF0000000000000;
	add.f64 	%fd13, %fd12, 0d3FF0000000000000;
	add.f64 	%fd14, %fd13, 0d3FF0000000000000;
	add.f64 	%fd15, %fd14, 0d3FF0000000000000;
	add.f64 	%fd16, %fd15, 0d3FF0000000000000;
	add.f64 	%fd17, %fd16, 0d3FF0000000000000;
	add.f64 	%fd18, %fd17, 0d3FF0000000000000;
	add.f64 	%fd19, %fd18, 0d3FF0000000000000;
	add.f64 	%fd20, %fd19, 0d3FF0000000000000;
	add.f64 	%fd21, %fd20, 0d3FF0000000000000;
	add.f64 	%fd22, %fd21, 0d3FF0000000000000;
	add.f64 	%fd23, %fd22, 0d3FF0000000000000;
	add.f64 	%fd24, %fd23, 0d3FF0000000000000;
	add.f64 	%fd25, %fd24, 0d3FF0000000000000;
	add.f64 	%fd26, %fd25, 0d3FF0000000000000;
	add.f64 	%fd27, %fd26, 0d3FF0000000000000;
	add.f64 	%fd28, %fd27, 0d3FF0000000000000;
	add.f64 	%fd29, %fd28, 0d3FF0000000000000;
	add.f64 	%fd30, %fd29, 0d3FF0000000000000;
	add.f64 	%fd31, %fd30, 0d3FF0000000000000;
	add.f64 	%fd32, %fd31, 0d3FF0000000000000;
	add.f64 	%fd33, %fd32, 0d3FF0000000000000;
	add.f64 	%fd34, %fd33, 0d3FF0000000000000;
	add.f64 	%fd35, %fd34, 0d3FF0000000000000;
	add.f64 	%fd36, %fd35, 0d3FF0000000000000;
	add.f64 	%fd37, %fd36, 0d3FF0000000000000;
	add.f64 	%fd38, %fd37, 0d3FF0000000000000;
	add.f64 	%fd39, %fd38, 0d3FF0000000000000;
	add.f64 	%fd40, %fd39, 0d3FF0000000000000;
	add.f64 	%fd41, %fd40, 0d3FF0000000000000;
	add.f64 	%fd42, %fd41, 0d3FF0000000000000;
	add.f64 	%fd43, %fd42, 0d3FF0000000000000;
	add.f64 	%fd44, %fd43, 0d3FF0000000000000;
	add.f64 	%fd45, %fd44, 0d3FF0000000000000;
	add.f64 	%fd46, %fd45, 0d3FF0000000000000;
	add.f64 	%fd47, %fd46, 0d3FF0000000000000;
	add.f64 	%fd48, %fd47, 0d3FF0000000000000;
	add.f64 	%fd49, %fd48, 0d3FF0000000000000;
	add.f64 	%fd50, %fd49, 0d3FF0000000000000;
	add.f64 	%fd51, %fd50, 0d3FF0000000000000;
	add.f64 	%fd52, %fd51, 0d3FF0000000000000;
	add.f64 	%fd53, %fd52, 0d3FF0000000000000;
	add.f64 	%fd54, %fd53, 0d3FF0000000000000;
	add.f64 	%fd55, %fd54, 0d3FF0000000000000;
	add.f64 	%fd56, %fd55, 0d3FF0000000000000;
	add.f64 	%fd57, %fd56, 0d3FF0000000000000;
	add.f64 	%fd58, %fd57, 0d3FF0000000000000;
	add.f64 	%fd59, %fd58, 0d3FF0000000000000;
	add.f64 	%fd60, %fd59, 0d3FF0000000000000;
	add.f64 	%fd61, %fd60, 0d3FF0000000000000;
	add.f64 	%fd62, %fd61, 0d3FF0000000000000;
	add.f64 	%fd63, %fd62, 0d3FF0000000000000;
	add.f64 	%fd64, %fd63, 0d3FF0000000000000;
	add.f64 	%fd65, %fd64, 0d3FF0000000000000;
	add.f64 	%fd66, %fd65, 0d3FF0000000000000;
	add.f64 	%fd68, %fd66, 0d3FF0000000000000;
	// advance the counter by the 64 additions just performed; loop until 100000000
	add.s32 	%r4, %r4, 64;
	setp.ne.s32 	%p1, %r4, 100000000;
	@%p1 bra 	$L__BB0_1;

	cvta.to.global.u64 	%rd4, %rd2;
	// sum / 100000000.0, rounded to f32
	div.rn.f64 	%fd67, %fd68, 0d4197D78400000000;
	cvt.rn.f32.f64 	%f1, %fd67;
	// result = *input + quotient, stored to *dst
	ld.global.f32 	%f2, [%rd4];
	add.f32 	%f3, %f2, %f1;
	st.global.f32 	[%rd1], %f3;
	ret;

}
111 | 
112 | 


--------------------------------------------------------------------------------
/tests/compile_cubin.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | compile_cubin() {
 3 |     if [ -z "$1" ]; then
 4 |         echo "Usage: ./compile_cubin.sh <filename_base>. (e.g ./compile_cubin.sh write_float/write_float)"
 5 |         return 1
 6 |     fi
 7 | 
 8 |     filename="$1"
 9 |     
10 |     nvcc -ptx -std=c++11 -arch=sm_80 "${filename}.cu" -o "${filename}.ptx"
11 |     ptxas -arch=sm_80 "${filename}.ptx" -o "${filename}.cubin"
12 |     nvdisasm "${filename}.cubin" > "${filename}.asm"
13 |     
14 |     echo "Successfully compiled and disassembled ${filename}.cu"
15 | }
16 | 
17 | compile_cubin "$1"
18 | 


--------------------------------------------------------------------------------
/tests/complex/CMakeLists.txt:
--------------------------------------------------------------------------------
# Placeholder: this directory defines no targets yet; it only prints a reminder
# when it is processed at configure time.
message("Todo: add complex test.")


--------------------------------------------------------------------------------
/tests/compute_chronological_consistency/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         test_chronological_consistency
 3 |         main.cpp
 4 | )
 5 | target_link_libraries(
 6 |         test_chronological_consistency
 7 |         PRIVATE
 8 |         driverapi
 9 | )
10 | 
11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/compute_chronological_consistency COPYONLY)


--------------------------------------------------------------------------------
/tests/compute_chronological_consistency/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <librecuda.h>
  2 | 
  3 | #include <iostream>
  4 | #include <vector>
  5 | #include <fstream>
  6 | #include <cstring>
  7 | 
  8 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
  9 |     if (error != LIBRECUDA_SUCCESS) {
 10 |         const char *error_string;
 11 |         libreCuGetErrorString(error, &error_string);
 12 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
 13 |         exit(EXIT_FAILURE);
 14 |     }
 15 | };
 16 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 17 | 
 18 | int main() {
 19 |     CUDA_CHECK(libreCuInit(0));
 20 | 
 21 |     int device_count{};
 22 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 23 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 24 | 
 25 |     LibreCUdevice device{};
 26 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 27 | 
 28 |     LibreCUcontext ctx{};
 29 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 30 | 
 31 |     char name_buffer[256] = {};
 32 |     libreCuDeviceGetName(name_buffer, 256, device);
 33 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 34 | 
 35 |     LibreCUmodule module{};
 36 | 
 37 |     // read cubin file
 38 |     uint8_t *image;
 39 |     size_t n_bytes;
 40 |     {
 41 |         std::ifstream input("write_float.cubin", std::ios::binary);
 42 |         std::vector<uint8_t> bytes(
 43 |                 (std::istreambuf_iterator<char>(input)),
 44 |                 (std::istreambuf_iterator<char>()));
 45 |         input.close();
 46 |         image = new uint8_t[bytes.size()];
 47 |         memcpy(image, bytes.data(), bytes.size());
 48 |         n_bytes = bytes.size();
 49 |     }
 50 |     CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));
 51 | 
 52 |     // read functions
 53 |     uint32_t num_funcs{};
 54 |     CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
 55 |     std::cout << "Num functions: " << num_funcs << std::endl;
 56 | 
 57 |     auto *functions = new LibreCUFunction[num_funcs];
 58 |     CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
 59 | 
 60 |     for (size_t i = 0; i < num_funcs; i++) {
 61 |         LibreCUFunction func = functions[i];
 62 |         const char *func_name{};
 63 |         CUDA_CHECK(libreCuFuncGetName(&func_name, func));
 64 |         std::cout << "  function \"" << func_name << "\"" << std::endl;
 65 |     }
 66 | 
 67 |     delete[] functions;
 68 | 
 69 |     // find function
 70 |     LibreCUFunction func{};
 71 |     CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));
 72 | 
 73 |     // create stream
 74 |     LibreCUstream stream{};
 75 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 76 | 
 77 |     void *float_dst_compute_va{};
 78 |     void *float_dst_dma_va{};
 79 |     CUDA_CHECK(libreCuMemAlloc(&float_dst_compute_va, sizeof(float), true));
 80 |     CUDA_CHECK(libreCuMemAlloc(&float_dst_dma_va, sizeof(float), true));
 81 |     *(float *) float_dst_compute_va = 0.0f;
 82 |     *(float *) float_dst_dma_va = 0.0f;
 83 | 
 84 |     {
 85 |         void *params[] = {
 86 |                 &float_dst_compute_va, &float_dst_dma_va
 87 |         };
 88 |         CUDA_CHECK(
 89 |                 libreCuLaunchKernel(func,
 90 |                                     1, 1, 1,
 91 |                                     1, 1, 1,
 92 |                                     0,
 93 |                                     stream,
 94 |                                     params, sizeof(params) / sizeof(void *),
 95 |                                     nullptr
 96 |                 )
 97 |         );
 98 |     }
 99 |     CUDA_CHECK(libreCuMemCpy(float_dst_dma_va, float_dst_compute_va, sizeof(float), stream));
100 |     {
101 |         void *params[] = {
102 |                 &float_dst_compute_va, &float_dst_dma_va
103 |         };
104 |         CUDA_CHECK(
105 |                 libreCuLaunchKernel(func,
106 |                                     1, 1, 1,
107 |                                     1, 1, 1,
108 |                                     0,
109 |                                     stream,
110 |                                     params, sizeof(params) / sizeof(void *),
111 |                                     nullptr
112 |                 )
113 |         );
114 |     }
115 |     CUDA_CHECK(libreCuStreamCommence(stream));
116 |     CUDA_CHECK(libreCuStreamAwait(stream));
117 | 
118 |     std::cout << "Dst compute value (post exec): " << *(float *) (float_dst_compute_va) << std::endl;
119 |     std::cout << "Dst dma value (post exec): " << *(float *) (float_dst_dma_va) << std::endl;
120 | 
121 |     // free memory
122 |     CUDA_CHECK(libreCuMemFree(float_dst_compute_va));
123 |     CUDA_CHECK(libreCuMemFree(float_dst_dma_va));
124 | 
125 |     // destroy stream
126 |     CUDA_CHECK(libreCuStreamDestroy(stream));
127 | 
128 |     // unload module
129 |     CUDA_CHECK(libreCuModuleUnload(module));
130 | 
131 |     // destroy ctx
132 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
133 |     return 0;
134 | }


--------------------------------------------------------------------------------
/tests/compute_chronological_consistency/write_float.cu:
--------------------------------------------------------------------------------
// Adds 1.0 to a double accumulator n = 100000000 times, divides the sum by n
// (which yields 1.0) and stores that value plus *input into *dst.
//
// dst:   location that receives the result (a single float).
// input: location of the float that is added to the computed value.
//
// There is no per-thread indexing: every thread that executes the kernel
// writes the same single element at dst.
// NOTE(review): the long loop presumably exists only to make the kernel run
// for a noticeable amount of time -- confirm against the test that launches it.
extern "C" __global__ void write_float(float *dst, float *input) {
    double x = 0;
    int n = 100000000;  // number of loop iterations
    for (int i = 0; i < n; i++) {
        x += 1.0;
    }
    x /= n;  // n / n == 1.0
    *dst = (float) x + (*input);
}


--------------------------------------------------------------------------------
/tests/compute_chronological_consistency/write_float.cu.asm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/compute_chronological_consistency/write_float.cu.asm


--------------------------------------------------------------------------------
/tests/compute_chronological_consistency/write_float.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/compute_chronological_consistency/write_float.cubin


--------------------------------------------------------------------------------
/tests/compute_chronological_consistency/write_float.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-34097967
  5 | // Cuda compilation tools, release 12.4, V12.4.131
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_80
 11 | .address_size 64
 12 | 
 13 | 	// .globl	write_float
 14 | 
 15 | .visible .entry write_float(
 16 | 	.param .u64 write_float_param_0,
 17 | 	.param .u64 write_float_param_1
 18 | )
 19 | {
 20 | 	.reg .pred 	%p<2>;
 21 | 	.reg .f32 	%f<4>;
 22 | 	.reg .b32 	%r<5>;
 23 | 	.reg .f64 	%fd<69>;
 24 | 	.reg .b64 	%rd<5>;
 25 | 
 26 | 
 27 | 	ld.param.u64 	%rd3, [write_float_param_0];
 28 | 	ld.param.u64 	%rd2, [write_float_param_1];
 29 | 	cvta.to.global.u64 	%rd1, %rd3;
 30 | 	mov.f64 	%fd68, 0d0000000000000000;
 31 | 	mov.u32 	%r4, 0;
 32 | 
 33 | $L__BB0_1:
 34 | 	add.f64 	%fd4, %fd68, 0d3FF0000000000000;
 35 | 	add.f64 	%fd5, %fd4, 0d3FF0000000000000;
 36 | 	add.f64 	%fd6, %fd5, 0d3FF0000000000000;
 37 | 	add.f64 	%fd7, %fd6, 0d3FF0000000000000;
 38 | 	add.f64 	%fd8, %fd7, 0d3FF0000000000000;
 39 | 	add.f64 	%fd9, %fd8, 0d3FF0000000000000;
 40 | 	add.f64 	%fd10, %fd9, 0d3FF0000000000000;
 41 | 	add.f64 	%fd11, %fd10, 0d3FF0000000000000;
 42 | 	add.f64 	%fd12, %fd11, 0d3FF0000000000000;
 43 | 	add.f64 	%fd13, %fd12, 0d3FF0000000000000;
 44 | 	add.f64 	%fd14, %fd13, 0d3FF0000000000000;
 45 | 	add.f64 	%fd15, %fd14, 0d3FF0000000000000;
 46 | 	add.f64 	%fd16, %fd15, 0d3FF0000000000000;
 47 | 	add.f64 	%fd17, %fd16, 0d3FF0000000000000;
 48 | 	add.f64 	%fd18, %fd17, 0d3FF0000000000000;
 49 | 	add.f64 	%fd19, %fd18, 0d3FF0000000000000;
 50 | 	add.f64 	%fd20, %fd19, 0d3FF0000000000000;
 51 | 	add.f64 	%fd21, %fd20, 0d3FF0000000000000;
 52 | 	add.f64 	%fd22, %fd21, 0d3FF0000000000000;
 53 | 	add.f64 	%fd23, %fd22, 0d3FF0000000000000;
 54 | 	add.f64 	%fd24, %fd23, 0d3FF0000000000000;
 55 | 	add.f64 	%fd25, %fd24, 0d3FF0000000000000;
 56 | 	add.f64 	%fd26, %fd25, 0d3FF0000000000000;
 57 | 	add.f64 	%fd27, %fd26, 0d3FF0000000000000;
 58 | 	add.f64 	%fd28, %fd27, 0d3FF0000000000000;
 59 | 	add.f64 	%fd29, %fd28, 0d3FF0000000000000;
 60 | 	add.f64 	%fd30, %fd29, 0d3FF0000000000000;
 61 | 	add.f64 	%fd31, %fd30, 0d3FF0000000000000;
 62 | 	add.f64 	%fd32, %fd31, 0d3FF0000000000000;
 63 | 	add.f64 	%fd33, %fd32, 0d3FF0000000000000;
 64 | 	add.f64 	%fd34, %fd33, 0d3FF0000000000000;
 65 | 	add.f64 	%fd35, %fd34, 0d3FF0000000000000;
 66 | 	add.f64 	%fd36, %fd35, 0d3FF0000000000000;
 67 | 	add.f64 	%fd37, %fd36, 0d3FF0000000000000;
 68 | 	add.f64 	%fd38, %fd37, 0d3FF0000000000000;
 69 | 	add.f64 	%fd39, %fd38, 0d3FF0000000000000;
 70 | 	add.f64 	%fd40, %fd39, 0d3FF0000000000000;
 71 | 	add.f64 	%fd41, %fd40, 0d3FF0000000000000;
 72 | 	add.f64 	%fd42, %fd41, 0d3FF0000000000000;
 73 | 	add.f64 	%fd43, %fd42, 0d3FF0000000000000;
 74 | 	add.f64 	%fd44, %fd43, 0d3FF0000000000000;
 75 | 	add.f64 	%fd45, %fd44, 0d3FF0000000000000;
 76 | 	add.f64 	%fd46, %fd45, 0d3FF0000000000000;
 77 | 	add.f64 	%fd47, %fd46, 0d3FF0000000000000;
 78 | 	add.f64 	%fd48, %fd47, 0d3FF0000000000000;
 79 | 	add.f64 	%fd49, %fd48, 0d3FF0000000000000;
 80 | 	add.f64 	%fd50, %fd49, 0d3FF0000000000000;
 81 | 	add.f64 	%fd51, %fd50, 0d3FF0000000000000;
 82 | 	add.f64 	%fd52, %fd51, 0d3FF0000000000000;
 83 | 	add.f64 	%fd53, %fd52, 0d3FF0000000000000;
 84 | 	add.f64 	%fd54, %fd53, 0d3FF0000000000000;
 85 | 	add.f64 	%fd55, %fd54, 0d3FF0000000000000;
 86 | 	add.f64 	%fd56, %fd55, 0d3FF0000000000000;
 87 | 	add.f64 	%fd57, %fd56, 0d3FF0000000000000;
 88 | 	add.f64 	%fd58, %fd57, 0d3FF0000000000000;
 89 | 	add.f64 	%fd59, %fd58, 0d3FF0000000000000;
 90 | 	add.f64 	%fd60, %fd59, 0d3FF0000000000000;
 91 | 	add.f64 	%fd61, %fd60, 0d3FF0000000000000;
 92 | 	add.f64 	%fd62, %fd61, 0d3FF0000000000000;
 93 | 	add.f64 	%fd63, %fd62, 0d3FF0000000000000;
 94 | 	add.f64 	%fd64, %fd63, 0d3FF0000000000000;
 95 | 	add.f64 	%fd65, %fd64, 0d3FF0000000000000;
 96 | 	add.f64 	%fd66, %fd65, 0d3FF0000000000000;
 97 | 	add.f64 	%fd68, %fd66, 0d3FF0000000000000;
 98 | 	add.s32 	%r4, %r4, 64;
 99 | 	setp.ne.s32 	%p1, %r4, 100000000;
100 | 	@%p1 bra 	$L__BB0_1;
101 | 
102 | 	cvta.to.global.u64 	%rd4, %rd2;
103 | 	div.rn.f64 	%fd67, %fd68, 0d4197D78400000000;
104 | 	cvt.rn.f32.f64 	%f1, %fd67;
105 | 	ld.global.f32 	%f2, [%rd4];
106 | 	add.f32 	%f3, %f2, %f1;
107 | 	st.global.f32 	[%rd1], %f3;
108 | 	ret;
109 | 
110 | }
111 | 
112 | 


--------------------------------------------------------------------------------
/tests/dma_chronological_consistency/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(
2 |         test_dma_chronological_consistency
3 |         main.cpp
4 | ) # test executable built from main.cpp only; this test copies no kernel image
5 | target_link_libraries(
6 |         test_dma_chronological_consistency
7 |         PRIVATE
8 |         driverapi
9 | ) # links the driverapi library target privately


--------------------------------------------------------------------------------
/tests/dma_chronological_consistency/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <librecuda.h>
 2 | 
 3 | #include <iostream>
 4 | 
 5 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 6 |     if (error != LIBRECUDA_SUCCESS) {
 7 |         const char *error_string;
 8 |         libreCuGetErrorString(error, &error_string);
 9 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
10 |         exit(EXIT_FAILURE);
11 |     }
12 | };
13 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
14 | 
15 | int main() {
16 |     CUDA_CHECK(libreCuInit(0));
17 | 
18 |     int device_count{};
19 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
20 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
21 | 
22 |     LibreCUdevice device{};
23 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
24 | 
25 |     LibreCUcontext ctx{};
26 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
27 | 
28 |     // create stream
29 |     LibreCUstream stream{};
30 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
31 | 
32 |     // source host array, filled with a repeating 0..255 byte pattern
33 |     uint8_t host_array[1024 * 1024 + 128 + 3]{}; // size chosen to require all 3 memcpy hierarchy kernels to be launched
34 |     for (size_t i = 0; i < sizeof(host_array); i++) {
35 |         host_array[i] = i % 256;
36 |     }
37 | 
38 |     // destination host array that receives the data after the round trip
39 |     uint8_t dst_host_array[sizeof(host_array)] = {};
40 | 
41 |     // allocate the device buffers the data is relayed through
42 |     constexpr size_t n_device_arrays = 6;
43 |     uint8_t *device_arrays[n_device_arrays]{};
44 |     for (auto &device_array : device_arrays) {
45 |         CUDA_CHECK(libreCuMemAlloc(reinterpret_cast<void **>(&device_array), sizeof(host_array)));
46 |     }
47 | 
48 |     // copy to gpu
49 |     CUDA_CHECK(libreCuMemCpy(device_arrays[0], host_array, sizeof(host_array), stream));
50 | 
51 |     // copy d2d: every buffer is filled from its predecessor, all on the same stream
52 |     for (size_t i = 1; i < n_device_arrays; i++) {
53 |         CUDA_CHECK(libreCuMemCpy(device_arrays[i], device_arrays[i - 1], sizeof(host_array), stream));
54 |     }
55 | 
56 |     // copy back to host
57 |     CUDA_CHECK(libreCuMemCpy(dst_host_array, device_arrays[n_device_arrays - 1], sizeof(host_array), stream));
58 | 
59 |     // commence stream
60 |     CUDA_CHECK(libreCuStreamCommence(stream));
61 |     CUDA_CHECK(libreCuStreamAwait(stream));
62 | 
63 |     // compare the round-tripped bytes against the source; i keeps the first mismatching index
64 |     bool is_equal = true;
65 |     size_t i;
66 |     for (i = 0; i < sizeof(host_array); i++) {
67 |         if (host_array[i] != dst_host_array[i]) {
68 |             is_equal = false;
69 |             break;
70 |         }
71 |     }
72 |     if (!is_equal) {
73 |         std::cerr << "Mismatch at " + std::to_string(i) + ": memcpy screwed something up!" << std::endl;
74 |     } else {
75 |         std::cout << "Memory is equal!" << std::endl;
76 |     }
77 | 
78 |     // destroy stream
79 |     CUDA_CHECK(libreCuStreamDestroy(stream));
80 | 
81 |     // free memory: release every device buffer (previously only the first two were freed)
82 |     for (uint8_t *device_array : device_arrays) {
83 |         CUDA_CHECK(libreCuMemFree(device_array));
84 |     }
85 | 
86 |     // destroy ctx
87 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
88 |     return is_equal ? EXIT_SUCCESS : EXIT_FAILURE; // a mismatch must fail the test run
89 | }


--------------------------------------------------------------------------------
/tests/dynamic_shared_mem/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         test_dynamic_shared_mem
 3 |         main.cpp
 4 | ) # test executable built from main.cpp only
 5 | target_link_libraries(
 6 |         test_dynamic_shared_mem
 7 |         PRIVATE
 8 |         driverapi
 9 | ) # links the driverapi library target privately
10 | 
11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/dynamic_shared_mem COPYONLY) # copies the cubin unmodified into the build tree; main.cpp opens it as "write_float.cubin"


--------------------------------------------------------------------------------
/tests/dynamic_shared_mem/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <librecuda.h>
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <cstring>
  8 | 
  9 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 10 |     if (error != LIBRECUDA_SUCCESS) {
 11 |         const char *error_string;
 12 |         libreCuGetErrorString(error, &error_string);
 13 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
 14 |         exit(EXIT_FAILURE);
 15 |     }
 16 | };
 17 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 18 | 
 19 | int main() {
 20 |     CUDA_CHECK(libreCuInit(0));
 21 | 
 22 |     int device_count{};
 23 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 24 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 25 | 
 26 |     LibreCUdevice device{};
 27 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 28 | 
 29 |     LibreCUcontext ctx{};
 30 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 31 | 
 32 |     char name_buffer[256] = {};
 33 |     libreCuDeviceGetName(name_buffer, 256, device);
 34 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 35 | 
 36 |     int maxSharedMemoryPerBlock{};
 37 |     CUDA_CHECK(libreCuDeviceGetAttribute(&maxSharedMemoryPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device));
 38 | 
 39 |     int maxSharedMemoryPerBlockOptIn{};
 40 |     CUDA_CHECK(libreCuDeviceGetAttribute(&maxSharedMemoryPerBlockOptIn, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, device));
 41 | 
 42 |     std::cout << "Maximum shared memory per block: " << maxSharedMemoryPerBlock << " bytes" << std::endl;
 43 |     std::cout << "Maximum shared memory per block (opt-in): " << maxSharedMemoryPerBlockOptIn << " bytes" << std::endl;
 44 | 
 45 |     LibreCUmodule module{};
 46 | 
 47 |     // read cubin file
 48 |     // the vector owns the image bytes and releases them automatically (the former new[] copy was never freed)
 49 |     std::vector<uint8_t> image;
 50 |     {
 51 |         std::ifstream input("write_float.cubin", std::ios::binary);
 52 |         if (!input) {
 53 |             std::cerr << "Failed to open write_float.cubin" << std::endl;
 54 |             return EXIT_FAILURE;
 55 |         }
 56 |         image.assign(
 57 |                 (std::istreambuf_iterator<char>(input)),
 58 |                 (std::istreambuf_iterator<char>()));
 59 |     }
 60 |     CUDA_CHECK(libreCuModuleLoadData(&module, image.data(), image.size()));
 61 | 
 62 |     // read functions
 63 |     uint32_t num_funcs{};
 64 |     CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
 65 |     std::cout << "Num functions: " << num_funcs << std::endl;
 66 | 
 67 |     auto *functions = new LibreCUFunction[num_funcs];
 68 |     CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
 69 | 
 70 |     for (size_t i = 0; i < num_funcs; i++) {
 71 |         LibreCUFunction func = functions[i];
 72 |         const char *func_name{};
 73 |         CUDA_CHECK(libreCuFuncGetName(&func_name, func));
 74 |         std::cout << "  function \"" << func_name << "\"" << std::endl;
 75 |     }
 76 | 
 77 |     delete[] functions;
 78 | 
 79 |     // find function
 80 |     LibreCUFunction func{};
 81 |     CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float_sum"));
 82 | 
 83 |     // set dynamic shared memory
 84 |     CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192));
 85 | 
 86 |     // create stream
 87 |     LibreCUstream stream{};
 88 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 89 | 
 90 |     void *float_dst_va{};
 91 |     CUDA_CHECK(libreCuMemAlloc(&float_dst_va, sizeof(float), true)); // NOTE(review): `true` presumably requests host-visible memory, since the pointer is dereferenced on the host below - confirm
 92 | 
 93 |     float float_value = 0.31415f;
 94 |     short short_value = 314;
 95 | 
 96 |     std::cout << std::fixed;
 97 |     std::cout << std::setprecision(5);
 98 | 
 99 |     std::cout << "A value: " << short_value << std::endl;
100 |     std::cout << "B value: " << float_value << std::endl;
101 |     std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl;
102 | 
103 |     void *params[] = {
104 |             &float_dst_va, // dst
105 |             &short_value, // a
106 |             &float_value // b
107 |     };
108 |     CUDA_CHECK(
109 |             libreCuLaunchKernel(func,
110 |                                 1, 1, 1,
111 |                                 1, 1, 1,
112 |                                 8192, // dynamic shared memory bytes; the kernel touches byte offsets 4096..4103
113 |                                 stream,
114 |                                 params, sizeof(params) / sizeof(void *),
115 |                                 nullptr
116 |             )
117 |     );
118 | 
119 |     // dispatch built up command buffer to GPU
120 |     CUDA_CHECK(libreCuStreamCommence(stream));
121 | 
122 |     // wait for work to complete
123 |     CUDA_CHECK(libreCuStreamAwait(stream));
124 |     std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl;
125 | 
126 |     // free memory
127 |     CUDA_CHECK(libreCuMemFree(float_dst_va));
128 | 
129 |     // destroy stream
130 |     CUDA_CHECK(libreCuStreamDestroy(stream));
131 | 
132 |     // unload module
133 |     CUDA_CHECK(libreCuModuleUnload(module));
134 | 
135 |     // destroy ctx
136 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
137 |     return 0;
138 | }


--------------------------------------------------------------------------------
/tests/dynamic_shared_mem/write_float.cu:
--------------------------------------------------------------------------------
1 | extern "C" __global__ void write_float_sum(float *dst, short a, float b) { // stores (float) a + b into *dst, staging both operands in shared memory first
2 |     extern __shared__ float sharedData[]; // unsized extern __shared__ array: its size is not fixed here but supplied by the launch
3 | 
4 |     sharedData[1024] = (float) a; // element 1024 = byte offset 4096
5 |     sharedData[1025] = (float) b; // element 1025 = byte offset 4100, so the launch must provide at least 4104 bytes
6 | 
7 |     *dst = (sharedData[1024] + sharedData[1025]);
8 | }


--------------------------------------------------------------------------------
/tests/dynamic_shared_mem/write_float.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/dynamic_shared_mem/write_float.cubin


--------------------------------------------------------------------------------
/tests/dynamic_shared_mem/write_float.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-34097967
 5 | // Cuda compilation tools, release 12.4, V12.4.131
 6 | // Based on NVVM 7.0.1
 7 | //
 8 | 
 9 | .version 8.4
10 | .target sm_80
11 | .address_size 64
12 | 
13 | 	// .globl	write_float_sum
14 | .extern .shared .align 16 .b8 sharedData[];
15 | 
16 | .visible .entry write_float_sum(
17 | 	.param .u64 write_float_sum_param_0,
18 | 	.param .u16 write_float_sum_param_1,
19 | 	.param .f32 write_float_sum_param_2
20 | )
21 | {
22 | 	.reg .b16 	%rs<2>;
23 | 	.reg .f32 	%f<4>;
24 | 	.reg .b64 	%rd<3>;
25 | 
26 | 
27 | 	ld.param.u64 	%rd1, [write_float_sum_param_0];
28 | 	ld.param.u16 	%rs1, [write_float_sum_param_1];
29 | 	cvta.to.global.u64 	%rd2, %rd1;
30 | 	cvt.rn.f32.s16 	%f1, %rs1;
31 | 	ld.param.f32 	%f2, [write_float_sum_param_2];
32 | 	st.shared.v2.f32 	[sharedData+4096], {%f1, %f2};
33 | 	add.f32 	%f3, %f1, %f2;
34 | 	st.global.f32 	[%rd2], %f3;
35 | 	ret;
36 | 
37 | }
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/indexing/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         test_indexing
 3 |         main.cpp
 4 | )
 5 | target_link_libraries(
 6 |         test_indexing
 7 |         PRIVATE
 8 |         driverapi
 9 | )
10 | 
11 | # Copy the kernel image next to the test binary, using the same tests/<source dir> layout as the sibling tests.
12 | # The destination used to be tests/test_indexing/ (the target name), which is not where main.cpp looks for it.
13 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/indexing/ COPYONLY)


--------------------------------------------------------------------------------
/tests/indexing/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <librecuda.h>
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <cstring>
  8 | 
  9 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 10 |     if (error != LIBRECUDA_SUCCESS) {
 11 |         const char *error_string;
 12 |         libreCuGetErrorString(error, &error_string);
 13 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
 14 |         exit(EXIT_FAILURE);
 15 |     }
 16 | };
 17 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 18 | 
 19 | int main() {
 20 |     CUDA_CHECK(libreCuInit(0));
 21 | 
 22 |     int device_count{};
 23 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 24 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 25 | 
 26 |     LibreCUdevice device{};
 27 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 28 | 
 29 |     LibreCUcontext ctx{};
 30 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 31 | 
 32 |     char name_buffer[256] = {};
 33 |     libreCuDeviceGetName(name_buffer, 256, device);
 34 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 35 | 
 36 |     LibreCUmodule module{};
 37 | 
 38 |     // read cubin file
 39 |     // the vector owns the image bytes and releases them automatically (the former new[] copy was never freed)
 40 |     std::vector<uint8_t> image;
 41 |     {
 42 |         std::ifstream input("write_float.cubin", std::ios::binary);
 43 |         if (!input) {
 44 |             std::cerr << "Failed to open write_float.cubin" << std::endl;
 45 |             return EXIT_FAILURE;
 46 |         }
 47 |         image.assign(
 48 |                 (std::istreambuf_iterator<char>(input)),
 49 |                 (std::istreambuf_iterator<char>()));
 50 |     }
 51 |     CUDA_CHECK(libreCuModuleLoadData(&module, image.data(), image.size()));
 52 | 
 53 |     // read functions
 54 |     uint32_t num_funcs{};
 55 |     CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
 56 |     std::cout << "Num functions: " << num_funcs << std::endl;
 57 | 
 58 |     auto *functions = new LibreCUFunction[num_funcs];
 59 |     CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
 60 | 
 61 |     for (size_t i = 0; i < num_funcs; i++) {
 62 |         LibreCUFunction func = functions[i];
 63 |         const char *func_name{};
 64 |         CUDA_CHECK(libreCuFuncGetName(&func_name, func));
 65 |         std::cout << "  function \"" << func_name << "\"" << std::endl;
 66 |     }
 67 | 
 68 |     delete[] functions;
 69 | 
 70 |     // find function
 71 |     LibreCUFunction func{};
 72 |     CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));
 73 | 
 74 |     // set dynamic shared memory
 75 |     CUDA_CHECK(libreCuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 8192));
 76 | 
 77 |     // create stream
 78 |     LibreCUstream stream{};
 79 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 80 | 
 81 |     void *float_dst_va{};
 82 |     size_t n_elements = 50256 * 768; // 38,596,608 elements: an exact multiple of the 256-thread block size used below
 83 |     CUDA_CHECK(libreCuMemAlloc(&float_dst_va, n_elements * sizeof(float), true));
 84 | 
 85 |     auto *host_dst = new float[n_elements];
 86 | 
 87 |     void *params[] = {
 88 |             &float_dst_va, // dst
 89 |             &n_elements
 90 |     };
 91 |     CUDA_CHECK(
 92 |             libreCuLaunchKernel(func,
 93 |                                 n_elements/256, 1, 1,
 94 |                                 256, 1, 1,
 95 |                                 8192,
 96 |                                 stream,
 97 |                                 params, sizeof(params) / sizeof(void *),
 98 |                                 nullptr
 99 |             )
100 |     );
101 |     CUDA_CHECK(libreCuMemCpy(host_dst, float_dst_va, n_elements * sizeof(float), stream, false));
102 | 
103 |     // dispatch built up command buffer to GPU
104 |     CUDA_CHECK(libreCuStreamCommence(stream));
105 | 
106 |     // wait for work to complete
107 |     CUDA_CHECK(libreCuStreamAwait(stream));
108 | 
109 |     // verify that every element was written by the kernel
110 |     bool all_filled = true;
111 |     for (size_t i = 0; i < n_elements; i++) {
112 |         if (host_dst[i] != 1.0) {
113 |             std::cerr << "Not all values were filled!" << std::endl;
114 |             all_filled = false;
115 |             break;
116 |         }
117 |     }
118 | 
119 |     // free memory
120 |     CUDA_CHECK(libreCuMemFree(float_dst_va));
121 | 
122 |     delete[] host_dst;
123 | 
124 |     // destroy stream
125 |     CUDA_CHECK(libreCuStreamDestroy(stream));
126 | 
127 |     // unload module
128 |     CUDA_CHECK(libreCuModuleUnload(module));
129 | 
130 |     // destroy ctx
131 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
132 |     return all_filled ? EXIT_SUCCESS : EXIT_FAILURE; // a verification failure must fail the test run
133 | }


--------------------------------------------------------------------------------
/tests/indexing/write_float.cu:
--------------------------------------------------------------------------------
1 | // Fills dst[0..n) with 1.0f, one element per thread of a 1-D launch.
2 | // NOTE: write_float.cubin / write_float.ptx must be regenerated from this file for the change to take effect.
3 | extern "C" __global__ void write_float(float *dst, size_t n) {
4 |     // widen before multiplying so the index cannot wrap at 2^32 for large grids
5 |     size_t tid = (size_t) blockDim.x * blockIdx.x + threadIdx.x;
6 |     if (tid < n) { // threads past the end do nothing when the grid over-covers n
7 |         dst[tid] = 1.0f;
8 |     }
9 | }


--------------------------------------------------------------------------------
/tests/indexing/write_float.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/indexing/write_float.cubin


--------------------------------------------------------------------------------
/tests/indexing/write_float.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-34097967
 5 | // Cuda compilation tools, release 12.4, V12.4.131
 6 | // Based on NVVM 7.0.1
 7 | //
 8 | 
 9 | .version 8.4
10 | .target sm_80
11 | .address_size 64
12 | 
13 | 	// .globl	write_float
14 | 
15 | .visible .entry write_float(
16 | 	.param .u64 write_float_param_0,
17 | 	.param .u64 write_float_param_1
18 | )
19 | {
20 | 	.reg .b32 	%r<6>;
21 | 	.reg .b64 	%rd<5>;
22 | 
23 | 
24 | 	ld.param.u64 	%rd1, [write_float_param_0];
25 | 	cvta.to.global.u64 	%rd2, %rd1;
26 | 	mov.u32 	%r1, %ntid.x;
27 | 	mov.u32 	%r2, %ctaid.x;
28 | 	mov.u32 	%r3, %tid.x;
29 | 	mad.lo.s32 	%r4, %r1, %r2, %r3;
30 | 	mul.wide.u32 	%rd3, %r4, 4;
31 | 	add.s64 	%rd4, %rd2, %rd3;
32 | 	mov.u32 	%r5, 1065353216;
33 | 	st.global.u32 	[%rd4], %r5;
34 | 	ret;
35 | 
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/tests/kernel_struct_param/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         test_kernel_struct_param
 3 |         main.cpp
 4 | ) # test executable built from main.cpp only
 5 | target_link_libraries(
 6 |         test_kernel_struct_param
 7 |         PRIVATE
 8 |         driverapi
 9 | ) # links the driverapi library target privately
10 | 
11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/read_from_struct.cubin" ${CMAKE_BINARY_DIR}/tests/kernel_struct_param COPYONLY) # copies the cubin unmodified into the build tree; main.cpp opens it as "read_from_struct.cubin"


--------------------------------------------------------------------------------
/tests/kernel_struct_param/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <librecuda.h>
  2 | 
  3 | #include <iostream>
  4 | #include <iomanip>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <cstring>
  8 | 
  9 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 10 |     if (error != LIBRECUDA_SUCCESS) {
 11 |         const char *error_string;
 12 |         libreCuGetErrorString(error, &error_string);
 13 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
 14 |         exit(EXIT_FAILURE);
 15 |     }
 16 | };
 17 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 18 | 
 19 | struct struct_t { // host-side definition of the struct handed to the kernel by value (params[0] in main)
 20 |     int x, y, z;
 21 |     int w, h, d; // w is the only field main() sets explicitly (.w=64)
 22 |     char str[32];
 23 |     char me_ugly; // trailing single char: the struct is padded up to the 60 bytes asserted right below
 24 | };
 25 | static_assert(sizeof(struct_t) == 60);
 26 | 
 27 | int main() {
 28 |     CUDA_CHECK(libreCuInit(0));
 29 | 
 30 |     int device_count{};
 31 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 32 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 33 | 
 34 |     LibreCUdevice device{};
 35 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 36 | 
 37 |     LibreCUcontext ctx{};
 38 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 39 | 
 40 |     char name_buffer[256] = {};
 41 |     libreCuDeviceGetName(name_buffer, 256, device);
 42 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 43 |     LibreCUmodule module{};
 44 | 
 45 |     // read cubin file
 46 |     // the vector owns the image bytes and releases them automatically (the former new[] copy was never freed)
 47 |     std::vector<uint8_t> image;
 48 |     {
 49 |         std::ifstream input("read_from_struct.cubin", std::ios::binary);
 50 |         if (!input) {
 51 |             std::cerr << "Failed to open read_from_struct.cubin" << std::endl;
 52 |             return EXIT_FAILURE;
 53 |         }
 54 |         image.assign(
 55 |                 (std::istreambuf_iterator<char>(input)),
 56 |                 (std::istreambuf_iterator<char>()));
 57 |     }
 58 |     CUDA_CHECK(libreCuModuleLoadData(&module, image.data(), image.size()));
 59 | 
 60 |     // read functions
 61 |     uint32_t num_funcs{};
 62 |     CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
 63 |     std::cout << "Num functions: " << num_funcs << std::endl;
 64 | 
 65 |     auto *functions = new LibreCUFunction[num_funcs];
 66 |     CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
 67 | 
 68 |     for (size_t i = 0; i < num_funcs; i++) {
 69 |         LibreCUFunction func = functions[i];
 70 |         const char *func_name{};
 71 |         CUDA_CHECK(libreCuFuncGetName(&func_name, func));
 72 |         std::cout << "  function \"" << func_name << "\"" << std::endl;
 73 |     }
 74 | 
 75 |     delete[] functions;
 76 | 
 77 |     // find function
 78 |     LibreCUFunction func{};
 79 |     CUDA_CHECK(libreCuModuleGetFunction(&func, module, "read_from_struct"));
 80 |     // create stream
 81 |     LibreCUstream stream{};
 82 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 83 | 
 84 |     void *w_dst_va{};
 85 |     CUDA_CHECK(libreCuMemAlloc(&w_dst_va, sizeof(int), true));
 86 | 
 87 |     struct_t s = {
 88 |             .w=64,
 89 |     };
 90 | 
 91 |     void *params[] = {
 92 |             &s, // struct
 93 |             &w_dst_va, // dst
 94 |     };
 95 | 
 96 |     CUDA_CHECK(
 97 |             libreCuLaunchKernel(func,
 98 |                                 1, 1, 1,
 99 |                                 1, 1, 1,
100 |                                 8192,
101 |                                 stream,
102 |                                 params, sizeof(params) / sizeof(void *),
103 |                                 nullptr
104 |             )
105 |     );
106 | 
107 |     // dispatch built up command buffer to GPU
108 |     CUDA_CHECK(libreCuStreamCommence(stream));
109 | 
110 |     // wait for work to complete
111 |     CUDA_CHECK(libreCuStreamAwait(stream));
112 |     std::cout << "Dst value (post exec): " << *(int *) (w_dst_va) << std::endl; // the kernel writes s.w here, so 64 is the expected output
113 | 
114 |     // free memory
115 |     CUDA_CHECK(libreCuMemFree(w_dst_va));
116 | 
117 |     // destroy stream
118 |     CUDA_CHECK(libreCuStreamDestroy(stream));
119 | 
120 |     // unload module
121 |     CUDA_CHECK(libreCuModuleUnload(module));
122 | 
123 |     // destroy ctx
124 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
125 |     return 0;
126 | }


--------------------------------------------------------------------------------
/tests/kernel_struct_param/read_from_struct.cu:
--------------------------------------------------------------------------------
 1 | struct struct_t { // must stay layout-identical to struct_t in main.cpp, which fills this parameter on the host
 2 |     int x, y, z;
 3 |     int w, h, d;
 4 |     char str[32]; // was 33; the host declares 32, and the extra byte moved me_ugly from offset 56 to 57 on the device side
 5 |     char me_ugly;
 6 | };
 7 | 
 8 | extern "C" __global__ void read_from_struct(struct_t s, int *pWout) { // s arrives by value as a kernel parameter
 9 |     *pWout = s.w; // store the struct's w field through the output pointer
10 | }


--------------------------------------------------------------------------------
/tests/kernel_struct_param/read_from_struct.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/kernel_struct_param/read_from_struct.cubin


--------------------------------------------------------------------------------
/tests/kernel_struct_param/read_from_struct.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-34097967
 5 | // Cuda compilation tools, release 12.4, V12.4.131
 6 | // Based on NVVM 7.0.1
 7 | //
 8 | 
 9 | .version 8.4
10 | .target sm_80
11 | .address_size 64
12 | 
13 | 	// .globl	read_from_struct
14 | 
15 | .visible .entry read_from_struct(
16 | 	.param .align 4 .b8 read_from_struct_param_0[56],
17 | 	.param .u64 read_from_struct_param_1
18 | )
19 | {
20 | 	.reg .b32 	%r<2>;
21 | 	.reg .b64 	%rd<3>;
22 | 
23 | 
24 | 	ld.param.u64 	%rd1, [read_from_struct_param_1];
25 | 	ld.param.u32 	%r1, [read_from_struct_param_0+12];
26 | 	cvta.to.global.u64 	%rd2, %rd1;
27 | 	st.global.u32 	[%rd2], %r1;
28 | 	ret;
29 | 
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/tests/many_kernels_launch/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         many_kernels_launch
 3 |         main.cpp
 4 | ) # test executable built from main.cpp only
 5 | target_link_libraries(
 6 |         many_kernels_launch
 7 |         PRIVATE
 8 |         driverapi
 9 | ) # links the driverapi library target privately
10 | 
11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/empty_kernel.cubin" ${CMAKE_BINARY_DIR}/tests/many_kernels_launch COPYONLY) # copies the cubin unmodified into the build tree; main.cpp opens it as "empty_kernel.cubin"


--------------------------------------------------------------------------------
/tests/many_kernels_launch/empty_kernel.cu:
--------------------------------------------------------------------------------
1 | extern "C" __global__ void emtpy_kernel() { // no-op kernel. NOTE(review): the name is spelled "emtpy" and the host test looks it up by that exact string - rename both, and rebuild the cubin, together
2 | }


--------------------------------------------------------------------------------
/tests/many_kernels_launch/empty_kernel.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/many_kernels_launch/empty_kernel.cubin


--------------------------------------------------------------------------------
/tests/many_kernels_launch/empty_kernel.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-34097967
 5 | // Cuda compilation tools, release 12.4, V12.4.131
 6 | // Based on NVVM 7.0.1
 7 | //
 8 | 
 9 | .version 8.4
10 | .target sm_80
11 | .address_size 64
12 | 
13 | 	// .globl	emtpy_kernel
14 | 
15 | .visible .entry emtpy_kernel()
16 | {
17 | 
18 | 
19 | 
20 | 	ret;
21 | 
22 | }
23 | 
24 | 


--------------------------------------------------------------------------------
/tests/many_kernels_launch/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <chrono>
  2 | #include <cstdint>
  3 | #include <cstdio>
  4 | #include <cstdlib>
  5 | #include <cstring>
  6 | #include <fstream>
  7 | #include <iostream>
  8 | #include <iterator>
  9 | #include <string>
 10 | #include <vector>
 11 | #include <librecuda.h>
 12 | 
 13 | // Terminates the process with a diagnostic naming the call site when `error` is not
 14 | // LIBRECUDA_SUCCESS; returns normally otherwise.
 15 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 16 |     if (error != LIBRECUDA_SUCCESS) {
 17 |         // Start from null so a lookup that yields no string cannot leave garbage for printf.
 18 |         const char *error_string = nullptr;
 19 |         libreCuGetErrorString(error, &error_string);
 20 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line,
 21 |                error_string != nullptr ? error_string : "unknown error");
 22 |         exit(EXIT_FAILURE);
 23 |     }
 24 | }
 25 | // Wraps a LibreCUDA call so that a failure reports the file and line of the call itself.
 26 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 27 | 
 28 | 
 29 | // Loads the same cubin into `num_kernels` separate modules, enqueues one single-thread launch
 30 | // per module on one stream, and prints how long enqueueing plus submission took on the host.
 31 | int main() {
 32 |     CUDA_CHECK(libreCuInit(0));
 33 | 
 34 |     int device_count{};
 35 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 36 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 37 | 
 38 |     LibreCUdevice device{};
 39 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 40 | 
 41 |     LibreCUcontext ctx{};
 42 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 43 | 
 44 |     char name_buffer[256] = {};
 45 |     CUDA_CHECK(libreCuDeviceGetName(name_buffer, 256, device));
 46 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 47 | 
 48 |     // read cubin file; the vector owns the bytes and outlives every module created from them
 49 |     std::vector<uint8_t> image;
 50 |     {
 51 |         std::ifstream input("empty_kernel.cubin", std::ios::binary);
 52 |         if (!input) {
 53 |             // fail early with a clear message instead of handing an empty image to the loader
 54 |             std::cerr << "Failed to open empty_kernel.cubin" << std::endl;
 55 |             return EXIT_FAILURE;
 56 |         }
 57 |         image.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
 58 |     }
 59 | 
 60 |     // std::vector instead of runtime-sized stack arrays, which standard C++ does not allow
 61 |     const size_t num_kernels = 1025;
 62 |     std::vector<LibreCUmodule> modules(num_kernels);
 63 |     for (size_t i = 0; i < num_kernels; i++) {
 64 |         CUDA_CHECK(libreCuModuleLoadData(&modules[i], image.data(), image.size()));
 65 |     }
 66 | 
 67 |     // find functions
 68 |     std::vector<LibreCUFunction> funcs(num_kernels);
 69 |     for (size_t i = 0; i < num_kernels; i++) {
 70 |         CUDA_CHECK(libreCuModuleGetFunction(&funcs[i], modules[i], "emtpy_kernel"));
 71 |     }
 72 | 
 73 |     // create stream
 74 |     LibreCUstream stream{};
 75 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 76 | 
 77 |     // The kernel takes no arguments. A one-slot array keeps the pointer valid (a zero-length
 78 |     // array is not standard C++); the parameter count passed to the launch is still 0.
 79 |     void *params[1] = {nullptr};
 80 |     const size_t num_params = 0;
 81 | 
 82 |     auto start = std::chrono::high_resolution_clock::now();
 83 |     for (size_t i = 0; i < num_kernels; ++i) {
 84 |         CUDA_CHECK(libreCuLaunchKernel(funcs[i],
 85 |                             1, 1, 1,
 86 |                             1, 1, 1,
 87 |                             0,
 88 |                             stream,
 89 |                             params, num_params,
 90 |                             nullptr
 91 |         ));
 92 |     }
 93 |     // dispatch built up command buffer to GPU
 94 |     CUDA_CHECK(libreCuStreamCommence(stream));
 95 |     // `end` is taken before the await below, so the interval covers enqueueing and submission only
 96 |     auto end = std::chrono::high_resolution_clock::now();
 97 | 
 98 |     // wait for work to complete
 99 |     CUDA_CHECK(libreCuStreamAwait(stream));
100 | 
101 |     // Calculate the duration in seconds as a double
102 |     std::chrono::duration<double> elapsed = end - start;
103 |     double elapsedSeconds = elapsed.count();
104 | 
105 |     // Print the elapsed time
106 |     std::cout << "Average time: " << elapsedSeconds / num_kernels << ", Total: " << elapsedSeconds << std::endl;
107 | 
108 |     // destroy stream
109 |     CUDA_CHECK(libreCuStreamDestroy(stream));
110 | 
111 |     // unload modules
112 |     for (size_t i = 0; i < num_kernels; ++i) {
113 |         CUDA_CHECK(libreCuModuleUnload(modules[i]));
114 |     }
115 | 
116 |     // destroy ctx
117 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/tests/memcopy/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Test executable for the memcopy test, linked against the driverapi target.
2 | add_executable(test_memcopy main.cpp)
3 | target_link_libraries(test_memcopy PRIVATE driverapi)


--------------------------------------------------------------------------------
/tests/memcopy/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <librecuda.h>
 2 | 
 3 | #include <iostream>
 4 | 
 5 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 6 |     if (error != LIBRECUDA_SUCCESS) {
 7 |         const char *error_string;
 8 |         libreCuGetErrorString(error, &error_string);
 9 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
10 |         exit(EXIT_FAILURE);
11 |     }
12 | };
13 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
14 | 
15 | // Copies ten floats host -> device -> device -> host on one stream and prints what came back.
16 | int main() {
17 |     CUDA_CHECK(libreCuInit(0));
18 | 
19 |     int device_count{};
20 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
21 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
22 | 
23 |     LibreCUdevice device{};
24 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
25 | 
26 |     LibreCUcontext ctx{};
27 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
28 | 
29 |     // create stream
30 |     LibreCUstream stream{};
31 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
32 | 
33 |     // source data on the host
34 |     float host_array[] = {
35 |             1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f
36 |     };
37 |     // element count derived from the source array so both host buffers always have the same size
38 |     constexpr size_t num_elements = sizeof(host_array) / sizeof(host_array[0]);
39 | 
40 |     // zero-initialized host buffer that receives the data copied back from the device
41 |     float dst_host_array[num_elements] = {};
42 | 
43 |     // allocate memory
44 |     float *device_array_1{};
45 |     float *device_array_2{};
46 |     CUDA_CHECK(libreCuMemAlloc(reinterpret_cast<void **>(&device_array_1), sizeof(host_array)));
47 |     CUDA_CHECK(libreCuMemAlloc(reinterpret_cast<void **>(&device_array_2), sizeof(host_array)));
48 | 
49 |     // copy to gpu
50 |     CUDA_CHECK(libreCuMemCpy(device_array_1, host_array, sizeof(host_array), stream));
51 | 
52 |     // copy d2d
53 |     CUDA_CHECK(libreCuMemCpy(device_array_2, device_array_1, sizeof(host_array), stream));
54 | 
55 |     // copy back to host
56 |     CUDA_CHECK(libreCuMemCpy(dst_host_array, device_array_2, sizeof(host_array), stream));
57 | 
58 |     // commence stream
59 |     CUDA_CHECK(libreCuStreamCommence(stream));
60 |     CUDA_CHECK(libreCuStreamAwait(stream));
61 | 
62 |     // print the values copied back to the host
63 |     for (float value : dst_host_array) {
64 |         std::cout << value << ", ";
65 |     }
66 |     std::cout << std::endl;
67 | 
68 |     // destroy stream
69 |     CUDA_CHECK(libreCuStreamDestroy(stream));
70 | 
71 |     // free memory
72 |     CUDA_CHECK(libreCuMemFree(device_array_1));
73 |     CUDA_CHECK(libreCuMemFree(device_array_2));
74 | 
75 |     // destroy ctx
76 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
77 |     return 0;
78 | }


--------------------------------------------------------------------------------
/tests/stream_events/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Test executable for the stream/event timing test, linked against the driverapi target.
 2 | add_executable(
 3 |         test_stream_events
 4 |         main.cpp
 5 | )
 6 | target_link_libraries(
 7 |         test_stream_events
 8 |         PRIVATE
 9 |         driverapi
10 | )
11 | 
12 | # main.cpp opens "write_float.cubin" by relative path, so copy it into this directory's build
13 | # folder. CMAKE_CURRENT_BINARY_DIR is used instead of a hand-written path: this directory is
14 | # tests/stream_events, whereas "test_stream_events" is only the target name.
15 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" "${CMAKE_CURRENT_BINARY_DIR}/write_float.cubin" COPYONLY)


--------------------------------------------------------------------------------
/tests/stream_events/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <librecuda.h>
  2 | 
  3 | #include <iostream>
  4 | #include <vector>
  5 | #include <fstream>
  6 | #include <cstring>
  7 | 
  8 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
  9 |     if (error != LIBRECUDA_SUCCESS) {
 10 |         const char *error_string;
 11 |         libreCuGetErrorString(error, &error_string);
 12 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
 13 |         exit(EXIT_FAILURE);
 14 |     }
 15 | };
 16 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 17 | 
 18 | // Times a single launch of the "write_float" kernel with a pair of events and prints the
 19 | // elapsed time together with the value the kernel wrote.
 20 | int main() {
 21 |     CUDA_CHECK(libreCuInit(0));
 22 | 
 23 |     int device_count{};
 24 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 25 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 26 | 
 27 |     LibreCUdevice device{};
 28 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 29 | 
 30 |     LibreCUcontext ctx{};
 31 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 32 | 
 33 |     char name_buffer[256] = {};
 34 |     CUDA_CHECK(libreCuDeviceGetName(name_buffer, 256, device));
 35 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 36 | 
 37 |     LibreCUmodule module{};
 38 | 
 39 |     // read cubin file; the vector owns the bytes until main returns, so nothing is leaked
 40 |     std::vector<uint8_t> image;
 41 |     {
 42 |         std::ifstream input("write_float.cubin", std::ios::binary);
 43 |         if (!input) {
 44 |             // fail early with a clear message instead of handing an empty image to the loader
 45 |             std::cerr << "Failed to open write_float.cubin" << std::endl;
 46 |             return EXIT_FAILURE;
 47 |         }
 48 |         image.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
 49 |     }
 50 |     CUDA_CHECK(libreCuModuleLoadData(&module, image.data(), image.size()));
 51 | 
 52 |     // read functions
 53 |     uint32_t num_funcs{};
 54 |     CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
 55 |     std::cout << "Num functions: " << num_funcs << std::endl;
 56 | 
 57 |     // list the name of every function in the module
 58 |     {
 59 |         std::vector<LibreCUFunction> functions(num_funcs);
 60 |         CUDA_CHECK(libreCuModuleEnumerateFunctions(functions.data(), num_funcs, module));
 61 |         for (LibreCUFunction listed_func : functions) {
 62 |             const char *func_name{};
 63 |             CUDA_CHECK(libreCuFuncGetName(&func_name, listed_func));
 64 |             std::cout << "  function \"" << func_name << "\"" << std::endl;
 65 |         }
 66 |     }
 67 | 
 68 |     // find function
 69 |     LibreCUFunction func{};
 70 |     CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float"));
 71 | 
 72 |     // create stream
 73 |     LibreCUstream stream{};
 74 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 75 | 
 76 |     // NOTE(review): the third argument `true` presumably requests host-accessible memory, since
 77 |     // both pointers are dereferenced on the host below - confirm against librecuda.h.
 78 |     void *float_dst_va{};
 79 |     void *float_src_va{};
 80 |     CUDA_CHECK(libreCuMemAlloc(&float_dst_va, sizeof(float), true));
 81 |     CUDA_CHECK(libreCuMemAlloc(&float_src_va, sizeof(float), true));
 82 |     *(float *) float_dst_va = 0.0f;
 83 |     *(float *) float_src_va = 1.0f;
 84 | 
 85 |     LibreCUEvent start{}, end{};
 86 |     CUDA_CHECK(libreCuEventCreate(&start, 0));
 87 |     CUDA_CHECK(libreCuEventCreate(&end, 0));
 88 | 
 89 |     // bracket the launch with the two events
 90 |     CUDA_CHECK(libreCuEventRecord(start, stream));
 91 |     {
 92 |         void *params[] = {
 93 |                 &float_dst_va, &float_src_va
 94 |         };
 95 |         CUDA_CHECK(
 96 |                 libreCuLaunchKernel(func,
 97 |                                     1, 1, 1,
 98 |                                     1, 1, 1,
 99 |                                     0,
100 |                                     stream,
101 |                                     params, sizeof(params) / sizeof(void *),
102 |                                     nullptr
103 |                 )
104 |         );
105 |     }
106 |     CUDA_CHECK(libreCuEventRecord(end, stream));
107 |     CUDA_CHECK(libreCuStreamCommence(stream));
108 |     CUDA_CHECK(libreCuEventSynchronize(end));
109 | 
110 |     float elapsed{};
111 |     CUDA_CHECK(libreCuEventElapsedTime(&elapsed, start, end));
112 |     std::cout << "Elapsed: " << elapsed << "ms" << std::endl;
113 | 
114 |     CUDA_CHECK(libreCuStreamAwait(stream));
115 | 
116 |     std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl;
117 | 
118 |     // NOTE(review): `start` and `end` are never released; if librecuda.h offers an event-destroy call, use it here.
119 | 
120 |     // free memory
121 |     CUDA_CHECK(libreCuMemFree(float_dst_va));
122 |     CUDA_CHECK(libreCuMemFree(float_src_va));
123 | 
124 |     // destroy stream
125 |     CUDA_CHECK(libreCuStreamDestroy(stream));
126 | 
127 |     // unload module
128 |     CUDA_CHECK(libreCuModuleUnload(module));
129 | 
130 |     // destroy ctx
131 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
132 |     return 0;
133 | }


--------------------------------------------------------------------------------
/tests/stream_events/write_float.cu:
--------------------------------------------------------------------------------
 1 | // Runs a long accumulation loop, then stores 1.0f + *input to *dst.
 2 | // The loop presumably exists only to give the kernel a measurable runtime for the event test.
 3 | //   dst:   location that receives the result
 4 | //   input: location of the float that is added to the computed 1.0
 5 | extern "C" __global__ void write_float(float *dst, float *input) {
 6 |     double x = 0;
 7 |     int n = 100000000;
 8 |     // add 1.0 n times, so x ends up equal to n
 9 |     for (int i = 0; i < n; i++) {
10 |         x += 1.0;
11 |     }
12 |     // n / n, i.e. x becomes 1.0
13 |     x /= n;
14 |     *dst = (float) x + (*input);
15 | }


--------------------------------------------------------------------------------
/tests/stream_events/write_float.cu.asm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/stream_events/write_float.cu.asm


--------------------------------------------------------------------------------
/tests/stream_events/write_float.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/stream_events/write_float.cubin


--------------------------------------------------------------------------------
/tests/stream_events/write_float.ptx:
--------------------------------------------------------------------------------
  1 | //
  2 | // Generated by NVIDIA NVVM Compiler
  3 | //
  4 | // Compiler Build ID: CL-34097967
  5 | // Cuda compilation tools, release 12.4, V12.4.131
  6 | // Based on NVVM 7.0.1
  7 | //
  8 | 
  9 | .version 8.4
 10 | .target sm_80
 11 | .address_size 64
 12 | 
 13 | 	// .globl	write_float
 14 | 
 15 | .visible .entry write_float(
 16 | 	.param .u64 write_float_param_0,
 17 | 	.param .u64 write_float_param_1
 18 | )
 19 | {
 20 | 	.reg .pred 	%p<2>;
 21 | 	.reg .f32 	%f<4>;
 22 | 	.reg .b32 	%r<5>;
 23 | 	.reg .f64 	%fd<69>;
 24 | 	.reg .b64 	%rd<5>;
 25 | 
 26 | 
 27 | 	ld.param.u64 	%rd3, [write_float_param_0];
 28 | 	ld.param.u64 	%rd2, [write_float_param_1];
 29 | 	cvta.to.global.u64 	%rd1, %rd3;
 30 | 	mov.f64 	%fd68, 0d0000000000000000;
 31 | 	mov.u32 	%r4, 0;
 32 | 
 33 | $L__BB0_1:
 34 | 	add.f64 	%fd4, %fd68, 0d3FF0000000000000;
 35 | 	add.f64 	%fd5, %fd4, 0d3FF0000000000000;
 36 | 	add.f64 	%fd6, %fd5, 0d3FF0000000000000;
 37 | 	add.f64 	%fd7, %fd6, 0d3FF0000000000000;
 38 | 	add.f64 	%fd8, %fd7, 0d3FF0000000000000;
 39 | 	add.f64 	%fd9, %fd8, 0d3FF0000000000000;
 40 | 	add.f64 	%fd10, %fd9, 0d3FF0000000000000;
 41 | 	add.f64 	%fd11, %fd10, 0d3FF0000000000000;
 42 | 	add.f64 	%fd12, %fd11, 0d3FF0000000000000;
 43 | 	add.f64 	%fd13, %fd12, 0d3FF0000000000000;
 44 | 	add.f64 	%fd14, %fd13, 0d3FF0000000000000;
 45 | 	add.f64 	%fd15, %fd14, 0d3FF0000000000000;
 46 | 	add.f64 	%fd16, %fd15, 0d3FF0000000000000;
 47 | 	add.f64 	%fd17, %fd16, 0d3FF0000000000000;
 48 | 	add.f64 	%fd18, %fd17, 0d3FF0000000000000;
 49 | 	add.f64 	%fd19, %fd18, 0d3FF0000000000000;
 50 | 	add.f64 	%fd20, %fd19, 0d3FF0000000000000;
 51 | 	add.f64 	%fd21, %fd20, 0d3FF0000000000000;
 52 | 	add.f64 	%fd22, %fd21, 0d3FF0000000000000;
 53 | 	add.f64 	%fd23, %fd22, 0d3FF0000000000000;
 54 | 	add.f64 	%fd24, %fd23, 0d3FF0000000000000;
 55 | 	add.f64 	%fd25, %fd24, 0d3FF0000000000000;
 56 | 	add.f64 	%fd26, %fd25, 0d3FF0000000000000;
 57 | 	add.f64 	%fd27, %fd26, 0d3FF0000000000000;
 58 | 	add.f64 	%fd28, %fd27, 0d3FF0000000000000;
 59 | 	add.f64 	%fd29, %fd28, 0d3FF0000000000000;
 60 | 	add.f64 	%fd30, %fd29, 0d3FF0000000000000;
 61 | 	add.f64 	%fd31, %fd30, 0d3FF0000000000000;
 62 | 	add.f64 	%fd32, %fd31, 0d3FF0000000000000;
 63 | 	add.f64 	%fd33, %fd32, 0d3FF0000000000000;
 64 | 	add.f64 	%fd34, %fd33, 0d3FF0000000000000;
 65 | 	add.f64 	%fd35, %fd34, 0d3FF0000000000000;
 66 | 	add.f64 	%fd36, %fd35, 0d3FF0000000000000;
 67 | 	add.f64 	%fd37, %fd36, 0d3FF0000000000000;
 68 | 	add.f64 	%fd38, %fd37, 0d3FF0000000000000;
 69 | 	add.f64 	%fd39, %fd38, 0d3FF0000000000000;
 70 | 	add.f64 	%fd40, %fd39, 0d3FF0000000000000;
 71 | 	add.f64 	%fd41, %fd40, 0d3FF0000000000000;
 72 | 	add.f64 	%fd42, %fd41, 0d3FF0000000000000;
 73 | 	add.f64 	%fd43, %fd42, 0d3FF0000000000000;
 74 | 	add.f64 	%fd44, %fd43, 0d3FF0000000000000;
 75 | 	add.f64 	%fd45, %fd44, 0d3FF0000000000000;
 76 | 	add.f64 	%fd46, %fd45, 0d3FF0000000000000;
 77 | 	add.f64 	%fd47, %fd46, 0d3FF0000000000000;
 78 | 	add.f64 	%fd48, %fd47, 0d3FF0000000000000;
 79 | 	add.f64 	%fd49, %fd48, 0d3FF0000000000000;
 80 | 	add.f64 	%fd50, %fd49, 0d3FF0000000000000;
 81 | 	add.f64 	%fd51, %fd50, 0d3FF0000000000000;
 82 | 	add.f64 	%fd52, %fd51, 0d3FF0000000000000;
 83 | 	add.f64 	%fd53, %fd52, 0d3FF0000000000000;
 84 | 	add.f64 	%fd54, %fd53, 0d3FF0000000000000;
 85 | 	add.f64 	%fd55, %fd54, 0d3FF0000000000000;
 86 | 	add.f64 	%fd56, %fd55, 0d3FF0000000000000;
 87 | 	add.f64 	%fd57, %fd56, 0d3FF0000000000000;
 88 | 	add.f64 	%fd58, %fd57, 0d3FF0000000000000;
 89 | 	add.f64 	%fd59, %fd58, 0d3FF0000000000000;
 90 | 	add.f64 	%fd60, %fd59, 0d3FF0000000000000;
 91 | 	add.f64 	%fd61, %fd60, 0d3FF0000000000000;
 92 | 	add.f64 	%fd62, %fd61, 0d3FF0000000000000;
 93 | 	add.f64 	%fd63, %fd62, 0d3FF0000000000000;
 94 | 	add.f64 	%fd64, %fd63, 0d3FF0000000000000;
 95 | 	add.f64 	%fd65, %fd64, 0d3FF0000000000000;
 96 | 	add.f64 	%fd66, %fd65, 0d3FF0000000000000;
 97 | 	add.f64 	%fd68, %fd66, 0d3FF0000000000000;
 98 | 	add.s32 	%r4, %r4, 64;
 99 | 	setp.ne.s32 	%p1, %r4, 100000000;
100 | 	@%p1 bra 	$L__BB0_1;
101 | 
102 | 	cvta.to.global.u64 	%rd4, %rd2;
103 | 	div.rn.f64 	%fd67, %fd68, 0d4197D78400000000;
104 | 	cvt.rn.f32.f64 	%f1, %fd67;
105 | 	ld.global.f32 	%f2, [%rd4];
106 | 	add.f32 	%f3, %f2, %f1;
107 | 	st.global.f32 	[%rd1], %f3;
108 | 	ret;
109 | 
110 | }
111 | 
112 | 


--------------------------------------------------------------------------------
/tests/write_float/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(
 2 |         test_write_float
 3 |         main.cpp
 4 | )
 5 | target_link_libraries(
 6 |         test_write_float
 7 |         PRIVATE
 8 |         driverapi
 9 | )
10 | 
11 | configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY)


--------------------------------------------------------------------------------
/tests/write_float/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <librecuda.h>
  2 | 
  3 | #include <iostream>
  4 | #include <cstdint>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <cstring>
  8 | #include <iomanip>
  9 | 
 10 | inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
 11 |     if (error != LIBRECUDA_SUCCESS) {
 12 |         const char *error_string;
 13 |         libreCuGetErrorString(error, &error_string);
 14 |         printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
 15 |         exit(EXIT_FAILURE);
 16 |     }
 17 | };
 18 | #define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
 19 | 
 20 | // Launches the "write_float_sum" kernel once with a pointer, a short and a float argument and
 21 | // prints the destination value before and after the kernel has run.
 22 | int main() {
 23 |     CUDA_CHECK(libreCuInit(0));
 24 | 
 25 |     int device_count{};
 26 |     CUDA_CHECK(libreCuDeviceGetCount(&device_count));
 27 |     std::cout << "Device count: " + std::to_string(device_count) << std::endl;
 28 | 
 29 |     LibreCUdevice device{};
 30 |     CUDA_CHECK(libreCuDeviceGet(&device, 0));
 31 | 
 32 |     LibreCUcontext ctx{};
 33 |     CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
 34 | 
 35 |     char name_buffer[256] = {};
 36 |     CUDA_CHECK(libreCuDeviceGetName(name_buffer, 256, device));
 37 |     std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
 38 | 
 39 |     LibreCUmodule module{};
 40 | 
 41 |     // read cubin file; the vector owns the bytes until main returns, so nothing is leaked
 42 |     std::vector<uint8_t> image;
 43 |     {
 44 |         std::ifstream input("write_float.cubin", std::ios::binary);
 45 |         if (!input) {
 46 |             // fail early with a clear message instead of handing an empty image to the loader
 47 |             std::cerr << "Failed to open write_float.cubin" << std::endl;
 48 |             return EXIT_FAILURE;
 49 |         }
 50 |         image.assign(std::istreambuf_iterator<char>(input), std::istreambuf_iterator<char>());
 51 |     }
 52 |     CUDA_CHECK(libreCuModuleLoadData(&module, image.data(), image.size()));
 53 | 
 54 |     // read functions
 55 |     uint32_t num_funcs{};
 56 |     CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
 57 |     std::cout << "Num functions: " << num_funcs << std::endl;
 58 | 
 59 |     // list the name of every function in the module
 60 |     {
 61 |         std::vector<LibreCUFunction> functions(num_funcs);
 62 |         CUDA_CHECK(libreCuModuleEnumerateFunctions(functions.data(), num_funcs, module));
 63 |         for (LibreCUFunction listed_func : functions) {
 64 |             const char *func_name{};
 65 |             CUDA_CHECK(libreCuFuncGetName(&func_name, listed_func));
 66 |             std::cout << "  function \"" << func_name << "\"" << std::endl;
 67 |         }
 68 |     }
 69 | 
 70 |     // find function
 71 |     LibreCUFunction func{};
 72 |     CUDA_CHECK(libreCuModuleGetFunction(&func, module, "write_float_sum"));
 73 | 
 74 |     // create stream
 75 |     LibreCUstream stream{};
 76 |     CUDA_CHECK(libreCuStreamCreate(&stream, 0));
 77 | 
 78 |     // NOTE(review): the third argument `true` presumably requests host-accessible memory, since
 79 |     // the pointer is dereferenced on the host below - confirm against librecuda.h.
 80 |     void *float_dst_va{};
 81 |     CUDA_CHECK(libreCuMemAlloc(&float_dst_va, sizeof(float), true));
 82 | 
 83 |     float float_value = 0.31415f;
 84 |     short short_value = 314;
 85 | 
 86 |     std::cout << std::fixed;
 87 |     std::cout << std::setprecision(5);
 88 | 
 89 |     std::cout << "A value: " << short_value << std::endl;
 90 |     std::cout << "B value: " << float_value << std::endl;
 91 |     // NOTE(review): nothing in this program writes the destination before this read.
 92 |     std::cout << "Dst value (pre exec): " << *(float *) (float_dst_va) << std::endl;
 93 | 
 94 |     // kernel arguments in declaration order; write_float.cu computes *dst = a + b + global_int
 95 |     void *params[] = {
 96 |             &float_dst_va, // dst
 97 |             &short_value, // a
 98 |             &float_value // b
 99 |     };
100 |     CUDA_CHECK(
101 |             libreCuLaunchKernel(func,
102 |                                 1, 1, 1,
103 |                                 1, 1, 1,
104 |                                 0,
105 |                                 stream,
106 |                                 params, sizeof(params) / sizeof(void *),
107 |                                 nullptr
108 |             )
109 |     );
110 | 
111 |     // dispatch built up command buffer to GPU
112 |     CUDA_CHECK(libreCuStreamCommence(stream));
113 | 
114 |     // wait for work to complete
115 |     CUDA_CHECK(libreCuStreamAwait(stream));
116 |     std::cout << "Dst value (post exec): " << *(float *) (float_dst_va) << std::endl;
117 | 
118 |     // free memory
119 |     CUDA_CHECK(libreCuMemFree(float_dst_va));
120 | 
121 |     // destroy stream
122 |     CUDA_CHECK(libreCuStreamDestroy(stream));
123 | 
124 |     // unload module
125 |     CUDA_CHECK(libreCuModuleUnload(module));
126 | 
127 |     // destroy ctx
128 |     CUDA_CHECK(libreCuCtxDestroy(ctx));
129 |     return 0;
130 | }


--------------------------------------------------------------------------------
/tests/write_float/write_float.cu:
--------------------------------------------------------------------------------
 1 | // Copies the float at *src to *dst.
 2 | extern "C" __global__ void write_float_ptr(float *dst, float *src) {
 3 |     *dst = *src;
 4 | }
 5 | 
 6 | // Stores the by-value argument `value` to *dst.
 7 | extern "C" __global__ void write_float_value(float *dst, float value) {
 8 |     *dst = value;
 9 | }
10 | 
11 | // Device-side global (0x69 == 105) that write_float_sum reads in addition to its arguments.
12 | __device__ int global_int = 0x69;
13 | 
14 | // Stores a + b + global_int to *dst, taking a 16-bit integer and a float argument by value.
15 | extern "C" __global__ void write_float_sum(float *dst, short a, float b) {
16 |     *dst = (a + b + global_int);
17 | }


--------------------------------------------------------------------------------
/tests/write_float/write_float.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mikex86/LibreCuda/7470f81a5c910c3b2c6e0088fb07d55338b5041d/tests/write_float/write_float.cubin


--------------------------------------------------------------------------------
/tests/write_float/write_float.ptx:
--------------------------------------------------------------------------------
 1 | //
 2 | // Generated by NVIDIA NVVM Compiler
 3 | //
 4 | // Compiler Build ID: CL-33961263
 5 | // Cuda compilation tools, release 12.4, V12.4.99
 6 | // Based on NVVM 7.0.1
 7 | //
 8 | 
 9 | .version 8.4
10 | .target sm_80
11 | .address_size 64
12 | 
13 | 	// .globl	write_float_ptr
14 | .global .align 4 .u32 global_int = 105;
15 | 
16 | .visible .entry write_float_ptr(
17 | 	.param .u64 write_float_ptr_param_0,
18 | 	.param .u64 write_float_ptr_param_1
19 | )
20 | {
21 | 	.reg .f32 	%f<2>;
22 | 	.reg .b64 	%rd<5>;
23 | 
24 | 
25 | 	ld.param.u64 	%rd1, [write_float_ptr_param_0];
26 | 	ld.param.u64 	%rd2, [write_float_ptr_param_1];
27 | 	cvta.to.global.u64 	%rd3, %rd1;
28 | 	cvta.to.global.u64 	%rd4, %rd2;
29 | 	ld.global.f32 	%f1, [%rd4];
30 | 	st.global.f32 	[%rd3], %f1;
31 | 	ret;
32 | 
33 | }
34 | 	// .globl	write_float_value
35 | .visible .entry write_float_value(
36 | 	.param .u64 write_float_value_param_0,
37 | 	.param .f32 write_float_value_param_1
38 | )
39 | {
40 | 	.reg .f32 	%f<2>;
41 | 	.reg .b64 	%rd<3>;
42 | 
43 | 
44 | 	ld.param.u64 	%rd1, [write_float_value_param_0];
45 | 	ld.param.f32 	%f1, [write_float_value_param_1];
46 | 	cvta.to.global.u64 	%rd2, %rd1;
47 | 	st.global.f32 	[%rd2], %f1;
48 | 	ret;
49 | 
50 | }
51 | 	// .globl	write_float_sum
52 | .visible .entry write_float_sum(
53 | 	.param .u64 write_float_sum_param_0,
54 | 	.param .u16 write_float_sum_param_1,
55 | 	.param .f32 write_float_sum_param_2
56 | )
57 | {
58 | 	.reg .b16 	%rs<2>;
59 | 	.reg .f32 	%f<6>;
60 | 	.reg .b32 	%r<2>;
61 | 	.reg .b64 	%rd<3>;
62 | 
63 | 
64 | 	ld.param.u64 	%rd1, [write_float_sum_param_0];
65 | 	ld.param.u16 	%rs1, [write_float_sum_param_1];
66 | 	ld.param.f32 	%f1, [write_float_sum_param_2];
67 | 	cvta.to.global.u64 	%rd2, %rd1;
68 | 	cvt.rn.f32.s16 	%f2, %rs1;
69 | 	add.f32 	%f3, %f2, %f1;
70 | 	ld.global.u32 	%r1, [global_int];
71 | 	cvt.rn.f32.s32 	%f4, %r1;
72 | 	add.f32 	%f5, %f3, %f4;
73 | 	st.global.f32 	[%rd2], %f5;
74 | 	ret;
75 | 
76 | }
77 | 
78 | 


--------------------------------------------------------------------------------