├── vsov ├── config.h.in ├── CMakeLists.txt ├── win32.cpp └── README.md ├── vsmigx ├── config.h.in ├── CMakeLists.txt ├── win32.cpp └── README.md ├── vsncnn ├── config.h.in ├── onnx2ncnn.hpp └── CMakeLists.txt ├── vsort ├── config.h.in ├── CMakeLists.txt ├── README.md └── win32.cpp ├── vstrt ├── config.h.in ├── longpath.manifest ├── cuda_helper.h ├── trtexec │ ├── CMakeLists.txt │ ├── trtexec.patch │ └── logfile.cpp ├── cuda_utils.h ├── win32.cpp ├── CMakeLists.txt ├── inference_helper.h ├── README.md ├── utils.h └── trt_utils.h ├── common ├── onnx_utils.h ├── convert_float_to_float16.h └── onnx_utils.cpp ├── .github └── workflows │ ├── linux-trt.yml │ ├── linux-trt-arm64.yml │ ├── linux-trt-rtx.yml │ ├── linux-migx.yml │ ├── windows-hip-dependency.yml │ ├── windows-migx.yml │ ├── windows-trt_rtx.yml │ ├── linux-ncnn.yml │ ├── windows-cuda-dependency.yml │ ├── macos-ort.yml │ ├── linux-ov.yml │ ├── linux-ov-arm64.yml │ ├── windows-trt.yml │ ├── linux-ort.yml │ ├── windows-ncnn.yml │ ├── windows-ov.yml │ ├── windows-ort.yml │ └── windows-release.yml └── README.md /vsov/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsmigx/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsncnn/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsort/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vstrt/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsncnn/onnx2ncnn.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ONNX2NCNN_HPP 2 | #define ONNX2NCNN_HPP 3 | 4 | #include <optional> 5 | #include <string> 6 | #include <utility> 7 | 8 | #include <onnx/onnx_pb.h> 9 | 10 | extern std::optional<std::pair<std::string, std::string>> onnx2ncnn(ONNX_NAMESPACE::ModelProto & model); 11 | 12 | #endif // ONNX2NCNN_HPP 13 | -------------------------------------------------------------------------------- /common/onnx_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef ONNX_UTILS_H 2 | #define ONNX_UTILS_H 3 | 4 | #include <cstdint> 5 | #include <string> 6 | #include <string_view> 7 | #include <variant> 8 | 9 | #include <onnx/onnx_pb.h> 10 | 11 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 12 | const std::string_view & path, 13 | int64_t tile_w, 14 | int64_t tile_h, 15 | bool path_is_serialization 16 | ) noexcept; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /vstrt/longpath.manifest: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8" standalone="yes"?> 2 | <assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0"> 3 | <application xmlns="urn:schemas-microsoft-com:asm.v3"> 4 | <windowsSettings xmlns:ws2="http://schemas.microsoft.com/SMI/2016/WindowsSettings"> 5 | <ws2:longPathAware>true</ws2:longPathAware> 6 | </windowsSettings> 7 | </application> 8 | </assembly> 9 | -------------------------------------------------------------------------------- /common/convert_float_to_float16.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERT_FLOAT_TO_FLOAT16_H 2 | #define CONVERT_FLOAT_TO_FLOAT16_H 3 | 4 | #include <string> 5 | #include <unordered_set> 6 | 7 | #include <onnx/onnx_pb.h> 8 | 9 | void convert_float_to_float16( 10 | ONNX_NAMESPACE::ModelProto & model, 11 | bool force_fp16_initializers, 12 | // bool keep_io_types = True, 13 | // bool disable_shape_infer = True, 14 | // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST, 15 | // const std::optional<std::unordered_set<std::string>> op_block_list = {}, 16 | const std::unordered_set<std::string> & op_block_list, 17 | bool cast_input = true, 18 | bool cast_output = true 19 | ) noexcept; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /vstrt/cuda_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_HELPER_H_ 2 | #define VSTRT_CUDA_HELPER_H_ 3 | 4 | #include <string> 5 | 6 | #include <cuda_runtime_api.h> 7 | 8 | #define checkError(expr) do { \ 9 | using namespace std::string_literals; \ 10 | cudaError_t __err = expr; \ 11 | if (__err != cudaSuccess) { \ 12 | const char * message = cudaGetErrorString(__err); \ 13 | return set_error("'"s + # expr + "' failed: " + message); \ 14 | } \ 15 | } while(0) 16 | 17 | #endif // VSTRT_CUDA_HELPER_H_ 18 | -------------------------------------------------------------------------------- /vstrt/trtexec/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(trtexec LANGUAGES CXX) 4 | 5 | find_package(CUDAToolkit REQUIRED) 6 | 7 | add_executable(trtexec 8 | $<$<PLATFORM_ID:Windows>: longpath.manifest> 9 | trtexec.cpp 10 | logfile.cpp 11 | ../common/bfloat16.cpp 12 | ../common/debugTensorWriter.cpp 13 | ../common/logger.cpp 14 | ../common/sampleDevice.cpp 15 | ../common/sampleEngines.cpp 16 | ../common/sampleInference.cpp 17 | ../common/sampleOptions.cpp 18 | ../common/sampleReporting.cpp 19 | ../common/sampleUtils.cpp 20 | ../../shared/utils/fileLock.cpp 21 | ../../shared/utils/cacheUtils.cpp 22 | ) 23 | 24 | target_include_directories(trtexec PRIVATE 25 | ../common 26 | ..
27 | ../../include 28 | ../../shared 29 | ) 30 | 31 | target_link_libraries(trtexec PRIVATE CUDA::cudart_static) 32 | 33 | install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 34 | -------------------------------------------------------------------------------- /vstrt/trtexec/trtexec.patch: -------------------------------------------------------------------------------- 1 | diff --git a/shared/utils/fileLock.cpp b/shared/utils/fileLock.cpp 2 | index e155c0b..de6bce2 100644 3 | --- a/shared/utils/fileLock.cpp 4 | +++ b/shared/utils/fileLock.cpp 5 | @@ -35,8 +35,11 @@ FileLock::FileLock(ILogger& logger, std::string const& fileName) 6 | ss << "Trying to set exclusive file lock " << lockFileName << std::endl; 7 | mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); 8 | } 9 | + int size = MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, nullptr, 0); 10 | + std::wstring lockFileNameW (size, L'\0'); 11 | + MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, &lockFileNameW[0], size); 12 | // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided 13 | - mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); 14 | + mHandle = CreateFileW(lockFileNameW.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE | FILE_ATTRIBUTE_TEMPORARY, NULL); 15 | if (mHandle == INVALID_HANDLE_VALUE) 16 | { 17 | throw std::runtime_error("Failed to lock " + lockFileName + "!"); 18 | -------------------------------------------------------------------------------- /vstrt/trtexec/logfile.cpp: -------------------------------------------------------------------------------- 1 | // When $TRTEXEC_LOG_FILE is set, redirect stdout and stderr to the specified 2 | // file as well. 3 | #include <cstdio> 4 | #include <cstdlib> 5 | #include <fstream> 6 | #include <iostream> 7 | 8 | namespace { 9 | static struct redirect { 10 | class teebuf: public std::streambuf { 11 | public: 12 | teebuf(std::streambuf *a, std::streambuf *b): s1(a), s2(b) {} 13 | private: 14 | std::streambuf *s1, *s2; 15 | 16 | virtual int overflow(int c) override { 17 | if (c == EOF) 18 | return EOF; 19 | else { 20 | int r1 = s1->sputc(c); 21 | int r2 = s2->sputc(c); 22 | return (r1 == EOF || r2 == EOF) ? EOF : c; 23 | } 24 | } 25 | 26 | virtual int sync() override { 27 | int r1 = s1->pubsync(); 28 | int r2 = s2->pubsync(); 29 | return (r1 == 0 && r2 == 0) ?
0 : -1; 30 | } 31 | }; 32 | redirect() { 33 | const char *fn = getenv("TRTEXEC_LOG_FILE"); 34 | if (fn) { 35 | static std::ofstream ofs(fn, std::ios::app); 36 | static teebuf out(ofs.rdbuf(), std::cout.rdbuf()); 37 | static teebuf err(ofs.rdbuf(), std::cerr.rdbuf()); 38 | std::cout.rdbuf(&out); 39 | std::cerr.rdbuf(&err); 40 | } 41 | } 42 | } _; 43 | } // namespace 44 | -------------------------------------------------------------------------------- /vsmigx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-migraphx VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | 9 | find_package(migraphx REQUIRED CONFIG) 10 | find_package(hip REQUIRED CONFIG) 11 | 12 | add_library(vsmigx SHARED vs_migraphx.cpp win32.cpp) 13 | 14 | target_include_directories(vsmigx PRIVATE ${VAPOURSYNTH_INCLUDE_DIRECTORY}) 15 | 16 | target_link_libraries(vsmigx PRIVATE migraphx::c hip::host) 17 | 18 | set_target_properties(vsmigx PROPERTIES 19 | CXX_EXTENSIONS OFF 20 | POSITION_INDEPENDENT_CODE ON 21 | CXX_STANDARD 20 22 | CXX_STANDARD_REQUIRED ON 23 | ) 24 | 25 | if (WIN32) 26 | target_link_options(vsmigx PRIVATE 27 | "/DELAYLOAD:migraphx_c.dll" 28 | "/DELAYLOAD:amdhip64_6.dll" 29 | "delayimp.lib" 30 | ) 31 | endif() 32 | 33 | find_package(Git REQUIRED) 34 | execute_process( 35 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 36 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 37 | OUTPUT_VARIABLE VCS_TAG 38 | ) 39 | string(STRIP ${VCS_TAG} VCS_TAG) 40 | configure_file(config.h.in config.h) 41 | target_include_directories(vsmigx PUBLIC "${PROJECT_BINARY_DIR}") 42 | 43 | install(TARGETS vsmigx 44 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 45 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 46 | ) 47 | -------------------------------------------------------------------------------- /vsncnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ncnn VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | 7 | find_package(protobuf REQUIRED CONFIG) 8 | find_package(ONNX REQUIRED CONFIG) 9 | find_package(ncnn REQUIRED CONFIG) 10 | 11 | add_library(vsncnn SHARED vs_ncnn.cpp onnx2ncnn.cpp ../common/onnx_utils.cpp) 12 | 13 | target_include_directories(vsncnn PRIVATE 14 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 15 | ${ONNX_INCLUDE_DIRS} 16 | ) 17 | 18 | target_link_libraries(vsncnn PRIVATE ncnn) 19 | 20 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 21 | if (ONNX_VERSION VERSION_LESS 1.16.0) 22 | target_link_libraries(vsncnn PRIVATE onnx) 23 | else() 24 | target_link_libraries(vsncnn PRIVATE ONNX::onnx) 25 | endif() 26 | 27 | set_target_properties(vsncnn PROPERTIES 28 | POSITION_INDEPENDENT_CODE ON 29 | CXX_EXTENSIONS OFF 30 | CXX_STANDARD 17 31 | CXX_STANDARD_REQUIRED ON 32 | ) 33 | 34 | if (CMAKE_CXX_STANDARD GREATER 17) 35 | set_target_properties(vsncnn PROPERTIES CXX_STANDARD ${CMAKE_CXX_STANDARD}) 36 | endif() 37 | 38 | target_include_directories(vsncnn PUBLIC 39 | "${PROJECT_BINARY_DIR}" 40 | ) 41 | 42 | find_package(Git REQUIRED) 43 | execute_process( 44 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 45 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 46 | OUTPUT_VARIABLE VCS_TAG 47 | ) 48 | string(STRIP ${VCS_TAG} 
VCS_TAG) 49 | configure_file(config.h.in config.h) 50 | 51 | install(TARGETS vsncnn 52 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 53 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 54 | ) 55 | -------------------------------------------------------------------------------- /vstrt/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_UTILS_H_ 2 | #define VSTRT_CUDA_UTILS_H_ 3 | 4 | #include <concepts> 5 | #include <cstdint> 6 | #include <type_traits> 7 | #include <utility> 8 | 9 | #include <cuda_runtime_api.h> 10 | 11 | template <typename T, auto deleter> 12 | requires 13 | std::default_initializable<T> && 14 | std::movable<T> && 15 | std::is_trivially_copy_assignable_v<T> && 16 | std::convertible_to<T, bool> && 17 | std::invocable<decltype(deleter), T> 18 | struct Resource { 19 | T data; 20 | 21 | [[nodiscard]] 22 | constexpr Resource() noexcept = default; 23 | 24 | [[nodiscard]] 25 | constexpr Resource(T && x) noexcept : data(x) {} 26 | 27 | [[nodiscard]] 28 | constexpr Resource(Resource&& other) noexcept 29 | : data(std::exchange(other.data, T{})) 30 | { } 31 | 32 | constexpr Resource& operator=(Resource&& other) noexcept { 33 | if (this == &other) return *this; 34 | deleter_(std::move(data)); 35 | data = std::exchange(other.data, T{}); 36 | return *this; 37 | } 38 | 39 | constexpr Resource& operator=(const Resource & other) = delete; 40 | 41 | Resource(const Resource& other) = delete; 42 | 43 | constexpr operator T() const noexcept { 44 | return data; 45 | } 46 | 47 | constexpr auto deleter_(T && x) noexcept { 48 | if (x) { 49 | deleter(x); 50 | } 51 | } 52 | 53 | constexpr Resource& operator=(T && x) noexcept { 54 | deleter_(std::move(data)); 55 | data = x; 56 | return *this; 57 | } 58 | 59 | constexpr ~Resource() noexcept { 60 | deleter_(std::move(data)); 61 | } 62 | }; 63 | 64 | struct MemoryResource { 65 | Resource<uint8_t *, cudaFreeHost> h_data; 66 | Resource<uint8_t *, cudaFree> d_data; 67 | size_t size; 68 | }; 69 | 70 | using StreamResource = Resource<cudaStream_t, cudaStreamDestroy>; 71 | using GraphExecResource = Resource<cudaGraphExec_t, cudaGraphExecDestroy>; 72 | 73 | #endif // VSTRT_CUDA_UTILS_H_ 74 | -------------------------------------------------------------------------------- /vsov/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ov VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualization") 7 | set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library") 8 | 9 | find_package(OpenVINO REQUIRED CONFIG) 10 | 11 | add_library(vsov SHARED 12 | vs_openvino.cpp 13 | win32.cpp 14 | ../common/onnx_utils.cpp 15 | ../common/convert_float_to_float16.cpp 16 | ) 17 | 18 | if(ENABLE_VISUALIZATION) 19 | target_compile_definitions(vsov PRIVATE ENABLE_VISUALIZATION) 20 | endif() 21 | 22 | if(WIN32_SHARED_OPENVINO) 23 | target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO) 24 | endif() 25 | 26 | find_package(protobuf REQUIRED CONFIG) 27 | find_package(ONNX REQUIRED CONFIG) 28 | 29 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 30 | if (ONNX_VERSION VERSION_LESS 1.16.0) 31 | target_link_libraries(vsov PRIVATE onnx) 32 | else() 33 | target_link_libraries(vsov PRIVATE ONNX::onnx) 34 | endif() 35 | 36 | target_include_directories(vsov PRIVATE 37 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 38 | ${ONNX_INCLUDE_DIRS} 39 | ) 40 | 41 | target_link_libraries(vsov PRIVATE openvino::runtime) 42 | 43 | set_target_properties(vsov PROPERTIES 44 | CXX_EXTENSIONS OFF 45 |
CXX_STANDARD 17 46 | CXX_STANDARD_REQUIRED ON 47 | ) 48 | 49 | if (WIN32) 50 | if(WIN32_SHARED_OPENVINO) 51 | target_link_options(vsov PRIVATE "/DELAYLOAD:openvino.dll" "delayimp.lib") 52 | else() 53 | target_link_options(vsov PRIVATE "/DELAYLOAD:tbb.dll" "delayimp.lib") 54 | endif() 55 | endif() 56 | 57 | target_include_directories(vsov PUBLIC 58 | "${PROJECT_BINARY_DIR}" 59 | ) 60 | 61 | find_package(Git REQUIRED) 62 | execute_process( 63 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 64 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 65 | OUTPUT_VARIABLE VCS_TAG 66 | ) 67 | string(STRIP ${VCS_TAG} VCS_TAG) 68 | configure_file(config.h.in config.h) 69 | 70 | install(TARGETS vsov 71 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 72 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 73 | ) 74 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.13.2.6-1+cuda13.0 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-13-0 cuda-cudart-dev-13-0 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-x64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.8.0.43-1+cuda12.8 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-cudart-dev-12-8 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-ARM64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt-rtx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT-RTX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt-rtx.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | sudo apt-get install -y --no-install-recommends cuda-nvcc-13-0 cuda-cudart-dev-13-0 39 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 40 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 41 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 42 | 43 | - name: Download TensorRT-RTX 44 | run: | 45 | curl -L -o trt.tar.gz https://developer.nvidia.com/downloads/trt/rtx_sdk/secure/1.2/TensorRT-RTX-1.2.0.44-Linux-x86_64-cuda-13.0-Release-external.tar.gz 46 | tar -xzf trt.tar.gz --verbose 47 | mv TensorRT-RTX-*/ tensorrt/ 48 | 49 | - name: Configure 50 | run: cmake -S . 
-B build -G Ninja -LA 51 | -D CMAKE_BUILD_TYPE=Release 52 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 53 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 54 | -D TENSORRT_HOME="$(pwd)/tensorrt" 55 | 56 | - name: Build 57 | run: cmake --build build --verbose 58 | 59 | - name: Install 60 | run: cmake --install build --prefix install 61 | 62 | - name: Prepare for upload 63 | run: | 64 | mkdir artifact 65 | cp -v install/lib/*.so artifact 66 | 67 | - name: Describe 68 | run: git describe --tags --long 69 | 70 | - name: Upload 71 | uses: actions/upload-artifact@v4 72 | with: 73 | name: VSTRT-RTX-Linux-x64 74 | path: vstrt/artifact 75 | overwrite: true 76 | 77 | -------------------------------------------------------------------------------- /.github/workflows/linux-migx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/linux-migx.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vsmigx 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup HIP and MIGraphX 34 | run: | 35 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 36 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3.1 noble main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 37 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 38 | sudo apt update 39 | sudo apt install hip-runtime-amd rocm-device-libs migraphx-dev hipcc 40 | ls -R /opt/rocm 41 | 42 | - name: Configure 43 | run: cmake -S .
-B build -G Ninja -Wno-dev -LA 44 | -D CMAKE_BUILD_TYPE=Release 45 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 46 | -D CMAKE_CXX_COMPILER=g++-13 47 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 48 | -D migraphx_DIR=/opt/rocm/lib/cmake/migraphx 49 | -D MIOpen_DIR=/opt/rocm/lib/cmake/miopen 50 | -D hip_DIR=/opt/rocm/lib/cmake/hip 51 | -D AMDDeviceLibs_DIR=/opt/rocm/lib/cmake/AMDDeviceLibs 52 | -D amd_comgr_DIR=/opt/rocm/lib/cmake/amd_comgr 53 | -D hsa-runtime64_DIR=/opt/rocm/lib/cmake/hsa-runtime64 54 | -D rocblas_DIR=/opt/rocm/lib/cmake/rocblas 55 | -D hipblaslt_DIR=/opt/rocm/lib/cmake/hipblaslt 56 | -D CMAKE_PREFIX_PATH=/opt/rocm/lib/cmake 57 | 58 | - name: Build 59 | run: cmake --build build --verbose 60 | 61 | - name: Install 62 | run: cmake --install build --prefix install 63 | 64 | - name: Prepare for upload 65 | run: | 66 | mkdir artifact 67 | cp -v install/lib/*.so artifact 68 | 69 | - name: Describe 70 | run: git describe --tags --long 71 | 72 | - name: Upload 73 | uses: actions/upload-artifact@v4 74 | with: 75 | name: VSMIGX-Linux-x64 76 | path: vsmigx/artifact 77 | overwrite: true 78 | 79 | -------------------------------------------------------------------------------- /vsort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ort VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers") 7 | set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries") 8 | 9 | set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend") 10 | set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend") 11 | set(ENABLE_COREML OFF CACHE BOOL "Enable CoreML support") 12 | 13 | find_package(protobuf REQUIRED CONFIG) 14 | find_package(ONNX REQUIRED CONFIG) 15 | 16 | add_library(vsort SHARED 17 | vs_onnxruntime.cpp 18 | win32.cpp 19 | ../common/onnx_utils.cpp 20 | ../common/convert_float_to_float16.cpp 21 | ) 22 | 23 | target_include_directories(vsort PRIVATE 24 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 25 | ${ONNX_INCLUDE_DIRS} 26 | ${ONNX_RUNTIME_API_DIRECTORY} 27 | ) 28 | 29 | target_link_directories(vsort PRIVATE 30 | ${ONNX_RUNTIME_LIB_DIRECTORY} 31 | ) 32 | 33 | set_target_properties(vsort PROPERTIES 34 | POSITION_INDEPENDENT_CODE ON 35 | CXX_EXTENSIONS OFF 36 | CXX_STANDARD 17 37 | CXX_STANDARD_REQUIRED ON) 38 | 39 | if (CMAKE_CXX_STANDARD GREATER_EQUAL 20) 40 | set_target_properties(vsort PROPERTIES CXX_STANDARD 20) 41 | endif() 42 | 43 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 44 | if (ONNX_VERSION VERSION_LESS 1.16.0) 45 | target_link_libraries(vsort PRIVATE onnx) 46 | else() 47 | target_link_libraries(vsort PRIVATE ONNX::onnx) 48 | endif() 49 | 50 | target_link_libraries(vsort PRIVATE onnxruntime) 51 | 52 | if (ENABLE_CUDA) 53 | find_package(CUDAToolkit REQUIRED) 54 | 55 | add_compile_definitions(ENABLE_CUDA) 56 | target_include_directories(vsort PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 57 | target_link_libraries(vsort PRIVATE CUDA::cudart_static) 58 | 59 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 60 | target_link_options(vsort PRIVATE "/DELAYLOAD:onnxruntime.dll" "delayimp.lib") 61 | endif() 62 | endif() 63 | 64 | if (ENABLE_DML) 65 | add_compile_definitions(ENABLE_DML) 66 | endif() 67 | 68 | if(ENABLE_COREML) 69 | add_compile_definitions(ENABLE_COREML=1) 70 | endif() 71 | 72 | 
target_include_directories(vsort PUBLIC 73 | "${PROJECT_BINARY_DIR}" 74 | ) 75 | 76 | find_package(Git REQUIRED) 77 | execute_process( 78 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 79 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 80 | OUTPUT_VARIABLE VCS_TAG 81 | ) 82 | string(STRIP ${VCS_TAG} VCS_TAG) 83 | configure_file(config.h.in config.h) 84 | 85 | install(TARGETS vsort 86 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 87 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 88 | ) 89 | -------------------------------------------------------------------------------- /vsmigx/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsmlrt-hip" 10 | 11 | #include <delayimp.h> 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be sorted by dependency. 16 | L"amdhip64_6.dll", 17 | L"migraphx.dll", 18 | L"migraphx_tf.dll", 19 | L"migraphx_onnx.dll", 20 | L"migraphx_c.dll", // must be the last 21 | }; 22 | 23 | namespace fs = std::filesystem; 24 | static fs::path dllDir() { 25 | static const std::wstring res = []() -> std::wstring { 26 | HMODULE mod = 0; 27 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 28 | std::vector<wchar_t> buf; 29 | size_t n = 0; 30 | do { 31 | buf.resize(buf.size() + MAX_PATH); 32 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 33 | } while (n >= buf.size()); 34 | buf.resize(n); 35 | std::wstring path(buf.begin(), buf.end()); 36 | return path; 37 | } 38 | throw std::runtime_error("unable to locate myself"); 39 | }(); 40 | return fs::path(res).parent_path(); 41 | } 42 | 43 | FARPROC loadDLLs() { 44 | fs::path dir = dllDir() / DLL_DIR; 45 | HMODULE h = nullptr; 46 | for (const auto dll: dlls) { 47 | fs::path p = dir / dll; 48 | std::wstring s = p; 49 | h = LoadLibraryW(s.c_str()); 50 | if (getenv("VSMIGX_VERBOSE")) 51 | std::wcerr << L"vsmigx: preloading " << p << L": " << h << std::endl; 52 | if (!h) 53 | std::wcerr << L"vsmigx: failed to preload " << s << std::endl; 54 | } 55 | return (FARPROC)h; 56 | } 57 | 58 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 59 | switch (reason) { 60 | case dliNoteStartProcessing: 61 | case dliNoteEndProcessing: 62 | // Nothing to do here. 63 | break; 64 | case dliNotePreLoadLibrary: 65 | //std::cerr << "loading " << info->szDll << std::endl; 66 | if (std::string(info->szDll).find("migraphx_c.dll") != std::string::npos || 67 | std::string(info->szDll).find("amdhip64_6.dll") != std::string::npos 68 | ) 69 | return loadDLLs(); 70 | break; 71 | case dliNotePreGetProcAddress: 72 | // Nothing to do here. 73 | break; 74 | case dliFailLoadLib: 75 | case dliFailGetProc: 76 | // Returning NULL from error notifications will cause the delay load 77 | // runtime to raise a VcppException structured exception, that some code 78 | // might want to handle. 79 | return NULL; 80 | break; 81 | default: 82 | abort(); // unreachable. 83 | break; 84 | } 85 | // Returning NULL causes the delay load machinery to perform default 86 | // processing for this notification.
87 | return NULL; 88 | } 89 | } // namespace 90 | 91 | extern "C" { 92 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 93 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 94 | }; 95 | #endif 96 | -------------------------------------------------------------------------------- /.github/workflows/windows-hip-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-hip dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2022 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download MIGraphX Precompilation 32 | run: | 33 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 34 | 35 | - name: Extract MIGraphX Precompilation 36 | run: | 37 | unzip migx.zip 38 | 39 | - name: Move MIGraphX Precompilation 40 | run: | 41 | mkdir vsmlrt-hip 42 | mv migraphx/bin/* vsmlrt-hip -v 43 | 44 | - name: Setup VC commands 45 | uses: ilammy/msvc-dev-cmd@v1 46 | with: 47 | arch: amd64 48 | 49 | - name: List Dependencies 50 | shell: bash 51 | run: | 52 | cd vsmlrt-hip 53 | for dll in *.[dD][lL][lL]; do 54 | echo $(dumpbin -dependents "$dll") 55 | done 56 | 57 | - name: Cache HIP 58 | id: cache-hip 59 | uses: actions/cache@v4 60 | with: 61 | path: C:\Program Files\AMD\ROCm 62 | key: ${{ runner.os }}-rocm-6.2.4 63 | 64 | - name: Setup HIP 65 | if: steps.cache-hip.outputs.cache-hit != 'true' 66 | shell: pwsh 67 | run: | 68 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 69 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 70 | 71 | - name: Move HIP Libraries 72 | shell: cmd 73 | run: | 74 | copy "C:\Program Files\AMD\ROCm\6.2\bin\amd_comgr_2.dll" vsmlrt-hip 75 | copy "C:\Program Files\AMD\ROCm\6.2\bin\amdhip64_6.dll" vsmlrt-hip 76 | copy "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc0602.dll" vsmlrt-hip 77 | copy "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc-builtins0602.dll" vsmlrt-hip 78 | 79 | - name: Compress 80 | run: | 81 | 7z a -t7z -mx=3 vsmlrt-hip.7z vsmlrt-hip 82 | 83 | - name: Upload 84 | uses: actions/upload-artifact@v4 85 | with: 86 | name: vsmlrt-hip 87 | path: vsmlrt-hip.7z 88 | retention-days: 1 89 | compression-level: 0 90 | 91 | - name: Rename release asset 92 | run: | 93 | mv vsmlrt-hip.7z vsmlrt-hip.${{ github.event.inputs.tag }}.7z 94 | 95 | - name: Release 96 | uses: softprops/action-gh-release@v2 97 | with: 98 | tag_name: ${{ github.event.inputs.tag }} 99 | files: vsmlrt-hip.${{ github.event.inputs.tag }}.7z 100 | fail_on_unmatched_files: true 101 | generate_release_notes: false 102 | prerelease: true 103 | -------------------------------------------------------------------------------- /vsov/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsov" 10 | 11 | #include <delayimp.h> 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be
sorted by dependency. 16 | #ifdef WIN32_SHARED_OPENVINO 17 | L"tbb12.dll", 18 | L"openvino.dll", // must be the last 19 | #else // WIN32_SHARED_OPENVINO 20 | L"tbb12.dll", // must be the last 21 | #endif // WIN32_SHARED_OPENVINO 22 | }; 23 | 24 | namespace fs = std::filesystem; 25 | static fs::path dllDir() { 26 | static const std::wstring res = []() -> std::wstring { 27 | HMODULE mod = 0; 28 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 29 | std::vector<wchar_t> buf; 30 | size_t n = 0; 31 | do { 32 | buf.resize(buf.size() + MAX_PATH); 33 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 34 | } while (n >= buf.size()); 35 | buf.resize(n); 36 | std::wstring path(buf.begin(), buf.end()); 37 | return path; 38 | } 39 | throw std::runtime_error("unable to locate myself"); 40 | }(); 41 | return fs::path(res).parent_path(); 42 | } 43 | 44 | FARPROC loadDLLs() { 45 | fs::path dir = dllDir() / DLL_DIR; 46 | HMODULE h = nullptr; 47 | for (const auto dll: dlls) { 48 | fs::path p = dir / dll; 49 | std::wstring s = p; 50 | h = LoadLibraryW(s.c_str()); 51 | if (getenv("VSOV_VERBOSE")) 52 | std::wcerr << L"vsov: preloading " << p << L": " << h << std::endl; 53 | if (!h) 54 | std::wcerr << L"vsov: failed to preload " << s << std::endl; 55 | } 56 | return (FARPROC)h; 57 | } 58 | 59 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 60 | switch (reason) { 61 | case dliNoteStartProcessing: 62 | case dliNoteEndProcessing: 63 | // Nothing to do here. 64 | break; 65 | case dliNotePreLoadLibrary: 66 | //std::cerr << "loading " << info->szDll << std::endl; 67 | #ifdef WIN32_SHARED_OPENVINO 68 | if (std::string(info->szDll).find("openvino.dll") != std::string::npos) 69 | return loadDLLs(); 70 | #else // WIN32_SHARED_OPENVINO 71 | if (std::string(info->szDll).find("tbb.dll") != std::string::npos) 72 | return loadDLLs(); 73 | #endif // WIN32_SHARED_OPENVINO 74 | break; 75 | case dliNotePreGetProcAddress: 76 | // Nothing to do here. 77 | break; 78 | case dliFailLoadLib: 79 | case dliFailGetProc: 80 | // Returning NULL from error notifications will cause the delay load 81 | // runtime to raise a VcppException structured exception, that some code 82 | // might want to handle. 83 | return NULL; 84 | break; 85 | default: 86 | abort(); // unreachable. 87 | break; 88 | } 89 | // Returning NULL causes the delay load machinery to perform default 90 | // processing for this notification.
91 | return NULL; 92 | } 93 | } // namespace 94 | 95 | extern "C" { 96 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 97 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 98 | }; 99 | #endif 100 | -------------------------------------------------------------------------------- /.github/workflows/windows-migx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/windows-migx.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2022 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vsmigx 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Cache HIP 42 | id: cache-hip 43 | uses: actions/cache@v4 44 | with: 45 | path: C:\Program Files\AMD\ROCm 46 | key: ${{ runner.os }}-rocm-6.2.4 47 | 48 | - name: Setup HIP 49 | if: steps.cache-hip.outputs.cache-hit != 'true' 50 | shell: pwsh 51 | run: | 52 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 53 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 54 | 55 | - name: Download MIGraphX Precompilation 56 | run: | 57 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 58 | unzip -q migx.zip 59 | 60 | - name: Download VapourSynth headers 61 | run: | 62 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 63 | unzip -q vs.zip 64 | mv vapoursynth-*/ vapoursynth/ 65 | 66 | - name: Configure 67 | run: cmake -S . -B build -G Ninja -Wno-dev -LA 68 | -D CMAKE_BUILD_TYPE=Release 69 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 70 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%/vapoursynth/include" 71 | -D hip_DIR="C:/Program Files/AMD/ROCm/6.2/lib/cmake/hip" 72 | -D HIP_PLATFORM=amd 73 | -D migraphx_DIR="%cd%/migraphx/lib/cmake/migraphx" 74 | 75 | - name: Build 76 | run: cmake --build build --verbose 77 | 78 | - name: Install 79 | run: cmake --install build --prefix install 80 | 81 | - name: Prepare for upload 82 | run: | 83 | mkdir artifact 84 | copy install\bin\vsmigx.dll artifact\ 85 | 86 | - name: Describe 87 | run: git describe --tags --long 88 | 89 | - name: Dump dependencies 90 | run: dumpbin /dependents artifact/vsmigx.dll 91 | 92 | - name: Upload 93 | uses: actions/upload-artifact@v4 94 | with: 95 | name: VSMIGX-Windows-x64 96 | path: vsmigx/artifact 97 | 98 | - name: Compress artifact for release 99 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 100 | run: | 101 | cd artifact 102 | 7z a -t7z -mx=7 ../../VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z . 
103 | 104 | - name: Release 105 | uses: softprops/action-gh-release@v2 106 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 107 | with: 108 | tag_name: ${{ inputs.tag }} 109 | files: VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z 110 | fail_on_unmatched_files: true 111 | generate_release_notes: false 112 | prerelease: true 113 | 114 | -------------------------------------------------------------------------------- /common/onnx_utils.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdint> 2 | #include <fstream> 3 | #include <optional> 4 | #include <string> 5 | #include <string_view> 6 | #include <variant> 7 | 8 | #include <onnx/onnx_pb.h> 9 | #include <onnx/shape_inference/implementation.h> 10 | 11 | #include "onnx_utils.h" 12 | 13 | 14 | using namespace std::string_literals; 15 | 16 | #ifdef _WIN32 17 | #include <codecvt> 18 | #include <locale> 19 | static inline std::wstring translateName(const char *name) noexcept { 20 | std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; 21 | return converter.from_bytes(name); 22 | } 23 | #else 24 | #define translateName(n) (n) 25 | #endif 26 | 27 | 28 | [[nodiscard]] 29 | static std::optional<std::string> specifyShape( 30 | ONNX_NAMESPACE::ModelProto & model, 31 | int64_t tile_w, 32 | int64_t tile_h, 33 | int64_t batch = 1 34 | ) noexcept { 35 | 36 | if (model.graph().input_size() != 1) { 37 | return "graph must have a single input"; 38 | } 39 | ONNX_NAMESPACE::TensorShapeProto * input_shape { 40 | model 41 | .mutable_graph() 42 | ->mutable_input(0) 43 | ->mutable_type() 44 | ->mutable_tensor_type() 45 | ->mutable_shape() 46 | }; 47 | 48 | if (model.graph().output_size() != 1) { 49 | return "graph must have a single output"; 50 | } 51 | ONNX_NAMESPACE::TensorShapeProto * output_shape { 52 | model 53 | .mutable_graph() 54 | ->mutable_output(0) 55 | ->mutable_type() 56 | ->mutable_tensor_type() 57 | ->mutable_shape() 58 | }; 59 | 60 | constexpr auto n_idx = 0; 61 | constexpr auto h_idx = 2; 62 | constexpr auto w_idx = 3; 63 | 64 | if (input_shape->dim_size() != 4) { 65 | return "input dimension must be 4"; 66 | } 67 | 68 | input_shape->mutable_dim(n_idx)->set_dim_value(batch); 69 | input_shape->mutable_dim(h_idx)->set_dim_value(tile_h); 70 | input_shape->mutable_dim(w_idx)->set_dim_value(tile_w); 71 | 72 | if (output_shape->dim_size() != 4) { 73 | return "output dimension must be 4"; 74 | } 75 | 76 | output_shape->mutable_dim(n_idx)->set_dim_value(batch); 77 | output_shape->mutable_dim(h_idx)->clear_dim_value(); 78 | output_shape->mutable_dim(w_idx)->clear_dim_value(); 79 | 80 | // remove shape info 81 | if (model.graph().value_info_size() != 0) { 82 | model.mutable_graph()->mutable_value_info()->Clear(); 83 | } 84 | 85 | try { 86 | ONNX_NAMESPACE::shape_inference::InferShapes(model); 87 | } catch (const ONNX_NAMESPACE::InferenceError & e) { 88 | return e.what(); 89 | } 90 | 91 | return {}; 92 | } 93 | 94 | 95 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 96 | const std::string_view & path, 97 | int64_t tile_w, 98 | int64_t tile_h, 99 | bool path_is_serialization 100 | ) noexcept { 101 | 102 | ONNX_NAMESPACE::ModelProto onnx_proto; 103 | 104 | if (path_is_serialization) { 105 | if (!onnx_proto.ParseFromArray(path.data(), static_cast<int>(path.size()))) { 106 | return "parse onnx serialization failed"s; 107 | } 108 | } else { 109 | std::ifstream onnx_stream( 110 | translateName(path.data()), 111 | std::ios::binary 112 | ); 113 | 114 | if (!onnx_stream.good()) { 115 | return "open "s + std::string{ path } + " failed"s; 116 | } 117 | 118 | if (!onnx_proto.ParseFromIstream(&onnx_stream)) { 119 | return "parse "s + std::string{ path } + " failed"s; 120 | } 121 | } 122
| 123 | if (auto err = specifyShape(onnx_proto, tile_w, tile_h); err.has_value()) { 124 | return err.value(); 125 | } 126 | 127 | return onnx_proto; 128 | } 129 | -------------------------------------------------------------------------------- /.github/workflows/windows-trt_rtx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-TRT-RTX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/windows-trt_rtx.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2025 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vstrt 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v5 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Setup Ninja 42 | run: pip install ninja 43 | 44 | - name: Cache CUDA 45 | id: cache-cuda 46 | uses: actions/cache@v4 47 | with: 48 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 49 | key: ${{ runner.os }}-vstrt-cuda-13.0.2 50 | 51 | - name: Setup CUDA 52 | if: steps.cache-cuda.outputs.cache-hit != 'true' 53 | run: | 54 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe 55 | cuda_installer.exe -s nvcc_13.0 cudart_13.0 cuda_profiler_api_13.0 crt_13.0 nvptxcompiler_13.0 56 | 57 | - name: Download TensorRT-RTX 58 | run: | 59 | curl -L -o trt.zip https://developer.nvidia.com/downloads/trt/rtx_sdk/secure/1.2/TensorRT-RTX-1.2.0.44-win10-amd64-cuda-13.0-Release-external.zip 60 | unzip trt.zip 61 | mv TensorRT-RTX-*/ tensorrt/ 62 | 63 | - name: Download VapourSynth headers 64 | run: | 65 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 66 | unzip -q vs.zip 67 | mv vapoursynth-*/ vapoursynth/ 68 | 69 | - name: Configure 70 | run: cmake -S . -B build -G Ninja -LA 71 | -D CMAKE_BUILD_TYPE=Release 72 | -D CMAKE_CXX_FLAGS="/EHsc /Wall /wd4100 /wd4625 /wd4626 /wd4710 /wd4711 /wd4820 /wd4996 /wd5026 /wd5027" 73 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 74 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 75 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 76 | -D TENSORRT_HOME="%cd%\tensorrt" 77 | -D TENSORRT_LIBRARY_SUFFIX="_1_2" 78 | 79 | - name: Build 80 | run: cmake --build build --config Release --verbose 81 | 82 | - name: Install 83 | run: cmake --install build --prefix install 84 | 85 | - name: Prepare for upload 86 | run: | 87 | mkdir artifact 88 | copy install\bin\vstrt_rtx.dll artifact\ 89 | 90 | - name: Describe 91 | run: git describe --tags --long 92 | 93 | - name: Dump dependencies 94 | run: dumpbin /dependents artifact\vstrt_rtx.dll 95 | 96 | - name: Upload 97 | uses: actions/upload-artifact@v4 98 | with: 99 | name: VSTRT-RTX-Windows-x64 100 | path: vstrt/artifact 101 | 102 | - name: Compress artifact for release 103 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 104 | run: | 105 | cd artifact 106 | 7z a -t7z -mx=9 ../../VSTRT-RTX-Windows-x64.${{ github.event.inputs.tag }}.7z . 
107 | 108 | - name: Release 109 | uses: softprops/action-gh-release@v2 110 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 111 | with: 112 | tag_name: ${{ inputs.tag }} 113 | files: VSTRT-RTX-Windows-x64.${{ github.event.inputs.tag }}.7z 114 | fail_on_unmatched_files: true 115 | generate_release_notes: false 116 | prerelease: true 117 | -------------------------------------------------------------------------------- /vstrt/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsmlrt-cuda" 10 | 11 | #include <delayimp.h> 12 | 13 | #include <NvInferVersion.h> 14 | 15 | #if NV_TENSORRT_VERSION >= 100001 || defined(TRT_MAJOR_RTX) 16 | #define TO_STRING(x) #x 17 | #define CONCAT_VERSION(name, version) (name "_" TO_STRING(version) ".dll") 18 | #define CONCAT_VERSION2(name, major, minor) (name "_" TO_STRING(major) "_" TO_STRING(minor) ".dll") 19 | #endif // NV_TENSORRT_VERSION >= 100001 20 | 21 | namespace { 22 | std::vector<const wchar_t *> dlls = { 23 | // This list must be sorted by dependency. 24 | #if defined(TRT_MAJOR_RTX) 25 | CONCAT_VERSION2(L"tensorrt_rtx", TRT_MAJOR_RTX, TRT_MINOR_RTX), 26 | #elif NV_TENSORRT_VERSION >= 100001 27 | #ifdef USE_NVINFER_PLUGIN 28 | // nvinfer_plugin dependencies 29 | CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), 30 | CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR), 31 | #endif // USE_NVINFER_PLUGIN 32 | // Finally, nvinfer again. 33 | CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last 34 | #else // NV_TENSORRT_VERSION >= 100001 35 | #ifdef USE_NVINFER_PLUGIN 36 | // nvinfer_plugin dependencies 37 | L"nvinfer.dll", 38 | L"nvinfer_plugin.dll", 39 | #endif // USE_NVINFER_PLUGIN 40 | // Finally, nvinfer again.
41 | L"nvinfer.dll", // must be the last 42 | #endif // NV_TENSORRT_VERSION >= 100001 43 | }; 44 | 45 | namespace fs = std::filesystem; 46 | static fs::path dllDir() { 47 | static const std::wstring res = []() -> std::wstring { 48 | HMODULE mod = 0; 49 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 50 | std::vector buf; 51 | size_t n = 0; 52 | do { 53 | buf.resize(buf.size() + MAX_PATH); 54 | n = GetModuleFileNameW(mod, buf.data(), static_cast(buf.size())); 55 | } while (n >= buf.size()); 56 | buf.resize(n); 57 | std::wstring path(buf.begin(), buf.end()); 58 | return path; 59 | } 60 | throw std::runtime_error("unable to locate myself"); 61 | }(); 62 | return fs::path(res).parent_path(); 63 | } 64 | 65 | FARPROC loadDLLs() { 66 | fs::path dir = dllDir() / DLL_DIR; 67 | HMODULE h = nullptr; 68 | for (const auto dll: dlls) { 69 | fs::path p = dir / dll; 70 | std::wstring s = p; 71 | h = LoadLibraryW(s.c_str()); 72 | DWORD err = GetLastError(); 73 | if (getenv("VSTRT_VERBOSE")) 74 | std::wcerr << L"vstrt: preloading " << p << L": " << h << std::endl; 75 | if (!h) 76 | std::wcerr << L"vstrt: failed to preload " << s << L", errno " << err << std::endl; 77 | } 78 | return (FARPROC)h; 79 | } 80 | 81 | #if (NV_TENSORRT_MAJOR == 9 && !defined(TRT_MAJOR_RTX)) && defined(_WIN32) 82 | static void * dummy() { // mimic getPluginRegistry 83 | #else 84 | static int dummy() { // mimic getInferLibVersion 85 | #endif 86 | return 0; 87 | } 88 | 89 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 90 | switch (reason) { 91 | case dliNoteStartProcessing: 92 | case dliNoteEndProcessing: 93 | // Nothing to do here. 94 | break; 95 | case dliNotePreLoadLibrary: 96 | //std::cerr << "loading " << info->szDll << std::endl; 97 | loadDLLs(); 98 | return (FARPROC)LoadLibraryA(info->szDll); 99 | case dliNotePreGetProcAddress: 100 | // Nothing to do here. 101 | break; 102 | case dliFailLoadLib: 103 | case dliFailGetProc: 104 | // Returning NULL from error notifications will cause the delay load 105 | // runtime to raise a VcppException structured exception, that some code 106 | // might want to handle. 107 | //return NULL; 108 | // The SE will crash the process, so instead we return a dummy function. 109 | return (FARPROC)dummy; 110 | break; 111 | default: 112 | abort(); // unreachable. 113 | break; 114 | } 115 | // Returning NULL causes the delay load machinery to perform default 116 | // processing for this notification. 
117 | return NULL; 118 | } 119 | } // namespace 120 | 121 | extern "C" { 122 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 123 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 124 | }; 125 | #endif // _MSC_VER 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-mlrt 2 | 3 | This project provides VapourSynth ML filter runtimes for a variety of platforms: 4 | - x86 CPUs: [vsov-cpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsort-cpu](#vsort-onnx-runtime-based-cpugpu-runtime) 5 | - Intel GPU (both integrated & discrete): [vsov-gpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 6 | - NVidia GPU: [vsort-cuda](#vsort-onnx-runtime-based-cpugpu-runtime), [vstrt & vstrt_rtx](#vstrt-tensorrt-based-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 7 | - AMD GPU: [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime), [vsmigx](#vsmigx-migraphx-based-gpu-runtime) 8 | - Apple SoC: [vsort-coreml](#vsort-onnx-runtime-based-cpugpu-runtime) 9 | 10 | To simplify usage, we also provide a Python wrapper [vsmlrt.py](https://github.com/AmusementClub/vs-mlrt/blob/master/scripts/vsmlrt.py) 11 | for all bundled models and a unified interface to select different backends. 12 | 13 | Please refer to [the wiki](https://github.com/AmusementClub/vs-mlrt/wiki) for supported models & usage information. 14 | 15 | ## vsov: OpenVINO-based Pure CPU & Intel GPU Runtime 16 | 17 | [OpenVINO](https://docs.openvino.ai/latest/index.html) is an AI inference runtime developed 18 | by Intel, mainly targeting x86 CPUs and Intel GPUs. 19 | 20 | The vs-openvino plugin provides an optimized *pure* CPU & Intel GPU runtime for some popular AI filters. 21 | Intel GPU support covers Gen 8+ integrated graphics (Broadwell and newer) and the Arc series GPUs. 22 | 23 | To install, download the latest release and extract them into your VS `plugins` directory. 24 | 25 | Please visit the [vsov](vsov) directory for details. 26 | 27 | ## vsort: ONNX Runtime-based CPU/GPU Runtime 28 | 29 | [ONNX Runtime](https://onnxruntime.ai/) is an AI inference runtime with many backends. 30 | 31 | The vs-onnxruntime plugin provides optimized CPU and CUDA GPU runtimes for some popular AI filters. 32 | 33 | To install, download the latest release and extract them into your VS `plugins` directory. 34 | 35 | Please visit the [vsort](vsort) directory for details. 36 | 37 | ## vstrt: TensorRT-based GPU Runtime 38 | 39 | [TensorRT](https://developer.nvidia.com/tensorrt) is a highly optimized AI inference runtime 40 | for NVidia GPUs. It uses benchmarking to find the optimal kernel for your specific 41 | GPU, so there is an extra step: an engine must be built from the ONNX network on the machine 42 | where you will run the vstrt filter. This extra step makes deploying models a little 43 | harder than with the other runtimes. However, the resulting performance is also typically 44 | *much much better* than the CUDA backend of [vsort](vsort). 45 | 46 | [TensorRT-RTX](https://developer.nvidia.com/tensorrt-rtx) is a specialization of TensorRT 47 | for NVIDIA RTX GPUs, which compiles engines faster, with performance comparable to TensorRT. 48 | 49 | To install, download the latest release and extract them into your VS `plugins` directory. 50 | 51 | Please visit the [vstrt](vstrt) directory for details.
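All of these runtimes are reachable through the vsmlrt.py wrapper mentioned above, which also hides the engine-build step behind its TRT backends. The sketch below illustrates switching backends through that unified interface; the exact model and backend names (`DPIR`, `DPIRModel.drunet_gray`, `Backend.OV_CPU`, `Backend.TRT`) are taken to be illustrative here — consult the wiki for the authoritative lists shipped with your release.

```python3
# A minimal sketch of backend selection via vsmlrt.py; names are illustrative,
# see the wiki for the exact model/backend lists.
from vsmlrt import DPIR, DPIRModel, Backend

# clip is assumed to be a 32-bit float GRAY clip, as the gray DPIR models require.
# Pure CPU inference (vsov):
flt = DPIR(clip, strength=10.0, model=DPIRModel.drunet_gray, backend=Backend.OV_CPU())
# The same filter on an NVidia GPU (vstrt); engine building is handled by the wrapper:
flt = DPIR(clip, strength=10.0, model=DPIRModel.drunet_gray, backend=Backend.TRT(fp16=True))
```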
52 | 53 | ## vsmigx: MIGraphX-based GPU Runtime 54 | 55 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX) is a highly optimized AI inference runtime 56 | for AMD GPUs. It also uses benchmarking to find the optimal kernel, similar to vstrt. 57 | 58 | To install, download the latest release and extract them into your VS `plugins` directory. 59 | 60 | Please visit the [vsmigx](vsmigx) directory for details. 61 | 62 | ## vsncnn: NCNN-based GPU (Vulkan) Runtime 63 | 64 | [ncnn](https://github.com/Tencent/ncnn) is a popular AI inference runtime. [vsncnn](vsncnn) 65 | provides a Vulkan-based runtime for some AI filters. It includes support for on-the-fly 66 | ONNX to ncnn native format conversion so as to provide a unified interface across all 67 | runtimes provided by this project. As it uses the device-independent 68 | [Vulkan](https://en.wikipedia.org/wiki/Vulkan) interface for GPU accelerated inference, 69 | this plugin supports all GPUs that provide a Vulkan interface (NVidia, AMD, Intel integrated & 70 | discrete GPUs all provide this interface.) Another benefit is that it has a significantly 71 | smaller footprint than other GPU runtimes (both vsort and vstrt CUDA backends require >1GB 72 | CUDA libraries.) The main drawback is that it's slower. 73 | 74 | To install, download the latest release and extract them into your VS `plugins` directory. 75 | 76 | Please visit the [vsncnn](vsncnn) directory for details. 77 | -------------------------------------------------------------------------------- /vsmigx/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth MIGraphX 2 | 3 | The vs-migraphx plugin provides an optimized HIP runtime for some popular AI filters on AMD GPUs. 4 | 5 | ## Usage 6 | 7 | Prototype: `core.migx.Model(clip[] clips, string program_path[, int[] overlap, int[] tilesize, int device_id=0, int num_streams=1, string flexible_output_prop=""])` 8 | 9 | Arguments: 10 | - `clip[] clips`: the input clips, only 16/32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 11 | - `string program_path`: the path to the prebuilt program (see below) 12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support a fixed input shape, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network specific docs on the recommended overlapping size. 13 | - `int[] tilesize`: Even for CNN where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network specific docs on the recommended tile size. 14 | - `int device_id`: Specifies the GPU device id to use, default 0. Requires AMD GPUs with gfx1030 target or RDNA3 architecture onwards ([list](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)). 15 | - `int num_streams`: number of concurrent HIP streams to use. Default 1. Increase if GPU not saturated.
16 | - `string flexible_output_prop`: used to support onnx models with an arbitrary number of output planes.
17 | 
18 | ```python3
19 | from typing import TypedDict
20 | 
21 | class Output(TypedDict):
22 |     clip: vs.VideoNode
23 |     num_planes: int
24 | 
25 | prop = "planes" # arbitrary non-empty string
26 | output = core.migx.Model(src, program_path, flexible_output_prop=prop) # type: Output
27 | 
28 | clip = output["clip"]
29 | num_planes = output["num_planes"]
30 | 
31 | output_planes = [
32 |     clip.std.PropToClip(prop=f"{prop}{i}")
33 |     for i in range(num_planes)
34 | ] # type: list[vs.VideoNode]
35 | ```
36 | 
37 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
38 | 
39 | The general rule is to either:
40 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or
41 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown out so that only internal output pixels are used.
42 | 
43 | ## Instructions
44 | 
45 | ### Build program
46 | ```shell
47 | migraphx-driver compile --onnx drunet_gray.onnx --gpu --input-dim @input 1 2 1080 1920 --output dpir_gray_1080p.mxr
48 | ```
49 | 
50 | The program can be applied to `1920x1080` inputs.
51 | 
52 | Also check [migraphx-driver useful arguments](#migraphx-driver-useful-arguments)
53 | 
54 | ### Run model
55 | In a vpy script:
56 | ```python3
57 | # DPIR
58 | src = core.std.BlankClip(src, width=1920, height=1080, format=vs.GRAYS)
59 | sigma = 10.0
60 | flt = core.migx.Model([src, core.std.BlankClip(src, color=sigma/255.0)], program_path="dpir_gray_1080p.mxr", tilesize=[1920, 1080])
61 | ```
62 | 
63 | ## migraphx-driver useful arguments
64 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled)
65 | 
66 | - `--output <file>`: Save the serialized program
67 | 
68 | - `--migraphx <file>`: Load a serialized program
69 | 
70 | - `--optimize`: Performs common graph optimizations
71 | 
72 | - `--exhaustive-tune`: Enables exhaustive search to find the fastest kernel
73 | 
74 | - `--disable-fast-math`: Disable fast math optimization
75 | 
76 | Also check the [full list of options](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/migraphx-driver.html#options) and [environment variables](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/dev/env_vars.html).
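For example, the flags above can be combined when building a program; a sketch (file names are placeholders, and exhaustive tuning can take a while):

```shell
migraphx-driver compile --onnx drunet_gray.onnx --gpu --input-dim @input 1 2 1080 1920 --fp16 --exhaustive-tune --output dpir_gray_1080p_fp16.mxr
```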
77 | 78 | -------------------------------------------------------------------------------- /.github/workflows/linux-ncnn.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-NCNN) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsncnn/**' 8 | - '.github/workflows/linux-ncnn.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsncnn 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsncnn/protobuf/install 33 | key: ${{ runner.os }}-vsncnn-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | ref: v3.21.12 41 | fetch-depth: 1 42 | path: vsncnn/protobuf 43 | 44 | - name: Configure protobuf 45 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 49 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 50 | 51 | - name: Build protobuf 52 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 53 | run: cmake --build protobuf/build_rel --verbose 54 | 55 | - name: Install protobuf 56 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 57 | run: cmake --install protobuf/build_rel --prefix protobuf/install 58 | 59 | - name: Cache onnx 60 | id: cache-onnx 61 | uses: actions/cache@v4 62 | with: 63 | path: vsncnn/onnx/install 64 | key: ${{ runner.os }}-vsncnn-onnx-v1 65 | 66 | - name: Checkout onnx 67 | if: steps.cache-onnx.outputs.cache-hit != 'true' 68 | uses: actions/checkout@v4 69 | with: 70 | repository: onnx/onnx 71 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c 72 | fetch-depth: 1 73 | path: vsncnn/onnx 74 | 75 | - name: Configure onnx 76 | if: steps.cache-onnx.outputs.cache-hit != 'true' 77 | run: cmake -S onnx -B onnx/build -G Ninja -LA 78 | -D CMAKE_BUILD_TYPE=Release 79 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 80 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 81 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 82 | -D Protobuf_LIBRARIES=protobuf/install/lib 83 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 84 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 85 | 86 | - name: Build onnx 87 | if: steps.cache-onnx.outputs.cache-hit != 'true' 88 | run: cmake --build onnx/build --verbose 89 | 90 | - name: Install onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --install onnx/build --prefix onnx/install 93 | 94 | - name: Download VapourSynth headers 95 | run: | 96 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 97 | unzip -q vs.zip 98 | mv vapoursynth*/ vapoursynth 99 | 100 | - name: Download NCNN Precompilation 101 | run: | 102 | curl -s -o ncnn.zip -LJO https://github.com/Tencent/ncnn/releases/download/20250503/ncnn-20250503-ubuntu-2404.zip 103 | unzip -q ncnn.zip 104 | 105 | - name: Configure 106 | run: cmake -S . 
-B build -G Ninja -LA 107 | -D CMAKE_BUILD_TYPE=Release 108 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth/include 109 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 110 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 111 | -D ncnn_DIR=ncnn-20250503-ubuntu-2404/lib/cmake/ncnn 112 | -D CMAKE_CXX_STANDARD=20 113 | 114 | - name: Build 115 | run: cmake --build build --verbose 116 | 117 | - name: Install 118 | run: cmake --install build --prefix install 119 | 120 | - name: Prepare for upload 121 | run: | 122 | mkdir artifact 123 | cp -v install/lib/*.so artifact 124 | 125 | - name: Describe 126 | run: git describe --tags --long 127 | 128 | - name: Upload 129 | uses: actions/upload-artifact@v4 130 | with: 131 | name: vsncnn-linux-x64 132 | path: vsncnn/artifact 133 | -------------------------------------------------------------------------------- /.github/workflows/windows-cuda-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-cuda dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2025 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download cuDNN inference library 32 | run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.13.0.50_cuda13-archive.zip -o cudnn.zip 33 | 34 | - name: Extract cuDNN library 35 | run: unzip cudnn.zip 36 | 37 | - name: Move cuDNN library 38 | run: | 39 | mkdir -p vsmlrt-cuda 40 | mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v 41 | # rm vsmlrt-cuda/cudnn_*_train*.dll -v 42 | 43 | - name: Download TensorRT library 44 | run: | 45 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.14.1/zip/TensorRT-10.14.1.48.Windows.win10.cuda-13.0.zip 46 | 47 | - name: Extract TensorRT library 48 | run: | 49 | unzip trt.zip 50 | mv TensorRT-*/ TensorRT/ 51 | 52 | - name: Move TensorRT library 53 | run: mv TensorRT/bin/*.dll vsmlrt-cuda -v 54 | 55 | - name: Download CUDA Libraries 56 | shell: cmd 57 | run: | 58 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe 59 | cuda_installer.exe -s cudart_13.0 cublas_13.0 cufft_13.0 cupti_13.0 nvrtc_13.0 60 | 61 | - name: Move CUDA Libraries 62 | shell: cmd 63 | run: | 64 | move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda 65 | move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64\*.dll" vsmlrt-cuda 66 | del vsmlrt-cuda\cudart32*.dll 67 | 68 | - name: Download TensorRT-RTX library 69 | run: | 70 | curl -L -o trt-rtx.zip https://developer.nvidia.com/downloads/trt/rtx_sdk/secure/1.2/TensorRT-RTX-1.2.0.44-win10-amd64-cuda-13.0-Release-external.zip 71 | 72 | - name: Extract TensorRT-RTX library 73 | run: | 74 | unzip trt-rtx.zip 75 | mv TensorRT-RTX-*/ TensorRT-RTX/ 76 | 77 | - name: Move TensorRT-RTX library 78 | run: | 79 | mv TensorRT-RTX/bin/tensorrt_rtx.exe vsmlrt-cuda -v 80 | mv TensorRT-RTX/bin/*.dll vsmlrt-cuda -v 81 | 82 | - name: Setup VC commands 83 | uses: ilammy/msvc-dev-cmd@v1 84 | with: 
85 | arch: amd64 86 | 87 | - name: Copy VC Runtime Libraries 88 | shell: bash 89 | run: | 90 | cd vsmlrt-cuda 91 | while true; do 92 | changed=false 93 | for dll in *.[dD][lL][lL]; do 94 | for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do 95 | echo "finding $dep for $dll" 96 | if ! test -f ./"$dep"; then 97 | changed=true 98 | src="$(where "$dep" | grep -i 'MSVC' | head -1)" 99 | echo "copying $src for $dep" 100 | test -f "$src" || exit 1 101 | cp -f "$src" . 102 | fi 103 | done 104 | done 105 | $changed || break 106 | done 107 | 108 | - name: Compress 109 | run: | 110 | 7z a -t7z -bb3 -mx=9 -v2147483647b vsmlrt-cuda.7z vsmlrt-cuda 111 | 112 | - name: Upload 113 | uses: actions/upload-artifact@v4 114 | with: 115 | name: vsmlrt-cuda 116 | path: vsmlrt-cuda.7z* 117 | retention-days: 1 118 | compression-level: 0 119 | 120 | - name: Rename release asset 121 | run: | 122 | mv vsmlrt-cuda.7z.001 vsmlrt-cuda.${{ github.event.inputs.tag}}.7z.001 123 | mv vsmlrt-cuda.7z.002 vsmlrt-cuda.${{ github.event.inputs.tag}}.7z.002 124 | 125 | - name: Release 126 | uses: softprops/action-gh-release@v2 127 | with: 128 | tag_name: ${{ github.event.inputs.tag }} 129 | files: vsmlrt-cuda.${{ github.event.inputs.tag }}.7z* 130 | fail_on_unmatched_files: true 131 | generate_release_notes: false 132 | prerelease: true 133 | -------------------------------------------------------------------------------- /vstrt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-trt VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT") 9 | option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE) 10 | option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE) 11 | set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix") 12 | 13 | if (WIN32) 14 | file(GLOB files "${TENSORRT_HOME}/lib/tensorrt_rtx*.lib") 15 | else() 16 | file(GLOB files "${TENSORRT_HOME}/lib/libtensorrt_rtx*.so") 17 | endif() 18 | if (files) 19 | set(USE_TRT_RTX TRUE) 20 | else() 21 | set(USE_TRT_RTX FALSE) 22 | endif() 23 | 24 | FIND_PACKAGE(CUDAToolkit REQUIRED) 25 | 26 | if (USE_TRT_RTX) 27 | add_library(vstrt_rtx SHARED 28 | $<$: longpath.manifest> 29 | vs_tensorrt.cpp 30 | win32.cpp 31 | ) 32 | 33 | target_include_directories(vstrt_rtx PRIVATE 34 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 35 | ${CUDAToolkit_INCLUDE_DIRS} 36 | ${TENSORRT_HOME}/include 37 | ) 38 | 39 | set_target_properties(vstrt_rtx PROPERTIES 40 | CXX_EXTENSIONS OFF 41 | POSITION_INDEPENDENT_CODE ON 42 | CXX_STANDARD 20 43 | CXX_STANDARD_REQUIRED ON 44 | ) 45 | 46 | target_link_directories(vstrt_rtx PRIVATE ${TENSORRT_HOME}/lib) 47 | target_link_libraries(vstrt_rtx PRIVATE CUDA::cudart_static) 48 | 49 | if (WIN32) 50 | target_link_libraries(vstrt_rtx PRIVATE "tensorrt_rtx${TENSORRT_LIBRARY_SUFFIX}") 51 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 52 | target_link_options(vstrt_rtx PRIVATE "/DELAYLOAD:tensorrt_rtx${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") 53 | endif() 54 | else() 55 | target_link_libraries(vstrt_rtx PRIVATE tensorrt_rtx) 56 | endif() 57 | 58 | target_compile_definitions(vstrt_rtx PRIVATE USE_NVINFER_PLUGIN) 59 | 60 | target_include_directories(vstrt_rtx PUBLIC 61 | "${PROJECT_BINARY_DIR}" 62 | ) 63 | 64 | find_package(Git REQUIRED) 65 | execute_process( 66 | COMMAND 
${GIT_EXECUTABLE} describe --tags --long --always 67 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 68 | OUTPUT_VARIABLE VCS_TAG 69 | ) 70 | string(STRIP ${VCS_TAG} VCS_TAG) 71 | configure_file(config.h.in config.h) 72 | 73 | install(TARGETS vstrt_rtx 74 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 75 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 76 | ) 77 | else() 78 | add_library(vstrt SHARED 79 | $<$: longpath.manifest> 80 | vs_tensorrt.cpp 81 | win32.cpp 82 | ) 83 | 84 | target_include_directories(vstrt PRIVATE 85 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 86 | ${CUDAToolkit_INCLUDE_DIRS} 87 | ${TENSORRT_HOME}/include 88 | ) 89 | 90 | set_target_properties(vstrt PROPERTIES 91 | CXX_EXTENSIONS OFF 92 | POSITION_INDEPENDENT_CODE ON 93 | CXX_STANDARD 20 94 | CXX_STANDARD_REQUIRED ON 95 | ) 96 | 97 | target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib) 98 | target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}") 99 | 100 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 101 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") 102 | endif() 103 | 104 | if (USE_NVINFER_PLUGIN) 105 | add_definitions(-DUSE_NVINFER_PLUGIN) 106 | if (USE_NVINFER_PLUGIN_STATIC) 107 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}") 108 | else() 109 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}") 110 | 111 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 112 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll") 113 | endif() 114 | endif() 115 | endif() 116 | 117 | target_include_directories(vstrt PUBLIC 118 | "${PROJECT_BINARY_DIR}" 119 | ) 120 | 121 | find_package(Git REQUIRED) 122 | execute_process( 123 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 124 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 125 | OUTPUT_VARIABLE VCS_TAG 126 | ) 127 | string(STRIP ${VCS_TAG} VCS_TAG) 128 | configure_file(config.h.in config.h) 129 | 130 | install(TARGETS vstrt 131 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 132 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 133 | ) 134 | endif() 135 | 136 | -------------------------------------------------------------------------------- /vsort/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth ONNX Runtime 2 | 3 | The vs-onnxruntime plugin provides optimized CPU & CUDA runtime for some popular AI filters. 4 | 5 | ## Building and Installation 6 | 7 | To build, you will need [ONNX Runtime](https://www.onnxruntime.ai/), [protobuf](https://github.com/protocolbuffers/protobuf), [ONNX](https://github.com/onnx/onnx) and their dependencies. 8 | 9 | Please refer to [ONNX Runtime Docs](https://onnxruntime.ai/docs/install/) for installation notes. 10 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/onnxruntime/releases/latest/). 11 | 12 | Please refer to our [github actions workflow](../.github/workflows/windows-ort.yml) for sample building instructions. 13 | 14 | If you only use the CPU backend, then you just need to extract binary release into your `vapoursynth/plugins` directory. 15 | 16 | However, if you also use the CUDA backend, you will need to download some CUDA libraries as well, please see the release page for details. Those CUDA libraries also need to be extracted into VS `vapoursynth/plugins` directory. 
The plugin will try to load them from the `vapoursynth/plugins/vsort/` directory or the `vapoursynth/plugins/vsmlrt-cuda/` directory.
17 | 
18 | ## Usage
19 | 
20 | Prototype: `core.ort.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string provider = "", int device_id = 0, int verbosity = 2, bint cudnn_benchmark = True, bint builtin = False, string builtindir="models", bint fp16 = False, bint path_is_serialization = False, bint use_cuda_graph = False])`
21 | 
22 | Arguments:
23 | - `clip[] clips`: the input clips, only 32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki).
24 | - `string network_path`: the path to the network in ONNX format.
25 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlap (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlap size.
26 | - `int[] tilesize`: Even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlap). Please refer to network-specific docs on the recommended tile size.
27 | - `string provider`: Specifies the device to run the inference on.
28 | - `"CPU"` or `""`: pure CPU backend
29 | - `"CUDA"`: CUDA GPU backend, requires Nvidia Maxwell+ GPUs.
30 | - `"DML"`: DirectML backend
31 | - `"COREML"`: CoreML backend
32 | - `int device_id`: select the GPU device for the CUDA backend.
33 | - `int verbosity`: specifies the verbosity of logging; the default is warning.
34 | - 0: fatal error only, `ORT_LOGGING_LEVEL_FATAL`
35 | - 1: also errors, `ORT_LOGGING_LEVEL_ERROR`
36 | - 2: also warnings, `ORT_LOGGING_LEVEL_WARNING`
37 | - 3: also info, `ORT_LOGGING_LEVEL_INFO`
38 | - 4: everything, `ORT_LOGGING_LEVEL_VERBOSE`
39 | - `bint cudnn_benchmark`: whether to let cuDNN use benchmarking to search for the best convolution kernel to use. Default True. It might incur some startup latency.
40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`.
41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models".
42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation.
43 | - `bint path_is_serialization`: whether the `network_path` argument specifies an onnx serialization of type `bytes`.
44 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead in the CUDA backend. Not all models are supported.
45 | - `int ml_program`: select the CoreML provider.
46 | - 0: NeuralNetwork
47 | - 1: MLProgram
48 | 
49 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
50 | 
51 | The general rule is to either:
52 | 1.
leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or
53 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown out so that only internal output pixels are used.
--------------------------------------------------------------------------------
/.github/workflows/macos-ort.yml:
--------------------------------------------------------------------------------
1 | name: Build (macOS-ORT)
2 | 
3 | on:
4 | push:
5 | paths:
6 | - 'common/**'
7 | - 'vsort/**'
8 | - '.github/workflows/macos-ort.yml'
9 | workflow_dispatch:
10 | 
11 | jobs:
12 | build-macos:
13 | runs-on: macos-14
14 | 
15 | defaults:
16 | run:
17 | working-directory: vsort
18 | 
19 | steps:
20 | - name: Checkout repo
21 | uses: actions/checkout@v4
22 | with:
23 | fetch-depth: 0
24 | 
25 | - name: Setup Ninja
26 | run: brew install ninja
27 | 
28 | - name: Cache protobuf
29 | id: cache-protobuf
30 | uses: actions/cache@v4
31 | with:
32 | path: vsort/protobuf/install
33 | key: ${{ runner.os }}-vsort-protobuf-v1
34 | 
35 | - name: Checkout protobuf
36 | uses: actions/checkout@v4
37 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
38 | with:
39 | repository: protocolbuffers/protobuf
40 | ref: v3.21.12
41 | fetch-depth: 1
42 | path: vsort/protobuf
43 | 
44 | - name: Configure protobuf
45 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA
47 | -D CMAKE_BUILD_TYPE=Release
48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON
49 | -D protobuf_BUILD_SHARED_LIBS=OFF
50 | -D protobuf_BUILD_TESTS=OFF
51 | 
52 | - name: Build protobuf
53 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
54 | run: cmake --build protobuf/build_rel --verbose
55 | 
56 | - name: Install protobuf
57 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
58 | run: cmake --install protobuf/build_rel --prefix protobuf/install
59 | 
60 | - name: Cache onnx
61 | id: cache-onnx
62 | uses: actions/cache@v4
63 | with:
64 | path: vsort/onnx/install
65 | key: ${{ runner.os }}-vsort-onnx-v2
66 | 
67 | - name: Checkout onnx
68 | if: steps.cache-onnx.outputs.cache-hit != 'true'
69 | uses: actions/checkout@v4
70 | with:
71 | repository: onnx/onnx
72 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/main/cmake/external
73 | ref: 595228d99e3977ac27cb79d5963adda262af99ad
74 | fetch-depth: 1
75 | path: vsort/onnx
76 | 
77 | - name: Configure onnx
78 | if: steps.cache-onnx.outputs.cache-hit != 'true'
79 | run: cmake -S onnx -B onnx/build -G Ninja -LA
80 | -D CMAKE_BUILD_TYPE=Release
81 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON
82 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc
83 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib
84 | -D Protobuf_LIBRARIES=protobuf/install/lib
85 | -D ONNX_USE_LITE_PROTO=ON
86 | -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF
88 | -D ONNX_ML=0
89 | 
90 | - name: Build onnx
91 | if: steps.cache-onnx.outputs.cache-hit != 'true'
92 | run: cmake --build onnx/build --verbose
93 | 
94 | - name: Install onnx
95 | if: steps.cache-onnx.outputs.cache-hit != 'true'
96 | run: cmake --install onnx/build --prefix onnx/install
97 | 
98 | - name: Download VapourSynth headers
99 | run: |
100 | curl -L -o vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
101 | unzip -q vs.zip
102 | mv vapoursynth*/ vapoursynth
103 | 
104 | - name: Setup ONNX
Runtime
105 | run: |
106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.20.0/onnxruntime-osx-arm64-1.20.0.tgz
107 | tar -xf ort.tgz
108 | mv onnxruntime-* onnxruntime
109 | 
110 | - name: Configure
111 | run: cmake -S . -B build -G Ninja -LA
112 | -D CMAKE_BUILD_TYPE=Release
113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -mcpu=apple-m1"
114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include"
115 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include
116 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib
117 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf
118 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX
119 | -D CMAKE_CXX_STANDARD=20
120 | -D ENABLE_COREML=ON
121 | 
122 | - name: Build
123 | run: cmake --build build --verbose
124 | 
125 | - name: Install
126 | run: cmake --install build --prefix install
127 | 
128 | - name: Prepare for upload
129 | run: |
130 | mkdir artifact
131 | cp -v install/lib/*.dylib artifact
132 | 
133 | - name: Describe
134 | run: git describe --tags --long
135 | 
136 | - name: Upload
137 | uses: actions/upload-artifact@v4
138 | with:
139 | name: vsort-macos-arm64
140 | path: vsort/artifact
141 | 
--------------------------------------------------------------------------------
/vsov/README.md:
--------------------------------------------------------------------------------
1 | # VapourSynth OpenVINO
2 | 
3 | The vs-openvino plugin provides an optimized *pure* CPU runtime for some popular AI filters.
4 | 
5 | ## Building and Installation
6 | 
7 | To build, you will need [OpenVINO](https://docs.openvino.ai/latest/get_started.html) and its dependencies.
8 | Only `Model Optimizer` and `Inference Engine` are required.
9 | 
10 | You can download official Intel releases:
11 | - [Linux](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_linux_header.html)
12 | - [Windows](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_windows_header.html)
13 | - [macOS](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_macos_header.html)
14 | 
15 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/openvino/releases/latest/); our release has the benefit of static linking support.
16 | 
17 | Sample cmake commands to build:
18 | ```bash
19 | cmake -S . -B build -G Ninja -D CMAKE_BUILD_TYPE=Release
20 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
21 | -D InferenceEngine_DIR=openvino/runtime/cmake
22 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="path/to/vapoursynth/include"
23 | cmake --build build
24 | cmake --install build --prefix install
25 | ```
26 | You should find `vsov.dll` (or `libvsov.so`) under `install/bin`. You will also need Intel TBB (you can get
27 | `tbb.dll` from an OpenVINO release). On Windows, `tbb.dll` must be placed under the `vapoursynth/plugins/vsov/`
28 | directory for `vsov.dll` to find it.
29 | 
30 | ## Usage
31 | 
32 | Prototype: `core.ov.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string device = "CPU", bint builtin = 0, string builtindir="models", bint fp16 = False, function config = None, bint path_is_serialization = False])`
33 | 
34 | Arguments:
35 | - `clip[] clips`: the input clips, only 32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki).
36 | - `string network_path`: the path to the network in ONNX format.
37 | - `int[] overlap`: some networks (e.g.
[CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlap (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlap size.
38 | - `int[] tilesize`: Even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlap). Please refer to network-specific docs on the recommended tile size.
39 | - `string device`: Specifies the device to run the inference on. Currently `"CPU"` and `"GPU"` are supported. `"GPU"` requires Intel graphics (Broadwell+ processors with Gen8+ integrated GPUs or Xe discrete GPUs) with a compatible graphics driver and compute runtime.
40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`.
41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models".
42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation.
43 | - `function config`: plugin configuration parameters. It must be a callable object (e.g. a function) that takes no positional arguments and returns the configuration parameters in a dictionary `dict`. The dictionary must use string `str` for its keys and `int`, `float` or `str` for its values. Supported parameters: [CPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_CPU.html#supported-configuration-parameters), [GPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_GPU.html#supported-configuration-parameters) (the prefix `KEY_` has to be removed). Example: `config = lambda: dict(CPU_THROUGHPUT_STREAMS=2)`
44 | - `bint path_is_serialization`: whether the `network_path` argument specifies an onnx serialization of type `bytes`.
45 | 
46 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
47 | 
48 | The general rule is to either:
49 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or
50 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown out so that only internal output pixels are used. A tiled invocation is sketched below.
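A minimal sketch of rule 2, assuming a hypothetical `model.onnx` that accepts a GRAYS clip (the tile numbers are illustrative, and `tilesize` includes the overlap):

```python3
import vapoursynth as vs

core = vs.core

src = core.std.BlankClip(width=1920, height=1080, format=vs.GRAYS)
# 1024x576 tiles with a 16-pixel overlap in each direction; the overlapped
# borders are computed but discarded when the tiles are stitched back together.
flt = core.ov.Model(src, "model.onnx", tilesize=[1024, 576], overlap=[16, 16], device="CPU")
```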
51 | 
--------------------------------------------------------------------------------
/vstrt/inference_helper.h:
--------------------------------------------------------------------------------
1 | #ifndef VSTRT_INFERENCE_HELPER_H_
2 | #define VSTRT_INFERENCE_HELPER_H_
3 | 
4 | #include <algorithm>
5 | #include <cstdint>
6 | #include <optional>
7 | #include <string>
8 | #include <vector>
9 | #include <cuda_runtime.h>
10 | 
11 | #include <VSHelper.h>
12 | 
13 | #include "cuda_helper.h"
14 | #include "trt_utils.h"
15 | 
16 | struct InputInfo {
17 | int width;
18 | int height;
19 | int pitch;
20 | int bytes_per_sample;
21 | int tile_w;
22 | int tile_h;
23 | };
24 | 
25 | struct OutputInfo {
26 | int pitch;
27 | int bytes_per_sample;
28 | };
29 | 
30 | struct IOInfo {
31 | InputInfo in;
32 | OutputInfo out;
33 | int w_scale;
34 | int h_scale;
35 | int overlap_w;
36 | int overlap_h;
37 | };
38 | 
39 | static inline
40 | std::optional<ErrorMessage> inference(
41 | const InferenceInstance & instance,
42 | int device_id,
43 | bool use_cuda_graph,
44 | const IOInfo & info,
45 | const std::vector<const uint8_t *> & src_ptrs,
46 | const std::vector<uint8_t *> & dst_ptrs
47 | ) noexcept {
48 | 
49 | const auto set_error = [](const ErrorMessage & error_message) {
50 | return error_message;
51 | };
52 | 
53 | checkError(cudaSetDevice(device_id));
54 | 
55 | int src_tile_w_bytes = info.in.tile_w * info.in.bytes_per_sample;
56 | int src_tile_bytes = info.in.tile_h * info.in.tile_w * info.in.bytes_per_sample;
57 | int dst_tile_w = info.in.tile_w * info.w_scale;
58 | int dst_tile_h = info.in.tile_h * info.h_scale;
59 | int dst_tile_w_bytes = dst_tile_w * info.out.bytes_per_sample;
60 | int dst_tile_bytes = dst_tile_h * dst_tile_w * info.out.bytes_per_sample;
61 | 
62 | int step_w = info.in.tile_w - 2 * info.overlap_w;
63 | int step_h = info.in.tile_h - 2 * info.overlap_h;
64 | 
65 | int y = 0;
66 | while (true) {
67 | int y_crop_start = (y == 0) ? 0 : info.overlap_h;
68 | int y_crop_end = (y == info.in.height - info.in.tile_h) ? 0 : info.overlap_h;
69 | 
70 | int x = 0;
71 | while (true) {
72 | int x_crop_start = (x == 0) ? 0 : info.overlap_w;
73 | int x_crop_end = (x == info.in.width - info.in.tile_w) ?
0 : info.overlap_w;
74 | 
75 | {
76 | uint8_t * h_data = instance.src.h_data.data;
77 | for (const uint8_t * _src_ptr : src_ptrs) {
78 | const uint8_t * src_ptr { _src_ptr +
79 | y * info.in.pitch + x * info.in.bytes_per_sample
80 | };
81 | 
82 | vs_bitblt(
83 | h_data, src_tile_w_bytes,
84 | src_ptr, info.in.pitch,
85 | static_cast<size_t>(src_tile_w_bytes),
86 | static_cast<size_t>(info.in.tile_h)
87 | );
88 | 
89 | h_data += src_tile_bytes;
90 | }
91 | }
92 | 
93 | if (use_cuda_graph) {
94 | checkError(cudaGraphLaunch(instance.graphexec, instance.stream));
95 | } else {
96 | auto result = enqueue(
97 | instance.src, instance.dst,
98 | instance.exec_context, instance.stream
99 | );
100 | 
101 | if (result.has_value()) {
102 | return set_error(result.value());
103 | }
104 | }
105 | checkError(cudaStreamSynchronize(instance.stream));
106 | 
107 | {
108 | const uint8_t * h_data = instance.dst.h_data.data;
109 | for (uint8_t * _dst_ptr : dst_ptrs) {
110 | uint8_t * dst_ptr { _dst_ptr +
111 | info.h_scale * y * info.out.pitch + info.w_scale * x * info.out.bytes_per_sample
112 | };
113 | 
114 | vs_bitblt(
115 | dst_ptr + (y_crop_start * info.out.pitch + x_crop_start * info.out.bytes_per_sample),
116 | info.out.pitch,
117 | h_data + (y_crop_start * dst_tile_w_bytes + x_crop_start * info.out.bytes_per_sample),
118 | dst_tile_w_bytes,
119 | static_cast<size_t>(dst_tile_w_bytes - (x_crop_start + x_crop_end) * info.out.bytes_per_sample),
120 | static_cast<size_t>(dst_tile_h - (y_crop_start + y_crop_end))
121 | );
122 | 
123 | h_data += dst_tile_bytes;
124 | }
125 | }
126 | 
127 | if (x + info.in.tile_w == info.in.width) {
128 | break;
129 | }
130 | 
131 | x = std::min(x + step_w, info.in.width - info.in.tile_w);
132 | }
133 | 
134 | if (y + info.in.tile_h == info.in.height) {
135 | break;
136 | }
137 | 
138 | y = std::min(y + step_h, info.in.height - info.in.tile_h);
139 | }
140 | 
141 | return {};
142 | }
143 | 
144 | #endif // VSTRT_INFERENCE_HELPER_H_
--------------------------------------------------------------------------------
/.github/workflows/linux-ov.yml:
--------------------------------------------------------------------------------
1 | name: Build (Linux-OV)
2 | 
3 | on:
4 | push:
5 | paths:
6 | - 'vsov/**'
7 | - '.github/workflows/linux-ov.yml'
8 | workflow_dispatch:
9 | 
10 | jobs:
11 | build-linux:
12 | runs-on: ubuntu-22.04
13 | 
14 | defaults:
15 | run:
16 | working-directory: vsov
17 | 
18 | steps:
19 | - name: Checkout repo
20 | uses: actions/checkout@v4
21 | with:
22 | fetch-depth: 0
23 | 
24 | - name: Setup Ninja
25 | run: pip install ninja
26 | 
27 | - name: Cache protobuf
28 | id: cache-protobuf
29 | uses: actions/cache@v4
30 | with:
31 | path: vsov/protobuf/install
32 | key: ${{ runner.os }}-vsov-protobuf-v1
33 | 
34 | - name: Checkout protobuf
35 | uses: actions/checkout@v4
36 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
37 | with:
38 | repository: protocolbuffers/protobuf
39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf
40 | # if you change this, remember to bump the version of the cache key.
41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu24_2024.6.0.17404.4c0f47d2335_x86_64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . 
-B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-x64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/linux-ov-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-OV, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsov/**' 7 | - '.github/workflows/linux-ov-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vsov 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Cache protobuf 28 | id: cache-protobuf 29 | uses: actions/cache@v4 30 | with: 31 | path: vsov/protobuf/install 32 | key: ${{ runner.os }}-vsov-protobuf-arm64-v1 33 | 34 | - name: Checkout protobuf 35 | uses: actions/checkout@v4 36 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 37 | with: 38 | repository: protocolbuffers/protobuf 39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf 40 | # if you change this, remember to bump the version of the cache key. 41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-arm64-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 
74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu20_2024.6.0.17404.4c0f47d2335_arm64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . -B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-ARM64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/windows-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/windows-trt.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2025 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vstrt 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Setup Ninja 42 | run: pip install ninja 43 | 44 | - name: Cache CUDA 45 | id: cache-cuda 46 | uses: actions/cache@v4 47 | with: 48 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 49 | key: ${{ runner.os }}-vstrt-cuda-13.0.2 50 | 51 | - name: Setup CUDA 52 | if: steps.cache-cuda.outputs.cache-hit != 'true' 53 | run: | 54 | curl -s -o 
cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe 55 | cuda_installer.exe -s nvcc_13.0 cudart_13.0 cuda_profiler_api_13.0 crt_13.0 nvptxcompiler_13.0 56 | 57 | - name: Download TensorRT 58 | run: | 59 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.14.1/zip/TensorRT-10.14.1.48.Windows.win10.cuda-13.0.zip 60 | unzip trt.zip 61 | mv TensorRT-*/ tensorrt/ 62 | 63 | - name: Download VapourSynth headers 64 | run: | 65 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 66 | unzip -q vs.zip 67 | mv vapoursynth-*/ vapoursynth/ 68 | 69 | - name: Configure 70 | run: cmake -S . -B build -G Ninja -LA 71 | -D CMAKE_BUILD_TYPE=Release 72 | -D CMAKE_CXX_FLAGS="/EHsc /Wall /wd4100 /wd4625 /wd4626 /wd4710 /wd4711 /wd4820 /wd4996 /wd5026 /wd5027" 73 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 74 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 75 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 76 | -D TENSORRT_HOME="%cd%\tensorrt" 77 | -D USE_NVINFER_PLUGIN=ON 78 | -D TENSORRT_LIBRARY_SUFFIX="_10" 79 | 80 | - name: Build 81 | run: cmake --build build --config Release --verbose 82 | 83 | - name: Install 84 | run: cmake --install build --prefix install 85 | 86 | - name: Checkout TensorRT OSS 87 | uses: actions/checkout@v4 88 | with: 89 | repository: NVIDIA/TensorRT 90 | ref: v10.14 91 | fetch-depth: 1 92 | path: tensorrt-oss 93 | 94 | - name: Override trtexec CMake file 95 | run: | 96 | mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec 97 | mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec 98 | mv longpath.manifest ../tensorrt-oss/samples/trtexec 99 | 100 | - name: Apply patch 101 | run: | 102 | mv trtexec/trtexec.patch ../tensorrt-oss 103 | cd ../tensorrt-oss 104 | 105 | git apply trtexec.patch --verbose 106 | 107 | - name: Apply patch2 108 | shell: bash 109 | run: sed -i 's/fp16 || bf16 || int8 || fp8 || int4 || best/0/g' ../tensorrt-oss/samples/common/sampleOptions.cpp 110 | 111 | - name: Configure trtexec 112 | run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja 113 | -D CMAKE_BUILD_TYPE=Release 114 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 115 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 116 | -D CMAKE_UNITY_BUILD=ON -D CMAKE_UNITY_BUILD_BATCH_SIZE=0 117 | -D CMAKE_CXX_STANDARD=20 118 | 119 | - name: Build trtexec 120 | run: cmake --build build_trtexec --verbose 121 | 122 | - name: Install trtexec 123 | run: cmake --install build_trtexec --prefix trtexec 124 | 125 | - name: Prepare for upload 126 | run: | 127 | mkdir artifact 128 | copy install\bin\vstrt.dll artifact\ 129 | mkdir artifact\vsmlrt-cuda 130 | copy trtexec\bin\trtexec.exe artifact\vsmlrt-cuda 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Dump dependencies 136 | run: dumpbin /dependents artifact\vstrt.dll 137 | 138 | - name: Upload 139 | uses: actions/upload-artifact@v4 140 | with: 141 | name: VSTRT-Windows-x64 142 | path: vstrt/artifact 143 | 144 | - name: Compress artifact for release 145 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 146 | run: | 147 | cd artifact 148 | 7z a -t7z -mx=7 ../../VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z . 
149 | 150 | - name: Release 151 | uses: softprops/action-gh-release@v2 152 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 153 | with: 154 | tag_name: ${{ inputs.tag }} 155 | files: VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z 156 | fail_on_unmatched_files: true 157 | generate_release_notes: false 158 | prerelease: true 159 | -------------------------------------------------------------------------------- /.github/workflows/linux-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/linux-ort.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-22.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsort 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsort/protobuf/install 33 | key: ${{ runner.os }}-vsort-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | # follows protobuf in https://github.com/microsoft/onnxruntime/blob/v1.17.1/cmake/external/onnxruntime_external_deps.cmake#L183 41 | # if you change this, remember to bump the version of the cache key. 42 | ref: v3.21.12 43 | fetch-depth: 1 44 | path: vsort/protobuf 45 | 46 | - name: Configure protobuf 47 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 48 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 49 | -D CMAKE_BUILD_TYPE=Release 50 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 51 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 52 | 53 | - name: Build protobuf 54 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 55 | run: cmake --build protobuf/build_rel --verbose 56 | 57 | - name: Install protobuf 58 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 59 | run: cmake --install protobuf/build_rel --prefix protobuf/install 60 | 61 | - name: Cache onnx 62 | id: cache-onnx 63 | uses: actions/cache@v4 64 | with: 65 | path: vsort/onnx/install 66 | key: ${{ runner.os }}-vsort-onnx-v1 67 | 68 | - name: Checkout onnx 69 | if: steps.cache-onnx.outputs.cache-hit != 'true' 70 | uses: actions/checkout@v4 71 | with: 72 | repository: onnx/onnx 73 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/v1.17.1/cmake/external 74 | # if you change this, remember to bump the version of the cache key. 
75 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c
76 | fetch-depth: 1
77 | path: vsort/onnx
78 | 
79 | - name: Configure onnx
80 | if: steps.cache-onnx.outputs.cache-hit != 'true'
81 | run: cmake -S onnx -B onnx/build -G Ninja -LA
82 | -D CMAKE_BUILD_TYPE=Release
83 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON
84 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc
85 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib
86 | -D Protobuf_LIBRARIES=protobuf/install/lib
87 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
88 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0
89 | 
90 | - name: Build onnx
91 | if: steps.cache-onnx.outputs.cache-hit != 'true'
92 | run: cmake --build onnx/build --verbose
93 | 
94 | - name: Install onnx
95 | if: steps.cache-onnx.outputs.cache-hit != 'true'
96 | run: cmake --install onnx/build --prefix onnx/install
97 | 
98 | - name: Download VapourSynth headers
99 | run: |
100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
101 | unzip -q vs.zip
102 | mv vapoursynth*/ vapoursynth
103 | 
104 | - name: Setup ONNX Runtime
105 | run: |
106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-cuda12-1.17.1.tgz
107 | tar -xf ort.tgz
108 | mv onnxruntime-* onnxruntime -v
109 | 
110 | - name: Setup CUDA
111 | run: |
112 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
113 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
114 | sudo apt-get update
115 | sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1
116 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV
117 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV
118 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV
119 | 
120 | - name: Configure
121 | run: cmake -S . -B build -G Ninja -LA
122 | -D CMAKE_BUILD_TYPE=Release
123 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3"
124 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include"
125 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include
126 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib
127 | -D ENABLE_CUDA=1
128 | -D CUDAToolkit_ROOT=/usr/local/cuda
129 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf
130 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX
131 | -D CMAKE_CXX_STANDARD=20
132 | 
133 | - name: Build
134 | run: cmake --build build --verbose
135 | 
136 | - name: Install
137 | run: cmake --install build --prefix install
138 | 
139 | - name: Prepare for upload
140 | run: |
141 | mkdir artifact
142 | cp -v install/lib/*.so artifact
143 | 
144 | - name: Describe
145 | run: git describe --tags --long
146 | 
147 | - name: Upload
148 | uses: actions/upload-artifact@v4
149 | with:
150 | name: vsort-linux-x64-cuda12.1
151 | path: vsort/artifact
152 | 
--------------------------------------------------------------------------------
/vsort/win32.cpp:
--------------------------------------------------------------------------------
1 | #ifdef _MSC_VER
2 | #include <windows.h>
3 | #include <delayimp.h>
4 | #include <cstdlib>
5 | #include <filesystem>
6 | #include <iostream>
7 | #include <map>
8 | #include <stdexcept>
9 | #include <string>
10 | #include <vector>
11 | #define DLL_DIR L"vsort"
12 | #define COMMON_CUDA_DIR L"vsmlrt-cuda"
13 | 
14 | namespace {
15 | std::vector<const wchar_t *> dlls = {
16 | // This list must be sorted by dependency.
17 | L"DirectML.dll", 18 | L"onnxruntime.dll", // must be the last 19 | }; 20 | 21 | static std::vector cudaDlls { 22 | L"cudart64", 23 | L"cublasLt64", L"cublas64", 24 | L"cufft64", 25 | // follows the dependency graph in 26 | // https://docs.nvidia.com/deeplearning/cudnn/backend/v9.12.0/api/overview.html#backend-api-overview 27 | L"cudnn_graph64", 28 | L"cudnn_engines_precompiled64", L"cudnn_heuristic64", L"cudnn_engines_runtime_compiled64", 29 | L"cudnn_ops64", L"cudnn_cnn64", L"cudnn_adv64", 30 | L"cudnn64", 31 | L"cupti64", 32 | }; 33 | 34 | bool verbose() { return getenv("VSORT_VERBOSE") != nullptr; } 35 | 36 | namespace fs = std::filesystem; 37 | static fs::path dllDir() { 38 | static const std::wstring res = []() -> std::wstring { 39 | HMODULE mod = 0; 40 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 41 | std::vector buf; 42 | size_t n = 0; 43 | do { 44 | buf.resize(buf.size() + MAX_PATH); 45 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 46 | } while (n >= buf.size()); 47 | buf.resize(n); 48 | std::wstring path(buf.begin(), buf.end()); 49 | return path; 50 | } 51 | throw std::runtime_error("unable to locate myself"); 52 | }(); 53 | return fs::path(res).parent_path(); 54 | } 55 | 56 | FARPROC loadDLLs() { 57 | fs::path dir = dllDir() / DLL_DIR; 58 | HMODULE h = nullptr; 59 | for (const auto dll: dlls) { 60 | fs::path p = dir / dll; 61 | std::wstring s = p; 62 | h = LoadLibraryW(s.c_str()); 63 | if (verbose()) 64 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl; 65 | if (!h) 66 | std::wcerr << DLL_DIR << L": failed to preload " << s << std::endl; 67 | } 68 | return (FARPROC)h; 69 | } 70 | 71 | static void *dummy() { // mimic OrtGetApiBase 72 | return nullptr; 73 | } 74 | 75 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 76 | switch (reason) { 77 | case dliNoteStartProcessing: 78 | case dliNoteEndProcessing: 79 | // Nothing to do here. 80 | break; 81 | case dliNotePreLoadLibrary: 82 | //std::cerr << "loading " << info->szDll << std::endl; 83 | if (std::string(info->szDll).find("onnxruntime.dll") != std::string::npos) 84 | return loadDLLs(); 85 | break; 86 | case dliNotePreGetProcAddress: 87 | // Nothing to do here. 88 | break; 89 | case dliFailLoadLib: 90 | case dliFailGetProc: 91 | // Returning NULL from error notifications will cause the delay load 92 | // runtime to raise a VcppException structured exception, that some code 93 | // might want to handle. 94 | // The SE will crash the process, so instead we return a dummy function. 95 | return (FARPROC)dummy; 96 | break; 97 | default: 98 | abort(); // unreachable. 99 | break; 100 | } 101 | // Returning NULL causes the delay load machinery to perform default 102 | // processing for this notification. 
103 | return NULL;
104 | }
105 | } // namespace
106 |
107 | extern "C" {
108 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook;
109 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook;
110 | };
111 |
112 | bool preloadCudaDlls() {
113 | std::map<std::wstring, std::filesystem::path> dllmap; // dll name prefix -> resolved file path
114 |
115 | auto findDllIn = [&](const std::filesystem::path &dir) {
116 | if (!std::filesystem::is_directory(dir))
117 | return;
118 | for (const auto &ent: std::filesystem::directory_iterator{dir}) {
119 | if (!ent.is_regular_file())
120 | continue;
121 | const auto path = ent.path();
122 | if (path.extension() != ".dll")
123 | continue;
124 | const std::wstring filename = path.filename().wstring();
125 | for (const auto &dll: cudaDlls) {
126 | if (dllmap.count(dll) > 0)
127 | continue;
128 | if (filename.find(dll) == 0) {
129 | if (verbose())
130 | std::wcerr << DLL_DIR << L": found " << path << L" for " << dll << std::endl;
131 | dllmap.insert({ dll, path });
132 | break;
133 | }
134 | }
135 | }
136 | };
137 | const fs::path dir = dllDir();
138 | findDllIn(dir / DLL_DIR);
139 | findDllIn(dir / COMMON_CUDA_DIR);
140 |
141 | if (verbose()) {
142 | for (const auto &pair: dllmap)
143 | std::wcerr << DLL_DIR << L": will load " << pair.first << L" from " << pair.second << std::endl;
144 | }
145 | for (const auto &dll: cudaDlls) {
146 | if (dllmap.count(dll) == 0) {
147 | if (verbose())
148 | std::wcerr << DLL_DIR << L": unable to preload " << dll << L": not found" << std::endl;
149 | // fail even when not verbose; LoadLibraryW on an empty path would fail below anyway
150 | return false;
151 | }
152 | std::wstring p = dllmap[dll];
153 | HMODULE h = LoadLibraryW(p.c_str());
154 | if (verbose())
155 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl;
156 | if (!h) return false;
157 | }
158 | return true;
159 | }
160 | #endif
161 |
--------------------------------------------------------------------------------
/vstrt/README.md:
--------------------------------------------------------------------------------
1 | # VapourSynth TensorRT & TensorRT-RTX
2 |
3 | The vs-tensorrt plugin provides an optimized CUDA runtime for some popular AI filters.
4 |
5 | ## Usage
6 |
7 | Prototype: `core.{trt, trt_rtx}.Model(clip[] clips, string engine_path[, int[] overlap, int[] tilesize, int device_id=0, bint use_cuda_graph=False, int num_streams=1, int verbosity=2, string flexible_output_prop=""])`
8 |
9 | Arguments:
10 | - `clip[] clips`: the input clips; only 32-bit floating point RGB or GRAY clips are supported. For model-specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki).
11 | - `string engine_path`: the path to the prebuilt engine (see below)
12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes and the input clip must be processed in tiles. The `overlap` argument specifies the overlap between adjacent tiles (horizontal and vertical, or both, in pixels) to minimize boundary issues. Please refer to the network-specific docs for the recommended overlap size.
13 | - `int[] tilesize`: even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlap). Please refer to the network-specific docs for the recommended tile size.
14 | - `int device_id`: Specifies the GPU device id to use, default 0.
Requires an Nvidia GPU of second-generation Kepler architecture or newer.
15 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead.
16 | - `int num_streams`: the number of concurrent CUDA streams to use. Default 1. Increase it if the GPU is not saturated.
17 | - `int verbosity`: the verbosity level of the TensorRT runtime. Messages are written to `stderr`.
18 | `0`: Internal error. `1`: Application error. `2`: Warning. `3`: Informational messages with instructional information. `4`: Verbose messages with debugging information.
19 | - `string flexible_output_prop`: used to support ONNX models with an arbitrary number of output planes.
20 |
21 | ```python3
22 | from typing import TypedDict
23 |
24 | class Output(TypedDict):
25 | clip: vs.VideoNode
26 | num_planes: int
27 |
28 | prop = "planes" # arbitrary non-empty string
29 | output = core.trt.Model(src, engine_path, flexible_output_prop=prop) # type: Output
30 |
31 | clip = output["clip"]
32 | num_planes = output["num_planes"]
33 |
34 | output_planes = [
35 | clip.std.PropToClip(prop=f"{prop}{i}")
36 | for i in range(num_planes)
37 | ] # type: list[vs.VideoNode]
38 | ```
39 |
40 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
41 |
42 | The general rule is to either:
43 | 1. leave out `overlap` and `tilesize` entirely and just process the input frame in one tile, or
44 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region is thrown away so that only interior output pixels are used (see the sketch below).
45 |
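As an illustration of rule 2, here is a minimal sketch of tiled processing (the engine file name and the tile/overlap values are illustrative placeholders, not recommendations; consult the model-specific docs for suitable numbers):

```python3
# src: a 1920x1080 GRAYS clip; "some_model_dynamic.engine" is a hypothetical
# engine built with dynamic shape support (see the next section).
# The frame is processed as a 2x2 grid of 976x556 tiles that overlap by
# 16 pixels in each direction; the overlapped margins are discarded, so
# tile boundaries do not show up in the output.
flt = core.trt.Model(
    src,
    engine_path="some_model_dynamic.engine",
    overlap=[16, 16],
    tilesize=[976, 556],
)
```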
46 | ## Instructions for TensorRT
47 |
48 | ### Build engine with dynamic shape support
49 | - Requires models with built-in dynamic shape support, e.g. `waifu2x_v3.7z` and `dpir_v3.7z`.
50 |
51 | 1. Build engine
52 | ```shell
53 | trtexec --onnx=drunet_gray.onnx --minShapes=input:1x2x8x8 --optShapes=input:1x2x64x64 --maxShapes=input:1x2x1080x1920 --saveEngine=dpir_gray_1080p_dynamic.engine
54 | ```
55 |
56 | The engine will be optimized for `64x64` input and can be applied to eligible inputs with shapes from `8x8` to `1920x1080` by specifying the `tilesize` parameter of the `trt` plugin.
57 |
58 | Also check the [trtexec useful arguments](#trtexec-useful-arguments) below.
59 |
60 | ### Run model
61 | In a vpy script:
62 | ```python3
63 | # DPIR
64 | src = core.std.BlankClip(src, width=640, height=360, format=vs.GRAYS)
65 | sigma = 10.0
66 | flt = core.trt.Model([src, core.std.BlankClip(src, color=sigma/255.0)], engine_path="dpir_gray_1080p_dynamic.engine", tilesize=[640, 360])
67 | ```
68 |
69 | ## trtexec useful arguments
70 | - `--workspace=N`: Set workspace size in megabytes (default = 16)
71 |
72 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled)
73 |
74 | - `--noTF32`: Disable tf32 precision (default is to enable tf32, in addition to fp32, Ampere only)
75 |
76 | - `--device=N`: Select cuda device N (default = 0)
77 |
78 | - `--timingCacheFile=`: Save/load the serialized global timing cache
79 |
80 | - `--verbose`: Use verbose logging (default = false)
81 |
82 | - `--profilingVerbosity=mode`: Specify profiling verbosity.
83 |
84 | ```
85 | mode ::= layer_names_only|detailed|none
86 | ```
87 |
88 | (default = layer_names_only)
89 |
90 | - `--tacticSources=tactics`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default
91 | tactic sources (default = all available tactics).
92 |
93 |
94 | Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics.
95 |
96 | Tactic Sources:
97 | ```
98 | tactics ::= [","tactic]
99 | tactic ::= (+|-)lib
100 | lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"
101 | ```
102 |
103 | For example, to disable cudnn and enable cublas: `--tacticSources=-CUDNN,+CUBLAS`
104 |
105 | - `--useCudaGraph`: Use CUDA graph to capture engine execution and then launch inference (default = disabled).
106 | This flag may be ignored if the graph capture fails.
107 |
108 | - `--noDataTransfers`: Disable DMA transfers to and from device (default = enabled).
109 |
110 | - `--saveEngine=`: Save the serialized engine
111 |
112 | - `--loadEngine=`: Load a serialized engine
113 |
114 | ## Instructions for TensorRT-RTX
115 | Replace the `trtexec` executable with the `tensorrt_rtx` executable. Some options may not be supported, e.g. `--fp16`.
116 |
117 |
--------------------------------------------------------------------------------
/.github/workflows/windows-ncnn.yml:
--------------------------------------------------------------------------------
1 | name: Build (Windows-NCNN)
2 |
3 | on:
4 | push:
5 | paths:
6 | - 'common/**'
7 | - 'vsncnn/**'
8 | - '.github/workflows/windows-ncnn.yml'
9 | workflow_call:
10 | inputs:
11 | tag:
12 | description: 'which tag to upload to'
13 | required: true
14 | type: string
15 | workflow_dispatch:
16 | inputs:
17 | tag:
18 | description: 'which tag to upload to'
19 | default: ''
20 |
21 | jobs:
22 | build-windows:
23 | runs-on: windows-2025
24 |
25 | defaults:
26 | run:
27 | shell: cmd
28 | working-directory: vsncnn
29 |
30 | steps:
31 | - name: Checkout repo
32 | uses: actions/checkout@v5
33 | with:
34 | fetch-depth: 0
35 |
36 | - name: Setup MSVC
37 | uses: ilammy/msvc-dev-cmd@v1
38 |
39 | - name: Cache onnx
40 | id: cache-onnx
41 | uses: actions/cache@v4
42 | with:
43 | path: vsncnn/onnx/install
44 | key: ${{ runner.os }}-vsncnn-onnx-v1
45 |
46 | - name: Checkout onnx
47 | if: steps.cache-onnx.outputs.cache-hit != 'true'
48 | uses: actions/checkout@v4
49 | with:
50 | repository: onnx/onnx
51 | ref: v1.19.0
52 | path: vsncnn/onnx
53 |
54 | - name: Configure onnx
55 | if: steps.cache-onnx.outputs.cache-hit != 'true'
56 | run: cmake -S onnx -B onnx\build -G Ninja -LA
57 | -D CMAKE_BUILD_TYPE=Release
58 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
59 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0
60 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1
61 | -D ONNX_BUILD_CUSTOM_PROTOBUF=ON
62 |
63 | - name: Build onnx
64 | if: steps.cache-onnx.outputs.cache-hit != 'true'
65 | run: cmake --build onnx\build --verbose
66 |
67 | - name: Install onnx
68 | if: steps.cache-onnx.outputs.cache-hit != 'true'
69 | run: cmake --install onnx\build --prefix onnx\install
70 |
71 | - name: Download VapourSynth headers
72 | run: |
73 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip
74 | unzip -q vs.zip
75 | mv vapoursynth-*/ vapoursynth/
76 |
77 | - name: Download NCNN Precompilation
78 | shell: bash
79 | run: |
80 | curl -s -o ncnn.zip -LJO https://github.com/AmusementClub/ncnn/releases/download/250919-1038-g86efe80/ncnn-gpu-x64-windows.zip
81 | unzip -q ncnn.zip
82 |
83 | # follows vulkan sdk in
https://github.com/AmusementClub/ncnn/blob/github-actions/.github/workflows/windows-x64-gpu.yml 84 | - name: Setup Vulkan SDK 85 | shell: pwsh 86 | run: | 87 | Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe?Human=true -OutFile VulkanSDK.exe 88 | $installer = Start-Process -FilePath VulkanSDK.exe -Wait -PassThru -ArgumentList "--accept-licenses --default-answer --confirm-command install"; 89 | $installer.WaitForExit(); 90 | 91 | - name: Configure 92 | run: cmake -S . -B build -G Ninja -LA 93 | -D CMAKE_BUILD_TYPE=Release 94 | -D CMAKE_PREFIX_PATH=onnx\install 95 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 96 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 97 | -D ncnn_DIR=ncnn\lib\cmake\ncnn 98 | -D CMAKE_CXX_STANDARD=20 99 | env: 100 | VULKAN_SDK: C:\VulkanSDK\1.4.321.1 101 | 102 | - name: Build 103 | run: cmake --build build --verbose 104 | 105 | - name: Install 106 | run: | 107 | cmake --install build --prefix install 108 | mkdir artifact 109 | copy install\bin\vsncnn.dll artifact\ 110 | 111 | - name: Upload 112 | uses: actions/upload-artifact@v4 113 | with: 114 | name: VSNCNN-GPU-Windows-x64 115 | path: vsncnn/artifact 116 | 117 | - name: Setup Python portable 118 | if: false 119 | run: | 120 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 121 | 7z x python.zip -ovs_portable 122 | 123 | - name: Install VapourSynth portable 124 | if: false 125 | run: | 126 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 127 | 7z x vs.7z -ovs_portable -y 128 | 129 | - name: Copy plugin & swiftshader 130 | if: false 131 | run: | 132 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 133 | copy ncnn\tests\* vs_portable\ 134 | 135 | - name: Install waifu2x model 136 | if: false 137 | run: | 138 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 139 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 140 | 141 | - name: Download x265 142 | if: false 143 | run: | 144 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 145 | 7z x x265.7z -ovs_portable\ 146 | 147 | - name: Create script 148 | if: false 149 | shell: bash 150 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 151 | 152 | - name: Run vspipe 153 | if: false 154 | shell: bash 155 | run: | 156 | set -ex 157 | vs_portable/vspipe -i test.vpy - 158 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 159 | ls -l out.hevc x265.log 160 | cat x265.log 161 | grep -F 'encoded 10 frames' x265.log || exit 2 162 | grep -i 'error' x265.log && exit 1 163 | exit 0 164 | 165 | - name: Create script (flexible output) 166 | if: false 167 | shell: bash 168 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS, width=127, 
height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 169 | 170 | - name: Run vspipe (flexible output) 171 | if: false 172 | shell: bash 173 | run: | 174 | set -ex 175 | vs_portable/vspipe -i test_flexible_output.vpy - 176 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 177 | ls -l out.hevc x265.log 178 | cat x265.log 179 | grep -F 'encoded 10 frames' x265.log || exit 2 180 | grep -i 'error' x265.log && exit 1 181 | exit 0 182 | 183 | - name: Describe 184 | run: git describe --tags --long 185 | 186 | - name: Dump dependencies 187 | run: dumpbin /dependents artifact\vsncnn.dll 188 | 189 | - name: Compress artifact for release 190 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 191 | run: | 192 | cd artifact 193 | 7z a -t7z -mx=7 ../../VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z . 194 | 195 | - name: Release 196 | uses: softprops/action-gh-release@v2 197 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 198 | with: 199 | tag_name: ${{ inputs.tag }} 200 | files: VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z 201 | fail_on_unmatched_files: true 202 | generate_release_notes: false 203 | prerelease: true 204 | -------------------------------------------------------------------------------- /.github/workflows/windows-ov.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-OV) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsov/**' 8 | - '.github/workflows/windows-ov.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | ov_tag: 16 | description: 'which tag of openvino to use' 17 | required: true 18 | default: 'latest' 19 | type: string 20 | workflow_dispatch: 21 | inputs: 22 | tag: 23 | description: 'which tag to upload to' 24 | default: '' 25 | ov_tag: 26 | description: 'which tag of openvino to use' 27 | required: true 28 | default: 'latest' 29 | type: string 30 | 31 | jobs: 32 | build-windows: 33 | runs-on: windows-2022 34 | 35 | defaults: 36 | run: 37 | shell: cmd 38 | working-directory: vsov 39 | 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v4 43 | with: 44 | fetch-depth: 0 45 | 46 | - name: Setup MSVC 47 | uses: ilammy/msvc-dev-cmd@v1 48 | 49 | - name: Setup Ninja 50 | run: pip install ninja 51 | 52 | - name: Cache protobuf 53 | id: cache-protobuf 54 | uses: actions/cache@v4 55 | with: 56 | path: vsov/protobuf/install 57 | key: ${{ runner.os }}-vsov-protobuf-v3 58 | 59 | - name: Checkout protobuf 60 | uses: actions/checkout@v4 61 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 62 | with: 63 | repository: protocolbuffers/protobuf 64 | # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf 65 | # if you change this, remember to bump the version of the cache key. 
66 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 67 | fetch-depth: 1 68 | path: vsov/protobuf 69 | 70 | - name: Configure protobuf 71 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 72 | run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA 73 | -D CMAKE_BUILD_TYPE=Release 74 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 75 | 76 | - name: Build protobuf 77 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 78 | run: cmake --build protobuf\build_rel --verbose 79 | 80 | - name: Install protobuf 81 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 82 | run: cmake --install protobuf\build_rel --prefix protobuf\install 83 | 84 | - name: Cache onnx 85 | id: cache-onnx 86 | uses: actions/cache@v4 87 | with: 88 | path: vsov/onnx/install 89 | key: ${{ runner.os }}-vsov-onnx-v3 90 | 91 | - name: Checkout onnx 92 | if: steps.cache-onnx.outputs.cache-hit != 'true' 93 | uses: actions/checkout@v4 94 | with: 95 | repository: onnx/onnx 96 | # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx 97 | # if you change this, remember to bump the version of the cache key. 98 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 99 | fetch-depth: 1 100 | path: vsov/onnx 101 | 102 | - name: Configure onnx 103 | if: steps.cache-onnx.outputs.cache-hit != 'true' 104 | run: cmake -S onnx -B onnx\build -G Ninja -LA 105 | -D CMAKE_BUILD_TYPE=Release 106 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 107 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 108 | -D Protobuf_LIBRARIES=protobuf\install\lib 109 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 110 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 111 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 112 | 113 | - name: Build onnx 114 | if: steps.cache-onnx.outputs.cache-hit != 'true' 115 | run: cmake --build onnx\build --verbose 116 | 117 | - name: Install onnx 118 | if: steps.cache-onnx.outputs.cache-hit != 'true' 119 | run: cmake --install onnx\build --prefix onnx\install 120 | 121 | - name: Download VapourSynth headers 122 | run: | 123 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 124 | unzip -q vs.zip 125 | mv vapoursynth-*/ vapoursynth/ 126 | 127 | - name: Download OpenVINO Runtime Precompilation 128 | shell: bash 129 | run: | 130 | # rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}" 131 | # if [ "$rev" == "latest" ]; then 132 | # url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip" 133 | # else 134 | # url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip" 135 | # fi 136 | url="https://github.com/AmusementClub/openvino/releases/download/2020.2-15171-g4655dd6ce3-2058-g5833781ddb/openvino-gpu-win64.zip" 137 | curl -s -o openvino.zip -LJO "$url" 138 | unzip -q openvino.zip 139 | 140 | - name: Configure 141 | run: cmake -S . 
-B build -G Ninja -D CMAKE_BUILD_TYPE=Release 142 | -D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON 143 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 144 | -D OpenVINO_DIR=openvino/runtime/cmake 145 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 146 | -D ENABLE_VISUALIZATION=ON 147 | -D WIN32_SHARED_OPENVINO=ON 148 | -D protobuf_DIR=protobuf\install\cmake 149 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 150 | 151 | - name: Build 152 | run: cmake --build build --verbose 153 | 154 | - name: Install 155 | run: | 156 | cmake --install build --prefix install 157 | mkdir artifact 158 | mkdir artifact\vsov 159 | copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\ 160 | copy install\bin\vsov.dll artifact\ 161 | xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s 162 | 163 | - name: Upload 164 | uses: actions/upload-artifact@v4 165 | with: 166 | name: VSOV-Windows-x64 167 | path: vsov/artifact 168 | 169 | - name: Setup Python portable 170 | run: | 171 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 172 | 7z x python.zip -ovs_portable 173 | 174 | - name: Install VapourSynth portable 175 | run: | 176 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 177 | 7z x vs.7z -ovs_portable -y 178 | 179 | - name: Copy plugin 180 | run: | 181 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 182 | mkdir vs_portable\vapoursynth64\plugins\vsov\ 183 | copy artifact\vsov\* vs_portable\vapoursynth64\plugins\vsov\ 184 | 185 | - name: Install waifu2x model 186 | run: | 187 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 188 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 189 | 190 | - name: Download x265 191 | run: | 192 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 193 | 7z x x265.7z -ovs_portable\ 194 | 195 | - name: Create script 196 | shell: bash 197 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 198 | 199 | - name: Run vspipe 200 | shell: bash 201 | run: | 202 | set -ex 203 | vs_portable/vspipe -i test.vpy - 204 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 205 | ls -l out.hevc x265.log 206 | cat x265.log 207 | grep -F 'encoded 10 frames' x265.log || exit 2 208 | grep -i 'error' x265.log && exit 1 209 | exit 0 210 | 211 | - name: Create script (fp16) 212 | shell: bash 213 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 214 | 215 | - name: Run vspipe (fp16) 216 | shell: bash 217 | run: | 218 | set -ex 219 | vs_portable/vspipe -i test_fp16.vpy - 220 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 221 | ls -l out.hevc x265.log 222 | cat x265.log 223 | 
grep -F 'encoded 10 frames' x265.log || exit 2 224 | grep -i 'error' x265.log && exit 1 225 | exit 0 226 | 227 | - name: Create script (flexible output) 228 | shell: bash 229 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);prop=\"test\";output=core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output[\"clip\"].std.PropToClip(prop=f\"{prop}{i}\") for i in range(output[\"num_planes\"])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 230 | 231 | - name: Run vspipe (flexible output) 232 | shell: bash 233 | run: | 234 | set -ex 235 | vs_portable/vspipe -i test_flexible_output.vpy - 236 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 237 | ls -l out.hevc x265.log 238 | cat x265.log 239 | grep -F 'encoded 10 frames' x265.log || exit 2 240 | grep -i 'error' x265.log && exit 1 241 | exit 0 242 | 243 | - name: Describe 244 | run: git describe --tags --long 245 | 246 | - name: Dump dependencies 247 | run: dumpbin /dependents artifact\vsov.dll 248 | 249 | - name: Compress artifact for release 250 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 251 | run: | 252 | cd artifact 253 | 7z a -t7z -mx=7 ../../VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z . 254 | 255 | - name: Release 256 | uses: softprops/action-gh-release@v2 257 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 258 | with: 259 | tag_name: ${{ inputs.tag }} 260 | files: VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z 261 | fail_on_unmatched_files: true 262 | generate_release_notes: false 263 | prerelease: true 264 | -------------------------------------------------------------------------------- /vstrt/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_UTILS_H_ 2 | #define VSTRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #ifdef __cpp_impl_reflection 18 | #include 19 | #endif 20 | 21 | static inline 22 | void setDimensions( 23 | std::unique_ptr & vi, 24 | const std::unique_ptr & exec_context, 25 | VSCore * core, 26 | const VSAPI * vsapi, 27 | int sample_type, 28 | int bits_per_sample, 29 | bool flexible_output 30 | ) noexcept { 31 | 32 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 33 | auto input_name = exec_context->getEngine().getIOTensorName(0); 34 | auto output_name = exec_context->getEngine().getIOTensorName(1); 35 | const nvinfer1::Dims & in_dims = exec_context->getTensorShape(input_name); 36 | const nvinfer1::Dims & out_dims = exec_context->getTensorShape(output_name); 37 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 38 | const nvinfer1::Dims & in_dims = exec_context->getBindingDimensions(0); 39 | const nvinfer1::Dims & out_dims = exec_context->getBindingDimensions(1); 40 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 41 | 42 | auto in_height = static_cast(in_dims.d[2]); 43 | auto in_width = static_cast(in_dims.d[3]); 44 | 45 | auto out_height = static_cast(out_dims.d[2]); 46 | auto out_width = 
static_cast(out_dims.d[3]); 47 | 48 | vi->height *= out_height / in_height; 49 | vi->width *= out_width / in_width; 50 | 51 | if (out_dims.d[1] == 1 || flexible_output) { 52 | vi->format = vsapi->registerFormat(cmGray, sample_type, bits_per_sample, 0, 0, core); 53 | } else if (out_dims.d[1] == 3) { 54 | vi->format = vsapi->registerFormat(cmRGB, sample_type, bits_per_sample, 0, 0, core); 55 | } 56 | } 57 | 58 | static inline 59 | std::vector getVideoInfo( 60 | const VSAPI * vsapi, 61 | const std::vector & nodes 62 | ) noexcept { 63 | 64 | std::vector vis; 65 | vis.reserve(std::size(nodes)); 66 | 67 | for (const auto & node : nodes) { 68 | vis.emplace_back(vsapi->getVideoInfo(node)); 69 | } 70 | 71 | return vis; 72 | } 73 | 74 | static inline 75 | std::vector getFrames( 76 | int n, 77 | const VSAPI * vsapi, 78 | VSFrameContext * frameCtx, 79 | const std::vector & nodes 80 | ) noexcept { 81 | 82 | std::vector frames; 83 | frames.reserve(std::size(nodes)); 84 | 85 | for (const auto & node : nodes) { 86 | frames.emplace_back(vsapi->getFrameFilter(n, node, frameCtx)); 87 | } 88 | 89 | return frames; 90 | } 91 | 92 | static inline 93 | std::optional checkNodes( 94 | const std::vector & vis 95 | ) noexcept { 96 | 97 | for (const auto & vi : vis) { 98 | if (!isConstantFormat(vi)) { 99 | return "video format must be constant"; 100 | } 101 | 102 | if (vi->width != vis[0]->width || vi->height != vis[0]->height) { 103 | return "dimensions of clips mismatch"; 104 | } 105 | 106 | if (vi->numFrames != vis[0]->numFrames) { 107 | return "number of frames mismatch"; 108 | } 109 | 110 | if (vi->format->subSamplingH != 0 || vi->format->subSamplingW != 0) { 111 | return "clip must not be sub-sampled"; 112 | } 113 | } 114 | 115 | return {}; 116 | } 117 | 118 | static inline 119 | std::optional checkNodes( 120 | const std::vector & vis, 121 | int sample_type, 122 | int bits_per_sample 123 | ) noexcept { 124 | 125 | for (const auto & vi : vis) { 126 | if (vi->format->sampleType != sample_type) { 127 | return "sample type mismatch"; 128 | } 129 | 130 | if (vi->format->bitsPerSample != bits_per_sample) { 131 | return "bits per sample mismatch"; 132 | } 133 | } 134 | 135 | return {}; 136 | } 137 | 138 | static inline 139 | int numPlanes( 140 | const std::vector & vis 141 | ) noexcept { 142 | 143 | int num_planes = 0; 144 | 145 | for (const auto & vi : vis) { 146 | num_planes += vi->format->numPlanes; 147 | } 148 | 149 | return num_planes; 150 | } 151 | 152 | static inline 153 | std::optional checkNodesAndContext( 154 | const std::unique_ptr & execution_context, 155 | const std::vector & vis 156 | ) noexcept { 157 | 158 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 159 | auto input_name = execution_context->getEngine().getIOTensorName(0); 160 | const nvinfer1::Dims & network_in_dims = execution_context->getTensorShape(input_name); 161 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 162 | const nvinfer1::Dims & network_in_dims = execution_context->getBindingDimensions(0); 163 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 164 | 165 | auto network_in_channels = network_in_dims.d[1]; 166 | int num_planes = numPlanes(vis); 167 | if (network_in_channels != num_planes) { 168 | return "expects " + std::to_string(network_in_channels) + " input planes"; 169 | } 170 | 171 | auto network_in_height = network_in_dims.d[2]; 172 | auto network_in_width = network_in_dims.d[3]; 173 | int clip_in_height = 
vis[0]->height; 174 | int clip_in_width = vis[0]->width; 175 | 176 | if (network_in_height > clip_in_height || network_in_width > clip_in_width) { 177 | return "tile size larger than clip dimension"; 178 | } 179 | 180 | return {}; 181 | } 182 | 183 | static inline void VS_CC getDeviceProp( 184 | const VSMap *in, VSMap *out, void *userData, 185 | VSCore *core, const VSAPI *vsapi 186 | ) { 187 | 188 | int err; 189 | int device_id = static_cast(vsapi->propGetInt(in, "device_id", 0, &err)); 190 | if (err) { 191 | device_id = 0; 192 | } 193 | 194 | cudaDeviceProp prop; 195 | if (auto error = cudaGetDeviceProperties(&prop, device_id); error != cudaSuccess) { 196 | vsapi->setError(out, cudaGetErrorString(error)); 197 | return ; 198 | } 199 | 200 | auto setProp = [&](const char * name, const auto & value, int data_length = -1) { 201 | using T = std::decay_t; 202 | if constexpr (std::is_integral_v) { 203 | vsapi->propSetInt(out, name, static_cast(value), paReplace); 204 | } else if constexpr (std::is_same_v) { 205 | vsapi->propSetData(out, name, value, data_length, paReplace); 206 | } else if constexpr (std::is_integral_v>) { 207 | std::array>> data; 208 | for (int i = 0; i < static_cast(std::size(data)); i++) { 209 | data[i] = value[i]; 210 | } 211 | vsapi->propSetIntArray(out, name, std::data(data), static_cast(std::size(data))); 212 | } 213 | }; 214 | 215 | int driver_version; 216 | cudaDriverGetVersion(&driver_version); 217 | setProp("driver_version", driver_version); 218 | 219 | #ifdef __cpp_impl_reflection 220 | constexpr auto ctx = std::meta::access_context::current(); 221 | template for ( 222 | constexpr auto r : define_static_array(nonstatic_data_members_of(^^decltype(prop), ctx)) 223 | ) { 224 | if constexpr (identifier_of(r) == "uuid") { 225 | std::array uuid; 226 | for (int i = 0; i < 16; ++i) { 227 | uuid[i] = prop.uuid.bytes[i]; 228 | } 229 | vsapi->propSetIntArray(out, "uuid", std::data(uuid), static_cast(std::size(uuid))); 230 | } else if constexpr (identifier_of(r) != "reserved") { 231 | setProp(std::string(identifier_of(r)).c_str(), prop.[:r:]); 232 | } 233 | } 234 | #else // __cpp_impl_reflection 235 | setProp("name", prop.name); 236 | { 237 | std::array uuid; 238 | for (int i = 0; i < 16; ++i) { 239 | uuid[i] = prop.uuid.bytes[i]; 240 | } 241 | vsapi->propSetIntArray(out, "uuid", std::data(uuid), static_cast(std::size(uuid))); 242 | } 243 | setProp("total_global_memory", prop.totalGlobalMem); 244 | setProp("shared_memory_per_block", prop.sharedMemPerBlock); 245 | setProp("regs_per_block", prop.regsPerBlock); 246 | setProp("warp_size", prop.warpSize); 247 | setProp("mem_pitch", prop.memPitch); 248 | setProp("max_threads_per_block", prop.maxThreadsPerBlock); 249 | setProp("total_const_mem", prop.totalConstMem); 250 | setProp("major", prop.major); 251 | setProp("minor", prop.minor); 252 | setProp("texture_alignment", prop.textureAlignment); 253 | setProp("texture_pitch_alignment", prop.texturePitchAlignment); 254 | setProp("multi_processor_count", prop.multiProcessorCount); 255 | setProp("integrated", prop.integrated); 256 | setProp("can_map_host_memory", prop.canMapHostMemory); 257 | setProp("concurrent_kernels", prop.concurrentKernels); 258 | setProp("ecc_enabled", prop.ECCEnabled); 259 | setProp("pci_bus_id", prop.pciBusID); 260 | setProp("pci_device_id", prop.pciDeviceID); 261 | setProp("pci_domain_id", prop.pciDomainID); 262 | setProp("tcc_driver", prop.tccDriver); 263 | setProp("async_engine_count", prop.asyncEngineCount); 264 | setProp("unified_addressing", 
prop.unifiedAddressing);
265 | setProp("memory_bus_width", prop.memoryBusWidth);
266 | setProp("l2_cache_size", prop.l2CacheSize);
267 | setProp("persisting_l2_cache_max_size", prop.persistingL2CacheMaxSize);
268 | setProp("max_threads_per_multiprocessor", prop.maxThreadsPerMultiProcessor);
269 | setProp("stream_priorities_supported", prop.streamPrioritiesSupported);
270 | setProp("global_l1_cache_supported", prop.globalL1CacheSupported);
271 | setProp("local_l1_cache_supported", prop.localL1CacheSupported);
272 | setProp("shared_mem_per_multiprocessor", prop.sharedMemPerMultiprocessor);
273 | setProp("regs_per_multiprocessor", prop.regsPerMultiprocessor);
274 | setProp("managed_memory", prop.managedMemory);
275 | setProp("is_multi_gpu_board", prop.isMultiGpuBoard);
276 | setProp("multi_gpu_board_group_id", prop.multiGpuBoardGroupID);
277 | setProp("host_native_atomic_supported", prop.hostNativeAtomicSupported);
278 | setProp("pageable_memory_access", prop.pageableMemoryAccess);
279 | setProp("concurrent_managed_access", prop.concurrentManagedAccess);
280 | setProp("compute_preemption_supported", prop.computePreemptionSupported);
281 | setProp(
282 | "can_use_host_pointer_for_registered_mem",
283 | prop.canUseHostPointerForRegisteredMem
284 | );
285 | setProp("cooperative_launch", prop.cooperativeLaunch);
286 | setProp("shared_mem_per_block_optin", prop.sharedMemPerBlockOptin);
287 | setProp(
288 | "pageable_memory_access_uses_host_page_tables",
289 | prop.pageableMemoryAccessUsesHostPageTables
290 | );
291 | setProp("direct_managed_mem_access_from_host", prop.directManagedMemAccessFromHost);
292 | setProp("max_blocks_per_multi_processor", prop.maxBlocksPerMultiProcessor);
293 | setProp("access_policy_max_window_size", prop.accessPolicyMaxWindowSize);
294 | setProp("reserved_shared_mem_per_block", prop.reservedSharedMemPerBlock);
295 | #endif // __cpp_impl_reflection
296 | };
297 |
298 | #endif // VSTRT_UTILS_H_
299 |
--------------------------------------------------------------------------------
/.github/workflows/windows-ort.yml:
--------------------------------------------------------------------------------
1 | name: Build (Windows-ORT)
2 |
3 | on:
4 | push:
5 | paths:
6 | - 'common/**'
7 | - 'vsort/**'
8 | - '.github/workflows/windows-ort.yml'
9 | workflow_call:
10 | inputs:
11 | tag:
12 | description: 'which tag to upload to'
13 | required: true
14 | type: string
15 | workflow_dispatch:
16 | inputs:
17 | tag:
18 | description: 'which tag to upload to'
19 | default: ''
20 |
21 | jobs:
22 | build-windows:
23 | runs-on: windows-2025
24 |
25 | defaults:
26 | run:
27 | shell: cmd
28 | working-directory: vsort
29 |
30 | steps:
31 | - name: Checkout repo
32 | uses: actions/checkout@v4
33 | with:
34 | fetch-depth: 0
35 |
36 | - name: Setup MSVC
37 | uses: ilammy/msvc-dev-cmd@v1
38 |
39 | - name: Setup Ninja
40 | run: pip install ninja
41 |
42 | - name: Restore cached onnx
43 | id: cache-onnx
44 | uses: actions/cache/restore@v4
45 | with:
46 | path: vsort/onnx/install
47 | key: ${{ runner.os }}-vsort-onnx-v6
48 |
49 | - name: Checkout onnx
50 | if: steps.cache-onnx.outputs.cache-hit != 'true'
51 | uses: actions/checkout@v4
52 | with:
53 | repository: onnx/onnx
54 | # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
55 | # if you change this, remember to bump the version of the cache key.
56 | ref: v1.19.0 57 | fetch-depth: 1 58 | path: vsort/onnx 59 | 60 | - name: Configure onnx 61 | if: steps.cache-onnx.outputs.cache-hit != 'true' 62 | run: cmake -S onnx -B onnx\build -G Ninja -LA 63 | -D CMAKE_BUILD_TYPE=Release 64 | -D CMAKE_PREFIX_PATH=%cd%\protobuf\install\lib\cmake 65 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 66 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 67 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 68 | -D ONNX_BUILD_CUSTOM_PROTOBUF=ON 69 | 70 | - name: Build onnx 71 | if: steps.cache-onnx.outputs.cache-hit != 'true' 72 | run: cmake --build onnx\build --verbose 73 | 74 | - name: Install onnx 75 | if: steps.cache-onnx.outputs.cache-hit != 'true' 76 | run: cmake --install onnx\build --prefix onnx\install 77 | 78 | - name: Save onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | uses: actions/cache/save@v4 81 | with: 82 | path: vsort/onnx/install 83 | key: ${{ steps.cache-onnx.outputs.cache-primary-key }} 84 | 85 | - name: Download VapourSynth headers 86 | run: | 87 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 88 | unzip -q vs.zip 89 | mv vapoursynth-*/ vapoursynth/ 90 | 91 | - name: Download ONNX Runtime Precompilation 92 | run: | 93 | curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-10525-gecb26fb775-250906-0716/onnxruntime-gpu-win64.zip 94 | unzip -q ortgpu.zip 95 | 96 | - name: Restore cached CUDA 97 | id: cache-cuda 98 | uses: actions/cache/restore@v4 99 | with: 100 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 101 | key: ${{ runner.os }}-cuda-13.0.1 102 | 103 | - name: Setup CUDA 104 | if: steps.cache-cuda.outputs.cache-hit != 'true' 105 | run: | 106 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.1/network_installers/cuda_13.0.1_windows_network.exe 107 | cuda_installer.exe -s nvcc_13.0 cudart_13.0 crt_13.0 nvptxcompiler_13.0 108 | 109 | - name: Save CUDA 110 | if: steps.cache-cuda.outputs.cache-hit != 'true' 111 | uses: actions/cache/save@v4 112 | with: 113 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 114 | key: ${{ steps.cache-cuda.outputs.cache-primary-key }} 115 | 116 | - name: Configure 117 | run: cmake -S . 
-B build -G Ninja -LA 118 | -D CMAKE_BUILD_TYPE=Release 119 | -D CMAKE_PREFIX_PATH=onnx\install 120 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 121 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 122 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime 123 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib 124 | -D ENABLE_CUDA=1 125 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 126 | -D ENABLE_DML=1 127 | -D CMAKE_CXX_STANDARD=20 128 | 129 | - name: Build 130 | run: cmake --build build --verbose 131 | 132 | - name: Install 133 | run: | 134 | cmake --install build --prefix install 135 | mkdir artifact 136 | mkdir artifact\vsort 137 | copy install\bin\vsort.dll artifact\ 138 | copy onnxruntime-gpu\bin\*.dll artifact\vsort\ 139 | copy onnxruntime-gpu\lib\*.dll artifact\vsort\ 140 | 141 | - name: Download DirectML Library 142 | # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44 143 | run: | 144 | curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.15.4 145 | unzip -q directml.nupkg -d dml 146 | copy dml\bin\x64-win\DirectML.dll artifact\vsort\ 147 | 148 | - name: Upload 149 | uses: actions/upload-artifact@v4 150 | with: 151 | name: VSORT-Windows-x64 152 | path: vsort/artifact 153 | 154 | - name: Describe 155 | run: git describe --tags --long 156 | 157 | - name: Dump dependencies 158 | run: dumpbin /dependents artifact\vsort.dll 159 | 160 | - name: Setup Python portable 161 | run: | 162 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.10/python-3.9.10-embed-amd64.zip 163 | 7z x python.zip -ovs_portable 164 | 165 | - name: Install VapourSynth portable 166 | run: | 167 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 168 | 7z x vs.7z -ovs_portable -y 169 | 170 | - name: Copy plugin 171 | run: | 172 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 173 | mkdir vs_portable\vapoursynth64\plugins\vsort\ 174 | copy artifact\vsort\*.dll vs_portable\vapoursynth64\plugins\vsort\ 175 | 176 | - name: Install waifu2x model 177 | run: | 178 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 179 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 180 | 181 | - name: Download x265 182 | run: | 183 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 184 | 7z x x265.7z -ovs_portable\ 185 | 186 | - name: Create script 187 | shell: bash 188 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, verbosity=4).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 189 | 190 | - name: Run vspipe 191 | shell: bash 192 | run: | 193 | set -ex 194 | vs_portable/vspipe -i test.vpy - 195 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 196 | ls -l out.hevc x265.log 197 | cat x265.log 198 | grep -F 'encoded 10 frames' x265.log || exit 2 199 | grep -i 'error' x265.log && exit 1 200 | exit 0 201 | 202 | - name: Create script (fp16) 203 | shell: bash 204 | run: echo "import 
vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 205 | 206 | - name: Run vspipe (fp16) 207 | shell: bash 208 | run: | 209 | set -ex 210 | vs_portable/vspipe -i test_fp16.vpy - 211 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 212 | ls -l out.hevc x265.log 213 | cat x265.log 214 | grep -F 'encoded 10 frames' x265.log || exit 2 215 | grep -i 'error' x265.log && exit 1 216 | exit 0 217 | 218 | - name: Create script (fp16 input) 219 | shell: bash 220 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy 221 | 222 | - name: Run vspipe (fp16 input) 223 | shell: bash 224 | run: | 225 | set -ex 226 | vs_portable/vspipe -i test_fp16_input.vpy - 227 | vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 228 | ls -l out.hevc x265.log 229 | cat x265.log 230 | grep -F 'encoded 10 frames' x265.log || exit 2 231 | grep -i 'error' x265.log && exit 1 232 | exit 0 233 | 234 | - name: Create script (fp16 output) 235 | shell: bash 236 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy 237 | 238 | - name: Run vspipe (fp16 output) 239 | shell: bash 240 | run: | 241 | set -ex 242 | vs_portable/vspipe -i test_fp16_output.vpy - 243 | vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 244 | ls -l out.hevc x265.log 245 | cat x265.log 246 | grep -F 'encoded 10 frames' x265.log || exit 2 247 | grep -i 'error' x265.log && exit 1 248 | exit 0 249 | 250 | - name: Create script (flexible output) 251 | shell: bash 252 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 253 | 254 | - name: Run vspipe (flexible output) 255 | shell: bash 256 | run: | 257 | set -ex 258 | vs_portable/vspipe -i test_flexible_output.vpy - 259 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 260 | ls -l out.hevc x265.log 261 
| cat x265.log
262 | grep -F 'encoded 10 frames' x265.log || exit 2
263 | grep -i 'error' x265.log && exit 1
264 | exit 0
265 |
266 | - name: Compress artifact for release
267 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
268 | run: |
269 | cd artifact
270 | 7z a -t7z -mx=7 ../../VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z .
271 |
272 | - name: Release
273 | uses: softprops/action-gh-release@v2
274 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
275 | with:
276 | tag_name: ${{ inputs.tag }}
277 | files: VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z
278 | fail_on_unmatched_files: true
279 | generate_release_notes: false
280 | prerelease: true
281 |
--------------------------------------------------------------------------------
/.github/workflows/windows-release.yml:
--------------------------------------------------------------------------------
1 | name: Make a Release
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | tag:
7 | description: 'which tag to create and release?'
8 | required: true
9 | default: 'nightly'
10 | model-tags:
11 | description: 'which tag(s) of model release to use? (comma-separated list of tags)'
12 | required: true
13 | default: 'model-20211209, model-20220923'
14 | ext-model-tags:
15 | description: 'which tag(s) of external model release to use?'
16 | required: true
17 | default: 'external-models'
18 | contrib-model-tags:
19 | description: 'which tag(s) of contributed model release to use?'
20 | required: true
21 | default: 'contrib-models'
22 | ov_tag:
23 | description: 'which tag of openvino to use'
24 | required: true
25 | default: 'latest'
26 | type: string
27 |
28 | jobs:
29 | build-vsov:
30 | uses: ./.github/workflows/windows-ov.yml
31 | with:
32 | tag: ${{ github.event.inputs.tag }}
33 | ov_tag: ${{ github.event.inputs.ov_tag }}
34 |
35 | build-vsort:
36 | uses: ./.github/workflows/windows-ort.yml
37 | with:
38 | tag: ${{ github.event.inputs.tag }}
39 |
40 | build-vstrt:
41 | uses: ./.github/workflows/windows-trt.yml
42 | with:
43 | tag: ${{ github.event.inputs.tag }}
44 | secrets:
45 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
46 |
47 | build-vsmigx:
48 | uses: ./.github/workflows/windows-migx.yml
49 | with:
50 | tag: ${{ github.event.inputs.tag }}
51 | secrets:
52 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
53 |
54 | build-vsncnn:
55 | uses: ./.github/workflows/windows-ncnn.yml
56 | with:
57 | tag: ${{ github.event.inputs.tag }}
58 |
59 | build-vstrt_rtx:
60 | uses: ./.github/workflows/windows-trt_rtx.yml
61 | with:
62 | tag: ${{ github.event.inputs.tag }}
63 | secrets:
64 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
65 |
66 | build-cuda-dependency:
67 | uses: ./.github/workflows/windows-cuda-dependency.yml
68 | with:
69 | tag: ${{ github.event.inputs.tag }}
70 | secrets:
71 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
72 |
73 | build-hip-dependency:
74 | uses: ./.github/workflows/windows-hip-dependency.yml
75 | with:
76 | tag: ${{ github.event.inputs.tag }}
77 | secrets:
78 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
79 |
80 | build-scripts:
81 | runs-on: ubuntu-24.04-arm
82 | steps:
83 | - name: Checkout repo
84 | uses: actions/checkout@v4
85 |
86 | - name: Compress scripts.7z
87 | run: |
88 | cd scripts
89 | 7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .
90 | 91 | - name: Upload scripts release 92 | uses: actions/upload-artifact@v4 93 | with: 94 | name: Scripts 95 | path: scripts 96 | retention-days: 1 97 | 98 | - name: Release scripts 99 | uses: softprops/action-gh-release@v2 100 | with: 101 | tag_name: ${{ github.event.inputs.tag }} 102 | files: scripts.${{ github.event.inputs.tag }}.7z 103 | fail_on_unmatched_files: true 104 | generate_release_notes: false 105 | prerelease: true 106 | 107 | build-models: 108 | runs-on: ubuntu-24.04-arm 109 | steps: 110 | - name: Download Models 111 | run: | 112 | set -ex 113 | mkdir -p release/models 114 | cd release 115 | pushd models 116 | for tag in $(echo "${{ github.event.inputs.model-tags }}" | tr ',' ' '); do 117 | echo "Handling tag $tag" 118 | curl -s -H 'Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 119 | cat release.json 120 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 121 | echo "Downloading $url" 122 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' -H 'Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' "$url" 123 | # later release should overwrite earlier ones 124 | 7za x -y dl.7z 125 | done 126 | test -f "dl.7z" 127 | rm -f dl.7z release.json 128 | done 129 | popd 130 | ls -lR 131 | du -sh 132 | 7za a -t7z -bb3 -mx=9 ../models.7z . 133 | 134 | - name: Upload model release 135 | uses: actions/upload-artifact@v4 136 | with: 137 | name: Models 138 | path: release 139 | retention-days: 1 140 | compression-level: 0 141 | 142 | - name: Download External Models 143 | if: false 144 | run: | 145 | rm -rf release 146 | set -ex 147 | mkdir -p release/models 148 | cd release 149 | pushd models 150 | for tag in $(echo "${{ github.event.inputs.ext-model-tags }}" | tr ',' ' '); do 151 | echo "Handling tag $tag" 152 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 153 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 154 | echo "Downloading $url" 155 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 156 | # later release should overwrite earlier ones 157 | 7za x -y dl.7z 158 | done 159 | test -f "dl.7z" 160 | rm -f dl.7z release.json 161 | done 162 | popd 163 | ls -lR 164 | du -sh 165 | 7za a -t7z -bb3 -mx=9 ../ext-models.7z . 166 | 167 | - name: Upload external model release 168 | uses: actions/upload-artifact@v4 169 | if: false 170 | with: 171 | name: External-Models 172 | path: release 173 | retention-days: 1 174 | compression-level: 0 175 | 176 | - name: Download Contributed Models 177 | run: | 178 | rm -rf release 179 | set -ex 180 | mkdir -p release/models 181 | cd release 182 | pushd models 183 | for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do 184 | echo "Handling tag $tag" 185 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 186 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 187 | echo "Downloading $url" 188 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 189 | # later release should overwrite earlier ones 190 | 7za x -y dl.7z 191 | done 192 | #test -f "dl.7z" # contrib-models might be empty. 193 | rm -f dl.7z release.json 194 | done 195 | popd 196 | ls -lR 197 | du -sh 198 | 7za a -t7z -bb3 -mx=9 ../contrib-models.7z . 
199 |
200 | - name: Upload contrib model release
201 | uses: actions/upload-artifact@v4
202 | with:
203 | name: Contrib-Models
204 | path: release
205 | retention-days: 1
206 | compression-level: 0
207 |
232 | - name: Rename release asset
233 | run: |
234 | mv models.7z models.${{ github.event.inputs.tag }}.7z
235 | mv contrib-models.7z contrib-models.${{ github.event.inputs.tag }}.7z
236 |
237 | - name: Release models
238 | uses: softprops/action-gh-release@v2
239 | with:
240 | tag_name: ${{ github.event.inputs.tag }}
241 | files: |
242 | models.${{ github.event.inputs.tag }}.7z
243 | contrib-models.${{ github.event.inputs.tag }}.7z
244 | fail_on_unmatched_files: true
245 | generate_release_notes: false
246 | prerelease: true
247 |
248 | release:
249 | runs-on: ubuntu-24.04-arm
250 | needs: [build-vsov, build-vsort, build-vstrt, build-vsmigx, build-vsncnn, build-vstrt_rtx, build-cuda-dependency, build-hip-dependency, build-scripts, build-models]
251 |
252 | defaults:
253 | run:
254 | shell: bash
255 |
256 | steps:
257 | - name: Download artifact for scripts
258 | uses: actions/download-artifact@v4
259 | with:
260 | name: Scripts
261 | path: scripts-release
262 |
263 | - name: Download artifact for models
264 | uses: actions/download-artifact@v4
265 | with:
266 | name: Models
267 | path: models-release
268 |
269 | - name: Download artifact for vsov
270 | uses: actions/download-artifact@v4
271 | with:
272 | name: VSOV-Windows-x64
273 | path: vsov-release
274 |
275 | - name: Download artifact for vsort
276 | uses: actions/download-artifact@v4
277 | with:
278 | name: VSORT-Windows-x64
279 | path: vsort-release
280 |
281 | - name: Download artifact for vstrt
282 | uses: actions/download-artifact@v4
283 | with:
284 | name: VSTRT-Windows-x64
285 | path: vstrt-release
286 |
287 | - name: Download artifact for vsmigx
288 | uses: actions/download-artifact@v4
289 | with:
290 | name: VSMIGX-Windows-x64
291 | path: vsmigx-release
292 |
293 | - name: Download artifact for vsncnn
294 | uses: actions/download-artifact@v4
295 | with:
296 | name: VSNCNN-GPU-Windows-x64
297 | path: vsncnn-release
298 |
299 | - name: Download artifact for vstrt_rtx
300 | uses: actions/download-artifact@v4
301 | with:
302 | name: VSTRT-RTX-Windows-x64
303 | path: vstrt-rtx-release
304 |
305 | - name: Download artifact for cuda dependencies
306 | uses: actions/download-artifact@v4
307 | with:
308 | name: vsmlrt-cuda
309 | path: cuda-release
310 |
311 | - name: Download artifact for hip dependencies
312 | uses: actions/download-artifact@v4
313 | with:
314 | name: vsmlrt-hip
315 | path: hip-release
316 |
317 | - name: Build CPU-only release
318 | shell: bash
319 | run: |
320 | mkdir release-cpu
321 | cp -r
322 |           cp -r vsov-release/* release-cpu/
323 |           cp -r vsort-release/* release-cpu/
324 |           rm -f release-cpu/vsort/onnxruntime_providers_*.dll
325 |           cp scripts-release/*.py release-cpu/
326 |           cd release-cpu
327 |           ls -lR
328 |           7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .
329 | 
330 |       - name: Upload CPU-only release
331 |         uses: actions/upload-artifact@v4
332 |         if: false
333 |         with:
334 |           name: vsmlrt-cpu-release
335 |           path: vsmlrt-windows-x64-cpu.7z
336 |           retention-days: 1
337 |           compression-level: 0
338 | 
339 |       - name: Rename release asset
340 |         run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z
341 | 
342 |       - name: Release CPU
343 |         uses: softprops/action-gh-release@v2
344 |         with:
345 |           tag_name: ${{ github.event.inputs.tag }}
346 |           files: vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z
347 |           fail_on_unmatched_files: true
348 |           generate_release_notes: false
349 |           prerelease: true
350 | 
351 |       - name: Build generic GPU release
352 |         shell: bash
353 |         run: |
354 |           mkdir release-generic-gpu
355 |           cp -r models-release/models release-generic-gpu/
356 |           cp -r vsov-release/* release-generic-gpu/
357 |           cp -r vsort-release/* release-generic-gpu/
358 |           rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll
359 |           cp -r vsncnn-release/* release-generic-gpu/
360 |           cp scripts-release/*.py release-generic-gpu/
361 |           cd release-generic-gpu
362 |           ls -lR
363 |           7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .
364 | 
365 |       - name: Upload generic GPU release
366 |         uses: actions/upload-artifact@v4
367 |         if: false
368 |         with:
369 |           name: vsmlrt-generic-gpu-release
370 |           path: vsmlrt-windows-x64-generic-gpu.7z
371 |           retention-days: 1
372 |           compression-level: 0
373 | 
374 |       - name: Rename release asset for generic GPU release
375 |         run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
376 | 
377 |       - name: Release generic GPU
378 |         uses: softprops/action-gh-release@v2
379 |         with:
380 |           tag_name: ${{ github.event.inputs.tag }}
381 |           files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
382 |           fail_on_unmatched_files: true
383 |           generate_release_notes: false
384 |           prerelease: true
385 | 
386 |       - name: Extract CUDA libraries
387 |         run: |
388 |           cd cuda-release
389 |           7za x -bb3 vsmlrt-cuda.7z.001
390 |           rm vsmlrt-cuda.7z.*
391 | 
392 |       - name: Build CUDA release
393 |         shell: bash
394 |         run: |
395 |           mkdir release-cuda
396 |           cp -r models-release/models release-cuda/
397 |           cp -r vsov-release/* release-cuda/
398 |           cp -r vsort-release/* release-cuda/
399 |           cp -r vstrt-release/* release-cuda/
400 |           cp -r vsncnn-release/* release-cuda/
401 |           cp -r vstrt-rtx-release/* release-cuda/
402 |           cp -r cuda-release/* release-cuda/
403 |           cp scripts-release/*.py release-cuda/
404 |           cd release-cuda
405 |           ls -lR
406 |           7za a -t7z -bb3 -mx=9 -v2147483647b ../vsmlrt-windows-x64-cuda.7z .
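          # (-v2147483647b above splits the archive into volumes just under 2 GiB,
          # since GitHub rejects release assets of 2 GiB or larger)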
407 | 408 | - name: Upload CUDA release 409 | uses: actions/upload-artifact@v4 410 | if: false 411 | with: 412 | name: vsmlrt-cuda-release 413 | path: vsmlrt-windows-x64-cuda.7z* 414 | retention-days: 1 415 | compression-level: 0 416 | 417 | - name: Rename release asset for CUDA release 418 | run: | 419 | mv vsmlrt-windows-x64-cuda.7z.001 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.001 420 | mv vsmlrt-windows-x64-cuda.7z.002 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.002 421 | 422 | - name: Release CUDA 423 | uses: softprops/action-gh-release@v2 424 | with: 425 | tag_name: ${{ github.event.inputs.tag }} 426 | files: vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z* 427 | fail_on_unmatched_files: true 428 | generate_release_notes: false 429 | prerelease: true 430 | 431 | - name: Build TensorRT release 432 | shell: bash 433 | run: | 434 | cd release-cuda 435 | cd vsmlrt-cuda 436 | rm --verbose cublas*.dll cudnn*.dll cufft*.dll cupti*.dll nvblas*.dll 437 | cd .. 438 | rm --verbose vsort/onnxruntime_providers_*.dll 439 | 7za a -t7z -bb3 -mx=9 -v2147483647b ../vsmlrt-windows-x64-tensorrt.7z . 440 | 441 | - name: Upload TensorRT release 442 | uses: actions/upload-artifact@v4 443 | if: false 444 | with: 445 | name: vsmlrt-tensorrt-release 446 | path: vsmlrt-windows-x64-tensorrt.7z* 447 | retention-days: 1 448 | compression-level: 0 449 | 450 | - name: Rename release asset for TensorRT release 451 | run: | 452 | mv vsmlrt-windows-x64-tensorrt.7z.001 vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z.001 453 | mv vsmlrt-windows-x64-tensorrt.7z.002 vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z.002 454 | 455 | - name: Release TensorRT 456 | uses: softprops/action-gh-release@v2 457 | with: 458 | tag_name: ${{ github.event.inputs.tag }} 459 | files: vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z* 460 | fail_on_unmatched_files: true 461 | generate_release_notes: false 462 | prerelease: true 463 | 464 | - name: Extract HIP libraries 465 | run: | 466 | cd hip-release 467 | 7za x -bb3 vsmlrt-hip.7z 468 | rm vsmlrt-hip.7z 469 | 470 | - name: Build MIGraphX release 471 | shell: bash 472 | run: | 473 | mkdir release-hip 474 | cp -r models-release/models release-hip/ 475 | cp -r vsov-release/* release-hip/ 476 | cp -r vsort-release/* release-hip/ 477 | cp -r vsmigx-release/* release-hip/ 478 | cp -r vsncnn-release/* release-hip/ 479 | cp -r hip-release/* release-hip/ 480 | cp scripts-release/*.py release-hip/ 481 | cd release-hip 482 | ls -lR 483 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-migraphx.7z . 484 | 485 | - name: Upload MIGraphX release 486 | uses: actions/upload-artifact@v4 487 | if: false 488 | with: 489 | name: vsmlrt-migraphx-release 490 | path: vsmlrt-windows-x64-migraphx.7z 491 | retention-days: 1 492 | compression-level: 0 493 | 494 | - name: Rename release asset for MIGraphX release 495 | run: mv vsmlrt-windows-x64-migraphx.7z vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 496 | 497 | - name: Release MIGraphX 498 | uses: softprops/action-gh-release@v2 499 | with: 500 | tag_name: ${{ github.event.inputs.tag }} 501 | files: vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 502 | fail_on_unmatched_files: true 503 | generate_release_notes: false 504 | prerelease: true 505 | 506 | # Update nightly tag. 
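    # Force-moving the tag keeps the 'nightly' prerelease pointing at the assets
    # uploaded by the steps above on every run.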
507 |       - name: Checkout repo
508 |         if: github.event.inputs.tag == 'nightly'
509 |         uses: actions/checkout@v4
510 |         with:
511 |           fetch-depth: 0
512 |       - name: Overwrite tag
513 |         if: github.event.inputs.tag == 'nightly'
514 |         run: |
515 |           git pull --tags --force
516 |           git tag -f ${{ github.event.inputs.tag }}
517 |           git push -f origin ${{ github.event.inputs.tag }}
518 | 
--------------------------------------------------------------------------------
/vstrt/trt_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef VSTRT_TRT_UTILS_H_
2 | #define VSTRT_TRT_UTILS_H_
3 | 
4 | #include <cstdint>
5 | #include <iostream>
6 | #include <memory>
7 | #include <optional>
8 | #include <string>
9 | #include <variant>
10 | 
11 | #include <cuda_runtime.h>
12 | #include <NvInferRuntime.h>
13 | 
14 | #include "cuda_helper.h"
15 | #include "cuda_utils.h"
16 | 
17 | using ErrorMessage = std::string;
18 | 
19 | struct RequestedTileSize {
20 |     int tile_w;
21 |     int tile_h;
22 | };
23 | 
24 | struct VideoSize {
25 |     int width;
26 |     int height;
27 | };
28 | 
29 | using TileSize = std::variant<RequestedTileSize, VideoSize>;
30 | 
31 | struct InferenceInstance {
32 |     MemoryResource src;
33 |     MemoryResource dst;
34 |     StreamResource stream;
35 |     std::unique_ptr<nvinfer1::IExecutionContext> exec_context;
36 |     GraphExecResource graphexec;
37 | 
38 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
39 |     Resource<void *, cudaFree> d_context_allocation;
40 | #endif
41 | };
42 | 
43 | class Logger : public nvinfer1::ILogger {
44 |     void log(Severity severity, const char* message) noexcept override {
45 |         if (severity <= verbosity) {
46 |             std::cerr << message << '\n';
47 |         }
48 |     }
49 | 
50 | public:
51 |     Logger() = default;
52 | 
53 |     void set_verbosity(Severity value) noexcept {
54 |         this->verbosity = value;
55 |     }
56 | 
57 | private:
58 |     Severity verbosity;
59 | };
60 | 
61 | static inline
62 | std::optional<int> selectProfile(
63 |     const std::unique_ptr<nvinfer1::ICudaEngine> & engine,
64 |     const TileSize & tile_size,
65 |     int batch_size = 1
66 | ) noexcept {
67 | 
68 |     int tile_w, tile_h;
69 |     if (std::holds_alternative<RequestedTileSize>(tile_size)) {
70 |         tile_w = std::get<RequestedTileSize>(tile_size).tile_w;
71 |         tile_h = std::get<RequestedTileSize>(tile_size).tile_h;
72 |     } else {
73 |         tile_w = std::get<VideoSize>(tile_size).width;
74 |         tile_h = std::get<VideoSize>(tile_size).height;
75 |     }
76 | 
77 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
78 |     auto input_name = engine->getIOTensorName(0);
79 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
80 | 
81 |     // finds the optimal profile
82 |     for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) {
83 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
84 |         nvinfer1::Dims opt_dims = engine->getProfileShape(
85 |             input_name, i, nvinfer1::OptProfileSelector::kOPT
86 |         );
87 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
88 |         nvinfer1::Dims opt_dims = engine->getProfileDimensions(
89 |             0, i, nvinfer1::OptProfileSelector::kOPT
90 |         );
91 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
92 | 
93 |         if (opt_dims.d[0] != batch_size) {
94 |             continue;
95 |         }
96 |         if (opt_dims.d[2] == tile_h && opt_dims.d[3] == tile_w) {
97 |             return i;
98 |         }
99 |     }
100 | 
101 |     // finds the first eligible profile
102 |     for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) {
103 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
104 |         nvinfer1::Dims min_dims = engine->getProfileShape(
105 |             input_name, i, nvinfer1::OptProfileSelector::kMIN
106 |         );
107 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
108 |         nvinfer1::Dims min_dims = engine->getProfileDimensions(
109 |             0, i, nvinfer1::OptProfileSelector::kMIN
110 |         );
111 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
112 | 
113 |         if (min_dims.d[0] > batch_size) {
114 |             continue;
115 |         }
116 |         if (min_dims.d[2] > tile_h || min_dims.d[3] > tile_w) {
117 |             continue;
118 |         }
119 | 
120 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
121 |         nvinfer1::Dims max_dims = engine->getProfileShape(
122 |             input_name, i, nvinfer1::OptProfileSelector::kMAX
123 |         );
124 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
125 |         nvinfer1::Dims max_dims = engine->getProfileDimensions(
126 |             0, i, nvinfer1::OptProfileSelector::kMAX
127 |         );
128 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
129 | 
130 |         if (max_dims.d[0] < batch_size) {
131 |             continue;
132 |         }
133 |         if (max_dims.d[2] < tile_h || max_dims.d[3] < tile_w) {
134 |             continue;
135 |         }
136 | 
137 |         return i;
138 |     }
139 | 
140 |     // returns not-found
141 |     return {};
142 | }
143 | 
144 | static inline
145 | std::optional<ErrorMessage> enqueue(
146 |     const MemoryResource & src,
147 |     const MemoryResource & dst,
148 |     const std::unique_ptr<nvinfer1::IExecutionContext> & exec_context,
149 |     cudaStream_t stream
150 | ) noexcept {
151 | 
152 |     const auto set_error = [](const ErrorMessage & message) {
153 |         return message;
154 |     };
155 | 
156 |     checkError(cudaMemcpyAsync(
157 |         src.d_data, src.h_data, src.size,
158 |         cudaMemcpyHostToDevice, stream
159 |     ));
160 | 
161 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
162 |     auto input_name = exec_context->getEngine().getIOTensorName(0);
163 |     auto output_name = exec_context->getEngine().getIOTensorName(1);
164 | 
165 |     if (!exec_context->setTensorAddress(input_name, src.d_data.data)) {
166 |         return set_error("set input tensor address failed");
167 |     }
168 |     if (!exec_context->setTensorAddress(output_name, dst.d_data.data)) {
169 |         return set_error("set output tensor address failed");
170 |     }
171 |     if (!exec_context->enqueueV3(stream)) {
172 |         return set_error("enqueue error");
173 |     }
174 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
175 |     void * bindings[] {
176 |         static_cast<void *>(src.d_data.data),
177 |         static_cast<void *>(dst.d_data.data)
178 |     };
179 | 
180 |     if (!exec_context->enqueueV2(bindings, stream, nullptr)) {
181 |         return set_error("enqueue error");
182 |     }
183 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
184 | 
185 |     checkError(cudaMemcpyAsync(
186 |         dst.h_data, dst.d_data, dst.size,
187 |         cudaMemcpyDeviceToHost, stream
188 |     ));
189 | 
190 |     return {};
191 | }
192 | 
193 | static inline
194 | std::variant<ErrorMessage, GraphExecResource> getGraphExec(
195 |     const MemoryResource & src, const MemoryResource & dst,
196 |     const std::unique_ptr<nvinfer1::IExecutionContext> & exec_context,
197 |     cudaStream_t stream
198 | ) noexcept {
199 | 
200 |     const auto set_error = [](const ErrorMessage & message) {
201 |         return message;
202 |     };
203 | 
204 |     // flush deferred internal state update
205 |     // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/developer-guide/index.html#cuda-graphs
206 |     {
207 |         auto result = enqueue(src, dst, exec_context, stream);
208 |         if (result.has_value()) {
209 |             return set_error(result.value());
210 |         }
211 |         checkError(cudaStreamSynchronize(stream));
212 |     }
213 | 
214 |     checkError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
215 |     {
216 |         auto result = enqueue(src, dst, exec_context, stream);
217 |         if (result.has_value()) {
218 |             return set_error(result.value());
219 |         }
220 |     }
221 |     cudaGraph_t graph;
222 |     checkError(cudaStreamEndCapture(stream, &graph));
223 |     cudaGraphExec_t graphexec;
224 |     checkError(cudaGraphInstantiate(&graphexec, graph, nullptr, nullptr, 0));
225 |     checkError(cudaGraphDestroy(graph));
226 | 
227 |     return graphexec;
228 | }
229 | 
230 | static inline
231 | size_t getSize(
232 |     const nvinfer1::Dims & dim
233 | ) noexcept {
234 | 
235 |     size_t ret = 1;
236 |     for (int i = 0; i < dim.nbDims; ++i) {
237 |         ret *= dim.d[i];
238 |     }
239 |     return ret;
240 | }
241 | 
242 | static inline
243 | int getBytesPerSample(nvinfer1::DataType type) noexcept {
244 |     switch (type) {
245 |         case nvinfer1::DataType::kFLOAT:
246 |             return 4;
247 |         case nvinfer1::DataType::kHALF:
248 |             return 2;
249 |         case nvinfer1::DataType::kINT8:
250 |             return 1;
251 |         case nvinfer1::DataType::kINT32:
252 |             return 4;
253 |         case nvinfer1::DataType::kBOOL:
254 |             return 1;
255 |         case nvinfer1::DataType::kUINT8:
256 |             return 1;
257 | #if (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
258 |         case nvinfer1::DataType::kFP8:
259 |             return 1;
260 | #endif // (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
261 | #if NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
262 |         case nvinfer1::DataType::kBF16:
263 |             return 2;
264 |         case nvinfer1::DataType::kINT64:
265 |             return 8;
266 | #endif // NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
267 |         default:
268 |             return 0;
269 |     }
270 | }
271 | 
272 | static inline
273 | std::variant<ErrorMessage, InferenceInstance> getInstance(
274 |     const std::unique_ptr<nvinfer1::ICudaEngine> & engine,
275 |     const std::optional<int> & profile_index,
276 |     const TileSize & tile_size,
277 |     bool use_cuda_graph,
278 | #if NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
279 |     bool & is_dynamic
280 | #else // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
281 |     bool is_dynamic
282 | #endif // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
283 | ) noexcept {
284 | 
285 |     const auto set_error = [](const ErrorMessage & error_message) {
286 |         return error_message;
287 |     };
288 | 
289 |     StreamResource stream {};
290 |     checkError(cudaStreamCreateWithFlags(&stream.data, cudaStreamNonBlocking));
291 | 
292 |     auto exec_context = std::unique_ptr<nvinfer1::IExecutionContext>(
293 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
294 |         engine->createExecutionContext(
295 |             is_dynamic ?
296 |             nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED :
297 |             nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE
298 |         )
299 | #else
300 |         engine->createExecutionContext()
301 | #endif
302 |     );
303 | 
304 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
305 |     auto input_name = engine->getIOTensorName(0);
306 |     auto output_name = engine->getIOTensorName(1);
307 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
308 | 
309 |     if (!exec_context->allInputDimensionsSpecified()) {
310 |         if (!profile_index.has_value()) {
311 |             return set_error("no valid optimization profile found");
312 |         }
313 | #if NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
314 |         is_dynamic = true;
315 | #endif // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
316 |         exec_context->setOptimizationProfileAsync(profile_index.value(), stream);
317 |         checkError(cudaStreamSynchronize(stream));
318 | 
319 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
320 |         nvinfer1::Dims dims = exec_context->getTensorShape(input_name);
321 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
322 |         nvinfer1::Dims dims = exec_context->getBindingDimensions(0);
323 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
324 | 
325 |         dims.d[0] = 1;
326 | 
327 |         if (std::holds_alternative<RequestedTileSize>(tile_size)) {
328 |             dims.d[2] = std::get<RequestedTileSize>(tile_size).tile_h;
329 |             dims.d[3] = std::get<RequestedTileSize>(tile_size).tile_w;
330 |         } else {
331 |             dims.d[2] = std::get<VideoSize>(tile_size).height;
332 |             dims.d[3] = std::get<VideoSize>(tile_size).width;
333 |         }
334 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
335 |         exec_context->setInputShape(input_name, dims);
336 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
337 |         exec_context->setBindingDimensions(0, dims);
338 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
339 |     } else if (std::holds_alternative<RequestedTileSize>(tile_size)) {
340 | #if NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
341 |         is_dynamic = false;
342 | #endif // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
343 | 
344 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
345 |         nvinfer1::Dims dims = exec_context->getTensorShape(input_name);
346 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
347 |         nvinfer1::Dims dims = exec_context->getBindingDimensions(0);
348 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
349 | 
350 |         if (std::holds_alternative<RequestedTileSize>(tile_size)) {
351 |             if (dims.d[2] != std::get<RequestedTileSize>(tile_size).tile_h ||
352 |                 dims.d[3] != std::get<RequestedTileSize>(tile_size).tile_w
353 |             ) {
354 |                 return set_error("requested tile size not applicable");
355 |             }
356 |         } else {
357 |             if (dims.d[2] != std::get<VideoSize>(tile_size).height ||
358 |                 dims.d[3] != std::get<VideoSize>(tile_size).width
359 |             ) {
360 |                 return set_error("not supported video dimensions");
361 |             }
362 |         }
363 |     }
364 | 
365 |     MemoryResource src {};
366 |     {
367 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
368 |         auto dim = exec_context->getTensorShape(input_name);
369 |         auto type = engine->getTensorDataType(input_name);
370 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
371 |         auto dim = exec_context->getBindingDimensions(0);
372 |         auto type = engine->getBindingDataType(0);
373 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
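        // (the pre-8.5 branch above addresses I/O by binding index: 0 is the
        // network input and 1 the output, as validated by checkEngine below)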
374 | 
375 |         auto size = getSize(dim) * getBytesPerSample(type);
376 | 
377 |         Resource<void *, cudaFree> d_data {};
378 |         checkError(cudaMalloc(&d_data.data, size));
379 | 
380 |         Resource<void *, cudaFreeHost> h_data {};
381 |         checkError(cudaMallocHost(&h_data.data, size, cudaHostAllocWriteCombined));
382 | 
383 |         src = MemoryResource{
384 |             .h_data = std::move(h_data),
385 |             .d_data = std::move(d_data),
386 |             .size = size
387 |         };
388 |     }
389 | 
390 |     MemoryResource dst {};
391 |     {
392 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
393 |         auto dim = exec_context->getTensorShape(output_name);
394 |         auto type = engine->getTensorDataType(output_name);
395 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
396 |         auto dim = exec_context->getBindingDimensions(1);
397 |         auto type = engine->getBindingDataType(1);
398 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
399 | 
400 |         auto size = getSize(dim) * getBytesPerSample(type);
401 | 
402 |         Resource<void *, cudaFree> d_data {};
403 |         checkError(cudaMalloc(&d_data.data, size));
404 | 
405 |         Resource<void *, cudaFreeHost> h_data {};
406 |         checkError(cudaMallocHost(&h_data.data, size));
407 | 
408 |         dst = MemoryResource{
409 |             .h_data = std::move(h_data),
410 |             .d_data = std::move(d_data),
411 |             .size = size
412 |         };
413 |     }
414 | 
415 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
416 |     Resource<void *, cudaFree> d_context_allocation {};
417 | 
418 |     if (is_dynamic) {
419 |         size_t buffer_size { exec_context->updateDeviceMemorySizeForShapes() };
420 |         if (buffer_size == 0) {
421 |             return set_error("failed to get internal activation buffer size");
422 |         }
423 | 
424 |         checkError(cudaMalloc(&d_context_allocation.data, buffer_size));
425 | 
426 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001
427 |         exec_context->setDeviceMemoryV2(d_context_allocation.data, static_cast<int64_t>(buffer_size));
428 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001
429 |         exec_context->setDeviceMemory(d_context_allocation.data);
430 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001
431 |     }
432 | #endif // NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
433 | 
434 |     GraphExecResource graphexec {};
435 |     if (use_cuda_graph) {
436 |         auto result = getGraphExec(
437 |             src, dst,
438 |             exec_context, stream
439 |         );
440 |         if (std::holds_alternative<GraphExecResource>(result)) {
441 |             graphexec = std::move(std::get<GraphExecResource>(result));
442 |         } else {
443 |             return set_error(std::get<ErrorMessage>(result));
444 |         }
445 |     }
446 | 
447 |     return InferenceInstance{
448 |         .src = std::move(src),
449 |         .dst = std::move(dst),
450 |         .stream = std::move(stream),
451 |         .exec_context = std::move(exec_context),
452 |         .graphexec = std::move(graphexec),
453 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
454 |         .d_context_allocation = std::move(d_context_allocation)
455 | #endif
456 |     };
457 | }
458 | 
459 | static inline
460 | std::optional<ErrorMessage> checkEngine(
461 |     const std::unique_ptr<nvinfer1::ICudaEngine> & engine,
462 |     bool flexible_output
463 | ) noexcept {
464 | 
465 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
466 |     int num_bindings = engine->getNbIOTensors();
467 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
468 |     int num_bindings = engine->getNbBindings();
469 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
470 | 
471 |     if (num_bindings != 2) {
472 |         return "network binding count must be 2, got " + std::to_string(num_bindings);
473 |     }
474 | 
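    // The checks below are written twice: TensorRT 8.5+ exposes I/O through the
    // name-based tensor API, while older releases use the binding-index API.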
475 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 476 | auto input_name = engine->getIOTensorName(0); 477 | auto output_name = engine->getIOTensorName(1); 478 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 479 | 480 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 481 | if (engine->getTensorIOMode(input_name) != nvinfer1::TensorIOMode::kINPUT) { 482 | return "the first binding should be an input binding"; 483 | } 484 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 485 | if (!engine->bindingIsInput(0)) { 486 | return "the first binding should be an input binding"; 487 | } 488 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 489 | 490 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 491 | const nvinfer1::Dims & input_dims = engine->getTensorShape(input_name); 492 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 493 | const nvinfer1::Dims & input_dims = engine->getBindingDimensions(0); 494 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 495 | 496 | if (input_dims.nbDims != 4) { 497 | return "expects network with 4-D input"; 498 | } 499 | if (input_dims.d[0] != 1) { 500 | return "batch size of network input must be 1"; 501 | } 502 | 503 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 504 | if (engine->getTensorIOMode(output_name) != nvinfer1::TensorIOMode::kOUTPUT) { 505 | return "the second binding should be an output binding"; 506 | } 507 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 508 | if (engine->bindingIsInput(1)) { 509 | return "the second binding should be an output binding"; 510 | } 511 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 512 | 513 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 514 | const nvinfer1::Dims & output_dims = engine->getTensorShape(output_name); 515 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 516 | const nvinfer1::Dims & output_dims = engine->getBindingDimensions(1); 517 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 518 | 519 | if (output_dims.nbDims != 4) { 520 | return "expects network with 4-D output"; 521 | } 522 | if (output_dims.d[0] != 1) { 523 | return "batch size of network output must be 1"; 524 | } 525 | 526 | auto out_channels = output_dims.d[1]; 527 | if (out_channels != 1 && out_channels != 3 && !flexible_output) { 528 | return "output dimensions must be 1 or 3, or enable \"flexible_output\""; 529 | } 530 | 531 | auto in_height = input_dims.d[2]; 532 | auto in_width = input_dims.d[3]; 533 | auto out_height = output_dims.d[2]; 534 | auto out_width = output_dims.d[3]; 535 | if (out_height % in_height != 0 || out_width % in_width != 0) { 536 | return "output dimensions must be divisible by input dimensions"; 537 | } 538 | 539 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 540 | for (const auto & name : { input_name, output_name }) { 541 | if (engine->getTensorLocation(name) != nvinfer1::TensorLocation::kDEVICE) { 542 | return "network binding " + std::string{ name } + " should reside on device"; 543 | } 544 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || 
defined(TRT_MAJOR_RTX)
545 |     for (int i = 0; i < 2; i++) {
546 |         if (engine->getLocation(i) != nvinfer1::TensorLocation::kDEVICE) {
547 |             return "network binding " + std::to_string(i) + " should reside on device";
548 |         }
549 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
550 | 
551 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
552 |         if (engine->getTensorFormat(name) != nvinfer1::TensorFormat::kLINEAR) {
553 |             return "expects network IO with layout NCHW (row major linear)";
554 |         }
555 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
556 |         if (engine->getBindingFormat(i) != nvinfer1::TensorFormat::kLINEAR) {
557 |             return "expects network IO with layout NCHW (row major linear)";
558 |         }
559 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
560 |     }
561 | 
562 |     return {};
563 | }
564 | 
565 | static inline
566 | std::variant<ErrorMessage, std::unique_ptr<nvinfer1::ICudaEngine>> initEngine(
567 |     const char * engine_data, size_t engine_nbytes,
568 |     const std::unique_ptr<nvinfer1::IRuntime> & runtime,
569 |     bool flexible_output
570 | ) noexcept {
571 | 
572 |     const auto set_error = [](const ErrorMessage & error_message) {
573 |         return error_message;
574 |     };
575 | 
576 |     std::unique_ptr<nvinfer1::ICudaEngine> engine {
577 |         runtime->deserializeCudaEngine(engine_data, engine_nbytes)
578 |     };
579 | 
580 |     if (!engine) {
581 |         return set_error("engine deserialization failed");
582 |     }
583 | 
584 |     if (auto err = checkEngine(engine, flexible_output); err.has_value()) {
585 |         return set_error(err.value());
586 |     }
587 | 
588 |     return engine;
589 | }
590 | 
591 | // 0: integer, 1: float
592 | static inline
593 | int getSampleType(nvinfer1::DataType type) noexcept {
594 |     switch (type) {
595 |         case nvinfer1::DataType::kFLOAT:
596 |         case nvinfer1::DataType::kHALF:
597 | #if (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
598 |         case nvinfer1::DataType::kFP8:
599 | #endif // (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
600 | #if NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
601 |         case nvinfer1::DataType::kBF16:
602 | #endif // NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
603 |             return 1;
604 |         case nvinfer1::DataType::kINT8:
605 |         case nvinfer1::DataType::kINT32:
606 |         case nvinfer1::DataType::kBOOL:
607 |         case nvinfer1::DataType::kUINT8:
608 | #if NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
609 |         case nvinfer1::DataType::kINT64:
610 | #endif // NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
611 |             return 0;
612 |         default:
613 |             return -1;
614 |     }
615 | }
616 | 
617 | #endif // VSTRT_TRT_UTILS_H_
618 | 
--------------------------------------------------------------------------------
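For orientation, a minimal sketch of how the helpers in trt_utils.h compose; this caller is illustrative only, and the `engine_data`/`engine_nbytes` buffer plus the chosen tile size are assumptions rather than code from the repository:

    // Hypothetical caller of trt_utils.h (not part of vs-mlrt). Assumes
    // `engine_data` points to `engine_nbytes` bytes of a serialized engine.
    Logger logger;
    logger.set_verbosity(nvinfer1::ILogger::Severity::kWARNING);

    std::unique_ptr<nvinfer1::IRuntime> runtime {
        nvinfer1::createInferRuntime(logger)
    };

    auto engine_result = initEngine(engine_data, engine_nbytes, runtime, false);
    if (std::holds_alternative<ErrorMessage>(engine_result)) {
        // report std::get<ErrorMessage>(engine_result) and abort
    }
    auto & engine = std::get<std::unique_ptr<nvinfer1::ICudaEngine>>(engine_result);

    // Tile at 512x512 and let selectProfile pick a matching optimization profile.
    TileSize tile_size = RequestedTileSize { .tile_w = 512, .tile_h = 512 };
    auto profile = selectProfile(engine, tile_size);

    bool is_dynamic = false;
    auto instance = getInstance(engine, profile, tile_size, /* use_cuda_graph */ false, is_dynamic);
    // On success, std::get<InferenceInstance>(instance) holds the host/device
    // buffers, CUDA stream and execution context consumed by enqueue().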