├── vsov ├── config.h.in ├── CMakeLists.txt ├── win32.cpp └── README.md ├── vsmigx ├── config.h.in ├── CMakeLists.txt ├── win32.cpp └── README.md ├── vsncnn ├── config.h.in ├── onnx2ncnn.hpp └── CMakeLists.txt ├── vsort ├── config.h.in ├── CMakeLists.txt ├── README.md └── win32.cpp ├── vstrt ├── config.h.in ├── longpath.manifest ├── cuda_helper.h ├── trtexec │ ├── CMakeLists.txt │ ├── trtexec.patch │ └── logfile.cpp ├── cuda_utils.h ├── win32.cpp ├── CMakeLists.txt ├── inference_helper.h ├── README.md ├── utils.h └── trt_utils.h ├── common ├── onnx_utils.h ├── convert_float_to_float16.h └── onnx_utils.cpp ├── .github └── workflows │ ├── linux-trt.yml │ ├── linux-trt-arm64.yml │ ├── linux-trt-rtx.yml │ ├── linux-migx.yml │ ├── windows-hip-dependency.yml │ ├── windows-migx.yml │ ├── windows-trt_rtx.yml │ ├── linux-ncnn.yml │ ├── windows-cuda-dependency.yml │ ├── macos-ort.yml │ ├── linux-ov.yml │ ├── linux-ov-arm64.yml │ ├── windows-trt.yml │ ├── linux-ort.yml │ ├── windows-ncnn.yml │ ├── windows-ov.yml │ ├── windows-ort.yml │ └── windows-release.yml └── README.md /vsov/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsmigx/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsncnn/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsort/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vstrt/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsncnn/onnx2ncnn.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ONNX2NCNN_HPP 2 | #define ONNX2NCNN_HPP 3 | 4 | #include <optional> 5 | #include <string> 6 | #include <utility> 7 | 8 | #include <onnx/onnx_pb.h> 9 | 10 | extern std::optional<std::pair<std::string, std::string>> onnx2ncnn(ONNX_NAMESPACE::ModelProto & model); 11 | 12 | #endif // ONNX2NCNN_HPP 13 | -------------------------------------------------------------------------------- /common/onnx_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef ONNX_UTILS_H 2 | #define ONNX_UTILS_H 3 | 4 | #include <cstdint> 5 | #include <string> 6 | #include <string_view> 7 | #include <variant> 8 | 9 | #include <onnx/onnx_pb.h> 10 | 11 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 12 | const std::string_view & path, 13 | int64_t tile_w, 14 | int64_t tile_h, 15 | bool path_is_serialization 16 | ) noexcept; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /vstrt/longpath.manifest: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8" standalone="yes"?> 2 | <assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0"> 3 | <application xmlns="urn:schemas-microsoft-com:asm.v3"> 4 | <windowsSettings xmlns:ws2="http://schemas.microsoft.com/SMI/2016/WindowsSettings"> 5 | <ws2:longPathAware>true</ws2:longPathAware> 6 | </windowsSettings> 7 | </application> 8 | </assembly> 9 | -------------------------------------------------------------------------------- /common/convert_float_to_float16.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERT_FLOAT_TO_FLOAT16_H 2 | #define CONVERT_FLOAT_TO_FLOAT16_H 3 | 4 | #include <string> 5 | #include <unordered_set> 6 | 7 | #include <onnx/onnx_pb.h> 8 | 9 | void convert_float_to_float16( 10 | ONNX_NAMESPACE::ModelProto & model, 11 | bool force_fp16_initializers, 12 | // bool keep_io_types = True, 13 | // bool disable_shape_infer = True, 14 | // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST, 15 | // const std::optional<std::unordered_set<std::string>> op_block_list = {}, 16 | const std::unordered_set<std::string> & op_block_list, 17 | bool cast_input = true, 18 | bool cast_output = true 19 | ) noexcept; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /vstrt/cuda_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_HELPER_H_ 2 | #define VSTRT_CUDA_HELPER_H_ 3 | 4 | #include <string> 5 | 6 | #include <cuda_runtime_api.h> 7 | 8 | #define checkError(expr) do { \ 9 | using namespace std::string_literals; \ 10 | cudaError_t __err = expr; \ 11 | if (__err != cudaSuccess) { \ 12 | const char * message = cudaGetErrorString(__err); \ 13 | return set_error("'"s + # expr + "' failed: " + message); \ 14 | } \ 15 | } while(0) 16 | 17 | #endif // VSTRT_CUDA_HELPER_H_ 18 | -------------------------------------------------------------------------------- /vstrt/trtexec/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(trtexec LANGUAGES CXX) 4 | 5 | find_package(CUDAToolkit REQUIRED) 6 | 7 | add_executable(trtexec 8 | $<$<PLATFORM_ID:Windows>: longpath.manifest> 9 | trtexec.cpp 10 | logfile.cpp 11 | ../common/bfloat16.cpp 12 | ../common/debugTensorWriter.cpp 13 | ../common/logger.cpp 14 | ../common/sampleDevice.cpp 15 | ../common/sampleEngines.cpp 16 | ../common/sampleInference.cpp 17 | ../common/sampleOptions.cpp 18 | ../common/sampleReporting.cpp 19 | ../common/sampleUtils.cpp 20 | ../../shared/utils/fileLock.cpp 21 | ../../shared/utils/cacheUtils.cpp 22 | ) 23 | 24 | target_include_directories(trtexec PRIVATE 25 | ../common 26 | ..
27 | ../../include 28 | ../../shared 29 | ) 30 | 31 | target_link_libraries(trtexec PRIVATE CUDA::cudart_static) 32 | 33 | install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) 34 | -------------------------------------------------------------------------------- /vstrt/trtexec/trtexec.patch: -------------------------------------------------------------------------------- 1 | diff --git a/shared/utils/fileLock.cpp b/shared/utils/fileLock.cpp 2 | index e155c0b..de6bce2 100644 3 | --- a/shared/utils/fileLock.cpp 4 | +++ b/shared/utils/fileLock.cpp 5 | @@ -35,8 +35,11 @@ FileLock::FileLock(ILogger& logger, std::string const& fileName) 6 | ss << "Trying to set exclusive file lock " << lockFileName << std::endl; 7 | mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str()); 8 | } 9 | + int size = MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, nullptr, 0); 10 | + std::wstring lockFileNameW (size, L'\0'); 11 | + MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, &lockFileNameW[0], size); 12 | // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided 13 | - mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL); 14 | + mHandle = CreateFileW(lockFileNameW.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE | FILE_ATTRIBUTE_TEMPORARY, NULL); 15 | if (mHandle == INVALID_HANDLE_VALUE) 16 | { 17 | throw std::runtime_error("Failed to lock " + lockFileName + "!"); 18 | -------------------------------------------------------------------------------- /vstrt/trtexec/logfile.cpp: -------------------------------------------------------------------------------- 1 | // When $TRTEXEC_LOG_FILE is set, redirect stdout and stderr to the specified 2 | // file as well. 3 | #include <cstdio> 4 | #include <cstdlib> 5 | #include <fstream> 6 | #include <iostream> 7 | 8 | namespace { 9 | static struct redirect { 10 | class teebuf: public std::streambuf { 11 | public: 12 | teebuf(std::streambuf *a, std::streambuf *b): s1(a), s2(b) {} 13 | private: 14 | std::streambuf *s1, *s2; 15 | 16 | virtual int overflow(int c) override { 17 | if (c == EOF) 18 | return EOF; 19 | else { 20 | int r1 = s1->sputc(c); 21 | int r2 = s2->sputc(c); 22 | return (r1 == EOF || r2 == EOF) ? EOF : c; 23 | } 24 | } 25 | 26 | virtual int sync() override { 27 | int r1 = s1->pubsync(); 28 | int r2 = s2->pubsync(); 29 | return (r1 == 0 && r2 == 0) ?
0 : -1; 30 | } 31 | }; 32 | redirect() { 33 | const char *fn = getenv("TRTEXEC_LOG_FILE"); 34 | if (fn) { 35 | static std::ofstream ofs(fn, std::ios::app); 36 | static teebuf out(ofs.rdbuf(), std::cout.rdbuf()); 37 | static teebuf err(ofs.rdbuf(), std::cerr.rdbuf()); 38 | std::cout.rdbuf(&out); 39 | std::cerr.rdbuf(&err); 40 | } 41 | } 42 | } _; 43 | } // namespace 44 | -------------------------------------------------------------------------------- /vsmigx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-migraphx VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | 9 | find_package(migraphx REQUIRED CONFIG) 10 | find_package(hip REQUIRED CONFIG) 11 | 12 | add_library(vsmigx SHARED vs_migraphx.cpp win32.cpp) 13 | 14 | target_include_directories(vsmigx PRIVATE ${VAPOURSYNTH_INCLUDE_DIRECTORY}) 15 | 16 | target_link_libraries(vsmigx PRIVATE migraphx::c hip::host) 17 | 18 | set_target_properties(vsmigx PROPERTIES 19 | CXX_EXTENSIONS OFF 20 | POSITION_INDEPENDENT_CODE ON 21 | CXX_STANDARD 20 22 | CXX_STANDARD_REQUIRED ON 23 | ) 24 | 25 | if (WIN32) 26 | target_link_options(vsmigx PRIVATE 27 | "/DELAYLOAD:migraphx_c.dll" 28 | "/DELAYLOAD:amdhip64_6.dll" 29 | "delayimp.lib" 30 | ) 31 | endif() 32 | 33 | find_package(Git REQUIRED) 34 | execute_process( 35 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 36 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 37 | OUTPUT_VARIABLE VCS_TAG 38 | ) 39 | string(STRIP ${VCS_TAG} VCS_TAG) 40 | configure_file(config.h.in config.h) 41 | target_include_directories(vsmigx PUBLIC "${PROJECT_BINARY_DIR}") 42 | 43 | install(TARGETS vsmigx 44 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 45 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 46 | ) 47 | -------------------------------------------------------------------------------- /vsncnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ncnn VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | 7 | find_package(protobuf REQUIRED CONFIG) 8 | find_package(ONNX REQUIRED CONFIG) 9 | find_package(ncnn REQUIRED CONFIG) 10 | 11 | add_library(vsncnn SHARED vs_ncnn.cpp onnx2ncnn.cpp ../common/onnx_utils.cpp) 12 | 13 | target_include_directories(vsncnn PRIVATE 14 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 15 | ${ONNX_INCLUDE_DIRS} 16 | ) 17 | 18 | target_link_libraries(vsncnn PRIVATE ncnn) 19 | 20 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 21 | if (ONNX_VERSION VERSION_LESS 1.16.0) 22 | target_link_libraries(vsncnn PRIVATE onnx) 23 | else() 24 | target_link_libraries(vsncnn PRIVATE ONNX::onnx) 25 | endif() 26 | 27 | set_target_properties(vsncnn PROPERTIES 28 | POSITION_INDEPENDENT_CODE ON 29 | CXX_EXTENSIONS OFF 30 | CXX_STANDARD 17 31 | CXX_STANDARD_REQUIRED ON 32 | ) 33 | 34 | if (CMAKE_CXX_STANDARD GREATER 17) 35 | set_target_properties(vsncnn PROPERTIES CXX_STANDARD ${CMAKE_CXX_STANDARD}) 36 | endif() 37 | 38 | target_include_directories(vsncnn PUBLIC 39 | "${PROJECT_BINARY_DIR}" 40 | ) 41 | 42 | find_package(Git REQUIRED) 43 | execute_process( 44 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 45 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 46 | OUTPUT_VARIABLE VCS_TAG 47 | ) 48 | string(STRIP ${VCS_TAG} 
VCS_TAG) 49 | configure_file(config.h.in config.h) 50 | 51 | install(TARGETS vsncnn 52 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 53 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 54 | ) 55 | -------------------------------------------------------------------------------- /vstrt/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_UTILS_H_ 2 | #define VSTRT_CUDA_UTILS_H_ 3 | 4 | #include <concepts> 5 | #include <cstdint> 6 | #include <type_traits> 7 | #include <utility> 8 | 9 | #include <cuda_runtime_api.h> 10 | 11 | template <typename T, auto deleter> 12 | requires 13 | std::default_initializable<T> && 14 | std::movable<T> && 15 | std::is_trivially_copy_assignable_v<T> && 16 | std::convertible_to<T, bool> && 17 | std::invocable<decltype(deleter), T> 18 | struct Resource { 19 | T data; 20 | 21 | [[nodiscard]] 22 | constexpr Resource() noexcept = default; 23 | 24 | [[nodiscard]] 25 | constexpr Resource(T && x) noexcept : data(x) {} 26 | 27 | [[nodiscard]] 28 | constexpr Resource(Resource&& other) noexcept 29 | : data(std::exchange(other.data, T{})) 30 | { } 31 | 32 | constexpr Resource& operator=(Resource&& other) noexcept { 33 | if (this == &other) return *this; 34 | deleter_(std::move(data)); 35 | data = std::exchange(other.data, T{}); 36 | return *this; 37 | } 38 | 39 | constexpr Resource& operator=(const Resource & other) = delete; 40 | 41 | Resource(const Resource& other) = delete; 42 | 43 | constexpr operator T() const noexcept { 44 | return data; 45 | } 46 | 47 | constexpr auto deleter_(T && x) noexcept { 48 | if (x) { 49 | deleter(x); 50 | } 51 | } 52 | 53 | constexpr Resource& operator=(T && x) noexcept { 54 | deleter_(std::move(data)); 55 | data = x; 56 | return *this; 57 | } 58 | 59 | constexpr ~Resource() noexcept { 60 | deleter_(std::move(data)); 61 | } 62 | }; 63 | 64 | struct MemoryResource { 65 | Resource<uint8_t *, cudaFreeHost> h_data; 66 | Resource<uint8_t *, cudaFree> d_data; 67 | size_t size; 68 | }; 69 | 70 | using StreamResource = Resource<cudaStream_t, cudaStreamDestroy>; 71 | using GraphExecResource = Resource<cudaGraphExec_t, cudaGraphExecDestroy>; 72 | 73 | #endif // VSTRT_CUDA_UTILS_H_ 74 | -------------------------------------------------------------------------------- /vsov/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ov VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualization") 7 | set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library") 8 | 9 | find_package(OpenVINO REQUIRED CONFIG) 10 | 11 | add_library(vsov SHARED 12 | vs_openvino.cpp 13 | win32.cpp 14 | ../common/onnx_utils.cpp 15 | ../common/convert_float_to_float16.cpp 16 | ) 17 | 18 | if(ENABLE_VISUALIZATION) 19 | target_compile_definitions(vsov PRIVATE ENABLE_VISUALIZATION) 20 | endif() 21 | 22 | if(WIN32_SHARED_OPENVINO) 23 | target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO) 24 | endif() 25 | 26 | find_package(protobuf REQUIRED CONFIG) 27 | find_package(ONNX REQUIRED CONFIG) 28 | 29 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 30 | if (ONNX_VERSION VERSION_LESS 1.16.0) 31 | target_link_libraries(vsov PRIVATE onnx) 32 | else() 33 | target_link_libraries(vsov PRIVATE ONNX::onnx) 34 | endif() 35 | 36 | target_include_directories(vsov PRIVATE 37 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 38 | ${ONNX_INCLUDE_DIRS} 39 | ) 40 | 41 | target_link_libraries(vsov PRIVATE openvino::runtime) 42 | 43 | set_target_properties(vsov PROPERTIES 44 | CXX_EXTENSIONS OFF 45 |
CXX_STANDARD 17 46 | CXX_STANDARD_REQUIRED ON 47 | ) 48 | 49 | if (WIN32) 50 | if(WIN32_SHARED_OPENVINO) 51 | target_link_options(vsov PRIVATE "/DELAYLOAD:openvino.dll" "delayimp.lib") 52 | else() 53 | target_link_options(vsov PRIVATE "/DELAYLOAD:tbb.dll" "delayimp.lib") 54 | endif() 55 | endif() 56 | 57 | target_include_directories(vsov PUBLIC 58 | "${PROJECT_BINARY_DIR}" 59 | ) 60 | 61 | find_package(Git REQUIRED) 62 | execute_process( 63 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 64 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 65 | OUTPUT_VARIABLE VCS_TAG 66 | ) 67 | string(STRIP ${VCS_TAG} VCS_TAG) 68 | configure_file(config.h.in config.h) 69 | 70 | install(TARGETS vsov 71 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 72 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 73 | ) 74 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.13.2.6-1+cuda13.0 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-13-0 cuda-cudart-dev-13-0 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-x64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.8.0.43-1+cuda12.8 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-cudart-dev-12-8 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-ARM64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt-rtx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT-RTX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt-rtx.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | sudo apt-get install -y --no-install-recommends cuda-nvcc-13-0 cuda-cudart-dev-13-0 39 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 40 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 41 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 42 | 43 | - name: Download TensorRT-RTX 44 | run: | 45 | curl -L -o trt.tar.gz https://developer.nvidia.com/downloads/trt/rtx_sdk/secure/1.2/TensorRT-RTX-1.2.0.44-Linux-x86_64-cuda-13.0-Release-external.tar.gz 46 | tar -xzf trt.tar.gz --verbose 47 | mv TensorRT-RTX-*/ tensorrt/ 48 | 49 | - name: Configure 50 | run: cmake -S . 
-B build -G Ninja -LA 51 | -D CMAKE_BUILD_TYPE=Release 52 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 53 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 54 | -D TENSORRT_HOME="$(pwd)/tensorrt" 55 | 56 | - name: Build 57 | run: cmake --build build --verbose 58 | 59 | - name: Install 60 | run: cmake --install build --prefix install 61 | 62 | - name: Prepare for upload 63 | run: | 64 | mkdir artifact 65 | cp -v install/lib/*.so artifact 66 | 67 | - name: Describe 68 | run: git describe --tags --long 69 | 70 | - name: Upload 71 | uses: actions/upload-artifact@v4 72 | with: 73 | name: VSTRT-RTX-Linux-x64 74 | path: vstrt/artifact 75 | overwrite: true 76 | 77 | -------------------------------------------------------------------------------- /.github/workflows/linux-migx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/linux-migx.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vsmigx 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup HIP and MIGraphX 34 | run: | 35 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 36 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3.1 noble main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 37 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 38 | sudo apt update 39 | sudo apt install hip-runtime-amd rocm-device-libs migraphx-dev hipcc 40 | ls -R /opt/rocm 41 | 42 | - name: Configure 43 | run: cmake -S .
-B build -G Ninja -Wno-dev -LA 44 | -D CMAKE_BUILD_TYPE=Release 45 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 46 | -D CMAKE_CXX_COMPILER=g++-13 47 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 48 | -D migraphx_DIR=/opt/rocm/lib/cmake/migraphx 49 | -D MIOpen_DIR=/opt/rocm/lib/cmake/miopen 50 | -D hip_DIR=/opt/rocm/lib/cmake/hip 51 | -D AMDDeviceLibs_DIR=/opt/rocm/lib/cmake/AMDDeviceLibs 52 | -D amd_comgr_DIR=/opt/rocm/lib/cmake/amd_comgr 53 | -D hsa-runtime64_DIR=/opt/rocm/lib/cmake/hsa-runtime64 54 | -D rocblas_DIR=/opt/rocm/lib/cmake/rocblas 55 | -D hipblaslt_DIR=/opt/rocm/lib/cmake/hipblaslt 56 | -D CMAKE_PREFIX_PATH=/opt/rocm/lib/cmake 57 | 58 | - name: Build 59 | run: cmake --build build --verbose 60 | 61 | - name: Install 62 | run: cmake --install build --prefix install 63 | 64 | - name: Prepare for upload 65 | run: | 66 | mkdir artifact 67 | cp -v install/lib/*.so artifact 68 | 69 | - name: Describe 70 | run: git describe --tags --long 71 | 72 | - name: Upload 73 | uses: actions/upload-artifact@v4 74 | with: 75 | name: VSMIGX-Linux-x64 76 | path: vsmigx/artifact 77 | overwrite: true 78 | 79 | -------------------------------------------------------------------------------- /vsort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ort VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers") 7 | set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries") 8 | 9 | set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend") 10 | set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend") 11 | set(ENABLE_COREML OFF CACHE BOOL "Enable CoreML support") 12 | 13 | find_package(protobuf REQUIRED CONFIG) 14 | find_package(ONNX REQUIRED CONFIG) 15 | 16 | add_library(vsort SHARED 17 | vs_onnxruntime.cpp 18 | win32.cpp 19 | ../common/onnx_utils.cpp 20 | ../common/convert_float_to_float16.cpp 21 | ) 22 | 23 | target_include_directories(vsort PRIVATE 24 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 25 | ${ONNX_INCLUDE_DIRS} 26 | ${ONNX_RUNTIME_API_DIRECTORY} 27 | ) 28 | 29 | target_link_directories(vsort PRIVATE 30 | ${ONNX_RUNTIME_LIB_DIRECTORY} 31 | ) 32 | 33 | set_target_properties(vsort PROPERTIES 34 | POSITION_INDEPENDENT_CODE ON 35 | CXX_EXTENSIONS OFF 36 | CXX_STANDARD 17 37 | CXX_STANDARD_REQUIRED ON) 38 | 39 | if (CMAKE_CXX_STANDARD GREATER_EQUAL 20) 40 | set_target_properties(vsort PROPERTIES CXX_STANDARD 20) 41 | endif() 42 | 43 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 44 | if (ONNX_VERSION VERSION_LESS 1.16.0) 45 | target_link_libraries(vsort PRIVATE onnx) 46 | else() 47 | target_link_libraries(vsort PRIVATE ONNX::onnx) 48 | endif() 49 | 50 | target_link_libraries(vsort PRIVATE onnxruntime) 51 | 52 | if (ENABLE_CUDA) 53 | find_package(CUDAToolkit REQUIRED) 54 | 55 | add_compile_definitions(ENABLE_CUDA) 56 | target_include_directories(vsort PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 57 | target_link_libraries(vsort PRIVATE CUDA::cudart_static) 58 | 59 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 60 | target_link_options(vsort PRIVATE "/DELAYLOAD:onnxruntime.dll" "delayimp.lib") 61 | endif() 62 | endif() 63 | 64 | if (ENABLE_DML) 65 | add_compile_definitions(ENABLE_DML) 66 | endif() 67 | 68 | if(ENABLE_COREML) 69 | add_compile_definitions(ENABLE_COREML=1) 70 | endif() 71 | 72 | 
target_include_directories(vsort PUBLIC 73 | "${PROJECT_BINARY_DIR}" 74 | ) 75 | 76 | find_package(Git REQUIRED) 77 | execute_process( 78 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 79 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 80 | OUTPUT_VARIABLE VCS_TAG 81 | ) 82 | string(STRIP ${VCS_TAG} VCS_TAG) 83 | configure_file(config.h.in config.h) 84 | 85 | install(TARGETS vsort 86 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 87 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 88 | ) 89 | -------------------------------------------------------------------------------- /vsmigx/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsmlrt-hip" 10 | 11 | #include <delayimp.h> 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be sorted by dependency. 16 | L"amdhip64_6.dll", 17 | L"migraphx.dll", 18 | L"migraphx_tf.dll", 19 | L"migraphx_onnx.dll", 20 | L"migraphx_c.dll", // must be the last 21 | }; 22 | 23 | namespace fs = std::filesystem; 24 | static fs::path dllDir() { 25 | static const std::wstring res = []() -> std::wstring { 26 | HMODULE mod = 0; 27 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 28 | std::vector<wchar_t> buf; 29 | size_t n = 0; 30 | do { 31 | buf.resize(buf.size() + MAX_PATH); 32 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 33 | } while (n >= buf.size()); 34 | buf.resize(n); 35 | std::wstring path(buf.begin(), buf.end()); 36 | return path; 37 | } 38 | throw std::runtime_error("unable to locate myself"); 39 | }(); 40 | return fs::path(res).parent_path(); 41 | } 42 | 43 | FARPROC loadDLLs() { 44 | fs::path dir = dllDir() / DLL_DIR; 45 | HMODULE h = nullptr; 46 | for (const auto dll: dlls) { 47 | fs::path p = dir / dll; 48 | std::wstring s = p; 49 | h = LoadLibraryW(s.c_str()); 50 | if (getenv("VSMIGX_VERBOSE")) 51 | std::wcerr << L"vsmigx: preloading " << p << L": " << h << std::endl; 52 | if (!h) 53 | std::wcerr << L"vsmigx: failed to preload " << s << std::endl; 54 | } 55 | return (FARPROC)h; 56 | } 57 | 58 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 59 | switch (reason) { 60 | case dliNoteStartProcessing: 61 | case dliNoteEndProcessing: 62 | // Nothing to do here. 63 | break; 64 | case dliNotePreLoadLibrary: 65 | //std::cerr << "loading " << info->szDll << std::endl; 66 | if (std::string(info->szDll).find("migraphx_c.dll") != std::string::npos || 67 | std::string(info->szDll).find("amdhip64_6.dll") != std::string::npos 68 | ) 69 | return loadDLLs(); 70 | break; 71 | case dliNotePreGetProcAddress: 72 | // Nothing to do here. 73 | break; 74 | case dliFailLoadLib: 75 | case dliFailGetProc: 76 | // Returning NULL from error notifications will cause the delay load 77 | // runtime to raise a VcppException structured exception, that some code 78 | // might want to handle. 79 | return NULL; 80 | break; 81 | default: 82 | abort(); // unreachable. 83 | break; 84 | } 85 | // Returning NULL causes the delay load machinery to perform default 86 | // processing for this notification.
87 | return NULL; 88 | } 89 | } // namespace 90 | 91 | extern "C" { 92 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 93 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 94 | }; 95 | #endif 96 | -------------------------------------------------------------------------------- /.github/workflows/windows-hip-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-hip dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2022 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download MIGraphX Precompilation 32 | run: | 33 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 34 | 35 | - name: Extract MIGraphX Precompilation 36 | run: | 37 | unzip migx.zip 38 | 39 | - name: Move MIGraphX Precompilation 40 | run: | 41 | mkdir vsmlrt-hip 42 | mv migraphx/bin/* vsmlrt-hip -v 43 | 44 | - name: Setup VC commands 45 | uses: ilammy/msvc-dev-cmd@v1 46 | with: 47 | arch: amd64 48 | 49 | - name: List Dependencies 50 | shell: bash 51 | run: | 52 | cd vsmlrt-hip 53 | for dll in *.[dD][lL][lL]; do 54 | echo $(dumpbin -dependents "$dll") 55 | done 56 | 57 | - name: Cache HIP 58 | id: cache-hip 59 | uses: actions/cache@v4 60 | with: 61 | path: C:\Program Files\AMD\ROCm 62 | key: ${{ runner.os }}-rocm-6.2.4 63 | 64 | - name: Setup HIP 65 | if: steps.cache-hip.outputs.cache-hit != 'true' 66 | shell: pwsh 67 | run: | 68 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 69 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 70 | 71 | - name: Move HIP Libraries 72 | shell: cmd 73 | run: | 74 | copy "C:\Program Files\AMD\ROCm\6.2\bin\amd_comgr_2.dll" vsmlrt-hip 75 | copy "C:\Program Files\AMD\ROCm\6.2\bin\amdhip64_6.dll" vsmlrt-hip 76 | copy "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc0602.dll" vsmlrt-hip 77 | copy "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc-builtins0602.dll" vsmlrt-hip 78 | 79 | - name: Compress 80 | run: | 81 | 7z a -t7z -mx=3 vsmlrt-hip.7z vsmlrt-hip 82 | 83 | - name: Upload 84 | uses: actions/upload-artifact@v4 85 | with: 86 | name: vsmlrt-hip 87 | path: vsmlrt-hip.7z 88 | retention-days: 1 89 | compression-level: 0 90 | 91 | - name: Rename release asset 92 | run: | 93 | mv vsmlrt-hip.7z vsmlrt-hip.${{ github.event.inputs.tag }}.7z 94 | 95 | - name: Release 96 | uses: softprops/action-gh-release@v2 97 | with: 98 | tag_name: ${{ github.event.inputs.tag }} 99 | files: vsmlrt-hip.${{ github.event.inputs.tag }}.7z 100 | fail_on_unmatched_files: true 101 | generate_release_notes: false 102 | prerelease: true 103 | -------------------------------------------------------------------------------- /vsov/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsov" 10 | 11 | #include <delayimp.h> 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be
sorted by dependency. 16 | #ifdef WIN32_SHARED_OPENVINO 17 | L"tbb12.dll", 18 | L"openvino.dll", // must be the last 19 | #else // WIN32_SHARED_OPENVINO 20 | L"tbb12.dll", // must be the last 21 | #endif // WIN32_SHARED_OPENVINO 22 | }; 23 | 24 | namespace fs = std::filesystem; 25 | static fs::path dllDir() { 26 | static const std::wstring res = []() -> std::wstring { 27 | HMODULE mod = 0; 28 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 29 | std::vector<wchar_t> buf; 30 | size_t n = 0; 31 | do { 32 | buf.resize(buf.size() + MAX_PATH); 33 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 34 | } while (n >= buf.size()); 35 | buf.resize(n); 36 | std::wstring path(buf.begin(), buf.end()); 37 | return path; 38 | } 39 | throw std::runtime_error("unable to locate myself"); 40 | }(); 41 | return fs::path(res).parent_path(); 42 | } 43 | 44 | FARPROC loadDLLs() { 45 | fs::path dir = dllDir() / DLL_DIR; 46 | HMODULE h = nullptr; 47 | for (const auto dll: dlls) { 48 | fs::path p = dir / dll; 49 | std::wstring s = p; 50 | h = LoadLibraryW(s.c_str()); 51 | if (getenv("VSOV_VERBOSE")) 52 | std::wcerr << L"vsov: preloading " << p << L": " << h << std::endl; 53 | if (!h) 54 | std::wcerr << L"vsov: failed to preload " << s << std::endl; 55 | } 56 | return (FARPROC)h; 57 | } 58 | 59 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 60 | switch (reason) { 61 | case dliNoteStartProcessing: 62 | case dliNoteEndProcessing: 63 | // Nothing to do here. 64 | break; 65 | case dliNotePreLoadLibrary: 66 | //std::cerr << "loading " << info->szDll << std::endl; 67 | #ifdef WIN32_SHARED_OPENVINO 68 | if (std::string(info->szDll).find("openvino.dll") != std::string::npos) 69 | return loadDLLs(); 70 | #else // WIN32_SHARED_OPENVINO 71 | if (std::string(info->szDll).find("tbb.dll") != std::string::npos) 72 | return loadDLLs(); 73 | #endif // WIN32_SHARED_OPENVINO 74 | break; 75 | case dliNotePreGetProcAddress: 76 | // Nothing to do here. 77 | break; 78 | case dliFailLoadLib: 79 | case dliFailGetProc: 80 | // Returning NULL from error notifications will cause the delay load 81 | // runtime to raise a VcppException structured exception, that some code 82 | // might want to handle. 83 | return NULL; 84 | break; 85 | default: 86 | abort(); // unreachable. 87 | break; 88 | } 89 | // Returning NULL causes the delay load machinery to perform default 90 | // processing for this notification.
91 | return NULL; 92 | } 93 | } // namespace 94 | 95 | extern "C" { 96 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 97 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 98 | }; 99 | #endif 100 | -------------------------------------------------------------------------------- /.github/workflows/windows-migx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/windows-migx.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2022 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vsmigx 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Cache HIP 42 | id: cache-hip 43 | uses: actions/cache@v4 44 | with: 45 | path: C:\Program Files\AMD\ROCm 46 | key: ${{ runner.os }}-rocm-6.2.4 47 | 48 | - name: Setup HIP 49 | if: steps.cache-hip.outputs.cache-hit != 'true' 50 | shell: pwsh 51 | run: | 52 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 53 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 54 | 55 | - name: Download MIGraphX Precompilation 56 | run: | 57 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 58 | unzip -q migx.zip 59 | 60 | - name: Download VapourSynth headers 61 | run: | 62 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 63 | unzip -q vs.zip 64 | mv vapoursynth-*/ vapoursynth/ 65 | 66 | - name: Configure 67 | run: cmake -S . -B build -G Ninja -Wno-dev -LA 68 | -D CMAKE_BUILD_TYPE=Release 69 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 70 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%/vapoursynth/include" 71 | -D hip_DIR="C:/Program Files/AMD/ROCm/6.2/lib/cmake/hip" 72 | -D HIP_PLATFORM=amd 73 | -D migraphx_DIR="%cd%/migraphx/lib/cmake/migraphx" 74 | 75 | - name: Build 76 | run: cmake --build build --verbose 77 | 78 | - name: Install 79 | run: cmake --install build --prefix install 80 | 81 | - name: Prepare for upload 82 | run: | 83 | mkdir artifact 84 | copy install\bin\vsmigx.dll artifact\ 85 | 86 | - name: Describe 87 | run: git describe --tags --long 88 | 89 | - name: Dump dependencies 90 | run: dumpbin /dependents artifact/vsmigx.dll 91 | 92 | - name: Upload 93 | uses: actions/upload-artifact@v4 94 | with: 95 | name: VSMIGX-Windows-x64 96 | path: vsmigx/artifact 97 | 98 | - name: Compress artifact for release 99 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 100 | run: | 101 | cd artifact 102 | 7z a -t7z -mx=7 ../../VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z . 
103 | 104 | - name: Release 105 | uses: softprops/action-gh-release@v2 106 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 107 | with: 108 | tag_name: ${{ inputs.tag }} 109 | files: VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z 110 | fail_on_unmatched_files: true 111 | generate_release_notes: false 112 | prerelease: true 113 | 114 | -------------------------------------------------------------------------------- /common/onnx_utils.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdint> 2 | #include <fstream> 3 | #include <optional> 4 | #include <string> 5 | #include <string_view> 6 | #include <variant> 7 | 8 | #include <onnx/onnx_pb.h> 9 | #include <onnx/shape_inference/implementation.h> 10 | 11 | #include "onnx_utils.h" 12 | 13 | 14 | using namespace std::string_literals; 15 | 16 | #ifdef _WIN32 17 | #include <codecvt> 18 | #include <locale> 19 | static inline std::wstring translateName(const char *name) noexcept { 20 | std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; 21 | return converter.from_bytes(name); 22 | } 23 | #else 24 | #define translateName(n) (n) 25 | #endif 26 | 27 | 28 | [[nodiscard]] 29 | static std::optional<std::string> specifyShape( 30 | ONNX_NAMESPACE::ModelProto & model, 31 | int64_t tile_w, 32 | int64_t tile_h, 33 | int64_t batch = 1 34 | ) noexcept { 35 | 36 | if (model.graph().input_size() != 1) { 37 | return "graph must have a single input"; 38 | } 39 | ONNX_NAMESPACE::TensorShapeProto * input_shape { 40 | model 41 | .mutable_graph() 42 | ->mutable_input(0) 43 | ->mutable_type() 44 | ->mutable_tensor_type() 45 | ->mutable_shape() 46 | }; 47 | 48 | if (model.graph().output_size() != 1) { 49 | return "graph must have a single output"; 50 | } 51 | ONNX_NAMESPACE::TensorShapeProto * output_shape { 52 | model 53 | .mutable_graph() 54 | ->mutable_output(0) 55 | ->mutable_type() 56 | ->mutable_tensor_type() 57 | ->mutable_shape() 58 | }; 59 | 60 | constexpr auto n_idx = 0; 61 | constexpr auto h_idx = 2; 62 | constexpr auto w_idx = 3; 63 | 64 | if (input_shape->dim_size() != 4) { 65 | return "input dimension must be 4"; 66 | } 67 | 68 | input_shape->mutable_dim(n_idx)->set_dim_value(batch); 69 | input_shape->mutable_dim(h_idx)->set_dim_value(tile_h); 70 | input_shape->mutable_dim(w_idx)->set_dim_value(tile_w); 71 | 72 | if (output_shape->dim_size() != 4) { 73 | return "output dimension must be 4"; 74 | } 75 | 76 | output_shape->mutable_dim(n_idx)->set_dim_value(batch); 77 | output_shape->mutable_dim(h_idx)->clear_dim_value(); 78 | output_shape->mutable_dim(w_idx)->clear_dim_value(); 79 | 80 | // remove shape info 81 | if (model.graph().value_info_size() != 0) { 82 | model.mutable_graph()->mutable_value_info()->Clear(); 83 | } 84 | 85 | try { 86 | ONNX_NAMESPACE::shape_inference::InferShapes(model); 87 | } catch (const ONNX_NAMESPACE::InferenceError & e) { 88 | return e.what(); 89 | } 90 | 91 | return {}; 92 | } 93 | 94 | 95 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 96 | const std::string_view & path, 97 | int64_t tile_w, 98 | int64_t tile_h, 99 | bool path_is_serialization 100 | ) noexcept { 101 | 102 | ONNX_NAMESPACE::ModelProto onnx_proto; 103 | 104 | if (path_is_serialization) { 105 | if (!onnx_proto.ParseFromArray(path.data(), static_cast<int>(path.size()))) { 106 | return "parse onnx serialization failed"s; 107 | } 108 | } else { 109 | std::ifstream onnx_stream( 110 | translateName(path.data()), 111 | std::ios::binary 112 | ); 113 | 114 | if (!onnx_stream.good()) { 115 | return "open "s + std::string{ path } + " failed"s; 116 | } 117 | 118 | if (!onnx_proto.ParseFromIstream(&onnx_stream)) { 119 | return "parse "s + std::string{ path } + " failed"s; 120 | } 121 | } 122
| 123 | if (auto err = specifyShape(onnx_proto, tile_w, tile_h); err.has_value()) { 124 | return err.value(); 125 | } 126 | 127 | return onnx_proto; 128 | } 129 | -------------------------------------------------------------------------------- /.github/workflows/windows-trt_rtx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-TRT-RTX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/windows-trt_rtx.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2025 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vstrt 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v5 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Setup Ninja 42 | run: pip install ninja 43 | 44 | - name: Cache CUDA 45 | id: cache-cuda 46 | uses: actions/cache@v4 47 | with: 48 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 49 | key: ${{ runner.os }}-vstrt-cuda-13.0.2 50 | 51 | - name: Setup CUDA 52 | if: steps.cache-cuda.outputs.cache-hit != 'true' 53 | run: | 54 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe 55 | cuda_installer.exe -s nvcc_13.0 cudart_13.0 cuda_profiler_api_13.0 crt_13.0 nvptxcompiler_13.0 56 | 57 | - name: Download TensorRT-RTX 58 | run: | 59 | curl -L -o trt.zip https://developer.nvidia.com/downloads/trt/rtx_sdk/secure/1.2/TensorRT-RTX-1.2.0.44-win10-amd64-cuda-13.0-Release-external.zip 60 | unzip trt.zip 61 | mv TensorRT-RTX-*/ tensorrt/ 62 | 63 | - name: Download VapourSynth headers 64 | run: | 65 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 66 | unzip -q vs.zip 67 | mv vapoursynth-*/ vapoursynth/ 68 | 69 | - name: Configure 70 | run: cmake -S . -B build -G Ninja -LA 71 | -D CMAKE_BUILD_TYPE=Release 72 | -D CMAKE_CXX_FLAGS="/EHsc /Wall /wd4100 /wd4625 /wd4626 /wd4710 /wd4711 /wd4820 /wd4996 /wd5026 /wd5027" 73 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 74 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 75 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 76 | -D TENSORRT_HOME="%cd%\tensorrt" 77 | -D TENSORRT_LIBRARY_SUFFIX="_1_2" 78 | 79 | - name: Build 80 | run: cmake --build build --config Release --verbose 81 | 82 | - name: Install 83 | run: cmake --install build --prefix install 84 | 85 | - name: Prepare for upload 86 | run: | 87 | mkdir artifact 88 | copy install\bin\vstrt_rtx.dll artifact\ 89 | 90 | - name: Describe 91 | run: git describe --tags --long 92 | 93 | - name: Dump dependencies 94 | run: dumpbin /dependents artifact\vstrt_rtx.dll 95 | 96 | - name: Upload 97 | uses: actions/upload-artifact@v4 98 | with: 99 | name: VSTRT-RTX-Windows-x64 100 | path: vstrt/artifact 101 | 102 | - name: Compress artifact for release 103 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 104 | run: | 105 | cd artifact 106 | 7z a -t7z -mx=9 ../../VSTRT-RTX-Windows-x64.${{ github.event.inputs.tag }}.7z . 
107 | 108 | - name: Release 109 | uses: softprops/action-gh-release@v2 110 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 111 | with: 112 | tag_name: ${{ inputs.tag }} 113 | files: VSTRT-RTX-Windows-x64.${{ github.event.inputs.tag }}.7z 114 | fail_on_unmatched_files: true 115 | generate_release_notes: false 116 | prerelease: true 117 | -------------------------------------------------------------------------------- /vstrt/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsmlrt-cuda" 10 | 11 | #include <delayimp.h> 12 | 13 | #include <NvInferVersion.h> 14 | 15 | #if NV_TENSORRT_VERSION >= 100001 || defined(TRT_MAJOR_RTX) 16 | #define TO_STRING(x) #x 17 | #define CONCAT_VERSION(name, version) (name "_" TO_STRING(version) ".dll") 18 | #define CONCAT_VERSION2(name, major, minor) (name "_" TO_STRING(major) "_" TO_STRING(minor) ".dll") 19 | #endif // NV_TENSORRT_VERSION >= 100001 20 | 21 | namespace { 22 | std::vector<const wchar_t *> dlls = { 23 | // This list must be sorted by dependency. 24 | #if defined(TRT_MAJOR_RTX) 25 | CONCAT_VERSION2(L"tensorrt_rtx", TRT_MAJOR_RTX, TRT_MINOR_RTX), 26 | #elif NV_TENSORRT_VERSION >= 100001 27 | #ifdef USE_NVINFER_PLUGIN 28 | // nvinfer_plugin dependencies 29 | CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), 30 | CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR), 31 | #endif // USE_NVINFER_PLUGIN 32 | // Finally, nvinfer again. 33 | CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last 34 | #else // NV_TENSORRT_VERSION >= 100001 35 | #ifdef USE_NVINFER_PLUGIN 36 | // nvinfer_plugin dependencies 37 | L"nvinfer.dll", 38 | L"nvinfer_plugin.dll", 39 | #endif // USE_NVINFER_PLUGIN 40 | // Finally, nvinfer again.
41 | L"nvinfer.dll", // must be the last 42 | #endif // NV_TENSORRT_VERSION >= 100001 43 | }; 44 | 45 | namespace fs = std::filesystem; 46 | static fs::path dllDir() { 47 | static const std::wstring res = []() -> std::wstring { 48 | HMODULE mod = 0; 49 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 50 | std::vector buf; 51 | size_t n = 0; 52 | do { 53 | buf.resize(buf.size() + MAX_PATH); 54 | n = GetModuleFileNameW(mod, buf.data(), static_cast(buf.size())); 55 | } while (n >= buf.size()); 56 | buf.resize(n); 57 | std::wstring path(buf.begin(), buf.end()); 58 | return path; 59 | } 60 | throw std::runtime_error("unable to locate myself"); 61 | }(); 62 | return fs::path(res).parent_path(); 63 | } 64 | 65 | FARPROC loadDLLs() { 66 | fs::path dir = dllDir() / DLL_DIR; 67 | HMODULE h = nullptr; 68 | for (const auto dll: dlls) { 69 | fs::path p = dir / dll; 70 | std::wstring s = p; 71 | h = LoadLibraryW(s.c_str()); 72 | DWORD err = GetLastError(); 73 | if (getenv("VSTRT_VERBOSE")) 74 | std::wcerr << L"vstrt: preloading " << p << L": " << h << std::endl; 75 | if (!h) 76 | std::wcerr << L"vstrt: failed to preload " << s << L", errno " << err << std::endl; 77 | } 78 | return (FARPROC)h; 79 | } 80 | 81 | #if (NV_TENSORRT_MAJOR == 9 && !defined(TRT_MAJOR_RTX)) && defined(_WIN32) 82 | static void * dummy() { // mimic getPluginRegistry 83 | #else 84 | static int dummy() { // mimic getInferLibVersion 85 | #endif 86 | return 0; 87 | } 88 | 89 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 90 | switch (reason) { 91 | case dliNoteStartProcessing: 92 | case dliNoteEndProcessing: 93 | // Nothing to do here. 94 | break; 95 | case dliNotePreLoadLibrary: 96 | //std::cerr << "loading " << info->szDll << std::endl; 97 | loadDLLs(); 98 | return (FARPROC)LoadLibraryA(info->szDll); 99 | case dliNotePreGetProcAddress: 100 | // Nothing to do here. 101 | break; 102 | case dliFailLoadLib: 103 | case dliFailGetProc: 104 | // Returning NULL from error notifications will cause the delay load 105 | // runtime to raise a VcppException structured exception, that some code 106 | // might want to handle. 107 | //return NULL; 108 | // The SE will crash the process, so instead we return a dummy function. 109 | return (FARPROC)dummy; 110 | break; 111 | default: 112 | abort(); // unreachable. 113 | break; 114 | } 115 | // Returning NULL causes the delay load machinery to perform default 116 | // processing for this notification. 
117 | return NULL; 118 | } 119 | } // namespace 120 | 121 | extern "C" { 122 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 123 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 124 | }; 125 | #endif // _MSC_VER 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-mlrt 2 | 3 | This project provides VapourSynth ML filter runtimes for a variety of platforms: 4 | - x86 CPUs: [vsov-cpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsort-cpu](#vsort-onnx-runtime-based-cpugpu-runtime) 5 | - Intel GPU (both integrated & discrete): [vsov-gpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 6 | - NVidia GPU: [vsort-cuda](#vsort-onnx-runtime-based-cpugpu-runtime), [vstrt & vstrt_rtx](#vstrt-tensorrt-based-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 7 | - AMD GPU: [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime), [vsmigx](#vsmigx-migraphx-based-gpu-runtime) 8 | - Apple SoC: [vsort-coreml](#vsort-onnx-runtime-based-cpugpu-runtime) 9 | 10 | To simplify usage, we also provide a Python wrapper [vsmlrt.py](https://github.com/AmusementClub/vs-mlrt/blob/master/scripts/vsmlrt.py) 11 | for all bundled models and a unified interface to select different backends. 12 | 13 | Please refer to [the wiki](https://github.com/AmusementClub/vs-mlrt/wiki) for supported models & usage information. 14 | 15 | ## vsov: OpenVINO-based Pure CPU & Intel GPU Runtime 16 | 17 | [OpenVINO](https://docs.openvino.ai/latest/index.html) is an AI inference runtime developed 18 | by Intel, mainly targeting x86 CPUs and Intel GPUs. 19 | 20 | The vs-openvino plugin provides an optimized *pure* CPU & Intel GPU runtime for some popular AI filters. 21 | Intel GPU support covers Gen 8+ integrated graphics (Broadwell and newer) and the Arc series GPUs. 22 | 23 | To install, download the latest release and extract them into your VS `plugins` directory. 24 | 25 | Please visit the [vsov](vsov) directory for details. 26 | 27 | ## vsort: ONNX Runtime-based CPU/GPU Runtime 28 | 29 | [ONNX Runtime](https://onnxruntime.ai/) is an AI inference runtime with many backends. 30 | 31 | The vs-onnxruntime plugin provides optimized CPU and CUDA GPU runtimes for some popular AI filters. 32 | 33 | To install, download the latest release and extract them into your VS `plugins` directory. 34 | 35 | Please visit the [vsort](vsort) directory for details. 36 | 37 | ## vstrt: TensorRT-based GPU Runtime 38 | 39 | [TensorRT](https://developer.nvidia.com/tensorrt) is a highly optimized AI inference runtime 40 | for NVidia GPUs. It uses benchmarking to find the optimal kernel for your specific 41 | GPU, so there is an extra step: an engine must be built from the ONNX network on the machine 42 | where you will run the vstrt filter. This extra step makes deploying models a little 43 | harder than with the other runtimes. However, the resulting performance is also typically 44 | *much much better* than the CUDA backend of [vsort](vsort). 45 | 46 | [TensorRT-RTX](https://developer.nvidia.com/tensorrt-rtx) is a specialization of TensorRT 47 | for NVIDIA RTX GPUs, which compiles engines faster, with performance comparable to TensorRT. 48 | 49 | To install, download the latest release and extract them into your VS `plugins` directory. 50 | 51 | Please visit the [vstrt](vstrt) directory for details.
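All of these runtimes are reachable through the vsmlrt.py wrapper mentioned above, which also hides the engine-build step behind its TRT backends. The sketch below illustrates switching backends through that unified interface; the exact model and backend names (`DPIR`, `DPIRModel.drunet_gray`, `Backend.OV_CPU`, `Backend.TRT`) are taken to be illustrative here — consult the wiki for the authoritative lists shipped with your release.

```python3
# A minimal sketch of backend selection via vsmlrt.py; names are illustrative,
# see the wiki for the exact model/backend lists.
from vsmlrt import DPIR, DPIRModel, Backend

# clip is assumed to be a 32-bit float GRAY clip, as the gray DPIR models require.
# Pure CPU inference (vsov):
flt = DPIR(clip, strength=10.0, model=DPIRModel.drunet_gray, backend=Backend.OV_CPU())
# The same filter on an NVidia GPU (vstrt); engine building is handled by the wrapper:
flt = DPIR(clip, strength=10.0, model=DPIRModel.drunet_gray, backend=Backend.TRT(fp16=True))
```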
52 | 53 | ## vsmigx: MIGraphX-based GPU Runtime 54 | 55 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX) is a highly optimized AI inference runtime 56 | for AMD GPUs. It also uses benchmarking to find the optimal kernel, similar to vstrt. 57 | 58 | To install, download the latest release and extract them into your VS `plugins` directory. 59 | 60 | Please visit the [vsmigx](vsmigx) directory for details. 61 | 62 | ## vsncnn: NCNN-based GPU (Vulkan) Runtime 63 | 64 | [ncnn](https://github.com/Tencent/ncnn) is a popular AI inference runtime. [vsncnn](vsncnn) 65 | provides a Vulkan-based runtime for some AI filters. It includes support for on-the-fly 66 | ONNX to ncnn native format conversion so as to provide a unified interface across all 67 | runtimes provided by this project. As it uses the device-independent 68 | [Vulkan](https://en.wikipedia.org/wiki/Vulkan) interface for GPU accelerated inference, 69 | this plugin supports all GPUs that provide a Vulkan interface (NVidia, AMD, Intel integrated & 70 | discrete GPUs all provide this interface.) Another benefit is that it has a significantly 71 | smaller footprint than other GPU runtimes (both vsort and vstrt CUDA backends require >1GB 72 | CUDA libraries.) The main drawback is that it's slower. 73 | 74 | To install, download the latest release and extract them into your VS `plugins` directory. 75 | 76 | Please visit the [vsncnn](vsncnn) directory for details. 77 | -------------------------------------------------------------------------------- /vsmigx/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth MIGraphX 2 | 3 | The vs-migraphx plugin provides an optimized HIP runtime for some popular AI filters on AMD GPUs. 4 | 5 | ## Usage 6 | 7 | Prototype: `core.migx.Model(clip[] clips, string program_path[, int[] overlap, int[] tilesize, int device_id=0, int num_streams=1, string flexible_output_prop=""])` 8 | 9 | Arguments: 10 | - `clip[] clips`: the input clips, only 16/32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 11 | - `string program_path`: the path to the prebuilt program (see below) 12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support a fixed input shape, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network specific docs on the recommended overlapping size. 13 | - `int[] tilesize`: Even for CNN where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network specific docs on the recommended tile size. 14 | - `int device_id`: Specifies the GPU device id to use, default 0. Requires AMD GPUs with gfx1030 target or RDNA3 architecture onwards ([list](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)). 15 | - `int num_streams`: number of concurrent HIP streams to use. Default 1. Increase if GPU not saturated.
16 | - `string flexible_output_prop`: used to support onnx models with an arbitrary number of output planes.
17 | 
18 | ```python3
19 | from typing import TypedDict
20 | 
21 | class Output(TypedDict):
22 |     clip: vs.VideoNode
23 |     num_planes: int
24 | 
25 | prop = "planes" # arbitrary non-empty string
26 | output = core.migx.Model(src, program_path, flexible_output_prop=prop) # type: Output
27 | 
28 | clip = output["clip"]
29 | num_planes = output["num_planes"]
30 | 
31 | output_planes = [
32 |     clip.std.PropToClip(prop=f"{prop}{i}")
33 |     for i in range(num_planes)
34 | ] # type: list[vs.VideoNode]
35 | ```
36 | 
37 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
38 | 
39 | The general rule is to either:
40 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or
41 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown out so that only internal output pixels are used.
42 | 
43 | ## Instructions
44 | 
45 | ### Build program
46 | ```shell
47 | migraphx-driver compile --onnx drunet_gray.onnx --gpu --input-dim @input 1 2 1080 1920 --output dpir_gray_1080p.mxr
48 | ```
49 | 
50 | The program can be applied to `1920x1080` inputs.
51 | 
52 | Also check [migraphx-driver useful arguments](#migraphx-driver-useful-arguments)
53 | 
54 | ### Run model
55 | In a vpy script:
56 | ```python3
57 | # DPIR
58 | src = core.std.BlankClip(src, width=1920, height=1080, format=vs.GRAYS)
59 | sigma = 10.0
60 | flt = core.migx.Model([src, core.std.BlankClip(src, color=sigma/255.0)], program_path="dpir_gray_1080p.mxr", tilesize=[1920, 1080])
61 | ```
62 | 
63 | ## migraphx-driver useful arguments
64 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled)
65 | 
66 | - `--output <file>`: Save the serialized program
67 | 
68 | - `--migraphx <file>`: Load a serialized program
69 | 
70 | - `--optimize`: Performs common graph optimizations
71 | 
72 | - `--exhaustive-tune`: Enables exhaustive search to find the fastest kernel
73 | 
74 | - `--disable-fast-math`: Disable fast math optimization
75 | 
76 | Also check the [full list of options](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/migraphx-driver.html#options) and [environment variables](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/dev/env_vars.html).
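For example, the flags above can be combined when building a program; a sketch (file names are placeholders, and exhaustive tuning can take a while):

```shell
migraphx-driver compile --onnx drunet_gray.onnx --gpu --input-dim @input 1 2 1080 1920 --fp16 --exhaustive-tune --output dpir_gray_1080p_fp16.mxr
```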
77 | 78 | -------------------------------------------------------------------------------- /.github/workflows/linux-ncnn.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-NCNN) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsncnn/**' 8 | - '.github/workflows/linux-ncnn.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsncnn 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsncnn/protobuf/install 33 | key: ${{ runner.os }}-vsncnn-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | ref: v3.21.12 41 | fetch-depth: 1 42 | path: vsncnn/protobuf 43 | 44 | - name: Configure protobuf 45 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 49 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 50 | 51 | - name: Build protobuf 52 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 53 | run: cmake --build protobuf/build_rel --verbose 54 | 55 | - name: Install protobuf 56 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 57 | run: cmake --install protobuf/build_rel --prefix protobuf/install 58 | 59 | - name: Cache onnx 60 | id: cache-onnx 61 | uses: actions/cache@v4 62 | with: 63 | path: vsncnn/onnx/install 64 | key: ${{ runner.os }}-vsncnn-onnx-v1 65 | 66 | - name: Checkout onnx 67 | if: steps.cache-onnx.outputs.cache-hit != 'true' 68 | uses: actions/checkout@v4 69 | with: 70 | repository: onnx/onnx 71 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c 72 | fetch-depth: 1 73 | path: vsncnn/onnx 74 | 75 | - name: Configure onnx 76 | if: steps.cache-onnx.outputs.cache-hit != 'true' 77 | run: cmake -S onnx -B onnx/build -G Ninja -LA 78 | -D CMAKE_BUILD_TYPE=Release 79 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 80 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 81 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 82 | -D Protobuf_LIBRARIES=protobuf/install/lib 83 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 84 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 85 | 86 | - name: Build onnx 87 | if: steps.cache-onnx.outputs.cache-hit != 'true' 88 | run: cmake --build onnx/build --verbose 89 | 90 | - name: Install onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --install onnx/build --prefix onnx/install 93 | 94 | - name: Download VapourSynth headers 95 | run: | 96 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 97 | unzip -q vs.zip 98 | mv vapoursynth*/ vapoursynth 99 | 100 | - name: Download NCNN Precompilation 101 | run: | 102 | curl -s -o ncnn.zip -LJO https://github.com/Tencent/ncnn/releases/download/20250503/ncnn-20250503-ubuntu-2404.zip 103 | unzip -q ncnn.zip 104 | 105 | - name: Configure 106 | run: cmake -S . 
-B build -G Ninja -LA 107 | -D CMAKE_BUILD_TYPE=Release 108 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth/include 109 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 110 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 111 | -D ncnn_DIR=ncnn-20250503-ubuntu-2404/lib/cmake/ncnn 112 | -D CMAKE_CXX_STANDARD=20 113 | 114 | - name: Build 115 | run: cmake --build build --verbose 116 | 117 | - name: Install 118 | run: cmake --install build --prefix install 119 | 120 | - name: Prepare for upload 121 | run: | 122 | mkdir artifact 123 | cp -v install/lib/*.so artifact 124 | 125 | - name: Describe 126 | run: git describe --tags --long 127 | 128 | - name: Upload 129 | uses: actions/upload-artifact@v4 130 | with: 131 | name: vsncnn-linux-x64 132 | path: vsncnn/artifact 133 | -------------------------------------------------------------------------------- /.github/workflows/windows-cuda-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-cuda dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2025 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download cuDNN inference library 32 | run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.13.0.50_cuda13-archive.zip -o cudnn.zip 33 | 34 | - name: Extract cuDNN library 35 | run: unzip cudnn.zip 36 | 37 | - name: Move cuDNN library 38 | run: | 39 | mkdir -p vsmlrt-cuda 40 | mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v 41 | # rm vsmlrt-cuda/cudnn_*_train*.dll -v 42 | 43 | - name: Download TensorRT library 44 | run: | 45 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.14.1/zip/TensorRT-10.14.1.48.Windows.win10.cuda-13.0.zip 46 | 47 | - name: Extract TensorRT library 48 | run: | 49 | unzip trt.zip 50 | mv TensorRT-*/ TensorRT/ 51 | 52 | - name: Move TensorRT library 53 | run: mv TensorRT/bin/*.dll vsmlrt-cuda -v 54 | 55 | - name: Download CUDA Libraries 56 | shell: cmd 57 | run: | 58 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe 59 | cuda_installer.exe -s cudart_13.0 cublas_13.0 cufft_13.0 cupti_13.0 nvrtc_13.0 60 | 61 | - name: Move CUDA Libraries 62 | shell: cmd 63 | run: | 64 | move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda 65 | move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64\*.dll" vsmlrt-cuda 66 | del vsmlrt-cuda\cudart32*.dll 67 | 68 | - name: Download TensorRT-RTX library 69 | run: | 70 | curl -L -o trt-rtx.zip https://developer.nvidia.com/downloads/trt/rtx_sdk/secure/1.2/TensorRT-RTX-1.2.0.44-win10-amd64-cuda-13.0-Release-external.zip 71 | 72 | - name: Extract TensorRT-RTX library 73 | run: | 74 | unzip trt-rtx.zip 75 | mv TensorRT-RTX-*/ TensorRT-RTX/ 76 | 77 | - name: Move TensorRT-RTX library 78 | run: | 79 | mv TensorRT-RTX/bin/tensorrt_rtx.exe vsmlrt-cuda -v 80 | mv TensorRT-RTX/bin/*.dll vsmlrt-cuda -v 81 | 82 | - name: Setup VC commands 83 | uses: ilammy/msvc-dev-cmd@v1 84 | with: 
85 | arch: amd64 86 | 87 | - name: Copy VC Runtime Libraries 88 | shell: bash 89 | run: | 90 | cd vsmlrt-cuda 91 | while true; do 92 | changed=false 93 | for dll in *.[dD][lL][lL]; do 94 | for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do 95 | echo "finding $dep for $dll" 96 | if ! test -f ./"$dep"; then 97 | changed=true 98 | src="$(where "$dep" | grep -i 'MSVC' | head -1)" 99 | echo "copying $src for $dep" 100 | test -f "$src" || exit 1 101 | cp -f "$src" . 102 | fi 103 | done 104 | done 105 | $changed || break 106 | done 107 | 108 | - name: Compress 109 | run: | 110 | 7z a -t7z -bb3 -mx=9 -v2147483647b vsmlrt-cuda.7z vsmlrt-cuda 111 | 112 | - name: Upload 113 | uses: actions/upload-artifact@v4 114 | with: 115 | name: vsmlrt-cuda 116 | path: vsmlrt-cuda.7z* 117 | retention-days: 1 118 | compression-level: 0 119 | 120 | - name: Rename release asset 121 | run: | 122 | mv vsmlrt-cuda.7z.001 vsmlrt-cuda.${{ github.event.inputs.tag}}.7z.001 123 | mv vsmlrt-cuda.7z.002 vsmlrt-cuda.${{ github.event.inputs.tag}}.7z.002 124 | 125 | - name: Release 126 | uses: softprops/action-gh-release@v2 127 | with: 128 | tag_name: ${{ github.event.inputs.tag }} 129 | files: vsmlrt-cuda.${{ github.event.inputs.tag }}.7z* 130 | fail_on_unmatched_files: true 131 | generate_release_notes: false 132 | prerelease: true 133 | -------------------------------------------------------------------------------- /vstrt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-trt VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT") 9 | option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE) 10 | option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE) 11 | set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix") 12 | 13 | if (WIN32) 14 | file(GLOB files "${TENSORRT_HOME}/lib/tensorrt_rtx*.lib") 15 | else() 16 | file(GLOB files "${TENSORRT_HOME}/lib/libtensorrt_rtx*.so") 17 | endif() 18 | if (files) 19 | set(USE_TRT_RTX TRUE) 20 | else() 21 | set(USE_TRT_RTX FALSE) 22 | endif() 23 | 24 | FIND_PACKAGE(CUDAToolkit REQUIRED) 25 | 26 | if (USE_TRT_RTX) 27 | add_library(vstrt_rtx SHARED 28 | $<$: longpath.manifest> 29 | vs_tensorrt.cpp 30 | win32.cpp 31 | ) 32 | 33 | target_include_directories(vstrt_rtx PRIVATE 34 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 35 | ${CUDAToolkit_INCLUDE_DIRS} 36 | ${TENSORRT_HOME}/include 37 | ) 38 | 39 | set_target_properties(vstrt_rtx PROPERTIES 40 | CXX_EXTENSIONS OFF 41 | POSITION_INDEPENDENT_CODE ON 42 | CXX_STANDARD 20 43 | CXX_STANDARD_REQUIRED ON 44 | ) 45 | 46 | target_link_directories(vstrt_rtx PRIVATE ${TENSORRT_HOME}/lib) 47 | target_link_libraries(vstrt_rtx PRIVATE CUDA::cudart_static) 48 | 49 | if (WIN32) 50 | target_link_libraries(vstrt_rtx PRIVATE "tensorrt_rtx${TENSORRT_LIBRARY_SUFFIX}") 51 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 52 | target_link_options(vstrt_rtx PRIVATE "/DELAYLOAD:tensorrt_rtx${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") 53 | endif() 54 | else() 55 | target_link_libraries(vstrt_rtx PRIVATE tensorrt_rtx) 56 | endif() 57 | 58 | target_compile_definitions(vstrt_rtx PRIVATE USE_NVINFER_PLUGIN) 59 | 60 | target_include_directories(vstrt_rtx PUBLIC 61 | "${PROJECT_BINARY_DIR}" 62 | ) 63 | 64 | find_package(Git REQUIRED) 65 | execute_process( 66 | COMMAND 
${GIT_EXECUTABLE} describe --tags --long --always 67 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 68 | OUTPUT_VARIABLE VCS_TAG 69 | ) 70 | string(STRIP ${VCS_TAG} VCS_TAG) 71 | configure_file(config.h.in config.h) 72 | 73 | install(TARGETS vstrt_rtx 74 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 75 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 76 | ) 77 | else() 78 | add_library(vstrt SHARED 79 | $<$: longpath.manifest> 80 | vs_tensorrt.cpp 81 | win32.cpp 82 | ) 83 | 84 | target_include_directories(vstrt PRIVATE 85 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 86 | ${CUDAToolkit_INCLUDE_DIRS} 87 | ${TENSORRT_HOME}/include 88 | ) 89 | 90 | set_target_properties(vstrt PROPERTIES 91 | CXX_EXTENSIONS OFF 92 | POSITION_INDEPENDENT_CODE ON 93 | CXX_STANDARD 20 94 | CXX_STANDARD_REQUIRED ON 95 | ) 96 | 97 | target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib) 98 | target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}") 99 | 100 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 101 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") 102 | endif() 103 | 104 | if (USE_NVINFER_PLUGIN) 105 | add_definitions(-DUSE_NVINFER_PLUGIN) 106 | if (USE_NVINFER_PLUGIN_STATIC) 107 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}") 108 | else() 109 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}") 110 | 111 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 112 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll") 113 | endif() 114 | endif() 115 | endif() 116 | 117 | target_include_directories(vstrt PUBLIC 118 | "${PROJECT_BINARY_DIR}" 119 | ) 120 | 121 | find_package(Git REQUIRED) 122 | execute_process( 123 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 124 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 125 | OUTPUT_VARIABLE VCS_TAG 126 | ) 127 | string(STRIP ${VCS_TAG} VCS_TAG) 128 | configure_file(config.h.in config.h) 129 | 130 | install(TARGETS vstrt 131 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 132 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 133 | ) 134 | endif() 135 | 136 | -------------------------------------------------------------------------------- /vsort/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth ONNX Runtime 2 | 3 | The vs-onnxruntime plugin provides optimized CPU & CUDA runtime for some popular AI filters. 4 | 5 | ## Building and Installation 6 | 7 | To build, you will need [ONNX Runtime](https://www.onnxruntime.ai/), [protobuf](https://github.com/protocolbuffers/protobuf), [ONNX](https://github.com/onnx/onnx) and their dependencies. 8 | 9 | Please refer to [ONNX Runtime Docs](https://onnxruntime.ai/docs/install/) for installation notes. 10 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/onnxruntime/releases/latest/). 11 | 12 | Please refer to our [github actions workflow](../.github/workflows/windows-ort.yml) for sample building instructions. 13 | 14 | If you only use the CPU backend, then you just need to extract binary release into your `vapoursynth/plugins` directory. 15 | 16 | However, if you also use the CUDA backend, you will need to download some CUDA libraries as well, please see the release page for details. Those CUDA libraries also need to be extracted into VS `vapoursynth/plugins` directory. 
The plugin will try to load them from the `vapoursynth/plugins/vsort/` directory or the `vapoursynth/plugins/vsmlrt-cuda/` directory.
17 | 
18 | ## Usage
19 | 
20 | Prototype: `core.ort.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string provider = "", int device_id = 0, int verbosity = 2, bint cudnn_benchmark = True, bint builtin = False, string builtindir="models", bint fp16 = False, bint path_is_serialization = False, bint use_cuda_graph = False])`
21 | 
22 | Arguments:
23 | - `clip[] clips`: the input clips, only 32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki).
24 | - `string network_path`: the path to the network in ONNX format.
25 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlap (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlap size.
26 | - `int[] tilesize`: Even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlap). Please refer to network-specific docs on the recommended tile size.
27 | - `string provider`: Specifies the device to run the inference on.
28 | - `"CPU"` or `""`: pure CPU backend
29 | - `"CUDA"`: CUDA GPU backend, requires Nvidia Maxwell+ GPUs.
30 | - `"DML"`: DirectML backend
31 | - `"COREML"`: CoreML backend
32 | - `int device_id`: select the GPU device for the CUDA backend.
33 | - `int verbosity`: specifies the verbosity of logging; the default is warning.
34 | - 0: fatal error only, `ORT_LOGGING_LEVEL_FATAL`
35 | - 1: also errors, `ORT_LOGGING_LEVEL_ERROR`
36 | - 2: also warnings, `ORT_LOGGING_LEVEL_WARNING`
37 | - 3: also info, `ORT_LOGGING_LEVEL_INFO`
38 | - 4: everything, `ORT_LOGGING_LEVEL_VERBOSE`
39 | - `bint cudnn_benchmark`: whether to let cuDNN use benchmarking to search for the best convolution kernel to use. Default True. It might incur some startup latency.
40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`.
41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models".
42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation.
43 | - `bint path_is_serialization`: whether the `network_path` argument specifies an onnx serialization of type `bytes`.
44 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead in the CUDA backend. Not all models are supported.
45 | - `int ml_program`: select the CoreML provider.
46 | - 0: NeuralNetwork
47 | - 1: MLProgram
48 | 
49 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
50 | 
51 | The general rule is to either:
52 | 1.
leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or
53 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown out so that only internal output pixels are used.
--------------------------------------------------------------------------------
/.github/workflows/macos-ort.yml:
--------------------------------------------------------------------------------
1 | name: Build (macOS-ORT)
2 | 
3 | on:
4 | push:
5 | paths:
6 | - 'common/**'
7 | - 'vsort/**'
8 | - '.github/workflows/macos-ort.yml'
9 | workflow_dispatch:
10 | 
11 | jobs:
12 | build-macos:
13 | runs-on: macos-14
14 | 
15 | defaults:
16 | run:
17 | working-directory: vsort
18 | 
19 | steps:
20 | - name: Checkout repo
21 | uses: actions/checkout@v4
22 | with:
23 | fetch-depth: 0
24 | 
25 | - name: Setup Ninja
26 | run: brew install ninja
27 | 
28 | - name: Cache protobuf
29 | id: cache-protobuf
30 | uses: actions/cache@v4
31 | with:
32 | path: vsort/protobuf/install
33 | key: ${{ runner.os }}-vsort-protobuf-v1
34 | 
35 | - name: Checkout protobuf
36 | uses: actions/checkout@v4
37 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
38 | with:
39 | repository: protocolbuffers/protobuf
40 | ref: v3.21.12
41 | fetch-depth: 1
42 | path: vsort/protobuf
43 | 
44 | - name: Configure protobuf
45 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA
47 | -D CMAKE_BUILD_TYPE=Release
48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON
49 | -D protobuf_BUILD_SHARED_LIBS=OFF
50 | -D protobuf_BUILD_TESTS=OFF
51 | 
52 | - name: Build protobuf
53 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
54 | run: cmake --build protobuf/build_rel --verbose
55 | 
56 | - name: Install protobuf
57 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
58 | run: cmake --install protobuf/build_rel --prefix protobuf/install
59 | 
60 | - name: Cache onnx
61 | id: cache-onnx
62 | uses: actions/cache@v4
63 | with:
64 | path: vsort/onnx/install
65 | key: ${{ runner.os }}-vsort-onnx-v2
66 | 
67 | - name: Checkout onnx
68 | if: steps.cache-onnx.outputs.cache-hit != 'true'
69 | uses: actions/checkout@v4
70 | with:
71 | repository: onnx/onnx
72 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/main/cmake/external
73 | ref: 595228d99e3977ac27cb79d5963adda262af99ad
74 | fetch-depth: 1
75 | path: vsort/onnx
76 | 
77 | - name: Configure onnx
78 | if: steps.cache-onnx.outputs.cache-hit != 'true'
79 | run: cmake -S onnx -B onnx/build -G Ninja -LA
80 | -D CMAKE_BUILD_TYPE=Release
81 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON
82 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc
83 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib
84 | -D Protobuf_LIBRARIES=protobuf/install/lib
85 | -D ONNX_USE_LITE_PROTO=ON
86 | -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF
88 | -D ONNX_ML=0
89 | 
90 | - name: Build onnx
91 | if: steps.cache-onnx.outputs.cache-hit != 'true'
92 | run: cmake --build onnx/build --verbose
93 | 
94 | - name: Install onnx
95 | if: steps.cache-onnx.outputs.cache-hit != 'true'
96 | run: cmake --install onnx/build --prefix onnx/install
97 | 
98 | - name: Download VapourSynth headers
99 | run: |
100 | curl -L -o vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
101 | unzip -q vs.zip
102 | mv vapoursynth*/ vapoursynth
103 | 
104 | - name: Setup ONNX
Runtime
105 | run: |
106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.20.0/onnxruntime-osx-arm64-1.20.0.tgz
107 | tar -xf ort.tgz
108 | mv onnxruntime-* onnxruntime
109 | 
110 | - name: Configure
111 | run: cmake -S . -B build -G Ninja -LA
112 | -D CMAKE_BUILD_TYPE=Release
113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -mcpu=apple-m1"
114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include"
115 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include
116 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib
117 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf
118 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX
119 | -D CMAKE_CXX_STANDARD=20
120 | -D ENABLE_COREML=ON
121 | 
122 | - name: Build
123 | run: cmake --build build --verbose
124 | 
125 | - name: Install
126 | run: cmake --install build --prefix install
127 | 
128 | - name: Prepare for upload
129 | run: |
130 | mkdir artifact
131 | cp -v install/lib/*.dylib artifact
132 | 
133 | - name: Describe
134 | run: git describe --tags --long
135 | 
136 | - name: Upload
137 | uses: actions/upload-artifact@v4
138 | with:
139 | name: vsort-macos-arm64
140 | path: vsort/artifact
141 | 
--------------------------------------------------------------------------------
/vsov/README.md:
--------------------------------------------------------------------------------
1 | # VapourSynth OpenVINO
2 | 
3 | The vs-openvino plugin provides an optimized *pure* CPU runtime for some popular AI filters.
4 | 
5 | ## Building and Installation
6 | 
7 | To build, you will need [OpenVINO](https://docs.openvino.ai/latest/get_started.html) and its dependencies.
8 | Only `Model Optimizer` and `Inference Engine` are required.
9 | 
10 | You can download official Intel releases:
11 | - [Linux](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_linux_header.html)
12 | - [Windows](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_windows_header.html)
13 | - [macOS](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_macos_header.html)
14 | 
15 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/openvino/releases/latest/); our release has the benefit of static linking support.
16 | 
17 | Sample cmake commands to build:
18 | ```bash
19 | cmake -S . -B build -G Ninja -D CMAKE_BUILD_TYPE=Release
20 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
21 | -D InferenceEngine_DIR=openvino/runtime/cmake
22 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="path/to/vapoursynth/include"
23 | cmake --build build
24 | cmake --install build --prefix install
25 | ```
26 | You should find `vsov.dll` (or `libvsov.so`) under `install/bin`. You will also need Intel TBB (you can get
27 | `tbb.dll` from an OpenVINO release). On Windows, `tbb.dll` must be placed under the `vapoursynth/plugins/vsov/`
28 | directory for `vsov.dll` to find it.
29 | 
30 | ## Usage
31 | 
32 | Prototype: `core.ov.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string device = "CPU", bint builtin = 0, string builtindir="models", bint fp16 = False, function config = None, bint path_is_serialization = False])`
33 | 
34 | Arguments:
35 | - `clip[] clips`: the input clips, only 32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki).
36 | - `string network_path`: the path to the network in ONNX format.
37 | - `int[] overlap`: some networks (e.g.
[CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlap (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlap size.
38 | - `int[] tilesize`: Even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlap). Please refer to network-specific docs on the recommended tile size.
39 | - `string device`: Specifies the device to run the inference on. Currently `"CPU"` and `"GPU"` are supported. `"GPU"` requires Intel graphics (Broadwell+ processors with Gen8+ integrated GPUs or Xe discrete GPUs) with a compatible graphics driver and compute runtime.
40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`.
41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models".
42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation.
43 | - `function config`: plugin configuration parameters. It must be a callable object (e.g. a function) that takes no positional arguments and returns the configuration parameters in a dictionary `dict`. The dictionary must use string `str` for its keys and `int`, `float` or `str` for its values. Supported parameters: [CPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_CPU.html#supported-configuration-parameters), [GPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_GPU.html#supported-configuration-parameters) (the prefix `KEY_` has to be removed). Example: `config = lambda: dict(CPU_THROUGHPUT_STREAMS=2)`
44 | - `bint path_is_serialization`: whether the `network_path` argument specifies an onnx serialization of type `bytes`.
45 | 
46 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
47 | 
48 | The general rule is to either:
49 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or
50 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown out so that only internal output pixels are used. A tiled invocation is sketched below.
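A minimal sketch of rule 2, assuming a hypothetical `model.onnx` that accepts a GRAYS clip (the tile numbers are illustrative, and `tilesize` includes the overlap):

```python3
import vapoursynth as vs

core = vs.core

src = core.std.BlankClip(width=1920, height=1080, format=vs.GRAYS)
# 1024x576 tiles with a 16-pixel overlap in each direction; the overlapped
# borders are computed but discarded when the tiles are stitched back together.
flt = core.ov.Model(src, "model.onnx", tilesize=[1024, 576], overlap=[16, 16], device="CPU")
```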
51 | 
--------------------------------------------------------------------------------
/vstrt/inference_helper.h:
--------------------------------------------------------------------------------
1 | #ifndef VSTRT_INFERENCE_HELPER_H_
2 | #define VSTRT_INFERENCE_HELPER_H_
3 | 
4 | #include <algorithm>
5 | #include <cstdint>
6 | #include <optional>
7 | #include <string>
8 | #include <vector>
9 | #include <cuda_runtime.h>
10 | 
11 | #include <VSHelper.h>
12 | 
13 | #include "cuda_helper.h"
14 | #include "trt_utils.h"
15 | 
16 | struct InputInfo {
17 | int width;
18 | int height;
19 | int pitch;
20 | int bytes_per_sample;
21 | int tile_w;
22 | int tile_h;
23 | };
24 | 
25 | struct OutputInfo {
26 | int pitch;
27 | int bytes_per_sample;
28 | };
29 | 
30 | struct IOInfo {
31 | InputInfo in;
32 | OutputInfo out;
33 | int w_scale;
34 | int h_scale;
35 | int overlap_w;
36 | int overlap_h;
37 | };
38 | 
39 | static inline
40 | std::optional<ErrorMessage> inference(
41 | const InferenceInstance & instance,
42 | int device_id,
43 | bool use_cuda_graph,
44 | const IOInfo & info,
45 | const std::vector<const uint8_t *> & src_ptrs,
46 | const std::vector<uint8_t *> & dst_ptrs
47 | ) noexcept {
48 | 
49 | const auto set_error = [](const ErrorMessage & error_message) {
50 | return error_message;
51 | };
52 | 
53 | checkError(cudaSetDevice(device_id));
54 | 
55 | int src_tile_w_bytes = info.in.tile_w * info.in.bytes_per_sample;
56 | int src_tile_bytes = info.in.tile_h * info.in.tile_w * info.in.bytes_per_sample;
57 | int dst_tile_w = info.in.tile_w * info.w_scale;
58 | int dst_tile_h = info.in.tile_h * info.h_scale;
59 | int dst_tile_w_bytes = dst_tile_w * info.out.bytes_per_sample;
60 | int dst_tile_bytes = dst_tile_h * dst_tile_w * info.out.bytes_per_sample;
61 | 
62 | int step_w = info.in.tile_w - 2 * info.overlap_w;
63 | int step_h = info.in.tile_h - 2 * info.overlap_h;
64 | 
65 | int y = 0;
66 | while (true) {
67 | int y_crop_start = (y == 0) ? 0 : info.overlap_h;
68 | int y_crop_end = (y == info.in.height - info.in.tile_h) ? 0 : info.overlap_h;
69 | 
70 | int x = 0;
71 | while (true) {
72 | int x_crop_start = (x == 0) ? 0 : info.overlap_w;
73 | int x_crop_end = (x == info.in.width - info.in.tile_w) ?
0 : info.overlap_w;
74 | 
75 | {
76 | uint8_t * h_data = instance.src.h_data.data;
77 | for (const uint8_t * _src_ptr : src_ptrs) {
78 | const uint8_t * src_ptr { _src_ptr +
79 | y * info.in.pitch + x * info.in.bytes_per_sample
80 | };
81 | 
82 | vs_bitblt(
83 | h_data, src_tile_w_bytes,
84 | src_ptr, info.in.pitch,
85 | static_cast<size_t>(src_tile_w_bytes),
86 | static_cast<size_t>(info.in.tile_h)
87 | );
88 | 
89 | h_data += src_tile_bytes;
90 | }
91 | }
92 | 
93 | if (use_cuda_graph) {
94 | checkError(cudaGraphLaunch(instance.graphexec, instance.stream));
95 | } else {
96 | auto result = enqueue(
97 | instance.src, instance.dst,
98 | instance.exec_context, instance.stream
99 | );
100 | 
101 | if (result.has_value()) {
102 | return set_error(result.value());
103 | }
104 | }
105 | checkError(cudaStreamSynchronize(instance.stream));
106 | 
107 | {
108 | const uint8_t * h_data = instance.dst.h_data.data;
109 | for (uint8_t * _dst_ptr : dst_ptrs) {
110 | uint8_t * dst_ptr { _dst_ptr +
111 | info.h_scale * y * info.out.pitch + info.w_scale * x * info.out.bytes_per_sample
112 | };
113 | 
114 | vs_bitblt(
115 | dst_ptr + (y_crop_start * info.out.pitch + x_crop_start * info.out.bytes_per_sample),
116 | info.out.pitch,
117 | h_data + (y_crop_start * dst_tile_w_bytes + x_crop_start * info.out.bytes_per_sample),
118 | dst_tile_w_bytes,
119 | static_cast<size_t>(dst_tile_w_bytes - (x_crop_start + x_crop_end) * info.out.bytes_per_sample),
120 | static_cast<size_t>(dst_tile_h - (y_crop_start + y_crop_end))
121 | );
122 | 
123 | h_data += dst_tile_bytes;
124 | }
125 | }
126 | 
127 | if (x + info.in.tile_w == info.in.width) {
128 | break;
129 | }
130 | 
131 | x = std::min(x + step_w, info.in.width - info.in.tile_w);
132 | }
133 | 
134 | if (y + info.in.tile_h == info.in.height) {
135 | break;
136 | }
137 | 
138 | y = std::min(y + step_h, info.in.height - info.in.tile_h);
139 | }
140 | 
141 | return {};
142 | }
143 | 
144 | #endif // VSTRT_INFERENCE_HELPER_H_
--------------------------------------------------------------------------------
/.github/workflows/linux-ov.yml:
--------------------------------------------------------------------------------
1 | name: Build (Linux-OV)
2 | 
3 | on:
4 | push:
5 | paths:
6 | - 'vsov/**'
7 | - '.github/workflows/linux-ov.yml'
8 | workflow_dispatch:
9 | 
10 | jobs:
11 | build-linux:
12 | runs-on: ubuntu-22.04
13 | 
14 | defaults:
15 | run:
16 | working-directory: vsov
17 | 
18 | steps:
19 | - name: Checkout repo
20 | uses: actions/checkout@v4
21 | with:
22 | fetch-depth: 0
23 | 
24 | - name: Setup Ninja
25 | run: pip install ninja
26 | 
27 | - name: Cache protobuf
28 | id: cache-protobuf
29 | uses: actions/cache@v4
30 | with:
31 | path: vsov/protobuf/install
32 | key: ${{ runner.os }}-vsov-protobuf-v1
33 | 
34 | - name: Checkout protobuf
35 | uses: actions/checkout@v4
36 | if: steps.cache-protobuf.outputs.cache-hit != 'true'
37 | with:
38 | repository: protocolbuffers/protobuf
39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf
40 | # if you change this, remember to bump the version of the cache key.
41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu24_2024.6.0.17404.4c0f47d2335_x86_64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . 
-B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-x64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/linux-ov-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-OV, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsov/**' 7 | - '.github/workflows/linux-ov-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vsov 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Cache protobuf 28 | id: cache-protobuf 29 | uses: actions/cache@v4 30 | with: 31 | path: vsov/protobuf/install 32 | key: ${{ runner.os }}-vsov-protobuf-arm64-v1 33 | 34 | - name: Checkout protobuf 35 | uses: actions/checkout@v4 36 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 37 | with: 38 | repository: protocolbuffers/protobuf 39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf 40 | # if you change this, remember to bump the version of the cache key. 41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-arm64-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 
74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu20_2024.6.0.17404.4c0f47d2335_arm64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . -B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-ARM64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/windows-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/windows-trt.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2025 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vstrt 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Setup Ninja 42 | run: pip install ninja 43 | 44 | - name: Cache CUDA 45 | id: cache-cuda 46 | uses: actions/cache@v4 47 | with: 48 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 49 | key: ${{ runner.os }}-vstrt-cuda-13.0.2 50 | 51 | - name: Setup CUDA 52 | if: steps.cache-cuda.outputs.cache-hit != 'true' 53 | run: | 54 | curl -s -o 
cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe 55 | cuda_installer.exe -s nvcc_13.0 cudart_13.0 cuda_profiler_api_13.0 crt_13.0 nvptxcompiler_13.0 56 | 57 | - name: Download TensorRT 58 | run: | 59 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.14.1/zip/TensorRT-10.14.1.48.Windows.win10.cuda-13.0.zip 60 | unzip trt.zip 61 | mv TensorRT-*/ tensorrt/ 62 | 63 | - name: Download VapourSynth headers 64 | run: | 65 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 66 | unzip -q vs.zip 67 | mv vapoursynth-*/ vapoursynth/ 68 | 69 | - name: Configure 70 | run: cmake -S . -B build -G Ninja -LA 71 | -D CMAKE_BUILD_TYPE=Release 72 | -D CMAKE_CXX_FLAGS="/EHsc /Wall /wd4100 /wd4625 /wd4626 /wd4710 /wd4711 /wd4820 /wd4996 /wd5026 /wd5027" 73 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 74 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 75 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 76 | -D TENSORRT_HOME="%cd%\tensorrt" 77 | -D USE_NVINFER_PLUGIN=ON 78 | -D TENSORRT_LIBRARY_SUFFIX="_10" 79 | 80 | - name: Build 81 | run: cmake --build build --config Release --verbose 82 | 83 | - name: Install 84 | run: cmake --install build --prefix install 85 | 86 | - name: Checkout TensorRT OSS 87 | uses: actions/checkout@v4 88 | with: 89 | repository: NVIDIA/TensorRT 90 | ref: v10.14 91 | fetch-depth: 1 92 | path: tensorrt-oss 93 | 94 | - name: Override trtexec CMake file 95 | run: | 96 | mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec 97 | mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec 98 | mv longpath.manifest ../tensorrt-oss/samples/trtexec 99 | 100 | - name: Apply patch 101 | run: | 102 | mv trtexec/trtexec.patch ../tensorrt-oss 103 | cd ../tensorrt-oss 104 | 105 | git apply trtexec.patch --verbose 106 | 107 | - name: Apply patch2 108 | shell: bash 109 | run: sed -i 's/fp16 || bf16 || int8 || fp8 || int4 || best/0/g' ../tensorrt-oss/samples/common/sampleOptions.cpp 110 | 111 | - name: Configure trtexec 112 | run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja 113 | -D CMAKE_BUILD_TYPE=Release 114 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 115 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 116 | -D CMAKE_UNITY_BUILD=ON -D CMAKE_UNITY_BUILD_BATCH_SIZE=0 117 | -D CMAKE_CXX_STANDARD=20 118 | 119 | - name: Build trtexec 120 | run: cmake --build build_trtexec --verbose 121 | 122 | - name: Install trtexec 123 | run: cmake --install build_trtexec --prefix trtexec 124 | 125 | - name: Prepare for upload 126 | run: | 127 | mkdir artifact 128 | copy install\bin\vstrt.dll artifact\ 129 | mkdir artifact\vsmlrt-cuda 130 | copy trtexec\bin\trtexec.exe artifact\vsmlrt-cuda 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Dump dependencies 136 | run: dumpbin /dependents artifact\vstrt.dll 137 | 138 | - name: Upload 139 | uses: actions/upload-artifact@v4 140 | with: 141 | name: VSTRT-Windows-x64 142 | path: vstrt/artifact 143 | 144 | - name: Compress artifact for release 145 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 146 | run: | 147 | cd artifact 148 | 7z a -t7z -mx=7 ../../VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z . 
149 | 150 | - name: Release 151 | uses: softprops/action-gh-release@v2 152 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 153 | with: 154 | tag_name: ${{ inputs.tag }} 155 | files: VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z 156 | fail_on_unmatched_files: true 157 | generate_release_notes: false 158 | prerelease: true 159 | -------------------------------------------------------------------------------- /.github/workflows/linux-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/linux-ort.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-22.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsort 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsort/protobuf/install 33 | key: ${{ runner.os }}-vsort-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | # follows protobuf in https://github.com/microsoft/onnxruntime/blob/v1.17.1/cmake/external/onnxruntime_external_deps.cmake#L183 41 | # if you change this, remember to bump the version of the cache key. 42 | ref: v3.21.12 43 | fetch-depth: 1 44 | path: vsort/protobuf 45 | 46 | - name: Configure protobuf 47 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 48 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 49 | -D CMAKE_BUILD_TYPE=Release 50 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 51 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 52 | 53 | - name: Build protobuf 54 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 55 | run: cmake --build protobuf/build_rel --verbose 56 | 57 | - name: Install protobuf 58 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 59 | run: cmake --install protobuf/build_rel --prefix protobuf/install 60 | 61 | - name: Cache onnx 62 | id: cache-onnx 63 | uses: actions/cache@v4 64 | with: 65 | path: vsort/onnx/install 66 | key: ${{ runner.os }}-vsort-onnx-v1 67 | 68 | - name: Checkout onnx 69 | if: steps.cache-onnx.outputs.cache-hit != 'true' 70 | uses: actions/checkout@v4 71 | with: 72 | repository: onnx/onnx 73 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/v1.17.1/cmake/external 74 | # if you change this, remember to bump the version of the cache key. 
75 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c
76 | fetch-depth: 1
77 | path: vsort/onnx
78 | 
79 | - name: Configure onnx
80 | if: steps.cache-onnx.outputs.cache-hit != 'true'
81 | run: cmake -S onnx -B onnx/build -G Ninja -LA
82 | -D CMAKE_BUILD_TYPE=Release
83 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON
84 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc
85 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib
86 | -D Protobuf_LIBRARIES=protobuf/install/lib
87 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
88 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0
89 | 
90 | - name: Build onnx
91 | if: steps.cache-onnx.outputs.cache-hit != 'true'
92 | run: cmake --build onnx/build --verbose
93 | 
94 | - name: Install onnx
95 | if: steps.cache-onnx.outputs.cache-hit != 'true'
96 | run: cmake --install onnx/build --prefix onnx/install
97 | 
98 | - name: Download VapourSynth headers
99 | run: |
100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
101 | unzip -q vs.zip
102 | mv vapoursynth*/ vapoursynth
103 | 
104 | - name: Setup ONNX Runtime
105 | run: |
106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-cuda12-1.17.1.tgz
107 | tar -xf ort.tgz
108 | mv onnxruntime-* onnxruntime -v
109 | 
110 | - name: Setup CUDA
111 | run: |
112 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
113 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
114 | sudo apt-get update
115 | sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1
116 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV
117 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV
118 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV
119 | 
120 | - name: Configure
121 | run: cmake -S . -B build -G Ninja -LA
122 | -D CMAKE_BUILD_TYPE=Release
123 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3"
124 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include"
125 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include
126 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib
127 | -D ENABLE_CUDA=1
128 | -D CUDAToolkit_ROOT=/usr/local/cuda
129 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf
130 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX
131 | -D CMAKE_CXX_STANDARD=20
132 | 
133 | - name: Build
134 | run: cmake --build build --verbose
135 | 
136 | - name: Install
137 | run: cmake --install build --prefix install
138 | 
139 | - name: Prepare for upload
140 | run: |
141 | mkdir artifact
142 | cp -v install/lib/*.so artifact
143 | 
144 | - name: Describe
145 | run: git describe --tags --long
146 | 
147 | - name: Upload
148 | uses: actions/upload-artifact@v4
149 | with:
150 | name: vsort-linux-x64-cuda12.1
151 | path: vsort/artifact
152 | 
--------------------------------------------------------------------------------
/vsort/win32.cpp:
--------------------------------------------------------------------------------
1 | #ifdef _MSC_VER
2 | #include <windows.h>
3 | #include <delayimp.h>
4 | #include <cstdlib>
5 | #include <filesystem>
6 | #include <iostream>
7 | #include <map>
8 | #include <stdexcept>
9 | #include <string>
10 | #include <vector>
11 | #define DLL_DIR L"vsort"
12 | #define COMMON_CUDA_DIR L"vsmlrt-cuda"
13 | 
14 | namespace {
15 | std::vector<const wchar_t *> dlls = {
16 | // This list must be sorted by dependency.
17 | L"DirectML.dll", 18 | L"onnxruntime.dll", // must be the last 19 | }; 20 | 21 | static std::vector cudaDlls { 22 | L"cudart64", 23 | L"cublasLt64", L"cublas64", 24 | L"cufft64", 25 | // follows the dependency graph in 26 | // https://docs.nvidia.com/deeplearning/cudnn/backend/v9.12.0/api/overview.html#backend-api-overview 27 | L"cudnn_graph64", 28 | L"cudnn_engines_precompiled64", L"cudnn_heuristic64", L"cudnn_engines_runtime_compiled64", 29 | L"cudnn_ops64", L"cudnn_cnn64", L"cudnn_adv64", 30 | L"cudnn64", 31 | L"cupti64", 32 | }; 33 | 34 | bool verbose() { return getenv("VSORT_VERBOSE") != nullptr; } 35 | 36 | namespace fs = std::filesystem; 37 | static fs::path dllDir() { 38 | static const std::wstring res = []() -> std::wstring { 39 | HMODULE mod = 0; 40 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 41 | std::vector buf; 42 | size_t n = 0; 43 | do { 44 | buf.resize(buf.size() + MAX_PATH); 45 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 46 | } while (n >= buf.size()); 47 | buf.resize(n); 48 | std::wstring path(buf.begin(), buf.end()); 49 | return path; 50 | } 51 | throw std::runtime_error("unable to locate myself"); 52 | }(); 53 | return fs::path(res).parent_path(); 54 | } 55 | 56 | FARPROC loadDLLs() { 57 | fs::path dir = dllDir() / DLL_DIR; 58 | HMODULE h = nullptr; 59 | for (const auto dll: dlls) { 60 | fs::path p = dir / dll; 61 | std::wstring s = p; 62 | h = LoadLibraryW(s.c_str()); 63 | if (verbose()) 64 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl; 65 | if (!h) 66 | std::wcerr << DLL_DIR << L": failed to preload " << s << std::endl; 67 | } 68 | return (FARPROC)h; 69 | } 70 | 71 | static void *dummy() { // mimic OrtGetApiBase 72 | return nullptr; 73 | } 74 | 75 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 76 | switch (reason) { 77 | case dliNoteStartProcessing: 78 | case dliNoteEndProcessing: 79 | // Nothing to do here. 80 | break; 81 | case dliNotePreLoadLibrary: 82 | //std::cerr << "loading " << info->szDll << std::endl; 83 | if (std::string(info->szDll).find("onnxruntime.dll") != std::string::npos) 84 | return loadDLLs(); 85 | break; 86 | case dliNotePreGetProcAddress: 87 | // Nothing to do here. 88 | break; 89 | case dliFailLoadLib: 90 | case dliFailGetProc: 91 | // Returning NULL from error notifications will cause the delay load 92 | // runtime to raise a VcppException structured exception, that some code 93 | // might want to handle. 94 | // The SE will crash the process, so instead we return a dummy function. 95 | return (FARPROC)dummy; 96 | break; 97 | default: 98 | abort(); // unreachable. 99 | break; 100 | } 101 | // Returning NULL causes the delay load machinery to perform default 102 | // processing for this notification. 
103 | return NULL;
104 | }
105 | } // namespace
106 |
107 | extern "C" {
108 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook;
109 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook;
110 | };
111 |
112 | bool preloadCudaDlls() {
113 | std::map<std::wstring, std::filesystem::path> dllmap; // dll name prefix -> resolved file path
114 |
115 | auto findDllIn = [&](const std::filesystem::path &dir) {
116 | if (!std::filesystem::is_directory(dir))
117 | return;
118 | for (const auto &ent: std::filesystem::directory_iterator{dir}) {
119 | if (!ent.is_regular_file())
120 | continue;
121 | const auto path = ent.path();
122 | if (path.extension() != ".dll")
123 | continue;
124 | const std::wstring filename = path.filename().wstring();
125 | for (const auto &dll: cudaDlls) {
126 | if (dllmap.count(dll) > 0)
127 | continue;
128 | if (filename.find(dll) == 0) {
129 | if (verbose())
130 | std::wcerr << DLL_DIR << L": found " << path << L" for " << dll << std::endl;
131 | dllmap.insert({ dll, path });
132 | break;
133 | }
134 | }
135 | }
136 | };
137 | const fs::path dir = dllDir();
138 | findDllIn(dir / DLL_DIR);
139 | findDllIn(dir / COMMON_CUDA_DIR);
140 |
141 | if (verbose()) {
142 | for (const auto &pair: dllmap)
143 | std::wcerr << DLL_DIR << L": will load " << pair.first << L" from " << pair.second << std::endl;
144 | }
145 | for (const auto &dll: cudaDlls) {
146 | if (dllmap.count(dll) == 0) {
147 | if (verbose())
148 | std::wcerr << DLL_DIR << L": unable to preload " << dll << L": not found" << std::endl;
149 | // fail even when not verbose; LoadLibraryW on an empty path would fail below anyway
150 | return false;
151 | }
152 | std::wstring p = dllmap[dll];
153 | HMODULE h = LoadLibraryW(p.c_str());
154 | if (verbose())
155 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl;
156 | if (!h) return false;
157 | }
158 | return true;
159 | }
160 | #endif
161 |
--------------------------------------------------------------------------------
/vstrt/README.md:
--------------------------------------------------------------------------------
1 | # VapourSynth TensorRT & TensorRT-RTX
2 |
3 | The vs-tensorrt plugin provides an optimized CUDA runtime for some popular AI filters.
4 |
5 | ## Usage
6 |
7 | Prototype: `core.{trt, trt_rtx}.Model(clip[] clips, string engine_path[, int[] overlap, int[] tilesize, int device_id=0, bint use_cuda_graph=False, int num_streams=1, int verbosity=2, string flexible_output_prop=""])`
8 |
9 | Arguments:
10 | - `clip[] clips`: the input clips; only 32-bit floating point RGB or GRAY clips are supported. For model-specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki).
11 | - `string engine_path`: the path to the prebuilt engine (see below)
12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes and the input clip must be processed in tiles. The `overlap` argument specifies the overlap between adjacent tiles (horizontal and vertical, or both, in pixels) to minimize boundary issues. Please refer to the network-specific docs for the recommended overlap size.
13 | - `int[] tilesize`: even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlap). Please refer to the network-specific docs for the recommended tile size.
14 | - `int device_id`: Specifies the GPU device id to use, default 0.
Requires an Nvidia GPU of second-generation Kepler architecture or newer.
15 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead.
16 | - `int num_streams`: the number of concurrent CUDA streams to use. Default 1. Increase it if the GPU is not saturated.
17 | - `int verbosity`: the verbosity level of the TensorRT runtime. Messages are written to `stderr`.
18 | `0`: Internal error. `1`: Application error. `2`: Warning. `3`: Informational messages with instructional information. `4`: Verbose messages with debugging information.
19 | - `string flexible_output_prop`: used to support ONNX models with an arbitrary number of output planes.
20 |
21 | ```python3
22 | from typing import TypedDict
23 |
24 | class Output(TypedDict):
25 | clip: vs.VideoNode
26 | num_planes: int
27 |
28 | prop = "planes" # arbitrary non-empty string
29 | output = core.trt.Model(src, engine_path, flexible_output_prop=prop) # type: Output
30 |
31 | clip = output["clip"]
32 | num_planes = output["num_planes"]
33 |
34 | output_planes = [
35 | clip.std.PropToClip(prop=f"{prop}{i}")
36 | for i in range(num_planes)
37 | ] # type: list[vs.VideoNode]
38 | ```
39 |
40 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case.
41 |
42 | The general rule is to either:
43 | 1. leave out `overlap` and `tilesize` entirely and just process the input frame in one tile, or
44 | 2. set both so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region is thrown away so that only interior output pixels are used (see the sketch below).
45 |
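As an illustration of rule 2, here is a minimal sketch of tiled processing (the engine file name and the tile/overlap values are illustrative placeholders, not recommendations; consult the model-specific docs for suitable numbers):

```python3
# src: a 1920x1080 GRAYS clip; "some_model_dynamic.engine" is a hypothetical
# engine built with dynamic shape support (see the next section).
# The frame is processed as a 2x2 grid of 976x556 tiles that overlap by
# 16 pixels in each direction; the overlapped margins are discarded, so
# tile boundaries do not show up in the output.
flt = core.trt.Model(
    src,
    engine_path="some_model_dynamic.engine",
    overlap=[16, 16],
    tilesize=[976, 556],
)
```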
46 | ## Instructions for TensorRT
47 |
48 | ### Build engine with dynamic shape support
49 | - Requires models with built-in dynamic shape support, e.g. `waifu2x_v3.7z` and `dpir_v3.7z`.
50 |
51 | 1. Build engine
52 | ```shell
53 | trtexec --onnx=drunet_gray.onnx --minShapes=input:1x2x8x8 --optShapes=input:1x2x64x64 --maxShapes=input:1x2x1080x1920 --saveEngine=dpir_gray_1080p_dynamic.engine
54 | ```
55 |
56 | The engine will be optimized for `64x64` input and can be applied to eligible inputs with shapes from `8x8` to `1920x1080` by specifying the `tilesize` parameter of the `trt` plugin.
57 |
58 | Also check the [trtexec useful arguments](#trtexec-useful-arguments) below.
59 |
60 | ### Run model
61 | In a vpy script:
62 | ```python3
63 | # DPIR
64 | src = core.std.BlankClip(src, width=640, height=360, format=vs.GRAYS)
65 | sigma = 10.0
66 | flt = core.trt.Model([src, core.std.BlankClip(src, color=sigma/255.0)], engine_path="dpir_gray_1080p_dynamic.engine", tilesize=[640, 360])
67 | ```
68 |
69 | ## trtexec useful arguments
70 | - `--workspace=N`: Set workspace size in megabytes (default = 16)
71 |
72 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled)
73 |
74 | - `--noTF32`: Disable tf32 precision (default is to enable tf32, in addition to fp32, Ampere only)
75 |
76 | - `--device=N`: Select cuda device N (default = 0)
77 |
78 | - `--timingCacheFile=`: Save/load the serialized global timing cache
79 |
80 | - `--verbose`: Use verbose logging (default = false)
81 |
82 | - `--profilingVerbosity=mode`: Specify profiling verbosity.
83 |
84 | ```
85 | mode ::= layer_names_only|detailed|none
86 | ```
87 |
88 | (default = layer_names_only)
89 |
90 | - `--tacticSources=tactics`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default
91 | tactic sources (default = all available tactics).
92 |
93 |
94 | Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics.
95 |
96 | Tactic Sources:
97 | ```
98 | tactics ::= [","tactic]
99 | tactic ::= (+|-)lib
100 | lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN"
101 | ```
102 |
103 | For example, to disable cudnn and enable cublas: `--tacticSources=-CUDNN,+CUBLAS`
104 |
105 | - `--useCudaGraph`: Use CUDA graph to capture engine execution and then launch inference (default = disabled).
106 | This flag may be ignored if the graph capture fails.
107 |
108 | - `--noDataTransfers`: Disable DMA transfers to and from device (default = enabled).
109 |
110 | - `--saveEngine=`: Save the serialized engine
111 |
112 | - `--loadEngine=`: Load a serialized engine
113 |
114 | ## Instructions for TensorRT-RTX
115 | Replace the `trtexec` executable with the `tensorrt_rtx` executable. Some options may not be supported, e.g. `--fp16`.
116 |
117 |
--------------------------------------------------------------------------------
/.github/workflows/windows-ncnn.yml:
--------------------------------------------------------------------------------
1 | name: Build (Windows-NCNN)
2 |
3 | on:
4 | push:
5 | paths:
6 | - 'common/**'
7 | - 'vsncnn/**'
8 | - '.github/workflows/windows-ncnn.yml'
9 | workflow_call:
10 | inputs:
11 | tag:
12 | description: 'which tag to upload to'
13 | required: true
14 | type: string
15 | workflow_dispatch:
16 | inputs:
17 | tag:
18 | description: 'which tag to upload to'
19 | default: ''
20 |
21 | jobs:
22 | build-windows:
23 | runs-on: windows-2025
24 |
25 | defaults:
26 | run:
27 | shell: cmd
28 | working-directory: vsncnn
29 |
30 | steps:
31 | - name: Checkout repo
32 | uses: actions/checkout@v5
33 | with:
34 | fetch-depth: 0
35 |
36 | - name: Setup MSVC
37 | uses: ilammy/msvc-dev-cmd@v1
38 |
39 | - name: Cache onnx
40 | id: cache-onnx
41 | uses: actions/cache@v4
42 | with:
43 | path: vsncnn/onnx/install
44 | key: ${{ runner.os }}-vsncnn-onnx-v1
45 |
46 | - name: Checkout onnx
47 | if: steps.cache-onnx.outputs.cache-hit != 'true'
48 | uses: actions/checkout@v4
49 | with:
50 | repository: onnx/onnx
51 | ref: v1.19.0
52 | path: vsncnn/onnx
53 |
54 | - name: Configure onnx
55 | if: steps.cache-onnx.outputs.cache-hit != 'true'
56 | run: cmake -S onnx -B onnx\build -G Ninja -LA
57 | -D CMAKE_BUILD_TYPE=Release
58 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF
59 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0
60 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1
61 | -D ONNX_BUILD_CUSTOM_PROTOBUF=ON
62 |
63 | - name: Build onnx
64 | if: steps.cache-onnx.outputs.cache-hit != 'true'
65 | run: cmake --build onnx\build --verbose
66 |
67 | - name: Install onnx
68 | if: steps.cache-onnx.outputs.cache-hit != 'true'
69 | run: cmake --install onnx\build --prefix onnx\install
70 |
71 | - name: Download VapourSynth headers
72 | run: |
73 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip
74 | unzip -q vs.zip
75 | mv vapoursynth-*/ vapoursynth/
76 |
77 | - name: Download NCNN Precompilation
78 | shell: bash
79 | run: |
80 | curl -s -o ncnn.zip -LJO https://github.com/AmusementClub/ncnn/releases/download/250919-1038-g86efe80/ncnn-gpu-x64-windows.zip
81 | unzip -q ncnn.zip
82 |
83 | # follows vulkan sdk in
https://github.com/AmusementClub/ncnn/blob/github-actions/.github/workflows/windows-x64-gpu.yml 84 | - name: Setup Vulkan SDK 85 | shell: pwsh 86 | run: | 87 | Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.4.321.1/windows/vulkansdk-windows-X64-1.4.321.1.exe?Human=true -OutFile VulkanSDK.exe 88 | $installer = Start-Process -FilePath VulkanSDK.exe -Wait -PassThru -ArgumentList "--accept-licenses --default-answer --confirm-command install"; 89 | $installer.WaitForExit(); 90 | 91 | - name: Configure 92 | run: cmake -S . -B build -G Ninja -LA 93 | -D CMAKE_BUILD_TYPE=Release 94 | -D CMAKE_PREFIX_PATH=onnx\install 95 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 96 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 97 | -D ncnn_DIR=ncnn\lib\cmake\ncnn 98 | -D CMAKE_CXX_STANDARD=20 99 | env: 100 | VULKAN_SDK: C:\VulkanSDK\1.4.321.1 101 | 102 | - name: Build 103 | run: cmake --build build --verbose 104 | 105 | - name: Install 106 | run: | 107 | cmake --install build --prefix install 108 | mkdir artifact 109 | copy install\bin\vsncnn.dll artifact\ 110 | 111 | - name: Upload 112 | uses: actions/upload-artifact@v4 113 | with: 114 | name: VSNCNN-GPU-Windows-x64 115 | path: vsncnn/artifact 116 | 117 | - name: Setup Python portable 118 | if: false 119 | run: | 120 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 121 | 7z x python.zip -ovs_portable 122 | 123 | - name: Install VapourSynth portable 124 | if: false 125 | run: | 126 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 127 | 7z x vs.7z -ovs_portable -y 128 | 129 | - name: Copy plugin & swiftshader 130 | if: false 131 | run: | 132 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 133 | copy ncnn\tests\* vs_portable\ 134 | 135 | - name: Install waifu2x model 136 | if: false 137 | run: | 138 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 139 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 140 | 141 | - name: Download x265 142 | if: false 143 | run: | 144 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 145 | 7z x x265.7z -ovs_portable\ 146 | 147 | - name: Create script 148 | if: false 149 | shell: bash 150 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 151 | 152 | - name: Run vspipe 153 | if: false 154 | shell: bash 155 | run: | 156 | set -ex 157 | vs_portable/vspipe -i test.vpy - 158 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 159 | ls -l out.hevc x265.log 160 | cat x265.log 161 | grep -F 'encoded 10 frames' x265.log || exit 2 162 | grep -i 'error' x265.log && exit 1 163 | exit 0 164 | 165 | - name: Create script (flexible output) 166 | if: false 167 | shell: bash 168 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS, width=127, 
height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 169 | 170 | - name: Run vspipe (flexible output) 171 | if: false 172 | shell: bash 173 | run: | 174 | set -ex 175 | vs_portable/vspipe -i test_flexible_output.vpy - 176 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 177 | ls -l out.hevc x265.log 178 | cat x265.log 179 | grep -F 'encoded 10 frames' x265.log || exit 2 180 | grep -i 'error' x265.log && exit 1 181 | exit 0 182 | 183 | - name: Describe 184 | run: git describe --tags --long 185 | 186 | - name: Dump dependencies 187 | run: dumpbin /dependents artifact\vsncnn.dll 188 | 189 | - name: Compress artifact for release 190 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 191 | run: | 192 | cd artifact 193 | 7z a -t7z -mx=7 ../../VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z . 194 | 195 | - name: Release 196 | uses: softprops/action-gh-release@v2 197 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 198 | with: 199 | tag_name: ${{ inputs.tag }} 200 | files: VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z 201 | fail_on_unmatched_files: true 202 | generate_release_notes: false 203 | prerelease: true 204 | -------------------------------------------------------------------------------- /.github/workflows/windows-ov.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-OV) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsov/**' 8 | - '.github/workflows/windows-ov.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | ov_tag: 16 | description: 'which tag of openvino to use' 17 | required: true 18 | default: 'latest' 19 | type: string 20 | workflow_dispatch: 21 | inputs: 22 | tag: 23 | description: 'which tag to upload to' 24 | default: '' 25 | ov_tag: 26 | description: 'which tag of openvino to use' 27 | required: true 28 | default: 'latest' 29 | type: string 30 | 31 | jobs: 32 | build-windows: 33 | runs-on: windows-2022 34 | 35 | defaults: 36 | run: 37 | shell: cmd 38 | working-directory: vsov 39 | 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v4 43 | with: 44 | fetch-depth: 0 45 | 46 | - name: Setup MSVC 47 | uses: ilammy/msvc-dev-cmd@v1 48 | 49 | - name: Setup Ninja 50 | run: pip install ninja 51 | 52 | - name: Cache protobuf 53 | id: cache-protobuf 54 | uses: actions/cache@v4 55 | with: 56 | path: vsov/protobuf/install 57 | key: ${{ runner.os }}-vsov-protobuf-v3 58 | 59 | - name: Checkout protobuf 60 | uses: actions/checkout@v4 61 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 62 | with: 63 | repository: protocolbuffers/protobuf 64 | # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf 65 | # if you change this, remember to bump the version of the cache key. 
66 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 67 | fetch-depth: 1 68 | path: vsov/protobuf 69 | 70 | - name: Configure protobuf 71 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 72 | run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA 73 | -D CMAKE_BUILD_TYPE=Release 74 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 75 | 76 | - name: Build protobuf 77 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 78 | run: cmake --build protobuf\build_rel --verbose 79 | 80 | - name: Install protobuf 81 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 82 | run: cmake --install protobuf\build_rel --prefix protobuf\install 83 | 84 | - name: Cache onnx 85 | id: cache-onnx 86 | uses: actions/cache@v4 87 | with: 88 | path: vsov/onnx/install 89 | key: ${{ runner.os }}-vsov-onnx-v3 90 | 91 | - name: Checkout onnx 92 | if: steps.cache-onnx.outputs.cache-hit != 'true' 93 | uses: actions/checkout@v4 94 | with: 95 | repository: onnx/onnx 96 | # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx 97 | # if you change this, remember to bump the version of the cache key. 98 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 99 | fetch-depth: 1 100 | path: vsov/onnx 101 | 102 | - name: Configure onnx 103 | if: steps.cache-onnx.outputs.cache-hit != 'true' 104 | run: cmake -S onnx -B onnx\build -G Ninja -LA 105 | -D CMAKE_BUILD_TYPE=Release 106 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 107 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 108 | -D Protobuf_LIBRARIES=protobuf\install\lib 109 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 110 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 111 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 112 | 113 | - name: Build onnx 114 | if: steps.cache-onnx.outputs.cache-hit != 'true' 115 | run: cmake --build onnx\build --verbose 116 | 117 | - name: Install onnx 118 | if: steps.cache-onnx.outputs.cache-hit != 'true' 119 | run: cmake --install onnx\build --prefix onnx\install 120 | 121 | - name: Download VapourSynth headers 122 | run: | 123 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 124 | unzip -q vs.zip 125 | mv vapoursynth-*/ vapoursynth/ 126 | 127 | - name: Download OpenVINO Runtime Precompilation 128 | shell: bash 129 | run: | 130 | # rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}" 131 | # if [ "$rev" == "latest" ]; then 132 | # url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip" 133 | # else 134 | # url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip" 135 | # fi 136 | url="https://github.com/AmusementClub/openvino/releases/download/2020.2-15171-g4655dd6ce3-2058-g5833781ddb/openvino-gpu-win64.zip" 137 | curl -s -o openvino.zip -LJO "$url" 138 | unzip -q openvino.zip 139 | 140 | - name: Configure 141 | run: cmake -S . 
-B build -G Ninja -D CMAKE_BUILD_TYPE=Release 142 | -D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON 143 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 144 | -D OpenVINO_DIR=openvino/runtime/cmake 145 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 146 | -D ENABLE_VISUALIZATION=ON 147 | -D WIN32_SHARED_OPENVINO=ON 148 | -D protobuf_DIR=protobuf\install\cmake 149 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 150 | 151 | - name: Build 152 | run: cmake --build build --verbose 153 | 154 | - name: Install 155 | run: | 156 | cmake --install build --prefix install 157 | mkdir artifact 158 | mkdir artifact\vsov 159 | copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\ 160 | copy install\bin\vsov.dll artifact\ 161 | xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s 162 | 163 | - name: Upload 164 | uses: actions/upload-artifact@v4 165 | with: 166 | name: VSOV-Windows-x64 167 | path: vsov/artifact 168 | 169 | - name: Setup Python portable 170 | run: | 171 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 172 | 7z x python.zip -ovs_portable 173 | 174 | - name: Install VapourSynth portable 175 | run: | 176 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 177 | 7z x vs.7z -ovs_portable -y 178 | 179 | - name: Copy plugin 180 | run: | 181 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 182 | mkdir vs_portable\vapoursynth64\plugins\vsov\ 183 | copy artifact\vsov\* vs_portable\vapoursynth64\plugins\vsov\ 184 | 185 | - name: Install waifu2x model 186 | run: | 187 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 188 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 189 | 190 | - name: Download x265 191 | run: | 192 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 193 | 7z x x265.7z -ovs_portable\ 194 | 195 | - name: Create script 196 | shell: bash 197 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 198 | 199 | - name: Run vspipe 200 | shell: bash 201 | run: | 202 | set -ex 203 | vs_portable/vspipe -i test.vpy - 204 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 205 | ls -l out.hevc x265.log 206 | cat x265.log 207 | grep -F 'encoded 10 frames' x265.log || exit 2 208 | grep -i 'error' x265.log && exit 1 209 | exit 0 210 | 211 | - name: Create script (fp16) 212 | shell: bash 213 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 214 | 215 | - name: Run vspipe (fp16) 216 | shell: bash 217 | run: | 218 | set -ex 219 | vs_portable/vspipe -i test_fp16.vpy - 220 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 221 | ls -l out.hevc x265.log 222 | cat x265.log 223 | 
grep -F 'encoded 10 frames' x265.log || exit 2 224 | grep -i 'error' x265.log && exit 1 225 | exit 0 226 | 227 | - name: Create script (flexible output) 228 | shell: bash 229 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);prop=\"test\";output=core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output[\"clip\"].std.PropToClip(prop=f\"{prop}{i}\") for i in range(output[\"num_planes\"])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 230 | 231 | - name: Run vspipe (flexible output) 232 | shell: bash 233 | run: | 234 | set -ex 235 | vs_portable/vspipe -i test_flexible_output.vpy - 236 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 237 | ls -l out.hevc x265.log 238 | cat x265.log 239 | grep -F 'encoded 10 frames' x265.log || exit 2 240 | grep -i 'error' x265.log && exit 1 241 | exit 0 242 | 243 | - name: Describe 244 | run: git describe --tags --long 245 | 246 | - name: Dump dependencies 247 | run: dumpbin /dependents artifact\vsov.dll 248 | 249 | - name: Compress artifact for release 250 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 251 | run: | 252 | cd artifact 253 | 7z a -t7z -mx=7 ../../VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z . 254 | 255 | - name: Release 256 | uses: softprops/action-gh-release@v2 257 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 258 | with: 259 | tag_name: ${{ inputs.tag }} 260 | files: VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z 261 | fail_on_unmatched_files: true 262 | generate_release_notes: false 263 | prerelease: true 264 | -------------------------------------------------------------------------------- /vstrt/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_UTILS_H_ 2 | #define VSTRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | #include 15 | #include 16 | 17 | #ifdef __cpp_impl_reflection 18 | #include 19 | #endif 20 | 21 | static inline 22 | void setDimensions( 23 | std::unique_ptr & vi, 24 | const std::unique_ptr & exec_context, 25 | VSCore * core, 26 | const VSAPI * vsapi, 27 | int sample_type, 28 | int bits_per_sample, 29 | bool flexible_output 30 | ) noexcept { 31 | 32 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 33 | auto input_name = exec_context->getEngine().getIOTensorName(0); 34 | auto output_name = exec_context->getEngine().getIOTensorName(1); 35 | const nvinfer1::Dims & in_dims = exec_context->getTensorShape(input_name); 36 | const nvinfer1::Dims & out_dims = exec_context->getTensorShape(output_name); 37 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 38 | const nvinfer1::Dims & in_dims = exec_context->getBindingDimensions(0); 39 | const nvinfer1::Dims & out_dims = exec_context->getBindingDimensions(1); 40 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 41 | 42 | auto in_height = static_cast(in_dims.d[2]); 43 | auto in_width = static_cast(in_dims.d[3]); 44 | 45 | auto out_height = static_cast(out_dims.d[2]); 46 | auto out_width = 
static_cast(out_dims.d[3]); 47 | 48 | vi->height *= out_height / in_height; 49 | vi->width *= out_width / in_width; 50 | 51 | if (out_dims.d[1] == 1 || flexible_output) { 52 | vi->format = vsapi->registerFormat(cmGray, sample_type, bits_per_sample, 0, 0, core); 53 | } else if (out_dims.d[1] == 3) { 54 | vi->format = vsapi->registerFormat(cmRGB, sample_type, bits_per_sample, 0, 0, core); 55 | } 56 | } 57 | 58 | static inline 59 | std::vector getVideoInfo( 60 | const VSAPI * vsapi, 61 | const std::vector & nodes 62 | ) noexcept { 63 | 64 | std::vector vis; 65 | vis.reserve(std::size(nodes)); 66 | 67 | for (const auto & node : nodes) { 68 | vis.emplace_back(vsapi->getVideoInfo(node)); 69 | } 70 | 71 | return vis; 72 | } 73 | 74 | static inline 75 | std::vector getFrames( 76 | int n, 77 | const VSAPI * vsapi, 78 | VSFrameContext * frameCtx, 79 | const std::vector & nodes 80 | ) noexcept { 81 | 82 | std::vector frames; 83 | frames.reserve(std::size(nodes)); 84 | 85 | for (const auto & node : nodes) { 86 | frames.emplace_back(vsapi->getFrameFilter(n, node, frameCtx)); 87 | } 88 | 89 | return frames; 90 | } 91 | 92 | static inline 93 | std::optional checkNodes( 94 | const std::vector & vis 95 | ) noexcept { 96 | 97 | for (const auto & vi : vis) { 98 | if (!isConstantFormat(vi)) { 99 | return "video format must be constant"; 100 | } 101 | 102 | if (vi->width != vis[0]->width || vi->height != vis[0]->height) { 103 | return "dimensions of clips mismatch"; 104 | } 105 | 106 | if (vi->numFrames != vis[0]->numFrames) { 107 | return "number of frames mismatch"; 108 | } 109 | 110 | if (vi->format->subSamplingH != 0 || vi->format->subSamplingW != 0) { 111 | return "clip must not be sub-sampled"; 112 | } 113 | } 114 | 115 | return {}; 116 | } 117 | 118 | static inline 119 | std::optional checkNodes( 120 | const std::vector & vis, 121 | int sample_type, 122 | int bits_per_sample 123 | ) noexcept { 124 | 125 | for (const auto & vi : vis) { 126 | if (vi->format->sampleType != sample_type) { 127 | return "sample type mismatch"; 128 | } 129 | 130 | if (vi->format->bitsPerSample != bits_per_sample) { 131 | return "bits per sample mismatch"; 132 | } 133 | } 134 | 135 | return {}; 136 | } 137 | 138 | static inline 139 | int numPlanes( 140 | const std::vector & vis 141 | ) noexcept { 142 | 143 | int num_planes = 0; 144 | 145 | for (const auto & vi : vis) { 146 | num_planes += vi->format->numPlanes; 147 | } 148 | 149 | return num_planes; 150 | } 151 | 152 | static inline 153 | std::optional checkNodesAndContext( 154 | const std::unique_ptr & execution_context, 155 | const std::vector & vis 156 | ) noexcept { 157 | 158 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 159 | auto input_name = execution_context->getEngine().getIOTensorName(0); 160 | const nvinfer1::Dims & network_in_dims = execution_context->getTensorShape(input_name); 161 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 162 | const nvinfer1::Dims & network_in_dims = execution_context->getBindingDimensions(0); 163 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 164 | 165 | auto network_in_channels = network_in_dims.d[1]; 166 | int num_planes = numPlanes(vis); 167 | if (network_in_channels != num_planes) { 168 | return "expects " + std::to_string(network_in_channels) + " input planes"; 169 | } 170 | 171 | auto network_in_height = network_in_dims.d[2]; 172 | auto network_in_width = network_in_dims.d[3]; 173 | int clip_in_height = 
vis[0]->height; 174 | int clip_in_width = vis[0]->width; 175 | 176 | if (network_in_height > clip_in_height || network_in_width > clip_in_width) { 177 | return "tile size larger than clip dimension"; 178 | } 179 | 180 | return {}; 181 | } 182 | 183 | static inline void VS_CC getDeviceProp( 184 | const VSMap *in, VSMap *out, void *userData, 185 | VSCore *core, const VSAPI *vsapi 186 | ) { 187 | 188 | int err; 189 | int device_id = static_cast(vsapi->propGetInt(in, "device_id", 0, &err)); 190 | if (err) { 191 | device_id = 0; 192 | } 193 | 194 | cudaDeviceProp prop; 195 | if (auto error = cudaGetDeviceProperties(&prop, device_id); error != cudaSuccess) { 196 | vsapi->setError(out, cudaGetErrorString(error)); 197 | return ; 198 | } 199 | 200 | auto setProp = [&](const char * name, const auto & value, int data_length = -1) { 201 | using T = std::decay_t; 202 | if constexpr (std::is_integral_v) { 203 | vsapi->propSetInt(out, name, static_cast(value), paReplace); 204 | } else if constexpr (std::is_same_v) { 205 | vsapi->propSetData(out, name, value, data_length, paReplace); 206 | } else if constexpr (std::is_integral_v>) { 207 | std::array>> data; 208 | for (int i = 0; i < static_cast(std::size(data)); i++) { 209 | data[i] = value[i]; 210 | } 211 | vsapi->propSetIntArray(out, name, std::data(data), static_cast(std::size(data))); 212 | } 213 | }; 214 | 215 | int driver_version; 216 | cudaDriverGetVersion(&driver_version); 217 | setProp("driver_version", driver_version); 218 | 219 | #ifdef __cpp_impl_reflection 220 | constexpr auto ctx = std::meta::access_context::current(); 221 | template for ( 222 | constexpr auto r : define_static_array(nonstatic_data_members_of(^^decltype(prop), ctx)) 223 | ) { 224 | if constexpr (identifier_of(r) == "uuid") { 225 | std::array uuid; 226 | for (int i = 0; i < 16; ++i) { 227 | uuid[i] = prop.uuid.bytes[i]; 228 | } 229 | vsapi->propSetIntArray(out, "uuid", std::data(uuid), static_cast(std::size(uuid))); 230 | } else if constexpr (identifier_of(r) != "reserved") { 231 | setProp(std::string(identifier_of(r)).c_str(), prop.[:r:]); 232 | } 233 | } 234 | #else // __cpp_impl_reflection 235 | setProp("name", prop.name); 236 | { 237 | std::array uuid; 238 | for (int i = 0; i < 16; ++i) { 239 | uuid[i] = prop.uuid.bytes[i]; 240 | } 241 | vsapi->propSetIntArray(out, "uuid", std::data(uuid), static_cast(std::size(uuid))); 242 | } 243 | setProp("total_global_memory", prop.totalGlobalMem); 244 | setProp("shared_memory_per_block", prop.sharedMemPerBlock); 245 | setProp("regs_per_block", prop.regsPerBlock); 246 | setProp("warp_size", prop.warpSize); 247 | setProp("mem_pitch", prop.memPitch); 248 | setProp("max_threads_per_block", prop.maxThreadsPerBlock); 249 | setProp("total_const_mem", prop.totalConstMem); 250 | setProp("major", prop.major); 251 | setProp("minor", prop.minor); 252 | setProp("texture_alignment", prop.textureAlignment); 253 | setProp("texture_pitch_alignment", prop.texturePitchAlignment); 254 | setProp("multi_processor_count", prop.multiProcessorCount); 255 | setProp("integrated", prop.integrated); 256 | setProp("can_map_host_memory", prop.canMapHostMemory); 257 | setProp("concurrent_kernels", prop.concurrentKernels); 258 | setProp("ecc_enabled", prop.ECCEnabled); 259 | setProp("pci_bus_id", prop.pciBusID); 260 | setProp("pci_device_id", prop.pciDeviceID); 261 | setProp("pci_domain_id", prop.pciDomainID); 262 | setProp("tcc_driver", prop.tccDriver); 263 | setProp("async_engine_count", prop.asyncEngineCount); 264 | setProp("unified_addressing", 
prop.unifiedAddressing);
265 | setProp("memory_bus_width", prop.memoryBusWidth);
266 | setProp("l2_cache_size", prop.l2CacheSize);
267 | setProp("persisting_l2_cache_max_size", prop.persistingL2CacheMaxSize);
268 | setProp("max_threads_per_multiprocessor", prop.maxThreadsPerMultiProcessor);
269 | setProp("stream_priorities_supported", prop.streamPrioritiesSupported);
270 | setProp("global_l1_cache_supported", prop.globalL1CacheSupported);
271 | setProp("local_l1_cache_supported", prop.localL1CacheSupported);
272 | setProp("shared_mem_per_multiprocessor", prop.sharedMemPerMultiprocessor);
273 | setProp("regs_per_multiprocessor", prop.regsPerMultiprocessor);
274 | setProp("managed_memory", prop.managedMemory);
275 | setProp("is_multi_gpu_board", prop.isMultiGpuBoard);
276 | setProp("multi_gpu_board_group_id", prop.multiGpuBoardGroupID);
277 | setProp("host_native_atomic_supported", prop.hostNativeAtomicSupported);
278 | setProp("pageable_memory_access", prop.pageableMemoryAccess);
279 | setProp("concurrent_managed_access", prop.concurrentManagedAccess);
280 | setProp("compute_preemption_supported", prop.computePreemptionSupported);
281 | setProp(
282 | "can_use_host_pointer_for_registered_mem",
283 | prop.canUseHostPointerForRegisteredMem
284 | );
285 | setProp("cooperative_launch", prop.cooperativeLaunch);
286 | setProp("shared_mem_per_block_optin", prop.sharedMemPerBlockOptin);
287 | setProp(
288 | "pageable_memory_access_uses_host_page_tables",
289 | prop.pageableMemoryAccessUsesHostPageTables
290 | );
291 | setProp("direct_managed_mem_access_from_host", prop.directManagedMemAccessFromHost);
292 | setProp("max_blocks_per_multi_processor", prop.maxBlocksPerMultiProcessor);
293 | setProp("access_policy_max_window_size", prop.accessPolicyMaxWindowSize);
294 | setProp("reserved_shared_mem_per_block", prop.reservedSharedMemPerBlock);
295 | #endif // __cpp_impl_reflection
296 | };
297 |
298 | #endif // VSTRT_UTILS_H_
299 |
--------------------------------------------------------------------------------
/.github/workflows/windows-ort.yml:
--------------------------------------------------------------------------------
1 | name: Build (Windows-ORT)
2 |
3 | on:
4 | push:
5 | paths:
6 | - 'common/**'
7 | - 'vsort/**'
8 | - '.github/workflows/windows-ort.yml'
9 | workflow_call:
10 | inputs:
11 | tag:
12 | description: 'which tag to upload to'
13 | required: true
14 | type: string
15 | workflow_dispatch:
16 | inputs:
17 | tag:
18 | description: 'which tag to upload to'
19 | default: ''
20 |
21 | jobs:
22 | build-windows:
23 | runs-on: windows-2025
24 |
25 | defaults:
26 | run:
27 | shell: cmd
28 | working-directory: vsort
29 |
30 | steps:
31 | - name: Checkout repo
32 | uses: actions/checkout@v4
33 | with:
34 | fetch-depth: 0
35 |
36 | - name: Setup MSVC
37 | uses: ilammy/msvc-dev-cmd@v1
38 |
39 | - name: Setup Ninja
40 | run: pip install ninja
41 |
42 | - name: Restore cached onnx
43 | id: cache-onnx
44 | uses: actions/cache/restore@v4
45 | with:
46 | path: vsort/onnx/install
47 | key: ${{ runner.os }}-vsort-onnx-v6
48 |
49 | - name: Checkout onnx
50 | if: steps.cache-onnx.outputs.cache-hit != 'true'
51 | uses: actions/checkout@v4
52 | with:
53 | repository: onnx/onnx
54 | # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
55 | # if you change this, remember to bump the version of the cache key.
56 | ref: v1.19.0 57 | fetch-depth: 1 58 | path: vsort/onnx 59 | 60 | - name: Configure onnx 61 | if: steps.cache-onnx.outputs.cache-hit != 'true' 62 | run: cmake -S onnx -B onnx\build -G Ninja -LA 63 | -D CMAKE_BUILD_TYPE=Release 64 | -D CMAKE_PREFIX_PATH=%cd%\protobuf\install\lib\cmake 65 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 66 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 67 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 68 | -D ONNX_BUILD_CUSTOM_PROTOBUF=ON 69 | 70 | - name: Build onnx 71 | if: steps.cache-onnx.outputs.cache-hit != 'true' 72 | run: cmake --build onnx\build --verbose 73 | 74 | - name: Install onnx 75 | if: steps.cache-onnx.outputs.cache-hit != 'true' 76 | run: cmake --install onnx\build --prefix onnx\install 77 | 78 | - name: Save onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | uses: actions/cache/save@v4 81 | with: 82 | path: vsort/onnx/install 83 | key: ${{ steps.cache-onnx.outputs.cache-primary-key }} 84 | 85 | - name: Download VapourSynth headers 86 | run: | 87 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 88 | unzip -q vs.zip 89 | mv vapoursynth-*/ vapoursynth/ 90 | 91 | - name: Download ONNX Runtime Precompilation 92 | run: | 93 | curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-10525-gecb26fb775-250906-0716/onnxruntime-gpu-win64.zip 94 | unzip -q ortgpu.zip 95 | 96 | - name: Restore cached CUDA 97 | id: cache-cuda 98 | uses: actions/cache/restore@v4 99 | with: 100 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 101 | key: ${{ runner.os }}-cuda-13.0.1 102 | 103 | - name: Setup CUDA 104 | if: steps.cache-cuda.outputs.cache-hit != 'true' 105 | run: | 106 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/13.0.1/network_installers/cuda_13.0.1_windows_network.exe 107 | cuda_installer.exe -s nvcc_13.0 cudart_13.0 crt_13.0 nvptxcompiler_13.0 108 | 109 | - name: Save CUDA 110 | if: steps.cache-cuda.outputs.cache-hit != 'true' 111 | uses: actions/cache/save@v4 112 | with: 113 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 114 | key: ${{ steps.cache-cuda.outputs.cache-primary-key }} 115 | 116 | - name: Configure 117 | run: cmake -S . 
-B build -G Ninja -LA 118 | -D CMAKE_BUILD_TYPE=Release 119 | -D CMAKE_PREFIX_PATH=onnx\install 120 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 121 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 122 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime 123 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib 124 | -D ENABLE_CUDA=1 125 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0" 126 | -D ENABLE_DML=1 127 | -D CMAKE_CXX_STANDARD=20 128 | 129 | - name: Build 130 | run: cmake --build build --verbose 131 | 132 | - name: Install 133 | run: | 134 | cmake --install build --prefix install 135 | mkdir artifact 136 | mkdir artifact\vsort 137 | copy install\bin\vsort.dll artifact\ 138 | copy onnxruntime-gpu\bin\*.dll artifact\vsort\ 139 | copy onnxruntime-gpu\lib\*.dll artifact\vsort\ 140 | 141 | - name: Download DirectML Library 142 | # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44 143 | run: | 144 | curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.15.4 145 | unzip -q directml.nupkg -d dml 146 | copy dml\bin\x64-win\DirectML.dll artifact\vsort\ 147 | 148 | - name: Upload 149 | uses: actions/upload-artifact@v4 150 | with: 151 | name: VSORT-Windows-x64 152 | path: vsort/artifact 153 | 154 | - name: Describe 155 | run: git describe --tags --long 156 | 157 | - name: Dump dependencies 158 | run: dumpbin /dependents artifact\vsort.dll 159 | 160 | - name: Setup Python portable 161 | run: | 162 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.10/python-3.9.10-embed-amd64.zip 163 | 7z x python.zip -ovs_portable 164 | 165 | - name: Install VapourSynth portable 166 | run: | 167 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 168 | 7z x vs.7z -ovs_portable -y 169 | 170 | - name: Copy plugin 171 | run: | 172 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 173 | mkdir vs_portable\vapoursynth64\plugins\vsort\ 174 | copy artifact\vsort\*.dll vs_portable\vapoursynth64\plugins\vsort\ 175 | 176 | - name: Install waifu2x model 177 | run: | 178 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 179 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 180 | 181 | - name: Download x265 182 | run: | 183 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 184 | 7z x x265.7z -ovs_portable\ 185 | 186 | - name: Create script 187 | shell: bash 188 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, verbosity=4).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 189 | 190 | - name: Run vspipe 191 | shell: bash 192 | run: | 193 | set -ex 194 | vs_portable/vspipe -i test.vpy - 195 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 196 | ls -l out.hevc x265.log 197 | cat x265.log 198 | grep -F 'encoded 10 frames' x265.log || exit 2 199 | grep -i 'error' x265.log && exit 1 200 | exit 0 201 | 202 | - name: Create script (fp16) 203 | shell: bash 204 | run: echo "import 
vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 205 | 206 | - name: Run vspipe (fp16) 207 | shell: bash 208 | run: | 209 | set -ex 210 | vs_portable/vspipe -i test_fp16.vpy - 211 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 212 | ls -l out.hevc x265.log 213 | cat x265.log 214 | grep -F 'encoded 10 frames' x265.log || exit 2 215 | grep -i 'error' x265.log && exit 1 216 | exit 0 217 | 218 | - name: Create script (fp16 input) 219 | shell: bash 220 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy 221 | 222 | - name: Run vspipe (fp16 input) 223 | shell: bash 224 | run: | 225 | set -ex 226 | vs_portable/vspipe -i test_fp16_input.vpy - 227 | vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 228 | ls -l out.hevc x265.log 229 | cat x265.log 230 | grep -F 'encoded 10 frames' x265.log || exit 2 231 | grep -i 'error' x265.log && exit 1 232 | exit 0 233 | 234 | - name: Create script (fp16 output) 235 | shell: bash 236 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy 237 | 238 | - name: Run vspipe (fp16 output) 239 | shell: bash 240 | run: | 241 | set -ex 242 | vs_portable/vspipe -i test_fp16_output.vpy - 243 | vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 244 | ls -l out.hevc x265.log 245 | cat x265.log 246 | grep -F 'encoded 10 frames' x265.log || exit 2 247 | grep -i 'error' x265.log && exit 1 248 | exit 0 249 | 250 | - name: Create script (flexible output) 251 | shell: bash 252 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 253 | 254 | - name: Run vspipe (flexible output) 255 | shell: bash 256 | run: | 257 | set -ex 258 | vs_portable/vspipe -i test_flexible_output.vpy - 259 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 260 | ls -l out.hevc x265.log 261 
| cat x265.log
262 | grep -F 'encoded 10 frames' x265.log || exit 2
263 | grep -i 'error' x265.log && exit 1
264 | exit 0
265 |
266 | - name: Compress artifact for release
267 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
268 | run: |
269 | cd artifact
270 | 7z a -t7z -mx=7 ../../VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z .
271 |
272 | - name: Release
273 | uses: softprops/action-gh-release@v2
274 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
275 | with:
276 | tag_name: ${{ inputs.tag }}
277 | files: VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z
278 | fail_on_unmatched_files: true
279 | generate_release_notes: false
280 | prerelease: true
281 |
--------------------------------------------------------------------------------
/.github/workflows/windows-release.yml:
--------------------------------------------------------------------------------
1 | name: Make a Release
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | tag:
7 | description: 'which tag to create and release?'
8 | required: true
9 | default: 'nightly'
10 | model-tags:
11 | description: 'which tag(s) of model release to use? (comma-separated list of tags)'
12 | required: true
13 | default: 'model-20211209, model-20220923'
14 | ext-model-tags:
15 | description: 'which tag(s) of external model release to use?'
16 | required: true
17 | default: 'external-models'
18 | contrib-model-tags:
19 | description: 'which tag(s) of contributed model release to use?'
20 | required: true
21 | default: 'contrib-models'
22 | ov_tag:
23 | description: 'which tag of openvino to use'
24 | required: true
25 | default: 'latest'
26 | type: string
27 |
28 | jobs:
29 | build-vsov:
30 | uses: ./.github/workflows/windows-ov.yml
31 | with:
32 | tag: ${{ github.event.inputs.tag }}
33 | ov_tag: ${{ github.event.inputs.ov_tag }}
34 |
35 | build-vsort:
36 | uses: ./.github/workflows/windows-ort.yml
37 | with:
38 | tag: ${{ github.event.inputs.tag }}
39 |
40 | build-vstrt:
41 | uses: ./.github/workflows/windows-trt.yml
42 | with:
43 | tag: ${{ github.event.inputs.tag }}
44 | secrets:
45 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
46 |
47 | build-vsmigx:
48 | uses: ./.github/workflows/windows-migx.yml
49 | with:
50 | tag: ${{ github.event.inputs.tag }}
51 | secrets:
52 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
53 |
54 | build-vsncnn:
55 | uses: ./.github/workflows/windows-ncnn.yml
56 | with:
57 | tag: ${{ github.event.inputs.tag }}
58 |
59 | build-vstrt_rtx:
60 | uses: ./.github/workflows/windows-trt_rtx.yml
61 | with:
62 | tag: ${{ github.event.inputs.tag }}
63 | secrets:
64 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
65 |
66 | build-cuda-dependency:
67 | uses: ./.github/workflows/windows-cuda-dependency.yml
68 | with:
69 | tag: ${{ github.event.inputs.tag }}
70 | secrets:
71 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
72 |
73 | build-hip-dependency:
74 | uses: ./.github/workflows/windows-hip-dependency.yml
75 | with:
76 | tag: ${{ github.event.inputs.tag }}
77 | secrets:
78 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }}
79 |
80 | build-scripts:
81 | runs-on: ubuntu-24.04-arm
82 | steps:
83 | - name: Checkout repo
84 | uses: actions/checkout@v4
85 |
86 | - name: Compress scripts.7z
87 | run: |
88 | cd scripts
89 | 7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .
90 | 91 | - name: Upload scripts release 92 | uses: actions/upload-artifact@v4 93 | with: 94 | name: Scripts 95 | path: scripts 96 | retention-days: 1 97 | 98 | - name: Release scripts 99 | uses: softprops/action-gh-release@v2 100 | with: 101 | tag_name: ${{ github.event.inputs.tag }} 102 | files: scripts.${{ github.event.inputs.tag }}.7z 103 | fail_on_unmatched_files: true 104 | generate_release_notes: false 105 | prerelease: true 106 | 107 | build-models: 108 | runs-on: ubuntu-24.04-arm 109 | steps: 110 | - name: Download Models 111 | run: | 112 | set -ex 113 | mkdir -p release/models 114 | cd release 115 | pushd models 116 | for tag in $(echo "${{ github.event.inputs.model-tags }}" | tr ',' ' '); do 117 | echo "Handling tag $tag" 118 | curl -s -H 'Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 119 | cat release.json 120 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 121 | echo "Downloading $url" 122 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' -H 'Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' "$url" 123 | # later release should overwrite earlier ones 124 | 7za x -y dl.7z 125 | done 126 | test -f "dl.7z" 127 | rm -f dl.7z release.json 128 | done 129 | popd 130 | ls -lR 131 | du -sh 132 | 7za a -t7z -bb3 -mx=9 ../models.7z . 133 | 134 | - name: Upload model release 135 | uses: actions/upload-artifact@v4 136 | with: 137 | name: Models 138 | path: release 139 | retention-days: 1 140 | compression-level: 0 141 | 142 | - name: Download External Models 143 | if: false 144 | run: | 145 | rm -rf release 146 | set -ex 147 | mkdir -p release/models 148 | cd release 149 | pushd models 150 | for tag in $(echo "${{ github.event.inputs.ext-model-tags }}" | tr ',' ' '); do 151 | echo "Handling tag $tag" 152 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 153 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 154 | echo "Downloading $url" 155 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 156 | # later release should overwrite earlier ones 157 | 7za x -y dl.7z 158 | done 159 | test -f "dl.7z" 160 | rm -f dl.7z release.json 161 | done 162 | popd 163 | ls -lR 164 | du -sh 165 | 7za a -t7z -bb3 -mx=9 ../ext-models.7z . 166 | 167 | - name: Upload external model release 168 | uses: actions/upload-artifact@v4 169 | if: false 170 | with: 171 | name: External-Models 172 | path: release 173 | retention-days: 1 174 | compression-level: 0 175 | 176 | - name: Download Contributed Models 177 | run: | 178 | rm -rf release 179 | set -ex 180 | mkdir -p release/models 181 | cd release 182 | pushd models 183 | for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do 184 | echo "Handling tag $tag" 185 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 186 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 187 | echo "Downloading $url" 188 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 189 | # later release should overwrite earlier ones 190 | 7za x -y dl.7z 191 | done 192 | #test -f "dl.7z" # contrib-models might be empty. 193 | rm -f dl.7z release.json 194 | done 195 | popd 196 | ls -lR 197 | du -sh 198 | 7za a -t7z -bb3 -mx=9 ../contrib-models.7z . 
199 |
200 | - name: Upload contrib model release
201 | uses: actions/upload-artifact@v4
202 | with:
203 | name: Contrib-Models
204 | path: release
205 | retention-days: 1
206 | compression-level: 0
207 |
232 | - name: Rename release asset
233 | run: |
234 | mv models.7z models.${{ github.event.inputs.tag }}.7z
235 | mv contrib-models.7z contrib-models.${{ github.event.inputs.tag }}.7z
236 |
237 | - name: Release models
238 | uses: softprops/action-gh-release@v2
239 | with:
240 | tag_name: ${{ github.event.inputs.tag }}
241 | files: |
242 | models.${{ github.event.inputs.tag }}.7z
243 | contrib-models.${{ github.event.inputs.tag }}.7z
244 | fail_on_unmatched_files: true
245 | generate_release_notes: false
246 | prerelease: true
247 |
248 | release:
249 | runs-on: ubuntu-24.04-arm
250 | needs: [build-vsov, build-vsort, build-vstrt, build-vsmigx, build-vsncnn, build-vstrt_rtx, build-cuda-dependency, build-hip-dependency, build-scripts, build-models]
251 |
252 | defaults:
253 | run:
254 | shell: bash
255 |
256 | steps:
257 | - name: Download artifact for scripts
258 | uses: actions/download-artifact@v4
259 | with:
260 | name: Scripts
261 | path: scripts-release
262 |
263 | - name: Download artifact for models
264 | uses: actions/download-artifact@v4
265 | with:
266 | name: Models
267 | path: models-release
268 |
269 | - name: Download artifact for vsov
270 | uses: actions/download-artifact@v4
271 | with:
272 | name: VSOV-Windows-x64
273 | path: vsov-release
274 |
275 | - name: Download artifact for vsort
276 | uses: actions/download-artifact@v4
277 | with:
278 | name: VSORT-Windows-x64
279 | path: vsort-release
280 |
281 | - name: Download artifact for vstrt
282 | uses: actions/download-artifact@v4
283 | with:
284 | name: VSTRT-Windows-x64
285 | path: vstrt-release
286 |
287 | - name: Download artifact for vsmigx
288 | uses: actions/download-artifact@v4
289 | with:
290 | name: VSMIGX-Windows-x64
291 | path: vsmigx-release
292 |
293 | - name: Download artifact for vsncnn
294 | uses: actions/download-artifact@v4
295 | with:
296 | name: VSNCNN-GPU-Windows-x64
297 | path: vsncnn-release
298 |
299 | - name: Download artifact for vstrt_rtx
300 | uses: actions/download-artifact@v4
301 | with:
302 | name: VSTRT-RTX-Windows-x64
303 | path: vstrt-rtx-release
304 |
305 | - name: Download artifact for cuda dependencies
306 | uses: actions/download-artifact@v4
307 | with:
308 | name: vsmlrt-cuda
309 | path: cuda-release
310 |
311 | - name: Download artifact for hip dependencies
312 | uses: actions/download-artifact@v4
313 | with:
314 | name: vsmlrt-hip
315 | path: hip-release
316 |
317 | - name: Build CPU-only release
318 | shell: bash
319 | run: |
320 | mkdir release-cpu
321 | cp -r
322 |           cp -r vsov-release/* release-cpu/
323 |           cp -r vsort-release/* release-cpu/
324 |           rm -f release-cpu/vsort/onnxruntime_providers_*.dll
325 |           cp scripts-release/*.py release-cpu/
326 |           cd release-cpu
327 |           ls -lR
328 |           7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .
329 | 
330 |       - name: Upload CPU-only release
331 |         uses: actions/upload-artifact@v4
332 |         if: false
333 |         with:
334 |           name: vsmlrt-cpu-release
335 |           path: vsmlrt-windows-x64-cpu.7z
336 |           retention-days: 1
337 |           compression-level: 0
338 | 
339 |       - name: Rename release asset
340 |         run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z
341 | 
342 |       - name: Release CPU
343 |         uses: softprops/action-gh-release@v2
344 |         with:
345 |           tag_name: ${{ github.event.inputs.tag }}
346 |           files: vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z
347 |           fail_on_unmatched_files: true
348 |           generate_release_notes: false
349 |           prerelease: true
350 | 
351 |       - name: Build generic GPU release
352 |         shell: bash
353 |         run: |
354 |           mkdir release-generic-gpu
355 |           cp -r models-release/models release-generic-gpu/
356 |           cp -r vsov-release/* release-generic-gpu/
357 |           cp -r vsort-release/* release-generic-gpu/
358 |           rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll
359 |           cp -r vsncnn-release/* release-generic-gpu/
360 |           cp scripts-release/*.py release-generic-gpu/
361 |           cd release-generic-gpu
362 |           ls -lR
363 |           7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .
364 | 
365 |       - name: Upload generic GPU release
366 |         uses: actions/upload-artifact@v4
367 |         if: false
368 |         with:
369 |           name: vsmlrt-generic-gpu-release
370 |           path: vsmlrt-windows-x64-generic-gpu.7z
371 |           retention-days: 1
372 |           compression-level: 0
373 | 
374 |       - name: Rename release asset for generic GPU release
375 |         run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
376 | 
377 |       - name: Release generic GPU
378 |         uses: softprops/action-gh-release@v2
379 |         with:
380 |           tag_name: ${{ github.event.inputs.tag }}
381 |           files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
382 |           fail_on_unmatched_files: true
383 |           generate_release_notes: false
384 |           prerelease: true
385 | 
386 |       - name: Extract CUDA libraries
387 |         run: |
388 |           cd cuda-release
389 |           7za x -bb3 vsmlrt-cuda.7z.001
390 |           rm vsmlrt-cuda.7z.*
391 | 
392 |       - name: Build CUDA release
393 |         shell: bash
394 |         run: |
395 |           mkdir release-cuda
396 |           cp -r models-release/models release-cuda/
397 |           cp -r vsov-release/* release-cuda/
398 |           cp -r vsort-release/* release-cuda/
399 |           cp -r vstrt-release/* release-cuda/
400 |           cp -r vsncnn-release/* release-cuda/
401 |           cp -r vstrt-rtx-release/* release-cuda/
402 |           cp -r cuda-release/* release-cuda/
403 |           cp scripts-release/*.py release-cuda/
404 |           cd release-cuda
405 |           ls -lR
406 |           7za a -t7z -bb3 -mx=9 -v2147483647b ../vsmlrt-windows-x64-cuda.7z .
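          # (-v2147483647b above splits the archive into volumes just under 2 GiB,
          # since GitHub rejects release assets of 2 GiB or larger)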
407 | 408 | - name: Upload CUDA release 409 | uses: actions/upload-artifact@v4 410 | if: false 411 | with: 412 | name: vsmlrt-cuda-release 413 | path: vsmlrt-windows-x64-cuda.7z* 414 | retention-days: 1 415 | compression-level: 0 416 | 417 | - name: Rename release asset for CUDA release 418 | run: | 419 | mv vsmlrt-windows-x64-cuda.7z.001 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.001 420 | mv vsmlrt-windows-x64-cuda.7z.002 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.002 421 | 422 | - name: Release CUDA 423 | uses: softprops/action-gh-release@v2 424 | with: 425 | tag_name: ${{ github.event.inputs.tag }} 426 | files: vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z* 427 | fail_on_unmatched_files: true 428 | generate_release_notes: false 429 | prerelease: true 430 | 431 | - name: Build TensorRT release 432 | shell: bash 433 | run: | 434 | cd release-cuda 435 | cd vsmlrt-cuda 436 | rm --verbose cublas*.dll cudnn*.dll cufft*.dll cupti*.dll nvblas*.dll 437 | cd .. 438 | rm --verbose vsort/onnxruntime_providers_*.dll 439 | 7za a -t7z -bb3 -mx=9 -v2147483647b ../vsmlrt-windows-x64-tensorrt.7z . 440 | 441 | - name: Upload TensorRT release 442 | uses: actions/upload-artifact@v4 443 | if: false 444 | with: 445 | name: vsmlrt-tensorrt-release 446 | path: vsmlrt-windows-x64-tensorrt.7z* 447 | retention-days: 1 448 | compression-level: 0 449 | 450 | - name: Rename release asset for TensorRT release 451 | run: | 452 | mv vsmlrt-windows-x64-tensorrt.7z.001 vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z.001 453 | mv vsmlrt-windows-x64-tensorrt.7z.002 vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z.002 454 | 455 | - name: Release TensorRT 456 | uses: softprops/action-gh-release@v2 457 | with: 458 | tag_name: ${{ github.event.inputs.tag }} 459 | files: vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z* 460 | fail_on_unmatched_files: true 461 | generate_release_notes: false 462 | prerelease: true 463 | 464 | - name: Extract HIP libraries 465 | run: | 466 | cd hip-release 467 | 7za x -bb3 vsmlrt-hip.7z 468 | rm vsmlrt-hip.7z 469 | 470 | - name: Build MIGraphX release 471 | shell: bash 472 | run: | 473 | mkdir release-hip 474 | cp -r models-release/models release-hip/ 475 | cp -r vsov-release/* release-hip/ 476 | cp -r vsort-release/* release-hip/ 477 | cp -r vsmigx-release/* release-hip/ 478 | cp -r vsncnn-release/* release-hip/ 479 | cp -r hip-release/* release-hip/ 480 | cp scripts-release/*.py release-hip/ 481 | cd release-hip 482 | ls -lR 483 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-migraphx.7z . 484 | 485 | - name: Upload MIGraphX release 486 | uses: actions/upload-artifact@v4 487 | if: false 488 | with: 489 | name: vsmlrt-migraphx-release 490 | path: vsmlrt-windows-x64-migraphx.7z 491 | retention-days: 1 492 | compression-level: 0 493 | 494 | - name: Rename release asset for MIGraphX release 495 | run: mv vsmlrt-windows-x64-migraphx.7z vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 496 | 497 | - name: Release MIGraphX 498 | uses: softprops/action-gh-release@v2 499 | with: 500 | tag_name: ${{ github.event.inputs.tag }} 501 | files: vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 502 | fail_on_unmatched_files: true 503 | generate_release_notes: false 504 | prerelease: true 505 | 506 | # Update nightly tag. 
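    # Force-moving the tag keeps the 'nightly' prerelease pointing at the assets
    # uploaded by the steps above on every run.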
507 |       - name: Checkout repo
508 |         if: github.event.inputs.tag == 'nightly'
509 |         uses: actions/checkout@v4
510 |         with:
511 |           fetch-depth: 0
512 |       - name: Overwrite tag
513 |         if: github.event.inputs.tag == 'nightly'
514 |         run: |
515 |           git pull --tags --force
516 |           git tag -f ${{ github.event.inputs.tag }}
517 |           git push -f origin ${{ github.event.inputs.tag }}
518 | 
--------------------------------------------------------------------------------
/vstrt/trt_utils.h:
--------------------------------------------------------------------------------
1 | #ifndef VSTRT_TRT_UTILS_H_
2 | #define VSTRT_TRT_UTILS_H_
3 | 
4 | #include <cstdint>
5 | #include <iostream>
6 | #include <memory>
7 | #include <optional>
8 | #include <string>
9 | #include <variant>
10 | 
11 | #include <cuda_runtime.h>
12 | #include <NvInferRuntime.h>
13 | 
14 | #include "cuda_helper.h"
15 | #include "cuda_utils.h"
16 | 
17 | using ErrorMessage = std::string;
18 | 
19 | struct RequestedTileSize {
20 |     int tile_w;
21 |     int tile_h;
22 | };
23 | 
24 | struct VideoSize {
25 |     int width;
26 |     int height;
27 | };
28 | 
29 | using TileSize = std::variant<RequestedTileSize, VideoSize>;
30 | 
31 | struct InferenceInstance {
32 |     MemoryResource src;
33 |     MemoryResource dst;
34 |     StreamResource stream;
35 |     std::unique_ptr<nvinfer1::IExecutionContext> exec_context;
36 |     GraphExecResource graphexec;
37 | 
38 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
39 |     Resource<void *, cudaFree> d_context_allocation;
40 | #endif
41 | };
42 | 
43 | class Logger : public nvinfer1::ILogger {
44 |     void log(Severity severity, const char* message) noexcept override {
45 |         if (severity <= verbosity) {
46 |             std::cerr << message << '\n';
47 |         }
48 |     }
49 | 
50 | public:
51 |     Logger() = default;
52 | 
53 |     void set_verbosity(Severity value) noexcept {
54 |         this->verbosity = value;
55 |     }
56 | 
57 | private:
58 |     Severity verbosity;
59 | };
60 | 
61 | static inline
62 | std::optional<int> selectProfile(
63 |     const std::unique_ptr<nvinfer1::ICudaEngine> & engine,
64 |     const TileSize & tile_size,
65 |     int batch_size = 1
66 | ) noexcept {
67 | 
68 |     int tile_w, tile_h;
69 |     if (std::holds_alternative<RequestedTileSize>(tile_size)) {
70 |         tile_w = std::get<RequestedTileSize>(tile_size).tile_w;
71 |         tile_h = std::get<RequestedTileSize>(tile_size).tile_h;
72 |     } else {
73 |         tile_w = std::get<VideoSize>(tile_size).width;
74 |         tile_h = std::get<VideoSize>(tile_size).height;
75 |     }
76 | 
77 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
78 |     auto input_name = engine->getIOTensorName(0);
79 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
80 | 
81 |     // finds the optimal profile
82 |     for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) {
83 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
84 |         nvinfer1::Dims opt_dims = engine->getProfileShape(
85 |             input_name, i, nvinfer1::OptProfileSelector::kOPT
86 |         );
87 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
88 |         nvinfer1::Dims opt_dims = engine->getProfileDimensions(
89 |             0, i, nvinfer1::OptProfileSelector::kOPT
90 |         );
91 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
92 | 
93 |         if (opt_dims.d[0] != batch_size) {
94 |             continue;
95 |         }
96 |         if (opt_dims.d[2] == tile_h && opt_dims.d[3] == tile_w) {
97 |             return i;
98 |         }
99 |     }
100 | 
101 |     // finds the first eligible profile
102 |     for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) {
103 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
104 |         nvinfer1::Dims min_dims = engine->getProfileShape(
105 |             input_name, i, nvinfer1::OptProfileSelector::kMIN
106 |         );
107 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
108 |         nvinfer1::Dims min_dims = engine->getProfileDimensions(
109 |             0, i, nvinfer1::OptProfileSelector::kMIN
110 |         );
111 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
112 | 
113 |         if (min_dims.d[0] > batch_size) {
114 |             continue;
115 |         }
116 |         if (min_dims.d[2] > tile_h || min_dims.d[3] > tile_w) {
117 |             continue;
118 |         }
119 | 
120 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
121 |         nvinfer1::Dims max_dims = engine->getProfileShape(
122 |             input_name, i, nvinfer1::OptProfileSelector::kMAX
123 |         );
124 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
125 |         nvinfer1::Dims max_dims = engine->getProfileDimensions(
126 |             0, i, nvinfer1::OptProfileSelector::kMAX
127 |         );
128 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
129 | 
130 |         if (max_dims.d[0] < batch_size) {
131 |             continue;
132 |         }
133 |         if (max_dims.d[2] < tile_h || max_dims.d[3] < tile_w) {
134 |             continue;
135 |         }
136 | 
137 |         return i;
138 |     }
139 | 
140 |     // returns not-found
141 |     return {};
142 | }
143 | 
144 | static inline
145 | std::optional<ErrorMessage> enqueue(
146 |     const MemoryResource & src,
147 |     const MemoryResource & dst,
148 |     const std::unique_ptr<nvinfer1::IExecutionContext> & exec_context,
149 |     cudaStream_t stream
150 | ) noexcept {
151 | 
152 |     const auto set_error = [](const ErrorMessage & message) {
153 |         return message;
154 |     };
155 | 
156 |     checkError(cudaMemcpyAsync(
157 |         src.d_data, src.h_data, src.size,
158 |         cudaMemcpyHostToDevice, stream
159 |     ));
160 | 
161 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
162 |     auto input_name = exec_context->getEngine().getIOTensorName(0);
163 |     auto output_name = exec_context->getEngine().getIOTensorName(1);
164 | 
165 |     if (!exec_context->setTensorAddress(input_name, src.d_data.data)) {
166 |         return set_error("set input tensor address failed");
167 |     }
168 |     if (!exec_context->setTensorAddress(output_name, dst.d_data.data)) {
169 |         return set_error("set output tensor address failed");
170 |     }
171 |     if (!exec_context->enqueueV3(stream)) {
172 |         return set_error("enqueue error");
173 |     }
174 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
175 |     void * bindings[] {
176 |         static_cast<void *>(src.d_data.data),
177 |         static_cast<void *>(dst.d_data.data)
178 |     };
179 | 
180 |     if (!exec_context->enqueueV2(bindings, stream, nullptr)) {
181 |         return set_error("enqueue error");
182 |     }
183 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
184 | 
185 |     checkError(cudaMemcpyAsync(
186 |         dst.h_data, dst.d_data, dst.size,
187 |         cudaMemcpyDeviceToHost, stream
188 |     ));
189 | 
190 |     return {};
191 | }
192 | 
193 | static inline
194 | std::variant<ErrorMessage, GraphExecResource> getGraphExec(
195 |     const MemoryResource & src, const MemoryResource & dst,
196 |     const std::unique_ptr<nvinfer1::IExecutionContext> & exec_context,
197 |     cudaStream_t stream
198 | ) noexcept {
199 | 
200 |     const auto set_error = [](const ErrorMessage & message) {
201 |         return message;
202 |     };
203 | 
204 |     // flush deferred internal state update
205 |     // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/developer-guide/index.html#cuda-graphs
206 |     {
207 |         auto result = enqueue(src, dst, exec_context, stream);
208 |         if (result.has_value()) {
209 |             return set_error(result.value());
210 |         }
211 |         checkError(cudaStreamSynchronize(stream));
212 |     }
213 | 
214 |     checkError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed));
215 |     {
216 |         auto result = enqueue(src, dst, exec_context, stream);
217 |         if (result.has_value()) {
218 |             return set_error(result.value());
219 |         }
220 |     }
221 |     cudaGraph_t graph;
222 |     checkError(cudaStreamEndCapture(stream, &graph));
223 |     cudaGraphExec_t graphexec;
224 |     checkError(cudaGraphInstantiate(&graphexec, graph, nullptr, nullptr, 0));
225 |     checkError(cudaGraphDestroy(graph));
226 | 
227 |     return graphexec;
228 | }
229 | 
230 | static inline
231 | size_t getSize(
232 |     const nvinfer1::Dims & dim
233 | ) noexcept {
234 | 
235 |     size_t ret = 1;
236 |     for (int i = 0; i < dim.nbDims; ++i) {
237 |         ret *= dim.d[i];
238 |     }
239 |     return ret;
240 | }
241 | 
242 | static inline
243 | int getBytesPerSample(nvinfer1::DataType type) noexcept {
244 |     switch (type) {
245 |         case nvinfer1::DataType::kFLOAT:
246 |             return 4;
247 |         case nvinfer1::DataType::kHALF:
248 |             return 2;
249 |         case nvinfer1::DataType::kINT8:
250 |             return 1;
251 |         case nvinfer1::DataType::kINT32:
252 |             return 4;
253 |         case nvinfer1::DataType::kBOOL:
254 |             return 1;
255 |         case nvinfer1::DataType::kUINT8:
256 |             return 1;
257 | #if (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
258 |         case nvinfer1::DataType::kFP8:
259 |             return 1;
260 | #endif // (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
261 | #if NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
262 |         case nvinfer1::DataType::kBF16:
263 |             return 2;
264 |         case nvinfer1::DataType::kINT64:
265 |             return 8;
266 | #endif // NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
267 |         default:
268 |             return 0;
269 |     }
270 | }
271 | 
272 | static inline
273 | std::variant<ErrorMessage, InferenceInstance> getInstance(
274 |     const std::unique_ptr<nvinfer1::ICudaEngine> & engine,
275 |     const std::optional<int> & profile_index,
276 |     const TileSize & tile_size,
277 |     bool use_cuda_graph,
278 | #if NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
279 |     bool & is_dynamic
280 | #else // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
281 |     bool is_dynamic
282 | #endif // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
283 | ) noexcept {
284 | 
285 |     const auto set_error = [](const ErrorMessage & error_message) {
286 |         return error_message;
287 |     };
288 | 
289 |     StreamResource stream {};
290 |     checkError(cudaStreamCreateWithFlags(&stream.data, cudaStreamNonBlocking));
291 | 
292 |     auto exec_context = std::unique_ptr<nvinfer1::IExecutionContext>(
293 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
294 |         engine->createExecutionContext(
295 |             is_dynamic ?
296 |             nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED :
297 |             nvinfer1::ExecutionContextAllocationStrategy::kON_PROFILE_CHANGE
298 |         )
299 | #else
300 |         engine->createExecutionContext()
301 | #endif
302 |     );
303 | 
304 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
305 |     auto input_name = engine->getIOTensorName(0);
306 |     auto output_name = engine->getIOTensorName(1);
307 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
308 | 
309 |     if (!exec_context->allInputDimensionsSpecified()) {
310 |         if (!profile_index.has_value()) {
311 |             return set_error("no valid optimization profile found");
312 |         }
313 | #if NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
314 |         is_dynamic = true;
315 | #endif // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
316 |         exec_context->setOptimizationProfileAsync(profile_index.value(), stream);
317 |         checkError(cudaStreamSynchronize(stream));
318 | 
319 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
320 |         nvinfer1::Dims dims = exec_context->getTensorShape(input_name);
321 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
322 |         nvinfer1::Dims dims = exec_context->getBindingDimensions(0);
323 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
324 | 
325 |         dims.d[0] = 1;
326 | 
327 |         if (std::holds_alternative<RequestedTileSize>(tile_size)) {
328 |             dims.d[2] = std::get<RequestedTileSize>(tile_size).tile_h;
329 |             dims.d[3] = std::get<RequestedTileSize>(tile_size).tile_w;
330 |         } else {
331 |             dims.d[2] = std::get<VideoSize>(tile_size).height;
332 |             dims.d[3] = std::get<VideoSize>(tile_size).width;
333 |         }
334 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
335 |         exec_context->setInputShape(input_name, dims);
336 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
337 |         exec_context->setBindingDimensions(0, dims);
338 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
339 |     } else if (std::holds_alternative<RequestedTileSize>(tile_size)) {
340 | #if NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
341 |         is_dynamic = false;
342 | #endif // NV_TENSORRT_MAJOR < 10 && !defined(TRT_MAJOR_RTX)
343 | 
344 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
345 |         nvinfer1::Dims dims = exec_context->getTensorShape(input_name);
346 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
347 |         nvinfer1::Dims dims = exec_context->getBindingDimensions(0);
348 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
349 | 
350 |         if (std::holds_alternative<RequestedTileSize>(tile_size)) {
351 |             if (dims.d[2] != std::get<RequestedTileSize>(tile_size).tile_h ||
352 |                 dims.d[3] != std::get<RequestedTileSize>(tile_size).tile_w
353 |             ) {
354 |                 return set_error("requested tile size not applicable");
355 |             }
356 |         } else {
357 |             if (dims.d[2] != std::get<VideoSize>(tile_size).height ||
358 |                 dims.d[3] != std::get<VideoSize>(tile_size).width
359 |             ) {
360 |                 return set_error("not supported video dimensions");
361 |             }
362 |         }
363 |     }
364 | 
365 |     MemoryResource src {};
366 |     {
367 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
368 |         auto dim = exec_context->getTensorShape(input_name);
369 |         auto type = engine->getTensorDataType(input_name);
370 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
371 |         auto dim = exec_context->getBindingDimensions(0);
372 |         auto type = engine->getBindingDataType(0);
373 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
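        // (the pre-8.5 branch above addresses I/O by binding index: 0 is the
        // network input and 1 the output, as validated by checkEngine below)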
374 | 
375 |         auto size = getSize(dim) * getBytesPerSample(type);
376 | 
377 |         Resource<void *, cudaFree> d_data {};
378 |         checkError(cudaMalloc(&d_data.data, size));
379 | 
380 |         Resource<void *, cudaFreeHost> h_data {};
381 |         checkError(cudaMallocHost(&h_data.data, size, cudaHostAllocWriteCombined));
382 | 
383 |         src = MemoryResource{
384 |             .h_data = std::move(h_data),
385 |             .d_data = std::move(d_data),
386 |             .size = size
387 |         };
388 |     }
389 | 
390 |     MemoryResource dst {};
391 |     {
392 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
393 |         auto dim = exec_context->getTensorShape(output_name);
394 |         auto type = engine->getTensorDataType(output_name);
395 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
396 |         auto dim = exec_context->getBindingDimensions(1);
397 |         auto type = engine->getBindingDataType(1);
398 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
399 | 
400 |         auto size = getSize(dim) * getBytesPerSample(type);
401 | 
402 |         Resource<void *, cudaFree> d_data {};
403 |         checkError(cudaMalloc(&d_data.data, size));
404 | 
405 |         Resource<void *, cudaFreeHost> h_data {};
406 |         checkError(cudaMallocHost(&h_data.data, size));
407 | 
408 |         dst = MemoryResource{
409 |             .h_data = std::move(h_data),
410 |             .d_data = std::move(d_data),
411 |             .size = size
412 |         };
413 |     }
414 | 
415 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
416 |     Resource<void *, cudaFree> d_context_allocation {};
417 | 
418 |     if (is_dynamic) {
419 |         size_t buffer_size { exec_context->updateDeviceMemorySizeForShapes() };
420 |         if (buffer_size == 0) {
421 |             return set_error("failed to get internal activation buffer size");
422 |         }
423 | 
424 |         checkError(cudaMalloc(&d_context_allocation.data, buffer_size));
425 | 
426 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001
427 |         exec_context->setDeviceMemoryV2(d_context_allocation.data, static_cast<int64_t>(buffer_size));
428 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001
429 |         exec_context->setDeviceMemory(d_context_allocation.data);
430 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001
431 |     }
432 | #endif // NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
433 | 
434 |     GraphExecResource graphexec {};
435 |     if (use_cuda_graph) {
436 |         auto result = getGraphExec(
437 |             src, dst,
438 |             exec_context, stream
439 |         );
440 |         if (std::holds_alternative<GraphExecResource>(result)) {
441 |             graphexec = std::move(std::get<GraphExecResource>(result));
442 |         } else {
443 |             return set_error(std::get<ErrorMessage>(result));
444 |         }
445 |     }
446 | 
447 |     return InferenceInstance{
448 |         .src = std::move(src),
449 |         .dst = std::move(dst),
450 |         .stream = std::move(stream),
451 |         .exec_context = std::move(exec_context),
452 |         .graphexec = std::move(graphexec),
453 | #if NV_TENSORRT_MAJOR >= 10 || defined(TRT_MAJOR_RTX)
454 |         .d_context_allocation = std::move(d_context_allocation)
455 | #endif
456 |     };
457 | }
458 | 
459 | static inline
460 | std::optional<ErrorMessage> checkEngine(
461 |     const std::unique_ptr<nvinfer1::ICudaEngine> & engine,
462 |     bool flexible_output
463 | ) noexcept {
464 | 
465 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
466 |     int num_bindings = engine->getNbIOTensors();
467 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
468 |     int num_bindings = engine->getNbBindings();
469 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
470 | 
471 |     if (num_bindings != 2) {
472 |         return "network binding count must be 2, got " + std::to_string(num_bindings);
473 |     }
474 | 
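    // The checks below are written twice: TensorRT 8.5+ exposes I/O through the
    // name-based tensor API, while older releases use the binding-index API.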
475 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 476 | auto input_name = engine->getIOTensorName(0); 477 | auto output_name = engine->getIOTensorName(1); 478 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 479 | 480 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 481 | if (engine->getTensorIOMode(input_name) != nvinfer1::TensorIOMode::kINPUT) { 482 | return "the first binding should be an input binding"; 483 | } 484 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 485 | if (!engine->bindingIsInput(0)) { 486 | return "the first binding should be an input binding"; 487 | } 488 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 489 | 490 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 491 | const nvinfer1::Dims & input_dims = engine->getTensorShape(input_name); 492 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 493 | const nvinfer1::Dims & input_dims = engine->getBindingDimensions(0); 494 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 495 | 496 | if (input_dims.nbDims != 4) { 497 | return "expects network with 4-D input"; 498 | } 499 | if (input_dims.d[0] != 1) { 500 | return "batch size of network input must be 1"; 501 | } 502 | 503 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 504 | if (engine->getTensorIOMode(output_name) != nvinfer1::TensorIOMode::kOUTPUT) { 505 | return "the second binding should be an output binding"; 506 | } 507 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 508 | if (engine->bindingIsInput(1)) { 509 | return "the second binding should be an output binding"; 510 | } 511 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 512 | 513 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 514 | const nvinfer1::Dims & output_dims = engine->getTensorShape(output_name); 515 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 516 | const nvinfer1::Dims & output_dims = engine->getBindingDimensions(1); 517 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 518 | 519 | if (output_dims.nbDims != 4) { 520 | return "expects network with 4-D output"; 521 | } 522 | if (output_dims.d[0] != 1) { 523 | return "batch size of network output must be 1"; 524 | } 525 | 526 | auto out_channels = output_dims.d[1]; 527 | if (out_channels != 1 && out_channels != 3 && !flexible_output) { 528 | return "output dimensions must be 1 or 3, or enable \"flexible_output\""; 529 | } 530 | 531 | auto in_height = input_dims.d[2]; 532 | auto in_width = input_dims.d[3]; 533 | auto out_height = output_dims.d[2]; 534 | auto out_width = output_dims.d[3]; 535 | if (out_height % in_height != 0 || out_width % in_width != 0) { 536 | return "output dimensions must be divisible by input dimensions"; 537 | } 538 | 539 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX) 540 | for (const auto & name : { input_name, output_name }) { 541 | if (engine->getTensorLocation(name) != nvinfer1::TensorLocation::kDEVICE) { 542 | return "network binding " + std::string{ name } + " should reside on device"; 543 | } 544 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || 
defined(TRT_MAJOR_RTX)
545 |     for (int i = 0; i < 2; i++) {
546 |         if (engine->getLocation(i) != nvinfer1::TensorLocation::kDEVICE) {
547 |             return "network binding " + std::to_string(i) + " should reside on device";
548 |         }
549 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
550 | 
551 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
552 |         if (engine->getTensorFormat(name) != nvinfer1::TensorFormat::kLINEAR) {
553 |             return "expects network IO with layout NCHW (row major linear)";
554 |         }
555 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
556 |         if (engine->getBindingFormat(i) != nvinfer1::TensorFormat::kLINEAR) {
557 |             return "expects network IO with layout NCHW (row major linear)";
558 |         }
559 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 805 || defined(TRT_MAJOR_RTX)
560 |     }
561 | 
562 |     return {};
563 | }
564 | 
565 | static inline
566 | std::variant<ErrorMessage, std::unique_ptr<nvinfer1::ICudaEngine>> initEngine(
567 |     const char * engine_data, size_t engine_nbytes,
568 |     const std::unique_ptr<nvinfer1::IRuntime> & runtime,
569 |     bool flexible_output
570 | ) noexcept {
571 | 
572 |     const auto set_error = [](const ErrorMessage & error_message) {
573 |         return error_message;
574 |     };
575 | 
576 |     std::unique_ptr<nvinfer1::ICudaEngine> engine {
577 |         runtime->deserializeCudaEngine(engine_data, engine_nbytes)
578 |     };
579 | 
580 |     if (!engine) {
581 |         return set_error("engine deserialization failed");
582 |     }
583 | 
584 |     if (auto err = checkEngine(engine, flexible_output); err.has_value()) {
585 |         return set_error(err.value());
586 |     }
587 | 
588 |     return engine;
589 | }
590 | 
591 | // 0: integer, 1: float
592 | static inline
593 | int getSampleType(nvinfer1::DataType type) noexcept {
594 |     switch (type) {
595 |         case nvinfer1::DataType::kFLOAT:
596 |         case nvinfer1::DataType::kHALF:
597 | #if (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
598 |         case nvinfer1::DataType::kFP8:
599 | #endif // (NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 8061 || defined(TRT_MAJOR_RTX)
600 | #if NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
601 |         case nvinfer1::DataType::kBF16:
602 | #endif // NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
603 |             return 1;
604 |         case nvinfer1::DataType::kINT8:
605 |         case nvinfer1::DataType::kINT32:
606 |         case nvinfer1::DataType::kBOOL:
607 |         case nvinfer1::DataType::kUINT8:
608 | #if NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
609 |         case nvinfer1::DataType::kINT64:
610 | #endif // NV_TENSORRT_MAJOR >= 9 || defined(TRT_MAJOR_RTX)
611 |             return 0;
612 |         default:
613 |             return -1;
614 |     }
615 | }
616 | 
617 | #endif // VSTRT_TRT_UTILS_H_
618 | 
--------------------------------------------------------------------------------
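For orientation, a minimal sketch of how the helpers in trt_utils.h compose; this caller is illustrative only, and the `engine_data`/`engine_nbytes` buffer plus the chosen tile size are assumptions rather than code from the repository:

    // Hypothetical caller of trt_utils.h (not part of vs-mlrt). Assumes
    // `engine_data` points to `engine_nbytes` bytes of a serialized engine.
    Logger logger;
    logger.set_verbosity(nvinfer1::ILogger::Severity::kWARNING);

    std::unique_ptr<nvinfer1::IRuntime> runtime {
        nvinfer1::createInferRuntime(logger)
    };

    auto engine_result = initEngine(engine_data, engine_nbytes, runtime, false);
    if (std::holds_alternative<ErrorMessage>(engine_result)) {
        // report std::get<ErrorMessage>(engine_result) and abort
    }
    auto & engine = std::get<std::unique_ptr<nvinfer1::ICudaEngine>>(engine_result);

    // Tile at 512x512 and let selectProfile pick a matching optimization profile.
    TileSize tile_size = RequestedTileSize { .tile_w = 512, .tile_h = 512 };
    auto profile = selectProfile(engine, tile_size);

    bool is_dynamic = false;
    auto instance = getInstance(engine, profile, tile_size, /* use_cuda_graph */ false, is_dynamic);
    // On success, std::get<InferenceInstance>(instance) holds the host/device
    // buffers, CUDA stream and execution context consumed by enqueue().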