├── .github
│   └── workflows
│       ├── linux-migx.yml
│       ├── linux-ncnn.yml
│       ├── linux-ort.yml
│       ├── linux-ov-arm64.yml
│       ├── linux-ov.yml
│       ├── linux-trt-arm64.yml
│       ├── linux-trt.yml
│       ├── macos-ort.yml
│       ├── windows-cuda-dependency.yml
│       ├── windows-hip-dependency.yml
│       ├── windows-migx.yml
│       ├── windows-ncnn.yml
│       ├── windows-ort.yml
│       ├── windows-ov.yml
│       ├── windows-release.yml
│       └── windows-trt.yml
├── LICENSE
├── README.md
├── common
│   ├── convert_float_to_float16.cpp
│   ├── convert_float_to_float16.h
│   ├── onnx_utils.cpp
│   └── onnx_utils.h
├── scripts
│   └── vsmlrt.py
├── vsmigx
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h.in
│   ├── vs_migraphx.cpp
│   └── win32.cpp
├── vsncnn
│   ├── CMakeLists.txt
│   ├── config.h.in
│   ├── onnx2ncnn.cpp
│   ├── onnx2ncnn.hpp
│   └── vs_ncnn.cpp
├── vsort
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h.in
│   ├── vs_onnxruntime.cpp
│   └── win32.cpp
├── vsov
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h.in
│   ├── vs_openvino.cpp
│   └── win32.cpp
└── vstrt
    ├── CMakeLists.txt
    ├── README.md
    ├── config.h.in
    ├── cuda_helper.h
    ├── cuda_utils.h
    ├── inference_helper.h
    ├── longpath.manifest
    ├── trt_utils.h
    ├── trtexec
    │   ├── CMakeLists.txt
    │   ├── logfile.cpp
    │   └── trtexec.patch
    ├── utils.h
    ├── vs_tensorrt.cpp
    └── win32.cpp

/.github/workflows/linux-migx.yml:
--------------------------------------------------------------------------------
1 | name: Build (Linux-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/linux-migx.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vsmigx 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup HIP and MIGraphX 34 | run: | 35 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 36 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3.1 noble main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 37 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 38 | sudo apt update 39 | sudo apt install -y hip-runtime-amd rocm-device-libs migraphx-dev hipcc 40 | ls -R /opt/rocm 41 | 42 | - name: Configure 43 | run: cmake -S . 
-B build -G Ninja -Wno-dev -LA 44 | -D CMAKE_BUILD_TYPE=Release 45 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 46 | -D CMAKE_CXX_COMPILER=g++-13 47 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 48 | -D migraphx_DIR=/opt/rocm/lib/cmake/migraphx 49 | -D MIOpen_DIR=/opt/rocm/lib/cmake/miopen 50 | -D hip_DIR=/opt/rocm/lib/cmake/hip 51 | -D AMDDeviceLibs_DIR=/opt/rocm/lib/cmake/AMDDeviceLibs 52 | -D amd_comgr_DIR=/opt/rocm/lib/cmake/amd_comgr 53 | -D hsa-runtime64_DIR=/opt/rocm/lib/cmake/hsa-runtime64 54 | -D rocblas_DIR=/opt/rocm/lib/cmake/rocblas 55 | -D hipblaslt_DIR=/opt/rocm/lib/cmake/hipblaslt 56 | -D CMAKE_PREFIX_PATH=/opt/rocm/lib/cmake 57 | 58 | - name: Build 59 | run: cmake --build build --verbose 60 | 61 | - name: Install 62 | run: cmake --install build --prefix install 63 | 64 | - name: Prepare for upload 65 | run: | 66 | mkdir artifact 67 | cp -v install/lib/*.so artifact 68 | 69 | - name: Describe 70 | run: git describe --tags --long 71 | 72 | - name: Upload 73 | uses: actions/upload-artifact@v4 74 | with: 75 | name: VSMIGX-Linux-x64 76 | path: vsmigx/artifact 77 | overwrite: true 78 | 79 | -------------------------------------------------------------------------------- /.github/workflows/linux-ncnn.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-NCNN) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsncnn/**' 8 | - '.github/workflows/linux-ncnn.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsncnn 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsncnn/protobuf/install 33 | key: ${{ runner.os }}-vsncnn-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | ref: v3.21.12 41 | fetch-depth: 1 42 | path: vsncnn/protobuf 43 | 44 | - name: Configure protobuf 45 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 49 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 50 | 51 | - name: Build protobuf 52 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 53 | run: cmake --build protobuf/build_rel --verbose 54 | 55 | - name: Install protobuf 56 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 57 | run: cmake --install protobuf/build_rel --prefix protobuf/install 58 | 59 | - name: Cache onnx 60 | id: cache-onnx 61 | uses: actions/cache@v4 62 | with: 63 | path: vsncnn/onnx/install 64 | key: ${{ runner.os }}-vsncnn-onnx-v1 65 | 66 | - name: Checkout onnx 67 | if: steps.cache-onnx.outputs.cache-hit != 'true' 68 | uses: actions/checkout@v4 69 | with: 70 | repository: onnx/onnx 71 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c 72 | fetch-depth: 1 73 | path: vsncnn/onnx 74 | 75 | - name: Configure onnx 76 | if: steps.cache-onnx.outputs.cache-hit != 'true' 77 | run: cmake -S onnx -B onnx/build -G Ninja -LA 78 | -D CMAKE_BUILD_TYPE=Release 79 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 80 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 81 | -D 
Protobuf_LITE_LIBRARY=protobuf/install/lib 82 | -D Protobuf_LIBRARIES=protobuf/install/lib 83 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 84 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 85 | 86 | - name: Build onnx 87 | if: steps.cache-onnx.outputs.cache-hit != 'true' 88 | run: cmake --build onnx/build --verbose 89 | 90 | - name: Install onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --install onnx/build --prefix onnx/install 93 | 94 | - name: Download VapourSynth headers 95 | run: | 96 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 97 | unzip -q vs.zip 98 | mv vapoursynth*/ vapoursynth 99 | 100 | - name: Download NCNN Precompilation 101 | run: | 102 | curl -s -o ncnn.zip -LJO https://github.com/Tencent/ncnn/releases/download/20250503/ncnn-20250503-ubuntu-2404.zip 103 | unzip -q ncnn.zip 104 | 105 | - name: Configure 106 | run: cmake -S . -B build -G Ninja -LA 107 | -D CMAKE_BUILD_TYPE=Release 108 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth/include 109 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 110 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 111 | -D ncnn_DIR=ncnn-20250503-ubuntu-2404/lib/cmake/ncnn 112 | -D CMAKE_CXX_STANDARD=20 113 | 114 | - name: Build 115 | run: cmake --build build --verbose 116 | 117 | - name: Install 118 | run: cmake --install build --prefix install 119 | 120 | - name: Prepare for upload 121 | run: | 122 | mkdir artifact 123 | cp -v install/lib/*.so artifact 124 | 125 | - name: Describe 126 | run: git describe --tags --long 127 | 128 | - name: Upload 129 | uses: actions/upload-artifact@v4 130 | with: 131 | name: vsncnn-linux-x64 132 | path: vsncnn/artifact 133 | -------------------------------------------------------------------------------- /.github/workflows/linux-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/linux-ort.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-22.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsort 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsort/protobuf/install 33 | key: ${{ runner.os }}-vsort-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | # follows protobuf in https://github.com/microsoft/onnxruntime/blob/v1.17.1/cmake/external/onnxruntime_external_deps.cmake#L183 41 | # if you change this, remember to bump the version of the cache key. 
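# A quick way to re-check this pin against the onnxruntime file referenced above
# (raw URL derived from that link; the command is an illustration and is not
# executed in this workflow):
#   curl -sL https://raw.githubusercontent.com/microsoft/onnxruntime/v1.17.1/cmake/external/onnxruntime_external_deps.cmake | grep -in protobuf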
42 | ref: v3.21.12 43 | fetch-depth: 1 44 | path: vsort/protobuf 45 | 46 | - name: Configure protobuf 47 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 48 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 49 | -D CMAKE_BUILD_TYPE=Release 50 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 51 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 52 | 53 | - name: Build protobuf 54 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 55 | run: cmake --build protobuf/build_rel --verbose 56 | 57 | - name: Install protobuf 58 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 59 | run: cmake --install protobuf/build_rel --prefix protobuf/install 60 | 61 | - name: Cache onnx 62 | id: cache-onnx 63 | uses: actions/cache@v4 64 | with: 65 | path: vsort/onnx/install 66 | key: ${{ runner.os }}-vsort-onnx-v1 67 | 68 | - name: Checkout onnx 69 | if: steps.cache-onnx.outputs.cache-hit != 'true' 70 | uses: actions/checkout@v4 71 | with: 72 | repository: onnx/onnx 73 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/v1.17.1/cmake/external 74 | # if you change this, remember to bump the version of the cache key. 75 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c 76 | fetch-depth: 1 77 | path: vsort/onnx 78 | 79 | - name: Configure onnx 80 | if: steps.cache-onnx.outputs.cache-hit != 'true' 81 | run: cmake -S onnx -B onnx/build -G Ninja -LA 82 | -D CMAKE_BUILD_TYPE=Release 83 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 84 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 85 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 86 | -D Protobuf_LIBRARIES=protobuf/install/lib 87 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 88 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup ONNX Runtime 105 | run: | 106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-cuda12-1.17.1.tgz 107 | tar -xf ort.tgz 108 | mv onnxruntime-* onnxruntime -v 109 | 110 | - name: Setup CUDA 111 | run: | 112 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 113 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 114 | sudo apt-get update 115 | sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1 116 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 117 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 118 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 119 | 120 | - name: Configure 121 | run: cmake -S . 
-B build -G Ninja -LA 122 | -D CMAKE_BUILD_TYPE=Release 123 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 124 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 125 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include 126 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib 127 | -D ENABLE_CUDA=1 128 | -D CUDAToolkit_ROOT=/usr/local/cuda 129 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 130 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 131 | -D CMAKE_CXX_STANDARD=20 132 | 133 | - name: Build 134 | run: cmake --build build --verbose 135 | 136 | - name: Install 137 | run: cmake --install build --prefix install 138 | 139 | - name: Prepare for upload 140 | run: | 141 | mkdir artifact 142 | cp -v install/lib/*.so artifact 143 | 144 | - name: Describe 145 | run: git describe --tags --long 146 | 147 | - name: Upload 148 | uses: actions/upload-artifact@v4 149 | with: 150 | name: vsort-linux-x64-cuda12.1 151 | path: vsort/artifact 152 | -------------------------------------------------------------------------------- /.github/workflows/linux-ov-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-OV, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsov/**' 7 | - '.github/workflows/linux-ov-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vsov 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Cache protobuf 28 | id: cache-protobuf 29 | uses: actions/cache@v4 30 | with: 31 | path: vsov/protobuf/install 32 | key: ${{ runner.os }}-vsov-protobuf-arm64-v1 33 | 34 | - name: Checkout protobuf 35 | uses: actions/checkout@v4 36 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 37 | with: 38 | repository: protocolbuffers/protobuf 39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf 40 | # if you change this, remember to bump the version of the cache key. 41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-arm64-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 
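# The sha below is the commit the onnx submodule points at in the 2024.6.0 tag;
# one hedged way to read it without cloning is the GitHub contents API (the
# submodule path is an assumption, and the command is illustrative only):
#   curl -s "https://api.github.com/repos/openvinotoolkit/openvino/contents/thirdparty/onnx/onnx?ref=2024.6.0" | grep '"sha"'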
74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu20_2024.6.0.17404.4c0f47d2335_arm64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . -B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-ARM64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/linux-ov.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-OV) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsov/**' 7 | - '.github/workflows/linux-ov.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-22.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vsov 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Cache protobuf 28 | id: cache-protobuf 29 | uses: actions/cache@v4 30 | with: 31 | path: vsov/protobuf/install 32 | key: ${{ runner.os }}-vsov-protobuf-v1 33 | 34 | - name: Checkout protobuf 35 | uses: actions/checkout@v4 36 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 37 | with: 38 | repository: protocolbuffers/protobuf 39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf 40 | # if you change this, remember to bump the version of the cache key. 
41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu24_2024.6.0.17404.4c0f47d2335_x86_64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . 
-B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-x64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.8.0.43-1+cuda12.8 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-cudart-dev-12-8 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-ARM64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.9.0.34-1+cuda12.8 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-cudart-dev-12-8 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-x64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/macos-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (macOS-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/macos-ort.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-macos: 13 | runs-on: macos-14 14 | 15 | defaults: 16 | run: 17 | working-directory: vsort 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: brew install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsort/protobuf/install 33 | key: ${{ runner.os }}-vsort-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | ref: v3.21.12 41 | fetch-depth: 1 42 | path: vsort/protobuf 43 | 44 | - name: Configure protobuf 45 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 49 | -D protobuf_BUILD_SHARED_LIBS=OFF 50 | -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsort/onnx/install 65 | key: ${{ runner.os }}-vsort-onnx-v2 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/main/cmake/external 73 | ref: 595228d99e3977ac27cb79d5963adda262af99ad 74 | fetch-depth: 1 75 | path: vsort/onnx 76 | 77 | - name: Configure onnx 78 | if: steps.cache-onnx.outputs.cache-hit != 'true' 79 | run: cmake -S onnx -B onnx/build -G Ninja -LA 80 | -D CMAKE_BUILD_TYPE=Release 81 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 82 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 83 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 84 | -D Protobuf_LIBRARIES=protobuf/install/lib 85 | -D ONNX_USE_LITE_PROTO=ON 86 | -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF 88 | -D ONNX_ML=0 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake 
--install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | curl -L -o vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup ONNX Runtime 105 | run: | 106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.20.0/onnxruntime-osx-arm64-1.20.0.tgz 107 | tar -xf ort.tgz 108 | mv onnxruntime-* onnxruntime 109 | 110 | - name: Configure 111 | run: cmake -S . -B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -mcpu=apple-m1" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include 116 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib 117 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 118 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 119 | -D CMAKE_CXX_STANDARD=20 120 | -D ENABLE_COREML=ON 121 | 122 | - name: Build 123 | run: cmake --build build --verbose 124 | 125 | - name: Install 126 | run: cmake --install build --prefix install 127 | 128 | - name: Prepare for upload 129 | run: | 130 | mkdir artifact 131 | cp -v install/lib/*.dylib artifact 132 | 133 | - name: Describe 134 | run: git describe --tags --long 135 | 136 | - name: Upload 137 | uses: actions/upload-artifact@v4 138 | with: 139 | name: vsort-macos-arm64 140 | path: vsort/artifact 141 | -------------------------------------------------------------------------------- /.github/workflows/windows-cuda-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-cuda dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2025 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download cuDNN inference library 32 | run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip -o cudnn.zip 33 | 34 | - name: Extract cuDNN library 35 | run: unzip cudnn.zip 36 | 37 | - name: Move cuDNN library 38 | run: | 39 | mkdir -p vsmlrt-cuda 40 | mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v 41 | rm vsmlrt-cuda/cudnn_*_train*.dll -v 42 | 43 | - name: Download TensorRT library 44 | run: | 45 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip 46 | 47 | - name: Extract TensorRT library 48 | run: | 49 | unzip trt.zip 50 | mv TensorRT-*/ TensorRT/ 51 | 52 | - name: Move TensorRT library 53 | run: mv TensorRT/lib/*.dll vsmlrt-cuda -v 54 | 55 | - name: Download CUDA Libraries 56 | shell: cmd 57 | run: | 58 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe 59 | cuda_installer.exe -s cudart_12.9 cublas_12.9 cufft_12.9 cupti_12.9 nvrtc_12.9 60 | 61 | - name: Move CUDA Libraries 62 | shell: cmd 63 | run: | 64 | move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda 65 | move "C:\Program 
Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\*.dll" vsmlrt-cuda 66 | del vsmlrt-cuda\cudart32*.dll 67 | 68 | - name: Setup VC commands 69 | uses: ilammy/msvc-dev-cmd@v1 70 | with: 71 | arch: amd64 72 | 73 | - name: Copy VC Runtime Libraries 74 | shell: bash 75 | run: | 76 | cd vsmlrt-cuda 77 | while true; do 78 | changed=false 79 | for dll in *.[dD][lL][lL]; do 80 | for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do 81 | echo "finding $dep for $dll" 82 | if ! test -f ./"$dep"; then 83 | changed=true 84 | src="$(where "$dep" | grep -i 'MSVC' | head -1)" 85 | echo "copying $src for $dep" 86 | test -f "$src" || exit 1 87 | cp -f "$src" . 88 | fi 89 | done 90 | done 91 | $changed || break 92 | done 93 | 94 | - name: Compress 95 | run: | 96 | 7z a -t7z -bb3 -mx=9 vsmlrt-cuda.7z vsmlrt-cuda 97 | 98 | - name: Upload 99 | uses: actions/upload-artifact@v4 100 | with: 101 | name: vsmlrt-cuda 102 | path: vsmlrt-cuda.7z 103 | retention-days: 1 104 | compression-level: 0 105 | 106 | - name: Rename release asset 107 | run: | 108 | mv vsmlrt-cuda.7z vsmlrt-cuda.${{ github.event.inputs.tag}}.7z 109 | 110 | - name: Release 111 | uses: softprops/action-gh-release@v2 112 | with: 113 | tag_name: ${{ github.event.inputs.tag }} 114 | files: vsmlrt-cuda.${{ github.event.inputs.tag }}.7z 115 | fail_on_unmatched_files: true 116 | generate_release_notes: false 117 | prerelease: true 118 | -------------------------------------------------------------------------------- /.github/workflows/windows-hip-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-hip dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2022 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download MIGraphX Precompilation 32 | run: | 33 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 34 | 35 | - name: Extract MIGraphX Precompilation 36 | run: | 37 | unzip migx.zip 38 | 39 | - name: Move MIGraphX Precompilation 40 | run: | 41 | mkdir vsmlrt-hip 42 | mv migraphx/bin/* vsmlrt-hip -v 43 | 44 | - name: Setup VC commands 45 | uses: ilammy/msvc-dev-cmd@v1 46 | with: 47 | arch: amd64 48 | 49 | - name: List Dependencies 50 | shell: bash 51 | run: | 52 | cd vsmlrt-hip 53 | for dll in *.[dD][lL][lL]; do 54 | echo $(dumpbin -dependents "$dll") 55 | done 56 | 57 | - name: Cache HIP 58 | id: cache-hip 59 | uses: actions/cache@v4 60 | with: 61 | path: C:\Program Files\AMD\ROCm 62 | key: ${{ runner.os }}-rocm-6.2.4 63 | 64 | - name: Setup HIP 65 | if: steps.cache-hip.outputs.cache-hit != 'true' 66 | shell: pwsh 67 | run: | 68 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 69 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 70 | 71 | - name: Move HIP Libraries 72 | shell: cmd 73 | run: | 74 | move "C:\Program Files\AMD\ROCm\6.2\bin\amd_comgr_2.dll" vsmlrt-hip 75 | move "C:\Program Files\AMD\ROCm\6.2\bin\amdhip64_6.dll" 
vsmlrt-hip 76 | move "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc0602.dll" vsmlrt-hip 77 | move "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc-builtins0602.dll" vsmlrt-hip 78 | 79 | - name: Compress 80 | run: | 81 | 7z a -t7z -mx=3 vsmlrt-hip.7z vsmlrt-hip 82 | 83 | - name: Upload 84 | uses: actions/upload-artifact@v4 85 | with: 86 | name: vsmlrt-hip 87 | path: vsmlrt-hip.7z 88 | retention-days: 1 89 | compression-level: 0 90 | 91 | - name: Rename release asset 92 | run: | 93 | mv vsmlrt-hip.7z vsmlrt-hip.${{ github.event.inputs.tag}}.7z 94 | 95 | - name: Release 96 | uses: softprops/action-gh-release@v2 97 | with: 98 | tag_name: ${{ github.event.inputs.tag }} 99 | files: vsmlrt-hip.${{ github.event.inputs.tag }}.7z 100 | fail_on_unmatched_files: true 101 | generate_release_notes: false 102 | prerelease: true 103 | -------------------------------------------------------------------------------- /.github/workflows/windows-migx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/windows-migx.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2022 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vsmigx 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Cache HIP 42 | id: cache-hip 43 | uses: actions/cache@v4 44 | with: 45 | path: C:\Program Files\AMD\ROCm 46 | key: ${{ runner.os }}-rocm-6.2.4 47 | 48 | - name: Setup HIP 49 | if: steps.cache-hip.outputs.cache-hit != 'true' 50 | shell: pwsh 51 | run: | 52 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 53 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 54 | 55 | - name: Download MIGraphX Precompilation 56 | run: | 57 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 58 | unzip -q migx.zip 59 | 60 | - name: Download VapourSynth headers 61 | run: | 62 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 63 | unzip -q vs.zip 64 | mv vapoursynth-*/ vapoursynth/ 65 | 66 | - name: Configure 67 | run: cmake -S . 
-B build -G Ninja -Wno-dev -LA 68 | -D CMAKE_BUILD_TYPE=Release 69 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 70 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%/vapoursynth/include" 71 | -D hip_DIR="C:/Program Files/AMD/ROCm/6.2/lib/cmake/hip" 72 | -D HIP_PLATFORM=amd 73 | -D migraphx_DIR="%cd%/migraphx/lib/cmake/migraphx" 74 | 75 | - name: Build 76 | run: cmake --build build --verbose 77 | 78 | - name: Install 79 | run: cmake --install build --prefix install 80 | 81 | - name: Prepare for upload 82 | run: | 83 | mkdir artifact 84 | copy install\bin\vsmigx.dll artifact\ 85 | 86 | - name: Describe 87 | run: git describe --tags --long 88 | 89 | - name: Dump dependencies 90 | run: dumpbin /dependents artifact/vsmigx.dll 91 | 92 | - name: Upload 93 | uses: actions/upload-artifact@v4 94 | with: 95 | name: VSMIGX-Windows-x64 96 | path: vsmigx/artifact 97 | 98 | - name: Compress artifact for release 99 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 100 | run: | 101 | cd artifact 102 | 7z a -t7z -mx=7 ../../VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z . 103 | 104 | - name: Release 105 | uses: softprops/action-gh-release@v2 106 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 107 | with: 108 | tag_name: ${{ inputs.tag }} 109 | files: VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z 110 | fail_on_unmatched_files: true 111 | generate_release_notes: false 112 | prerelease: true 113 | 114 | -------------------------------------------------------------------------------- /.github/workflows/windows-ncnn.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-NCNN) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsncnn/**' 8 | - '.github/workflows/windows-ncnn.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | ncnn_tag: 16 | description: 'which tag of ncnn to use' 17 | required: true 18 | default: 'latest' 19 | type: string 20 | workflow_dispatch: 21 | inputs: 22 | tag: 23 | description: 'which tag to upload to' 24 | default: '' 25 | ncnn_tag: 26 | description: 'which tag of ncnn to use' 27 | required: true 28 | default: 'latest' 29 | type: string 30 | 31 | jobs: 32 | build-windows: 33 | runs-on: windows-2022 34 | 35 | defaults: 36 | run: 37 | shell: cmd 38 | working-directory: vsncnn 39 | 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v4 43 | with: 44 | fetch-depth: 0 45 | 46 | - name: Setup MSVC 47 | uses: ilammy/msvc-dev-cmd@v1 48 | 49 | - name: Cache protobuf 50 | id: cache-protobuf 51 | uses: actions/cache@v4 52 | with: 53 | path: vsncnn/protobuf/install 54 | key: ${{ runner.os }}-vsncnn-protobuf-v3.16.0 55 | 56 | - name: Checkout protobuf 57 | uses: actions/checkout@v4 58 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 59 | with: 60 | repository: protocolbuffers/protobuf 61 | # follows protobuf in https://github.com/onnx/onnx/tree/v1.12.0#windows 62 | # if you change this, remember to bump the version of the cache key of protobuf and onnx. 
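# protobuf, onnx, and the plugin itself are all built against the static MSVC
# runtime below (protobuf_MSVC_STATIC_RUNTIME=ON, ONNX_USE_MSVC_STATIC_RUNTIME=1,
# CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded); mixing /MT and /MD objects fails at
# link time with LNK2038. A hedged post-build sanity check, not part of this CI:
#   dumpbin /dependents install\bin\vsncnn.dll | findstr /i "vcruntime msvcp"
# (no output is expected when the static runtime is used throughout)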
63 | ref: v3.16.0 64 | path: vsncnn/protobuf 65 | 66 | - name: Configure protobuf 67 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 68 | run: cmake -S protobuf\cmake -B protobuf\build_rel -G Ninja -LA 69 | -D CMAKE_BUILD_TYPE=Release 70 | -D protobuf_BUILD_SHARED_LIBS=OFF 71 | -D protobuf_BUILD_TESTS=OFF 72 | -D protobuf_MSVC_STATIC_RUNTIME=ON 73 | -D CMAKE_POLICY_VERSION_MINIMUM=3.5 74 | 75 | - name: Build protobuf 76 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 77 | run: cmake --build protobuf\build_rel --verbose 78 | 79 | - name: Install protobuf 80 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 81 | run: cmake --install protobuf\build_rel --prefix protobuf\install 82 | 83 | - name: Cache onnx 84 | id: cache-onnx 85 | uses: actions/cache@v4 86 | with: 87 | path: vsncnn/onnx/install 88 | key: ${{ runner.os }}-vsncnn-onnx-v1.12.0-protobuf-v3.16.0 89 | 90 | - name: Checkout onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | uses: actions/checkout@v4 93 | with: 94 | repository: onnx/onnx 95 | ref: v1.12.0 96 | path: vsncnn/onnx 97 | 98 | - name: Configure onnx 99 | if: steps.cache-onnx.outputs.cache-hit != 'true' 100 | run: cmake -S onnx -B onnx\build -G Ninja -LA 101 | -D CMAKE_BUILD_TYPE=Release 102 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 103 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 104 | -D Protobuf_LIBRARIES=protobuf\install\lib 105 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 106 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 107 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 108 | -D CMAKE_POLICY_VERSION_MINIMUM=3.5 109 | 110 | - name: Build onnx 111 | if: steps.cache-onnx.outputs.cache-hit != 'true' 112 | run: cmake --build onnx\build --verbose 113 | 114 | - name: Install onnx 115 | if: steps.cache-onnx.outputs.cache-hit != 'true' 116 | run: cmake --install onnx\build --prefix onnx\install 117 | 118 | - name: Download VapourSynth headers 119 | run: | 120 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 121 | unzip -q vs.zip 122 | mv vapoursynth-*/ vapoursynth/ 123 | 124 | - name: Download NCNN Precompilation 125 | shell: bash 126 | run: | 127 | rev="${{github.event.inputs.ncnn_tag || inputs.ncnn_tag || 'latest'}}" 128 | if [ "$rev" == "latest" ]; then 129 | url="https://github.com/AmusementClub/ncnn/releases/latest/download/ncnn-gpu-x64-windows.zip" 130 | else 131 | url="https://github.com/AmusementClub/ncnn/releases/download/$rev/ncnn-gpu-x64-windows.zip" 132 | fi 133 | curl -s -o ncnn.zip -LJO "$url" 134 | unzip -q ncnn.zip 135 | 136 | # follows vulkan sdk in https://github.com/AmusementClub/ncnn/blob/github-actions/.github/workflows/windows-x64-gpu.yml 137 | - name: Setup Vulkan SDK 138 | shell: pwsh 139 | run: | 140 | Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.275.0/windows/VulkanSDK-1.3.275.0-Installer.exe?Human=true -OutFile VulkanSDK.exe 141 | $installer = Start-Process -FilePath VulkanSDK.exe -Wait -PassThru -ArgumentList "--accept-licenses --default-answer --confirm-command install"; 142 | $installer.WaitForExit(); 143 | 144 | - name: Configure 145 | run: cmake -S . 
-B build -G Ninja -LA 146 | -D CMAKE_BUILD_TYPE=Release 147 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 148 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 149 | -D protobuf_DIR=protobuf\install\cmake 150 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 151 | -D ncnn_DIR=ncnn\lib\cmake\ncnn 152 | -D CMAKE_CXX_STANDARD=20 153 | env: 154 | VULKAN_SDK: C:\VulkanSDK\1.3.275.0 155 | 156 | - name: Build 157 | run: cmake --build build --verbose 158 | 159 | - name: Install 160 | run: | 161 | cmake --install build --prefix install 162 | mkdir artifact 163 | copy install\bin\vsncnn.dll artifact\ 164 | 165 | - name: Upload 166 | uses: actions/upload-artifact@v4 167 | with: 168 | name: VSNCNN-GPU-Windows-x64 169 | path: vsncnn/artifact 170 | 171 | - name: Setup Python portable 172 | run: | 173 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 174 | 7z x python.zip -ovs_portable 175 | 176 | - name: Install VapourSynth portable 177 | run: | 178 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 179 | 7z x vs.7z -ovs_portable -y 180 | 181 | - name: Copy plugin & swiftshader 182 | run: | 183 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 184 | copy ncnn\tests\* vs_portable\ 185 | 186 | - name: Install waifu2x model 187 | run: | 188 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 189 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 190 | 191 | - name: Download x265 192 | run: | 193 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 194 | 7z x x265.7z -ovs_portable\ 195 | 196 | - name: Create script 197 | shell: bash 198 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 199 | 200 | - name: Run vspipe 201 | shell: bash 202 | run: | 203 | set -ex 204 | vs_portable/vspipe -i test.vpy - 205 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 206 | ls -l out.hevc x265.log 207 | cat x265.log 208 | grep -F 'encoded 10 frames' x265.log || exit 2 209 | grep -i 'error' x265.log && exit 1 210 | exit 0 211 | 212 | - name: Create script (flexible output) 213 | shell: bash 214 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 215 | 216 | - name: Run vspipe (flexible output) 217 | shell: bash 218 | run: | 219 | set -ex 220 | vs_portable/vspipe -i test_flexible_output.vpy - 221 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 
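# The test_flexible_output.vpy one-liner from the previous step, unrolled here
# for readability (same logic, transcribed as a sketch; not executed in this step):
#   prop = 'test'
#   output = core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(
#       r"waifu2x\upconv_7_anime_style_art_rgb\scale2.0x_model.onnx",
#       builtin=True, flexible_output_prop=prop)
#   planes = [output['clip'].std.PropToClip(prop=f'{prop}{i}')
#             for i in range(output['num_planes'])]
#   core.std.ShufflePlanes(planes, [0, 0, 0], vs.RGB).resize.Bicubic(
#       format=vs.YUV420P10, matrix_s='709').set_output()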
222 | ls -l out.hevc x265.log 223 | cat x265.log 224 | grep -F 'encoded 10 frames' x265.log || exit 2 225 | grep -i 'error' x265.log && exit 1 226 | exit 0 227 | 228 | - name: Describe 229 | run: git describe --tags --long 230 | 231 | - name: Dump dependencies 232 | run: dumpbin /dependents artifact\vsncnn.dll 233 | 234 | - name: Compress artifact for release 235 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 236 | run: | 237 | cd artifact 238 | 7z a -t7z -mx=7 ../../VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z . 239 | 240 | - name: Release 241 | uses: softprops/action-gh-release@v2 242 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 243 | with: 244 | tag_name: ${{ inputs.tag }} 245 | files: VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z 246 | fail_on_unmatched_files: true 247 | generate_release_notes: false 248 | prerelease: true 249 | -------------------------------------------------------------------------------- /.github/workflows/windows-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/windows-ort.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | workflow_dispatch: 16 | inputs: 17 | tag: 18 | description: 'which tag to upload to' 19 | default: '' 20 | 21 | jobs: 22 | build-windows: 23 | runs-on: windows-2022 24 | 25 | defaults: 26 | run: 27 | shell: cmd 28 | working-directory: vsort 29 | 30 | steps: 31 | - name: Checkout repo 32 | uses: actions/checkout@v4 33 | with: 34 | fetch-depth: 0 35 | 36 | - name: Setup MSVC 37 | uses: ilammy/msvc-dev-cmd@v1 38 | 39 | - name: Setup Ninja 40 | run: pip install ninja 41 | 42 | - name: Cache protobuf 43 | id: cache-protobuf 44 | uses: actions/cache@v4 45 | with: 46 | path: vsort/protobuf/install 47 | key: ${{ runner.os }}-vsort-protobuf-v4 48 | 49 | - name: Checkout protobuf 50 | uses: actions/checkout@v4 51 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 52 | with: 53 | repository: protocolbuffers/protobuf 54 | # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L203 55 | # if you change this, remember to bump the version of the cache key. 56 | ref: v3.21.12 57 | fetch-depth: 1 58 | path: vsort/protobuf 59 | 60 | - name: Configure protobuf 61 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 62 | run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA 63 | -D CMAKE_BUILD_TYPE=Release 64 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 65 | 66 | - name: Build protobuf 67 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 68 | run: cmake --build protobuf\build_rel --verbose 69 | 70 | - name: Install protobuf 71 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 72 | run: cmake --install protobuf\build_rel --prefix protobuf\install 73 | 74 | - name: Cache onnx 75 | id: cache-onnx 76 | uses: actions/cache@v4 77 | with: 78 | path: vsort/onnx/install 79 | key: ${{ runner.os }}-vsort-onnx-v5 80 | 81 | - name: Checkout onnx 82 | if: steps.cache-onnx.outputs.cache-hit != 'true' 83 | uses: actions/checkout@v4 84 | with: 85 | repository: onnx/onnx 86 | # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external 87 | # if you change this, remember to bump the version of the cache key. 
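# onnx is tracked as a git submodule of that onnxruntime fork, so the sha to pin
# can also be read from a local clone of the superproject (illustrative command,
# not run in this workflow):
#   git ls-tree HEAD cmake/external/onnx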
88 | ref: 990217f043af7222348ca8f0301e17fa7b841781 89 | fetch-depth: 1 90 | path: vsort/onnx 91 | 92 | - name: Configure onnx 93 | if: steps.cache-onnx.outputs.cache-hit != 'true' 94 | run: cmake -S onnx -B onnx\build -G Ninja -LA 95 | -D CMAKE_BUILD_TYPE=Release 96 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 97 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 98 | -D Protobuf_LIBRARIES=protobuf\install\lib 99 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 100 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 101 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 102 | 103 | - name: Build onnx 104 | if: steps.cache-onnx.outputs.cache-hit != 'true' 105 | run: cmake --build onnx\build --verbose 106 | 107 | - name: Install onnx 108 | if: steps.cache-onnx.outputs.cache-hit != 'true' 109 | run: cmake --install onnx\build --prefix onnx\install 110 | 111 | - name: Download VapourSynth headers 112 | run: | 113 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 114 | unzip -q vs.zip 115 | mv vapoursynth-*/ vapoursynth/ 116 | 117 | - name: Download ONNX Runtime Precompilation 118 | run: | 119 | curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-8036-geb41d57f21-240425-0428/onnxruntime-gpu-win64.zip 120 | unzip -q ortgpu.zip 121 | 122 | - name: Cache CUDA 123 | id: cache-cuda 124 | uses: actions/cache@v4 125 | with: 126 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 127 | key: ${{ runner.os }}-cuda-12.4.1 128 | 129 | - name: Setup CUDA 130 | if: steps.cache-cuda.outputs.cache-hit != 'true' 131 | run: | 132 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe 133 | cuda_installer.exe -s nvcc_12.4 cudart_12.4 134 | 135 | - name: Configure 136 | run: cmake -S . 
-B build -G Ninja -LA 137 | -D CMAKE_BUILD_TYPE=Release 138 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 139 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 140 | -D protobuf_DIR=protobuf\install\cmake 141 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 142 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime 143 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib 144 | -D ENABLE_CUDA=1 145 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" 146 | -D ENABLE_DML=1 147 | -D CMAKE_CXX_STANDARD=20 148 | 149 | - name: Build 150 | run: cmake --build build --verbose 151 | 152 | - name: Install 153 | run: | 154 | cmake --install build --prefix install 155 | mkdir artifact 156 | mkdir artifact\vsort 157 | copy install\bin\vsort.dll artifact\ 158 | copy onnxruntime-gpu\bin\*.dll artifact\vsort\ 159 | copy onnxruntime-gpu\lib\*.dll artifact\vsort\ 160 | 161 | - name: Download DirectML Library 162 | # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44 163 | run: | 164 | curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1 165 | unzip -q directml.nupkg -d dml 166 | copy dml\bin\x64-win\DirectML.dll artifact\vsort\ 167 | 168 | - name: Upload 169 | uses: actions/upload-artifact@v4 170 | with: 171 | name: VSORT-Windows-x64 172 | path: vsort/artifact 173 | 174 | - name: Setup Python portable 175 | run: | 176 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.10/python-3.9.10-embed-amd64.zip 177 | 7z x python.zip -ovs_portable 178 | 179 | - name: Install VapourSynth portable 180 | run: | 181 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 182 | 7z x vs.7z -ovs_portable -y 183 | 184 | - name: Copy plugin 185 | run: | 186 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 187 | mkdir vs_portable\vapoursynth64\plugins\vsort\ 188 | copy artifact\vsort\*.dll vs_portable\vapoursynth64\plugins\vsort\ 189 | 190 | - name: Install waifu2x model 191 | run: | 192 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 193 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 194 | 195 | - name: Download x265 196 | run: | 197 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 198 | 7z x x265.7z -ovs_portable\ 199 | 200 | - name: Create script 201 | shell: bash 202 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 203 | 204 | - name: Run vspipe 205 | shell: bash 206 | run: | 207 | set -ex 208 | vs_portable/vspipe -i test.vpy - 209 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 210 | ls -l out.hevc x265.log 211 | cat x265.log 212 | grep -F 'encoded 10 frames' x265.log || exit 2 213 | grep -i 'error' x265.log && exit 1 214 | exit 0 215 | 216 | - name: Create script (fp16) 217 | shell: bash 218 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, 
file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 219 | 220 | - name: Run vspipe (fp16) 221 | shell: bash 222 | run: | 223 | set -ex 224 | vs_portable/vspipe -i test_fp16.vpy - 225 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 226 | ls -l out.hevc x265.log 227 | cat x265.log 228 | grep -F 'encoded 10 frames' x265.log || exit 2 229 | grep -i 'error' x265.log && exit 1 230 | exit 0 231 | 232 | - name: Create script (fp16 input) 233 | shell: bash 234 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy 235 | 236 | - name: Run vspipe (fp16 input) 237 | shell: bash 238 | run: | 239 | set -ex 240 | vs_portable/vspipe -i test_fp16_input.vpy - 241 | vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 242 | ls -l out.hevc x265.log 243 | cat x265.log 244 | grep -F 'encoded 10 frames' x265.log || exit 2 245 | grep -i 'error' x265.log && exit 1 246 | exit 0 247 | 248 | - name: Create script (fp16 output) 249 | shell: bash 250 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy 251 | 252 | - name: Run vspipe (fp16 output) 253 | shell: bash 254 | run: | 255 | set -ex 256 | vs_portable/vspipe -i test_fp16_output.vpy - 257 | vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 258 | ls -l out.hevc x265.log 259 | cat x265.log 260 | grep -F 'encoded 10 frames' x265.log || exit 2 261 | grep -i 'error' x265.log && exit 1 262 | exit 0 263 | 264 | - name: Create script (flexible output) 265 | shell: bash 266 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 267 | 268 | - name: Run vspipe (flexible output) 269 | shell: bash 270 | run: | 271 | set -ex 272 | vs_portable/vspipe -i test_flexible_output.vpy - 273 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 274 | ls -l out.hevc x265.log 275 | cat x265.log 276 | grep -F 'encoded 10 frames' x265.log || exit 2 277 | 
grep -i 'error' x265.log && exit 1 278 | exit 0 279 | 280 | - name: Describe 281 | run: git describe --tags --long 282 | 283 | - name: Dump dependencies 284 | run: dumpbin /dependents artifact\vsort.dll 285 | 286 | - name: Compress artifact for release 287 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 288 | run: | 289 | cd artifact 290 | 7z a -t7z -mx=7 ../../VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z . 291 | 292 | - name: Release 293 | uses: softprops/action-gh-release@v2 294 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 295 | with: 296 | tag_name: ${{ inputs.tag }} 297 | files: VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z 298 | fail_on_unmatched_files: true 299 | generate_release_notes: false 300 | prerelease: true 301 | -------------------------------------------------------------------------------- /.github/workflows/windows-ov.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-OV) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsov/**' 8 | - '.github/workflows/windows-ov.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | ov_tag: 16 | description: 'which tag of openvino to use' 17 | required: true 18 | default: 'latest' 19 | type: string 20 | workflow_dispatch: 21 | inputs: 22 | tag: 23 | description: 'which tag to upload to' 24 | default: '' 25 | ov_tag: 26 | description: 'which tag of openvino to use' 27 | required: true 28 | default: 'latest' 29 | type: string 30 | 31 | jobs: 32 | build-windows: 33 | runs-on: windows-2022 34 | 35 | defaults: 36 | run: 37 | shell: cmd 38 | working-directory: vsov 39 | 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v4 43 | with: 44 | fetch-depth: 0 45 | 46 | - name: Setup MSVC 47 | uses: ilammy/msvc-dev-cmd@v1 48 | 49 | - name: Setup Ninja 50 | run: pip install ninja 51 | 52 | - name: Cache protobuf 53 | id: cache-protobuf 54 | uses: actions/cache@v4 55 | with: 56 | path: vsov/protobuf/install 57 | key: ${{ runner.os }}-vsov-protobuf-v3 58 | 59 | - name: Checkout protobuf 60 | uses: actions/checkout@v4 61 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 62 | with: 63 | repository: protocolbuffers/protobuf 64 | # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf 65 | # if you change this, remember to bump the version of the cache key. 
66 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 67 | fetch-depth: 1 68 | path: vsov/protobuf 69 | 70 | - name: Configure protobuf 71 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 72 | run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA 73 | -D CMAKE_BUILD_TYPE=Release 74 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 75 | 76 | - name: Build protobuf 77 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 78 | run: cmake --build protobuf\build_rel --verbose 79 | 80 | - name: Install protobuf 81 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 82 | run: cmake --install protobuf\build_rel --prefix protobuf\install 83 | 84 | - name: Cache onnx 85 | id: cache-onnx 86 | uses: actions/cache@v4 87 | with: 88 | path: vsov/onnx/install 89 | key: ${{ runner.os }}-vsov-onnx-v3 90 | 91 | - name: Checkout onnx 92 | if: steps.cache-onnx.outputs.cache-hit != 'true' 93 | uses: actions/checkout@v4 94 | with: 95 | repository: onnx/onnx 96 | # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx 97 | # if you change this, remember to bump the version of the cache key. 98 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 99 | fetch-depth: 1 100 | path: vsov/onnx 101 | 102 | - name: Configure onnx 103 | if: steps.cache-onnx.outputs.cache-hit != 'true' 104 | run: cmake -S onnx -B onnx\build -G Ninja -LA 105 | -D CMAKE_BUILD_TYPE=Release 106 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 107 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 108 | -D Protobuf_LIBRARIES=protobuf\install\lib 109 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 110 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 111 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 112 | 113 | - name: Build onnx 114 | if: steps.cache-onnx.outputs.cache-hit != 'true' 115 | run: cmake --build onnx\build --verbose 116 | 117 | - name: Install onnx 118 | if: steps.cache-onnx.outputs.cache-hit != 'true' 119 | run: cmake --install onnx\build --prefix onnx\install 120 | 121 | - name: Download VapourSynth headers 122 | run: | 123 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 124 | unzip -q vs.zip 125 | mv vapoursynth-*/ vapoursynth/ 126 | 127 | - name: Download OpenVINO Runtime Precompilation 128 | shell: bash 129 | run: | 130 | # rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}" 131 | # if [ "$rev" == "latest" ]; then 132 | # url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip" 133 | # else 134 | # url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip" 135 | # fi 136 | url="https://github.com/AmusementClub/openvino/releases/download/2020.2-15171-g4655dd6ce3-2058-g5833781ddb/openvino-gpu-win64.zip" 137 | curl -s -o openvino.zip -LJO "$url" 138 | unzip -q openvino.zip 139 | 140 | - name: Configure 141 | run: cmake -S . 
-B build -G Ninja -D CMAKE_BUILD_TYPE=Release 142 | -D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON 143 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 144 | -D OpenVINO_DIR=openvino/runtime/cmake 145 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 146 | -D ENABLE_VISUALIZATION=ON 147 | -D WIN32_SHARED_OPENVINO=ON 148 | -D protobuf_DIR=protobuf\install\cmake 149 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 150 | 151 | - name: Build 152 | run: cmake --build build --verbose 153 | 154 | - name: Install 155 | run: | 156 | cmake --install build --prefix install 157 | mkdir artifact 158 | mkdir artifact\vsov 159 | copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\ 160 | copy install\bin\vsov.dll artifact\ 161 | xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s 162 | 163 | - name: Upload 164 | uses: actions/upload-artifact@v4 165 | with: 166 | name: VSOV-Windows-x64 167 | path: vsov/artifact 168 | 169 | - name: Setup Python portable 170 | run: | 171 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 172 | 7z x python.zip -ovs_portable 173 | 174 | - name: Install VapourSynth portable 175 | run: | 176 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 177 | 7z x vs.7z -ovs_portable -y 178 | 179 | - name: Copy plugin 180 | run: | 181 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 182 | mkdir vs_portable\vapoursynth64\plugins\vsov\ 183 | copy artifact\vsov\* vs_portable\vapoursynth64\plugins\vsov\ 184 | 185 | - name: Install waifu2x model 186 | run: | 187 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 188 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 189 | 190 | - name: Download x265 191 | run: | 192 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 193 | 7z x x265.7z -ovs_portable\ 194 | 195 | - name: Create script 196 | shell: bash 197 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 198 | 199 | - name: Run vspipe 200 | shell: bash 201 | run: | 202 | set -ex 203 | vs_portable/vspipe -i test.vpy - 204 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 205 | ls -l out.hevc x265.log 206 | cat x265.log 207 | grep -F 'encoded 10 frames' x265.log || exit 2 208 | grep -i 'error' x265.log && exit 1 209 | exit 0 210 | 211 | - name: Create script (fp16) 212 | shell: bash 213 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 214 | 215 | - name: Run vspipe (fp16) 216 | shell: bash 217 | run: | 218 | set -ex 219 | vs_portable/vspipe -i test_fp16.vpy - 220 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 221 | ls -l out.hevc x265.log 222 | cat x265.log 223 | 
grep -F 'encoded 10 frames' x265.log || exit 2 224 | grep -i 'error' x265.log && exit 1 225 | exit 0 226 | 227 | - name: Create script (flexible output) 228 | shell: bash 229 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);prop=\"test\";output=core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output[\"clip\"].std.PropToClip(prop=f\"{prop}{i}\") for i in range(output[\"num_planes\"])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 230 | 231 | - name: Run vspipe (flexible output) 232 | shell: bash 233 | run: | 234 | set -ex 235 | vs_portable/vspipe -i test_flexible_output.vpy - 236 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 237 | ls -l out.hevc x265.log 238 | cat x265.log 239 | grep -F 'encoded 10 frames' x265.log || exit 2 240 | grep -i 'error' x265.log && exit 1 241 | exit 0 242 | 243 | - name: Describe 244 | run: git describe --tags --long 245 | 246 | - name: Dump dependencies 247 | run: dumpbin /dependents artifact\vsov.dll 248 | 249 | - name: Compress artifact for release 250 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 251 | run: | 252 | cd artifact 253 | 7z a -t7z -mx=7 ../../VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z . 254 | 255 | - name: Release 256 | uses: softprops/action-gh-release@v2 257 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 258 | with: 259 | tag_name: ${{ inputs.tag }} 260 | files: VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z 261 | fail_on_unmatched_files: true 262 | generate_release_notes: false 263 | prerelease: true 264 | -------------------------------------------------------------------------------- /.github/workflows/windows-release.yml: -------------------------------------------------------------------------------- 1 | name: Make a Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to create and release?' 8 | required: true 9 | default: 'nightly' 10 | model-tags: 11 | description: 'which tag(s) of model release to use? (comma-separated list of tags)' 12 | required: true 13 | default: 'model-20211209, model-20220923' 14 | ext-model-tags: 15 | description: 'which tag(s) of external model release to use?' 16 | required: true 17 | default: 'external-models' 18 | contrib-model-tags: 19 | description: 'which tag(s) of contributed model release to use?' 
20 | required: true 21 | default: 'contrib-models' 22 | ov_tag: 23 | description: 'which tag of openvino to use' 24 | required: true 25 | default: 'latest' 26 | type: string 27 | ncnn_tag: 28 | description: 'which tag of ncnn to use' 29 | required: true 30 | default: 'latest' 31 | type: string 32 | 33 | jobs: 34 | build-vsov: 35 | uses: ./.github/workflows/windows-ov.yml 36 | with: 37 | tag: ${{ github.event.inputs.tag }} 38 | ov_tag: ${{ github.event.inputs.ov_tag }} 39 | 40 | build-vsort: 41 | uses: ./.github/workflows/windows-ort.yml 42 | with: 43 | tag: ${{ github.event.inputs.tag }} 44 | 45 | build-vstrt: 46 | uses: ./.github/workflows/windows-trt.yml 47 | with: 48 | tag: ${{ github.event.inputs.tag }} 49 | secrets: 50 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 51 | 52 | build-vsmigx: 53 | uses: ./.github/workflows/windows-migx.yml 54 | with: 55 | tag: ${{ github.event.inputs.tag }} 56 | secrets: 57 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 58 | 59 | build-vsncnn: 60 | uses: ./.github/workflows/windows-ncnn.yml 61 | with: 62 | tag: ${{ github.event.inputs.tag }} 63 | ncnn_tag: ${{ github.event.inputs.ncnn_tag }} 64 | 65 | build-cuda-dependency: 66 | uses: ./.github/workflows/windows-cuda-dependency.yml 67 | with: 68 | tag: ${{ github.event.inputs.tag }} 69 | secrets: 70 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 71 | 72 | build-hip-dependency: 73 | uses: ./.github/workflows/windows-hip-dependency.yml 74 | with: 75 | tag: ${{ github.event.inputs.tag }} 76 | secrets: 77 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 78 | 79 | build-scripts: 80 | runs-on: ubuntu-24.04-arm 81 | steps: 82 | - name: Checkout repo 83 | uses: actions/checkout@v4 84 | 85 | - name: Compress scripts.7z 86 | run: | 87 | cd scripts 88 | 7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z . 89 | 90 | - name: Upload scripts release 91 | uses: actions/upload-artifact@v4 92 | with: 93 | name: Scripts 94 | path: scripts 95 | retention-days: 1 96 | 97 | - name: Release scripts 98 | uses: softprops/action-gh-release@v2 99 | with: 100 | tag_name: ${{ github.event.inputs.tag }} 101 | files: scripts.${{ github.event.inputs.tag }}.7z 102 | fail_on_unmatched_files: true 103 | generate_release_notes: false 104 | prerelease: true 105 | 106 | build-models: 107 | runs-on: ubuntu-24.04-arm 108 | steps: 109 | - name: Download Models 110 | run: | 111 | set -ex 112 | mkdir -p release/models 113 | cd release 114 | pushd models 115 | for tag in $(echo "${{ github.event.inputs.model-tags }}" | tr ',' ' '); do 116 | echo "Handling tag $tag" 117 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 118 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 119 | echo "Downloading $url" 120 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 121 | # later release should overwrite earlier ones 122 | 7za x -y dl.7z 123 | done 124 | test -f "dl.7z" 125 | rm -f dl.7z release.json 126 | done 127 | popd 128 | ls -lR 129 | du -sh 130 | 7za a -t7z -bb3 -mx=9 ../models.7z .
131 | 132 | - name: Upload model release 133 | uses: actions/upload-artifact@v4 134 | with: 135 | name: Models 136 | path: release 137 | retention-days: 1 138 | compression-level: 0 139 | 140 | - name: Download External Models 141 | if: false 142 | run: | 143 | rm -rf release 144 | set -ex 145 | mkdir -p release/models 146 | cd release 147 | pushd models 148 | for tag in $(echo "${{ github.event.inputs.ext-model-tags }}" | tr ',' ' '); do 149 | echo "Handling tag $tag" 150 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 151 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 152 | echo "Downloading $url" 153 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 154 | # later release should overwrite earlier ones 155 | 7za x -y dl.7z 156 | done 157 | test -f "dl.7z" 158 | rm -f dl.7z release.json 159 | done 160 | popd 161 | ls -lR 162 | du -sh 163 | 7za a -t7z -bb3 -mx=9 ../ext-models.7z . 164 | 165 | - name: Upload external model release 166 | uses: actions/upload-artifact@v4 167 | if: false 168 | with: 169 | name: External-Models 170 | path: release 171 | retention-days: 1 172 | compression-level: 0 173 | 174 | - name: Download Contributed Models 175 | run: | 176 | rm -rf release 177 | set -ex 178 | mkdir -p release/models 179 | cd release 180 | pushd models 181 | for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do 182 | echo "Handling tag $tag" 183 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 184 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 185 | echo "Downloading $url" 186 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 187 | # later release should overwrite earlier ones 188 | 7za x -y dl.7z 189 | done 190 | #test -f "dl.7z" # contrib-models might be empty. 191 | rm -f dl.7z release.json 192 | done 193 | popd 194 | ls -lR 195 | du -sh 196 | 7za a -t7z -bb3 -mx=9 ../contrib-models.7z . 197 | 198 | - name: Upload contrib model release 199 | uses: actions/upload-artifact@v4 200 | with: 201 | name: Contrib-Models 202 | path: release 203 | retention-days: 1 204 | compression-level: 0 205 |
229 | 230 | - name: Rename release asset 231 | run: | 232 | mv models.7z models.${{ github.event.inputs.tag }}.7z 233 | mv contrib-models.7z contrib-models.${{ github.event.inputs.tag }}.7z 234 | 235 | - name: Release models 236 | uses: softprops/action-gh-release@v2 237 | with: 238 | tag_name: ${{ github.event.inputs.tag }} 239 | files: | 240 | models.${{ github.event.inputs.tag }}.7z 241 | contrib-models.${{ github.event.inputs.tag }}.7z 242 | fail_on_unmatched_files: true 243 | generate_release_notes: false 244 | prerelease: true 245 | 246 | release: 247 | runs-on: ubuntu-24.04-arm 248 | needs: [build-vsov, build-vsort, build-vstrt, build-vsmigx, build-vsncnn, build-cuda-dependency, build-hip-dependency, build-scripts, build-models] 249 | 250 | defaults: 251 | run: 252 | shell: bash 253 | 254 | steps: 255 | - name: Download artifact for scripts 256 | uses: actions/download-artifact@v4 257 | with: 258 | name: Scripts 259 | path: scripts-release 260 | 261 | - name: Download artifact for models 262 | uses: actions/download-artifact@v4 263 | with: 264 | name: Models 265 | path: models-release 266 | 267 | - name: Download artifact for vsov 268 | uses: actions/download-artifact@v4 269 | with: 270 | name: VSOV-Windows-x64 271 | path: vsov-release 272 | 273 | - name: Download artifact for vsort 274 | uses: actions/download-artifact@v4 275 | with: 276 | name: VSORT-Windows-x64 277 | path: vsort-release 278 | 279 | - name: Download artifact for vstrt 280 | uses: actions/download-artifact@v4 281 | with: 282 | name: VSTRT-Windows-x64 283 | path: vstrt-release 284 | 285 | - name: Download artifact for vsmigx 286 | uses: actions/download-artifact@v4 287 | with: 288 | name: VSMIGX-Windows-x64 289 | path: vsmigx-release 290 | 291 | - name: Download artifact for vsncnn 292 | uses: actions/download-artifact@v4 293 | with: 294 | name: VSNCNN-GPU-Windows-x64 295 | path: vsncnn-release 296 | 297 | - name: Download artifact for cuda dependencies 298 | uses: actions/download-artifact@v4 299 | with: 300 | name: vsmlrt-cuda 301 | path: cuda-release 302 | 303 | - name: Download artifact for hip dependencies 304 | uses: actions/download-artifact@v4 305 | with: 306 | name: vsmlrt-hip 307 | path: hip-release 308 | 309 | - name: Build CPU-only release 310 | shell: bash 311 | run: | 312 | mkdir release-cpu 313 | cp -r models-release/models release-cpu/ 314 | cp -r vsov-release/* release-cpu/ 315 | cp -r vsort-release/* release-cpu/ 316 | rm -f release-cpu/vsort/onnxruntime_providers_*.dll 317 | cp scripts-release/*.py release-cpu/ 318 | cd release-cpu 319 | ls -lR 320 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z . 
321 | 322 | - name: Upload CPU-only release 323 | uses: actions/upload-artifact@v4 324 | if: false 325 | with: 326 | name: vsmlrt-cpu-release 327 | path: vsmlrt-windows-x64-cpu.7z 328 | retention-days: 1 329 | compression-level: 0 330 | 331 | - name: Rename release asset 332 | run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z 333 | 334 | - name: Release CPU 335 | uses: softprops/action-gh-release@v2 336 | with: 337 | tag_name: ${{ github.event.inputs.tag }} 338 | files: vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag}}.7z 339 | fail_on_unmatched_files: true 340 | generate_release_notes: false 341 | prerelease: true 342 | 343 | - name: Build generic GPU release 344 | shell: bash 345 | run: | 346 | mkdir release-generic-gpu 347 | cp -r models-release/models release-generic-gpu/ 348 | cp -r vsov-release/* release-generic-gpu/ 349 | cp -r vsort-release/* release-generic-gpu/ 350 | rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll 351 | cp -r vsncnn-release/* release-generic-gpu/ 352 | cp scripts-release/*.py release-generic-gpu/ 353 | cd release-generic-gpu 354 | ls -lR 355 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z . 356 | 357 | - name: Upload generic GPU release 358 | uses: actions/upload-artifact@v4 359 | if: false 360 | with: 361 | name: vsmlrt-generic-gpu-release 362 | path: vsmlrt-windows-x64-generic-gpu.7z 363 | retention-days: 1 364 | compression-level: 0 365 | 366 | - name: Rename release asset for generic GPU release 367 | run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z 368 | 369 | - name: Release generic GPU 370 | uses: softprops/action-gh-release@v2 371 | with: 372 | tag_name: ${{ github.event.inputs.tag }} 373 | files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z 374 | fail_on_unmatched_files: true 375 | generate_release_notes: false 376 | prerelease: true 377 | 378 | - name: Extract CUDA libraries 379 | run: | 380 | cd cuda-release 381 | 7za x -bb3 vsmlrt-cuda.7z 382 | rm vsmlrt-cuda.7z 383 | 384 | - name: Build CUDA release 385 | shell: bash 386 | run: | 387 | mkdir release-cuda 388 | cp -r models-release/models release-cuda/ 389 | cp -r vsov-release/* release-cuda/ 390 | cp -r vsort-release/* release-cuda/ 391 | cp -r vstrt-release/* release-cuda/ 392 | cp -r vsncnn-release/* release-cuda/ 393 | cp -r cuda-release/* release-cuda/ 394 | cp scripts-release/*.py release-cuda/ 395 | cd release-cuda 396 | ls -lR 397 | 7za a -t7z -bb3 -mx=9 -v2000000000b ../vsmlrt-windows-x64-cuda.7z . 
398 | 399 | - name: Upload CUDA release 400 | uses: actions/upload-artifact@v4 401 | if: false 402 | with: 403 | name: vsmlrt-cuda-release 404 | path: | 405 | vsmlrt-windows-x64-cuda.7z.001 406 | vsmlrt-windows-x64-cuda.7z.002 407 | retention-days: 1 408 | compression-level: 0 409 | 410 | - name: Rename release asset for CUDA release 411 | run: | 412 | mv vsmlrt-windows-x64-cuda.7z.001 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.001 413 | mv vsmlrt-windows-x64-cuda.7z.002 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.002 414 | 415 | - name: Release CUDA 416 | uses: softprops/action-gh-release@v2 417 | with: 418 | tag_name: ${{ github.event.inputs.tag }} 419 | files: | 420 | vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.001 421 | vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.002 422 | fail_on_unmatched_files: true 423 | generate_release_notes: false 424 | prerelease: true 425 | 426 | - name: Build TensorRT release 427 | shell: bash 428 | run: | 429 | cd release-cuda 430 | cd vsmlrt-cuda 431 | rm --verbose cublas*.dll cudnn*.dll cufft*.dll cupti*.dll nvblas*.dll 432 | cd .. 433 | rm --verbose vsort/onnxruntime_providers_*.dll 434 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-tensorrt.7z . 435 | 436 | - name: Upload TensorRT release 437 | uses: actions/upload-artifact@v4 438 | if: false 439 | with: 440 | name: vsmlrt-tensorrt-release 441 | path: vsmlrt-windows-x64-tensorrt.7z 442 | retention-days: 1 443 | compression-level: 0 444 | 445 | - name: Rename release asset for TensorRT release 446 | run: mv vsmlrt-windows-x64-tensorrt.7z vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z 447 | 448 | - name: Release TensorRT 449 | uses: softprops/action-gh-release@v2 450 | with: 451 | tag_name: ${{ github.event.inputs.tag }} 452 | files: vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z 453 | fail_on_unmatched_files: true 454 | generate_release_notes: false 455 | prerelease: true 456 | 457 | - name: Extract HIP libraries 458 | run: | 459 | cd hip-release 460 | 7za x -bb3 vsmlrt-hip.7z 461 | rm vsmlrt-hip.7z 462 | 463 | - name: Build MIGraphX release 464 | shell: bash 465 | run: | 466 | mkdir release-hip 467 | cp -r models-release/models release-hip/ 468 | cp -r vsov-release/* release-hip/ 469 | cp -r vsort-release/* release-hip/ 470 | cp -r vsmigx-release/* release-hip/ 471 | cp -r vsncnn-release/* release-hip/ 472 | cp -r hip-release/* release-hip/ 473 | cp scripts-release/*.py release-hip/ 474 | cd release-hip 475 | ls -lR 476 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-migraphx.7z . 477 | 478 | - name: Upload MIGraphX release 479 | uses: actions/upload-artifact@v4 480 | if: false 481 | with: 482 | name: vsmlrt-migraphx-release 483 | path: vsmlrt-windows-x64-migraphx.7z 484 | retention-days: 1 485 | compression-level: 0 486 | 487 | - name: Rename release asset for MIGraphX release 488 | run: mv vsmlrt-windows-x64-migraphx.7z vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 489 | 490 | - name: Release MIGraphX 491 | uses: softprops/action-gh-release@v2 492 | with: 493 | tag_name: ${{ github.event.inputs.tag }} 494 | files: vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 495 | fail_on_unmatched_files: true 496 | generate_release_notes: false 497 | prerelease: true 498 | 499 | # Update nightly tag. 
500 | - name: Checkout repo 501 | if: github.event.inputs.tag == 'nightly' 502 | uses: actions/checkout@v4 503 | with: 504 | fetch-depth: 0 505 | - name: Overwrite tag 506 | if: github.event.inputs.tag == 'nightly' 507 | run: | 508 | git pull --tags --force 509 | git tag -f ${{ github.event.inputs.tag }} 510 | git push -f origin ${{ github.event.inputs.tag }} 511 | -------------------------------------------------------------------------------- /.github/workflows/windows-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/windows-trt.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2025 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vstrt 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Setup Ninja 42 | run: pip install ninja 43 | 44 | - name: Cache CUDA 45 | id: cache-cuda 46 | uses: actions/cache@v4 47 | with: 48 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 49 | key: ${{ runner.os }}-vstrt-cuda-12.9.0 50 | save-always: true 51 | 52 | - name: Setup CUDA 53 | if: steps.cache-cuda.outputs.cache-hit != 'true' 54 | run: | 55 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe 56 | cuda_installer.exe -s nvcc_12.9 cudart_12.9 cuda_profiler_api_12.9 57 | 58 | - name: Download TensorRT 59 | run: | 60 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip 61 | unzip trt.zip 62 | mv TensorRT-*/ tensorrt/ 63 | 64 | - name: Download VapourSynth headers 65 | run: | 66 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 67 | unzip -q vs.zip 68 | mv vapoursynth-*/ vapoursynth/ 69 | 70 | - name: Configure 71 | run: cmake -S . 
-B build -G Ninja -LA 72 | -D CMAKE_BUILD_TYPE=Release 73 | -D CMAKE_CXX_FLAGS="/EHsc /Wall /wd4100 /wd4625 /wd4626 /wd4710 /wd4711 /wd4820 /wd4996 /wd5026 /wd5027" 74 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 75 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" 76 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 77 | -D TENSORRT_HOME="%cd%\tensorrt" 78 | -D USE_NVINFER_PLUGIN=ON 79 | -D TENSORRT_LIBRARY_SUFFIX="_10" 80 | 81 | - name: Build 82 | run: cmake --build build --config Release --verbose 83 | 84 | - name: Install 85 | run: cmake --install build --prefix install 86 | 87 | - name: Checkout TensorRT OSS 88 | uses: actions/checkout@v4 89 | with: 90 | repository: NVIDIA/TensorRT 91 | ref: v10.1.0 92 | fetch-depth: 1 93 | path: tensorrt-oss 94 | 95 | - name: Override trtexec CMake file 96 | run: | 97 | cp -f -r -v tensorrt/samples ../tensorrt-oss 98 | cp -f -r -v tensorrt/include ../tensorrt-oss 99 | 100 | mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec 101 | mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec 102 | mv longpath.manifest ../tensorrt-oss/samples/trtexec 103 | 104 | - name: Apply patch 105 | run: | 106 | mv trtexec/trtexec.patch ../tensorrt-oss 107 | cd ../tensorrt-oss 108 | 109 | copy samples\utils\fileLock.cpp samples\utils\fileLock-utf16le.cpp 110 | powershell "Get-Content samples\utils\fileLock-utf16le.cpp | Out-File samples\utils\fileLock.cpp -Encoding ascii" 111 | git apply trtexec.patch --verbose 112 | copy samples\utils\fileLock.cpp samples\utils\fileLock-utf8.cpp 113 | powershell "Get-Content samples\utils\fileLock-utf8.cpp | Out-File samples\utils\fileLock.cpp" 114 | 115 | - name: Configure trtexec 116 | run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja 117 | -D CMAKE_BUILD_TYPE=Release 118 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 119 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" 120 | -D CMAKE_UNITY_BUILD=ON -D CMAKE_UNITY_BUILD_BATCH_SIZE=0 121 | -D CMAKE_CXX_STANDARD=20 122 | 123 | - name: Build trtexec 124 | run: cmake --build build_trtexec --verbose 125 | 126 | - name: Install trtexec 127 | run: cmake --install build_trtexec --prefix trtexec 128 | 129 | - name: Prepare for upload 130 | run: | 131 | mkdir artifact 132 | copy install\bin\vstrt.dll artifact\ 133 | mkdir artifact\vsmlrt-cuda 134 | copy trtexec\bin\trtexec.exe artifact\vsmlrt-cuda 135 | 136 | - name: Describe 137 | run: git describe --tags --long 138 | 139 | - name: Dump dependencies 140 | run: dumpbin /dependents artifact\vstrt.dll 141 | 142 | - name: Upload 143 | uses: actions/upload-artifact@v4 144 | with: 145 | name: VSTRT-Windows-x64 146 | path: vstrt/artifact 147 | 148 | - name: Compress artifact for release 149 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 150 | run: | 151 | cd artifact 152 | 7z a -t7z -mx=7 ../../VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z . 
153 | 154 | - name: Release 155 | uses: softprops/action-gh-release@v2 156 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 157 | with: 158 | tag_name: ${{ inputs.tag }} 159 | files: VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z 160 | fail_on_unmatched_files: true 161 | generate_release_notes: false 162 | prerelease: true 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-mlrt 2 | 3 | This project provides VapourSynth ML filter runtimes for a variety of platforms: 4 | - x86 CPUs: [vsov-cpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsort-cpu](#vsort-onnx-runtime-based-cpugpu-runtime) 5 | - Intel GPU (both integrated & discrete): [vsov-gpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 6 | - NVidia GPU: [vsort-cuda](#vsort-onnx-runtime-based-cpugpu-runtime), [vstrt](#vstrt-tensorrt-based-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 7 | - AMD GPU: [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime), [vsmigx](#vsmigx-migraphx-based-gpu-runtime) 8 | - Apple SoC: [vsort-coreml](#vsort-onnx-runtime-based-cpugpu-runtime) 9 | 10 | To simplify usage, we also provide a Python wrapper [vsmlrt.py](https://github.com/AmusementClub/vs-mlrt/blob/master/scripts/vsmlrt.py) 11 | for all bundled models and a unified interface to select different backends. 12 | 13 | Please refer to [the wiki](https://github.com/AmusementClub/vs-mlrt/wiki) for supported models & usage information. 14 | 15 | ## vsov: OpenVINO-based Pure CPU & Intel GPU Runtime 16 | 17 | [OpenVINO](https://docs.openvino.ai/latest/index.html) is an AI inference runtime developed 18 | by Intel, mainly targeting x86 CPUs and Intel GPUs. 19 | 20 | The vs-openvino plugin provides optimized *pure* CPU & Intel GPU runtime for some popular AI filters. 21 | Supported Intel GPUs include Gen 8+ integrated graphics (Broadwell and newer) and the Arc series GPUs. 22 | 23 | To install, download the latest release and extract them into your VS `plugins` directory. 24 | 25 | Please visit the [vsov](vsov) directory for details. 26 | 27 | ## vsort: ONNX Runtime-based CPU/GPU Runtime 28 | 29 | [ONNX Runtime](https://onnxruntime.ai/) is an AI inference runtime with many backends. 30 | 31 | The vs-onnxruntime plugin provides optimized CPU and CUDA GPU runtime for some popular AI filters. 32 | 33 | To install, download the latest release and extract them into your VS `plugins` directory. 34 | 35 | Please visit the [vsort](vsort) directory for details. 36 | 37 | ## vstrt: TensorRT-based GPU Runtime 38 | 39 | [TensorRT](https://developer.nvidia.com/tensorrt) is a highly optimized AI inference runtime 40 | for NVidia GPUs. It uses benchmarking to find the optimal kernel for your specific 41 | GPU, so there is an extra step: an engine must be built from the ONNX network on the machine 42 | where the vstrt filter will run. This extra step makes deploying models a little 43 | harder than with the other runtimes. However, the resulting performance is also typically 44 | *much much better* than the CUDA backend of [vsort](vsort). 45 | 46 | To install, download the latest release and extract them into your VS `plugins` directory. 47 | 48 | Please visit the [vstrt](vstrt) directory for details.
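For orientation, here is a minimal sketch of the two-step vstrt workflow; the model name, engine name, and dimensions are illustrative only, and the exact `trtexec` flags and filter parameters are documented in the [vstrt](vstrt) README:

```python3
# Step 1 (command line, run once per GPU/model/resolution; flags illustrative):
#   trtexec --onnx=waifu2x.onnx --optShapes=input:1x3x1080x1920 --saveEngine=waifu2x_1080p.engine
# Step 2 (VapourSynth script): run inference using the prebuilt engine.
import vapoursynth as vs
from vapoursynth import core

src = core.std.BlankClip(width=1920, height=1080, format=vs.RGBS)
flt = core.trt.Model(src, engine_path="waifu2x_1080p.engine", tilesize=[1920, 1080])
flt.set_output()
```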
49 | 50 | ## vsmigx: MIGraphX-based GPU Runtime 51 | 52 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX) is a highly optimized AI inference runtime 53 | for AMD GPUs. It also uses benchmarking to find the optimal kernel, similar to vstrt. 54 | 55 | To install, download the latest release and extract them into your VS `plugins` directory. 56 | 57 | Please visit the [vsmigx](vsmigx) directory for details. 58 | 59 | ## vsncnn: NCNN-based GPU (Vulkan) Runtime 60 | 61 | [ncnn](https://github.com/Tencent/ncnn) is a popular AI inference runtime. [vsncnn](vsncnn) 62 | provides a Vulkan-based runtime for some AI filters. It includes support for on-the-fly 63 | ONNX to ncnn native format conversion so as to provide a unified interface across all 64 | runtimes provided by this project. As it uses the device-independent 65 | [Vulkan](https://en.wikipedia.org/wiki/Vulkan) interface for GPU accelerated inference, 66 | this plugin supports all GPUs that provide a Vulkan interface (NVidia, AMD, and Intel integrated & 67 | discrete GPUs all provide this interface). Another benefit is that it has a significantly 68 | smaller footprint than other GPU runtimes (both the vsort and vstrt CUDA backends require >1GB of 69 | CUDA libraries). The main drawback is that it's slower. 70 | 71 | To install, download the latest release and extract them into your VS `plugins` directory. 72 | 73 | Please visit the [vsncnn](vsncnn) directory for details. 74 | -------------------------------------------------------------------------------- /common/convert_float_to_float16.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERT_FLOAT_TO_FLOAT16_H 2 | #define CONVERT_FLOAT_TO_FLOAT16_H 3 | 4 | #include <string> 5 | #include <unordered_set> 6 | 7 | #include <onnx/onnx_pb.h> 8 | 9 | void convert_float_to_float16( 10 | ONNX_NAMESPACE::ModelProto & model, 11 | bool force_fp16_initializers, 12 | // bool keep_io_types = True, 13 | // bool disable_shape_infer = True, 14 | // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST, 15 | // const std::optional<std::unordered_set<std::string>> op_block_list = {}, 16 | const std::unordered_set<std::string> & op_block_list, 17 | bool cast_input = true, 18 | bool cast_output = true 19 | ) noexcept; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /common/onnx_utils.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdint> 2 | #include <fstream> 3 | #include <optional> 4 | #include <string> 5 | #include <string_view> 6 | #include <variant> 7 | 8 | #include <onnx/onnx_pb.h> 9 | #include <onnx/shape_inference/implementation.h> 10 | 11 | #include "onnx_utils.h" 12 | 13 | 14 | using namespace std::string_literals; 15 | 16 | #ifdef _WIN32 17 | #include <locale> 18 | #include <codecvt> 19 | static inline std::wstring translateName(const char *name) noexcept { 20 | std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; 21 | return converter.from_bytes(name); 22 | } 23 | #else 24 | #define translateName(n) (n) 25 | #endif 26 | 27 | 28 | [[nodiscard]] 29 | static std::optional<std::string> specifyShape( 30 | ONNX_NAMESPACE::ModelProto & model, 31 | int64_t tile_w, 32 | int64_t tile_h, 33 | int64_t batch = 1 34 | ) noexcept { 35 | 36 | if (model.graph().input_size() != 1) { 37 | return "graph must have a single input"; 38 | } 39 | ONNX_NAMESPACE::TensorShapeProto * input_shape { 40 | model 41 | .mutable_graph() 42 | ->mutable_input(0) 43 | ->mutable_type() 44 | ->mutable_tensor_type() 45 | ->mutable_shape() 46 | }; 47 | 48 | if (model.graph().output_size() != 1) { 49 | return "graph must have a single output"; 50 | } 51 | ONNX_NAMESPACE::TensorShapeProto * output_shape { 52 | model 53 | .mutable_graph() 54 |
->mutable_output(0) 55 | ->mutable_type() 56 | ->mutable_tensor_type() 57 | ->mutable_shape() 58 | }; 59 | 60 | constexpr auto n_idx = 0; 61 | constexpr auto h_idx = 2; 62 | constexpr auto w_idx = 3; 63 | 64 | if (input_shape->dim_size() != 4) { 65 | return "input dimension must be 4"; 66 | } 67 | 68 | input_shape->mutable_dim(n_idx)->set_dim_value(batch); 69 | input_shape->mutable_dim(h_idx)->set_dim_value(tile_h); 70 | input_shape->mutable_dim(w_idx)->set_dim_value(tile_w); 71 | 72 | if (output_shape->dim_size() != 4) { 73 | return "output dimension must be 4"; 74 | } 75 | 76 | output_shape->mutable_dim(n_idx)->set_dim_value(batch); 77 | output_shape->mutable_dim(h_idx)->clear_dim_value(); 78 | output_shape->mutable_dim(w_idx)->clear_dim_value(); 79 | 80 | // remove shape info 81 | if (model.graph().value_info_size() != 0) { 82 | model.mutable_graph()->mutable_value_info()->Clear(); 83 | } 84 | 85 | try { 86 | ONNX_NAMESPACE::shape_inference::InferShapes(model); 87 | } catch (const ONNX_NAMESPACE::InferenceError & e) { 88 | return e.what(); 89 | } 90 | 91 | return {}; 92 | } 93 | 94 | 95 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 96 | const std::string_view & path, 97 | int64_t tile_w, 98 | int64_t tile_h, 99 | bool path_is_serialization 100 | ) noexcept { 101 | 102 | ONNX_NAMESPACE::ModelProto onnx_proto; 103 | 104 | if (path_is_serialization) { 105 | if (!onnx_proto.ParseFromArray(path.data(), static_cast<int>(path.size()))) { 106 | return "parse onnx serialization failed"s; 107 | } 108 | } else { 109 | std::ifstream onnx_stream( 110 | translateName(path.data()), 111 | std::ios::binary 112 | ); 113 | 114 | if (!onnx_stream.good()) { 115 | return "open "s + std::string{ path } + " failed"s; 116 | } 117 | 118 | if (!onnx_proto.ParseFromIstream(&onnx_stream)) { 119 | return "parse "s + std::string{ path } + " failed"s; 120 | } 121 | } 122 | 123 | if (auto err = specifyShape(onnx_proto, tile_w, tile_h); err.has_value()) { 124 | return err.value(); 125 | } 126 | 127 | return onnx_proto; 128 | } 129 | -------------------------------------------------------------------------------- /common/onnx_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef ONNX_UTILS_H 2 | #define ONNX_UTILS_H 3 | 4 | #include <cstdint> 5 | #include <string> 6 | #include <string_view> 7 | #include <variant> 8 | 9 | #include <onnx/onnx_pb.h> 10 | 11 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 12 | const std::string_view & path, 13 | int64_t tile_w, 14 | int64_t tile_h, 15 | bool path_is_serialization 16 | ) noexcept; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /vsmigx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-migraphx VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | 9 | find_package(migraphx REQUIRED CONFIG) 10 | find_package(hip REQUIRED CONFIG) 11 | 12 | add_library(vsmigx SHARED vs_migraphx.cpp win32.cpp) 13 | 14 | target_include_directories(vsmigx PRIVATE ${VAPOURSYNTH_INCLUDE_DIRECTORY}) 15 | 16 | target_link_libraries(vsmigx PRIVATE migraphx::c hip::host) 17 | 18 | set_target_properties(vsmigx PROPERTIES 19 | CXX_EXTENSIONS OFF 20 | POSITION_INDEPENDENT_CODE ON 21 | CXX_STANDARD 20 22 | CXX_STANDARD_REQUIRED ON 23 | ) 24 | 25 | if (WIN32) 26 | target_link_options(vsmigx PRIVATE 27 | "/DELAYLOAD:migraphx_c.dll" 28 | "/DELAYLOAD:amdhip64_6.dll" 29 | "delayimp.lib" 30 | ) 31 | endif() 32 | 33 |
find_package(Git REQUIRED) 34 | execute_process( 35 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 36 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 37 | OUTPUT_VARIABLE VCS_TAG 38 | ) 39 | string(STRIP ${VCS_TAG} VCS_TAG) 40 | configure_file(config.h.in config.h) 41 | target_include_directories(vsmigx PUBLIC "${PROJECT_BINARY_DIR}") 42 | 43 | install(TARGETS vsmigx 44 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 45 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 46 | ) 47 | -------------------------------------------------------------------------------- /vsmigx/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth MIGraphX 2 | 3 | The vs-migraphx plugin provides an optimized HIP runtime for some popular AI filters on AMD GPUs. 4 | 5 | ## Usage 6 | 7 | Prototype: `core.migx.Model(clip[] clips, string program_path[, int[] overlap, int[] tilesize, int device_id=0, int num_streams=1, string flexible_output_prop=""])` 8 | 9 | Arguments: 10 | - `clip[] clips`: the input clips, only 16/32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 11 | - `string program_path`: the path to the prebuilt program (see below) 12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support a fixed input shape, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network specific docs on the recommended overlapping size. 13 | - `int[] tilesize`: Even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network specific docs on the recommended tile size. 14 | - `int device_id`: Specifies the GPU device id to use, default 0. Requires AMD GPUs with the gfx1030 target or the RDNA3 architecture onwards ([list](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)). 15 | - `int num_streams`: number of concurrent HIP streams to use. Default 1. Increase it if the GPU is not saturated. 16 | - `string flexible_output_prop`: used to support onnx models with an arbitrary number of output planes. 17 | 18 | ```python3 19 | from typing import TypedDict 20 | 21 | class Output(TypedDict): 22 | clip: vs.VideoNode 23 | num_planes: int 24 | 25 | prop = "planes" # arbitrary non-empty string 26 | output = core.migx.Model(src, program_path, flexible_output_prop=prop) # type: Output 27 | 28 | clip = output["clip"] 29 | num_planes = output["num_planes"] 30 | 31 | output_planes = [ 32 | clip.std.PropToClip(prop=f"{prop}{i}") 33 | for i in range(num_planes) 34 | ] # type: list[vs.VideoNode] 35 | ``` 36 | 37 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 38 | 39 | The general rule is to either: 40 | 1.
leave out `overlap` and `tilesize` entirely and just process the input frame in one tile, or 41 | 2. set all three so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown away so that only internal output pixels are used. 42 | 43 | ## Instructions 44 | 45 | ### Build program 46 | ```shell 47 | migraphx-driver compile --onnx drunet_gray.onnx --gpu --input-dim @input 1 2 1080 1920 --output dpir_gray_1080p.mxr 48 | ``` 49 | 50 | The program can be applied to `1920x1080` input. 51 | 52 | Also check [migraphx-driver useful arguments](#migraphx-driver-useful-arguments). 53 | 54 | ### Run model 55 | In a vpy script: 56 | ```python3 57 | # DPIR 58 | src = core.std.BlankClip(src, width=1920, height=1080, format=vs.GRAYS) 59 | sigma = 10.0 60 | flt = core.migx.Model([src, core.std.BlankClip(src, color=sigma/255.0)], program_path="dpir_gray_1080p.mxr", tilesize=[1920, 1080]) 61 | ``` 62 | 63 | ## migraphx-driver useful arguments 64 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled) 65 | 66 | - `--output <file>`: Save the serialized program 67 | 68 | - `--migraphx <file>`: Load a serialized program 69 | 70 | - `--optimize`: Performs common graph optimizations 71 | 72 | - `--exhaustive-tune`: Enables exhaustive search to find the fastest kernel 73 | 74 | - `--disable-fast-math`: Disable fast math optimization 75 | 76 | Also check the [full list of options](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/migraphx-driver.html#options) and [environment variables](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/dev/env_vars.html). 77 | 78 | -------------------------------------------------------------------------------- /vsmigx/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsmigx/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsmlrt-hip" 10 | 11 | #include <delayimp.h> 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be sorted by dependency.
16 | L"amdhip64_6.dll", 17 | L"migraphx.dll", 18 | L"migraphx_tf.dll", 19 | L"migraphx_onnx.dll", 20 | L"migraphx_c.dll", // must be the last 21 | }; 22 | 23 | namespace fs = std::filesystem; 24 | static fs::path dllDir() { 25 | static const std::wstring res = []() -> std::wstring { 26 | HMODULE mod = 0; 27 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 28 | std::vector buf; 29 | size_t n = 0; 30 | do { 31 | buf.resize(buf.size() + MAX_PATH); 32 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 33 | } while (n >= buf.size()); 34 | buf.resize(n); 35 | std::wstring path(buf.begin(), buf.end()); 36 | return path; 37 | } 38 | throw std::runtime_error("unable to locate myself"); 39 | }(); 40 | return fs::path(res).parent_path(); 41 | } 42 | 43 | FARPROC loadDLLs() { 44 | fs::path dir = dllDir() / DLL_DIR; 45 | HMODULE h = nullptr; 46 | for (const auto dll: dlls) { 47 | fs::path p = dir / dll; 48 | std::wstring s = p; 49 | h = LoadLibraryW(s.c_str()); 50 | if (getenv("VSMIGX_VERBOSE")) 51 | std::wcerr << L"vsmigx: preloading " << p << L": " << h << std::endl; 52 | if (!h) 53 | std::wcerr << L"vsmigx: failed to preload " << s << std::endl; 54 | } 55 | return (FARPROC)h; 56 | } 57 | 58 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 59 | switch (reason) { 60 | case dliNoteStartProcessing: 61 | case dliNoteEndProcessing: 62 | // Nothing to do here. 63 | break; 64 | case dliNotePreLoadLibrary: 65 | //std::cerr << "loading " << info->szDll << std::endl; 66 | if (std::string(info->szDll).find("migraphx_c.dll") != std::string::npos || 67 | std::string(info->szDll).find("amdhip64_6.dll") != std::string::npos 68 | ) 69 | return loadDLLs(); 70 | break; 71 | case dliNotePreGetProcAddress: 72 | // Nothing to do here. 73 | break; 74 | case dliFailLoadLib: 75 | case dliFailGetProc: 76 | // Returning NULL from error notifications will cause the delay load 77 | // runtime to raise a VcppException structured exception, that some code 78 | // might want to handle. 79 | return NULL; 80 | break; 81 | default: 82 | abort(); // unreachable. 83 | break; 84 | } 85 | // Returning NULL causes the delay load machinery to perform default 86 | // processing for this notification. 
87 | return NULL; 88 | } 89 | } // namespace 90 | 91 | extern "C" { 92 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 93 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 94 | }; 95 | #endif 96 | -------------------------------------------------------------------------------- /vsncnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ncnn VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | 7 | find_package(protobuf REQUIRED CONFIG) 8 | find_package(ONNX REQUIRED CONFIG) 9 | find_package(ncnn REQUIRED CONFIG) 10 | 11 | add_library(vsncnn SHARED vs_ncnn.cpp onnx2ncnn.cpp ../common/onnx_utils.cpp) 12 | 13 | target_include_directories(vsncnn PRIVATE 14 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 15 | ${ONNX_INCLUDE_DIRS} 16 | ) 17 | 18 | target_link_libraries(vsncnn PRIVATE onnx ncnn) 19 | 20 | set_target_properties(vsncnn PROPERTIES 21 | POSITION_INDEPENDENT_CODE ON 22 | CXX_EXTENSIONS OFF 23 | CXX_STANDARD 17 24 | CXX_STANDARD_REQUIRED ON 25 | ) 26 | 27 | if (CMAKE_CXX_STANDARD GREATER 17) 28 | set_target_properties(vsncnn PROPERTIES CXX_STANDARD ${CMAKE_CXX_STANDARD}) 29 | endif() 30 | 31 | target_include_directories(vsncnn PUBLIC 32 | "${PROJECT_BINARY_DIR}" 33 | ) 34 | 35 | find_package(Git REQUIRED) 36 | execute_process( 37 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 38 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 39 | OUTPUT_VARIABLE VCS_TAG 40 | ) 41 | string(STRIP ${VCS_TAG} VCS_TAG) 42 | configure_file(config.h.in config.h) 43 | 44 | install(TARGETS vsncnn 45 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 46 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 47 | ) 48 | -------------------------------------------------------------------------------- /vsncnn/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsncnn/onnx2ncnn.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ONNX2NCNN_HPP 2 | #define ONNX2NCNN_HPP 3 | 4 | #include <optional> 5 | #include <utility> 6 | #include <vector> 7 | 8 | #include <onnx/onnx_pb.h> 9 | 10 | extern std::optional<std::pair<std::vector<char>, std::vector<char>>> onnx2ncnn(ONNX_NAMESPACE::ModelProto & model); 11 | 12 | #endif // ONNX2NCNN_HPP 13 | -------------------------------------------------------------------------------- /vsort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ort VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers") 7 | set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries") 8 | 9 | set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend") 10 | set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend") 11 | set(ENABLE_COREML OFF CACHE BOOL "Enable CoreML support") 12 | 13 | find_package(protobuf REQUIRED CONFIG) 14 | find_package(ONNX REQUIRED CONFIG) 15 | 16 | add_library(vsort SHARED 17 | vs_onnxruntime.cpp 18 | win32.cpp 19 | ../common/onnx_utils.cpp 20 | ../common/convert_float_to_float16.cpp 21 | ) 22 | 23 | target_include_directories(vsort PRIVATE 24 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 25 | ${ONNX_INCLUDE_DIRS} 26 | ${ONNX_RUNTIME_API_DIRECTORY} 27 | ) 28 | 29 |
target_link_directories(vsort PRIVATE 30 | ${ONNX_RUNTIME_LIB_DIRECTORY} 31 | ) 32 | 33 | set_target_properties(vsort PROPERTIES 34 | POSITION_INDEPENDENT_CODE ON 35 | CXX_EXTENSIONS OFF 36 | CXX_STANDARD 17 37 | CXX_STANDARD_REQUIRED ON) 38 | 39 | if (CMAKE_CXX_STANDARD GREATER_EQUAL 20) 40 | set_target_properties(vsort PROPERTIES CXX_STANDARD 20) 41 | endif() 42 | 43 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 44 | if (ONNX_VERSION VERSION_LESS 1.16.0) 45 | target_link_libraries(vsort PRIVATE onnx) 46 | else() 47 | target_link_libraries(vsort PRIVATE ONNX::onnx) 48 | endif() 49 | 50 | target_link_libraries(vsort PRIVATE onnxruntime) 51 | 52 | if (ENABLE_CUDA) 53 | find_package(CUDAToolkit REQUIRED) 54 | 55 | add_compile_definitions(ENABLE_CUDA) 56 | target_include_directories(vsort PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 57 | target_link_libraries(vsort PRIVATE CUDA::cudart_static) 58 | 59 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 60 | target_link_options(vsort PRIVATE "/DELAYLOAD:onnxruntime.dll" "delayimp.lib") 61 | endif() 62 | endif() 63 | 64 | if (ENABLE_DML) 65 | add_compile_definitions(ENABLE_DML) 66 | endif() 67 | 68 | if(ENABLE_COREML) 69 | add_compile_definitions(ENABLE_COREML=1) 70 | endif() 71 | 72 | target_include_directories(vsort PUBLIC 73 | "${PROJECT_BINARY_DIR}" 74 | ) 75 | 76 | find_package(Git REQUIRED) 77 | execute_process( 78 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 79 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 80 | OUTPUT_VARIABLE VCS_TAG 81 | ) 82 | string(STRIP ${VCS_TAG} VCS_TAG) 83 | configure_file(config.h.in config.h) 84 | 85 | install(TARGETS vsort 86 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 87 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 88 | ) 89 | -------------------------------------------------------------------------------- /vsort/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth ONNX Runtime 2 | 3 | The vs-onnxruntime plugin provides an optimized CPU & CUDA runtime for some popular AI filters. 4 | 5 | ## Building and Installation 6 | 7 | To build, you will need [ONNX Runtime](https://www.onnxruntime.ai/), [protobuf](https://github.com/protocolbuffers/protobuf), [ONNX](https://github.com/onnx/onnx) and their dependencies. 8 | 9 | Please refer to [ONNX Runtime Docs](https://onnxruntime.ai/docs/install/) for installation notes. 10 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/onnxruntime/releases/latest/). 11 | 12 | Please refer to our [GitHub Actions workflow](../.github/workflows/windows-ort.yml) for sample building instructions. 13 | 14 | If you only use the CPU backend, then you just need to extract the binary release into your `vapoursynth/plugins` directory. 15 | 16 | However, if you also use the CUDA backend, you will need to download some CUDA libraries as well; please see the release page for details. Those CUDA libraries also need to be extracted into the `vapoursynth/plugins` directory. The plugin will try to load them from the `vapoursynth/plugins/vsort/` directory or the `vapoursynth/plugins/vsmlrt-cuda/` directory.
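As an illustration, a typical Windows layout might look like the sketch below (the directory names follow the loading rules just described; the exact DLL set depends on the release you download):

```
vapoursynth/plugins/
├── vsort.dll
├── vsort/          onnxruntime.dll, DirectML.dll, ...
└── vsmlrt-cuda/    shared CUDA libraries (cudart64_*.dll, cublas64_*.dll, cudnn64_*.dll, ...)
```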
17 | 18 | ## Usage 19 | 20 | Prototype: `core.ort.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string provider = "", int device_id = 0, int verbosity = 2, bint cudnn_benchmark = True, bint builtin = False, string builtindir="models", bint fp16 = False, bint path_is_serialization = False, bint use_cuda_graph = False])` 21 | 22 | Arguments: 23 | - `clip[] clips`: the input clips; only 32-bit floating point RGB or GRAY clips are supported. For model-specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 24 | - `string network_path`: the path to the network in ONNX format. 25 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, and the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlapping size. 26 | - `int[] tilesize`: Even for CNNs, where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network-specific docs on the recommended tile size. 27 | - `string provider`: Specifies the device to run the inference on. 28 | - `"CPU"` or `""`: pure CPU backend 29 | - `"CUDA"`: CUDA GPU backend, requires Nvidia Maxwell+ GPUs. 30 | - `"DML"`: DirectML backend 31 | - `"COREML"`: CoreML backend 32 | - `int device_id`: selects the GPU device for the CUDA backend. 33 | - `int verbosity`: specifies the verbosity of logging; the default is warnings. 34 | - 0: fatal errors only, `ORT_LOGGING_LEVEL_FATAL` 35 | - 1: also errors, `ORT_LOGGING_LEVEL_ERROR` 36 | - 2: also warnings, `ORT_LOGGING_LEVEL_WARNING` 37 | - 3: also info, `ORT_LOGGING_LEVEL_INFO` 38 | - 4: everything, `ORT_LOGGING_LEVEL_VERBOSE` 39 | - `bint cudnn_benchmark`: whether to let cuDNN use benchmarking to search for the best convolution kernel to use. Default True. It might incur some startup latency. 40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`. 41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models". 42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation. 43 | - `bint path_is_serialization`: whether the `network_path` argument specifies an ONNX serialization of type `bytes`. 44 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead in the CUDA backend. Not all models are supported. 45 | - `int ml_program`: selects the CoreML provider. 46 | - 0: NeuralNetwork 47 | - 1: MLProgram 48 | 49 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 50 | 51 | The general rule is to either: 52 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or 53 | 2.
set both parameters so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped regions will be thrown away so that only internal output pixels are used. 54 | -------------------------------------------------------------------------------- /vsort/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsort/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define DLL_DIR L"vsort" 12 | #define COMMON_CUDA_DIR L"vsmlrt-cuda" 13 | 14 | namespace { 15 | std::vector<const wchar_t *> dlls = { 16 | // This list must be sorted by dependency. 17 | L"DirectML.dll", 18 | L"onnxruntime.dll", // must be the last 19 | }; 20 | 21 | static std::vector<const wchar_t *> cudaDlls { 22 | L"cudart64", 23 | L"cublasLt64", L"cublas64", 24 | L"cufft64", 25 | L"cudnn_ops_infer64", L"cudnn_cnn_infer64", L"cudnn_adv_infer64", L"cudnn64", 26 | L"cupti64", 27 | }; 28 | 29 | bool verbose() { return getenv("VSORT_VERBOSE") != nullptr; } 30 | 31 | namespace fs = std::filesystem; 32 | static fs::path dllDir() { 33 | static const std::wstring res = []() -> std::wstring { 34 | HMODULE mod = 0; 35 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 36 | std::vector<wchar_t> buf; 37 | size_t n = 0; 38 | do { 39 | buf.resize(buf.size() + MAX_PATH); 40 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 41 | } while (n >= buf.size()); 42 | buf.resize(n); 43 | std::wstring path(buf.begin(), buf.end()); 44 | return path; 45 | } 46 | throw std::runtime_error("unable to locate myself"); 47 | }(); 48 | return fs::path(res).parent_path(); 49 | } 50 | 51 | FARPROC loadDLLs() { 52 | fs::path dir = dllDir() / DLL_DIR; 53 | HMODULE h = nullptr; 54 | for (const auto dll: dlls) { 55 | fs::path p = dir / dll; 56 | std::wstring s = p; 57 | h = LoadLibraryW(s.c_str()); 58 | if (verbose()) 59 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl; 60 | if (!h) 61 | std::wcerr << DLL_DIR << L": failed to preload " << s << std::endl; 62 | } 63 | return (FARPROC)h; 64 | } 65 | 66 | static void *dummy() { // mimic OrtGetApiBase 67 | return nullptr; 68 | } 69 | 70 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 71 | switch (reason) { 72 | case dliNoteStartProcessing: 73 | case dliNoteEndProcessing: 74 | // Nothing to do here. 75 | break; 76 | case dliNotePreLoadLibrary: 77 | //std::cerr << "loading " << info->szDll << std::endl; 78 | if (std::string(info->szDll).find("onnxruntime.dll") != std::string::npos) 79 | return loadDLLs(); 80 | break; 81 | case dliNotePreGetProcAddress: 82 | // Nothing to do here. 83 | break; 84 | case dliFailLoadLib: 85 | case dliFailGetProc: 86 | // Returning NULL from error notifications will cause the delay load 87 | // runtime to raise a VcppException structured exception, that some code 88 | // might want to handle. 89 | // The SE will crash the process, so instead we return a dummy function. 90 | return (FARPROC)dummy; 91 | break; 92 | default: 93 | abort(); // unreachable.
94 | break; 95 | } 96 | // Returning NULL causes the delay load machinery to perform default 97 | // processing for this notification. 98 | return NULL; 99 | } 100 | } // namespace 101 | 102 | extern "C" { 103 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 104 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 105 | }; 106 | 107 | bool preloadCudaDlls() { 108 | std::map<std::wstring, fs::path> dllmap; 109 | 110 | auto findDllIn = [&](const std::filesystem::path &dir) { 111 | if (!std::filesystem::is_directory(dir)) 112 | return; 113 | for (const auto &ent: std::filesystem::directory_iterator{dir}) { 114 | if (!ent.is_regular_file()) 115 | continue; 116 | const auto path = ent.path(); 117 | if (path.extension() != ".dll") 118 | continue; 119 | const std::wstring filename = path.filename().wstring(); 120 | for (const auto &dll: cudaDlls) { 121 | if (dllmap.count(dll) > 0) 122 | continue; 123 | if (filename.find(dll) == 0) { 124 | if (verbose()) 125 | std::wcerr << DLL_DIR << L": found " << path << L" for " << dll << std::endl; 126 | dllmap.insert({ dll, path }); 127 | break; 128 | } 129 | } 130 | } 131 | }; 132 | const fs::path dir = dllDir(); 133 | findDllIn(dir / DLL_DIR); 134 | findDllIn(dir / COMMON_CUDA_DIR); 135 | 136 | if (verbose()) { 137 | for (const auto &pair: dllmap) 138 | std::wcerr << DLL_DIR << L": will load " << pair.first << L" from " << pair.second << std::endl; 139 | } 140 | for (const auto &dll: cudaDlls) { 141 | if (dllmap.count(dll) == 0) { 142 | if (verbose()) 143 | std::wcerr << DLL_DIR << L": unable to preload " << dll << L": not found" << std::endl; 144 | return false; // a required CUDA DLL is missing, regardless of verbosity 145 | } 146 | 147 | std::wstring p = dllmap[dll]; 148 | HMODULE h = LoadLibraryW(p.c_str()); 149 | if (verbose()) 150 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl; 151 | if (!h) return false; 152 | } 153 | return true; 154 | } 155 | #endif 156 | -------------------------------------------------------------------------------- /vsov/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ov VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualization") 7 | set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library") 8 | 9 | find_package(OpenVINO REQUIRED CONFIG) 10 | 11 | add_library(vsov SHARED 12 | vs_openvino.cpp 13 | win32.cpp 14 | ../common/onnx_utils.cpp 15 | ../common/convert_float_to_float16.cpp 16 | ) 17 | 18 | if(ENABLE_VISUALIZATION) 19 | target_compile_definitions(vsov PRIVATE ENABLE_VISUALIZATION) 20 | endif() 21 | 22 | if(WIN32_SHARED_OPENVINO) 23 | target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO) 24 | endif() 25 | 26 | find_package(protobuf REQUIRED CONFIG) 27 | find_package(ONNX REQUIRED CONFIG) 28 | 29 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 30 | if (ONNX_VERSION VERSION_LESS 1.16.0) 31 | target_link_libraries(vsov PRIVATE onnx) 32 | else() 33 | target_link_libraries(vsov PRIVATE ONNX::onnx) 34 | endif() 35 | 36 | target_include_directories(vsov PRIVATE 37 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 38 | ${ONNX_INCLUDE_DIRS} 39 | ) 40 | 41 | target_link_libraries(vsov PRIVATE openvino::runtime) 42 | 43 | set_target_properties(vsov PROPERTIES 44 | CXX_EXTENSIONS OFF 45 | CXX_STANDARD 17 46 | CXX_STANDARD_REQUIRED ON 47 | ) 48 | 49 | if (WIN32) 50 |
if(WIN32_SHARED_OPENVINO) 51 | target_link_options(vsov PRIVATE "/DELAYLOAD:openvino.dll" "delayimp.lib") 52 | else() 53 | target_link_options(vsov PRIVATE "/DELAYLOAD:tbb.dll" "delayimp.lib") 54 | endif() 55 | endif() 56 | 57 | target_include_directories(vsov PUBLIC 58 | "${PROJECT_BINARY_DIR}" 59 | ) 60 | 61 | find_package(Git REQUIRED) 62 | execute_process( 63 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 64 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 65 | OUTPUT_VARIABLE VCS_TAG 66 | ) 67 | string(STRIP ${VCS_TAG} VCS_TAG) 68 | configure_file(config.h.in config.h) 69 | 70 | install(TARGETS vsov 71 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 72 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 73 | ) 74 | -------------------------------------------------------------------------------- /vsov/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth OpenVINO 2 | 3 | The vs-openvino plugin provides an optimized *pure* CPU runtime for some popular AI filters. 4 | 5 | ## Building and Installation 6 | 7 | To build, you will need [OpenVINO](https://docs.openvino.ai/latest/get_started.html) and its dependencies. 8 | Only `Model Optimizer` and `Inference Engine` are required. 9 | 10 | You can download official Intel releases: 11 | - [Linux](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_linux_header.html) 12 | - [Windows](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_windows_header.html) 13 | - [macOS](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_macos_header.html) 14 | 15 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/openvino/releases/latest/); our release has the benefit of static linking support. 16 | 17 | Sample cmake commands to build: 18 | ```bash 19 | cmake -S . -B build -G Ninja -D CMAKE_BUILD_TYPE=Release \ 20 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded \ 21 | -D InferenceEngine_DIR=openvino/runtime/cmake \ 22 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="path/to/vapoursynth/include" 23 | cmake --build build 24 | cmake --install build --prefix install 25 | ``` 26 | You should find `vsov.dll` (or `libvsov.so`) under `install/bin`. You will also need Intel TBB (you can get 27 | `tbb.dll` from the OpenVINO release). On Windows, `tbb.dll` must be placed under the `vapoursynth/plugins/vsov/` 28 | directory for `vsov.dll` to find it. 29 | 30 | ## Usage 31 | 32 | Prototype: `core.ov.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string device = "CPU", bint builtin = 0, string builtindir="models", bint fp16 = False, function config = None, bint path_is_serialization = False])` 33 | 34 | Arguments: 35 | - `clip[] clips`: the input clips; only 32-bit floating point RGB or GRAY clips are supported. For model-specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 36 | - `string network_path`: the path to the network in ONNX format. 37 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, and the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlapping size.
38 | - `int[] tilesize`: Even for CNNs, where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network-specific docs on the recommended tile size. 39 | - `string device`: Specifies the device to run the inference on. Currently `"CPU"` and `"GPU"` are supported. `"GPU"` requires Intel graphics (Broadwell+ processors with Gen8+ integrated GPUs or Xe discrete GPUs) with a compatible graphics driver and compute runtime. 40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`. 41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models". 42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation. 43 | - `function config`: plugin configuration parameters. It must be a callable object (e.g. a function) with no positional arguments that returns the configuration parameters in a dictionary `dict`. The dictionary must use string `str` for its keys and `int`, `float` or `str` for its values. Supported parameters: [CPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_CPU.html#supported-configuration-parameters), [GPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_GPU.html#supported-configuration-parameters) (the prefix `KEY_` has to be removed). Example: `config = lambda: dict(CPU_THROUGHPUT_STREAMS=2)` 44 | - `bint path_is_serialization`: whether the `network_path` argument specifies an ONNX serialization of type `bytes`. 45 | 46 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 47 | 48 | The general rule is to either: 49 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or 50 | 2. set both parameters so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped regions will be thrown away so that only internal output pixels are used. 51 | -------------------------------------------------------------------------------- /vsov/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsov/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DLL_DIR L"vsov" 10 | 11 | #include 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be sorted by dependency.
16 | #ifdef WIN32_SHARED_OPENVINO 17 | L"tbb12.dll", 18 | L"openvino.dll", // must be the last 19 | #else // WIN32_SHARED_OPENVINO 20 | L"tbb12.dll", // must be the last 21 | #endif // WIN32_SHARED_OPENVINO 22 | }; 23 | 24 | namespace fs = std::filesystem; 25 | static fs::path dllDir() { 26 | static const std::wstring res = []() -> std::wstring { 27 | HMODULE mod = 0; 28 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 29 | std::vector buf; 30 | size_t n = 0; 31 | do { 32 | buf.resize(buf.size() + MAX_PATH); 33 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 34 | } while (n >= buf.size()); 35 | buf.resize(n); 36 | std::wstring path(buf.begin(), buf.end()); 37 | return path; 38 | } 39 | throw std::runtime_error("unable to locate myself"); 40 | }(); 41 | return fs::path(res).parent_path(); 42 | } 43 | 44 | FARPROC loadDLLs() { 45 | fs::path dir = dllDir() / DLL_DIR; 46 | HMODULE h = nullptr; 47 | for (const auto dll: dlls) { 48 | fs::path p = dir / dll; 49 | std::wstring s = p; 50 | h = LoadLibraryW(s.c_str()); 51 | if (getenv("VSOV_VERBOSE")) 52 | std::wcerr << L"vsov: preloading " << p << L": " << h << std::endl; 53 | if (!h) 54 | std::wcerr << L"vsov: failed to preload " << s << std::endl; 55 | } 56 | return (FARPROC)h; 57 | } 58 | 59 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 60 | switch (reason) { 61 | case dliNoteStartProcessing: 62 | case dliNoteEndProcessing: 63 | // Nothing to do here. 64 | break; 65 | case dliNotePreLoadLibrary: 66 | //std::cerr << "loading " << info->szDll << std::endl; 67 | #ifdef WIN32_SHARED_OPENVINO 68 | if (std::string(info->szDll).find("openvino.dll") != std::string::npos) 69 | return loadDLLs(); 70 | #else // WIN32_SHARED_OPENVINO 71 | if (std::string(info->szDll).find("tbb.dll") != std::string::npos) 72 | return loadDLLs(); 73 | #endif // WIN32_SHARED_OPENVINO 74 | break; 75 | case dliNotePreGetProcAddress: 76 | // Nothing to do here. 77 | break; 78 | case dliFailLoadLib: 79 | case dliFailGetProc: 80 | // Returning NULL from error notifications will cause the delay load 81 | // runtime to raise a VcppException structured exception, that some code 82 | // might want to handle. 83 | return NULL; 84 | break; 85 | default: 86 | abort(); // unreachable. 87 | break; 88 | } 89 | // Returning NULL causes the delay load machinery to perform default 90 | // processing for this notification. 
91 | return NULL; 92 | } 93 | } // namespace 94 | 95 | extern "C" { 96 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 97 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 98 | }; 99 | #endif 100 | -------------------------------------------------------------------------------- /vstrt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-trt VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT") 9 | option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE) 10 | option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE) 11 | set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix") 12 | 13 | FIND_PACKAGE(CUDAToolkit REQUIRED) 14 | 15 | add_library(vstrt SHARED 16 | $<$: longpath.manifest> 17 | vs_tensorrt.cpp 18 | win32.cpp 19 | ) 20 | 21 | target_include_directories(vstrt PRIVATE 22 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 23 | ${CUDAToolkit_INCLUDE_DIRS} 24 | ${TENSORRT_HOME}/include 25 | ) 26 | 27 | set_target_properties(vstrt PROPERTIES 28 | CXX_EXTENSIONS OFF 29 | POSITION_INDEPENDENT_CODE ON 30 | CXX_STANDARD 20 31 | CXX_STANDARD_REQUIRED ON 32 | ) 33 | 34 | target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib) 35 | target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}") 36 | 37 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 38 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") 39 | endif() 40 | 41 | if (USE_NVINFER_PLUGIN) 42 | add_definitions(-DUSE_NVINFER_PLUGIN) 43 | if (USE_NVINFER_PLUGIN_STATIC) 44 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}") 45 | else() 46 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}") 47 | 48 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 49 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll") 50 | endif() 51 | endif() 52 | endif() 53 | 54 | target_include_directories(vstrt PUBLIC 55 | "${PROJECT_BINARY_DIR}" 56 | ) 57 | 58 | find_package(Git REQUIRED) 59 | execute_process( 60 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 61 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 62 | OUTPUT_VARIABLE VCS_TAG 63 | ) 64 | string(STRIP ${VCS_TAG} VCS_TAG) 65 | configure_file(config.h.in config.h) 66 | 67 | install(TARGETS vstrt 68 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 69 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 70 | ) 71 | -------------------------------------------------------------------------------- /vstrt/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth TensorRT 2 | 3 | The vs-tensorrt plugin provides optimized CUDA runtime for some popular AI filters. 4 | 5 | ## Usage 6 | 7 | Prototype: `core.trt.Model(clip[] clips, string engine_path[, int[] overlap, int[] tilesize, int device_id=0, bint use_cuda_graph=False, int num_streams=1, int verbosity=2, string flexible_output_prop=""])` 8 | 9 | Arguments: 10 | - `clip[] clips`: the input clips, only 32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 
11 | - `string engine_path`: the path to the prebuilt engine (see below). 12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, and the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlapping size. 13 | - `int[] tilesize`: Even for CNNs, where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network-specific docs on the recommended tile size. 14 | - `int device_id`: Specifies the GPU device id to use, default 0. Requires Nvidia GPUs with second-generation Kepler architecture onwards. 15 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead. 16 | - `int num_streams`: number of concurrent CUDA streams to use. Default 1. Increase it if the GPU is not saturated. 17 | - `int verbosity`: the verbosity level of the TensorRT runtime. Messages are written to `stderr`. 18 | `0`: Internal error. `1`: Application error. `2`: Warning. `3`: Informational messages with instructional information. `4`: Verbose messages with debugging information. 19 | - `string flexible_output_prop`: used to support ONNX models with an arbitrary number of output planes. 20 | 21 | ```python3 22 | from typing import TypedDict 23 | 24 | class Output(TypedDict): 25 | clip: vs.VideoNode 26 | num_planes: int 27 | 28 | prop = "planes" # arbitrary non-empty string 29 | output = core.trt.Model(src, engine_path, flexible_output_prop=prop) # type: Output 30 | 31 | clip = output["clip"] 32 | num_planes = output["num_planes"] 33 | 34 | output_planes = [ 35 | clip.std.PropToClip(prop=f"{prop}{i}") 36 | for i in range(num_planes) 37 | ] # type: list[vs.VideoNode] 38 | ``` 39 | 40 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 41 | 42 | The general rule is to either: 43 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or 44 | 2. set both parameters so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped regions will be thrown away so that only internal output pixels are used. 45 | 46 | ## Instructions 47 | 48 | ### Build engine with dynamic shape support 49 | - Requires models with built-in dynamic shape support, e.g. `waifu2x_v3.7z` and `dpir_v3.7z`. 50 | 51 | 1. Build engine 52 | ```shell 53 | trtexec --onnx=drunet_gray.onnx --minShapes=input:1x2x8x8 --optShapes=input:1x2x64x64 --maxShapes=input:1x2x1080x1920 --saveEngine=dpir_gray_1080p_dynamic.engine 54 | ``` 55 | 56 | The engine will be optimized for `64x64` input and can be applied to eligible inputs with shapes from `8x8` to `1920x1080` by specifying the `tilesize` parameter in the `trt` plugin.
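If the ONNX model itself has static input dimensions, the shape flags can be omitted entirely; a minimal sketch (the model filename is illustrative, and `--fp16` is optional, see the argument list below):

```shell
trtexec --onnx=model_static.onnx --fp16 --saveEngine=model_static.engine
```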
57 | 58 | Also check [trtexec useful arguments](#trtexec-useful-arguments). 59 | 60 | ### Run model 61 | In a vpy script: 62 | ```python3 63 | # DPIR 64 | src = core.std.BlankClip(src, width=640, height=360, format=vs.GRAYS) 65 | sigma = 10.0 66 | flt = core.trt.Model([src, core.std.BlankClip(src, color=sigma/255.0)], engine_path="dpir_gray_1080p_dynamic.engine", tilesize=[640, 360]) 67 | ``` 68 | 69 | ## trtexec useful arguments 70 | - `--workspace=N`: Set workspace size in megabytes (default = 16) 71 | 72 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled) 73 | 74 | - `--noTF32`: Disable tf32 precision (default is to enable tf32, in addition to fp32, Ampere only) 75 | 76 | - `--device=N`: Select cuda device N (default = 0) 77 | 78 | - `--timingCacheFile=`: Save/load the serialized global timing cache 79 | 80 | - `--buildOnly`: Skip inference perf measurement (default = disabled) 81 | 82 | - `--verbose`: Use verbose logging (default = false) 83 | 84 | - `--profilingVerbosity=mode`: Specify profiling verbosity. 85 | 86 | ``` 87 | mode ::= layer_names_only|detailed|none 88 | ``` 89 | 90 | (default = layer_names_only) 91 | 92 | - `--tacticSources=tactics`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default 93 | 94 | tactic sources (default = all available tactics). 95 | 96 | Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics. 97 | 98 | Tactic Sources: 99 | ``` 100 | tactics ::= [","tactic] 101 | tactic ::= (+|-)lib 102 | lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN" 103 | ``` 104 | 105 | For example, to disable cudnn and enable cublas: `--tacticSources=-CUDNN,+CUBLAS` 106 | 107 | - `--useCudaGraph`: Use CUDA graph to capture engine execution and then launch inference (default = disabled). 108 | This flag may be ignored if the graph capture fails. 109 | 110 | - `--noDataTransfers`: Disable DMA transfers to and from device (default = enabled).
111 | 112 | - `--saveEngine=`: Save the serialized engine 113 | 114 | - `--loadEngine=`: Load a serialized engine 115 | 116 | -------------------------------------------------------------------------------- /vstrt/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vstrt/cuda_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_HELPER_H_ 2 | #define VSTRT_CUDA_HELPER_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #define checkError(expr) do { \ 9 | using namespace std::string_literals; \ 10 | cudaError_t __err = expr; \ 11 | if (__err != cudaSuccess) { \ 12 | const char * message = cudaGetErrorString(__err); \ 13 | return set_error("'"s + # expr + "' failed: " + message); \ 14 | } \ 15 | } while(0) 16 | 17 | #endif // VSTRT_CUDA_HELPER_H_ 18 | -------------------------------------------------------------------------------- /vstrt/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_UTILS_H_ 2 | #define VSTRT_CUDA_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | template 12 | requires 13 | std::default_initializable && 14 | std::movable && 15 | std::is_trivially_copy_assignable_v && 16 | std::convertible_to && 17 | std::invocable 18 | struct Resource { 19 | T data; 20 | 21 | [[nodiscard]] 22 | constexpr Resource() noexcept = default; 23 | 24 | [[nodiscard]] 25 | constexpr Resource(T && x) noexcept : data(x) {} 26 | 27 | [[nodiscard]] 28 | constexpr Resource(Resource&& other) noexcept 29 | : data(std::exchange(other.data, T{})) 30 | { } 31 | 32 | constexpr Resource& operator=(Resource&& other) noexcept { 33 | if (this == &other) return *this; 34 | deleter_(std::move(data)); 35 | data = std::exchange(other.data, T{}); 36 | return *this; 37 | } 38 | 39 | constexpr Resource& operator=(const Resource & other) = delete; 40 | 41 | Resource(const Resource& other) = delete; 42 | 43 | constexpr operator T() const noexcept { 44 | return data; 45 | } 46 | 47 | constexpr auto deleter_(T && x) noexcept { 48 | if (x) { 49 | deleter(x); 50 | } 51 | } 52 | 53 | constexpr Resource& operator=(T && x) noexcept { 54 | deleter_(std::move(data)); 55 | data = x; 56 | return *this; 57 | } 58 | 59 | constexpr ~Resource() noexcept { 60 | deleter_(std::move(data)); 61 | } 62 | }; 63 | 64 | struct MemoryResource { 65 | Resource h_data; 66 | Resource d_data; 67 | size_t size; 68 | }; 69 | 70 | using StreamResource = Resource; 71 | using GraphExecResource = Resource; 72 | 73 | #endif // VSTRT_CUDA_UTILS_H_ 74 | -------------------------------------------------------------------------------- /vstrt/inference_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_INFERENCE_HELPER_H_ 2 | #define VSTRT_INFERENCE_HELPER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "cuda_helper.h" 13 | #include "trt_utils.h" 14 | 15 | struct InputInfo { 16 | int width; 17 | int height; 18 | int pitch; 19 | int bytes_per_sample; 20 | int tile_w; 21 | int tile_h; 22 | }; 23 | 24 | struct OutputInfo { 25 | int pitch; 26 | int bytes_per_sample; 27 | }; 28 | 29 | struct IOInfo { 30 | InputInfo in; 31 | OutputInfo out; 32 | int w_scale; 33 | int h_scale; 34 | int overlap_w; 35 | int overlap_h; 36 | }; 37 | 38 | 
static inline 39 | std::optional inference( 40 | const InferenceInstance & instance, 41 | int device_id, 42 | bool use_cuda_graph, 43 | const IOInfo & info, 44 | const std::vector & src_ptrs, 45 | const std::vector & dst_ptrs 46 | ) noexcept { 47 | 48 | const auto set_error = [](const ErrorMessage & error_message) { 49 | return error_message; 50 | }; 51 | 52 | checkError(cudaSetDevice(device_id)); 53 | 54 | int src_tile_w_bytes = info.in.tile_w * info.in.bytes_per_sample; 55 | int src_tile_bytes = info.in.tile_h * info.in.tile_w * info.in.bytes_per_sample; 56 | int dst_tile_w = info.in.tile_w * info.w_scale; 57 | int dst_tile_h = info.in.tile_h * info.h_scale; 58 | int dst_tile_w_bytes = dst_tile_w * info.out.bytes_per_sample; 59 | int dst_tile_bytes = dst_tile_h * dst_tile_w * info.out.bytes_per_sample; 60 | 61 | int step_w = info.in.tile_w - 2 * info.overlap_w; 62 | int step_h = info.in.tile_h - 2 * info.overlap_h; 63 | 64 | int y = 0; 65 | while (true) { 66 | int y_crop_start = (y == 0) ? 0 : info.overlap_h; 67 | int y_crop_end = (y == info.in.height - info.in.tile_h) ? 0 : info.overlap_h; 68 | 69 | int x = 0; 70 | while (true) { 71 | int x_crop_start = (x == 0) ? 0 : info.overlap_w; 72 | int x_crop_end = (x == info.in.width - info.in.tile_w) ? 0 : info.overlap_w; 73 | 74 | { 75 | uint8_t * h_data = instance.src.h_data.data; 76 | for (const uint8_t * _src_ptr : src_ptrs) { 77 | const uint8_t * src_ptr { _src_ptr + 78 | y * info.in.pitch + x * info.in.bytes_per_sample 79 | }; 80 | 81 | vs_bitblt( 82 | h_data, src_tile_w_bytes, 83 | src_ptr, info.in.pitch, 84 | static_cast(src_tile_w_bytes), 85 | static_cast(info.in.tile_h) 86 | ); 87 | 88 | h_data += src_tile_bytes; 89 | } 90 | } 91 | 92 | if (use_cuda_graph) { 93 | checkError(cudaGraphLaunch(instance.graphexec, instance.stream)); 94 | } else { 95 | auto result = enqueue( 96 | instance.src, instance.dst, 97 | instance.exec_context, instance.stream 98 | ); 99 | 100 | if (result.has_value()) { 101 | return set_error(result.value()); 102 | } 103 | } 104 | checkError(cudaStreamSynchronize(instance.stream)); 105 | 106 | { 107 | const uint8_t * h_data = instance.dst.h_data.data; 108 | for (uint8_t * _dst_ptr : dst_ptrs) { 109 | uint8_t * dst_ptr { _dst_ptr + 110 | info.h_scale * y * info.out.pitch + info.w_scale * x * info.out.bytes_per_sample 111 | }; 112 | 113 | vs_bitblt( 114 | dst_ptr + (y_crop_start * info.out.pitch + x_crop_start * info.out.bytes_per_sample), 115 | info.out.pitch, 116 | h_data + (y_crop_start * dst_tile_w_bytes + x_crop_start * info.out.bytes_per_sample), 117 | dst_tile_w_bytes, 118 | static_cast(dst_tile_w_bytes - (x_crop_start + x_crop_end) * info.out.bytes_per_sample), 119 | static_cast(dst_tile_h - (y_crop_start + y_crop_end)) 120 | ); 121 | 122 | h_data += dst_tile_bytes; 123 | } 124 | } 125 | 126 | if (x + info.in.tile_w == info.in.width) { 127 | break; 128 | } 129 | 130 | x = std::min(x + step_w, info.in.width - info.in.tile_w); 131 | } 132 | 133 | if (y + info.in.tile_h == info.in.height) { 134 | break; 135 | } 136 | 137 | y = std::min(y + step_h, info.in.height - info.in.tile_h); 138 | } 139 | 140 | return {}; 141 | } 142 | 143 | #endif // VSTRT_INFERENCE_HELPER_H_ 144 | -------------------------------------------------------------------------------- /vstrt/longpath.manifest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | true 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /vstrt/trt_utils.h: 
-------------------------------------------------------------------------------- 1 | #ifndef VSTRT_TRT_UTILS_H_ 2 | #define VSTRT_TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include "cuda_helper.h" 15 | #include "cuda_utils.h" 16 | 17 | using ErrorMessage = std::string; 18 | 19 | struct RequestedTileSize { 20 | int tile_w; 21 | int tile_h; 22 | }; 23 | 24 | struct VideoSize { 25 | int width; 26 | int height; 27 | }; 28 | 29 | using TileSize = std::variant; 30 | 31 | struct InferenceInstance { 32 | MemoryResource src; 33 | MemoryResource dst; 34 | StreamResource stream; 35 | std::unique_ptr exec_context; 36 | GraphExecResource graphexec; 37 | 38 | #if NV_TENSORRT_MAJOR >= 10 39 | Resource d_context_allocation; 40 | #endif 41 | }; 42 | 43 | class Logger : public nvinfer1::ILogger { 44 | void log(Severity severity, const char* message) noexcept override { 45 | if (severity <= verbosity) { 46 | std::cerr << message << '\n'; 47 | } 48 | } 49 | 50 | public: 51 | Logger() = default; 52 | 53 | void set_verbosity(Severity value) noexcept { 54 | this->verbosity = value; 55 | } 56 | 57 | private: 58 | Severity verbosity; 59 | }; 60 | 61 | static inline 62 | std::optional selectProfile( 63 | const std::unique_ptr & engine, 64 | const TileSize & tile_size, 65 | int batch_size = 1 66 | ) noexcept { 67 | 68 | int tile_w, tile_h; 69 | if (std::holds_alternative(tile_size)) { 70 | tile_w = std::get(tile_size).tile_w; 71 | tile_h = std::get(tile_size).tile_h; 72 | } else { 73 | tile_w = std::get(tile_size).width; 74 | tile_h = std::get(tile_size).height; 75 | } 76 | 77 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 78 | auto input_name = engine->getIOTensorName(0); 79 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 80 | 81 | // finds the optimal profile 82 | for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) { 83 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 84 | nvinfer1::Dims opt_dims = engine->getProfileShape( 85 | input_name, i, nvinfer1::OptProfileSelector::kOPT 86 | ); 87 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 88 | nvinfer1::Dims opt_dims = engine->getProfileDimensions( 89 | 0, i, nvinfer1::OptProfileSelector::kOPT 90 | ); 91 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 92 | 93 | if (opt_dims.d[0] != batch_size) { 94 | continue; 95 | } 96 | if (opt_dims.d[2] == tile_h && opt_dims.d[3] == tile_w) { 97 | return i; 98 | } 99 | } 100 | 101 | // finds the first eligible profile 102 | for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) { 103 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 104 | nvinfer1::Dims min_dims = engine->getProfileShape( 105 | input_name, i, nvinfer1::OptProfileSelector::kMIN 106 | ); 107 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 108 | nvinfer1::Dims min_dims = engine->getProfileDimensions( 109 | 0, i, nvinfer1::OptProfileSelector::kMIN 110 | ); 111 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 112 | 113 | if (min_dims.d[0] > batch_size) { 114 | continue; 115 | } 116 | if (min_dims.d[2] > tile_h || min_dims.d[3] > tile_w) { 117 | continue; 118 | } 119 | 120 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 121 | nvinfer1::Dims max_dims = engine->getProfileShape( 122 | input_name, i, nvinfer1::OptProfileSelector::kMAX 123 | ); 124 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 125 | nvinfer1::Dims max_dims = engine->getProfileDimensions( 126 | 0, 
i, nvinfer1::OptProfileSelector::kMAX 127 | ); 128 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 129 | 130 | if (max_dims.d[0] < batch_size) { 131 | continue; 132 | } 133 | if (max_dims.d[2] < tile_h || max_dims.d[3] < tile_w) { 134 | continue; 135 | } 136 | 137 | return i; 138 | } 139 | 140 | // returns not-found 141 | return {}; 142 | } 143 | 144 | static inline 145 | std::optional enqueue( 146 | const MemoryResource & src, 147 | const MemoryResource & dst, 148 | const std::unique_ptr & exec_context, 149 | cudaStream_t stream 150 | ) noexcept { 151 | 152 | const auto set_error = [](const ErrorMessage & message) { 153 | return message; 154 | }; 155 | 156 | checkError(cudaMemcpyAsync( 157 | src.d_data, src.h_data, src.size, 158 | cudaMemcpyHostToDevice, stream 159 | )); 160 | 161 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 162 | auto input_name = exec_context->getEngine().getIOTensorName(0); 163 | auto output_name = exec_context->getEngine().getIOTensorName(1); 164 | 165 | if (!exec_context->setTensorAddress(input_name, src.d_data.data)) { 166 | return set_error("set input tensor address failed"); 167 | } 168 | if (!exec_context->setTensorAddress(output_name, dst.d_data.data)) { 169 | return set_error("set output tensor address failed"); 170 | } 171 | if (!exec_context->enqueueV3(stream)) { 172 | return set_error("enqueue error"); 173 | } 174 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 175 | void * bindings[] { 176 | static_cast(src.d_data.data), 177 | static_cast(dst.d_data.data) 178 | }; 179 | 180 | if (!exec_context->enqueueV2(bindings, stream, nullptr)) { 181 | return set_error("enqueue error"); 182 | } 183 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 184 | 185 | checkError(cudaMemcpyAsync( 186 | dst.h_data, dst.d_data, dst.size, 187 | cudaMemcpyDeviceToHost, stream 188 | )); 189 | 190 | return {}; 191 | } 192 | 193 | static inline 194 | std::variant getGraphExec( 195 | const MemoryResource & src, const MemoryResource & dst, 196 | const std::unique_ptr & exec_context, 197 | cudaStream_t stream 198 | ) noexcept { 199 | 200 | const auto set_error = [](const ErrorMessage & message) { 201 | return message; 202 | }; 203 | 204 | // flush deferred internal state update 205 | // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/developer-guide/index.html#cuda-graphs 206 | { 207 | auto result = enqueue(src, dst, exec_context, stream); 208 | if (result.has_value()) { 209 | return set_error(result.value()); 210 | } 211 | checkError(cudaStreamSynchronize(stream)); 212 | } 213 | 214 | checkError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed)); 215 | { 216 | auto result = enqueue(src, dst, exec_context, stream); 217 | if (result.has_value()) { 218 | return set_error(result.value()); 219 | } 220 | } 221 | cudaGraph_t graph; 222 | checkError(cudaStreamEndCapture(stream, &graph)); 223 | cudaGraphExec_t graphexec; 224 | checkError(cudaGraphInstantiate(&graphexec, graph, nullptr, nullptr, 0)); 225 | checkError(cudaGraphDestroy(graph)); 226 | 227 | return graphexec; 228 | } 229 | 230 | static inline 231 | size_t getSize( 232 | const nvinfer1::Dims & dim 233 | ) noexcept { 234 | 235 | size_t ret = 1; 236 | for (int i = 0; i < dim.nbDims; ++i) { 237 | ret *= dim.d[i]; 238 | } 239 | return ret; 240 | } 241 | 242 | static inline 243 | int getBytesPerSample(nvinfer1::DataType type) noexcept { 244 | switch (type) { 245 | case nvinfer1::DataType::kFLOAT: 246 | return 4; 247 | case nvinfer1::DataType::kHALF: 248 | return 2; 
249 | case nvinfer1::DataType::kINT8: 250 | return 1; 251 | case nvinfer1::DataType::kINT32: 252 | return 4; 253 | case nvinfer1::DataType::kBOOL: 254 | return 1; 255 | case nvinfer1::DataType::kUINT8: 256 | return 1; 257 | #if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 258 | case nvinfer1::DataType::kFP8: 259 | return 1; 260 | #endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 261 | #if NV_TENSORRT_MAJOR >= 9 262 | case nvinfer1::DataType::kBF16: 263 | return 2; 264 | case nvinfer1::DataType::kINT64: 265 | return 8; 266 | #endif // NV_TENSORRT_MAJOR >= 9 267 | default: 268 | return 0; 269 | } 270 | } 271 | 272 | static inline 273 | std::variant getInstance( 274 | const std::unique_ptr & engine, 275 | const std::optional & profile_index, 276 | const TileSize & tile_size, 277 | bool use_cuda_graph, 278 | bool & is_dynamic 279 | ) noexcept { 280 | 281 | const auto set_error = [](const ErrorMessage & error_message) { 282 | return error_message; 283 | }; 284 | 285 | StreamResource stream {}; 286 | checkError(cudaStreamCreateWithFlags(&stream.data, cudaStreamNonBlocking)); 287 | 288 | auto exec_context = std::unique_ptr( 289 | #if NV_TENSORRT_MAJOR >= 10 290 | engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED) 291 | #else 292 | engine->createExecutionContext() 293 | #endif 294 | ); 295 | 296 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 297 | auto input_name = exec_context->getEngine().getIOTensorName(0); 298 | auto output_name = exec_context->getEngine().getIOTensorName(1); 299 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 300 | 301 | if (!exec_context->allInputDimensionsSpecified()) { 302 | if (!profile_index.has_value()) { 303 | return set_error("no valid optimization profile found"); 304 | } 305 | 306 | is_dynamic = true; 307 | 308 | exec_context->setOptimizationProfileAsync(profile_index.value(), stream); 309 | checkError(cudaStreamSynchronize(stream)); 310 | 311 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 312 | nvinfer1::Dims dims = exec_context->getTensorShape(input_name); 313 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 314 | nvinfer1::Dims dims = exec_context->getBindingDimensions(0); 315 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 316 | 317 | dims.d[0] = 1; 318 | 319 | if (std::holds_alternative(tile_size)) { 320 | dims.d[2] = std::get(tile_size).tile_h; 321 | dims.d[3] = std::get(tile_size).tile_w; 322 | } else { 323 | dims.d[2] = std::get(tile_size).height; 324 | dims.d[3] = std::get(tile_size).width; 325 | } 326 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 327 | exec_context->setInputShape(input_name, dims); 328 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 329 | exec_context->setBindingDimensions(0, dims); 330 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 331 | } else if (std::holds_alternative(tile_size)) { 332 | is_dynamic = false; 333 | 334 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 335 | nvinfer1::Dims dims = exec_context->getTensorShape(input_name); 336 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 337 | nvinfer1::Dims dims = exec_context->getBindingDimensions(0); 338 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 339 | 340 | if (std::holds_alternative(tile_size)) { 341 | if (dims.d[2] != std::get(tile_size).tile_h || 342 | dims.d[3] != std::get(tile_size).tile_w 343 | ) { 344 | return set_error("requested tile size not 
applicable"); 345 | } 346 | } else { 347 | if (dims.d[2] != std::get(tile_size).height || 348 | dims.d[3] != std::get(tile_size).width 349 | ) { 350 | return set_error("not supported video dimensions"); 351 | } 352 | } 353 | } 354 | 355 | MemoryResource src {}; 356 | { 357 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 358 | auto dim = exec_context->getTensorShape(input_name); 359 | auto type = engine->getTensorDataType(input_name); 360 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 361 | auto dim = exec_context->getBindingDimensions(0); 362 | auto type = engine->getBindingDataType(0); 363 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 364 | 365 | auto size = getSize(dim) * getBytesPerSample(type); 366 | 367 | Resource d_data {}; 368 | checkError(cudaMalloc(&d_data.data, size)); 369 | 370 | Resource h_data {}; 371 | checkError(cudaMallocHost(&h_data.data, size, cudaHostAllocWriteCombined)); 372 | 373 | src = MemoryResource{ 374 | .h_data = std::move(h_data), 375 | .d_data = std::move(d_data), 376 | .size=size 377 | }; 378 | } 379 | 380 | MemoryResource dst {}; 381 | { 382 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 383 | auto dim = exec_context->getTensorShape(output_name); 384 | auto type = engine->getTensorDataType(output_name); 385 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 386 | auto dim = exec_context->getBindingDimensions(1); 387 | auto type = engine->getBindingDataType(1); 388 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 389 | 390 | auto size = getSize(dim) * getBytesPerSample(type); 391 | 392 | Resource d_data {}; 393 | checkError(cudaMalloc(&d_data.data, size)); 394 | 395 | Resource h_data {}; 396 | checkError(cudaMallocHost(&h_data.data, size)); 397 | 398 | dst = MemoryResource{ 399 | .h_data = std::move(h_data), 400 | .d_data = std::move(d_data), 401 | .size=size 402 | }; 403 | } 404 | 405 | #if NV_TENSORRT_MAJOR >= 10 406 | size_t buffer_size { exec_context->updateDeviceMemorySizeForShapes() }; 407 | if (buffer_size == 0) { 408 | return set_error("failed to get internal activation buffer size"); 409 | } 410 | 411 | Resource d_context_allocation {}; 412 | checkError(cudaMalloc(&d_context_allocation.data, buffer_size)); 413 | 414 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001 415 | exec_context->setDeviceMemoryV2(d_context_allocation.data, static_cast(buffer_size)); 416 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001 417 | exec_context->setDeviceMemory(d_context_allocation.data); 418 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001 419 | #endif // NV_TENSORRT_MAJOR >= 10 420 | 421 | GraphExecResource graphexec {}; 422 | if (use_cuda_graph) { 423 | auto result = getGraphExec( 424 | src, dst, 425 | exec_context, stream 426 | ); 427 | if (std::holds_alternative(result)) { 428 | graphexec = std::move(std::get(result)); 429 | } else { 430 | return set_error(std::get(result)); 431 | } 432 | } 433 | 434 | return InferenceInstance{ 435 | .src = std::move(src), 436 | .dst = std::move(dst), 437 | .stream = std::move(stream), 438 | .exec_context = std::move(exec_context), 439 | .graphexec = std::move(graphexec), 440 | #if NV_TENSORRT_MAJOR >= 10 441 | .d_context_allocation = std::move(d_context_allocation) 442 | #endif 443 | }; 444 | } 445 | 446 | static inline 447 | std::optional checkEngine( 448 | const std::unique_ptr & engine, 449 | bool flexible_output 450 | ) noexcept { 451 | 452 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 453 | int num_bindings = 
engine->getNbIOTensors(); 454 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 455 | int num_bindings = engine->getNbBindings(); 456 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 457 | 458 | if (num_bindings != 2) { 459 | return "network binding count must be 2, got " + std::to_string(num_bindings); 460 | } 461 | 462 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 463 | auto input_name = engine->getIOTensorName(0); 464 | auto output_name = engine->getIOTensorName(1); 465 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 466 | 467 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 468 | if (engine->getTensorIOMode(input_name) != nvinfer1::TensorIOMode::kINPUT) { 469 | return "the first binding should be an input binding"; 470 | } 471 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 472 | if (!engine->bindingIsInput(0)) { 473 | return "the first binding should be an input binding"; 474 | } 475 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 476 | 477 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 478 | const nvinfer1::Dims & input_dims = engine->getTensorShape(input_name); 479 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 480 | const nvinfer1::Dims & input_dims = engine->getBindingDimensions(0); 481 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 482 | 483 | if (input_dims.nbDims != 4) { 484 | return "expects network with 4-D input"; 485 | } 486 | if (input_dims.d[0] != 1) { 487 | return "batch size of network input must be 1"; 488 | } 489 | 490 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 491 | if (engine->getTensorIOMode(output_name) != nvinfer1::TensorIOMode::kOUTPUT) { 492 | return "the second binding should be an output binding"; 493 | } 494 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 495 | if (engine->bindingIsInput(1)) { 496 | return "the second binding should be an output binding"; 497 | } 498 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 499 | 500 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 501 | const nvinfer1::Dims & output_dims = engine->getTensorShape(output_name); 502 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 503 | const nvinfer1::Dims & output_dims = engine->getBindingDimensions(1); 504 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 505 | 506 | if (output_dims.nbDims != 4) { 507 | return "expects network with 4-D output"; 508 | } 509 | if (output_dims.d[0] != 1) { 510 | return "batch size of network output must be 1"; 511 | } 512 | 513 | auto out_channels = output_dims.d[1]; 514 | if (out_channels != 1 && out_channels != 3 && !flexible_output) { 515 | return "output dimensions must be 1 or 3, or enable \"flexible_output\""; 516 | } 517 | 518 | auto in_height = input_dims.d[2]; 519 | auto in_width = input_dims.d[3]; 520 | auto out_height = output_dims.d[2]; 521 | auto out_width = output_dims.d[3]; 522 | if (out_height % in_height != 0 || out_width % in_width != 0) { 523 | return "output dimensions must be divisible by input dimensions"; 524 | } 525 | 526 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 527 | for (const auto & name : { input_name, output_name }) { 528 | if (engine->getTensorLocation(name) != nvinfer1::TensorLocation::kDEVICE) { 529 | return "network binding " + std::string{ name } + " should reside on device"; 530 | } 531 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 532 | for (int i = 0; i < 2; i++) { 533 | if (engine->getLocation(i) != 
nvinfer1::TensorLocation::kDEVICE) {
534 |             return "network binding " + std::to_string(i) + " should reside on device";
535 |         }
536 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
537 | 
538 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
539 |         if (engine->getTensorFormat(name) != nvinfer1::TensorFormat::kLINEAR) {
540 |             return "expects network IO with layout NCHW (row major linear)";
541 |         }
542 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
543 |         if (engine->getBindingFormat(i) != nvinfer1::TensorFormat::kLINEAR) {
544 |             return "expects network IO with layout NCHW (row major linear)";
545 |         }
546 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
547 |     }
548 | 
549 |     return {};
550 | }
551 | 
552 | static inline
553 | std::variant<ErrorMessage, std::unique_ptr<nvinfer1::ICudaEngine>> initEngine(
554 |     const char * engine_data, size_t engine_nbytes,
555 |     const std::unique_ptr<nvinfer1::IRuntime> & runtime,
556 |     bool flexible_output
557 | ) noexcept {
558 | 
559 |     const auto set_error = [](const ErrorMessage & error_message) {
560 |         return error_message;
561 |     };
562 | 
563 |     std::unique_ptr<nvinfer1::ICudaEngine> engine {
564 |         runtime->deserializeCudaEngine(engine_data, engine_nbytes)
565 |     };
566 | 
567 |     if (!engine) {
568 |         return set_error("engine deserialization failed");
569 |     }
570 | 
571 |     if (auto err = checkEngine(engine, flexible_output); err.has_value()) {
572 |         return set_error(err.value());
573 |     }
574 | 
575 |     return engine;
576 | }
577 | 
578 | // 0: integer, 1: float, -1: unknown
579 | static inline
580 | int getSampleType(nvinfer1::DataType type) noexcept {
581 |     switch (type) {
582 |         case nvinfer1::DataType::kFLOAT:
583 |         case nvinfer1::DataType::kHALF:
584 | #if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
585 |         case nvinfer1::DataType::kFP8:
586 | #endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
587 | #if NV_TENSORRT_MAJOR >= 9
588 |         case nvinfer1::DataType::kBF16:
589 | #endif // NV_TENSORRT_MAJOR >= 9
590 |             return 1;
591 |         case nvinfer1::DataType::kINT8:
592 |         case nvinfer1::DataType::kINT32:
593 |         case nvinfer1::DataType::kBOOL:
594 |         case nvinfer1::DataType::kUINT8:
595 | #if NV_TENSORRT_MAJOR >= 9
596 |         case nvinfer1::DataType::kINT64:
597 | #endif // NV_TENSORRT_MAJOR >= 9
598 |             return 0;
599 |         default:
600 |             return -1;
601 |     }
602 | }
603 | 
604 | #endif // VSTRT_TRT_UTILS_H_
605 | 
-------------------------------------------------------------------------------- /vstrt/trtexec/CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20)
2 | 
3 | project(trtexec LANGUAGES CXX)
4 | 
5 | find_package(CUDAToolkit REQUIRED)
6 | 
7 | add_executable(trtexec
8 |     $<$<PLATFORM_ID:Windows>:longpath.manifest>
9 |     trtexec.cpp
10 |     logfile.cpp
11 |     ../common/bfloat16.cpp
12 |     ../common/logger.cpp
13 |     ../common/sampleDevice.cpp
14 |     ../common/sampleEngines.cpp
15 |     ../common/sampleInference.cpp
16 |     ../common/sampleOptions.cpp
17 |     ../common/sampleReporting.cpp
18 |     ../common/sampleUtils.cpp
19 |     ../utils/fileLock.cpp
20 |     ../utils/timingCache.cpp
21 | )
22 | 
23 | target_include_directories(trtexec PRIVATE
24 |     ../common
25 |     ..
26 |     ../../include
27 | )
28 | 
29 | target_link_libraries(trtexec PRIVATE CUDA::cudart_static)
30 | 
31 | install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
32 | 
-------------------------------------------------------------------------------- /vstrt/trtexec/logfile.cpp: --------------------------------------------------------------------------------
1 | // When $TRTEXEC_LOG_FILE is set, redirect stdout and stderr to the specified
2 | // file as well.
3 | #include <cstdio>
4 | #include <cstdlib>
5 | #include <fstream>
6 | #include <iostream>
7 | 
8 | namespace {
9 | static struct redirect {
10 |     class teebuf: public std::streambuf {
11 |     public:
12 |         teebuf(std::streambuf *a, std::streambuf *b): s1(a), s2(b) {}
13 |     private:
14 |         std::streambuf *s1, *s2;
15 | 
16 |         virtual int overflow(int c) override {
17 |             if (c == EOF)
18 |                 return EOF;
19 |             else {
20 |                 int r1 = s1->sputc(c);
21 |                 int r2 = s2->sputc(c);
22 |                 return (r1 == EOF || r2 == EOF) ? EOF : c;
23 |             }
24 |         }
25 | 
26 |         virtual int sync() override {
27 |             int r1 = s1->pubsync();
28 |             int r2 = s2->pubsync();
29 |             return (r1 == 0 && r2 == 0) ? 0 : -1;
30 |         }
31 |     };
32 |     redirect() {
33 |         const char *fn = getenv("TRTEXEC_LOG_FILE");
34 |         if (fn) {
35 |             static std::ofstream ofs(fn, std::ios::app);
36 |             static teebuf out(ofs.rdbuf(), std::cout.rdbuf());
37 |             static teebuf err(ofs.rdbuf(), std::cerr.rdbuf());
38 |             std::cout.rdbuf(&out);
39 |             std::cerr.rdbuf(&err);
40 |         }
41 |     }
42 | } _;
43 | } // namespace
44 | 
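The redirect machinery in logfile.cpp works because every iostream write is funneled through a std::streambuf, so swapping the stream's buffer with rdbuf() reroutes all subsequent output. A minimal standalone sketch of the same tee pattern (all names below are illustrative, not part of the repository):

    #include <cstdio>
    #include <fstream>
    #include <iostream>
    #include <streambuf>

    // Duplicates everything written through this buffer into two sinks.
    class TeeBuf : public std::streambuf {
    public:
        TeeBuf(std::streambuf *a, std::streambuf *b) : s1_(a), s2_(b) {}
    private:
        std::streambuf *s1_, *s2_;
        int overflow(int c) override {
            if (c == EOF) return EOF;
            // Forward the character to both sinks; report EOF if either fails.
            return (s1_->sputc(c) == EOF || s2_->sputc(c) == EOF) ? EOF : c;
        }
        int sync() override {
            return (s1_->pubsync() == 0 && s2_->pubsync() == 0) ? 0 : -1;
        }
    };

    int main() {
        std::ofstream log("tee.log");
        TeeBuf tee(std::cout.rdbuf(), log.rdbuf());
        auto *old = std::cout.rdbuf(&tee);   // redirect
        std::cout << "goes to the console and to tee.log\n";
        std::cout.rdbuf(old);                // restore before `log` is destroyed
    }

Note that logfile.cpp sidesteps the restore step by keeping the ofstream and both teebufs as function-local statics, so they outlive every later writer for the lifetime of the process.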
-------------------------------------------------------------------------------- /vstrt/trtexec/trtexec.patch: --------------------------------------------------------------------------------
1 | diff --git a/samples/utils/fileLock.cpp b/samples/utils/fileLock.cpp
2 | index e155c0b..de6bce2 100644
3 | --- a/samples/utils/fileLock.cpp
4 | +++ b/samples/utils/fileLock.cpp
5 | @@ -35,8 +35,11 @@ FileLock::FileLock(ILogger& logger, std::string const& fileName)
6 |          ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
7 |          mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
8 |      }
9 | +    int size = MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, nullptr, 0);
10 | +    std::wstring lockFileNameW (size, L'\0');
11 | +    MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, &lockFileNameW[0], size);
12 |      // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided
13 | -    mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL);
14 | +    mHandle = CreateFileW(lockFileNameW.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE | FILE_ATTRIBUTE_TEMPORARY, NULL);
15 |      if (mHandle == INVALID_HANDLE_VALUE)
16 |      {
17 |          throw std::runtime_error("Failed to lock " + lockFileName + "!");
18 | 
-------------------------------------------------------------------------------- /vstrt/utils.h: --------------------------------------------------------------------------------
1 | #ifndef VSTRT_UTILS_H_
2 | #define VSTRT_UTILS_H_
3 | 
4 | #include <array>
5 | #include <memory>
6 | #include <optional>
7 | #include <string>
8 | #include <type_traits>
9 | #include <vector>
10 | 
11 | #include <cuda_runtime.h>
12 | 
13 | #include <NvInferRuntime.h>
14 | #include <VSHelper.h>
15 | 
16 | static inline
17 | void setDimensions(
18 |     std::unique_ptr<VSVideoInfo> & vi,
19 |     const std::unique_ptr<nvinfer1::IExecutionContext> & exec_context,
20 |     VSCore * core,
21 |     const VSAPI * vsapi,
22 |     int sample_type,
23 |     int bits_per_sample,
24 |     bool flexible_output
25 | ) noexcept {
26 | 
27 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
28 |     auto input_name = exec_context->getEngine().getIOTensorName(0);
29 |     auto output_name = exec_context->getEngine().getIOTensorName(1);
30 |     const nvinfer1::Dims & in_dims = exec_context->getTensorShape(input_name);
31 |     const nvinfer1::Dims & out_dims = exec_context->getTensorShape(output_name);
32 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
33 |     const nvinfer1::Dims & in_dims = exec_context->getBindingDimensions(0);
34 |     const nvinfer1::Dims & out_dims = exec_context->getBindingDimensions(1);
35 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
36 | 
37 |     auto in_height = static_cast<int>(in_dims.d[2]);
38 |     auto in_width = static_cast<int>(in_dims.d[3]);
39 | 
40 |     auto out_height = static_cast<int>(out_dims.d[2]);
41 |     auto out_width = static_cast<int>(out_dims.d[3]);
42 | 
43 |     vi->height *= out_height / in_height;
44 |     vi->width *= out_width / in_width;
45 | 
46 |     if (out_dims.d[1] == 1 || flexible_output) {
47 |         vi->format = vsapi->registerFormat(cmGray, sample_type, bits_per_sample, 0, 0, core);
48 |     } else if (out_dims.d[1] == 3) {
49 |         vi->format = vsapi->registerFormat(cmRGB, sample_type, bits_per_sample, 0, 0, core);
50 |     }
51 | }
52 | 
53 | static inline
54 | std::vector<const VSVideoInfo *> getVideoInfo(
55 |     const VSAPI * vsapi,
56 |     const std::vector<VSNodeRef *> & nodes
57 | ) noexcept {
58 | 
59 |     std::vector<const VSVideoInfo *> vis;
60 |     vis.reserve(std::size(nodes));
61 | 
62 |     for (const auto & node : nodes) {
63 |         vis.emplace_back(vsapi->getVideoInfo(node));
64 |     }
65 | 
66 |     return vis;
67 | }
68 | 
69 | static inline
70 | std::vector<const VSFrameRef *> getFrames(
71 |     int n,
72 |     const VSAPI * vsapi,
73 |     VSFrameContext * frameCtx,
74 |     const std::vector<VSNodeRef *> & nodes
75 | ) noexcept {
76 | 
77 |     std::vector<const VSFrameRef *> frames;
78 |     frames.reserve(std::size(nodes));
79 | 
80 |     for (const auto & node : nodes) {
81 |         frames.emplace_back(vsapi->getFrameFilter(n, node, frameCtx));
82 |     }
83 | 
84 |     return frames;
85 | }
86 | 
87 | static inline
88 | std::optional<std::string> checkNodes(
89 |     const std::vector<const VSVideoInfo *> & vis
90 | ) noexcept {
91 | 
92 |     for (const auto & vi : vis) {
93 |         if (!isConstantFormat(vi)) {
94 |             return "video format must be constant";
95 |         }
96 | 
97 |         if (vi->width != vis[0]->width || vi->height != vis[0]->height) {
98 |             return "dimensions of clips mismatch";
99 |         }
100 | 
101 |         if (vi->numFrames != vis[0]->numFrames) {
102 |             return "number of frames mismatch";
103 |         }
104 | 
105 |         if (vi->format->subSamplingH != 0 || vi->format->subSamplingW != 0) {
106 |             return "clip must not be sub-sampled";
107 |         }
108 |     }
109 | 
110 |     return {};
111 | }
112 | 
113 | static inline
114 | std::optional<std::string> checkNodes(
115 |     const std::vector<const VSVideoInfo *> & vis,
116 |     int sample_type,
117 |     int bits_per_sample
118 | ) noexcept {
119 | 
120 |     for (const auto & vi : vis) {
121 |         if (vi->format->sampleType != sample_type) {
122 |             return "sample type mismatch";
123 |         }
124 | 
125 |         if (vi->format->bitsPerSample != bits_per_sample) {
126 |             return "bits per sample mismatch";
127 |         }
128 |     }
129 | 
130 |     return {};
131 | }
132 | 
133 | static inline
134 | int numPlanes(
135 |     const std::vector<const VSVideoInfo *> & vis
136 | ) noexcept {
137 | 
138 |     int num_planes = 0;
139 | 
140 |     for (const auto & vi : vis) {
141 |         num_planes += vi->format->numPlanes;
142 |     }
143 | 
144 |     return num_planes;
145 | }
146 | 
147 | static inline
148 | std::optional<std::string> checkNodesAndContext(
149 |     const std::unique_ptr<nvinfer1::IExecutionContext> & execution_context,
150 |     const std::vector<const VSVideoInfo *> & vis
151 | ) noexcept {
152 | 
153 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
154 |     auto input_name = execution_context->getEngine().getIOTensorName(0);
155 |     const nvinfer1::Dims & network_in_dims = execution_context->getTensorShape(input_name);
156 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
157 |     const nvinfer1::Dims & network_in_dims = execution_context->getBindingDimensions(0);
158 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
159 | 
160 |     auto network_in_channels = network_in_dims.d[1];
161 |     int num_planes = numPlanes(vis);
162 |     if (network_in_channels != num_planes) {
163 |         return "expects " + std::to_string(network_in_channels) + " input planes";
164 |     }
165 | 
166 |     auto network_in_height = network_in_dims.d[2];
167 |     auto network_in_width = network_in_dims.d[3];
168 |     int clip_in_height = vis[0]->height;
169 |     int clip_in_width = vis[0]->width;
170 | 
171 |     if (network_in_height > clip_in_height || network_in_width > clip_in_width) {
172 |         return "tile size larger than clip dimension";
173 |     }
174 | 
175 |     return {};
176 | }
177 | 
178 | static inline void VS_CC getDeviceProp(
179 |     const VSMap *in, VSMap *out, void *userData,
180 |     VSCore *core, const VSAPI *vsapi
181 | ) {
182 | 
183 |     int err;
184 |     int device_id = static_cast<int>(vsapi->propGetInt(in, "device_id", 0, &err));
185 |     if (err) {
186 |         device_id = 0;
187 |     }
188 | 
189 |     cudaDeviceProp prop;
190 |     if (auto error = cudaGetDeviceProperties(&prop, device_id); error != cudaSuccess) {
191 |         vsapi->setError(out, cudaGetErrorString(error));
192 |         return;
193 |     }
194 | 
195 |     auto setProp = [&](const char * name, auto value, int data_length = -1) {
196 |         using T = std::decay_t<decltype(value)>;
197 |         if constexpr (std::is_same_v<T, int>) {
198 |             vsapi->propSetInt(out, name, value, paReplace);
199 |         } else if constexpr (std::is_same_v<T, size_t>) {
200 |             vsapi->propSetInt(out, name, static_cast<int64_t>(value), paReplace);
201 |         } else if constexpr (std::is_same_v<T, char *>) {
202 |             vsapi->propSetData(out, name, value, data_length, paReplace);
203 |         }
204 |     };
205 | 
206 |     int driver_version;
207 |     cudaDriverGetVersion(&driver_version);
208 |     setProp("driver_version", driver_version);
209 | 
210 |     setProp("name", prop.name);
211 |     {
212 |         std::array<int64_t, 16> uuid;
213 |         for (int i = 0; i < 16; ++i) {
214 |             uuid[i] = prop.uuid.bytes[i];
215 |         }
216 |         vsapi->propSetIntArray(out, "uuid", std::data(uuid), static_cast<int>(std::size(uuid)));
217 |     }
218 |     setProp("total_global_memory", prop.totalGlobalMem);
219 |     setProp("shared_memory_per_block", prop.sharedMemPerBlock);
220 |     setProp("regs_per_block", prop.regsPerBlock);
221 |     setProp("warp_size", prop.warpSize);
222 |     setProp("mem_pitch", prop.memPitch);
223 |     setProp("max_threads_per_block", prop.maxThreadsPerBlock);
224 |     setProp("clock_rate", prop.clockRate);
225 |     setProp("total_const_mem", prop.totalConstMem);
226 |     setProp("major", prop.major);
227 |     setProp("minor", prop.minor);
228 |     setProp("texture_alignment", prop.textureAlignment);
229 |     setProp("texture_pitch_alignment", prop.texturePitchAlignment);
230 |     setProp("device_overlap", prop.deviceOverlap);
231 |     setProp("multi_processor_count", prop.multiProcessorCount);
232 |     setProp("kernel_exec_timeout_enabled", prop.kernelExecTimeoutEnabled);
233 |     setProp("integrated", prop.integrated);
234 |     setProp("can_map_host_memory", prop.canMapHostMemory);
235 |     setProp("compute_mode", prop.computeMode);
236 |     setProp("concurrent_kernels", prop.concurrentKernels);
237 |     setProp("ecc_enabled", prop.ECCEnabled);
238 |     setProp("pci_bus_id", prop.pciBusID);
239 |     setProp("pci_device_id", prop.pciDeviceID);
240 |     setProp("pci_domain_id", prop.pciDomainID);
241 |     setProp("tcc_driver", prop.tccDriver);
242 |     setProp("async_engine_count", prop.asyncEngineCount);
243 |     setProp("unified_addressing", prop.unifiedAddressing);
244 |     setProp("memory_clock_rate", prop.memoryClockRate);
245 |     setProp("memory_bus_width", prop.memoryBusWidth);
246 |     setProp("l2_cache_size", prop.l2CacheSize);
247 |     setProp("persisting_l2_cache_max_size", prop.persistingL2CacheMaxSize);
248 |     setProp("max_threads_per_multiprocessor", prop.maxThreadsPerMultiProcessor);
249 |     setProp("stream_priorities_supported", prop.streamPrioritiesSupported);
250 |     setProp("global_l1_cache_supported", prop.globalL1CacheSupported);
251 |     setProp("local_l1_cache_supported", prop.localL1CacheSupported);
252 |     setProp("shared_mem_per_multiprocessor", prop.sharedMemPerMultiprocessor);
253 |     setProp("regs_per_multiprocessor", prop.regsPerMultiprocessor);
254 |     setProp("managed_memory", prop.managedMemory);
255 |     setProp("is_multi_gpu_board", prop.isMultiGpuBoard);
256 |     setProp("multi_gpu_board_group_id", prop.multiGpuBoardGroupID);
257 |     setProp("host_native_atomic_supported", prop.hostNativeAtomicSupported);
258 |     setProp("single_to_double_precision_perf_ratio", prop.singleToDoublePrecisionPerfRatio);
259 |     setProp("pageable_memory_access", prop.pageableMemoryAccess);
260 |     setProp("conccurrent_managed_access", prop.concurrentManagedAccess);
261 |     setProp("compute_preemption_supported", prop.computePreemptionSupported);
262 |     setProp(
263 |         "can_use_host_pointer_for_registered_mem",
264 |         prop.canUseHostPointerForRegisteredMem
265 |     );
266 |     setProp("cooperative_launch", prop.cooperativeLaunch);
267 |     setProp("cooperative_multi_device_launch", prop.cooperativeMultiDeviceLaunch);
268 |     setProp("shared_mem_per_block_optin", prop.sharedMemPerBlockOptin);
269 |     setProp(
270 |         "pageable_memory_access_uses_host_page_tables",
271 |         prop.pageableMemoryAccessUsesHostPageTables
272 |     );
273 |     setProp("direct_managed_mem_access_from_host", prop.directManagedMemAccessFromHost);
274 |     setProp("max_blocks_per_multi_processor", prop.maxBlocksPerMultiProcessor);
275 |     setProp("access_policy_max_window_size", prop.accessPolicyMaxWindowSize);
276 |     setProp("reserved_shared_mem_per_block", prop.reservedSharedMemPerBlock);
277 | }
278 | 
279 | #endif // VSTRT_UTILS_H_
280 | 
-------------------------------------------------------------------------------- /vstrt/vs_tensorrt.cpp: --------------------------------------------------------------------------------
1 | #include <atomic>
2 | #include <cstdint>
3 | #include <cstdio>
4 | #include <cstdlib>
5 | #include <fstream>
6 | #include <ios>
7 | #include <iterator>
8 | #include <memory>
9 | #include <mutex>
10 | #include <string>
11 | #include <variant>
12 | #include <vector>
13 | 
14 | #include <VapourSynth.h>
15 | #include <VSHelper.h>
16 | 
17 | #include <cuda_runtime.h>
18 | #include <NvInferRuntime.h>
19 | #ifdef USE_NVINFER_PLUGIN
20 | #include <NvInferPlugin.h>
21 | #endif
22 | 
23 | #include "config.h"
24 | #include "inference_helper.h"
25 | #include "trt_utils.h"
26 | #include "utils.h"
27 | 
28 | #ifdef _WIN32
29 | #define WIN32_LEAN_AND_MEAN
30 | #include <windows.h>
31 | 
32 | static std::wstring translateName(const char *name) {
33 |     auto size = MultiByteToWideChar(CP_UTF8, 0, name, -1, nullptr, 0);
34 |     std::wstring ret(static_cast<size_t>(size), {});
35 |     MultiByteToWideChar(CP_UTF8, 0, name, -1, ret.data(), size);
36 |     return ret;
37 | }
38 | #else
39 | #define translateName(n) (n)
40 | #endif
41 | 
42 | using namespace std::string_literals;
43 | 
44 | static const VSPlugin * myself = nullptr;
45 | 
46 | struct TicketSemaphore {
47 |     std::atomic<intptr_t> ticket {};
48 |     std::atomic<intptr_t> current {};
49 | 
50 |     void init(intptr_t num) noexcept {
51 |         current.store(num, std::memory_order::seq_cst);
52 |     }
53 | 
54 |     void acquire() noexcept {
55 |         intptr_t tk { ticket.fetch_add(1, std::memory_order::acquire) };
56 |         while (true) {
57 |             intptr_t curr { current.load(std::memory_order::acquire) };
58 |             if (tk < curr) {
59 |                 return;
60 |             }
61 |             current.wait(curr, std::memory_order::relaxed);
62 |         }
63 |     }
64 | 
65 |     void release() noexcept {
66 |         current.fetch_add(1, std::memory_order::release);
67 |         current.notify_all();
68 |     }
69 | };
70 | 
71 | struct vsTrtData {
72 |     std::vector<VSNodeRef *> nodes;
73 |     std::unique_ptr<VSVideoInfo> out_vi;
74 | 
75 |     int device_id;
76 |     int num_streams;
77 |     bool use_cuda_graph;
78 |     int overlap_w, overlap_h;
79 | 
80 |     Logger logger;
81 |     std::unique_ptr<nvinfer1::IRuntime> runtime;
82 |     std::vector<std::unique_ptr<nvinfer1::ICudaEngine>> engines;
83 | 
84 |     TicketSemaphore semaphore;
85 |     std::vector<int> tickets;
86 |     std::mutex instances_lock;
87 |     std::vector<InferenceInstance> instances;
88 | 
89 |     std::string flexible_output_prop;
90 | 
91 |     [[nodiscard]]
92 |     int acquire() noexcept {
93 |         semaphore.acquire();
94 |         int ticket;
95 |         {
96 |             std::lock_guard lock { instances_lock };
97 |             ticket = tickets.back();
98 |             tickets.pop_back();
99 |         }
100 |         return ticket;
101 |     }
102 | 
103 |     void release(int ticket) noexcept {
104 |         {
105 |             std::lock_guard lock { instances_lock };
106 |             tickets.push_back(ticket);
107 |         }
108 |         semaphore.release();
109 |     }
110 | };
111 | 
112 | static void VS_CC vsTrtInit(
113 |     VSMap *in,
114 |     VSMap *out,
115 |     void **instanceData,
116 |     VSNode *node,
117 |     VSCore *core,
118 |     const VSAPI *vsapi
119 | ) noexcept {
120 | 
121 |     auto d = static_cast<vsTrtData *>(*instanceData);
122 |     vsapi->setVideoInfo(d->out_vi.get(), 1, node);
123 | }
124 | 
125 | static const VSFrameRef *VS_CC vsTrtGetFrame(
126 |     int n,
127 |     int activationReason,
128 |     void **instanceData,
129 |     void **frameData,
130 |     VSFrameContext *frameCtx,
131 |     VSCore *core,
132 |     const VSAPI *vsapi
133 | ) noexcept {
134 | 
135 |     auto d = static_cast<vsTrtData *>(*instanceData);
136 | 
137 |     if (activationReason == arInitial) {
138 |         for (const auto & node : d->nodes) {
139 |             vsapi->requestFrameFilter(n, node, frameCtx);
140 |         }
141 |     } else if (activationReason == arAllFramesReady) {
142 |         const std::vector<const VSVideoInfo *> in_vis {
143 |             getVideoInfo(vsapi, d->nodes)
144 |         };
145 | 
146 |         const std::vector<const VSFrameRef *> src_frames {
147 |             getFrames(n, vsapi, frameCtx, d->nodes)
148 |         };
149 | 
150 |         const int ticket { d->acquire() };
151 |         InferenceInstance & instance { d->instances[ticket] };
152 | 
153 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
154 |         auto input_name = d->engines[0]->getIOTensorName(0);
155 |         const nvinfer1::Dims src_dim { instance.exec_context->getTensorShape(input_name) };
156 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
157 |         const nvinfer1::Dims src_dim { instance.exec_context->getBindingDimensions(0) };
158 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
159 | 
160 |         const int src_planes { static_cast<int>(src_dim.d[1]) };
161 |         const int src_tile_h { static_cast<int>(src_dim.d[2]) };
162 |         const int src_tile_w { static_cast<int>(src_dim.d[3]) };
163 | 
164 |         std::vector<const uint8_t *> src_ptrs;
165 |         src_ptrs.reserve(src_planes);
166 |         for (int i = 0; i < std::ssize(d->nodes); ++i) {
167 |             for (int j = 0; j < in_vis[i]->format->numPlanes; ++j) {
168 |                 src_ptrs.emplace_back(vsapi->getReadPtr(src_frames[i], j));
169 |             }
170 |         }
171 | 
172 |         VSFrameRef * const dst_frame { vsapi->newVideoFrame(
173 |             d->out_vi->format, d->out_vi->width, d->out_vi->height,
174 |             src_frames[0], core
175 |         )};
176 | 
177 |         std::vector<VSFrameRef *> dst_frames;
178 | 
179 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
180 |         auto output_name = d->engines[0]->getIOTensorName(1);
181 |         const nvinfer1::Dims dst_dim { instance.exec_context->getTensorShape(output_name) };
182 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
183 |         const nvinfer1::Dims dst_dim { instance.exec_context->getBindingDimensions(1) };
184 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
185 | 
186 |         const int dst_planes { static_cast<int>(dst_dim.d[1]) };
187 |         const int dst_tile_h { static_cast<int>(dst_dim.d[2]) };
188 |         const int dst_tile_w { static_cast<int>(dst_dim.d[3]) };
189 | 
190 |         std::vector<uint8_t *> dst_ptrs;
191 |         dst_ptrs.reserve(dst_planes);
192 |         if (d->flexible_output_prop.empty()) {
193 |             for (int i = 0; i < dst_planes; ++i) {
194 |                 dst_ptrs.emplace_back(vsapi->getWritePtr(dst_frame, i));
195 |             }
196 |         } else {
197 |             for (int i = 0; i < dst_planes; ++i) {
198 |                 auto frame { vsapi->newVideoFrame(
199 |                     d->out_vi->format, d->out_vi->width, d->out_vi->height,
200 |                     src_frames[0], core
201 |                 )};
202 |                 dst_frames.emplace_back(frame);
203 |                 dst_ptrs.emplace_back(vsapi->getWritePtr(frame, 0));
204 |             }
205 |         }
206 | 
207 |         const int h_scale = dst_tile_h / src_tile_h;
208 |         const int w_scale = dst_tile_w / src_tile_w;
209 | 
210 |         const IOInfo info {
211 |             .in = InputInfo {
212 |                 .width = vsapi->getFrameWidth(src_frames[0], 0),
213 |                 .height = vsapi->getFrameHeight(src_frames[0], 0),
214 |                 .pitch = vsapi->getStride(src_frames[0], 0),
215 |                 .bytes_per_sample = vsapi->getFrameFormat(src_frames[0])->bytesPerSample,
216 |                 .tile_w = src_tile_w,
217 |                 .tile_h = src_tile_h
218 |             },
219 |             .out = OutputInfo {
220 |                 .pitch = vsapi->getStride(dst_frame, 0),
221 |                 .bytes_per_sample = vsapi->getFrameFormat(dst_frame)->bytesPerSample
222 |             },
223 |             .w_scale = w_scale,
224 |             .h_scale = h_scale,
225 |             .overlap_w = d->overlap_w,
226 |             .overlap_h = d->overlap_h
227 |         };
228 | 
229 |         const auto inference_result = inference(
230 |             instance,
231 |             d->device_id, d->use_cuda_graph,
232 |             info, src_ptrs, dst_ptrs
233 |         );
234 | 
235 |         d->release(ticket);
236 | 
237 |         for (const auto & frame : src_frames) {
238 |             vsapi->freeFrame(frame);
239 |         }
240 | 
241 |         if (inference_result.has_value()) {
242 |             vsapi->setFilterError(
243 |                 (__func__ + ": "s + inference_result.value()).c_str(),
244 |                 frameCtx
245 |             );
246 | 
247 |             for (const auto & frame : dst_frames) {
248 |                 vsapi->freeFrame(frame);
249 |             }
250 | 
251 |             vsapi->freeFrame(dst_frame);
252 | 
253 |             return nullptr;
254 |         }
255 | 
256 |         if (!d->flexible_output_prop.empty()) {
257 |             auto prop = vsapi->getFramePropsRW(dst_frame);
258 | 
259 |             for (int i = 0; i < dst_planes; i++) {
260 |                 auto key { d->flexible_output_prop + std::to_string(i) };
261 |                 vsapi->propSetFrame(prop, key.c_str(), dst_frames[i], paReplace);
262 |                 vsapi->freeFrame(dst_frames[i]);
263 |             }
264 |         }
265 | 
266 |         return dst_frame;
267 |     }
268 | 
269 |     return nullptr;
270 | }
271 | 
272 | static void VS_CC vsTrtFree(
273 |     void *instanceData, VSCore *core, const VSAPI *vsapi
274 | ) noexcept {
275 | 
276 |     auto d = static_cast<vsTrtData *>(instanceData);
277 | 
278 |     for (const auto & node : d->nodes) {
279 |         vsapi->freeNode(node);
280 |     }
281 | 
282 |     cudaSetDevice(d->device_id);
283 | 
284 |     delete d;
285 | }
286 | 
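// How the pieces above fit together: vsTrtGetFrame calls d->acquire() to
// obtain the index of a free InferenceInstance, runs inference on it, and
// hands the index back with d->release(). The TicketSemaphore caps the number
// of in-flight inferences at num_streams, while instances_lock only guards
// the brief push/pop on the `tickets` vector. A hedged usage sketch
// (illustrative only, not part of the plugin):
//
//     const int ticket = d->acquire();   // may block until a slot frees up
//     InferenceInstance & inst = d->instances[ticket];
//     /* ... upload tiles, enqueue, download results on inst ... */
//     d->release(ticket);                // notify_all() wakes a blocked waiter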
": "s + error_message).c_str()); 302 | for (const auto & node : d->nodes) { 303 | vsapi->freeNode(node); 304 | } 305 | }; 306 | 307 | const char * engine_path = vsapi->propGetData(in, "engine_path", 0, nullptr); 308 | 309 | std::vector in_vis; 310 | in_vis.reserve(std::size(d->nodes)); 311 | for (const auto & node : d->nodes) { 312 | in_vis.emplace_back(vsapi->getVideoInfo(node)); 313 | } 314 | if (auto err = checkNodes(in_vis); err.has_value()) { 315 | return set_error(err.value()); 316 | } 317 | 318 | int error1, error2; 319 | d->overlap_w = int64ToIntS(vsapi->propGetInt(in, "overlap", 0, &error1)); 320 | d->overlap_h = int64ToIntS(vsapi->propGetInt(in, "overlap", 1, &error2)); 321 | if (!error1) { 322 | if (error2) { 323 | d->overlap_h = d->overlap_w; 324 | } 325 | 326 | if (d->overlap_w < 0 || d->overlap_h < 0) { 327 | return set_error("\"overlap\" must be non-negative"); 328 | } 329 | } else { 330 | d->overlap_w = 0; 331 | d->overlap_h = 0; 332 | } 333 | 334 | int tile_w = int64ToIntS(vsapi->propGetInt(in, "tilesize", 0, &error1)); 335 | int tile_h = int64ToIntS(vsapi->propGetInt(in, "tilesize", 1, &error2)); 336 | 337 | TileSize tile_size; 338 | if (!error1) { // manual specification triggered 339 | if (error2) { 340 | tile_h = tile_w; 341 | } 342 | 343 | if (tile_w - 2 * d->overlap_w <= 0 || tile_h - 2 * d->overlap_h <= 0) { 344 | return set_error("\"overlap\" too large"); 345 | } 346 | 347 | tile_size = RequestedTileSize { 348 | .tile_w = tile_w, 349 | .tile_h = tile_h 350 | }; 351 | } else { 352 | if (d->overlap_w != 0 || d->overlap_h != 0) { 353 | return set_error("\"tilesize\" must be specified"); 354 | } 355 | 356 | int width = in_vis[0]->width; 357 | int height = in_vis[0]->height; 358 | 359 | if (width - 2 * d->overlap_w <= 0 || height - 2 * d->overlap_h <= 0) { 360 | return set_error("\"overlap\" too large"); 361 | } 362 | 363 | tile_size = VideoSize { 364 | .width = width, 365 | .height = height 366 | }; 367 | } 368 | 369 | int error; 370 | 371 | int device_id = int64ToIntS(vsapi->propGetInt(in, "device_id", 0, &error)); 372 | if (error) { 373 | device_id = 0; 374 | } 375 | 376 | int device_count; 377 | checkError(cudaGetDeviceCount(&device_count)); 378 | if (0 <= device_id && device_id < device_count) { 379 | checkError(cudaSetDevice(device_id)); 380 | } else { 381 | return set_error("invalid device ID (" + std::to_string(device_id) + ")"); 382 | } 383 | d->device_id = device_id; 384 | 385 | d->use_cuda_graph = !!vsapi->propGetInt(in, "use_cuda_graph", 0, &error); 386 | if (error) { 387 | d->use_cuda_graph = false; 388 | } 389 | 390 | d->num_streams = int64ToIntS(vsapi->propGetInt(in, "num_streams", 0, &error)); 391 | if (error) { 392 | d->num_streams = 1; 393 | } 394 | 395 | int verbosity = int64ToIntS(vsapi->propGetInt(in, "verbosity", 0, &error)); 396 | if (error) { 397 | verbosity = int(nvinfer1::ILogger::Severity::kWARNING); 398 | } 399 | d->logger.set_verbosity(static_cast(verbosity)); 400 | 401 | auto flexible_output_prop = vsapi->propGetData(in, "flexible_output_prop", 0, &error); 402 | if (!error) { 403 | d->flexible_output_prop = flexible_output_prop; 404 | } 405 | 406 | #ifdef USE_NVINFER_PLUGIN 407 | // related to https://github.com/AmusementClub/vs-mlrt/discussions/65, for unknown reason 408 | #if !(NV_TENSORRT_MAJOR == 9 && defined(_WIN32)) 409 | if (!initLibNvInferPlugins(&d->logger, "")) { 410 | vsapi->logMessage(mtWarning, "vsTrt: Initialize TensorRT plugins failed"); 411 | } 412 | #endif 413 | #endif 414 | 415 | std::ifstream engine_stream { 416 | 
translateName(engine_path), 417 | std::ios::binary | std::ios::ate 418 | }; 419 | 420 | if (!engine_stream.good()) { 421 | return set_error("open engine failed"); 422 | } 423 | 424 | auto engine_nbytes = engine_stream.tellg(); 425 | if (engine_nbytes == -1) { 426 | return set_error("open engine failed"); 427 | } 428 | 429 | std::unique_ptr engine_data { 430 | (char *) malloc(static_cast(engine_nbytes)), free 431 | }; 432 | engine_stream.seekg(0, std::ios::beg); 433 | engine_stream.read(engine_data.get(), static_cast(engine_nbytes)); 434 | 435 | d->runtime.reset(nvinfer1::createInferRuntime(d->logger)); 436 | auto maybe_engine = initEngine( 437 | engine_data.get(), 438 | static_cast(engine_nbytes), 439 | d->runtime, 440 | !d->flexible_output_prop.empty() 441 | ); 442 | if (std::holds_alternative>(maybe_engine)) { 443 | d->engines.push_back(std::move(std::get>(maybe_engine))); 444 | } else { 445 | return set_error(std::get(maybe_engine)); 446 | } 447 | 448 | auto maybe_profile_index = selectProfile(d->engines[0], tile_size); 449 | 450 | bool is_dynamic = false; 451 | d->instances.reserve(d->num_streams); 452 | for (int i = 0; i < d->num_streams; ++i) { 453 | auto maybe_instance = getInstance( 454 | d->engines.back(), 455 | maybe_profile_index, 456 | tile_size, 457 | d->use_cuda_graph, 458 | is_dynamic 459 | ); 460 | 461 | // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-1000-ea/developer-guide/index.html#perform-inference 462 | #if NV_TENSORRT_MAJOR < 10 463 | // duplicates ICudaEngine instances 464 | // 465 | // According to 466 | // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/developer-guide/index.html#perform-inference 467 | // each optimization profile can only have one execution context when using dynamic shapes 468 | if (is_dynamic && i < d->num_streams - 1) { 469 | auto maybe_engine = initEngine(engine_data.get(), engine_nbytes, d->runtime, !d->flexible_output_prop.empty()); 470 | if (std::holds_alternative>(maybe_engine)) { 471 | d->engines.push_back(std::move(std::get>(maybe_engine))); 472 | } else { 473 | return set_error(std::get(maybe_engine)); 474 | } 475 | } 476 | #endif // NV_TENSORRT_MAJOR < 10 477 | 478 | if (std::holds_alternative(maybe_instance)) { 479 | auto instance = std::move(std::get(maybe_instance)); 480 | if (auto err = checkNodesAndContext(instance.exec_context, in_vis); err.has_value()) { 481 | return set_error(err.value()); 482 | } 483 | d->instances.emplace_back(std::move(instance)); 484 | } else { 485 | return set_error(std::get(maybe_instance)); 486 | } 487 | } 488 | 489 | d->semaphore.init(d->num_streams); 490 | d->tickets.reserve(d->num_streams); 491 | for (int i = 0; i < d->num_streams; ++i) { 492 | d->tickets.push_back(i); 493 | } 494 | 495 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 496 | auto input_name = d->engines[0]->getIOTensorName(0); 497 | auto input_type = d->engines[0]->getTensorDataType(input_name); 498 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 499 | auto input_type = d->engines[0]->getBindingDataType(0); 500 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 501 | 502 | VSSampleType input_sample_type; 503 | { 504 | auto sample_type = getSampleType(input_type); 505 | if (sample_type == 0) { 506 | input_sample_type = stInteger; 507 | } else if (sample_type == 1) { 508 | input_sample_type = stFloat; 509 | } else { 510 | return set_error("unknown input sample type"); 511 | } 512 | } 513 | auto input_bits_per_sample = getBytesPerSample(input_type) * 8; 514 | 515 | if 
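// Error handling in vsTrtCreate is built on std::variant<ErrorMessage, T>
// rather than exceptions: initEngine and getInstance return either the
// payload or an ErrorMessage, and the caller branches with
// std::holds_alternative before moving the payload out with std::get.
// A minimal sketch of the same poor-man's std::expected pattern
// (Widget/makeWidget/use/fail are hypothetical names):
//
//     std::variant<ErrorMessage, Widget> makeWidget() noexcept;
//
//     auto maybe_widget = makeWidget();
//     if (std::holds_alternative<Widget>(maybe_widget)) {
//         use(std::move(std::get<Widget>(maybe_widget)));
//     } else {
//         fail(std::get<ErrorMessage>(maybe_widget));
//     }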
564 | VS_EXTERNAL_API(void) VapourSynthPluginInit(
565 |     VSConfigPlugin configFunc,
566 |     VSRegisterFunction registerFunc,
567 |     VSPlugin *plugin
568 | ) noexcept {
569 | 
570 |     configFunc(
571 |         "io.github.amusementclub.vs_tensorrt", "trt",
572 |         "TensorRT ML Filter Runtime",
573 |         VAPOURSYNTH_API_VERSION, 1, plugin
574 |     );
575 | 
576 |     // TRT 9 for windows does not export getInferLibVersion()
577 | #if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
578 |     auto test = getPluginRegistry();
579 | 
580 |     if (test == nullptr) {
581 |         std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
582 |         return;
583 |     }
584 | #else // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
585 |     int ver = getInferLibVersion(); // must ensure this is the first nvinfer function called
586 | #ifdef _WIN32
587 |     if (ver == 0) { // a sentinel value, see dummy function in win32.cpp.
588 |         std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
589 |         return;
590 |     }
591 | #endif // _WIN32
592 |     if (ver != NV_TENSORRT_VERSION) {
593 | #if NV_TENSORRT_MAJOR >= 10
594 |         std::fprintf(
595 |             stderr,
596 |             "vstrt: TensorRT version mismatch, built with %ld but loaded with %d; continue but fingers crossed...\n",
597 |             NV_TENSORRT_VERSION,
598 |             ver
599 |         );
600 | #else // NV_TENSORRT_MAJOR >= 10
601 |         std::fprintf(
602 |             stderr,
603 |             "vstrt: TensorRT version mismatch, built with %d but loaded with %d; continue but fingers crossed...\n",
604 |             NV_TENSORRT_VERSION,
605 |             ver
606 |         );
607 | #endif // NV_TENSORRT_MAJOR >= 10
608 |     }
609 | #endif // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
610 | 
611 |     myself = plugin;
612 | 
613 |     registerFunc("Model",
614 |         "clips:clip[];"
615 |         "engine_path:data;"
616 |         "overlap:int[]:opt;"
617 |         "tilesize:int[]:opt;"
618 |         "device_id:int:opt;"
619 |         "use_cuda_graph:int:opt;"
620 |         "num_streams:int:opt;"
621 |         "verbosity:int:opt;"
622 |         "flexible_output_prop:data:opt;",
623 |         vsTrtCreate,
624 |         nullptr,
625 |         plugin
626 |     );
627 | 
628 |     auto getVersion = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) {
629 |         vsapi->propSetData(out, "version", VERSION, -1, paReplace);
630 | 
631 |         vsapi->propSetData(
632 |             out, "tensorrt_version",
633 | #if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
634 |             std::to_string(NV_TENSORRT_VERSION).c_str(),
635 | #else
636 |             std::to_string(getInferLibVersion()).c_str(),
637 | #endif
638 |             -1, paReplace
639 |         );
640 | 
641 |         vsapi->propSetData(
642 |             out, "tensorrt_version_build",
643 |             std::to_string(NV_TENSORRT_VERSION).c_str(), -1, paReplace
644 |         );
645 | 
646 |         int runtime_version;
647 |         cudaRuntimeGetVersion(&runtime_version);
648 |         vsapi->propSetData(
649 |             out, "cuda_runtime_version",
650 |             std::to_string(runtime_version).c_str(), -1, paReplace
651 |         );
652 | 
653 |         vsapi->propSetData(
654 |             out, "cuda_runtime_version_build",
655 |             std::to_string(__CUDART_API_VERSION).c_str(), -1, paReplace
656 |         );
657 | 
658 |         vsapi->propSetData(out, "path", vsapi->getPluginPath(myself), -1, paReplace);
659 |     };
660 |     registerFunc("Version", "", getVersion, nullptr, plugin);
661 | 
662 |     registerFunc("DeviceProperties", "device_id:int:opt;", getDeviceProp, nullptr, plugin);
663 | }
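VapourSynthPluginInit deliberately makes getInferLibVersion() the first nvinfer call so that the delay-load hook in win32.cpp (the next file) gets to preload the right DLLs, and it only warns on a build/runtime mismatch instead of failing. A small sketch of how such a version integer can be decoded, assuming the TensorRT 10 encoding major * 10000 + minor * 100 + patch (earlier majors use a narrower scheme, so treat this as illustrative and check NvInferVersion.h for the authoritative definition):

    #include <cstdio>

    // Decode a TensorRT-10-style version integer, e.g. 100001 -> 10.0.1.
    // The encoding here is an assumption, not taken from this repository.
    void printTrtVersion(int ver) {
        std::printf("TensorRT %d.%d.%d\n", ver / 10000, ver / 100 % 100, ver % 100);
    }

    int main() {
        printTrtVersion(100001); // prints "TensorRT 10.0.1"
    }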
-------------------------------------------------------------------------------- /vstrt/win32.cpp: --------------------------------------------------------------------------------
1 | #ifdef _MSC_VER
2 | #include <cstdlib>
3 | #include <filesystem>
4 | #include <iostream>
5 | #include <stdexcept>
6 | #include <string>
7 | #include <vector>
8 | 
9 | #define DLL_DIR L"vsmlrt-cuda"
10 | 
11 | #include <windows.h>
12 | 
13 | #include <delayimp.h>
14 | 
15 | #if NV_TENSORRT_VERSION >= 100001
16 | #define TO_STRING(x) #x
17 | #define CONCAT_VERSION(name, version) (name "_" TO_STRING(version) ".dll")
18 | #endif // NV_TENSORRT_VERSION >= 100001
19 | 
20 | namespace {
21 | std::vector<std::wstring> dlls = {
22 |     // This list must be sorted by dependency.
23 | #if NV_TENSORRT_VERSION >= 100001
24 | #ifdef USE_NVINFER_PLUGIN
25 |     // nvinfer_plugin dependencies
26 |     CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR),
27 |     CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR),
28 | #endif // USE_NVINFER_PLUGIN
29 |     // Finally, nvinfer again.
30 |     CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last
31 | #else // NV_TENSORRT_VERSION >= 100001
32 | #ifdef USE_NVINFER_PLUGIN
33 |     // nvinfer_plugin dependencies
34 |     L"nvinfer.dll",
35 |     L"nvinfer_plugin.dll",
36 | #endif // USE_NVINFER_PLUGIN
37 |     // Finally, nvinfer again.
38 |     L"nvinfer.dll", // must be the last
39 | #endif // NV_TENSORRT_VERSION >= 100001
40 | };
41 | 
42 | namespace fs = std::filesystem;
43 | static fs::path dllDir() {
44 |     static const std::wstring res = []() -> std::wstring {
45 |         HMODULE mod = 0;
46 |         if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) {
47 |             std::vector<wchar_t> buf;
48 |             size_t n = 0;
49 |             do {
50 |                 buf.resize(buf.size() + MAX_PATH);
51 |                 n = GetModuleFileNameW(mod, buf.data(), static_cast<DWORD>(buf.size()));
52 |             } while (n >= buf.size());
53 |             buf.resize(n);
54 |             std::wstring path(buf.begin(), buf.end());
55 |             return path;
56 |         }
57 |         throw std::runtime_error("unable to locate myself");
58 |     }();
59 |     return fs::path(res).parent_path();
60 | }
61 | 
62 | FARPROC loadDLLs() {
63 |     fs::path dir = dllDir() / DLL_DIR;
64 |     HMODULE h = nullptr;
65 |     for (const auto dll: dlls) {
66 |         fs::path p = dir / dll;
67 |         std::wstring s = p;
68 |         h = LoadLibraryW(s.c_str());
69 |         DWORD err = GetLastError();
70 |         if (getenv("VSTRT_VERBOSE"))
71 |             std::wcerr << L"vstrt: preloading " << p << L": " << h << std::endl;
72 |         if (!h)
73 |             std::wcerr << L"vstrt: failed to preload " << s << L", errno " << err << std::endl;
74 |     }
75 |     return (FARPROC)h;
76 | }
77 | 
78 | #if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
79 | static void * dummy() { // mimic getPluginRegistry
80 | #else
81 | static int dummy() { // mimic getInferLibVersion
82 | #endif
83 |     return 0;
84 | }
85 | 
86 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) {
87 |     switch (reason) {
88 |     case dliNoteStartProcessing:
89 |     case dliNoteEndProcessing:
90 |         // Nothing to do here.
91 |         break;
92 |     case dliNotePreLoadLibrary:
93 |         //std::cerr << "loading " << info->szDll << std::endl;
94 |         loadDLLs();
95 |         return (FARPROC)LoadLibraryA(info->szDll);
96 |     case dliNotePreGetProcAddress:
97 |         // Nothing to do here.
98 |         break;
99 |     case dliFailLoadLib:
100 |     case dliFailGetProc:
101 |         // Returning NULL from error notifications will cause the delay load
102 |         // runtime to raise a VcppException structured exception, that some code
103 |         // might want to handle.
104 |         //return NULL;
105 |         // The SE will crash the process, so instead we return a dummy function.
106 |         return (FARPROC)dummy;
107 |         break;
108 |     default:
109 |         abort(); // unreachable.
110 |         break;
111 |     }
112 |     // Returning NULL causes the delay load machinery to perform default
113 |     // processing for this notification.
114 |     return NULL;
115 | }
116 | } // namespace
117 | 
118 | extern "C" {
119 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook;
120 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook;
121 | };
122 | #endif // _MSC_VER
123 | 
--------------------------------------------------------------------------------