├── .github
│   └── workflows
│       ├── linux-migx.yml
│       ├── linux-ncnn.yml
│       ├── linux-ort.yml
│       ├── linux-ov-arm64.yml
│       ├── linux-ov.yml
│       ├── linux-trt-arm64.yml
│       ├── linux-trt.yml
│       ├── macos-ort.yml
│       ├── windows-cuda-dependency.yml
│       ├── windows-hip-dependency.yml
│       ├── windows-migx.yml
│       ├── windows-ncnn.yml
│       ├── windows-ort.yml
│       ├── windows-ov.yml
│       ├── windows-release.yml
│       └── windows-trt.yml
├── LICENSE
├── README.md
├── common
│   ├── convert_float_to_float16.cpp
│   ├── convert_float_to_float16.h
│   ├── onnx_utils.cpp
│   └── onnx_utils.h
├── scripts
│   └── vsmlrt.py
├── vsmigx
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h.in
│   ├── vs_migraphx.cpp
│   └── win32.cpp
├── vsncnn
│   ├── CMakeLists.txt
│   ├── config.h.in
│   ├── onnx2ncnn.cpp
│   ├── onnx2ncnn.hpp
│   └── vs_ncnn.cpp
├── vsort
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h.in
│   ├── vs_onnxruntime.cpp
│   └── win32.cpp
├── vsov
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── config.h.in
│   ├── vs_openvino.cpp
│   └── win32.cpp
└── vstrt
    ├── CMakeLists.txt
    ├── README.md
    ├── config.h.in
    ├── cuda_helper.h
    ├── cuda_utils.h
    ├── inference_helper.h
    ├── longpath.manifest
    ├── trt_utils.h
    ├── trtexec
    │   ├── CMakeLists.txt
    │   ├── logfile.cpp
    │   └── trtexec.patch
    ├── utils.h
    ├── vs_tensorrt.cpp
    └── win32.cpp

/.github/workflows/linux-migx.yml:
--------------------------------------------------------------------------------
1 | name: Build (Linux-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/linux-migx.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vsmigx 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup HIP and MIGraphX 34 | run: | 35 | wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null 36 | echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3.1 noble main" | sudo tee --append /etc/apt/sources.list.d/rocm.list 37 | echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600 38 | sudo apt update 39 | sudo apt install -y hip-runtime-amd rocm-device-libs migraphx-dev hipcc 40 | ls -R /opt/rocm 41 | 42 | - name: Configure 43 | run: cmake -S . 
-B build -G Ninja -Wno-dev -LA 44 | -D CMAKE_BUILD_TYPE=Release 45 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 46 | -D CMAKE_CXX_COMPILER=g++-13 47 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 48 | -D migraphx_DIR=/opt/rocm/lib/cmake/migraphx 49 | -D MIOpen_DIR=/opt/rocm/lib/cmake/miopen 50 | -D hip_DIR=/opt/rocm/lib/cmake/hip 51 | -D AMDDeviceLibs_DIR=/opt/rocm/lib/cmake/AMDDeviceLibs 52 | -D amd_comgr_DIR=/opt/rocm/lib/cmake/amd_comgr 53 | -D hsa-runtime64_DIR=/opt/rocm/lib/cmake/hsa-runtime64 54 | -D rocblas_DIR=/opt/rocm/lib/cmake/rocblas 55 | -D hipblaslt_DIR=/opt/rocm/lib/cmake/hipblaslt 56 | -D CMAKE_PREFIX_PATH=/opt/rocm/lib/cmake 57 | 58 | - name: Build 59 | run: cmake --build build --verbose 60 | 61 | - name: Install 62 | run: cmake --install build --prefix install 63 | 64 | - name: Prepare for upload 65 | run: | 66 | mkdir artifact 67 | cp -v install/lib/*.so artifact 68 | 69 | - name: Describe 70 | run: git describe --tags --long 71 | 72 | - name: Upload 73 | uses: actions/upload-artifact@v4 74 | with: 75 | name: VSMIGX-Linux-x64 76 | path: vsmigx/artifact 77 | overwrite: true 78 | 79 | -------------------------------------------------------------------------------- /.github/workflows/linux-ncnn.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-NCNN) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsncnn/**' 8 | - '.github/workflows/linux-ncnn.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsncnn 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsncnn/protobuf/install 33 | key: ${{ runner.os }}-vsncnn-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | ref: v3.21.12 41 | fetch-depth: 1 42 | path: vsncnn/protobuf 43 | 44 | - name: Configure protobuf 45 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 49 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 50 | 51 | - name: Build protobuf 52 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 53 | run: cmake --build protobuf/build_rel --verbose 54 | 55 | - name: Install protobuf 56 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 57 | run: cmake --install protobuf/build_rel --prefix protobuf/install 58 | 59 | - name: Cache onnx 60 | id: cache-onnx 61 | uses: actions/cache@v4 62 | with: 63 | path: vsncnn/onnx/install 64 | key: ${{ runner.os }}-vsncnn-onnx-v1 65 | 66 | - name: Checkout onnx 67 | if: steps.cache-onnx.outputs.cache-hit != 'true' 68 | uses: actions/checkout@v4 69 | with: 70 | repository: onnx/onnx 71 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c 72 | fetch-depth: 1 73 | path: vsncnn/onnx 74 | 75 | - name: Configure onnx 76 | if: steps.cache-onnx.outputs.cache-hit != 'true' 77 | run: cmake -S onnx -B onnx/build -G Ninja -LA 78 | -D CMAKE_BUILD_TYPE=Release 79 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 80 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 81 | -D 
Protobuf_LITE_LIBRARY=protobuf/install/lib 82 | -D Protobuf_LIBRARIES=protobuf/install/lib 83 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 84 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 85 | 86 | - name: Build onnx 87 | if: steps.cache-onnx.outputs.cache-hit != 'true' 88 | run: cmake --build onnx/build --verbose 89 | 90 | - name: Install onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --install onnx/build --prefix onnx/install 93 | 94 | - name: Download VapourSynth headers 95 | run: | 96 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 97 | unzip -q vs.zip 98 | mv vapoursynth*/ vapoursynth 99 | 100 | - name: Download NCNN Precompilation 101 | run: | 102 | curl -s -o ncnn.zip -LJO https://github.com/Tencent/ncnn/releases/download/20250503/ncnn-20250503-ubuntu-2404.zip 103 | unzip -q ncnn.zip 104 | 105 | - name: Configure 106 | run: cmake -S . -B build -G Ninja -LA 107 | -D CMAKE_BUILD_TYPE=Release 108 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth/include 109 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 110 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 111 | -D ncnn_DIR=ncnn-20250503-ubuntu-2404/lib/cmake/ncnn 112 | -D CMAKE_CXX_STANDARD=20 113 | 114 | - name: Build 115 | run: cmake --build build --verbose 116 | 117 | - name: Install 118 | run: cmake --install build --prefix install 119 | 120 | - name: Prepare for upload 121 | run: | 122 | mkdir artifact 123 | cp -v install/lib/*.so artifact 124 | 125 | - name: Describe 126 | run: git describe --tags --long 127 | 128 | - name: Upload 129 | uses: actions/upload-artifact@v4 130 | with: 131 | name: vsncnn-linux-x64 132 | path: vsncnn/artifact 133 | -------------------------------------------------------------------------------- /.github/workflows/linux-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/linux-ort.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-22.04 14 | 15 | defaults: 16 | run: 17 | working-directory: vsort 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: pip install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsort/protobuf/install 33 | key: ${{ runner.os }}-vsort-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | # follows protobuf in https://github.com/microsoft/onnxruntime/blob/v1.17.1/cmake/external/onnxruntime_external_deps.cmake#L183 41 | # if you change this, remember to bump the version of the cache key. 
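# A quick way to re-check this pin against the onnxruntime file referenced above
# (raw URL derived from that link; the command is an illustration and is not
# executed in this workflow):
#   curl -sL https://raw.githubusercontent.com/microsoft/onnxruntime/v1.17.1/cmake/external/onnxruntime_external_deps.cmake | grep -in protobuf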
42 | ref: v3.21.12 43 | fetch-depth: 1 44 | path: vsort/protobuf 45 | 46 | - name: Configure protobuf 47 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 48 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 49 | -D CMAKE_BUILD_TYPE=Release 50 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 51 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 52 | 53 | - name: Build protobuf 54 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 55 | run: cmake --build protobuf/build_rel --verbose 56 | 57 | - name: Install protobuf 58 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 59 | run: cmake --install protobuf/build_rel --prefix protobuf/install 60 | 61 | - name: Cache onnx 62 | id: cache-onnx 63 | uses: actions/cache@v4 64 | with: 65 | path: vsort/onnx/install 66 | key: ${{ runner.os }}-vsort-onnx-v1 67 | 68 | - name: Checkout onnx 69 | if: steps.cache-onnx.outputs.cache-hit != 'true' 70 | uses: actions/checkout@v4 71 | with: 72 | repository: onnx/onnx 73 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/v1.17.1/cmake/external 74 | # if you change this, remember to bump the version of the cache key. 75 | ref: b86cc54efce19530fb953e4b21f57e6b3888534c 76 | fetch-depth: 1 77 | path: vsort/onnx 78 | 79 | - name: Configure onnx 80 | if: steps.cache-onnx.outputs.cache-hit != 'true' 81 | run: cmake -S onnx -B onnx/build -G Ninja -LA 82 | -D CMAKE_BUILD_TYPE=Release 83 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 84 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 85 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 86 | -D Protobuf_LIBRARIES=protobuf/install/lib 87 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 88 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup ONNX Runtime 105 | run: | 106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.17.1/onnxruntime-linux-x64-cuda12-1.17.1.tgz 107 | tar -xf ort.tgz 108 | mv onnxruntime-* onnxruntime -v 109 | 110 | - name: Setup CUDA 111 | run: | 112 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb 113 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 114 | sudo apt-get update 115 | sudo apt-get install -y cuda-nvcc-12-1 cuda-cudart-dev-12-1 116 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 117 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 118 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 119 | 120 | - name: Configure 121 | run: cmake -S . 
-B build -G Ninja -LA 122 | -D CMAKE_BUILD_TYPE=Release 123 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 124 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 125 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include 126 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib 127 | -D ENABLE_CUDA=1 128 | -D CUDAToolkit_ROOT=/usr/local/cuda 129 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 130 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 131 | -D CMAKE_CXX_STANDARD=20 132 | 133 | - name: Build 134 | run: cmake --build build --verbose 135 | 136 | - name: Install 137 | run: cmake --install build --prefix install 138 | 139 | - name: Prepare for upload 140 | run: | 141 | mkdir artifact 142 | cp -v install/lib/*.so artifact 143 | 144 | - name: Describe 145 | run: git describe --tags --long 146 | 147 | - name: Upload 148 | uses: actions/upload-artifact@v4 149 | with: 150 | name: vsort-linux-x64-cuda12.1 151 | path: vsort/artifact 152 | -------------------------------------------------------------------------------- /.github/workflows/linux-ov-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-OV, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsov/**' 7 | - '.github/workflows/linux-ov-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vsov 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Cache protobuf 28 | id: cache-protobuf 29 | uses: actions/cache@v4 30 | with: 31 | path: vsov/protobuf/install 32 | key: ${{ runner.os }}-vsov-protobuf-arm64-v1 33 | 34 | - name: Checkout protobuf 35 | uses: actions/checkout@v4 36 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 37 | with: 38 | repository: protocolbuffers/protobuf 39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf 40 | # if you change this, remember to bump the version of the cache key. 41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-arm64-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 
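# The sha below is the commit the onnx submodule points at in the 2024.6.0 tag;
# one hedged way to read it without cloning is the GitHub contents API (the
# submodule path is an assumption, and the command is illustrative only):
#   curl -s "https://api.github.com/repos/openvinotoolkit/openvino/contents/thirdparty/onnx/onnx?ref=2024.6.0" | grep '"sha"'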
74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu20_2024.6.0.17404.4c0f47d2335_arm64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . -B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-ARM64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/linux-ov.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-OV) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsov/**' 7 | - '.github/workflows/linux-ov.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-22.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vsov 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja 26 | 27 | - name: Cache protobuf 28 | id: cache-protobuf 29 | uses: actions/cache@v4 30 | with: 31 | path: vsov/protobuf/install 32 | key: ${{ runner.os }}-vsov-protobuf-v1 33 | 34 | - name: Checkout protobuf 35 | uses: actions/checkout@v4 36 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 37 | with: 38 | repository: protocolbuffers/protobuf 39 | # follows protobuf in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/protobuf 40 | # if you change this, remember to bump the version of the cache key. 
41 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 42 | fetch-depth: 1 43 | path: vsov/protobuf 44 | 45 | - name: Configure protobuf 46 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 47 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 48 | -D CMAKE_BUILD_TYPE=Release 49 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 50 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsov/onnx/install 65 | key: ${{ runner.os }}-vsov-onnx-v1 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/openvinotoolkit/openvino/tree/2024.6.0/thirdparty/onnx 73 | # if you change this, remember to bump the version of the cache key. 74 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 75 | fetch-depth: 1 76 | path: vsov/onnx 77 | 78 | - name: Configure onnx 79 | if: steps.cache-onnx.outputs.cache-hit != 'true' 80 | run: cmake -S onnx -B onnx/build -G Ninja -LA 81 | -D CMAKE_BUILD_TYPE=Release 82 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 83 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 84 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 85 | -D Protobuf_LIBRARIES=protobuf/install/lib 86 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 88 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake --install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup OpenVINO 105 | run: | 106 | curl -L -o ov.tgz https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/linux/l_openvino_toolkit_ubuntu24_2024.6.0.17404.4c0f47d2335_x86_64.tgz 107 | tar -xf ov.tgz 108 | mv l_openvino_* openvino -v 109 | 110 | - name: Configure 111 | run: cmake -S . 
-B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D OpenVINO_DIR=openvino/runtime/cmake 116 | -D ENABLE_VISUALIZATION=ON 117 | -D WIN32_SHARED_OPENVINO=ON 118 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 119 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 120 | 121 | - name: Build 122 | run: cmake --build build --verbose 123 | 124 | - name: Install 125 | run: cmake --install build --prefix install 126 | 127 | - name: Prepare for upload 128 | run: | 129 | mkdir artifact 130 | cp -v install/lib/*.so artifact 131 | 132 | - name: Describe 133 | run: git describe --tags --long 134 | 135 | - name: Upload 136 | uses: actions/upload-artifact@v4 137 | with: 138 | name: VSOV-Linux-x64 139 | path: vsov/artifact 140 | 141 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt-arm64.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04-arm 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.8.0.43-1+cuda12.8 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-cudart-dev-12-8 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-ARM64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/linux-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/linux-trt.yml' 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build-linux: 12 | runs-on: ubuntu-24.04 13 | 14 | defaults: 15 | run: 16 | working-directory: vstrt 17 | 18 | steps: 19 | - name: Checkout repo 20 | uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Setup Ninja 25 | run: pip install ninja --break-system-packages 26 | 27 | - name: Download VapourSynth headers 28 | run: | 29 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 30 | unzip -q vs.zip 31 | mv vapoursynth*/ vapoursynth 32 | 33 | - name: Setup CUDA and TensorRT 34 | run: | 35 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb 36 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 37 | sudo apt-get update 38 | export TRT_VER=10.9.0.34-1+cuda12.8 39 | sudo apt-get install -y --no-install-recommends cuda-nvcc-12-8 cuda-cudart-dev-12-8 libnvinfer-dev=${TRT_VER} libnvinfer-headers-dev=${TRT_VER} libnvinfer10=${TRT_VER} 40 | echo "PATH=/usr/local/cuda/bin${PATH:+:${PATH}}" >> $GITHUB_ENV 41 | echo "CUDA_PATH=/usr/local/cuda" >> $GITHUB_ENV 42 | echo "LD_LIBRARY_PATH=/usr/local/cuda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" >> $GITHUB_ENV 43 | 44 | - name: Configure 45 | run: cmake -S . 
-B build -G Ninja -LA 46 | -D CMAKE_BUILD_TYPE=Release 47 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 48 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -march=x86-64-v3" 49 | 50 | - name: Build 51 | run: cmake --build build --verbose 52 | 53 | - name: Install 54 | run: cmake --install build --prefix install 55 | 56 | - name: Prepare for upload 57 | run: | 58 | mkdir artifact 59 | cp -v install/lib/*.so artifact 60 | 61 | - name: Describe 62 | run: git describe --tags --long 63 | 64 | - name: Upload 65 | uses: actions/upload-artifact@v4 66 | with: 67 | name: VSTRT-Linux-x64 68 | path: vstrt/artifact 69 | overwrite: true 70 | 71 | -------------------------------------------------------------------------------- /.github/workflows/macos-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (macOS-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/macos-ort.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-macos: 13 | runs-on: macos-14 14 | 15 | defaults: 16 | run: 17 | working-directory: vsort 18 | 19 | steps: 20 | - name: Checkout repo 21 | uses: actions/checkout@v4 22 | with: 23 | fetch-depth: 0 24 | 25 | - name: Setup Ninja 26 | run: brew install ninja 27 | 28 | - name: Cache protobuf 29 | id: cache-protobuf 30 | uses: actions/cache@v4 31 | with: 32 | path: vsort/protobuf/install 33 | key: ${{ runner.os }}-vsort-protobuf-v1 34 | 35 | - name: Checkout protobuf 36 | uses: actions/checkout@v4 37 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 38 | with: 39 | repository: protocolbuffers/protobuf 40 | ref: v3.21.12 41 | fetch-depth: 1 42 | path: vsort/protobuf 43 | 44 | - name: Configure protobuf 45 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 46 | run: cmake -S protobuf/cmake -B protobuf/build_rel -G Ninja -LA 47 | -D CMAKE_BUILD_TYPE=Release 48 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 49 | -D protobuf_BUILD_SHARED_LIBS=OFF 50 | -D protobuf_BUILD_TESTS=OFF 51 | 52 | - name: Build protobuf 53 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 54 | run: cmake --build protobuf/build_rel --verbose 55 | 56 | - name: Install protobuf 57 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 58 | run: cmake --install protobuf/build_rel --prefix protobuf/install 59 | 60 | - name: Cache onnx 61 | id: cache-onnx 62 | uses: actions/cache@v4 63 | with: 64 | path: vsort/onnx/install 65 | key: ${{ runner.os }}-vsort-onnx-v2 66 | 67 | - name: Checkout onnx 68 | if: steps.cache-onnx.outputs.cache-hit != 'true' 69 | uses: actions/checkout@v4 70 | with: 71 | repository: onnx/onnx 72 | # follows onnx in https://github.com/microsoft/onnxruntime/tree/main/cmake/external 73 | ref: 595228d99e3977ac27cb79d5963adda262af99ad 74 | fetch-depth: 1 75 | path: vsort/onnx 76 | 77 | - name: Configure onnx 78 | if: steps.cache-onnx.outputs.cache-hit != 'true' 79 | run: cmake -S onnx -B onnx/build -G Ninja -LA 80 | -D CMAKE_BUILD_TYPE=Release 81 | -D CMAKE_POSITION_INDEPENDENT_CODE=ON 82 | -D Protobuf_PROTOC_EXECUTABLE=protobuf/install/bin/protoc 83 | -D Protobuf_LITE_LIBRARY=protobuf/install/lib 84 | -D Protobuf_LIBRARIES=protobuf/install/lib 85 | -D ONNX_USE_LITE_PROTO=ON 86 | -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 87 | -D ONNX_GEN_PB_TYPE_STUBS=OFF 88 | -D ONNX_ML=0 89 | 90 | - name: Build onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | run: cmake --build onnx/build --verbose 93 | 94 | - name: Install onnx 95 | if: steps.cache-onnx.outputs.cache-hit != 'true' 96 | run: cmake 
--install onnx/build --prefix onnx/install 97 | 98 | - name: Download VapourSynth headers 99 | run: | 100 | curl -L -o vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 101 | unzip -q vs.zip 102 | mv vapoursynth*/ vapoursynth 103 | 104 | - name: Setup ONNX Runtime 105 | run: | 106 | curl -L -o ort.tgz https://github.com/microsoft/onnxruntime/releases/download/v1.20.0/onnxruntime-osx-arm64-1.20.0.tgz 107 | tar -xf ort.tgz 108 | mv onnxruntime-* onnxruntime 109 | 110 | - name: Configure 111 | run: cmake -S . -B build -G Ninja -LA 112 | -D CMAKE_BUILD_TYPE=Release 113 | -D CMAKE_CXX_FLAGS="-Wall -ffast-math -mcpu=apple-m1" 114 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="`pwd`/vapoursynth/include" 115 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime/include 116 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime/lib 117 | -D protobuf_DIR=protobuf/install/lib/cmake/protobuf 118 | -D ONNX_DIR=onnx/install/lib/cmake/ONNX 119 | -D CMAKE_CXX_STANDARD=20 120 | -D ENABLE_COREML=ON 121 | 122 | - name: Build 123 | run: cmake --build build --verbose 124 | 125 | - name: Install 126 | run: cmake --install build --prefix install 127 | 128 | - name: Prepare for upload 129 | run: | 130 | mkdir artifact 131 | cp -v install/lib/*.dylib artifact 132 | 133 | - name: Describe 134 | run: git describe --tags --long 135 | 136 | - name: Upload 137 | uses: actions/upload-artifact@v4 138 | with: 139 | name: vsort-macos-arm64 140 | path: vsort/artifact 141 | -------------------------------------------------------------------------------- /.github/workflows/windows-cuda-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-cuda dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2025 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download cuDNN inference library 32 | run: curl -LJ https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-8.9.7.29_cuda12-archive.zip -o cudnn.zip 33 | 34 | - name: Extract cuDNN library 35 | run: unzip cudnn.zip 36 | 37 | - name: Move cuDNN library 38 | run: | 39 | mkdir -p vsmlrt-cuda 40 | mv cudnn-windows-*/bin/*.dll vsmlrt-cuda/ -v 41 | rm vsmlrt-cuda/cudnn_*_train*.dll -v 42 | 43 | - name: Download TensorRT library 44 | run: | 45 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip 46 | 47 | - name: Extract TensorRT library 48 | run: | 49 | unzip trt.zip 50 | mv TensorRT-*/ TensorRT/ 51 | 52 | - name: Move TensorRT library 53 | run: mv TensorRT/lib/*.dll vsmlrt-cuda -v 54 | 55 | - name: Download CUDA Libraries 56 | shell: cmd 57 | run: | 58 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe 59 | cuda_installer.exe -s cudart_12.9 cublas_12.9 cufft_12.9 cupti_12.9 nvrtc_12.9 60 | 61 | - name: Move CUDA Libraries 62 | shell: cmd 63 | run: | 64 | move "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\extras\CUPTI\lib64\cupti*.dll" vsmlrt-cuda 65 | move "C:\Program 
Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\*.dll" vsmlrt-cuda 66 | del vsmlrt-cuda\cudart32*.dll 67 | 68 | - name: Setup VC commands 69 | uses: ilammy/msvc-dev-cmd@v1 70 | with: 71 | arch: amd64 72 | 73 | - name: Copy VC Runtime Libraries 74 | shell: bash 75 | run: | 76 | cd vsmlrt-cuda 77 | while true; do 78 | changed=false 79 | for dll in *.[dD][lL][lL]; do 80 | for dep in $(dumpbin -dependents "$dll" | grep -o -i '\<\(vc\|msvc\)[a-z0-9_-]*\.dll'); do 81 | echo "finding $dep for $dll" 82 | if ! test -f ./"$dep"; then 83 | changed=true 84 | src="$(where "$dep" | grep -i 'MSVC' | head -1)" 85 | echo "copying $src for $dep" 86 | test -f "$src" || exit 1 87 | cp -f "$src" . 88 | fi 89 | done 90 | done 91 | $changed || break 92 | done 93 | 94 | - name: Compress 95 | run: | 96 | 7z a -t7z -bb3 -mx=9 vsmlrt-cuda.7z vsmlrt-cuda 97 | 98 | - name: Upload 99 | uses: actions/upload-artifact@v4 100 | with: 101 | name: vsmlrt-cuda 102 | path: vsmlrt-cuda.7z 103 | retention-days: 1 104 | compression-level: 0 105 | 106 | - name: Rename release asset 107 | run: | 108 | mv vsmlrt-cuda.7z vsmlrt-cuda.${{ github.event.inputs.tag}}.7z 109 | 110 | - name: Release 111 | uses: softprops/action-gh-release@v2 112 | with: 113 | tag_name: ${{ github.event.inputs.tag }} 114 | files: vsmlrt-cuda.${{ github.event.inputs.tag }}.7z 115 | fail_on_unmatched_files: true 116 | generate_release_notes: false 117 | prerelease: true 118 | -------------------------------------------------------------------------------- /.github/workflows/windows-hip-dependency.yml: -------------------------------------------------------------------------------- 1 | name: Upload vs-mlrt-hip dependencies 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to upload to' 8 | required: true 9 | default: 'v100' 10 | workflow_call: 11 | inputs: 12 | tag: 13 | description: 'which tag to upload to' 14 | required: true 15 | type: string 16 | secrets: 17 | REPO_TOKEN: 18 | required: true 19 | 20 | jobs: 21 | build-windows: 22 | runs-on: windows-2022 23 | outputs: 24 | runID: ${{ steps.output.outputs.runID }} 25 | 26 | defaults: 27 | run: 28 | shell: bash 29 | 30 | steps: 31 | - name: Download MIGraphX Precompilation 32 | run: | 33 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 34 | 35 | - name: Extract MIGraphX Precompilation 36 | run: | 37 | unzip migx.zip 38 | 39 | - name: Move MIGraphX Precompilation 40 | run: | 41 | mkdir vsmlrt-hip 42 | mv migraphx/bin/* vsmlrt-hip -v 43 | 44 | - name: Setup VC commands 45 | uses: ilammy/msvc-dev-cmd@v1 46 | with: 47 | arch: amd64 48 | 49 | - name: List Dependencies 50 | shell: bash 51 | run: | 52 | cd vsmlrt-hip 53 | for dll in *.[dD][lL][lL]; do 54 | echo $(dumpbin -dependents "$dll") 55 | done 56 | 57 | - name: Cache HIP 58 | id: cache-hip 59 | uses: actions/cache@v4 60 | with: 61 | path: C:\Program Files\AMD\ROCm 62 | key: ${{ runner.os }}-rocm-6.2.4 63 | 64 | - name: Setup HIP 65 | if: steps.cache-hip.outputs.cache-hit != 'true' 66 | shell: pwsh 67 | run: | 68 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 69 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 70 | 71 | - name: Move HIP Libraries 72 | shell: cmd 73 | run: | 74 | move "C:\Program Files\AMD\ROCm\6.2\bin\amd_comgr_2.dll" vsmlrt-hip 75 | move "C:\Program Files\AMD\ROCm\6.2\bin\amdhip64_6.dll" 
vsmlrt-hip 76 | move "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc0602.dll" vsmlrt-hip 77 | move "C:\Program Files\AMD\ROCm\6.2\bin\hiprtc-builtins0602.dll" vsmlrt-hip 78 | 79 | - name: Compress 80 | run: | 81 | 7z a -t7z -mx=3 vsmlrt-hip.7z vsmlrt-hip 82 | 83 | - name: Upload 84 | uses: actions/upload-artifact@v4 85 | with: 86 | name: vsmlrt-hip 87 | path: vsmlrt-hip.7z 88 | retention-days: 1 89 | compression-level: 0 90 | 91 | - name: Rename release asset 92 | run: | 93 | mv vsmlrt-hip.7z vsmlrt-hip.${{ github.event.inputs.tag}}.7z 94 | 95 | - name: Release 96 | uses: softprops/action-gh-release@v2 97 | with: 98 | tag_name: ${{ github.event.inputs.tag }} 99 | files: vsmlrt-hip.${{ github.event.inputs.tag }}.7z 100 | fail_on_unmatched_files: true 101 | generate_release_notes: false 102 | prerelease: true 103 | -------------------------------------------------------------------------------- /.github/workflows/windows-migx.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-MIGX) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vsmigx/**' 7 | - '.github/workflows/windows-migx.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2022 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vsmigx 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Cache HIP 42 | id: cache-hip 43 | uses: actions/cache@v4 44 | with: 45 | path: C:\Program Files\AMD\ROCm 46 | key: ${{ runner.os }}-rocm-6.2.4 47 | 48 | - name: Setup HIP 49 | if: steps.cache-hip.outputs.cache-hit != 'true' 50 | shell: pwsh 51 | run: | 52 | curl -s -o hip_installer.exe -L https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q4-WinSvr2022-For-HIP.exe 53 | Start-Process hip_installer.exe -ArgumentList '-install' -NoNewWindow -Wait 54 | 55 | - name: Download MIGraphX Precompilation 56 | run: | 57 | curl -s -o migx.zip -LJO https://github.com/AmusementClub/AMDMIGraphX/releases/download/rocm-4.1.0-1730-g6acc1f957-241221-0629/migraphx-win64.zip 58 | unzip -q migx.zip 59 | 60 | - name: Download VapourSynth headers 61 | run: | 62 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 63 | unzip -q vs.zip 64 | mv vapoursynth-*/ vapoursynth/ 65 | 66 | - name: Configure 67 | run: cmake -S . 
-B build -G Ninja -Wno-dev -LA 68 | -D CMAKE_BUILD_TYPE=Release 69 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 70 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%/vapoursynth/include" 71 | -D hip_DIR="C:/Program Files/AMD/ROCm/6.2/lib/cmake/hip" 72 | -D HIP_PLATFORM=amd 73 | -D migraphx_DIR="%cd%/migraphx/lib/cmake/migraphx" 74 | 75 | - name: Build 76 | run: cmake --build build --verbose 77 | 78 | - name: Install 79 | run: cmake --install build --prefix install 80 | 81 | - name: Prepare for upload 82 | run: | 83 | mkdir artifact 84 | copy install\bin\vsmigx.dll artifact\ 85 | 86 | - name: Describe 87 | run: git describe --tags --long 88 | 89 | - name: Dump dependencies 90 | run: dumpbin /dependents artifact/vsmigx.dll 91 | 92 | - name: Upload 93 | uses: actions/upload-artifact@v4 94 | with: 95 | name: VSMIGX-Windows-x64 96 | path: vsmigx/artifact 97 | 98 | - name: Compress artifact for release 99 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 100 | run: | 101 | cd artifact 102 | 7z a -t7z -mx=7 ../../VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z . 103 | 104 | - name: Release 105 | uses: softprops/action-gh-release@v2 106 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 107 | with: 108 | tag_name: ${{ inputs.tag }} 109 | files: VSMIGX-Windows-x64.${{ github.event.inputs.tag }}.7z 110 | fail_on_unmatched_files: true 111 | generate_release_notes: false 112 | prerelease: true 113 | 114 | -------------------------------------------------------------------------------- /.github/workflows/windows-ncnn.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-NCNN) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsncnn/**' 8 | - '.github/workflows/windows-ncnn.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | ncnn_tag: 16 | description: 'which tag of ncnn to use' 17 | required: true 18 | default: 'latest' 19 | type: string 20 | workflow_dispatch: 21 | inputs: 22 | tag: 23 | description: 'which tag to upload to' 24 | default: '' 25 | ncnn_tag: 26 | description: 'which tag of ncnn to use' 27 | required: true 28 | default: 'latest' 29 | type: string 30 | 31 | jobs: 32 | build-windows: 33 | runs-on: windows-2022 34 | 35 | defaults: 36 | run: 37 | shell: cmd 38 | working-directory: vsncnn 39 | 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v4 43 | with: 44 | fetch-depth: 0 45 | 46 | - name: Setup MSVC 47 | uses: ilammy/msvc-dev-cmd@v1 48 | 49 | - name: Cache protobuf 50 | id: cache-protobuf 51 | uses: actions/cache@v4 52 | with: 53 | path: vsncnn/protobuf/install 54 | key: ${{ runner.os }}-vsncnn-protobuf-v3.16.0 55 | 56 | - name: Checkout protobuf 57 | uses: actions/checkout@v4 58 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 59 | with: 60 | repository: protocolbuffers/protobuf 61 | # follows protobuf in https://github.com/onnx/onnx/tree/v1.12.0#windows 62 | # if you change this, remember to bump the version of the cache key of protobuf and onnx. 
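# protobuf, onnx, and the plugin itself are all built against the static MSVC
# runtime below (protobuf_MSVC_STATIC_RUNTIME=ON, ONNX_USE_MSVC_STATIC_RUNTIME=1,
# CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded); mixing /MT and /MD objects fails at
# link time with LNK2038. A hedged post-build sanity check, not part of this CI:
#   dumpbin /dependents install\bin\vsncnn.dll | findstr /i "vcruntime msvcp"
# (no output is expected when the static runtime is used throughout)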
63 | ref: v3.16.0 64 | path: vsncnn/protobuf 65 | 66 | - name: Configure protobuf 67 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 68 | run: cmake -S protobuf\cmake -B protobuf\build_rel -G Ninja -LA 69 | -D CMAKE_BUILD_TYPE=Release 70 | -D protobuf_BUILD_SHARED_LIBS=OFF 71 | -D protobuf_BUILD_TESTS=OFF 72 | -D protobuf_MSVC_STATIC_RUNTIME=ON 73 | -D CMAKE_POLICY_VERSION_MINIMUM=3.5 74 | 75 | - name: Build protobuf 76 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 77 | run: cmake --build protobuf\build_rel --verbose 78 | 79 | - name: Install protobuf 80 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 81 | run: cmake --install protobuf\build_rel --prefix protobuf\install 82 | 83 | - name: Cache onnx 84 | id: cache-onnx 85 | uses: actions/cache@v4 86 | with: 87 | path: vsncnn/onnx/install 88 | key: ${{ runner.os }}-vsncnn-onnx-v1.12.0-protobuf-v3.16.0 89 | 90 | - name: Checkout onnx 91 | if: steps.cache-onnx.outputs.cache-hit != 'true' 92 | uses: actions/checkout@v4 93 | with: 94 | repository: onnx/onnx 95 | ref: v1.12.0 96 | path: vsncnn/onnx 97 | 98 | - name: Configure onnx 99 | if: steps.cache-onnx.outputs.cache-hit != 'true' 100 | run: cmake -S onnx -B onnx\build -G Ninja -LA 101 | -D CMAKE_BUILD_TYPE=Release 102 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 103 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 104 | -D Protobuf_LIBRARIES=protobuf\install\lib 105 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 106 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 107 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 108 | -D CMAKE_POLICY_VERSION_MINIMUM=3.5 109 | 110 | - name: Build onnx 111 | if: steps.cache-onnx.outputs.cache-hit != 'true' 112 | run: cmake --build onnx\build --verbose 113 | 114 | - name: Install onnx 115 | if: steps.cache-onnx.outputs.cache-hit != 'true' 116 | run: cmake --install onnx\build --prefix onnx\install 117 | 118 | - name: Download VapourSynth headers 119 | run: | 120 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 121 | unzip -q vs.zip 122 | mv vapoursynth-*/ vapoursynth/ 123 | 124 | - name: Download NCNN Precompilation 125 | shell: bash 126 | run: | 127 | rev="${{github.event.inputs.ncnn_tag || inputs.ncnn_tag || 'latest'}}" 128 | if [ "$rev" == "latest" ]; then 129 | url="https://github.com/AmusementClub/ncnn/releases/latest/download/ncnn-gpu-x64-windows.zip" 130 | else 131 | url="https://github.com/AmusementClub/ncnn/releases/download/$rev/ncnn-gpu-x64-windows.zip" 132 | fi 133 | curl -s -o ncnn.zip -LJO "$url" 134 | unzip -q ncnn.zip 135 | 136 | # follows vulkan sdk in https://github.com/AmusementClub/ncnn/blob/github-actions/.github/workflows/windows-x64-gpu.yml 137 | - name: Setup Vulkan SDK 138 | shell: pwsh 139 | run: | 140 | Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.275.0/windows/VulkanSDK-1.3.275.0-Installer.exe?Human=true -OutFile VulkanSDK.exe 141 | $installer = Start-Process -FilePath VulkanSDK.exe -Wait -PassThru -ArgumentList "--accept-licenses --default-answer --confirm-command install"; 142 | $installer.WaitForExit(); 143 | 144 | - name: Configure 145 | run: cmake -S . 
-B build -G Ninja -LA 146 | -D CMAKE_BUILD_TYPE=Release 147 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 148 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 149 | -D protobuf_DIR=protobuf\install\cmake 150 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 151 | -D ncnn_DIR=ncnn\lib\cmake\ncnn 152 | -D CMAKE_CXX_STANDARD=20 153 | env: 154 | VULKAN_SDK: C:\VulkanSDK\1.3.275.0 155 | 156 | - name: Build 157 | run: cmake --build build --verbose 158 | 159 | - name: Install 160 | run: | 161 | cmake --install build --prefix install 162 | mkdir artifact 163 | copy install\bin\vsncnn.dll artifact\ 164 | 165 | - name: Upload 166 | uses: actions/upload-artifact@v4 167 | with: 168 | name: VSNCNN-GPU-Windows-x64 169 | path: vsncnn/artifact 170 | 171 | - name: Setup Python portable 172 | run: | 173 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 174 | 7z x python.zip -ovs_portable 175 | 176 | - name: Install VapourSynth portable 177 | run: | 178 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 179 | 7z x vs.7z -ovs_portable -y 180 | 181 | - name: Copy plugin & swiftshader 182 | run: | 183 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 184 | copy ncnn\tests\* vs_portable\ 185 | 186 | - name: Install waifu2x model 187 | run: | 188 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 189 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 190 | 191 | - name: Download x265 192 | run: | 193 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 194 | 7z x x265.7z -ovs_portable\ 195 | 196 | - name: Create script 197 | shell: bash 198 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 199 | 200 | - name: Run vspipe 201 | shell: bash 202 | run: | 203 | set -ex 204 | vs_portable/vspipe -i test.vpy - 205 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 206 | ls -l out.hevc x265.log 207 | cat x265.log 208 | grep -F 'encoded 10 frames' x265.log || exit 2 209 | grep -i 'error' x265.log && exit 1 210 | exit 0 211 | 212 | - name: Create script (flexible output) 213 | shell: bash 214 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ncnn, core.ncnn.Version(), file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 215 | 216 | - name: Run vspipe (flexible output) 217 | shell: bash 218 | run: | 219 | set -ex 220 | vs_portable/vspipe -i test_flexible_output.vpy - 221 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 
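# The test_flexible_output.vpy one-liner from the previous step, unrolled here
# for readability (same logic, transcribed as a sketch; not executed in this step):
#   prop = 'test'
#   output = core.std.BlankClip(format=vs.RGBS, width=127, height=63).ncnn.Model(
#       r"waifu2x\upconv_7_anime_style_art_rgb\scale2.0x_model.onnx",
#       builtin=True, flexible_output_prop=prop)
#   planes = [output['clip'].std.PropToClip(prop=f'{prop}{i}')
#             for i in range(output['num_planes'])]
#   core.std.ShufflePlanes(planes, [0, 0, 0], vs.RGB).resize.Bicubic(
#       format=vs.YUV420P10, matrix_s='709').set_output()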
222 | ls -l out.hevc x265.log 223 | cat x265.log 224 | grep -F 'encoded 10 frames' x265.log || exit 2 225 | grep -i 'error' x265.log && exit 1 226 | exit 0 227 | 228 | - name: Describe 229 | run: git describe --tags --long 230 | 231 | - name: Dump dependencies 232 | run: dumpbin /dependents artifact\vsncnn.dll 233 | 234 | - name: Compress artifact for release 235 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 236 | run: | 237 | cd artifact 238 | 7z a -t7z -mx=7 ../../VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z . 239 | 240 | - name: Release 241 | uses: softprops/action-gh-release@v2 242 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 243 | with: 244 | tag_name: ${{ inputs.tag }} 245 | files: VSNCNN-Windows-x64.${{ github.event.inputs.tag }}.7z 246 | fail_on_unmatched_files: true 247 | generate_release_notes: false 248 | prerelease: true 249 | -------------------------------------------------------------------------------- /.github/workflows/windows-ort.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-ORT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsort/**' 8 | - '.github/workflows/windows-ort.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | workflow_dispatch: 16 | inputs: 17 | tag: 18 | description: 'which tag to upload to' 19 | default: '' 20 | 21 | jobs: 22 | build-windows: 23 | runs-on: windows-2022 24 | 25 | defaults: 26 | run: 27 | shell: cmd 28 | working-directory: vsort 29 | 30 | steps: 31 | - name: Checkout repo 32 | uses: actions/checkout@v4 33 | with: 34 | fetch-depth: 0 35 | 36 | - name: Setup MSVC 37 | uses: ilammy/msvc-dev-cmd@v1 38 | 39 | - name: Setup Ninja 40 | run: pip install ninja 41 | 42 | - name: Cache protobuf 43 | id: cache-protobuf 44 | uses: actions/cache@v4 45 | with: 46 | path: vsort/protobuf/install 47 | key: ${{ runner.os }}-vsort-protobuf-v4 48 | 49 | - name: Checkout protobuf 50 | uses: actions/checkout@v4 51 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 52 | with: 53 | repository: protocolbuffers/protobuf 54 | # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L203 55 | # if you change this, remember to bump the version of the cache key. 56 | ref: v3.21.12 57 | fetch-depth: 1 58 | path: vsort/protobuf 59 | 60 | - name: Configure protobuf 61 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 62 | run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA 63 | -D CMAKE_BUILD_TYPE=Release 64 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 65 | 66 | - name: Build protobuf 67 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 68 | run: cmake --build protobuf\build_rel --verbose 69 | 70 | - name: Install protobuf 71 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 72 | run: cmake --install protobuf\build_rel --prefix protobuf\install 73 | 74 | - name: Cache onnx 75 | id: cache-onnx 76 | uses: actions/cache@v4 77 | with: 78 | path: vsort/onnx/install 79 | key: ${{ runner.os }}-vsort-onnx-v5 80 | 81 | - name: Checkout onnx 82 | if: steps.cache-onnx.outputs.cache-hit != 'true' 83 | uses: actions/checkout@v4 84 | with: 85 | repository: onnx/onnx 86 | # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external 87 | # if you change this, remember to bump the version of the cache key. 
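# onnx is tracked as a git submodule of that onnxruntime fork, so the sha to pin
# can also be read from a local clone of the superproject (illustrative command,
# not run in this workflow):
#   git ls-tree HEAD cmake/external/onnx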
88 | ref: 990217f043af7222348ca8f0301e17fa7b841781 89 | fetch-depth: 1 90 | path: vsort/onnx 91 | 92 | - name: Configure onnx 93 | if: steps.cache-onnx.outputs.cache-hit != 'true' 94 | run: cmake -S onnx -B onnx\build -G Ninja -LA 95 | -D CMAKE_BUILD_TYPE=Release 96 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 97 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 98 | -D Protobuf_LIBRARIES=protobuf\install\lib 99 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 100 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 101 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 102 | 103 | - name: Build onnx 104 | if: steps.cache-onnx.outputs.cache-hit != 'true' 105 | run: cmake --build onnx\build --verbose 106 | 107 | - name: Install onnx 108 | if: steps.cache-onnx.outputs.cache-hit != 'true' 109 | run: cmake --install onnx\build --prefix onnx\install 110 | 111 | - name: Download VapourSynth headers 112 | run: | 113 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 114 | unzip -q vs.zip 115 | mv vapoursynth-*/ vapoursynth/ 116 | 117 | - name: Download ONNX Runtime Precompilation 118 | run: | 119 | curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-8036-geb41d57f21-240425-0428/onnxruntime-gpu-win64.zip 120 | unzip -q ortgpu.zip 121 | 122 | - name: Cache CUDA 123 | id: cache-cuda 124 | uses: actions/cache@v4 125 | with: 126 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 127 | key: ${{ runner.os }}-cuda-12.4.1 128 | 129 | - name: Setup CUDA 130 | if: steps.cache-cuda.outputs.cache-hit != 'true' 131 | run: | 132 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.4.1/network_installers/cuda_12.4.1_windows_network.exe 133 | cuda_installer.exe -s nvcc_12.4 cudart_12.4 134 | 135 | - name: Configure 136 | run: cmake -S . 
-B build -G Ninja -LA 137 | -D CMAKE_BUILD_TYPE=Release 138 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 139 | -D VAPOURSYNTH_INCLUDE_DIRECTORY=vapoursynth\include 140 | -D protobuf_DIR=protobuf\install\cmake 141 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 142 | -D ONNX_RUNTIME_API_DIRECTORY=onnxruntime-gpu\include\onnxruntime 143 | -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib 144 | -D ENABLE_CUDA=1 145 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" 146 | -D ENABLE_DML=1 147 | -D CMAKE_CXX_STANDARD=20 148 | 149 | - name: Build 150 | run: cmake --build build --verbose 151 | 152 | - name: Install 153 | run: | 154 | cmake --install build --prefix install 155 | mkdir artifact 156 | mkdir artifact\vsort 157 | copy install\bin\vsort.dll artifact\ 158 | copy onnxruntime-gpu\bin\*.dll artifact\vsort\ 159 | copy onnxruntime-gpu\lib\*.dll artifact\vsort\ 160 | 161 | - name: Download DirectML Library 162 | # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44 163 | run: | 164 | curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.14.1 165 | unzip -q directml.nupkg -d dml 166 | copy dml\bin\x64-win\DirectML.dll artifact\vsort\ 167 | 168 | - name: Upload 169 | uses: actions/upload-artifact@v4 170 | with: 171 | name: VSORT-Windows-x64 172 | path: vsort/artifact 173 | 174 | - name: Setup Python portable 175 | run: | 176 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.10/python-3.9.10-embed-amd64.zip 177 | 7z x python.zip -ovs_portable 178 | 179 | - name: Install VapourSynth portable 180 | run: | 181 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 182 | 7z x vs.7z -ovs_portable -y 183 | 184 | - name: Copy plugin 185 | run: | 186 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 187 | mkdir vs_portable\vapoursynth64\plugins\vsort\ 188 | copy artifact\vsort\*.dll vs_portable\vapoursynth64\plugins\vsort\ 189 | 190 | - name: Install waifu2x model 191 | run: | 192 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 193 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 194 | 195 | - name: Download x265 196 | run: | 197 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 198 | 7z x x265.7z -ovs_portable\ 199 | 200 | - name: Create script 201 | shell: bash 202 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 203 | 204 | - name: Run vspipe 205 | shell: bash 206 | run: | 207 | set -ex 208 | vs_portable/vspipe -i test.vpy - 209 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 210 | ls -l out.hevc x265.log 211 | cat x265.log 212 | grep -F 'encoded 10 frames' x265.log || exit 2 213 | grep -i 'error' x265.log && exit 1 214 | exit 0 215 | 216 | - name: Create script (fp16) 217 | shell: bash 218 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, 
file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 219 | 220 | - name: Run vspipe (fp16) 221 | shell: bash 222 | run: | 223 | set -ex 224 | vs_portable/vspipe -i test_fp16.vpy - 225 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 226 | ls -l out.hevc x265.log 227 | cat x265.log 228 | grep -F 'encoded 10 frames' x265.log || exit 2 229 | grep -i 'error' x265.log && exit 1 230 | exit 0 231 | 232 | - name: Create script (fp16 input) 233 | shell: bash 234 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBH).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_input.vpy 235 | 236 | - name: Run vspipe (fp16 input) 237 | shell: bash 238 | run: | 239 | set -ex 240 | vs_portable/vspipe -i test_fp16_input.vpy - 241 | vs_portable/vspipe --y4m -p -e 9 test_fp16_input.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 242 | ls -l out.hevc x265.log 243 | cat x265.log 244 | grep -F 'encoded 10 frames' x265.log || exit 2 245 | grep -i 'error' x265.log && exit 1 246 | exit 0 247 | 248 | - name: Create script (fp16 output) 249 | shell: bash 250 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);flt=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True, output_format=1);print(flt,file=sys.stderr);flt.resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16_output.vpy 251 | 252 | - name: Run vspipe (fp16 output) 253 | shell: bash 254 | run: | 255 | set -ex 256 | vs_portable/vspipe -i test_fp16_output.vpy - 257 | vs_portable/vspipe --y4m -p -e 9 test_fp16_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 258 | ls -l out.hevc x265.log 259 | cat x265.log 260 | grep -F 'encoded 10 frames' x265.log || exit 2 261 | grep -i 'error' x265.log && exit 1 262 | exit 0 263 | 264 | - name: Create script (flexible output) 265 | shell: bash 266 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ort, file=sys.stderr);print(core.ort.Version(),file=sys.stderr);prop='test';output=core.std.BlankClip(format=vs.RGBS).ort.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output['clip'].std.PropToClip(prop=f'{prop}{i}') for i in range(output['num_planes'])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 267 | 268 | - name: Run vspipe (flexible output) 269 | shell: bash 270 | run: | 271 | set -ex 272 | vs_portable/vspipe -i test_flexible_output.vpy - 273 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 274 | ls -l out.hevc x265.log 275 | cat x265.log 276 | grep -F 'encoded 10 frames' x265.log || exit 2 277 | 
grep -i 'error' x265.log && exit 1 278 | exit 0 279 | 280 | - name: Describe 281 | run: git describe --tags --long 282 | 283 | - name: Dump dependencies 284 | run: dumpbin /dependents artifact\vsort.dll 285 | 286 | - name: Compress artifact for release 287 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 288 | run: | 289 | cd artifact 290 | 7z a -t7z -mx=7 ../../VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z . 291 | 292 | - name: Release 293 | uses: softprops/action-gh-release@v2 294 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 295 | with: 296 | tag_name: ${{ inputs.tag }} 297 | files: VSORT-Windows-x64.${{ github.event.inputs.tag }}.7z 298 | fail_on_unmatched_files: true 299 | generate_release_notes: false 300 | prerelease: true 301 | -------------------------------------------------------------------------------- /.github/workflows/windows-ov.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-OV) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'common/**' 7 | - 'vsov/**' 8 | - '.github/workflows/windows-ov.yml' 9 | workflow_call: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | required: true 14 | type: string 15 | ov_tag: 16 | description: 'which tag of openvino to use' 17 | required: true 18 | default: 'latest' 19 | type: string 20 | workflow_dispatch: 21 | inputs: 22 | tag: 23 | description: 'which tag to upload to' 24 | default: '' 25 | ov_tag: 26 | description: 'which tag of openvino to use' 27 | required: true 28 | default: 'latest' 29 | type: string 30 | 31 | jobs: 32 | build-windows: 33 | runs-on: windows-2022 34 | 35 | defaults: 36 | run: 37 | shell: cmd 38 | working-directory: vsov 39 | 40 | steps: 41 | - name: Checkout repo 42 | uses: actions/checkout@v4 43 | with: 44 | fetch-depth: 0 45 | 46 | - name: Setup MSVC 47 | uses: ilammy/msvc-dev-cmd@v1 48 | 49 | - name: Setup Ninja 50 | run: pip install ninja 51 | 52 | - name: Cache protobuf 53 | id: cache-protobuf 54 | uses: actions/cache@v4 55 | with: 56 | path: vsov/protobuf/install 57 | key: ${{ runner.os }}-vsov-protobuf-v3 58 | 59 | - name: Checkout protobuf 60 | uses: actions/checkout@v4 61 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 62 | with: 63 | repository: protocolbuffers/protobuf 64 | # follows protobuf in https://github.com/AmusementClub/openvino/tree/master/thirdparty/protobuf 65 | # if you change this, remember to bump the version of the cache key. 
66 | ref: f0dc78d7e6e331b8c6bb2d5283e06aa26883ca7c 67 | fetch-depth: 1 68 | path: vsov/protobuf 69 | 70 | - name: Configure protobuf 71 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 72 | run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA 73 | -D CMAKE_BUILD_TYPE=Release 74 | -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF 75 | 76 | - name: Build protobuf 77 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 78 | run: cmake --build protobuf\build_rel --verbose 79 | 80 | - name: Install protobuf 81 | if: steps.cache-protobuf.outputs.cache-hit != 'true' 82 | run: cmake --install protobuf\build_rel --prefix protobuf\install 83 | 84 | - name: Cache onnx 85 | id: cache-onnx 86 | uses: actions/cache@v4 87 | with: 88 | path: vsov/onnx/install 89 | key: ${{ runner.os }}-vsov-onnx-v3 90 | 91 | - name: Checkout onnx 92 | if: steps.cache-onnx.outputs.cache-hit != 'true' 93 | uses: actions/checkout@v4 94 | with: 95 | repository: onnx/onnx 96 | # follows onnx in https://github.com/AmusementClub/openvino/tree/master/thirdparty/onnx 97 | # if you change this, remember to bump the version of the cache key. 98 | ref: b8baa8446686496da4cc8fda09f2b6fe65c2a02c 99 | fetch-depth: 1 100 | path: vsov/onnx 101 | 102 | - name: Configure onnx 103 | if: steps.cache-onnx.outputs.cache-hit != 'true' 104 | run: cmake -S onnx -B onnx\build -G Ninja -LA 105 | -D CMAKE_BUILD_TYPE=Release 106 | -D Protobuf_PROTOC_EXECUTABLE=protobuf\install\bin\protoc 107 | -D Protobuf_LITE_LIBRARY=protobuf\install\lib 108 | -D Protobuf_LIBRARIES=protobuf\install\lib 109 | -D ONNX_USE_LITE_PROTO=ON -D ONNX_USE_PROTOBUF_SHARED_LIBS=OFF 110 | -D ONNX_GEN_PB_TYPE_STUBS=OFF -D ONNX_ML=0 111 | -D ONNX_USE_MSVC_STATIC_RUNTIME=1 112 | 113 | - name: Build onnx 114 | if: steps.cache-onnx.outputs.cache-hit != 'true' 115 | run: cmake --build onnx\build --verbose 116 | 117 | - name: Install onnx 118 | if: steps.cache-onnx.outputs.cache-hit != 'true' 119 | run: cmake --install onnx\build --prefix onnx\install 120 | 121 | - name: Download VapourSynth headers 122 | run: | 123 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 124 | unzip -q vs.zip 125 | mv vapoursynth-*/ vapoursynth/ 126 | 127 | - name: Download OpenVINO Runtime Precompilation 128 | shell: bash 129 | run: | 130 | # rev="${{github.event.inputs.ov_tag || inputs.ov_tag || 'latest'}}" 131 | # if [ "$rev" == "latest" ]; then 132 | # url="https://github.com/AmusementClub/openvino/releases/latest/download/openvino-gpu-win64.zip" 133 | # else 134 | # url="https://github.com/AmusementClub/openvino/releases/download/$rev/openvino-gpu-win64.zip" 135 | # fi 136 | url="https://github.com/AmusementClub/openvino/releases/download/2020.2-15171-g4655dd6ce3-2058-g5833781ddb/openvino-gpu-win64.zip" 137 | curl -s -o openvino.zip -LJO "$url" 138 | unzip -q openvino.zip 139 | 140 | - name: Configure 141 | run: cmake -S . 
-B build -G Ninja -D CMAKE_BUILD_TYPE=Release 142 | -D CMAKE_INTERPROCEDURAL_OPTIMIZATION=ON 143 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 144 | -D OpenVINO_DIR=openvino/runtime/cmake 145 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 146 | -D ENABLE_VISUALIZATION=ON 147 | -D WIN32_SHARED_OPENVINO=ON 148 | -D protobuf_DIR=protobuf\install\cmake 149 | -D ONNX_DIR=onnx\install\lib\cmake\ONNX 150 | 151 | - name: Build 152 | run: cmake --build build --verbose 153 | 154 | - name: Install 155 | run: | 156 | cmake --install build --prefix install 157 | mkdir artifact 158 | mkdir artifact\vsov 159 | copy openvino\runtime\3rdparty\tbb\bin\tbb12.dll artifact\vsov\ 160 | copy install\bin\vsov.dll artifact\ 161 | xcopy openvino\runtime\bin\intel64\Release\* artifact\vsov\ /s 162 | 163 | - name: Upload 164 | uses: actions/upload-artifact@v4 165 | with: 166 | name: VSOV-Windows-x64 167 | path: vsov/artifact 168 | 169 | - name: Setup Python portable 170 | run: | 171 | curl -s -o python.zip -LJO https://www.python.org/ftp/python/3.9.9/python-3.9.9-embed-amd64.zip 172 | 7z x python.zip -ovs_portable 173 | 174 | - name: Install VapourSynth portable 175 | run: | 176 | curl -s -o vs.7z -LJO https://github.com/vapoursynth/vapoursynth/releases/download/R54/VapourSynth64-Portable-R54.7z 177 | 7z x vs.7z -ovs_portable -y 178 | 179 | - name: Copy plugin 180 | run: | 181 | copy artifact\*.dll vs_portable\vapoursynth64\plugins 182 | mkdir vs_portable\vapoursynth64\plugins\vsov\ 183 | copy artifact\vsov\* vs_portable\vapoursynth64\plugins\vsov\ 184 | 185 | - name: Install waifu2x model 186 | run: | 187 | curl -s -o waifu2x.7z -LJO https://github.com/AmusementClub/vs-mlrt/releases/download/model-20211209/waifu2x_v3.7z 188 | 7z x waifu2x.7z -ovs_portable\vapoursynth64\plugins\models 189 | 190 | - name: Download x265 191 | run: | 192 | curl -s -o x265.7z -LJO https://github.com/AmusementClub/x265/releases/download/Yuuki-3.5-AC3/x265-win64-x86-64-clang.Yuuki-3.5-AC3.7z 193 | 7z x x265.7z -ovs_portable\ 194 | 195 | - name: Create script 196 | shell: bash 197 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test.vpy 198 | 199 | - name: Run vspipe 200 | shell: bash 201 | run: | 202 | set -ex 203 | vs_portable/vspipe -i test.vpy - 204 | vs_portable/vspipe --y4m -p -e 9 test.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 205 | ls -l out.hevc x265.log 206 | cat x265.log 207 | grep -F 'encoded 10 frames' x265.log || exit 2 208 | grep -i 'error' x265.log && exit 1 209 | exit 0 210 | 211 | - name: Create script (fp16) 212 | shell: bash 213 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, fp16=True).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_fp16.vpy 214 | 215 | - name: Run vspipe (fp16) 216 | shell: bash 217 | run: | 218 | set -ex 219 | vs_portable/vspipe -i test_fp16.vpy - 220 | vs_portable/vspipe --y4m -p -e 9 test_fp16.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 221 | ls -l out.hevc x265.log 222 | cat x265.log 223 | 
grep -F 'encoded 10 frames' x265.log || exit 2 224 | grep -i 'error' x265.log && exit 1 225 | exit 0 226 | 227 | - name: Create script (flexible output) 228 | shell: bash 229 | run: echo "import vapoursynth as vs;from vapoursynth import core;import sys;print(core.ov, file=sys.stderr);prop=\"test\";output=core.std.BlankClip(format=vs.RGBS).ov.Model(r\"waifu2x\\upconv_7_anime_style_art_rgb\\scale2.0x_model.onnx\", builtin=True, flexible_output_prop=prop);core.std.ShufflePlanes([output[\"clip\"].std.PropToClip(prop=f\"{prop}{i}\") for i in range(output[\"num_planes\"])], [0, 0, 0], vs.RGB).resize.Bicubic(format=vs.YUV420P10, matrix_s='709').set_output()" > test_flexible_output.vpy 230 | 231 | - name: Run vspipe (flexible output) 232 | shell: bash 233 | run: | 234 | set -ex 235 | vs_portable/vspipe -i test_flexible_output.vpy - 236 | vs_portable/vspipe --y4m -p -e 9 test_flexible_output.vpy - | vs_portable/x265 --log-file x265.log --log-file-level info --y4m -D 10 --preset ultrafast -o out.hevc - 237 | ls -l out.hevc x265.log 238 | cat x265.log 239 | grep -F 'encoded 10 frames' x265.log || exit 2 240 | grep -i 'error' x265.log && exit 1 241 | exit 0 242 | 243 | - name: Describe 244 | run: git describe --tags --long 245 | 246 | - name: Dump dependencies 247 | run: dumpbin /dependents artifact\vsov.dll 248 | 249 | - name: Compress artifact for release 250 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 251 | run: | 252 | cd artifact 253 | 7z a -t7z -mx=7 ../../VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z . 254 | 255 | - name: Release 256 | uses: softprops/action-gh-release@v2 257 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 258 | with: 259 | tag_name: ${{ inputs.tag }} 260 | files: VSOV-Windows-x64.${{ github.event.inputs.tag }}.7z 261 | fail_on_unmatched_files: true 262 | generate_release_notes: false 263 | prerelease: true 264 | -------------------------------------------------------------------------------- /.github/workflows/windows-release.yml: -------------------------------------------------------------------------------- 1 | name: Make a Release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | tag: 7 | description: 'which tag to create and release?' 8 | required: true 9 | default: 'nightly' 10 | model-tags: 11 | description: 'which tag(s) of model release to use? (comma-separated list of tags)' 12 | required: true 13 | default: 'model-20211209, model-20220923' 14 | ext-model-tags: 15 | description: 'which tag(s) of external model release to use?' 16 | required: true 17 | default: 'external-models' 18 | contrib-model-tags: 19 | description: 'which tag(s) of contributed model release to use?' 
20 | required: true 21 | default: 'contrib-models' 22 | ov_tag: 23 | description: 'which tag of openvino to use' 24 | required: true 25 | default: 'latest' 26 | type: string 27 | ncnn_tag: 28 | description: 'which tag of ncnn to use' 29 | required: true 30 | default: 'latest' 31 | type: string 32 | 33 | jobs: 34 | build-vsov: 35 | uses: ./.github/workflows/windows-ov.yml 36 | with: 37 | tag: ${{ github.event.inputs.tag }} 38 | ov_tag: ${{ github.event.inputs.ov_tag }} 39 | 40 | build-vsort: 41 | uses: ./.github/workflows/windows-ort.yml 42 | with: 43 | tag: ${{ github.event.inputs.tag }} 44 | 45 | build-vstrt: 46 | uses: ./.github/workflows/windows-trt.yml 47 | with: 48 | tag: ${{ github.event.inputs.tag }} 49 | secrets: 50 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 51 | 52 | build-vsmigx: 53 | uses: ./.github/workflows/windows-migx.yml 54 | with: 55 | tag: ${{ github.event.inputs.tag }} 56 | secrets: 57 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 58 | 59 | build-vsncnn: 60 | uses: ./.github/workflows/windows-ncnn.yml 61 | with: 62 | tag: ${{ github.event.inputs.tag }} 63 | ncnn_tag: ${{ github.event.inputs.ncnn_tag }} 64 | 65 | build-cuda-dependency: 66 | uses: ./.github/workflows/windows-cuda-dependency.yml 67 | with: 68 | tag: ${{ github.event.inputs.tag }} 69 | secrets: 70 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 71 | 72 | build-hip-dependency: 73 | uses: ./.github/workflows/windows-hip-dependency.yml 74 | with: 75 | tag: ${{ github.event.inputs.tag }} 76 | secrets: 77 | REPO_TOKEN: ${{ secrets.REPO_TOKEN }} 78 | 79 | build-scripts: 80 | runs-on: ubuntu-24.04-arm 81 | steps: 82 | - name: Checkout repo 83 | uses: actions/checkout@v4 84 | 85 | - name: Compress scripts.7z 86 | run: | 87 | cd scripts 88 | 7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z . 89 | 90 | - name: Upload scripts release 91 | uses: actions/upload-artifact@v4 92 | with: 93 | name: Scripts 94 | path: scripts 95 | retention-days: 1 96 | 97 | - name: Release scripts 98 | uses: softprops/action-gh-release@v2 99 | with: 100 | tag_name: ${{ github.event.inputs.tag }} 101 | files: scripts.${{ github.event.inputs.tag }}.7z 102 | fail_on_unmatched_files: true 103 | generate_release_notes: false 104 | prerelease: true 105 | 106 | build-models: 107 | runs-on: ubuntu-24.04-arm 108 | steps: 109 | - name: Download Models 110 | run: | 111 | set -ex 112 | mkdir -p release/models 113 | cd release 114 | pushd models 115 | for tag in $(echo "${{ github.event.inputs.model-tags }}" | tr ',' ' '); do 116 | echo "Handling tag $tag" 117 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 118 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 119 | echo "Downloading $url" 120 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 121 | # later release should overwrite earlier ones 122 | 7za x -y dl.7z 123 | done 124 | test -f "dl.7z" 125 | rm -f dl.7z release.json 126 | done 127 | popd 128 | ls -lR 129 | du -sh 130 | 7za a -t7z -bb3 -mx=9 ../models.7z .
131 | 132 | - name: Upload model release 133 | uses: actions/upload-artifact@v4 134 | with: 135 | name: Models 136 | path: release 137 | retention-days: 1 138 | compression-level: 0 139 | 140 | - name: Download External Models 141 | if: false 142 | run: | 143 | rm -rf release 144 | set -ex 145 | mkdir -p release/models 146 | cd release 147 | pushd models 148 | for tag in $(echo "${{ github.event.inputs.ext-model-tags }}" | tr ',' ' '); do 149 | echo "Handling tag $tag" 150 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 151 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 152 | echo "Downloading $url" 153 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 154 | # later release should overwrite earlier ones 155 | 7za x -y dl.7z 156 | done 157 | test -f "dl.7z" 158 | rm -f dl.7z release.json 159 | done 160 | popd 161 | ls -lR 162 | du -sh 163 | 7za a -t7z -bb3 -mx=9 ../ext-models.7z . 164 | 165 | - name: Upload external model release 166 | uses: actions/upload-artifact@v4 167 | if: false 168 | with: 169 | name: External-Models 170 | path: release 171 | retention-days: 1 172 | compression-level: 0 173 | 174 | - name: Download Contributed Models 175 | run: | 176 | rm -rf release 177 | set -ex 178 | mkdir -p release/models 179 | cd release 180 | pushd models 181 | for tag in $(echo "${{ github.event.inputs.contrib-model-tags }}" | tr ',' ' '); do 182 | echo "Handling tag $tag" 183 | curl -s https://api.github.com/repos/AmusementClub/vs-mlrt/releases/tags/"$tag" > release.json 184 | for url in $(cat release.json | jq '.assets | .[] | .url ' | tr -d '"'); do 185 | echo "Downloading $url" 186 | curl -o dl.7z -LJ -H 'Accept: application/octet-stream' "$url" 187 | # later release should overwrite earlier ones 188 | 7za x -y dl.7z 189 | done 190 | #test -f "dl.7z" # contrib-models might be empty. 191 | rm -f dl.7z release.json 192 | done 193 | popd 194 | ls -lR 195 | du -sh 196 | 7za a -t7z -bb3 -mx=9 ../contrib-models.7z . 197 | 198 | - name: Upload contrib model release 199 | uses: actions/upload-artifact@v4 200 | with: 201 | name: Contrib-Models 202 | path: release 203 | retention-days: 1 204 | compression-level: 0 205 |
229 | 230 | - name: Rename release asset 231 | run: | 232 | mv models.7z models.${{ github.event.inputs.tag }}.7z 233 | mv contrib-models.7z contrib-models.${{ github.event.inputs.tag }}.7z 234 | 235 | - name: Release models 236 | uses: softprops/action-gh-release@v2 237 | with: 238 | tag_name: ${{ github.event.inputs.tag }} 239 | files: | 240 | models.${{ github.event.inputs.tag }}.7z 241 | contrib-models.${{ github.event.inputs.tag }}.7z 242 | fail_on_unmatched_files: true 243 | generate_release_notes: false 244 | prerelease: true 245 | 246 | release: 247 | runs-on: ubuntu-24.04-arm 248 | needs: [build-vsov, build-vsort, build-vstrt, build-vsmigx, build-vsncnn, build-cuda-dependency, build-hip-dependency, build-scripts, build-models] 249 | 250 | defaults: 251 | run: 252 | shell: bash 253 | 254 | steps: 255 | - name: Download artifact for scripts 256 | uses: actions/download-artifact@v4 257 | with: 258 | name: Scripts 259 | path: scripts-release 260 | 261 | - name: Download artifact for models 262 | uses: actions/download-artifact@v4 263 | with: 264 | name: Models 265 | path: models-release 266 | 267 | - name: Download artifact for vsov 268 | uses: actions/download-artifact@v4 269 | with: 270 | name: VSOV-Windows-x64 271 | path: vsov-release 272 | 273 | - name: Download artifact for vsort 274 | uses: actions/download-artifact@v4 275 | with: 276 | name: VSORT-Windows-x64 277 | path: vsort-release 278 | 279 | - name: Download artifact for vstrt 280 | uses: actions/download-artifact@v4 281 | with: 282 | name: VSTRT-Windows-x64 283 | path: vstrt-release 284 | 285 | - name: Download artifact for vsmigx 286 | uses: actions/download-artifact@v4 287 | with: 288 | name: VSMIGX-Windows-x64 289 | path: vsmigx-release 290 | 291 | - name: Download artifact for vsncnn 292 | uses: actions/download-artifact@v4 293 | with: 294 | name: VSNCNN-GPU-Windows-x64 295 | path: vsncnn-release 296 | 297 | - name: Download artifact for cuda dependencies 298 | uses: actions/download-artifact@v4 299 | with: 300 | name: vsmlrt-cuda 301 | path: cuda-release 302 | 303 | - name: Download artifact for hip dependencies 304 | uses: actions/download-artifact@v4 305 | with: 306 | name: vsmlrt-hip 307 | path: hip-release 308 | 309 | - name: Build CPU-only release 310 | shell: bash 311 | run: | 312 | mkdir release-cpu 313 | cp -r models-release/models release-cpu/ 314 | cp -r vsov-release/* release-cpu/ 315 | cp -r vsort-release/* release-cpu/ 316 | rm -f release-cpu/vsort/onnxruntime_providers_*.dll 317 | cp scripts-release/*.py release-cpu/ 318 | cd release-cpu 319 | ls -lR 320 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z . 
321 | 322 | - name: Upload CPU-only release 323 | uses: actions/upload-artifact@v4 324 | if: false 325 | with: 326 | name: vsmlrt-cpu-release 327 | path: vsmlrt-windows-x64-cpu.7z 328 | retention-days: 1 329 | compression-level: 0 330 | 331 | - name: Rename release asset 332 | run: mv vsmlrt-windows-x64-cpu.7z vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag }}.7z 333 | 334 | - name: Release CPU 335 | uses: softprops/action-gh-release@v2 336 | with: 337 | tag_name: ${{ github.event.inputs.tag }} 338 | files: vsmlrt-windows-x64-cpu.${{ github.event.inputs.tag}}.7z 339 | fail_on_unmatched_files: true 340 | generate_release_notes: false 341 | prerelease: true 342 | 343 | - name: Build generic GPU release 344 | shell: bash 345 | run: | 346 | mkdir release-generic-gpu 347 | cp -r models-release/models release-generic-gpu/ 348 | cp -r vsov-release/* release-generic-gpu/ 349 | cp -r vsort-release/* release-generic-gpu/ 350 | rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll 351 | cp -r vsncnn-release/* release-generic-gpu/ 352 | cp scripts-release/*.py release-generic-gpu/ 353 | cd release-generic-gpu 354 | ls -lR 355 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z . 356 | 357 | - name: Upload generic GPU release 358 | uses: actions/upload-artifact@v4 359 | if: false 360 | with: 361 | name: vsmlrt-generic-gpu-release 362 | path: vsmlrt-windows-x64-generic-gpu.7z 363 | retention-days: 1 364 | compression-level: 0 365 | 366 | - name: Rename release asset for generic GPU release 367 | run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z 368 | 369 | - name: Release generic GPU 370 | uses: softprops/action-gh-release@v2 371 | with: 372 | tag_name: ${{ github.event.inputs.tag }} 373 | files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z 374 | fail_on_unmatched_files: true 375 | generate_release_notes: false 376 | prerelease: true 377 | 378 | - name: Extract CUDA libraries 379 | run: | 380 | cd cuda-release 381 | 7za x -bb3 vsmlrt-cuda.7z 382 | rm vsmlrt-cuda.7z 383 | 384 | - name: Build CUDA release 385 | shell: bash 386 | run: | 387 | mkdir release-cuda 388 | cp -r models-release/models release-cuda/ 389 | cp -r vsov-release/* release-cuda/ 390 | cp -r vsort-release/* release-cuda/ 391 | cp -r vstrt-release/* release-cuda/ 392 | cp -r vsncnn-release/* release-cuda/ 393 | cp -r cuda-release/* release-cuda/ 394 | cp scripts-release/*.py release-cuda/ 395 | cd release-cuda 396 | ls -lR 397 | 7za a -t7z -bb3 -mx=9 -v2000000000b ../vsmlrt-windows-x64-cuda.7z . 
398 | 399 | - name: Upload CUDA release 400 | uses: actions/upload-artifact@v4 401 | if: false 402 | with: 403 | name: vsmlrt-cuda-release 404 | path: | 405 | vsmlrt-windows-x64-cuda.7z.001 406 | vsmlrt-windows-x64-cuda.7z.002 407 | retention-days: 1 408 | compression-level: 0 409 | 410 | - name: Rename release asset for CUDA release 411 | run: | 412 | mv vsmlrt-windows-x64-cuda.7z.001 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.001 413 | mv vsmlrt-windows-x64-cuda.7z.002 vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.002 414 | 415 | - name: Release CUDA 416 | uses: softprops/action-gh-release@v2 417 | with: 418 | tag_name: ${{ github.event.inputs.tag }} 419 | files: | 420 | vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.001 421 | vsmlrt-windows-x64-cuda.${{ github.event.inputs.tag }}.7z.002 422 | fail_on_unmatched_files: true 423 | generate_release_notes: false 424 | prerelease: true 425 | 426 | - name: Build TensorRT release 427 | shell: bash 428 | run: | 429 | cd release-cuda 430 | cd vsmlrt-cuda 431 | rm --verbose cublas*.dll cudnn*.dll cufft*.dll cupti*.dll nvblas*.dll 432 | cd .. 433 | rm --verbose vsort/onnxruntime_providers_*.dll 434 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-tensorrt.7z . 435 | 436 | - name: Upload TensorRT release 437 | uses: actions/upload-artifact@v4 438 | if: false 439 | with: 440 | name: vsmlrt-tensorrt-release 441 | path: vsmlrt-windows-x64-tensorrt.7z 442 | retention-days: 1 443 | compression-level: 0 444 | 445 | - name: Rename release asset for TensorRT release 446 | run: mv vsmlrt-windows-x64-tensorrt.7z vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z 447 | 448 | - name: Release TensorRT 449 | uses: softprops/action-gh-release@v2 450 | with: 451 | tag_name: ${{ github.event.inputs.tag }} 452 | files: vsmlrt-windows-x64-tensorrt.${{ github.event.inputs.tag }}.7z 453 | fail_on_unmatched_files: true 454 | generate_release_notes: false 455 | prerelease: true 456 | 457 | - name: Extract HIP libraries 458 | run: | 459 | cd hip-release 460 | 7za x -bb3 vsmlrt-hip.7z 461 | rm vsmlrt-hip.7z 462 | 463 | - name: Build MIGraphX release 464 | shell: bash 465 | run: | 466 | mkdir release-hip 467 | cp -r models-release/models release-hip/ 468 | cp -r vsov-release/* release-hip/ 469 | cp -r vsort-release/* release-hip/ 470 | cp -r vsmigx-release/* release-hip/ 471 | cp -r vsncnn-release/* release-hip/ 472 | cp -r hip-release/* release-hip/ 473 | cp scripts-release/*.py release-hip/ 474 | cd release-hip 475 | ls -lR 476 | 7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-migraphx.7z . 477 | 478 | - name: Upload MIGraphX release 479 | uses: actions/upload-artifact@v4 480 | if: false 481 | with: 482 | name: vsmlrt-migraphx-release 483 | path: vsmlrt-windows-x64-migraphx.7z 484 | retention-days: 1 485 | compression-level: 0 486 | 487 | - name: Rename release asset for MIGraphX release 488 | run: mv vsmlrt-windows-x64-migraphx.7z vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 489 | 490 | - name: Release MIGraphX 491 | uses: softprops/action-gh-release@v2 492 | with: 493 | tag_name: ${{ github.event.inputs.tag }} 494 | files: vsmlrt-windows-x64-migraphx.${{ github.event.inputs.tag }}.7z 495 | fail_on_unmatched_files: true 496 | generate_release_notes: false 497 | prerelease: true 498 | 499 | # Update nightly tag. 
500 | - name: Checkout repo 501 | if: github.event.inputs.tag == 'nightly' 502 | uses: actions/checkout@v4 503 | with: 504 | fetch-depth: 0 505 | - name: Overwrite tag 506 | if: github.event.inputs.tag == 'nightly' 507 | run: | 508 | git pull --tags --force 509 | git tag -f ${{ github.event.inputs.tag }} 510 | git push -f origin ${{ github.event.inputs.tag }} 511 | -------------------------------------------------------------------------------- /.github/workflows/windows-trt.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows-TRT) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'vstrt/**' 7 | - '.github/workflows/windows-trt.yml' 8 | workflow_call: 9 | inputs: 10 | tag: 11 | description: 'which tag to upload to' 12 | required: true 13 | type: string 14 | secrets: 15 | REPO_TOKEN: 16 | required: true 17 | workflow_dispatch: 18 | inputs: 19 | tag: 20 | description: 'which tag to upload to' 21 | default: '' 22 | 23 | jobs: 24 | build-windows: 25 | runs-on: windows-2025 26 | 27 | defaults: 28 | run: 29 | shell: cmd 30 | working-directory: vstrt 31 | 32 | steps: 33 | - name: Checkout repo 34 | uses: actions/checkout@v4 35 | with: 36 | fetch-depth: 0 37 | 38 | - name: Setup MSVC 39 | uses: ilammy/msvc-dev-cmd@v1 40 | 41 | - name: Setup Ninja 42 | run: pip install ninja 43 | 44 | - name: Cache CUDA 45 | id: cache-cuda 46 | uses: actions/cache@v4 47 | with: 48 | path: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA 49 | key: ${{ runner.os }}-vstrt-cuda-12.9.0 50 | save-always: true 51 | 52 | - name: Setup CUDA 53 | if: steps.cache-cuda.outputs.cache-hit != 'true' 54 | run: | 55 | curl -s -o cuda_installer.exe -L https://developer.download.nvidia.com/compute/cuda/12.9.0/network_installers/cuda_12.9.0_windows_network.exe 56 | cuda_installer.exe -s nvcc_12.9 cudart_12.9 cuda_profiler_api_12.9 57 | 58 | - name: Download TensorRT 59 | run: | 60 | curl -L -o trt.zip https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.11.0/zip/TensorRT-10.11.0.33.Windows.win10.cuda-12.9.zip 61 | unzip trt.zip 62 | mv TensorRT-*/ tensorrt/ 63 | 64 | - name: Download VapourSynth headers 65 | run: | 66 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R54.zip 67 | unzip -q vs.zip 68 | mv vapoursynth-*/ vapoursynth/ 69 | 70 | - name: Configure 71 | run: cmake -S . 
-B build -G Ninja -LA 72 | -D CMAKE_BUILD_TYPE=Release 73 | -D CMAKE_CXX_FLAGS="/EHsc /Wall /wd4100 /wd4625 /wd4626 /wd4710 /wd4711 /wd4820 /wd4996 /wd5026 /wd5027" 74 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 75 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" 76 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="%cd%\vapoursynth\include" 77 | -D TENSORRT_HOME="%cd%\tensorrt" 78 | -D USE_NVINFER_PLUGIN=ON 79 | -D TENSORRT_LIBRARY_SUFFIX="_10" 80 | 81 | - name: Build 82 | run: cmake --build build --config Release --verbose 83 | 84 | - name: Install 85 | run: cmake --install build --prefix install 86 | 87 | - name: Checkout TensorRT OSS 88 | uses: actions/checkout@v4 89 | with: 90 | repository: NVIDIA/TensorRT 91 | ref: v10.1.0 92 | fetch-depth: 1 93 | path: tensorrt-oss 94 | 95 | - name: Override trtexec CMake file 96 | run: | 97 | cp -f -r -v tensorrt/samples ../tensorrt-oss 98 | cp -f -r -v tensorrt/include ../tensorrt-oss 99 | 100 | mv trtexec/CMakeLists.txt ../tensorrt-oss/samples/trtexec 101 | mv trtexec/*.cpp ../tensorrt-oss/samples/trtexec 102 | mv longpath.manifest ../tensorrt-oss/samples/trtexec 103 | 104 | - name: Apply patch 105 | run: | 106 | mv trtexec/trtexec.patch ../tensorrt-oss 107 | cd ../tensorrt-oss 108 | 109 | copy samples\utils\fileLock.cpp samples\utils\fileLock-utf16le.cpp 110 | powershell "Get-Content samples\utils\fileLock-utf16le.cpp | Out-File samples\utils\fileLock.cpp -Encoding ascii" 111 | git apply trtexec.patch --verbose 112 | copy samples\utils\fileLock.cpp samples\utils\fileLock-utf8.cpp 113 | powershell "Get-Content samples\utils\fileLock-utf8.cpp | Out-File samples\utils\fileLock.cpp" 114 | 115 | - name: Configure trtexec 116 | run: cmake -S ../tensorrt-oss/samples/trtexec -B build_trtexec -G Ninja 117 | -D CMAKE_BUILD_TYPE=Release 118 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 119 | -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9" 120 | -D CMAKE_UNITY_BUILD=ON -D CMAKE_UNITY_BUILD_BATCH_SIZE=0 121 | -D CMAKE_CXX_STANDARD=20 122 | 123 | - name: Build trtexec 124 | run: cmake --build build_trtexec --verbose 125 | 126 | - name: Install trtexec 127 | run: cmake --install build_trtexec --prefix trtexec 128 | 129 | - name: Prepare for upload 130 | run: | 131 | mkdir artifact 132 | copy install\bin\vstrt.dll artifact\ 133 | mkdir artifact\vsmlrt-cuda 134 | copy trtexec\bin\trtexec.exe artifact\vsmlrt-cuda 135 | 136 | - name: Describe 137 | run: git describe --tags --long 138 | 139 | - name: Dump dependencies 140 | run: dumpbin /dependents artifact\vstrt.dll 141 | 142 | - name: Upload 143 | uses: actions/upload-artifact@v4 144 | with: 145 | name: VSTRT-Windows-x64 146 | path: vstrt/artifact 147 | 148 | - name: Compress artifact for release 149 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 150 | run: | 151 | cd artifact 152 | 7z a -t7z -mx=7 ../../VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z . 
153 | 154 | - name: Release 155 | uses: softprops/action-gh-release@v2 156 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 157 | with: 158 | tag_name: ${{ inputs.tag }} 159 | files: VSTRT-Windows-x64.${{ github.event.inputs.tag }}.7z 160 | fail_on_unmatched_files: true 161 | generate_release_notes: false 162 | prerelease: true 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-mlrt 2 | 3 | This project provides VapourSynth ML filter runtimes for a variety of platforms: 4 | - x86 CPUs: [vsov-cpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsort-cpu](#vsort-onnx-runtime-based-cpugpu-runtime) 5 | - Intel GPU (both integrated & discrete): [vsov-gpu](#vsov-openvino-based-pure-cpu--intel-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 6 | - NVidia GPU: [vsort-cuda](#vsort-onnx-runtime-based-cpugpu-runtime), [vstrt](#vstrt-tensorrt-based-gpu-runtime), [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime) 7 | - AMD GPU: [vsncnn-vk](#vsncnn-ncnn-based-gpu-vulkan-runtime), [vsmigx](#vsmigx-migraphx-based-gpu-runtime) 8 | - Apple SoC: [vsort-coreml](#vsort-onnx-runtime-based-cpugpu-runtime) 9 | 10 | To simplify usage, we also provide a Python wrapper [vsmlrt.py](https://github.com/AmusementClub/vs-mlrt/blob/master/scripts/vsmlrt.py) 11 | for all bundled models and a unified interface to select different backends. 12 | 13 | Please refer to [the wiki](https://github.com/AmusementClub/vs-mlrt/wiki) for supported models & usage information. 14 | 15 | ## vsov: OpenVINO-based Pure CPU & Intel GPU Runtime 16 | 17 | [OpenVINO](https://docs.openvino.ai/latest/index.html) is an AI inference runtime developed 18 | by Intel, mainly targeting x86 CPUs and Intel GPUs. 19 | 20 | The vs-openvino plugin provides optimized *pure* CPU & Intel GPU runtime for some popular AI filters. 21 | Supported Intel GPUs include Gen 8+ integrated graphics (Broadwell and newer) and the Arc series GPUs. 22 | 23 | To install, download the latest release and extract them into your VS `plugins` directory. 24 | 25 | Please visit the [vsov](vsov) directory for details. 26 | 27 | ## vsort: ONNX Runtime-based CPU/GPU Runtime 28 | 29 | [ONNX Runtime](https://onnxruntime.ai/) is an AI inference runtime with many backends. 30 | 31 | The vs-onnxruntime plugin provides optimized CPU and CUDA GPU runtime for some popular AI filters. 32 | 33 | To install, download the latest release and extract them into your VS `plugins` directory. 34 | 35 | Please visit the [vsort](vsort) directory for details. 36 | 37 | ## vstrt: TensorRT-based GPU Runtime 38 | 39 | [TensorRT](https://developer.nvidia.com/tensorrt) is a highly optimized AI inference runtime 40 | for NVidia GPUs. It uses benchmarking to find the optimal kernel for your specific 41 | GPU, so there is an extra step: an engine must be built from the ONNX network on the machine 42 | where the vstrt filter will run. This extra step makes deploying models a little 43 | harder than with the other runtimes. However, the resulting performance is also typically 44 | *much much better* than the CUDA backend of [vsort](vsort). 45 | 46 | To install, download the latest release and extract them into your VS `plugins` directory. 47 | 48 | Please visit the [vstrt](vstrt) directory for details.
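For orientation, here is a minimal sketch of the two-step vstrt workflow; the model name, engine name, and dimensions are illustrative only, and the exact `trtexec` flags and filter parameters are documented in the [vstrt](vstrt) README:

```python3
# Step 1 (command line, run once per GPU/model/resolution; flags illustrative):
#   trtexec --onnx=waifu2x.onnx --optShapes=input:1x3x1080x1920 --saveEngine=waifu2x_1080p.engine
# Step 2 (VapourSynth script): run inference using the prebuilt engine.
import vapoursynth as vs
from vapoursynth import core

src = core.std.BlankClip(width=1920, height=1080, format=vs.RGBS)
flt = core.trt.Model(src, engine_path="waifu2x_1080p.engine", tilesize=[1920, 1080])
flt.set_output()
```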
49 | 50 | ## vsmigx: MIGraphX-based GPU Runtime 51 | 52 | [MIGraphX](https://github.com/ROCm/AMDMIGraphX) is a highly optimized AI inference runtime 53 | for AMD GPUs. It also uses benchmarking to find the optimal kernel, similar to vstrt. 54 | 55 | To install, download the latest release and extract them into your VS `plugins` directory. 56 | 57 | Please visit the [vsmigx](vsmigx) directory for details. 58 | 59 | ## vsncnn: NCNN-based GPU (Vulkan) Runtime 60 | 61 | [ncnn](https://github.com/Tencent/ncnn) is a popular AI inference runtime. [vsncnn](vsncnn) 62 | provides a Vulkan-based runtime for some AI filters. It includes support for on-the-fly 63 | ONNX to ncnn native format conversion so as to provide a unified interface across all 64 | runtimes provided by this project. As it uses the device-independent 65 | [Vulkan](https://en.wikipedia.org/wiki/Vulkan) interface for GPU accelerated inference, 66 | this plugin supports all GPUs that provide a Vulkan interface (NVidia, AMD, and Intel integrated & 67 | discrete GPUs all provide this interface). Another benefit is that it has a significantly 68 | smaller footprint than other GPU runtimes (both the vsort and vstrt CUDA backends require >1GB of 69 | CUDA libraries). The main drawback is that it's slower. 70 | 71 | To install, download the latest release and extract them into your VS `plugins` directory. 72 | 73 | Please visit the [vsncnn](vsncnn) directory for details. 74 | -------------------------------------------------------------------------------- /common/convert_float_to_float16.h: -------------------------------------------------------------------------------- 1 | #ifndef CONVERT_FLOAT_TO_FLOAT16_H 2 | #define CONVERT_FLOAT_TO_FLOAT16_H 3 | 4 | #include <string> 5 | #include <unordered_set> 6 | 7 | #include <onnx/onnx_pb.h> 8 | 9 | void convert_float_to_float16( 10 | ONNX_NAMESPACE::ModelProto & model, 11 | bool force_fp16_initializers, 12 | // bool keep_io_types = True, 13 | // bool disable_shape_infer = True, 14 | // const std::optional<std::unordered_set<std::string>> op_block_list = DEFAULT_OP_BLOCK_LIST, 15 | // const std::optional<std::unordered_set<std::string>> op_block_list = {}, 16 | const std::unordered_set<std::string> & op_block_list, 17 | bool cast_input = true, 18 | bool cast_output = true 19 | ) noexcept; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /common/onnx_utils.cpp: -------------------------------------------------------------------------------- 1 | #include <cstdint> 2 | #include <fstream> 3 | #include <optional> 4 | #include <string> 5 | #include <string_view> 6 | #include <variant> 7 | 8 | #include <onnx/onnx_pb.h> 9 | #include <onnx/shape_inference/implementation.h> 10 | 11 | #include "onnx_utils.h" 12 | 13 | 14 | using namespace std::string_literals; 15 | 16 | #ifdef _WIN32 17 | #include <locale> 18 | #include <codecvt> 19 | static inline std::wstring translateName(const char *name) noexcept { 20 | std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter; 21 | return converter.from_bytes(name); 22 | } 23 | #else 24 | #define translateName(n) (n) 25 | #endif 26 | 27 | 28 | [[nodiscard]] 29 | static std::optional<std::string> specifyShape( 30 | ONNX_NAMESPACE::ModelProto & model, 31 | int64_t tile_w, 32 | int64_t tile_h, 33 | int64_t batch = 1 34 | ) noexcept { 35 | 36 | if (model.graph().input_size() != 1) { 37 | return "graph must have a single input"; 38 | } 39 | ONNX_NAMESPACE::TensorShapeProto * input_shape { 40 | model 41 | .mutable_graph() 42 | ->mutable_input(0) 43 | ->mutable_type() 44 | ->mutable_tensor_type() 45 | ->mutable_shape() 46 | }; 47 | 48 | if (model.graph().output_size() != 1) { 49 | return "graph must have a single output"; 50 | } 51 | ONNX_NAMESPACE::TensorShapeProto * output_shape { 52 | model 53 | .mutable_graph() 54 |
->mutable_output(0) 55 | ->mutable_type() 56 | ->mutable_tensor_type() 57 | ->mutable_shape() 58 | }; 59 | 60 | constexpr auto n_idx = 0; 61 | constexpr auto h_idx = 2; 62 | constexpr auto w_idx = 3; 63 | 64 | if (input_shape->dim_size() != 4) { 65 | return "input dimension must be 4"; 66 | } 67 | 68 | input_shape->mutable_dim(n_idx)->set_dim_value(batch); 69 | input_shape->mutable_dim(h_idx)->set_dim_value(tile_h); 70 | input_shape->mutable_dim(w_idx)->set_dim_value(tile_w); 71 | 72 | if (output_shape->dim_size() != 4) { 73 | return "output dimension must be 4"; 74 | } 75 | 76 | output_shape->mutable_dim(n_idx)->set_dim_value(batch); 77 | output_shape->mutable_dim(h_idx)->clear_dim_value(); 78 | output_shape->mutable_dim(w_idx)->clear_dim_value(); 79 | 80 | // remove shape info 81 | if (model.graph().value_info_size() != 0) { 82 | model.mutable_graph()->mutable_value_info()->Clear(); 83 | } 84 | 85 | try { 86 | ONNX_NAMESPACE::shape_inference::InferShapes(model); 87 | } catch (const ONNX_NAMESPACE::InferenceError & e) { 88 | return e.what(); 89 | } 90 | 91 | return {}; 92 | } 93 | 94 | 95 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 96 | const std::string_view & path, 97 | int64_t tile_w, 98 | int64_t tile_h, 99 | bool path_is_serialization 100 | ) noexcept { 101 | 102 | ONNX_NAMESPACE::ModelProto onnx_proto; 103 | 104 | if (path_is_serialization) { 105 | if (!onnx_proto.ParseFromArray(path.data(), static_cast<int>(path.size()))) { 106 | return "parse onnx serialization failed"s; 107 | } 108 | } else { 109 | std::ifstream onnx_stream( 110 | translateName(path.data()), 111 | std::ios::binary 112 | ); 113 | 114 | if (!onnx_stream.good()) { 115 | return "open "s + std::string{ path } + " failed"s; 116 | } 117 | 118 | if (!onnx_proto.ParseFromIstream(&onnx_stream)) { 119 | return "parse "s + std::string{ path } + " failed"s; 120 | } 121 | } 122 | 123 | if (auto err = specifyShape(onnx_proto, tile_w, tile_h); err.has_value()) { 124 | return err.value(); 125 | } 126 | 127 | return onnx_proto; 128 | } 129 | -------------------------------------------------------------------------------- /common/onnx_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef ONNX_UTILS_H 2 | #define ONNX_UTILS_H 3 | 4 | #include <cstdint> 5 | #include <string> 6 | #include <string_view> 7 | #include <variant> 8 | 9 | #include <onnx/onnx_pb.h> 10 | 11 | std::variant<ONNX_NAMESPACE::ModelProto, std::string> loadONNX( 12 | const std::string_view & path, 13 | int64_t tile_w, 14 | int64_t tile_h, 15 | bool path_is_serialization 16 | ) noexcept; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /vsmigx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-migraphx VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | 9 | find_package(migraphx REQUIRED CONFIG) 10 | find_package(hip REQUIRED CONFIG) 11 | 12 | add_library(vsmigx SHARED vs_migraphx.cpp win32.cpp) 13 | 14 | target_include_directories(vsmigx PRIVATE ${VAPOURSYNTH_INCLUDE_DIRECTORY}) 15 | 16 | target_link_libraries(vsmigx PRIVATE migraphx::c hip::host) 17 | 18 | set_target_properties(vsmigx PROPERTIES 19 | CXX_EXTENSIONS OFF 20 | POSITION_INDEPENDENT_CODE ON 21 | CXX_STANDARD 20 22 | CXX_STANDARD_REQUIRED ON 23 | ) 24 | 25 | if (WIN32) 26 | target_link_options(vsmigx PRIVATE 27 | "/DELAYLOAD:migraphx_c.dll" 28 | "/DELAYLOAD:amdhip64_6.dll" 29 | "delayimp.lib" 30 | ) 31 | endif() 32 | 33 |
find_package(Git REQUIRED) 34 | execute_process( 35 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 36 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 37 | OUTPUT_VARIABLE VCS_TAG 38 | ) 39 | string(STRIP ${VCS_TAG} VCS_TAG) 40 | configure_file(config.h.in config.h) 41 | target_include_directories(vsmigx PUBLIC "${PROJECT_BINARY_DIR}") 42 | 43 | install(TARGETS vsmigx 44 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 45 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 46 | ) 47 | -------------------------------------------------------------------------------- /vsmigx/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth MIGraphX 2 | 3 | The vs-migraphx plugin provides an optimized HIP runtime for some popular AI filters on AMD GPUs. 4 | 5 | ## Usage 6 | 7 | Prototype: `core.migx.Model(clip[] clips, string program_path[, int[] overlap, int[] tilesize, int device_id=0, int num_streams=1, string flexible_output_prop=""])` 8 | 9 | Arguments: 10 | - `clip[] clips`: the input clips, only 16/32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 11 | - `string program_path`: the path to the prebuilt program (see below) 12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support a fixed input shape, in which case the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network specific docs on the recommended overlapping size. 13 | - `int[] tilesize`: Even for CNNs where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network specific docs on the recommended tile size. 14 | - `int device_id`: Specifies the GPU device id to use, default 0. Requires AMD GPUs with the gfx1030 target or the RDNA3 architecture onwards ([list](https://rocm.docs.amd.com/projects/install-on-windows/en/latest/reference/system-requirements.html#windows-supported-gpus)). 15 | - `int num_streams`: number of concurrent HIP streams to use. Default 1. Increase it if the GPU is not saturated. 16 | - `string flexible_output_prop`: used to support onnx models with an arbitrary number of output planes. 17 | 18 | ```python3 19 | from typing import TypedDict 20 | 21 | class Output(TypedDict): 22 | clip: vs.VideoNode 23 | num_planes: int 24 | 25 | prop = "planes" # arbitrary non-empty string 26 | output = core.migx.Model(src, program_path, flexible_output_prop=prop) # type: Output 27 | 28 | clip = output["clip"] 29 | num_planes = output["num_planes"] 30 | 31 | output_planes = [ 32 | clip.std.PropToClip(prop=f"{prop}{i}") 33 | for i in range(num_planes) 34 | ] # type: list[vs.VideoNode] 35 | ``` 36 | 37 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 38 | 39 | The general rule is to either: 40 | 1.
leave out `overlap` and `tilesize` entirely and just process the input frame in one tile, or 41 | 2. set all three so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped region will be thrown away so that only internal output pixels are used. 42 | 43 | ## Instructions 44 | 45 | ### Build program 46 | ```shell 47 | migraphx-driver compile --onnx drunet_gray.onnx --gpu --input-dim @input 1 2 1080 1920 --output dpir_gray_1080p.mxr 48 | ``` 49 | 50 | The program can be applied to `1920x1080` input. 51 | 52 | Also check [migraphx-driver useful arguments](#migraphx-driver-useful-arguments). 53 | 54 | ### Run model 55 | In a vpy script: 56 | ```python3 57 | # DPIR 58 | src = core.std.BlankClip(src, width=1920, height=1080, format=vs.GRAYS) 59 | sigma = 10.0 60 | flt = core.migx.Model([src, core.std.BlankClip(src, color=sigma/255.0)], program_path="dpir_gray_1080p.mxr", tilesize=[1920, 1080]) 61 | ``` 62 | 63 | ## migraphx-driver useful arguments 64 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled) 65 | 66 | - `--output <file>`: Save the serialized program 67 | 68 | - `--migraphx <file>`: Load a serialized program 69 | 70 | - `--optimize`: Performs common graph optimizations 71 | 72 | - `--exhaustive-tune`: Enables exhaustive search to find the fastest kernel 73 | 74 | - `--disable-fast-math`: Disable fast math optimization 75 | 76 | Also check the [full list of options](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/migraphx-driver.html#options) and [environment variables](https://rocm.docs.amd.com/projects/AMDMIGraphX/en/latest/dev/env_vars.html). 77 | 78 | -------------------------------------------------------------------------------- /vsmigx/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsmigx/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include <filesystem> 3 | #include <iostream> 4 | #include <stdexcept> 5 | #include <string> 6 | #include <vector> 7 | #include <windows.h> 8 | 9 | #define DLL_DIR L"vsmlrt-hip" 10 | 11 | #include <delayimp.h> 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be sorted by dependency.
16 | L"amdhip64_6.dll", 17 | L"migraphx.dll", 18 | L"migraphx_tf.dll", 19 | L"migraphx_onnx.dll", 20 | L"migraphx_c.dll", // must be the last 21 | }; 22 | 23 | namespace fs = std::filesystem; 24 | static fs::path dllDir() { 25 | static const std::wstring res = []() -> std::wstring { 26 | HMODULE mod = 0; 27 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 28 | std::vector buf; 29 | size_t n = 0; 30 | do { 31 | buf.resize(buf.size() + MAX_PATH); 32 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 33 | } while (n >= buf.size()); 34 | buf.resize(n); 35 | std::wstring path(buf.begin(), buf.end()); 36 | return path; 37 | } 38 | throw std::runtime_error("unable to locate myself"); 39 | }(); 40 | return fs::path(res).parent_path(); 41 | } 42 | 43 | FARPROC loadDLLs() { 44 | fs::path dir = dllDir() / DLL_DIR; 45 | HMODULE h = nullptr; 46 | for (const auto dll: dlls) { 47 | fs::path p = dir / dll; 48 | std::wstring s = p; 49 | h = LoadLibraryW(s.c_str()); 50 | if (getenv("VSMIGX_VERBOSE")) 51 | std::wcerr << L"vsmigx: preloading " << p << L": " << h << std::endl; 52 | if (!h) 53 | std::wcerr << L"vsmigx: failed to preload " << s << std::endl; 54 | } 55 | return (FARPROC)h; 56 | } 57 | 58 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 59 | switch (reason) { 60 | case dliNoteStartProcessing: 61 | case dliNoteEndProcessing: 62 | // Nothing to do here. 63 | break; 64 | case dliNotePreLoadLibrary: 65 | //std::cerr << "loading " << info->szDll << std::endl; 66 | if (std::string(info->szDll).find("migraphx_c.dll") != std::string::npos || 67 | std::string(info->szDll).find("amdhip64_6.dll") != std::string::npos 68 | ) 69 | return loadDLLs(); 70 | break; 71 | case dliNotePreGetProcAddress: 72 | // Nothing to do here. 73 | break; 74 | case dliFailLoadLib: 75 | case dliFailGetProc: 76 | // Returning NULL from error notifications will cause the delay load 77 | // runtime to raise a VcppException structured exception, that some code 78 | // might want to handle. 79 | return NULL; 80 | break; 81 | default: 82 | abort(); // unreachable. 83 | break; 84 | } 85 | // Returning NULL causes the delay load machinery to perform default 86 | // processing for this notification. 
87 | return NULL; 88 | } 89 | } // namespace 90 | 91 | extern "C" { 92 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 93 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 94 | }; 95 | #endif 96 | -------------------------------------------------------------------------------- /vsncnn/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ncnn VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | 7 | find_package(protobuf REQUIRED CONFIG) 8 | find_package(ONNX REQUIRED CONFIG) 9 | find_package(ncnn REQUIRED CONFIG) 10 | 11 | add_library(vsncnn SHARED vs_ncnn.cpp onnx2ncnn.cpp ../common/onnx_utils.cpp) 12 | 13 | target_include_directories(vsncnn PRIVATE 14 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 15 | ${ONNX_INCLUDE_DIRS} 16 | ) 17 | 18 | target_link_libraries(vsncnn PRIVATE onnx ncnn) 19 | 20 | set_target_properties(vsncnn PROPERTIES 21 | POSITION_INDEPENDENT_CODE ON 22 | CXX_EXTENSIONS OFF 23 | CXX_STANDARD 17 24 | CXX_STANDARD_REQUIRED ON 25 | ) 26 | 27 | if (CMAKE_CXX_STANDARD GREATER 17) 28 | set_target_properties(vsncnn PROPERTIES CXX_STANDARD ${CMAKE_CXX_STANDARD}) 29 | endif() 30 | 31 | target_include_directories(vsncnn PUBLIC 32 | "${PROJECT_BINARY_DIR}" 33 | ) 34 | 35 | find_package(Git REQUIRED) 36 | execute_process( 37 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 38 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 39 | OUTPUT_VARIABLE VCS_TAG 40 | ) 41 | string(STRIP ${VCS_TAG} VCS_TAG) 42 | configure_file(config.h.in config.h) 43 | 44 | install(TARGETS vsncnn 45 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 46 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 47 | ) 48 | -------------------------------------------------------------------------------- /vsncnn/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsncnn/onnx2ncnn.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ONNX2NCNN_HPP 2 | #define ONNX2NCNN_HPP 3 | 4 | #include <optional> 5 | #include <utility> 6 | #include <vector> 7 | 8 | #include <onnx/onnx_pb.h> 9 | 10 | extern std::optional<std::pair<std::vector<char>, std::vector<char>>> onnx2ncnn(ONNX_NAMESPACE::ModelProto & model); 11 | 12 | #endif // ONNX2NCNN_HPP 13 | -------------------------------------------------------------------------------- /vsort/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ort VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers") 7 | set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries") 8 | 9 | set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend") 10 | set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend") 11 | set(ENABLE_COREML OFF CACHE BOOL "Enable CoreML support") 12 | 13 | find_package(protobuf REQUIRED CONFIG) 14 | find_package(ONNX REQUIRED CONFIG) 15 | 16 | add_library(vsort SHARED 17 | vs_onnxruntime.cpp 18 | win32.cpp 19 | ../common/onnx_utils.cpp 20 | ../common/convert_float_to_float16.cpp 21 | ) 22 | 23 | target_include_directories(vsort PRIVATE 24 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 25 | ${ONNX_INCLUDE_DIRS} 26 | ${ONNX_RUNTIME_API_DIRECTORY} 27 | ) 28 | 29 |
target_link_directories(vsort PRIVATE 30 | ${ONNX_RUNTIME_LIB_DIRECTORY} 31 | ) 32 | 33 | set_target_properties(vsort PROPERTIES 34 | POSITION_INDEPENDENT_CODE ON 35 | CXX_EXTENSIONS OFF 36 | CXX_STANDARD 17 37 | CXX_STANDARD_REQUIRED ON) 38 | 39 | if (CMAKE_CXX_STANDARD GREATER_EQUAL 20) 40 | set_target_properties(vsort PROPERTIES CXX_STANDARD 20) 41 | endif() 42 | 43 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 44 | if (ONNX_VERSION VERSION_LESS 1.16.0) 45 | target_link_libraries(vsort PRIVATE onnx) 46 | else() 47 | target_link_libraries(vsort PRIVATE ONNX::onnx) 48 | endif() 49 | 50 | target_link_libraries(vsort PRIVATE onnxruntime) 51 | 52 | if (ENABLE_CUDA) 53 | find_package(CUDAToolkit REQUIRED) 54 | 55 | add_compile_definitions(ENABLE_CUDA) 56 | target_include_directories(vsort PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 57 | target_link_libraries(vsort PRIVATE CUDA::cudart_static) 58 | 59 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 60 | target_link_options(vsort PRIVATE "/DELAYLOAD:onnxruntime.dll" "delayimp.lib") 61 | endif() 62 | endif() 63 | 64 | if (ENABLE_DML) 65 | add_compile_definitions(ENABLE_DML) 66 | endif() 67 | 68 | if(ENABLE_COREML) 69 | add_compile_definitions(ENABLE_COREML=1) 70 | endif() 71 | 72 | target_include_directories(vsort PUBLIC 73 | "${PROJECT_BINARY_DIR}" 74 | ) 75 | 76 | find_package(Git REQUIRED) 77 | execute_process( 78 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 79 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 80 | OUTPUT_VARIABLE VCS_TAG 81 | ) 82 | string(STRIP ${VCS_TAG} VCS_TAG) 83 | configure_file(config.h.in config.h) 84 | 85 | install(TARGETS vsort 86 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 87 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 88 | ) 89 | -------------------------------------------------------------------------------- /vsort/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth ONNX Runtime 2 | 3 | The vs-onnxruntime plugin provides an optimized CPU & CUDA runtime for some popular AI filters. 4 | 5 | ## Building and Installation 6 | 7 | To build, you will need [ONNX Runtime](https://www.onnxruntime.ai/), [protobuf](https://github.com/protocolbuffers/protobuf), [ONNX](https://github.com/onnx/onnx) and their dependencies. 8 | 9 | Please refer to [ONNX Runtime Docs](https://onnxruntime.ai/docs/install/) for installation notes. 10 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/onnxruntime/releases/latest/). 11 | 12 | Please refer to our [GitHub Actions workflow](../.github/workflows/windows-ort.yml) for sample building instructions. 13 | 14 | If you only use the CPU backend, then you just need to extract the binary release into your `vapoursynth/plugins` directory. 15 | 16 | However, if you also use the CUDA backend, you will need to download some CUDA libraries as well; please see the release page for details. Those CUDA libraries also need to be extracted into the `vapoursynth/plugins` directory. The plugin will try to load them from the `vapoursynth/plugins/vsort/` directory or the `vapoursynth/plugins/vsmlrt-cuda/` directory.
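As an illustration, a typical Windows layout might look like the sketch below (the directory names follow the loading rules just described; the exact DLL set depends on the release you download):

```
vapoursynth/plugins/
├── vsort.dll
├── vsort/          onnxruntime.dll, DirectML.dll, ...
└── vsmlrt-cuda/    shared CUDA libraries (cudart64_*.dll, cublas64_*.dll, cudnn64_*.dll, ...)
```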
17 | 18 | ## Usage 19 | 20 | Prototype: `core.ort.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string provider = "", int device_id = 0, int verbosity = 2, bint cudnn_benchmark = True, bint builtin = False, string builtindir="models", bint fp16 = False, bint path_is_serialization = False, bint use_cuda_graph = False])` 21 | 22 | Arguments: 23 | - `clip[] clips`: the input clips; only 32-bit floating point RGB or GRAY clips are supported. For model-specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 24 | - `string network_path`: the path to the network in ONNX format. 25 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, and the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlapping size. 26 | - `int[] tilesize`: Even for CNNs, where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network-specific docs on the recommended tile size. 27 | - `string provider`: Specifies the device to run the inference on. 28 | - `"CPU"` or `""`: pure CPU backend 29 | - `"CUDA"`: CUDA GPU backend, requires Nvidia Maxwell+ GPUs. 30 | - `"DML"`: DirectML backend 31 | - `"COREML"`: CoreML backend 32 | - `int device_id`: selects the GPU device for the CUDA backend. 33 | - `int verbosity`: specifies the verbosity of logging; the default is warnings. 34 | - 0: fatal errors only, `ORT_LOGGING_LEVEL_FATAL` 35 | - 1: also errors, `ORT_LOGGING_LEVEL_ERROR` 36 | - 2: also warnings, `ORT_LOGGING_LEVEL_WARNING` 37 | - 3: also info, `ORT_LOGGING_LEVEL_INFO` 38 | - 4: everything, `ORT_LOGGING_LEVEL_VERBOSE` 39 | - `bint cudnn_benchmark`: whether to let cuDNN use benchmarking to search for the best convolution kernel to use. Default True. It might incur some startup latency. 40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`. 41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models". 42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation. 43 | - `bint path_is_serialization`: whether the `network_path` argument specifies an ONNX serialization of type `bytes`. 44 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead in the CUDA backend. Not all models are supported. 45 | - `int ml_program`: selects the CoreML provider. 46 | - 0: NeuralNetwork 47 | - 1: MLProgram 48 | 49 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 50 | 51 | The general rule is to either: 52 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or 53 | 2.
set both parameters so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped regions will be thrown away so that only internal output pixels are used. 54 | -------------------------------------------------------------------------------- /vsort/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsort/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define DLL_DIR L"vsort" 12 | #define COMMON_CUDA_DIR L"vsmlrt-cuda" 13 | 14 | namespace { 15 | std::vector<const wchar_t *> dlls = { 16 | // This list must be sorted by dependency. 17 | L"DirectML.dll", 18 | L"onnxruntime.dll", // must be the last 19 | }; 20 | 21 | static std::vector<const wchar_t *> cudaDlls { 22 | L"cudart64", 23 | L"cublasLt64", L"cublas64", 24 | L"cufft64", 25 | L"cudnn_ops_infer64", L"cudnn_cnn_infer64", L"cudnn_adv_infer64", L"cudnn64", 26 | L"cupti64", 27 | }; 28 | 29 | bool verbose() { return getenv("VSORT_VERBOSE") != nullptr; } 30 | 31 | namespace fs = std::filesystem; 32 | static fs::path dllDir() { 33 | static const std::wstring res = []() -> std::wstring { 34 | HMODULE mod = 0; 35 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 36 | std::vector<wchar_t> buf; 37 | size_t n = 0; 38 | do { 39 | buf.resize(buf.size() + MAX_PATH); 40 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 41 | } while (n >= buf.size()); 42 | buf.resize(n); 43 | std::wstring path(buf.begin(), buf.end()); 44 | return path; 45 | } 46 | throw std::runtime_error("unable to locate myself"); 47 | }(); 48 | return fs::path(res).parent_path(); 49 | } 50 | 51 | FARPROC loadDLLs() { 52 | fs::path dir = dllDir() / DLL_DIR; 53 | HMODULE h = nullptr; 54 | for (const auto dll: dlls) { 55 | fs::path p = dir / dll; 56 | std::wstring s = p; 57 | h = LoadLibraryW(s.c_str()); 58 | if (verbose()) 59 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl; 60 | if (!h) 61 | std::wcerr << DLL_DIR << L": failed to preload " << s << std::endl; 62 | } 63 | return (FARPROC)h; 64 | } 65 | 66 | static void *dummy() { // mimic OrtGetApiBase 67 | return nullptr; 68 | } 69 | 70 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 71 | switch (reason) { 72 | case dliNoteStartProcessing: 73 | case dliNoteEndProcessing: 74 | // Nothing to do here. 75 | break; 76 | case dliNotePreLoadLibrary: 77 | //std::cerr << "loading " << info->szDll << std::endl; 78 | if (std::string(info->szDll).find("onnxruntime.dll") != std::string::npos) 79 | return loadDLLs(); 80 | break; 81 | case dliNotePreGetProcAddress: 82 | // Nothing to do here. 83 | break; 84 | case dliFailLoadLib: 85 | case dliFailGetProc: 86 | // Returning NULL from error notifications will cause the delay load 87 | // runtime to raise a VcppException structured exception, that some code 88 | // might want to handle. 89 | // The SE will crash the process, so instead we return a dummy function. 90 | return (FARPROC)dummy; 91 | break; 92 | default: 93 | abort(); // unreachable.
94 | break; 95 | } 96 | // Returning NULL causes the delay load machinery to perform default 97 | // processing for this notification. 98 | return NULL; 99 | } 100 | } // namespace 101 | 102 | extern "C" { 103 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 104 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 105 | }; 106 | 107 | bool preloadCudaDlls() { 108 | std::map<std::wstring, fs::path> dllmap; 109 | 110 | auto findDllIn = [&](const std::filesystem::path &dir) { 111 | if (!std::filesystem::is_directory(dir)) 112 | return; 113 | for (const auto &ent: std::filesystem::directory_iterator{dir}) { 114 | if (!ent.is_regular_file()) 115 | continue; 116 | const auto path = ent.path(); 117 | if (path.extension() != ".dll") 118 | continue; 119 | const std::wstring filename = path.filename().wstring(); 120 | for (const auto &dll: cudaDlls) { 121 | if (dllmap.count(dll) > 0) 122 | continue; 123 | if (filename.find(dll) == 0) { 124 | if (verbose()) 125 | std::wcerr << DLL_DIR << L": found " << path << L" for " << dll << std::endl; 126 | dllmap.insert({ dll, path }); 127 | break; 128 | } 129 | } 130 | } 131 | }; 132 | const fs::path dir = dllDir(); 133 | findDllIn(dir / DLL_DIR); 134 | findDllIn(dir / COMMON_CUDA_DIR); 135 | 136 | if (verbose()) { 137 | for (const auto &pair: dllmap) 138 | std::wcerr << DLL_DIR << L": will load " << pair.first << L" from " << pair.second << std::endl; 139 | } 140 | for (const auto &dll: cudaDlls) { 141 | if (dllmap.count(dll) == 0) { 142 | if (verbose()) 143 | std::wcerr << DLL_DIR << L": unable to preload " << dll << L": not found" << std::endl; 144 | return false; // a required CUDA DLL is missing, regardless of verbosity 145 | } 146 | 147 | std::wstring p = dllmap[dll]; 148 | HMODULE h = LoadLibraryW(p.c_str()); 149 | if (verbose()) 150 | std::wcerr << DLL_DIR << L": preloading " << p << L": " << h << std::endl; 151 | if (!h) return false; 152 | } 153 | return true; 154 | } 155 | #endif 156 | -------------------------------------------------------------------------------- /vsov/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-ov VERSION 3.0 LANGUAGES CXX) 4 | 5 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 6 | set(ENABLE_VISUALIZATION OFF CACHE BOOL "Enable support for network visualization") 7 | set(WIN32_SHARED_OPENVINO OFF CACHE BOOL "Build for win32 with shared openvino library") 8 | 9 | find_package(OpenVINO REQUIRED CONFIG) 10 | 11 | add_library(vsov SHARED 12 | vs_openvino.cpp 13 | win32.cpp 14 | ../common/onnx_utils.cpp 15 | ../common/convert_float_to_float16.cpp 16 | ) 17 | 18 | if(ENABLE_VISUALIZATION) 19 | target_compile_definitions(vsov PRIVATE ENABLE_VISUALIZATION) 20 | endif() 21 | 22 | if(WIN32_SHARED_OPENVINO) 23 | target_compile_definitions(vsov PRIVATE WIN32_SHARED_OPENVINO) 24 | endif() 25 | 26 | find_package(protobuf REQUIRED CONFIG) 27 | find_package(ONNX REQUIRED CONFIG) 28 | 29 | # https://github.com/onnx/onnx/commit/21bff4e55dcefecc069c679115baae6b00caa0d5 30 | if (ONNX_VERSION VERSION_LESS 1.16.0) 31 | target_link_libraries(vsov PRIVATE onnx) 32 | else() 33 | target_link_libraries(vsov PRIVATE ONNX::onnx) 34 | endif() 35 | 36 | target_include_directories(vsov PRIVATE 37 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 38 | ${ONNX_INCLUDE_DIRS} 39 | ) 40 | 41 | target_link_libraries(vsov PRIVATE openvino::runtime) 42 | 43 | set_target_properties(vsov PROPERTIES 44 | CXX_EXTENSIONS OFF 45 | CXX_STANDARD 17 46 | CXX_STANDARD_REQUIRED ON 47 | ) 48 | 49 | if (WIN32) 50 |
if(WIN32_SHARED_OPENVINO) 51 | target_link_options(vsov PRIVATE "/DELAYLOAD:openvino.dll" "delayimp.lib") 52 | else() 53 | target_link_options(vsov PRIVATE "/DELAYLOAD:tbb.dll" "delayimp.lib") 54 | endif() 55 | endif() 56 | 57 | target_include_directories(vsov PUBLIC 58 | "${PROJECT_BINARY_DIR}" 59 | ) 60 | 61 | find_package(Git REQUIRED) 62 | execute_process( 63 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 64 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 65 | OUTPUT_VARIABLE VCS_TAG 66 | ) 67 | string(STRIP ${VCS_TAG} VCS_TAG) 68 | configure_file(config.h.in config.h) 69 | 70 | install(TARGETS vsov 71 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 72 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 73 | ) 74 | -------------------------------------------------------------------------------- /vsov/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth OpenVINO 2 | 3 | The vs-openvino plugin provides an optimized *pure* CPU runtime for some popular AI filters. 4 | 5 | ## Building and Installation 6 | 7 | To build, you will need [OpenVINO](https://docs.openvino.ai/latest/get_started.html) and its dependencies. 8 | Only `Model Optimizer` and `Inference Engine` are required. 9 | 10 | You can download official Intel releases: 11 | - [Linux](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_linux_header.html) 12 | - [Windows](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_windows_header.html) 13 | - [macOS](https://docs.openvino.ai/latest/openvino_docs_install_guides_installing_openvino_macos_header.html) 14 | 15 | Or, you can use our prebuilt Windows binary releases from [AmusementClub](https://github.com/AmusementClub/openvino/releases/latest/); our release has the benefit of static linking support. 16 | 17 | Sample cmake commands to build: 18 | ```bash 19 | cmake -S . -B build -G Ninja -D CMAKE_BUILD_TYPE=Release \ 20 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded \ 21 | -D InferenceEngine_DIR=openvino/runtime/cmake \ 22 | -D VAPOURSYNTH_INCLUDE_DIRECTORY="path/to/vapoursynth/include" 23 | cmake --build build 24 | cmake --install build --prefix install 25 | ``` 26 | You should find `vsov.dll` (or `libvsov.so`) under `install/bin`. You will also need Intel TBB (you can get 27 | `tbb.dll` from the OpenVINO release). On Windows, `tbb.dll` must be placed under the `vapoursynth/plugins/vsov/` 28 | directory for `vsov.dll` to find it. 29 | 30 | ## Usage 31 | 32 | Prototype: `core.ov.Model(clip[] clips, string network_path[, int[] overlap = None, int[] tilesize = None, string device = "CPU", bint builtin = 0, string builtindir="models", bint fp16 = False, function config = None, bint path_is_serialization = False])` 33 | 34 | Arguments: 35 | - `clip[] clips`: the input clips; only 32-bit floating point RGB or GRAY clips are supported. For model-specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 36 | - `string network_path`: the path to the network in ONNX format. 37 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, and the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlapping size.
38 | - `int[] tilesize`: Even for CNNs, where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network-specific docs on the recommended tile size. 39 | - `string device`: Specifies the device to run the inference on. Currently `"CPU"` and `"GPU"` are supported. `"GPU"` requires Intel graphics (Broadwell+ processors with Gen8+ integrated GPUs or Xe discrete GPUs) with a compatible graphics driver and compute runtime. 40 | - `bint builtin`: whether to load the model from the VS plugins directory, see also `builtindir`. 41 | - `string builtindir`: the model directory under the VS plugins directory for builtin models, default "models". 42 | - `bint fp16`: whether to quantize the model to fp16 for faster and more memory-efficient computation. 43 | - `function config`: plugin configuration parameters. It must be a callable object (e.g. a function) with no positional arguments that returns the configuration parameters in a dictionary `dict`. The dictionary must use string `str` for its keys and `int`, `float` or `str` for its values. Supported parameters: [CPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_CPU.html#supported-configuration-parameters), [GPU](https://docs.openvino.ai/2021.4/openvino_docs_IE_DG_supported_plugins_GPU.html#supported-configuration-parameters) (the prefix `KEY_` has to be removed). Example: `config = lambda: dict(CPU_THROUGHPUT_STREAMS=2)` 44 | - `bint path_is_serialization`: whether the `network_path` argument specifies an ONNX serialization of type `bytes`. 45 | 46 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 47 | 48 | The general rule is to either: 49 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or 50 | 2. set both parameters so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped regions will be thrown away so that only internal output pixels are used. 51 | -------------------------------------------------------------------------------- /vsov/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vsov/win32.cpp: -------------------------------------------------------------------------------- 1 | #ifdef _MSC_VER 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define DLL_DIR L"vsov" 10 | 11 | #include 12 | 13 | namespace { 14 | std::vector<const wchar_t *> dlls = { 15 | // This list must be sorted by dependency.
16 | #ifdef WIN32_SHARED_OPENVINO 17 | L"tbb12.dll", 18 | L"openvino.dll", // must be the last 19 | #else // WIN32_SHARED_OPENVINO 20 | L"tbb12.dll", // must be the last 21 | #endif // WIN32_SHARED_OPENVINO 22 | }; 23 | 24 | namespace fs = std::filesystem; 25 | static fs::path dllDir() { 26 | static const std::wstring res = []() -> std::wstring { 27 | HMODULE mod = 0; 28 | if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) { 29 | std::vector buf; 30 | size_t n = 0; 31 | do { 32 | buf.resize(buf.size() + MAX_PATH); 33 | n = GetModuleFileNameW(mod, buf.data(), buf.size()); 34 | } while (n >= buf.size()); 35 | buf.resize(n); 36 | std::wstring path(buf.begin(), buf.end()); 37 | return path; 38 | } 39 | throw std::runtime_error("unable to locate myself"); 40 | }(); 41 | return fs::path(res).parent_path(); 42 | } 43 | 44 | FARPROC loadDLLs() { 45 | fs::path dir = dllDir() / DLL_DIR; 46 | HMODULE h = nullptr; 47 | for (const auto dll: dlls) { 48 | fs::path p = dir / dll; 49 | std::wstring s = p; 50 | h = LoadLibraryW(s.c_str()); 51 | if (getenv("VSOV_VERBOSE")) 52 | std::wcerr << L"vsov: preloading " << p << L": " << h << std::endl; 53 | if (!h) 54 | std::wcerr << L"vsov: failed to preload " << s << std::endl; 55 | } 56 | return (FARPROC)h; 57 | } 58 | 59 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) { 60 | switch (reason) { 61 | case dliNoteStartProcessing: 62 | case dliNoteEndProcessing: 63 | // Nothing to do here. 64 | break; 65 | case dliNotePreLoadLibrary: 66 | //std::cerr << "loading " << info->szDll << std::endl; 67 | #ifdef WIN32_SHARED_OPENVINO 68 | if (std::string(info->szDll).find("openvino.dll") != std::string::npos) 69 | return loadDLLs(); 70 | #else // WIN32_SHARED_OPENVINO 71 | if (std::string(info->szDll).find("tbb.dll") != std::string::npos) 72 | return loadDLLs(); 73 | #endif // WIN32_SHARED_OPENVINO 74 | break; 75 | case dliNotePreGetProcAddress: 76 | // Nothing to do here. 77 | break; 78 | case dliFailLoadLib: 79 | case dliFailGetProc: 80 | // Returning NULL from error notifications will cause the delay load 81 | // runtime to raise a VcppException structured exception, that some code 82 | // might want to handle. 83 | return NULL; 84 | break; 85 | default: 86 | abort(); // unreachable. 87 | break; 88 | } 89 | // Returning NULL causes the delay load machinery to perform default 90 | // processing for this notification. 
91 | return NULL; 92 | } 93 | } // namespace 94 | 95 | extern "C" { 96 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook; 97 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook; 98 | }; 99 | #endif 100 | -------------------------------------------------------------------------------- /vstrt/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20) 2 | 3 | project(vs-trt VERSION 3.1 LANGUAGES CXX) 4 | 5 | set(CMAKE_CXX_STANDARD 17) 6 | 7 | set(VAPOURSYNTH_INCLUDE_DIRECTORY "" CACHE PATH "Path to VapourSynth headers") 8 | set(TENSORRT_HOME "" CACHE PATH "Path to TensorRT") 9 | option(USE_NVINFER_PLUGIN "Initialize nvinfer_plugin" FALSE) 10 | option(USE_NVINFER_PLUGIN_STATIC "Use static nvinfer_plugin" FALSE) 11 | set(TENSORRT_LIBRARY_SUFFIX "" CACHE STRING "TensorRT library suffix") 12 | 13 | FIND_PACKAGE(CUDAToolkit REQUIRED) 14 | 15 | add_library(vstrt SHARED 16 | $<$: longpath.manifest> 17 | vs_tensorrt.cpp 18 | win32.cpp 19 | ) 20 | 21 | target_include_directories(vstrt PRIVATE 22 | ${VAPOURSYNTH_INCLUDE_DIRECTORY} 23 | ${CUDAToolkit_INCLUDE_DIRS} 24 | ${TENSORRT_HOME}/include 25 | ) 26 | 27 | set_target_properties(vstrt PROPERTIES 28 | CXX_EXTENSIONS OFF 29 | POSITION_INDEPENDENT_CODE ON 30 | CXX_STANDARD 20 31 | CXX_STANDARD_REQUIRED ON 32 | ) 33 | 34 | target_link_directories(vstrt PRIVATE ${TENSORRT_HOME}/lib) 35 | target_link_libraries(vstrt PRIVATE CUDA::cudart_static "nvinfer${TENSORRT_LIBRARY_SUFFIX}") 36 | 37 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 38 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer${TENSORRT_LIBRARY_SUFFIX}.dll" "delayimp.lib") 39 | endif() 40 | 41 | if (USE_NVINFER_PLUGIN) 42 | add_definitions(-DUSE_NVINFER_PLUGIN) 43 | if (USE_NVINFER_PLUGIN_STATIC) 44 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin_static${TENSORRT_LIBRARY_SUFFIX}") 45 | else() 46 | target_link_libraries(vstrt PRIVATE "nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}") 47 | 48 | if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") 49 | target_link_options(vstrt PRIVATE "/DELAYLOAD:nvinfer_plugin${TENSORRT_LIBRARY_SUFFIX}.dll") 50 | endif() 51 | endif() 52 | endif() 53 | 54 | target_include_directories(vstrt PUBLIC 55 | "${PROJECT_BINARY_DIR}" 56 | ) 57 | 58 | find_package(Git REQUIRED) 59 | execute_process( 60 | COMMAND ${GIT_EXECUTABLE} describe --tags --long --always 61 | WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" 62 | OUTPUT_VARIABLE VCS_TAG 63 | ) 64 | string(STRIP ${VCS_TAG} VCS_TAG) 65 | configure_file(config.h.in config.h) 66 | 67 | install(TARGETS vstrt 68 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 69 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 70 | ) 71 | -------------------------------------------------------------------------------- /vstrt/README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth TensorRT 2 | 3 | The vs-tensorrt plugin provides optimized CUDA runtime for some popular AI filters. 4 | 5 | ## Usage 6 | 7 | Prototype: `core.trt.Model(clip[] clips, string engine_path[, int[] overlap, int[] tilesize, int device_id=0, bint use_cuda_graph=False, int num_streams=1, int verbosity=2, string flexible_output_prop=""])` 8 | 9 | Arguments: 10 | - `clip[] clips`: the input clips, only 32-bit floating point RGB or GRAY clips are supported. For model specific input requirements, please consult our [wiki](https://github.com/AmusementClub/vs-mlrt/wiki). 
11 | - `string engine_path`: the path to the prebuilt engine (see below). 12 | - `int[] overlap`: some networks (e.g. [CNN](https://en.wikipedia.org/wiki/Convolutional_neural_network)) support arbitrary input shapes, while other networks might only support fixed input shapes, and the input clip must be processed in tiles. The `overlap` argument specifies the overlapping (horizontal and vertical, or both, in pixels) between adjacent tiles to minimize boundary issues. Please refer to network-specific docs on the recommended overlapping size. 13 | - `int[] tilesize`: Even for CNNs, where arbitrary input sizes could be supported, sometimes the network does not work well for the entire range of input dimensions, and you have to limit the size of each tile. This parameter specifies the tile size (horizontal and vertical, or both, including the overlapping). Please refer to network-specific docs on the recommended tile size. 14 | - `int device_id`: Specifies the GPU device id to use, default 0. Requires Nvidia GPUs with second-generation Kepler architecture onwards. 15 | - `bint use_cuda_graph`: whether to use CUDA Graphs to improve performance and reduce CPU overhead. 16 | - `int num_streams`: number of concurrent CUDA streams to use. Default 1. Increase it if the GPU is not saturated. 17 | - `int verbosity`: the verbosity level of the TensorRT runtime. Messages are written to `stderr`. 18 | `0`: Internal error. `1`: Application error. `2`: Warning. `3`: Informational messages with instructional information. `4`: Verbose messages with debugging information. 19 | - `string flexible_output_prop`: used to support ONNX models with an arbitrary number of output planes. 20 | 21 | ```python3 22 | from typing import TypedDict 23 | 24 | class Output(TypedDict): 25 | clip: vs.VideoNode 26 | num_planes: int 27 | 28 | prop = "planes" # arbitrary non-empty string 29 | output = core.trt.Model(src, engine_path, flexible_output_prop=prop) # type: Output 30 | 31 | clip = output["clip"] 32 | num_planes = output["num_planes"] 33 | 34 | output_planes = [ 35 | clip.std.PropToClip(prop=f"{prop}{i}") 36 | for i in range(num_planes) 37 | ] # type: list[vs.VideoNode] 38 | ``` 39 | 40 | When `overlap` and `tilesize` are not specified, the filter will internally try to resize the network to fit the input clips. This might not always work (for example, the network might require the width to be divisible by 8), and the filter will error out in this case. 41 | 42 | The general rule is to either: 43 | 1. leave out `overlap` and `tilesize` altogether and just process the input frame in one tile, or 44 | 2. set both parameters so that the frame is processed in `tilesize[0]` x `tilesize[1]` tiles, and adjacent tiles will have an overlap of `overlap[0]` x `overlap[1]` pixels in each direction. The overlapped regions will be thrown away so that only internal output pixels are used. 45 | 46 | ## Instructions 47 | 48 | ### Build engine with dynamic shape support 49 | - Requires models with built-in dynamic shape support, e.g. `waifu2x_v3.7z` and `dpir_v3.7z`. 50 | 51 | 1. Build engine 52 | ```shell 53 | trtexec --onnx=drunet_gray.onnx --minShapes=input:1x2x8x8 --optShapes=input:1x2x64x64 --maxShapes=input:1x2x1080x1920 --saveEngine=dpir_gray_1080p_dynamic.engine 54 | ``` 55 | 56 | The engine will be optimized for `64x64` input and can be applied to eligible inputs with shapes from `8x8` to `1920x1080` by specifying the `tilesize` parameter in the `trt` plugin.
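If the ONNX model itself has static input dimensions, the shape flags can be omitted entirely; a minimal sketch (the model filename is illustrative, and `--fp16` is optional, see the argument list below):

```shell
trtexec --onnx=model_static.onnx --fp16 --saveEngine=model_static.engine
```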
57 | 58 | Also check [trtexec useful arguments](#trtexec-useful-arguments). 59 | 60 | ### Run model 61 | In a vpy script: 62 | ```python3 63 | # DPIR 64 | src = core.std.BlankClip(src, width=640, height=360, format=vs.GRAYS) 65 | sigma = 10.0 66 | flt = core.trt.Model([src, core.std.BlankClip(src, color=sigma/255.0)], engine_path="dpir_gray_1080p_dynamic.engine", tilesize=[640, 360]) 67 | ``` 68 | 69 | ## trtexec useful arguments 70 | - `--workspace=N`: Set workspace size in megabytes (default = 16) 71 | 72 | - `--fp16`: Enable fp16 precision, in addition to fp32 (default = disabled) 73 | 74 | - `--noTF32`: Disable tf32 precision (default is to enable tf32, in addition to fp32, Ampere only) 75 | 76 | - `--device=N`: Select cuda device N (default = 0) 77 | 78 | - `--timingCacheFile=`: Save/load the serialized global timing cache 79 | 80 | - `--buildOnly`: Skip inference perf measurement (default = disabled) 81 | 82 | - `--verbose`: Use verbose logging (default = false) 83 | 84 | - `--profilingVerbosity=mode`: Specify profiling verbosity. 85 | 86 | ``` 87 | mode ::= layer_names_only|detailed|none 88 | ``` 89 | 90 | (default = layer_names_only) 91 | 92 | - `--tacticSources=tactics`: Specify the tactics to be used by adding (+) or removing (-) tactics from the default 93 | 94 | tactic sources (default = all available tactics). 95 | 96 | Note: Currently only cuDNN, cuBLAS and cuBLAS-LT are listed as optional tactics. 97 | 98 | Tactic Sources: 99 | ``` 100 | tactics ::= [","tactic] 101 | tactic ::= (+|-)lib 102 | lib ::= "CUBLAS"|"CUBLAS_LT"|"CUDNN" 103 | ``` 104 | 105 | For example, to disable cudnn and enable cublas: `--tacticSources=-CUDNN,+CUBLAS` 106 | 107 | - `--useCudaGraph`: Use CUDA graph to capture engine execution and then launch inference (default = disabled). 108 | This flag may be ignored if the graph capture fails. 109 | 110 | - `--noDataTransfers`: Disable DMA transfers to and from device (default = enabled).
111 | 112 | - `--saveEngine=`: Save the serialized engine 113 | 114 | - `--loadEngine=`: Load a serialized engine 115 | 116 | -------------------------------------------------------------------------------- /vstrt/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /vstrt/cuda_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_HELPER_H_ 2 | #define VSTRT_CUDA_HELPER_H_ 3 | 4 | #include 5 | 6 | #include 7 | 8 | #define checkError(expr) do { \ 9 | using namespace std::string_literals; \ 10 | cudaError_t __err = expr; \ 11 | if (__err != cudaSuccess) { \ 12 | const char * message = cudaGetErrorString(__err); \ 13 | return set_error("'"s + # expr + "' failed: " + message); \ 14 | } \ 15 | } while(0) 16 | 17 | #endif // VSTRT_CUDA_HELPER_H_ 18 | -------------------------------------------------------------------------------- /vstrt/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_CUDA_UTILS_H_ 2 | #define VSTRT_CUDA_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | template 12 | requires 13 | std::default_initializable && 14 | std::movable && 15 | std::is_trivially_copy_assignable_v && 16 | std::convertible_to && 17 | std::invocable 18 | struct Resource { 19 | T data; 20 | 21 | [[nodiscard]] 22 | constexpr Resource() noexcept = default; 23 | 24 | [[nodiscard]] 25 | constexpr Resource(T && x) noexcept : data(x) {} 26 | 27 | [[nodiscard]] 28 | constexpr Resource(Resource&& other) noexcept 29 | : data(std::exchange(other.data, T{})) 30 | { } 31 | 32 | constexpr Resource& operator=(Resource&& other) noexcept { 33 | if (this == &other) return *this; 34 | deleter_(std::move(data)); 35 | data = std::exchange(other.data, T{}); 36 | return *this; 37 | } 38 | 39 | constexpr Resource& operator=(const Resource & other) = delete; 40 | 41 | Resource(const Resource& other) = delete; 42 | 43 | constexpr operator T() const noexcept { 44 | return data; 45 | } 46 | 47 | constexpr auto deleter_(T && x) noexcept { 48 | if (x) { 49 | deleter(x); 50 | } 51 | } 52 | 53 | constexpr Resource& operator=(T && x) noexcept { 54 | deleter_(std::move(data)); 55 | data = x; 56 | return *this; 57 | } 58 | 59 | constexpr ~Resource() noexcept { 60 | deleter_(std::move(data)); 61 | } 62 | }; 63 | 64 | struct MemoryResource { 65 | Resource h_data; 66 | Resource d_data; 67 | size_t size; 68 | }; 69 | 70 | using StreamResource = Resource; 71 | using GraphExecResource = Resource; 72 | 73 | #endif // VSTRT_CUDA_UTILS_H_ 74 | -------------------------------------------------------------------------------- /vstrt/inference_helper.h: -------------------------------------------------------------------------------- 1 | #ifndef VSTRT_INFERENCE_HELPER_H_ 2 | #define VSTRT_INFERENCE_HELPER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "cuda_helper.h" 13 | #include "trt_utils.h" 14 | 15 | struct InputInfo { 16 | int width; 17 | int height; 18 | int pitch; 19 | int bytes_per_sample; 20 | int tile_w; 21 | int tile_h; 22 | }; 23 | 24 | struct OutputInfo { 25 | int pitch; 26 | int bytes_per_sample; 27 | }; 28 | 29 | struct IOInfo { 30 | InputInfo in; 31 | OutputInfo out; 32 | int w_scale; 33 | int h_scale; 34 | int overlap_w; 35 | int overlap_h; 36 | }; 37 | 38 | 
static inline 39 | std::optional inference( 40 | const InferenceInstance & instance, 41 | int device_id, 42 | bool use_cuda_graph, 43 | const IOInfo & info, 44 | const std::vector & src_ptrs, 45 | const std::vector & dst_ptrs 46 | ) noexcept { 47 | 48 | const auto set_error = [](const ErrorMessage & error_message) { 49 | return error_message; 50 | }; 51 | 52 | checkError(cudaSetDevice(device_id)); 53 | 54 | int src_tile_w_bytes = info.in.tile_w * info.in.bytes_per_sample; 55 | int src_tile_bytes = info.in.tile_h * info.in.tile_w * info.in.bytes_per_sample; 56 | int dst_tile_w = info.in.tile_w * info.w_scale; 57 | int dst_tile_h = info.in.tile_h * info.h_scale; 58 | int dst_tile_w_bytes = dst_tile_w * info.out.bytes_per_sample; 59 | int dst_tile_bytes = dst_tile_h * dst_tile_w * info.out.bytes_per_sample; 60 | 61 | int step_w = info.in.tile_w - 2 * info.overlap_w; 62 | int step_h = info.in.tile_h - 2 * info.overlap_h; 63 | 64 | int y = 0; 65 | while (true) { 66 | int y_crop_start = (y == 0) ? 0 : info.overlap_h; 67 | int y_crop_end = (y == info.in.height - info.in.tile_h) ? 0 : info.overlap_h; 68 | 69 | int x = 0; 70 | while (true) { 71 | int x_crop_start = (x == 0) ? 0 : info.overlap_w; 72 | int x_crop_end = (x == info.in.width - info.in.tile_w) ? 0 : info.overlap_w; 73 | 74 | { 75 | uint8_t * h_data = instance.src.h_data.data; 76 | for (const uint8_t * _src_ptr : src_ptrs) { 77 | const uint8_t * src_ptr { _src_ptr + 78 | y * info.in.pitch + x * info.in.bytes_per_sample 79 | }; 80 | 81 | vs_bitblt( 82 | h_data, src_tile_w_bytes, 83 | src_ptr, info.in.pitch, 84 | static_cast(src_tile_w_bytes), 85 | static_cast(info.in.tile_h) 86 | ); 87 | 88 | h_data += src_tile_bytes; 89 | } 90 | } 91 | 92 | if (use_cuda_graph) { 93 | checkError(cudaGraphLaunch(instance.graphexec, instance.stream)); 94 | } else { 95 | auto result = enqueue( 96 | instance.src, instance.dst, 97 | instance.exec_context, instance.stream 98 | ); 99 | 100 | if (result.has_value()) { 101 | return set_error(result.value()); 102 | } 103 | } 104 | checkError(cudaStreamSynchronize(instance.stream)); 105 | 106 | { 107 | const uint8_t * h_data = instance.dst.h_data.data; 108 | for (uint8_t * _dst_ptr : dst_ptrs) { 109 | uint8_t * dst_ptr { _dst_ptr + 110 | info.h_scale * y * info.out.pitch + info.w_scale * x * info.out.bytes_per_sample 111 | }; 112 | 113 | vs_bitblt( 114 | dst_ptr + (y_crop_start * info.out.pitch + x_crop_start * info.out.bytes_per_sample), 115 | info.out.pitch, 116 | h_data + (y_crop_start * dst_tile_w_bytes + x_crop_start * info.out.bytes_per_sample), 117 | dst_tile_w_bytes, 118 | static_cast(dst_tile_w_bytes - (x_crop_start + x_crop_end) * info.out.bytes_per_sample), 119 | static_cast(dst_tile_h - (y_crop_start + y_crop_end)) 120 | ); 121 | 122 | h_data += dst_tile_bytes; 123 | } 124 | } 125 | 126 | if (x + info.in.tile_w == info.in.width) { 127 | break; 128 | } 129 | 130 | x = std::min(x + step_w, info.in.width - info.in.tile_w); 131 | } 132 | 133 | if (y + info.in.tile_h == info.in.height) { 134 | break; 135 | } 136 | 137 | y = std::min(y + step_h, info.in.height - info.in.tile_h); 138 | } 139 | 140 | return {}; 141 | } 142 | 143 | #endif // VSTRT_INFERENCE_HELPER_H_ 144 | -------------------------------------------------------------------------------- /vstrt/longpath.manifest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | true 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /vstrt/trt_utils.h: 
-------------------------------------------------------------------------------- 1 | #ifndef VSTRT_TRT_UTILS_H_ 2 | #define VSTRT_TRT_UTILS_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include "cuda_helper.h" 15 | #include "cuda_utils.h" 16 | 17 | using ErrorMessage = std::string; 18 | 19 | struct RequestedTileSize { 20 | int tile_w; 21 | int tile_h; 22 | }; 23 | 24 | struct VideoSize { 25 | int width; 26 | int height; 27 | }; 28 | 29 | using TileSize = std::variant; 30 | 31 | struct InferenceInstance { 32 | MemoryResource src; 33 | MemoryResource dst; 34 | StreamResource stream; 35 | std::unique_ptr exec_context; 36 | GraphExecResource graphexec; 37 | 38 | #if NV_TENSORRT_MAJOR >= 10 39 | Resource d_context_allocation; 40 | #endif 41 | }; 42 | 43 | class Logger : public nvinfer1::ILogger { 44 | void log(Severity severity, const char* message) noexcept override { 45 | if (severity <= verbosity) { 46 | std::cerr << message << '\n'; 47 | } 48 | } 49 | 50 | public: 51 | Logger() = default; 52 | 53 | void set_verbosity(Severity value) noexcept { 54 | this->verbosity = value; 55 | } 56 | 57 | private: 58 | Severity verbosity; 59 | }; 60 | 61 | static inline 62 | std::optional selectProfile( 63 | const std::unique_ptr & engine, 64 | const TileSize & tile_size, 65 | int batch_size = 1 66 | ) noexcept { 67 | 68 | int tile_w, tile_h; 69 | if (std::holds_alternative(tile_size)) { 70 | tile_w = std::get(tile_size).tile_w; 71 | tile_h = std::get(tile_size).tile_h; 72 | } else { 73 | tile_w = std::get(tile_size).width; 74 | tile_h = std::get(tile_size).height; 75 | } 76 | 77 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 78 | auto input_name = engine->getIOTensorName(0); 79 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 80 | 81 | // finds the optimal profile 82 | for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) { 83 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 84 | nvinfer1::Dims opt_dims = engine->getProfileShape( 85 | input_name, i, nvinfer1::OptProfileSelector::kOPT 86 | ); 87 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 88 | nvinfer1::Dims opt_dims = engine->getProfileDimensions( 89 | 0, i, nvinfer1::OptProfileSelector::kOPT 90 | ); 91 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 92 | 93 | if (opt_dims.d[0] != batch_size) { 94 | continue; 95 | } 96 | if (opt_dims.d[2] == tile_h && opt_dims.d[3] == tile_w) { 97 | return i; 98 | } 99 | } 100 | 101 | // finds the first eligible profile 102 | for (int i = 0; i < engine->getNbOptimizationProfiles(); ++i) { 103 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 104 | nvinfer1::Dims min_dims = engine->getProfileShape( 105 | input_name, i, nvinfer1::OptProfileSelector::kMIN 106 | ); 107 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 108 | nvinfer1::Dims min_dims = engine->getProfileDimensions( 109 | 0, i, nvinfer1::OptProfileSelector::kMIN 110 | ); 111 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 112 | 113 | if (min_dims.d[0] > batch_size) { 114 | continue; 115 | } 116 | if (min_dims.d[2] > tile_h || min_dims.d[3] > tile_w) { 117 | continue; 118 | } 119 | 120 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 121 | nvinfer1::Dims max_dims = engine->getProfileShape( 122 | input_name, i, nvinfer1::OptProfileSelector::kMAX 123 | ); 124 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 125 | nvinfer1::Dims max_dims = engine->getProfileDimensions( 126 | 0, 
i, nvinfer1::OptProfileSelector::kMAX 127 | ); 128 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 129 | 130 | if (max_dims.d[0] < batch_size) { 131 | continue; 132 | } 133 | if (max_dims.d[2] < tile_h || max_dims.d[3] < tile_w) { 134 | continue; 135 | } 136 | 137 | return i; 138 | } 139 | 140 | // returns not-found 141 | return {}; 142 | } 143 | 144 | static inline 145 | std::optional enqueue( 146 | const MemoryResource & src, 147 | const MemoryResource & dst, 148 | const std::unique_ptr & exec_context, 149 | cudaStream_t stream 150 | ) noexcept { 151 | 152 | const auto set_error = [](const ErrorMessage & message) { 153 | return message; 154 | }; 155 | 156 | checkError(cudaMemcpyAsync( 157 | src.d_data, src.h_data, src.size, 158 | cudaMemcpyHostToDevice, stream 159 | )); 160 | 161 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 162 | auto input_name = exec_context->getEngine().getIOTensorName(0); 163 | auto output_name = exec_context->getEngine().getIOTensorName(1); 164 | 165 | if (!exec_context->setTensorAddress(input_name, src.d_data.data)) { 166 | return set_error("set input tensor address failed"); 167 | } 168 | if (!exec_context->setTensorAddress(output_name, dst.d_data.data)) { 169 | return set_error("set output tensor address failed"); 170 | } 171 | if (!exec_context->enqueueV3(stream)) { 172 | return set_error("enqueue error"); 173 | } 174 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 175 | void * bindings[] { 176 | static_cast(src.d_data.data), 177 | static_cast(dst.d_data.data) 178 | }; 179 | 180 | if (!exec_context->enqueueV2(bindings, stream, nullptr)) { 181 | return set_error("enqueue error"); 182 | } 183 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 184 | 185 | checkError(cudaMemcpyAsync( 186 | dst.h_data, dst.d_data, dst.size, 187 | cudaMemcpyDeviceToHost, stream 188 | )); 189 | 190 | return {}; 191 | } 192 | 193 | static inline 194 | std::variant getGraphExec( 195 | const MemoryResource & src, const MemoryResource & dst, 196 | const std::unique_ptr & exec_context, 197 | cudaStream_t stream 198 | ) noexcept { 199 | 200 | const auto set_error = [](const ErrorMessage & message) { 201 | return message; 202 | }; 203 | 204 | // flush deferred internal state update 205 | // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/developer-guide/index.html#cuda-graphs 206 | { 207 | auto result = enqueue(src, dst, exec_context, stream); 208 | if (result.has_value()) { 209 | return set_error(result.value()); 210 | } 211 | checkError(cudaStreamSynchronize(stream)); 212 | } 213 | 214 | checkError(cudaStreamBeginCapture(stream, cudaStreamCaptureModeRelaxed)); 215 | { 216 | auto result = enqueue(src, dst, exec_context, stream); 217 | if (result.has_value()) { 218 | return set_error(result.value()); 219 | } 220 | } 221 | cudaGraph_t graph; 222 | checkError(cudaStreamEndCapture(stream, &graph)); 223 | cudaGraphExec_t graphexec; 224 | checkError(cudaGraphInstantiate(&graphexec, graph, nullptr, nullptr, 0)); 225 | checkError(cudaGraphDestroy(graph)); 226 | 227 | return graphexec; 228 | } 229 | 230 | static inline 231 | size_t getSize( 232 | const nvinfer1::Dims & dim 233 | ) noexcept { 234 | 235 | size_t ret = 1; 236 | for (int i = 0; i < dim.nbDims; ++i) { 237 | ret *= dim.d[i]; 238 | } 239 | return ret; 240 | } 241 | 242 | static inline 243 | int getBytesPerSample(nvinfer1::DataType type) noexcept { 244 | switch (type) { 245 | case nvinfer1::DataType::kFLOAT: 246 | return 4; 247 | case nvinfer1::DataType::kHALF: 248 | return 2; 
249 | case nvinfer1::DataType::kINT8: 250 | return 1; 251 | case nvinfer1::DataType::kINT32: 252 | return 4; 253 | case nvinfer1::DataType::kBOOL: 254 | return 1; 255 | case nvinfer1::DataType::kUINT8: 256 | return 1; 257 | #if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 258 | case nvinfer1::DataType::kFP8: 259 | return 1; 260 | #endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861 261 | #if NV_TENSORRT_MAJOR >= 9 262 | case nvinfer1::DataType::kBF16: 263 | return 2; 264 | case nvinfer1::DataType::kINT64: 265 | return 8; 266 | #endif // NV_TENSORRT_MAJOR >= 9 267 | default: 268 | return 0; 269 | } 270 | } 271 | 272 | static inline 273 | std::variant getInstance( 274 | const std::unique_ptr & engine, 275 | const std::optional & profile_index, 276 | const TileSize & tile_size, 277 | bool use_cuda_graph, 278 | bool & is_dynamic 279 | ) noexcept { 280 | 281 | const auto set_error = [](const ErrorMessage & error_message) { 282 | return error_message; 283 | }; 284 | 285 | StreamResource stream {}; 286 | checkError(cudaStreamCreateWithFlags(&stream.data, cudaStreamNonBlocking)); 287 | 288 | auto exec_context = std::unique_ptr( 289 | #if NV_TENSORRT_MAJOR >= 10 290 | engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED) 291 | #else 292 | engine->createExecutionContext() 293 | #endif 294 | ); 295 | 296 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 297 | auto input_name = exec_context->getEngine().getIOTensorName(0); 298 | auto output_name = exec_context->getEngine().getIOTensorName(1); 299 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 300 | 301 | if (!exec_context->allInputDimensionsSpecified()) { 302 | if (!profile_index.has_value()) { 303 | return set_error("no valid optimization profile found"); 304 | } 305 | 306 | is_dynamic = true; 307 | 308 | exec_context->setOptimizationProfileAsync(profile_index.value(), stream); 309 | checkError(cudaStreamSynchronize(stream)); 310 | 311 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 312 | nvinfer1::Dims dims = exec_context->getTensorShape(input_name); 313 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 314 | nvinfer1::Dims dims = exec_context->getBindingDimensions(0); 315 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 316 | 317 | dims.d[0] = 1; 318 | 319 | if (std::holds_alternative(tile_size)) { 320 | dims.d[2] = std::get(tile_size).tile_h; 321 | dims.d[3] = std::get(tile_size).tile_w; 322 | } else { 323 | dims.d[2] = std::get(tile_size).height; 324 | dims.d[3] = std::get(tile_size).width; 325 | } 326 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 327 | exec_context->setInputShape(input_name, dims); 328 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 329 | exec_context->setBindingDimensions(0, dims); 330 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 331 | } else if (std::holds_alternative(tile_size)) { 332 | is_dynamic = false; 333 | 334 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 335 | nvinfer1::Dims dims = exec_context->getTensorShape(input_name); 336 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 337 | nvinfer1::Dims dims = exec_context->getBindingDimensions(0); 338 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 339 | 340 | if (std::holds_alternative(tile_size)) { 341 | if (dims.d[2] != std::get(tile_size).tile_h || 342 | dims.d[3] != std::get(tile_size).tile_w 343 | ) { 344 | return set_error("requested tile size not 
applicable"); 345 | } 346 | } else { 347 | if (dims.d[2] != std::get(tile_size).height || 348 | dims.d[3] != std::get(tile_size).width 349 | ) { 350 | return set_error("not supported video dimensions"); 351 | } 352 | } 353 | } 354 | 355 | MemoryResource src {}; 356 | { 357 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 358 | auto dim = exec_context->getTensorShape(input_name); 359 | auto type = engine->getTensorDataType(input_name); 360 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 361 | auto dim = exec_context->getBindingDimensions(0); 362 | auto type = engine->getBindingDataType(0); 363 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 364 | 365 | auto size = getSize(dim) * getBytesPerSample(type); 366 | 367 | Resource d_data {}; 368 | checkError(cudaMalloc(&d_data.data, size)); 369 | 370 | Resource h_data {}; 371 | checkError(cudaMallocHost(&h_data.data, size, cudaHostAllocWriteCombined)); 372 | 373 | src = MemoryResource{ 374 | .h_data = std::move(h_data), 375 | .d_data = std::move(d_data), 376 | .size=size 377 | }; 378 | } 379 | 380 | MemoryResource dst {}; 381 | { 382 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 383 | auto dim = exec_context->getTensorShape(output_name); 384 | auto type = engine->getTensorDataType(output_name); 385 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 386 | auto dim = exec_context->getBindingDimensions(1); 387 | auto type = engine->getBindingDataType(1); 388 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 389 | 390 | auto size = getSize(dim) * getBytesPerSample(type); 391 | 392 | Resource d_data {}; 393 | checkError(cudaMalloc(&d_data.data, size)); 394 | 395 | Resource h_data {}; 396 | checkError(cudaMallocHost(&h_data.data, size)); 397 | 398 | dst = MemoryResource{ 399 | .h_data = std::move(h_data), 400 | .d_data = std::move(d_data), 401 | .size=size 402 | }; 403 | } 404 | 405 | #if NV_TENSORRT_MAJOR >= 10 406 | size_t buffer_size { exec_context->updateDeviceMemorySizeForShapes() }; 407 | if (buffer_size == 0) { 408 | return set_error("failed to get internal activation buffer size"); 409 | } 410 | 411 | Resource d_context_allocation {}; 412 | checkError(cudaMalloc(&d_context_allocation.data, buffer_size)); 413 | 414 | #if NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001 415 | exec_context->setDeviceMemoryV2(d_context_allocation.data, static_cast(buffer_size)); 416 | #else // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001 417 | exec_context->setDeviceMemory(d_context_allocation.data); 418 | #endif // NV_TENSORRT_MAJOR * 100 + NV_TENSORRT_MINOR >= 1001 419 | #endif // NV_TENSORRT_MAJOR >= 10 420 | 421 | GraphExecResource graphexec {}; 422 | if (use_cuda_graph) { 423 | auto result = getGraphExec( 424 | src, dst, 425 | exec_context, stream 426 | ); 427 | if (std::holds_alternative(result)) { 428 | graphexec = std::move(std::get(result)); 429 | } else { 430 | return set_error(std::get(result)); 431 | } 432 | } 433 | 434 | return InferenceInstance{ 435 | .src = std::move(src), 436 | .dst = std::move(dst), 437 | .stream = std::move(stream), 438 | .exec_context = std::move(exec_context), 439 | .graphexec = std::move(graphexec), 440 | #if NV_TENSORRT_MAJOR >= 10 441 | .d_context_allocation = std::move(d_context_allocation) 442 | #endif 443 | }; 444 | } 445 | 446 | static inline 447 | std::optional checkEngine( 448 | const std::unique_ptr & engine, 449 | bool flexible_output 450 | ) noexcept { 451 | 452 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 453 | int num_bindings = 
engine->getNbIOTensors(); 454 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 455 | int num_bindings = engine->getNbBindings(); 456 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 457 | 458 | if (num_bindings != 2) { 459 | return "network binding count must be 2, got " + std::to_string(num_bindings); 460 | } 461 | 462 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 463 | auto input_name = engine->getIOTensorName(0); 464 | auto output_name = engine->getIOTensorName(1); 465 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 466 | 467 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 468 | if (engine->getTensorIOMode(input_name) != nvinfer1::TensorIOMode::kINPUT) { 469 | return "the first binding should be an input binding"; 470 | } 471 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 472 | if (!engine->bindingIsInput(0)) { 473 | return "the first binding should be an input binding"; 474 | } 475 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 476 | 477 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 478 | const nvinfer1::Dims & input_dims = engine->getTensorShape(input_name); 479 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 480 | const nvinfer1::Dims & input_dims = engine->getBindingDimensions(0); 481 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 482 | 483 | if (input_dims.nbDims != 4) { 484 | return "expects network with 4-D input"; 485 | } 486 | if (input_dims.d[0] != 1) { 487 | return "batch size of network input must be 1"; 488 | } 489 | 490 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 491 | if (engine->getTensorIOMode(output_name) != nvinfer1::TensorIOMode::kOUTPUT) { 492 | return "the second binding should be an output binding"; 493 | } 494 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 495 | if (engine->bindingIsInput(1)) { 496 | return "the second binding should be an output binding"; 497 | } 498 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 499 | 500 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 501 | const nvinfer1::Dims & output_dims = engine->getTensorShape(output_name); 502 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 503 | const nvinfer1::Dims & output_dims = engine->getBindingDimensions(1); 504 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 505 | 506 | if (output_dims.nbDims != 4) { 507 | return "expects network with 4-D output"; 508 | } 509 | if (output_dims.d[0] != 1) { 510 | return "batch size of network output must be 1"; 511 | } 512 | 513 | auto out_channels = output_dims.d[1]; 514 | if (out_channels != 1 && out_channels != 3 && !flexible_output) { 515 | return "output dimensions must be 1 or 3, or enable \"flexible_output\""; 516 | } 517 | 518 | auto in_height = input_dims.d[2]; 519 | auto in_width = input_dims.d[3]; 520 | auto out_height = output_dims.d[2]; 521 | auto out_width = output_dims.d[3]; 522 | if (out_height % in_height != 0 || out_width % in_width != 0) { 523 | return "output dimensions must be divisible by input dimensions"; 524 | } 525 | 526 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 527 | for (const auto & name : { input_name, output_name }) { 528 | if (engine->getTensorLocation(name) != nvinfer1::TensorLocation::kDEVICE) { 529 | return "network binding " + std::string{ name } + " should reside on device"; 530 | } 531 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 532 | for (int i = 0; i < 2; i++) { 533 | if (engine->getLocation(i) != 
nvinfer1::TensorLocation::kDEVICE) {
534 |             return "network binding " + std::to_string(i) + " should reside on device";
535 |         }
536 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
537 | 
538 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
539 |         if (engine->getTensorFormat(name) != nvinfer1::TensorFormat::kLINEAR) {
540 |             return "expects network IO with layout NCHW (row major linear)";
541 |         }
542 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
543 |         if (engine->getBindingFormat(i) != nvinfer1::TensorFormat::kLINEAR) {
544 |             return "expects network IO with layout NCHW (row major linear)";
545 |         }
546 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
547 |     }
548 | 
549 |     return {};
550 | }
551 | 
552 | static inline
553 | std::variant<ErrorMessage, std::unique_ptr<nvinfer1::ICudaEngine>> initEngine(
554 |     const char * engine_data, size_t engine_nbytes,
555 |     const std::unique_ptr<nvinfer1::IRuntime> & runtime,
556 |     bool flexible_output
557 | ) noexcept {
558 | 
559 |     const auto set_error = [](const ErrorMessage & error_message) {
560 |         return error_message;
561 |     };
562 | 
563 |     std::unique_ptr<nvinfer1::ICudaEngine> engine {
564 |         runtime->deserializeCudaEngine(engine_data, engine_nbytes)
565 |     };
566 | 
567 |     if (!engine) {
568 |         return set_error("engine deserialization failed");
569 |     }
570 | 
571 |     if (auto err = checkEngine(engine, flexible_output); err.has_value()) {
572 |         return set_error(err.value());
573 |     }
574 | 
575 |     return engine;
576 | }
577 | 
578 | // 0: integer, 1: float, -1: unknown
579 | static inline
580 | int getSampleType(nvinfer1::DataType type) noexcept {
581 |     switch (type) {
582 |         case nvinfer1::DataType::kFLOAT:
583 |         case nvinfer1::DataType::kHALF:
584 | #if (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
585 |         case nvinfer1::DataType::kFP8:
586 | #endif // (NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR) * 10 + NV_TENSORRT_PATCH >= 861
587 | #if NV_TENSORRT_MAJOR >= 9
588 |         case nvinfer1::DataType::kBF16:
589 | #endif // NV_TENSORRT_MAJOR >= 9
590 |             return 1;
591 |         case nvinfer1::DataType::kINT8:
592 |         case nvinfer1::DataType::kINT32:
593 |         case nvinfer1::DataType::kBOOL:
594 |         case nvinfer1::DataType::kUINT8:
595 | #if NV_TENSORRT_MAJOR >= 9
596 |         case nvinfer1::DataType::kINT64:
597 | #endif // NV_TENSORRT_MAJOR >= 9
598 |             return 0;
599 |         default:
600 |             return -1;
601 |     }
602 | }
603 | 
604 | #endif // VSTRT_TRT_UTILS_H_
605 | 
-------------------------------------------------------------------------------- /vstrt/trtexec/CMakeLists.txt: --------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20)
2 | 
3 | project(trtexec LANGUAGES CXX)
4 | 
5 | find_package(CUDAToolkit REQUIRED)
6 | 
7 | add_executable(trtexec
8 |     $<$<PLATFORM_ID:Windows>:longpath.manifest>
9 |     trtexec.cpp
10 |     logfile.cpp
11 |     ../common/bfloat16.cpp
12 |     ../common/logger.cpp
13 |     ../common/sampleDevice.cpp
14 |     ../common/sampleEngines.cpp
15 |     ../common/sampleInference.cpp
16 |     ../common/sampleOptions.cpp
17 |     ../common/sampleReporting.cpp
18 |     ../common/sampleUtils.cpp
19 |     ../utils/fileLock.cpp
20 |     ../utils/timingCache.cpp
21 | )
22 | 
23 | target_include_directories(trtexec PRIVATE
24 |     ../common
25 |     ..
26 |     ../../include
27 | )
28 | 
29 | target_link_libraries(trtexec PRIVATE CUDA::cudart_static)
30 | 
31 | install(TARGETS trtexec RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
32 | 
-------------------------------------------------------------------------------- /vstrt/trtexec/logfile.cpp: --------------------------------------------------------------------------------
1 | // When $TRTEXEC_LOG_FILE is set, redirect stdout and stderr to the specified
2 | // file as well.
3 | #include <cstdio>
4 | #include <cstdlib>
5 | #include <fstream>
6 | #include <iostream>
7 | 
8 | namespace {
9 | static struct redirect {
10 |     class teebuf: public std::streambuf {
11 |     public:
12 |         teebuf(std::streambuf *a, std::streambuf *b): s1(a), s2(b) {}
13 |     private:
14 |         std::streambuf *s1, *s2;
15 | 
16 |         virtual int overflow(int c) override {
17 |             if (c == EOF)
18 |                 return EOF;
19 |             else {
20 |                 int r1 = s1->sputc(c);
21 |                 int r2 = s2->sputc(c);
22 |                 return (r1 == EOF || r2 == EOF) ? EOF : c;
23 |             }
24 |         }
25 | 
26 |         virtual int sync() override {
27 |             int r1 = s1->pubsync();
28 |             int r2 = s2->pubsync();
29 |             return (r1 == 0 && r2 == 0) ? 0 : -1;
30 |         }
31 |     };
32 |     redirect() {
33 |         const char *fn = getenv("TRTEXEC_LOG_FILE");
34 |         if (fn) {
35 |             static std::ofstream ofs(fn, std::ios::app);
36 |             static teebuf out(ofs.rdbuf(), std::cout.rdbuf());
37 |             static teebuf err(ofs.rdbuf(), std::cerr.rdbuf());
38 |             std::cout.rdbuf(&out);
39 |             std::cerr.rdbuf(&err);
40 |         }
41 |     }
42 | } _;
43 | } // namespace
44 | 
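The redirect machinery in logfile.cpp works because every iostream write is funneled through a std::streambuf, so swapping the stream's buffer with rdbuf() reroutes all subsequent output. A minimal standalone sketch of the same tee pattern (all names below are illustrative, not part of the repository):

    #include <cstdio>
    #include <fstream>
    #include <iostream>
    #include <streambuf>

    // Duplicates everything written through this buffer into two sinks.
    class TeeBuf : public std::streambuf {
    public:
        TeeBuf(std::streambuf *a, std::streambuf *b) : s1_(a), s2_(b) {}
    private:
        std::streambuf *s1_, *s2_;
        int overflow(int c) override {
            if (c == EOF) return EOF;
            // Forward the character to both sinks; report EOF if either fails.
            return (s1_->sputc(c) == EOF || s2_->sputc(c) == EOF) ? EOF : c;
        }
        int sync() override {
            return (s1_->pubsync() == 0 && s2_->pubsync() == 0) ? 0 : -1;
        }
    };

    int main() {
        std::ofstream log("tee.log");
        TeeBuf tee(std::cout.rdbuf(), log.rdbuf());
        auto *old = std::cout.rdbuf(&tee);   // redirect
        std::cout << "goes to the console and to tee.log\n";
        std::cout.rdbuf(old);                // restore before `log` is destroyed
    }

Note that logfile.cpp sidesteps the restore step by keeping the ofstream and both teebufs as function-local statics, so they outlive every later writer for the lifetime of the process.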
-------------------------------------------------------------------------------- /vstrt/trtexec/trtexec.patch: --------------------------------------------------------------------------------
1 | diff --git a/samples/utils/fileLock.cpp b/samples/utils/fileLock.cpp
2 | index e155c0b..de6bce2 100644
3 | --- a/samples/utils/fileLock.cpp
4 | +++ b/samples/utils/fileLock.cpp
5 | @@ -35,8 +35,11 @@ FileLock::FileLock(ILogger& logger, std::string const& fileName)
6 |          ss << "Trying to set exclusive file lock " << lockFileName << std::endl;
7 |          mLogger.log(ILogger::Severity::kVERBOSE, ss.str().c_str());
8 |      }
9 | +    int size = MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, nullptr, 0);
10 | +    std::wstring lockFileNameW (size, L'\0');
11 | +    MultiByteToWideChar(CP_UTF8, 0, lockFileName.c_str(), -1, &lockFileNameW[0], size);
12 |      // MS docs said this is a blocking IO if "FILE_FLAG_OVERLAPPED" is not provided
13 | -    mHandle = CreateFileA(lockFileName.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, 0, NULL);
14 | +    mHandle = CreateFileW(lockFileNameW.c_str(), GENERIC_WRITE, 0, NULL, OPEN_ALWAYS, FILE_FLAG_DELETE_ON_CLOSE | FILE_ATTRIBUTE_TEMPORARY, NULL);
15 |      if (mHandle == INVALID_HANDLE_VALUE)
16 |      {
17 |          throw std::runtime_error("Failed to lock " + lockFileName + "!");
18 | 
-------------------------------------------------------------------------------- /vstrt/utils.h: --------------------------------------------------------------------------------
1 | #ifndef VSTRT_UTILS_H_
2 | #define VSTRT_UTILS_H_
3 | 
4 | #include <array>
5 | #include <memory>
6 | #include <optional>
7 | #include <string>
8 | #include <type_traits>
9 | #include <vector>
10 | 
11 | #include <cuda_runtime.h>
12 | 
13 | #include <NvInferRuntime.h>
14 | #include <VSHelper.h>
15 | 
16 | static inline
17 | void setDimensions(
18 |     std::unique_ptr<VSVideoInfo> & vi,
19 |     const std::unique_ptr<nvinfer1::IExecutionContext> & exec_context,
20 |     VSCore * core,
21 |     const VSAPI * vsapi,
22 |     int sample_type,
23 |     int bits_per_sample,
24 |     bool flexible_output
25 | ) noexcept {
26 | 
27 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
28 |     auto input_name = exec_context->getEngine().getIOTensorName(0);
29 |     auto output_name = exec_context->getEngine().getIOTensorName(1);
30 |     const nvinfer1::Dims & in_dims = exec_context->getTensorShape(input_name);
31 |     const nvinfer1::Dims & out_dims = exec_context->getTensorShape(output_name);
32 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
33 |     const nvinfer1::Dims & in_dims = exec_context->getBindingDimensions(0);
34 |     const nvinfer1::Dims & out_dims = exec_context->getBindingDimensions(1);
35 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
36 | 
37 |     auto in_height = static_cast<int>(in_dims.d[2]);
38 |     auto in_width = static_cast<int>(in_dims.d[3]);
39 | 
40 |     auto out_height = static_cast<int>(out_dims.d[2]);
41 |     auto out_width = static_cast<int>(out_dims.d[3]);
42 | 
43 |     vi->height *= out_height / in_height;
44 |     vi->width *= out_width / in_width;
45 | 
46 |     if (out_dims.d[1] == 1 || flexible_output) {
47 |         vi->format = vsapi->registerFormat(cmGray, sample_type, bits_per_sample, 0, 0, core);
48 |     } else if (out_dims.d[1] == 3) {
49 |         vi->format = vsapi->registerFormat(cmRGB, sample_type, bits_per_sample, 0, 0, core);
50 |     }
51 | }
52 | 
53 | static inline
54 | std::vector<const VSVideoInfo *> getVideoInfo(
55 |     const VSAPI * vsapi,
56 |     const std::vector<VSNodeRef *> & nodes
57 | ) noexcept {
58 | 
59 |     std::vector<const VSVideoInfo *> vis;
60 |     vis.reserve(std::size(nodes));
61 | 
62 |     for (const auto & node : nodes) {
63 |         vis.emplace_back(vsapi->getVideoInfo(node));
64 |     }
65 | 
66 |     return vis;
67 | }
68 | 
69 | static inline
70 | std::vector<const VSFrameRef *> getFrames(
71 |     int n,
72 |     const VSAPI * vsapi,
73 |     VSFrameContext * frameCtx,
74 |     const std::vector<VSNodeRef *> & nodes
75 | ) noexcept {
76 | 
77 |     std::vector<const VSFrameRef *> frames;
78 |     frames.reserve(std::size(nodes));
79 | 
80 |     for (const auto & node : nodes) {
81 |         frames.emplace_back(vsapi->getFrameFilter(n, node, frameCtx));
82 |     }
83 | 
84 |     return frames;
85 | }
86 | 
87 | static inline
88 | std::optional<std::string> checkNodes(
89 |     const std::vector<const VSVideoInfo *> & vis
90 | ) noexcept {
91 | 
92 |     for (const auto & vi : vis) {
93 |         if (!isConstantFormat(vi)) {
94 |             return "video format must be constant";
95 |         }
96 | 
97 |         if (vi->width != vis[0]->width || vi->height != vis[0]->height) {
98 |             return "dimensions of clips mismatch";
99 |         }
100 | 
101 |         if (vi->numFrames != vis[0]->numFrames) {
102 |             return "number of frames mismatch";
103 |         }
104 | 
105 |         if (vi->format->subSamplingH != 0 || vi->format->subSamplingW != 0) {
106 |             return "clip must not be sub-sampled";
107 |         }
108 |     }
109 | 
110 |     return {};
111 | }
112 | 
113 | static inline
114 | std::optional<std::string> checkNodes(
115 |     const std::vector<const VSVideoInfo *> & vis,
116 |     int sample_type,
117 |     int bits_per_sample
118 | ) noexcept {
119 | 
120 |     for (const auto & vi : vis) {
121 |         if (vi->format->sampleType != sample_type) {
122 |             return "sample type mismatch";
123 |         }
124 | 
125 |         if (vi->format->bitsPerSample != bits_per_sample) {
126 |             return "bits per sample mismatch";
127 |         }
128 |     }
129 | 
130 |     return {};
131 | }
132 | 
133 | static inline
134 | int numPlanes(
135 |     const std::vector<const VSVideoInfo *> & vis
136 | ) noexcept {
137 | 
138 |     int num_planes = 0;
139 | 
140 |     for (const auto & vi : vis) {
141 |         num_planes += vi->format->numPlanes;
142 |     }
143 | 
144 |     return num_planes;
145 | }
146 | 
147 | static inline
148 | std::optional<std::string> checkNodesAndContext(
149 |     const std::unique_ptr<nvinfer1::IExecutionContext> & execution_context,
150 |     const std::vector<const VSVideoInfo *> & vis
151 | ) noexcept {
152 | 
153 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
154 |     auto input_name = execution_context->getEngine().getIOTensorName(0);
155 |     const nvinfer1::Dims & network_in_dims = execution_context->getTensorShape(input_name);
156 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
157 |     const nvinfer1::Dims & network_in_dims = execution_context->getBindingDimensions(0);
158 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
159 | 
160 |     auto network_in_channels = network_in_dims.d[1];
161 |     int num_planes = numPlanes(vis);
162 |     if (network_in_channels != num_planes) {
163 |         return "expects " + std::to_string(network_in_channels) + " input planes";
164 |     }
165 | 
166 |     auto network_in_height = network_in_dims.d[2];
167 |     auto network_in_width = network_in_dims.d[3];
168 |     int clip_in_height = vis[0]->height;
169 |     int clip_in_width = vis[0]->width;
170 | 
171 |     if (network_in_height > clip_in_height || network_in_width > clip_in_width) {
172 |         return "tile size larger than clip dimension";
173 |     }
174 | 
175 |     return {};
176 | }
177 | 
178 | static inline void VS_CC getDeviceProp(
179 |     const VSMap *in, VSMap *out, void *userData,
180 |     VSCore *core, const VSAPI *vsapi
181 | ) {
182 | 
183 |     int err;
184 |     int device_id = static_cast<int>(vsapi->propGetInt(in, "device_id", 0, &err));
185 |     if (err) {
186 |         device_id = 0;
187 |     }
188 | 
189 |     cudaDeviceProp prop;
190 |     if (auto error = cudaGetDeviceProperties(&prop, device_id); error != cudaSuccess) {
191 |         vsapi->setError(out, cudaGetErrorString(error));
192 |         return;
193 |     }
194 | 
195 |     auto setProp = [&](const char * name, auto value, int data_length = -1) {
196 |         using T = std::decay_t<decltype(value)>;
197 |         if constexpr (std::is_same_v<T, int>) {
198 |             vsapi->propSetInt(out, name, value, paReplace);
199 |         } else if constexpr (std::is_same_v<T, size_t>) {
200 |             vsapi->propSetInt(out, name, static_cast<int64_t>(value), paReplace);
201 |         } else if constexpr (std::is_same_v<T, char *>) {
202 |             vsapi->propSetData(out, name, value, data_length, paReplace);
203 |         }
204 |     };
205 | 
206 |     int driver_version;
207 |     cudaDriverGetVersion(&driver_version);
208 |     setProp("driver_version", driver_version);
209 | 
210 |     setProp("name", prop.name);
211 |     {
212 |         std::array<int64_t, 16> uuid;
213 |         for (int i = 0; i < 16; ++i) {
214 |             uuid[i] = prop.uuid.bytes[i];
215 |         }
216 |         vsapi->propSetIntArray(out, "uuid", std::data(uuid), static_cast<int>(std::size(uuid)));
217 |     }
218 |     setProp("total_global_memory", prop.totalGlobalMem);
219 |     setProp("shared_memory_per_block", prop.sharedMemPerBlock);
220 |     setProp("regs_per_block", prop.regsPerBlock);
221 |     setProp("warp_size", prop.warpSize);
222 |     setProp("mem_pitch", prop.memPitch);
223 |     setProp("max_threads_per_block", prop.maxThreadsPerBlock);
224 |     setProp("clock_rate", prop.clockRate);
225 |     setProp("total_const_mem", prop.totalConstMem);
226 |     setProp("major", prop.major);
227 |     setProp("minor", prop.minor);
228 |     setProp("texture_alignment", prop.textureAlignment);
229 |     setProp("texture_pitch_alignment", prop.texturePitchAlignment);
230 |     setProp("device_overlap", prop.deviceOverlap);
231 |     setProp("multi_processor_count", prop.multiProcessorCount);
232 |     setProp("kernel_exec_timeout_enabled", prop.kernelExecTimeoutEnabled);
233 |     setProp("integrated", prop.integrated);
234 |     setProp("can_map_host_memory", prop.canMapHostMemory);
235 |     setProp("compute_mode", prop.computeMode);
236 |     setProp("concurrent_kernels", prop.concurrentKernels);
237 |     setProp("ecc_enabled", prop.ECCEnabled);
238 |     setProp("pci_bus_id", prop.pciBusID);
239 |     setProp("pci_device_id", prop.pciDeviceID);
240 |     setProp("pci_domain_id", prop.pciDomainID);
241 |     setProp("tcc_driver", prop.tccDriver);
242 |     setProp("async_engine_count", prop.asyncEngineCount);
243 |     setProp("unified_addressing", prop.unifiedAddressing);
244 |     setProp("memory_clock_rate", prop.memoryClockRate);
245 |     setProp("memory_bus_width", prop.memoryBusWidth);
246 |     setProp("l2_cache_size", prop.l2CacheSize);
247 |     setProp("persisting_l2_cache_max_size", prop.persistingL2CacheMaxSize);
248 |     setProp("max_threads_per_multiprocessor", prop.maxThreadsPerMultiProcessor);
249 |     setProp("stream_priorities_supported", prop.streamPrioritiesSupported);
250 |     setProp("global_l1_cache_supported", prop.globalL1CacheSupported);
251 |     setProp("local_l1_cache_supported", prop.localL1CacheSupported);
252 |     setProp("shared_mem_per_multiprocessor", prop.sharedMemPerMultiprocessor);
253 |     setProp("regs_per_multiprocessor", prop.regsPerMultiprocessor);
254 |     setProp("managed_memory", prop.managedMemory);
255 |     setProp("is_multi_gpu_board", prop.isMultiGpuBoard);
256 |     setProp("multi_gpu_board_group_id", prop.multiGpuBoardGroupID);
257 |     setProp("host_native_atomic_supported", prop.hostNativeAtomicSupported);
258 |     setProp("single_to_double_precision_perf_ratio", prop.singleToDoublePrecisionPerfRatio);
259 |     setProp("pageable_memory_access", prop.pageableMemoryAccess);
260 |     setProp("conccurrent_managed_access", prop.concurrentManagedAccess);
261 |     setProp("compute_preemption_supported", prop.computePreemptionSupported);
262 |     setProp(
263 |         "can_use_host_pointer_for_registered_mem",
264 |         prop.canUseHostPointerForRegisteredMem
265 |     );
266 |     setProp("cooperative_launch", prop.cooperativeLaunch);
267 |     setProp("cooperative_multi_device_launch", prop.cooperativeMultiDeviceLaunch);
268 |     setProp("shared_mem_per_block_optin", prop.sharedMemPerBlockOptin);
269 |     setProp(
270 |         "pageable_memory_access_uses_host_page_tables",
271 |         prop.pageableMemoryAccessUsesHostPageTables
272 |     );
273 |     setProp("direct_managed_mem_access_from_host", prop.directManagedMemAccessFromHost);
274 |     setProp("max_blocks_per_multi_processor", prop.maxBlocksPerMultiProcessor);
275 |     setProp("access_policy_max_window_size", prop.accessPolicyMaxWindowSize);
276 |     setProp("reserved_shared_mem_per_block", prop.reservedSharedMemPerBlock);
277 | }
278 | 
279 | #endif // VSTRT_UTILS_H_
280 | 
-------------------------------------------------------------------------------- /vstrt/vs_tensorrt.cpp: --------------------------------------------------------------------------------
1 | #include <atomic>
2 | #include <cstdint>
3 | #include <cstdio>
4 | #include <cstdlib>
5 | #include <fstream>
6 | #include <ios>
7 | #include <iterator>
8 | #include <memory>
9 | #include <mutex>
10 | #include <string>
11 | #include <variant>
12 | #include <vector>
13 | 
14 | #include <VapourSynth.h>
15 | #include <VSHelper.h>
16 | 
17 | #include <cuda_runtime.h>
18 | #include <NvInferRuntime.h>
19 | #ifdef USE_NVINFER_PLUGIN
20 | #include <NvInferPlugin.h>
21 | #endif
22 | 
23 | #include "config.h"
24 | #include "inference_helper.h"
25 | #include "trt_utils.h"
26 | #include "utils.h"
27 | 
28 | #ifdef _WIN32
29 | #define WIN32_LEAN_AND_MEAN
30 | #include <windows.h>
31 | 
32 | static std::wstring translateName(const char *name) {
33 |     auto size = MultiByteToWideChar(CP_UTF8, 0, name, -1, nullptr, 0);
34 |     std::wstring ret(static_cast<size_t>(size), {});
35 |     MultiByteToWideChar(CP_UTF8, 0, name, -1, ret.data(), size);
36 |     return ret;
37 | }
38 | #else
39 | #define translateName(n) (n)
40 | #endif
41 | 
42 | using namespace std::string_literals;
43 | 
44 | static const VSPlugin * myself = nullptr;
45 | 
46 | struct TicketSemaphore {
47 |     std::atomic<intptr_t> ticket {};
48 |     std::atomic<intptr_t> current {};
49 | 
50 |     void init(intptr_t num) noexcept {
51 |         current.store(num, std::memory_order::seq_cst);
52 |     }
53 | 
54 |     void acquire() noexcept {
55 |         intptr_t tk { ticket.fetch_add(1, std::memory_order::acquire) };
56 |         while (true) {
57 |             intptr_t curr { current.load(std::memory_order::acquire) };
58 |             if (tk < curr) {
59 |                 return;
60 |             }
61 |             current.wait(curr, std::memory_order::relaxed);
62 |         }
63 |     }
64 | 
65 |     void release() noexcept {
66 |         current.fetch_add(1, std::memory_order::release);
67 |         current.notify_all();
68 |     }
69 | };
70 | 
71 | struct vsTrtData {
72 |     std::vector<VSNodeRef *> nodes;
73 |     std::unique_ptr<VSVideoInfo> out_vi;
74 | 
75 |     int device_id;
76 |     int num_streams;
77 |     bool use_cuda_graph;
78 |     int overlap_w, overlap_h;
79 | 
80 |     Logger logger;
81 |     std::unique_ptr<nvinfer1::IRuntime> runtime;
82 |     std::vector<std::unique_ptr<nvinfer1::ICudaEngine>> engines;
83 | 
84 |     TicketSemaphore semaphore;
85 |     std::vector<int> tickets;
86 |     std::mutex instances_lock;
87 |     std::vector<InferenceInstance> instances;
88 | 
89 |     std::string flexible_output_prop;
90 | 
91 |     [[nodiscard]]
92 |     int acquire() noexcept {
93 |         semaphore.acquire();
94 |         int ticket;
95 |         {
96 |             std::lock_guard lock { instances_lock };
97 |             ticket = tickets.back();
98 |             tickets.pop_back();
99 |         }
100 |         return ticket;
101 |     }
102 | 
103 |     void release(int ticket) noexcept {
104 |         {
105 |             std::lock_guard lock { instances_lock };
106 |             tickets.push_back(ticket);
107 |         }
108 |         semaphore.release();
109 |     }
110 | };
111 | 
112 | static void VS_CC vsTrtInit(
113 |     VSMap *in,
114 |     VSMap *out,
115 |     void **instanceData,
116 |     VSNode *node,
117 |     VSCore *core,
118 |     const VSAPI *vsapi
119 | ) noexcept {
120 | 
121 |     auto d = static_cast<vsTrtData *>(*instanceData);
122 |     vsapi->setVideoInfo(d->out_vi.get(), 1, node);
123 | }
124 | 
125 | static const VSFrameRef *VS_CC vsTrtGetFrame(
126 |     int n,
127 |     int activationReason,
128 |     void **instanceData,
129 |     void **frameData,
130 |     VSFrameContext *frameCtx,
131 |     VSCore *core,
132 |     const VSAPI *vsapi
133 | ) noexcept {
134 | 
135 |     auto d = static_cast<vsTrtData *>(*instanceData);
136 | 
137 |     if (activationReason == arInitial) {
138 |         for (const auto & node : d->nodes) {
139 |             vsapi->requestFrameFilter(n, node, frameCtx);
140 |         }
141 |     } else if (activationReason == arAllFramesReady) {
142 |         const std::vector<const VSVideoInfo *> in_vis {
143 |             getVideoInfo(vsapi, d->nodes)
144 |         };
145 | 
146 |         const std::vector<const VSFrameRef *> src_frames {
147 |             getFrames(n, vsapi, frameCtx, d->nodes)
148 |         };
149 | 
150 |         const int ticket { d->acquire() };
151 |         InferenceInstance & instance { d->instances[ticket] };
152 | 
153 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
154 |         auto input_name = d->engines[0]->getIOTensorName(0);
155 |         const nvinfer1::Dims src_dim { instance.exec_context->getTensorShape(input_name) };
156 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
157 |         const nvinfer1::Dims src_dim { instance.exec_context->getBindingDimensions(0) };
158 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
159 | 
160 |         const int src_planes { static_cast<int>(src_dim.d[1]) };
161 |         const int src_tile_h { static_cast<int>(src_dim.d[2]) };
162 |         const int src_tile_w { static_cast<int>(src_dim.d[3]) };
163 | 
164 |         std::vector<const uint8_t *> src_ptrs;
165 |         src_ptrs.reserve(src_planes);
166 |         for (int i = 0; i < std::ssize(d->nodes); ++i) {
167 |             for (int j = 0; j < in_vis[i]->format->numPlanes; ++j) {
168 |                 src_ptrs.emplace_back(vsapi->getReadPtr(src_frames[i], j));
169 |             }
170 |         }
171 | 
172 |         VSFrameRef * const dst_frame { vsapi->newVideoFrame(
173 |             d->out_vi->format, d->out_vi->width, d->out_vi->height,
174 |             src_frames[0], core
175 |         )};
176 | 
177 |         std::vector<VSFrameRef *> dst_frames;
178 | 
179 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
180 |         auto output_name = d->engines[0]->getIOTensorName(1);
181 |         const nvinfer1::Dims dst_dim { instance.exec_context->getTensorShape(output_name) };
182 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
183 |         const nvinfer1::Dims dst_dim { instance.exec_context->getBindingDimensions(1) };
184 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85
185 | 
186 |         const int dst_planes { static_cast<int>(dst_dim.d[1]) };
187 |         const int dst_tile_h { static_cast<int>(dst_dim.d[2]) };
188 |         const int dst_tile_w { static_cast<int>(dst_dim.d[3]) };
189 | 
190 |         std::vector<uint8_t *> dst_ptrs;
191 |         dst_ptrs.reserve(dst_planes);
192 |         if (d->flexible_output_prop.empty()) {
193 |             for (int i = 0; i < dst_planes; ++i) {
194 |                 dst_ptrs.emplace_back(vsapi->getWritePtr(dst_frame, i));
195 |             }
196 |         } else {
197 |             for (int i = 0; i < dst_planes; ++i) {
198 |                 auto frame { vsapi->newVideoFrame(
199 |                     d->out_vi->format, d->out_vi->width, d->out_vi->height,
200 |                     src_frames[0], core
201 |                 )};
202 |                 dst_frames.emplace_back(frame);
203 |                 dst_ptrs.emplace_back(vsapi->getWritePtr(frame, 0));
204 |             }
205 |         }
206 | 
207 |         const int h_scale = dst_tile_h / src_tile_h;
208 |         const int w_scale = dst_tile_w / src_tile_w;
209 | 
210 |         const IOInfo info {
211 |             .in = InputInfo {
212 |                 .width = vsapi->getFrameWidth(src_frames[0], 0),
213 |                 .height = vsapi->getFrameHeight(src_frames[0], 0),
214 |                 .pitch = vsapi->getStride(src_frames[0], 0),
215 |                 .bytes_per_sample = vsapi->getFrameFormat(src_frames[0])->bytesPerSample,
216 |                 .tile_w = src_tile_w,
217 |                 .tile_h = src_tile_h
218 |             },
219 |             .out = OutputInfo {
220 |                 .pitch = vsapi->getStride(dst_frame, 0),
221 |                 .bytes_per_sample = vsapi->getFrameFormat(dst_frame)->bytesPerSample
222 |             },
223 |             .w_scale = w_scale,
224 |             .h_scale = h_scale,
225 |             .overlap_w = d->overlap_w,
226 |             .overlap_h = d->overlap_h
227 |         };
228 | 
229 |         const auto inference_result = inference(
230 |             instance,
231 |             d->device_id, d->use_cuda_graph,
232 |             info, src_ptrs, dst_ptrs
233 |         );
234 | 
235 |         d->release(ticket);
236 | 
237 |         for (const auto & frame : src_frames) {
238 |             vsapi->freeFrame(frame);
239 |         }
240 | 
241 |         if (inference_result.has_value()) {
242 |             vsapi->setFilterError(
243 |                 (__func__ + ": "s + inference_result.value()).c_str(),
244 |                 frameCtx
245 |             );
246 | 
247 |             for (const auto & frame : dst_frames) {
248 |                 vsapi->freeFrame(frame);
249 |             }
250 | 
251 |             vsapi->freeFrame(dst_frame);
252 | 
253 |             return nullptr;
254 |         }
255 | 
256 |         if (!d->flexible_output_prop.empty()) {
257 |             auto prop = vsapi->getFramePropsRW(dst_frame);
258 | 
259 |             for (int i = 0; i < dst_planes; i++) {
260 |                 auto key { d->flexible_output_prop + std::to_string(i) };
261 |                 vsapi->propSetFrame(prop, key.c_str(), dst_frames[i], paReplace);
262 |                 vsapi->freeFrame(dst_frames[i]);
263 |             }
264 |         }
265 | 
266 |         return dst_frame;
267 |     }
268 | 
269 |     return nullptr;
270 | }
271 | 
272 | static void VS_CC vsTrtFree(
273 |     void *instanceData, VSCore *core, const VSAPI *vsapi
274 | ) noexcept {
275 | 
276 |     auto d = static_cast<vsTrtData *>(instanceData);
277 | 
278 |     for (const auto & node : d->nodes) {
279 |         vsapi->freeNode(node);
280 |     }
281 | 
282 |     cudaSetDevice(d->device_id);
283 | 
284 |     delete d;
285 | }
286 | 
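// How the pieces above fit together: vsTrtGetFrame calls d->acquire() to
// obtain the index of a free InferenceInstance, runs inference on it, and
// hands the index back with d->release(). The TicketSemaphore caps the number
// of in-flight inferences at num_streams, while instances_lock only guards
// the brief push/pop on the `tickets` vector. A hedged usage sketch
// (illustrative only, not part of the plugin):
//
//     const int ticket = d->acquire();   // may block until a slot frees up
//     InferenceInstance & inst = d->instances[ticket];
//     /* ... upload tiles, enqueue, download results on inst ... */
//     d->release(ticket);                // notify_all() wakes a blocked waiter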
": "s + error_message).c_str()); 302 | for (const auto & node : d->nodes) { 303 | vsapi->freeNode(node); 304 | } 305 | }; 306 | 307 | const char * engine_path = vsapi->propGetData(in, "engine_path", 0, nullptr); 308 | 309 | std::vector in_vis; 310 | in_vis.reserve(std::size(d->nodes)); 311 | for (const auto & node : d->nodes) { 312 | in_vis.emplace_back(vsapi->getVideoInfo(node)); 313 | } 314 | if (auto err = checkNodes(in_vis); err.has_value()) { 315 | return set_error(err.value()); 316 | } 317 | 318 | int error1, error2; 319 | d->overlap_w = int64ToIntS(vsapi->propGetInt(in, "overlap", 0, &error1)); 320 | d->overlap_h = int64ToIntS(vsapi->propGetInt(in, "overlap", 1, &error2)); 321 | if (!error1) { 322 | if (error2) { 323 | d->overlap_h = d->overlap_w; 324 | } 325 | 326 | if (d->overlap_w < 0 || d->overlap_h < 0) { 327 | return set_error("\"overlap\" must be non-negative"); 328 | } 329 | } else { 330 | d->overlap_w = 0; 331 | d->overlap_h = 0; 332 | } 333 | 334 | int tile_w = int64ToIntS(vsapi->propGetInt(in, "tilesize", 0, &error1)); 335 | int tile_h = int64ToIntS(vsapi->propGetInt(in, "tilesize", 1, &error2)); 336 | 337 | TileSize tile_size; 338 | if (!error1) { // manual specification triggered 339 | if (error2) { 340 | tile_h = tile_w; 341 | } 342 | 343 | if (tile_w - 2 * d->overlap_w <= 0 || tile_h - 2 * d->overlap_h <= 0) { 344 | return set_error("\"overlap\" too large"); 345 | } 346 | 347 | tile_size = RequestedTileSize { 348 | .tile_w = tile_w, 349 | .tile_h = tile_h 350 | }; 351 | } else { 352 | if (d->overlap_w != 0 || d->overlap_h != 0) { 353 | return set_error("\"tilesize\" must be specified"); 354 | } 355 | 356 | int width = in_vis[0]->width; 357 | int height = in_vis[0]->height; 358 | 359 | if (width - 2 * d->overlap_w <= 0 || height - 2 * d->overlap_h <= 0) { 360 | return set_error("\"overlap\" too large"); 361 | } 362 | 363 | tile_size = VideoSize { 364 | .width = width, 365 | .height = height 366 | }; 367 | } 368 | 369 | int error; 370 | 371 | int device_id = int64ToIntS(vsapi->propGetInt(in, "device_id", 0, &error)); 372 | if (error) { 373 | device_id = 0; 374 | } 375 | 376 | int device_count; 377 | checkError(cudaGetDeviceCount(&device_count)); 378 | if (0 <= device_id && device_id < device_count) { 379 | checkError(cudaSetDevice(device_id)); 380 | } else { 381 | return set_error("invalid device ID (" + std::to_string(device_id) + ")"); 382 | } 383 | d->device_id = device_id; 384 | 385 | d->use_cuda_graph = !!vsapi->propGetInt(in, "use_cuda_graph", 0, &error); 386 | if (error) { 387 | d->use_cuda_graph = false; 388 | } 389 | 390 | d->num_streams = int64ToIntS(vsapi->propGetInt(in, "num_streams", 0, &error)); 391 | if (error) { 392 | d->num_streams = 1; 393 | } 394 | 395 | int verbosity = int64ToIntS(vsapi->propGetInt(in, "verbosity", 0, &error)); 396 | if (error) { 397 | verbosity = int(nvinfer1::ILogger::Severity::kWARNING); 398 | } 399 | d->logger.set_verbosity(static_cast(verbosity)); 400 | 401 | auto flexible_output_prop = vsapi->propGetData(in, "flexible_output_prop", 0, &error); 402 | if (!error) { 403 | d->flexible_output_prop = flexible_output_prop; 404 | } 405 | 406 | #ifdef USE_NVINFER_PLUGIN 407 | // related to https://github.com/AmusementClub/vs-mlrt/discussions/65, for unknown reason 408 | #if !(NV_TENSORRT_MAJOR == 9 && defined(_WIN32)) 409 | if (!initLibNvInferPlugins(&d->logger, "")) { 410 | vsapi->logMessage(mtWarning, "vsTrt: Initialize TensorRT plugins failed"); 411 | } 412 | #endif 413 | #endif 414 | 415 | std::ifstream engine_stream { 416 | 
translateName(engine_path), 417 | std::ios::binary | std::ios::ate 418 | }; 419 | 420 | if (!engine_stream.good()) { 421 | return set_error("open engine failed"); 422 | } 423 | 424 | auto engine_nbytes = engine_stream.tellg(); 425 | if (engine_nbytes == -1) { 426 | return set_error("open engine failed"); 427 | } 428 | 429 | std::unique_ptr engine_data { 430 | (char *) malloc(static_cast(engine_nbytes)), free 431 | }; 432 | engine_stream.seekg(0, std::ios::beg); 433 | engine_stream.read(engine_data.get(), static_cast(engine_nbytes)); 434 | 435 | d->runtime.reset(nvinfer1::createInferRuntime(d->logger)); 436 | auto maybe_engine = initEngine( 437 | engine_data.get(), 438 | static_cast(engine_nbytes), 439 | d->runtime, 440 | !d->flexible_output_prop.empty() 441 | ); 442 | if (std::holds_alternative>(maybe_engine)) { 443 | d->engines.push_back(std::move(std::get>(maybe_engine))); 444 | } else { 445 | return set_error(std::get(maybe_engine)); 446 | } 447 | 448 | auto maybe_profile_index = selectProfile(d->engines[0], tile_size); 449 | 450 | bool is_dynamic = false; 451 | d->instances.reserve(d->num_streams); 452 | for (int i = 0; i < d->num_streams; ++i) { 453 | auto maybe_instance = getInstance( 454 | d->engines.back(), 455 | maybe_profile_index, 456 | tile_size, 457 | d->use_cuda_graph, 458 | is_dynamic 459 | ); 460 | 461 | // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-1000-ea/developer-guide/index.html#perform-inference 462 | #if NV_TENSORRT_MAJOR < 10 463 | // duplicates ICudaEngine instances 464 | // 465 | // According to 466 | // https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-821/developer-guide/index.html#perform-inference 467 | // each optimization profile can only have one execution context when using dynamic shapes 468 | if (is_dynamic && i < d->num_streams - 1) { 469 | auto maybe_engine = initEngine(engine_data.get(), engine_nbytes, d->runtime, !d->flexible_output_prop.empty()); 470 | if (std::holds_alternative>(maybe_engine)) { 471 | d->engines.push_back(std::move(std::get>(maybe_engine))); 472 | } else { 473 | return set_error(std::get(maybe_engine)); 474 | } 475 | } 476 | #endif // NV_TENSORRT_MAJOR < 10 477 | 478 | if (std::holds_alternative(maybe_instance)) { 479 | auto instance = std::move(std::get(maybe_instance)); 480 | if (auto err = checkNodesAndContext(instance.exec_context, in_vis); err.has_value()) { 481 | return set_error(err.value()); 482 | } 483 | d->instances.emplace_back(std::move(instance)); 484 | } else { 485 | return set_error(std::get(maybe_instance)); 486 | } 487 | } 488 | 489 | d->semaphore.init(d->num_streams); 490 | d->tickets.reserve(d->num_streams); 491 | for (int i = 0; i < d->num_streams; ++i) { 492 | d->tickets.push_back(i); 493 | } 494 | 495 | #if NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 496 | auto input_name = d->engines[0]->getIOTensorName(0); 497 | auto input_type = d->engines[0]->getTensorDataType(input_name); 498 | #else // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 499 | auto input_type = d->engines[0]->getBindingDataType(0); 500 | #endif // NV_TENSORRT_MAJOR * 10 + NV_TENSORRT_MINOR >= 85 501 | 502 | VSSampleType input_sample_type; 503 | { 504 | auto sample_type = getSampleType(input_type); 505 | if (sample_type == 0) { 506 | input_sample_type = stInteger; 507 | } else if (sample_type == 1) { 508 | input_sample_type = stFloat; 509 | } else { 510 | return set_error("unknown input sample type"); 511 | } 512 | } 513 | auto input_bits_per_sample = getBytesPerSample(input_type) * 8; 514 | 515 | if 
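// Error handling in vsTrtCreate is built on std::variant<ErrorMessage, T>
// rather than exceptions: initEngine and getInstance return either the
// payload or an ErrorMessage, and the caller branches with
// std::holds_alternative before moving the payload out with std::get.
// A minimal sketch of the same poor-man's std::expected pattern
// (Widget/makeWidget/use/fail are hypothetical names):
//
//     std::variant<ErrorMessage, Widget> makeWidget() noexcept;
//
//     auto maybe_widget = makeWidget();
//     if (std::holds_alternative<Widget>(maybe_widget)) {
//         use(std::move(std::get<Widget>(maybe_widget)));
//     } else {
//         fail(std::get<ErrorMessage>(maybe_widget));
//     }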
564 | VS_EXTERNAL_API(void) VapourSynthPluginInit(
565 |     VSConfigPlugin configFunc,
566 |     VSRegisterFunction registerFunc,
567 |     VSPlugin *plugin
568 | ) noexcept {
569 | 
570 |     configFunc(
571 |         "io.github.amusementclub.vs_tensorrt", "trt",
572 |         "TensorRT ML Filter Runtime",
573 |         VAPOURSYNTH_API_VERSION, 1, plugin
574 |     );
575 | 
576 |     // TRT 9 for windows does not export getInferLibVersion()
577 | #if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
578 |     auto test = getPluginRegistry();
579 | 
580 |     if (test == nullptr) {
581 |         std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
582 |         return;
583 |     }
584 | #else // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
585 |     int ver = getInferLibVersion(); // must ensure this is the first nvinfer function called
586 | #ifdef _WIN32
587 |     if (ver == 0) { // a sentinel value, see dummy function in win32.cpp.
588 |         std::fprintf(stderr, "vstrt: TensorRT failed to load.\n");
589 |         return;
590 |     }
591 | #endif // _WIN32
592 |     if (ver != NV_TENSORRT_VERSION) {
593 | #if NV_TENSORRT_MAJOR >= 10
594 |         std::fprintf(
595 |             stderr,
596 |             "vstrt: TensorRT version mismatch, built with %ld but loaded with %d; continue but fingers crossed...\n",
597 |             NV_TENSORRT_VERSION,
598 |             ver
599 |         );
600 | #else // NV_TENSORRT_MAJOR >= 10
601 |         std::fprintf(
602 |             stderr,
603 |             "vstrt: TensorRT version mismatch, built with %d but loaded with %d; continue but fingers crossed...\n",
604 |             NV_TENSORRT_VERSION,
605 |             ver
606 |         );
607 | #endif // NV_TENSORRT_MAJOR >= 10
608 |     }
609 | #endif // NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
610 | 
611 |     myself = plugin;
612 | 
613 |     registerFunc("Model",
614 |         "clips:clip[];"
615 |         "engine_path:data;"
616 |         "overlap:int[]:opt;"
617 |         "tilesize:int[]:opt;"
618 |         "device_id:int:opt;"
619 |         "use_cuda_graph:int:opt;"
620 |         "num_streams:int:opt;"
621 |         "verbosity:int:opt;"
622 |         "flexible_output_prop:data:opt;",
623 |         vsTrtCreate,
624 |         nullptr,
625 |         plugin
626 |     );
627 | 
628 |     auto getVersion = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) {
629 |         vsapi->propSetData(out, "version", VERSION, -1, paReplace);
630 | 
631 |         vsapi->propSetData(
632 |             out, "tensorrt_version",
633 | #if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
634 |             std::to_string(NV_TENSORRT_VERSION).c_str(),
635 | #else
636 |             std::to_string(getInferLibVersion()).c_str(),
637 | #endif
638 |             -1, paReplace
639 |         );
640 | 
641 |         vsapi->propSetData(
642 |             out, "tensorrt_version_build",
643 |             std::to_string(NV_TENSORRT_VERSION).c_str(), -1, paReplace
644 |         );
645 | 
646 |         int runtime_version;
647 |         cudaRuntimeGetVersion(&runtime_version);
648 |         vsapi->propSetData(
649 |             out, "cuda_runtime_version",
650 |             std::to_string(runtime_version).c_str(), -1, paReplace
651 |         );
652 | 
653 |         vsapi->propSetData(
654 |             out, "cuda_runtime_version_build",
655 |             std::to_string(__CUDART_API_VERSION).c_str(), -1, paReplace
656 |         );
657 | 
658 |         vsapi->propSetData(out, "path", vsapi->getPluginPath(myself), -1, paReplace);
659 |     };
660 |     registerFunc("Version", "", getVersion, nullptr, plugin);
661 | 
662 |     registerFunc("DeviceProperties", "device_id:int:opt;", getDeviceProp, nullptr, plugin);
663 | }
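VapourSynthPluginInit deliberately makes getInferLibVersion() the first nvinfer call so that the delay-load hook in win32.cpp (the next file) gets to preload the right DLLs, and it only warns on a build/runtime mismatch instead of failing. A small sketch of how such a version integer can be decoded, assuming the TensorRT 10 encoding major * 10000 + minor * 100 + patch (earlier majors use a narrower scheme, so treat this as illustrative and check NvInferVersion.h for the authoritative definition):

    #include <cstdio>

    // Decode a TensorRT-10-style version integer, e.g. 100001 -> 10.0.1.
    // The encoding here is an assumption, not taken from this repository.
    void printTrtVersion(int ver) {
        std::printf("TensorRT %d.%d.%d\n", ver / 10000, ver / 100 % 100, ver % 100);
    }

    int main() {
        printTrtVersion(100001); // prints "TensorRT 10.0.1"
    }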
-------------------------------------------------------------------------------- /vstrt/win32.cpp: --------------------------------------------------------------------------------
1 | #ifdef _MSC_VER
2 | #include <cstdlib>
3 | #include <filesystem>
4 | #include <iostream>
5 | #include <stdexcept>
6 | #include <string>
7 | #include <vector>
8 | 
9 | #define DLL_DIR L"vsmlrt-cuda"
10 | 
11 | #include <windows.h>
12 | 
13 | #include <delayimp.h>
14 | 
15 | #if NV_TENSORRT_VERSION >= 100001
16 | #define TO_STRING(x) #x
17 | #define CONCAT_VERSION(name, version) (name "_" TO_STRING(version) ".dll")
18 | #endif // NV_TENSORRT_VERSION >= 100001
19 | 
20 | namespace {
21 | std::vector<std::wstring> dlls = {
22 |     // This list must be sorted by dependency.
23 | #if NV_TENSORRT_VERSION >= 100001
24 | #ifdef USE_NVINFER_PLUGIN
25 |     // nvinfer_plugin dependencies
26 |     CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR),
27 |     CONCAT_VERSION(L"nvinfer_plugin", NV_TENSORRT_MAJOR),
28 | #endif // USE_NVINFER_PLUGIN
29 |     // Finally, nvinfer again.
30 |     CONCAT_VERSION(L"nvinfer", NV_TENSORRT_MAJOR), // must be the last
31 | #else // NV_TENSORRT_VERSION >= 100001
32 | #ifdef USE_NVINFER_PLUGIN
33 |     // nvinfer_plugin dependencies
34 |     L"nvinfer.dll",
35 |     L"nvinfer_plugin.dll",
36 | #endif // USE_NVINFER_PLUGIN
37 |     // Finally, nvinfer again.
38 |     L"nvinfer.dll", // must be the last
39 | #endif // NV_TENSORRT_VERSION >= 100001
40 | };
41 | 
42 | namespace fs = std::filesystem;
43 | static fs::path dllDir() {
44 |     static const std::wstring res = []() -> std::wstring {
45 |         HMODULE mod = 0;
46 |         if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, (char *)dllDir, &mod)) {
47 |             std::vector<wchar_t> buf;
48 |             size_t n = 0;
49 |             do {
50 |                 buf.resize(buf.size() + MAX_PATH);
51 |                 n = GetModuleFileNameW(mod, buf.data(), static_cast<DWORD>(buf.size()));
52 |             } while (n >= buf.size());
53 |             buf.resize(n);
54 |             std::wstring path(buf.begin(), buf.end());
55 |             return path;
56 |         }
57 |         throw std::runtime_error("unable to locate myself");
58 |     }();
59 |     return fs::path(res).parent_path();
60 | }
61 | 
62 | FARPROC loadDLLs() {
63 |     fs::path dir = dllDir() / DLL_DIR;
64 |     HMODULE h = nullptr;
65 |     for (const auto dll: dlls) {
66 |         fs::path p = dir / dll;
67 |         std::wstring s = p;
68 |         h = LoadLibraryW(s.c_str());
69 |         DWORD err = GetLastError();
70 |         if (getenv("VSTRT_VERBOSE"))
71 |             std::wcerr << L"vstrt: preloading " << p << L": " << h << std::endl;
72 |         if (!h)
73 |             std::wcerr << L"vstrt: failed to preload " << s << L", errno " << err << std::endl;
74 |     }
75 |     return (FARPROC)h;
76 | }
77 | 
78 | #if NV_TENSORRT_MAJOR == 9 && defined(_WIN32)
79 | static void * dummy() { // mimic getPluginRegistry
80 | #else
81 | static int dummy() { // mimic getInferLibVersion
82 | #endif
83 |     return 0;
84 | }
85 | 
86 | extern "C" FARPROC WINAPI delayload_hook(unsigned reason, DelayLoadInfo* info) {
87 |     switch (reason) {
88 |     case dliNoteStartProcessing:
89 |     case dliNoteEndProcessing:
90 |         // Nothing to do here.
91 |         break;
92 |     case dliNotePreLoadLibrary:
93 |         //std::cerr << "loading " << info->szDll << std::endl;
94 |         loadDLLs();
95 |         return (FARPROC)LoadLibraryA(info->szDll);
96 |     case dliNotePreGetProcAddress:
97 |         // Nothing to do here.
98 |         break;
99 |     case dliFailLoadLib:
100 |     case dliFailGetProc:
101 |         // Returning NULL from error notifications will cause the delay load
102 |         // runtime to raise a VcppException structured exception, that some code
103 |         // might want to handle.
104 |         //return NULL;
105 |         // The SE will crash the process, so instead we return a dummy function.
106 |         return (FARPROC)dummy;
107 |         break;
108 |     default:
109 |         abort(); // unreachable.
110 |         break;
111 |     }
112 |     // Returning NULL causes the delay load machinery to perform default
113 |     // processing for this notification.
114 |     return NULL;
115 | }
116 | } // namespace
117 | 
118 | extern "C" {
119 | const PfnDliHook __pfnDliNotifyHook2 = delayload_hook;
120 | const PfnDliHook __pfnDliFailureHook2 = delayload_hook;
121 | };
122 | #endif // _MSC_VER
123 | 
--------------------------------------------------------------------------------