├── .gitignore ├── .gitmodules ├── src ├── waifu2x_postproc_fp32.comp ├── waifu2x_postproc_fp16.comp ├── waifu2x_preproc_fp32.comp ├── waifu2x_preproc_fp16.comp ├── waifu2x.hpp ├── waifu2x_postproc_tta_fp32.comp ├── waifu2x_preproc_tta_fp32.comp ├── waifu2x_postproc_tta_fp16.comp ├── waifu2x_preproc_tta_fp16.comp ├── vsw2xnvk.cpp └── waifu2x.cpp ├── LICENSE ├── README.md └── CMakeLists.txt /.gitignore: -------------------------------------------------------------------------------- 1 | build*/ 2 | .vs 3 | .idea 4 | .vscode 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "deps/ncnn"] 2 | path = deps/ncnn 3 | url = https://github.com/Tencent/ncnn.git 4 | -------------------------------------------------------------------------------- /src/waifu2x_postproc_fp32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | /* Post-processing pass, fp32 input: each invocation copies one value from bottom_blob to top_blob, clamped to [0, 1]. */ 3 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; 4 | layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; 5 | 6 | layout (push_constant) uniform parameter 7 | { 8 | int w; 9 | int h; 10 | int cstep; 11 | 12 | int outw; 13 | int outh; 14 | int outcstep; 15 | 16 | int offset_x; 17 | int gx_max; 18 | } p; 19 | 20 | void main() 21 | { 22 | int gx = int(gl_GlobalInvocationID.x); 23 | int gy = int(gl_GlobalInvocationID.y); 24 | int gz = int(gl_GlobalInvocationID.z); 25 | /* ignore invocations beyond gx_max columns, outh rows or the 3 channel planes */ 26 | if (gx >= p.gx_max || gy >= p.outh || gz >= 3) 27 | return; 28 | /* source uses row stride w and plane stride cstep; destination uses outw / outcstep and is shifted right by offset_x */ 29 | float v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; 30 | top_blob_data[gz * p.outcstep + gy * p.outw + gx + p.offset_x] = clamp(v, 0.0, 1.0); 31 | } 32 | -------------------------------------------------------------------------------- /src/waifu2x_postproc_fp16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 
| #extension GL_EXT_shader_16bit_storage: require 3 | /* Post-processing pass, fp16 input: same indexing as the fp32 variant, but the source buffer holds float16_t values that are widened to float. */ 4 | layout (binding = 0) readonly buffer bottom_blob { float16_t bottom_blob_data[]; }; 5 | layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; 6 | 7 | layout (push_constant) uniform parameter 8 | { 9 | int w; 10 | int h; 11 | int cstep; 12 | 13 | int outw; 14 | int outh; 15 | int outcstep; 16 | 17 | int offset_x; 18 | int gx_max; 19 | } p; 20 | 21 | void main() 22 | { 23 | int gx = int(gl_GlobalInvocationID.x); 24 | int gy = int(gl_GlobalInvocationID.y); 25 | int gz = int(gl_GlobalInvocationID.z); 26 | /* ignore invocations beyond gx_max columns, outh rows or the 3 channel planes */ 27 | if (gx >= p.gx_max || gy >= p.outh || gz >= 3) 28 | return; 29 | /* NOTE(review): the value is scaled by 1.006 before clamping; the reason is not documented here — presumably it compensates for fp16 precision loss, confirm */ 30 | float v = float(bottom_blob_data[gz * p.cstep + gy * p.w + gx]); 31 | top_blob_data[gz * p.outcstep + gy * p.outw + gx + p.offset_x] = clamp(v * 1.006, 0.0, 1.0); 32 | } 33 | -------------------------------------------------------------------------------- /src/waifu2x_preproc_fp32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | /* Pre-processing pass, fp32 output: fills an outw x outh tile from the source image, one value per invocation. */ 3 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; 4 | layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; }; 5 | 6 | layout (push_constant) uniform parameter 7 | { 8 | int w; 9 | int h; 10 | int cstep; 11 | 12 | int outw; 13 | int outh; 14 | int outcstep; 15 | 16 | int pad_top; 17 | int pad_left; 18 | 19 | int crop_x; 20 | int crop_y; 21 | } p; 22 | 23 | void main() 24 | { 25 | int gx = int(gl_GlobalInvocationID.x); 26 | int gy = int(gl_GlobalInvocationID.y); 27 | int gz = int(gl_GlobalInvocationID.z); 28 | /* ignore invocations outside the output tile or beyond the 3 channel planes */ 29 | if (gx >= p.outw || gy >= p.outh || gz >= 3) 30 | return; 31 | /* map the tile coordinate to a source coordinate (crop offset minus padding), then clamp it into the image so out-of-range reads repeat the nearest edge value */ 32 | int x = gx + p.crop_x - p.pad_left; 33 | int y = gy + p.crop_y - p.pad_top; 34 | x = clamp(x, 0, p.w - 1); 35 | y = clamp(y, 0, p.h - 1); 36 | /* the sample itself is also clamped to [0, 1] before being stored */ 37 | float v = bottom_blob_data[gz * p.cstep + y * p.w + x]; 38 | top_blob_data[gz * p.outcstep + gy * p.outw + gx] = clamp(v, 0.0, 1.0); 39 | } 40 | 
-------------------------------------------------------------------------------- /src/waifu2x_preproc_fp16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_EXT_shader_16bit_storage: require 3 | 4 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; 5 | layout (binding = 1) writeonly buffer top_blob { float16_t top_blob_data[]; }; 6 | 7 | layout (push_constant) uniform parameter 8 | { 9 | int w; 10 | int h; 11 | int cstep; 12 | 13 | int outw; 14 | int outh; 15 | int outcstep; 16 | 17 | int pad_top; 18 | int pad_left; 19 | 20 | int crop_x; 21 | int crop_y; 22 | } p; 23 | 24 | void main() 25 | { 26 | int gx = int(gl_GlobalInvocationID.x); 27 | int gy = int(gl_GlobalInvocationID.y); 28 | int gz = int(gl_GlobalInvocationID.z); 29 | 30 | if (gx >= p.outw || gy >= p.outh || gz >= 3) 31 | return; 32 | 33 | int x = gx + p.crop_x - p.pad_left; 34 | int y = gy + p.crop_y - p.pad_top; 35 | x = clamp(x, 0, p.w - 1); 36 | y = clamp(y, 0, p.h - 1); 37 | 38 | float v = bottom_blob_data[gz * p.cstep + y * p.w + x]; 39 | top_blob_data[gz * p.outcstep + gy * p.outw + gx] = float16_t(clamp(v, 0.0, 1.0)); 40 | } 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018-2019 HolyWu 4 | Copyright (c) 2019 nihui 5 | Copyright (c) 2019-2020 NaLan ZeYu 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The 
above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 24 | -------------------------------------------------------------------------------- /src/waifu2x.hpp: -------------------------------------------------------------------------------- 1 | #ifndef WAIFU2X_HPP 2 | #define WAIFU2X_HPP 3 | 4 | #define RGB_CHANNELS 3 5 | /* NOTE(review): the three standard header names below were stripped by extraction; they are restored from the std::string / std::mutex / std::condition_variable usage in this header — confirm against upstream */ 6 | #include <string> 7 | #include <mutex> 8 | #include <condition_variable> 9 | #include "net.h" 10 | #include "gpu.h" 11 | /* Waifu2x: holds an ncnn::Net plus a pre- and a post-processing pipeline; process() reads three float source planes and writes three float destination planes. */ 12 | class Waifu2x 13 | { 14 | public: 15 | Waifu2x(int width, int height, int scale, int tilesizew, int tilesizeh, int gpuid, int gputhread, 16 | int precision, int tta, int prepadding, const std::string& parampath, const std::string& modelpath); 17 | ~Waifu2x(); 18 | /* NOTE(review): presumably returns one of the ERROR_* codes declared below (ERROR_OK on success) — the implementation is not in this header, confirm in waifu2x.cpp */ 19 | int process(const float *srcR, const float *srcG, const float *srcB, float *dstR, float *dstG, float *dstB, ptrdiff_t srcStride, ptrdiff_t dstStride) const; 20 | 21 | enum { 22 | ERROR_OK = 0, 23 | ERROR_EXTRACTOR = -1, 24 | ERROR_SUBMIT = -2, 25 | ERROR_UPLOAD = -3, 26 | ERROR_DOWNLOAD = -4 27 | }; 28 | 29 | private: 30 | int width; 31 | int height; 32 | int scale; 33 | int tilesizew; 34 | int tilesizeh; 35 | int prepadding; 36 | int tta; 37 | 38 | ncnn::Net net; 39 | ncnn::Pipeline* waifu2x_preproc; 40 | ncnn::Pipeline* waifu2x_postproc; 41 | /* Counting semaphore built from a mutex and a condition variable. */ 42 | class Semaphore { 43 | private: 44 | int val; 45 | std::mutex mtx; 46 | std::condition_variable cv; 47 | public: 48 | explicit Semaphore(int init_value) : val(init_value) { 49 | } 50 | 
void wait() { /* blocks while the count is <= 0, then takes one unit */ 51 | std::unique_lock<std::mutex> lock(mtx); 52 | while (val <= 0) { 53 | cv.wait(lock); 54 | } 55 | val--; 56 | } 57 | void signal() { /* returns one unit and wakes a single waiter */ 58 | std::lock_guard<std::mutex> guard(mtx); 59 | val++; 60 | cv.notify_one(); 61 | } 62 | }; 63 | /* mutable so that const member functions such as process() can wait on and signal it */ 64 | mutable Semaphore semaphore; 65 | }; 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /src/waifu2x_postproc_tta_fp32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | 3 | layout (binding = 0) readonly buffer bottom_blob0 { float bottom_blob0_data[]; }; 4 | layout (binding = 1) readonly buffer bottom_blob1 { float bottom_blob1_data[]; }; 5 | layout (binding = 2) readonly buffer bottom_blob2 { float bottom_blob2_data[]; }; 6 | layout (binding = 3) readonly buffer bottom_blob3 { float bottom_blob3_data[]; }; 7 | layout (binding = 4) readonly buffer bottom_blob4 { float bottom_blob4_data[]; }; 8 | layout (binding = 5) readonly buffer bottom_blob5 { float bottom_blob5_data[]; }; 9 | layout (binding = 6) readonly buffer bottom_blob6 { float bottom_blob6_data[]; }; 10 | layout (binding = 7) readonly buffer bottom_blob7 { float bottom_blob7_data[]; }; 11 | layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; }; 12 | 13 | layout (push_constant) uniform parameter 14 | { 15 | int w; 16 | int h; 17 | int cstep; 18 | 19 | int outw; 20 | int outh; 21 | int outcstep; 22 | 23 | int offset_x; 24 | int gx_max; 25 | } p; 26 | 27 | void main() 28 | { 29 | int gx = int(gl_GlobalInvocationID.x); 30 | int gy = int(gl_GlobalInvocationID.y); 31 | int gz = int(gl_GlobalInvocationID.z); 32 | int gzi = gz * p.cstep; 33 | 34 | if (gx >= p.gx_max || gy >= p.outh || gz >= 3) 35 | return; 36 | 37 | float v0 = bottom_blob0_data[gzi + gy * p.w + gx]; 38 | float v1 = bottom_blob1_data[gzi + gy * p.w + (p.w - 1 - gx)]; 39 | float v2 = bottom_blob2_data[gzi + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]; 40 | float v3 = bottom_blob3_data[gzi + (p.h - 1 
- gy) * p.w + gx]; 41 | float v4 = bottom_blob4_data[gzi + gx * p.h + gy]; /* blobs 4-7 are addressed with row stride p.h, i.e. with x and y swapped */ 42 | float v5 = bottom_blob5_data[gzi + gx * p.h + (p.h - 1 - gy)]; 43 | float v6 = bottom_blob6_data[gzi + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]; 44 | float v7 = bottom_blob7_data[gzi + (p.w - 1 - gx) * p.h + gy]; 45 | /* average the eight samples read above */ 46 | float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f; 47 | /* NOTE(review): this fp32 shader applies the 1.006 gain, whereas the non-TTA fp32 post-process shader writes clamp(v, 0.0, 1.0) without it; the factor otherwise appears only in the fp16 variants — confirm it is intended here */ 48 | top_blob_data[gz * p.outcstep + gy * p.outw + gx + p.offset_x] = clamp(v * 1.006, 0.0, 1.0); 49 | } 50 | -------------------------------------------------------------------------------- /src/waifu2x_preproc_tta_fp32.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | /* TTA pre-processing pass, fp32 output: one source buffer, eight destination buffers. */ 3 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; 4 | layout (binding = 1) writeonly buffer top_blob0 { float top_blob0_data[]; }; 5 | layout (binding = 2) writeonly buffer top_blob1 { float top_blob1_data[]; }; 6 | layout (binding = 3) writeonly buffer top_blob2 { float top_blob2_data[]; }; 7 | layout (binding = 4) writeonly buffer top_blob3 { float top_blob3_data[]; }; 8 | layout (binding = 5) writeonly buffer top_blob4 { float top_blob4_data[]; }; 9 | layout (binding = 6) writeonly buffer top_blob5 { float top_blob5_data[]; }; 10 | layout (binding = 7) writeonly buffer top_blob6 { float top_blob6_data[]; }; 11 | layout (binding = 8) writeonly buffer top_blob7 { float top_blob7_data[]; }; 12 | 13 | layout (push_constant) uniform parameter 14 | { 15 | int w; 16 | int h; 17 | int cstep; 18 | 19 | int outw; 20 | int outh; 21 | int outcstep; 22 | 23 | int pad_top; 24 | int pad_left; 25 | 26 | int crop_x; 27 | int crop_y; 28 | } p; 29 | 30 | void main() 31 | { 32 | int gx = int(gl_GlobalInvocationID.x); 33 | int gy = int(gl_GlobalInvocationID.y); 34 | int gz = int(gl_GlobalInvocationID.z); 35 | int gzi = gz * p.outcstep; 36 | /* ignore invocations outside the output tile or beyond the 3 channel planes */ 37 | if (gx >= p.outw || gy >= p.outh || gz >= 3) 38 | return; 39 | /* tile coordinate -> source coordinate: add the crop offset, subtract the padding */ 40 | int x = gx + p.crop_x - p.pad_left; 41 | int y = gy + 
p.crop_y - p.pad_top; 42 | x = clamp(x, 0, p.w - 1); 43 | y = clamp(y, 0, p.h - 1); 44 | 45 | float v = clamp(bottom_blob_data[gz * p.cstep + y * p.w + x], 0.0, 1.0); 46 | 47 | top_blob0_data[gzi + gy * p.outw + gx] = v; 48 | top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = v; 49 | top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = v; 50 | top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = v; 51 | top_blob4_data[gzi + gx * p.outh + gy] = v; 52 | top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = v; 53 | top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = v; 54 | top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = v; 55 | } 56 | -------------------------------------------------------------------------------- /src/waifu2x_postproc_tta_fp16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_EXT_shader_16bit_storage: require 3 | 4 | layout (binding = 0) readonly buffer bottom_blob0 { float16_t bottom_blob0_data[]; }; 5 | layout (binding = 1) readonly buffer bottom_blob1 { float16_t bottom_blob1_data[]; }; 6 | layout (binding = 2) readonly buffer bottom_blob2 { float16_t bottom_blob2_data[]; }; 7 | layout (binding = 3) readonly buffer bottom_blob3 { float16_t bottom_blob3_data[]; }; 8 | layout (binding = 4) readonly buffer bottom_blob4 { float16_t bottom_blob4_data[]; }; 9 | layout (binding = 5) readonly buffer bottom_blob5 { float16_t bottom_blob5_data[]; }; 10 | layout (binding = 6) readonly buffer bottom_blob6 { float16_t bottom_blob6_data[]; }; 11 | layout (binding = 7) readonly buffer bottom_blob7 { float16_t bottom_blob7_data[]; }; 12 | layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; }; 13 | 14 | layout (push_constant) uniform parameter 15 | { 16 | int w; 17 | int h; 18 | int cstep; 19 | 20 | int outw; 21 | int outh; 22 | int outcstep; 23 | 24 | int offset_x; 25 | int gx_max; 26 | } p; 27 | 28 | void main() 29 | 
{ 30 | int gx = int(gl_GlobalInvocationID.x); 31 | int gy = int(gl_GlobalInvocationID.y); 32 | int gz = int(gl_GlobalInvocationID.z); 33 | int gzi = gz * p.cstep; 34 | 35 | if (gx >= p.gx_max || gy >= p.outh || gz >= 3) 36 | return; 37 | 38 | float v0 = float(bottom_blob0_data[gzi + gy * p.w + gx]); 39 | float v1 = float(bottom_blob1_data[gzi + gy * p.w + (p.w - 1 - gx)]); 40 | float v2 = float(bottom_blob2_data[gzi + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)]); 41 | float v3 = float(bottom_blob3_data[gzi + (p.h - 1 - gy) * p.w + gx]); 42 | float v4 = float(bottom_blob4_data[gzi + gx * p.h + gy]); 43 | float v5 = float(bottom_blob5_data[gzi + gx * p.h + (p.h - 1 - gy)]); 44 | float v6 = float(bottom_blob6_data[gzi + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)]); 45 | float v7 = float(bottom_blob7_data[gzi + (p.w - 1 - gx) * p.h + gy]); 46 | 47 | float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f; 48 | 49 | top_blob_data[gz * p.outcstep + gy * p.outw + gx + p.offset_x] = clamp(v * 1.006, 0.0, 1.0); 50 | } 51 | -------------------------------------------------------------------------------- /src/waifu2x_preproc_tta_fp16.comp: -------------------------------------------------------------------------------- 1 | #version 450 2 | #extension GL_EXT_shader_16bit_storage: require 3 | 4 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; }; 5 | layout (binding = 1) writeonly buffer top_blob0 { float16_t top_blob0_data[]; }; 6 | layout (binding = 2) writeonly buffer top_blob1 { float16_t top_blob1_data[]; }; 7 | layout (binding = 3) writeonly buffer top_blob2 { float16_t top_blob2_data[]; }; 8 | layout (binding = 4) writeonly buffer top_blob3 { float16_t top_blob3_data[]; }; 9 | layout (binding = 5) writeonly buffer top_blob4 { float16_t top_blob4_data[]; }; 10 | layout (binding = 6) writeonly buffer top_blob5 { float16_t top_blob5_data[]; }; 11 | layout (binding = 7) writeonly buffer top_blob6 { float16_t top_blob6_data[]; }; 12 | layout (binding = 
8) writeonly buffer top_blob7 { float16_t top_blob7_data[]; }; 13 | 14 | layout (push_constant) uniform parameter 15 | { 16 | int w; 17 | int h; 18 | int cstep; 19 | 20 | int outw; 21 | int outh; 22 | int outcstep; 23 | 24 | int pad_top; 25 | int pad_left; 26 | 27 | int crop_x; 28 | int crop_y; 29 | } p; 30 | 31 | void main() 32 | { 33 | int gx = int(gl_GlobalInvocationID.x); 34 | int gy = int(gl_GlobalInvocationID.y); 35 | int gz = int(gl_GlobalInvocationID.z); 36 | int gzi = gz * p.outcstep; 37 | 38 | if (gx >= p.outw || gy >= p.outh || gz >= 3) 39 | return; 40 | 41 | int x = gx + p.crop_x - p.pad_left; 42 | int y = gy + p.crop_y - p.pad_top; 43 | x = clamp(x, 0, p.w - 1); 44 | y = clamp(y, 0, p.h - 1); 45 | 46 | float v = clamp(bottom_blob_data[gz * p.cstep + y * p.w + x], 0.0, 1.0); 47 | 48 | top_blob0_data[gzi + gy * p.outw + gx] = float16_t(v); 49 | top_blob1_data[gzi + gy * p.outw + (p.outw - 1 - gx)] = float16_t(v); 50 | top_blob2_data[gzi + (p.outh - 1 - gy) * p.outw + (p.outw - 1 - gx)] = float16_t(v); 51 | top_blob3_data[gzi + (p.outh - 1 - gy) * p.outw + gx] = float16_t(v); 52 | top_blob4_data[gzi + gx * p.outh + gy] = float16_t(v); 53 | top_blob5_data[gzi + gx * p.outh + (p.outh - 1 - gy)] = float16_t(v); 54 | top_blob6_data[gzi + (p.outw - 1 - gx) * p.outh + (p.outh - 1 - gy)] = float16_t(v); 55 | top_blob7_data[gzi + (p.outw - 1 - gx) * p.outh + gy] = float16_t(v); 56 | } 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VapourSynth Waifu2x NCNN Vulkan Plugin 2 | 3 | Waifu2x filter for VapourSynth, based on [waifu2x-ncnn-vulkan](https://github.com/nihui/waifu2x-ncnn-vulkan). 
4 | 5 | ## DEPRECATED 6 | 7 | **This plugin has been deprecated.** 8 | 9 | I recommend using [vs-mlrt](https://github.com/AmusementClub/vs-mlrt) instead, which offers broader model support, provides more inference framework options (including NCNN Vulkan), and is actively maintained. 10 | 11 | ## Install 12 | 13 | Download pre-built binaries and model files from [releases](https://github.com/Nlzy/vapoursynth-waifu2x-ncnn-vulkan/releases). Extract the archive and put its contents into the VapourSynth plugin folder. 14 | 15 | ## Usage 16 | 17 | ``` 18 | core.w2xnvk.Waifu2x(clip[, noise, scale, model, tile_size, gpu_id, gpu_thread, precision, tile_size_w, tile_size_h, tta]) 19 | ``` 20 | 21 | * clip: Input clip. Only 32-bit float RGB is supported. 22 | 23 | * noise: Denoise level. (int -1/0/1/2/3, default=0) 24 | * -1 = none 25 | * 0 = low 26 | * 1 = medium 27 | * 2 = high 28 | * 3 = highest 29 | 30 | * scale: Upscale ratio. (int 1/2, default=2) 31 | * 1 = no scaling, denoise only. upconv_7 doesn't support this mode. 32 | * 2 = upscale 2x. 33 | 34 | * model: Model to use. (int 0/1/2, default=0) 35 | * 0 = upconv_7_anime_style_art_rgb 36 | * 1 = upconv_7_photo 37 | * 2 = cunet (For 2D artwork. Slow, but better quality.) 38 | 39 | * tile_size: Tile size. Must be divisible by 4. Increasing this value may improve performance but uses more VRAM. (int >=32, default=0 to choose automatically) 40 | 41 | * gpu_id: GPU device to use. (int >=0, default=0) 42 | 43 | * gpu_thread: Number of threads that can simultaneously access the GPU. (int >=1, default=0 to detect automatically) 44 | 45 | * precision: Floating-point precision. Single-precision (fp32) is slow but more precise in color. Default is half-precision (fp16). (int 16/32, default=16) 46 | 47 | * tile_size_w / tile_size_h: Override width and height of tile_size. 48 | 49 | * tta: TTA (test-time augmentation) mode. 
(bool True/False, default=False) 50 | 51 | ## Build 52 | 53 | ### Linux 54 | 55 | Install dependencies: 56 | 57 | ```bash 58 | # Arch Linux 59 | sudo pacman -S vapoursynth glslang vulkan-icd-loader vulkan-headers 60 | # Fedora 61 | sudo dnf install vapoursynth-devel glslang vulkan-loader-devel vulkan-headers 62 | ``` 63 | 64 | Get source code and build: 65 | 66 | ```bash 67 | # clone repository and submodule 68 | git clone https://github.com/Nlzy/vapoursynth-waifu2x-ncnn-vulkan.git 69 | cd vapoursynth-waifu2x-ncnn-vulkan 70 | git submodule update --init --recursive 71 | mkdir build 72 | cd build 73 | 74 | # build 75 | cmake .. 76 | cmake --build . -j 4 77 | ``` 78 | 79 | ### Windows 80 | 81 | Install [Vulkan SDK](https://vulkan.lunarg.com/sdk/home). 82 | 83 | Open `Git Bash`, clone repository and submodule: 84 | 85 | ```bash 86 | git clone https://github.com/Nlzy/vapoursynth-waifu2x-ncnn-vulkan.git 87 | cd vapoursynth-waifu2x-ncnn-vulkan 88 | git submodule update --init --recursive 89 | mkdir build 90 | ``` 91 | 92 | Open `Start Menu` -> `Visual Studio 2019` -> `x64 Native Tools Command Prompt for VS 2019`, then build: 93 | 94 | ``` 95 | cd X:\path_to_vapoursynth-waifu2x-ncnn-vulkan\build 96 | cmake -G "NMake Makefiles" -DVAPOURSYNTH_HEADER_DIR=X:\path_to_vapoursynth\sdk\include\vapoursynth .. 97 | cmake --build . 98 | ``` 99 | 100 | Note: If you are using VapourSynth "Portable" version, use `-DVAPOURSYNTH_HEADER_DIR=X:\path_to_vapoursynth\sdk\include` instead. 
101 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) # must precede project() so the policy version is set before the project is configured 2 | project(vapoursynth-waifu2x-ncnn-vulkan) 3 | set(CMAKE_BUILD_TYPE Release) 4 | 5 | find_package(Vulkan REQUIRED) 6 | 7 | # check glslangValidator 8 | find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH) 9 | message(STATUS "Found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}") 10 | # compile_shader(<src>): compiles one GLSL compute shader into <name>.spv.hex.h in the build directory and appends it to SHADER_SPV_HEX_FILES 11 | macro(compile_shader SHADER_SRC) 12 | set(SHADER_SRC_FULLPATH ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_SRC}) 13 | get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) 14 | set(SHADER_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.hex.h) 15 | add_custom_command( 16 | OUTPUT ${SHADER_SPV_HEX_FILE} 17 | COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} 18 | ARGS -V -s -x -o ${SHADER_SPV_HEX_FILE} ${SHADER_SRC_FULLPATH} 19 | DEPENDS ${SHADER_SRC_FULLPATH} 20 | COMMENT "Building SPIR-V module ${SHADER_SRC_NAME_WE}.spv" 21 | VERBATIM 22 | ) 23 | set_source_files_properties(${SHADER_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) 24 | list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_FILE}) 25 | endmacro() 26 | 27 | # enable global link time optimization 28 | cmake_policy(SET CMP0069 NEW) 29 | set(CMAKE_POLICY_DEFAULT_CMP0069 NEW) 30 | include(CheckIPOSupported) 31 | check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output) 32 | if(ipo_supported) 33 | set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) 34 | else() 35 | message(WARNING "IPO is not supported: ${ipo_supported_output}") 36 | endif() 37 | 38 | # build ncnn library 39 | if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/deps/ncnn/CMakeLists.txt") 40 | message(FATAL_ERROR "The submodules were not downloaded! 
Please update submodules with \"git submodule update --init --recursive\" and try again.") 41 | endif() 42 | option(NCNN_INSTALL_SDK "" OFF) 43 | option(NCNN_PIXEL_ROTATE "" OFF) 44 | option(NCNN_PIXEL_AFFINE "" OFF) 45 | option(NCNN_PIXEL_DRAWING "" OFF) 46 | option(NCNN_VULKAN "" ON) 47 | option(NCNN_VULKAN_ONLINE_SPIRV "" ON) 48 | option(NCNN_BUILD_BENCHMARK "" OFF) 49 | option(NCNN_BUILD_TESTS "" OFF) 50 | option(NCNN_BUILD_TOOLS "" OFF) 51 | option(NCNN_BUILD_EXAMPLES "" OFF) 52 | option(NCNN_DISABLE_RTTI "" ON) 53 | option(NCNN_DISABLE_EXCEPTION "" ON) 54 | option(NCNN_INT8 "" OFF) 55 | option(NCNN_OPENMP "" OFF) 56 | option(WITH_LAYER_absval "" OFF) 57 | option(WITH_LAYER_argmax "" OFF) 58 | option(WITH_LAYER_batchnorm "" OFF) 59 | option(WITH_LAYER_bias "" OFF) 60 | option(WITH_LAYER_bnll "" OFF) 61 | option(WITH_LAYER_concat "" OFF) 62 | option(WITH_LAYER_convolution "" ON) 63 | option(WITH_LAYER_crop "" ON) 64 | option(WITH_LAYER_deconvolution "" ON) 65 | option(WITH_LAYER_dropout "" OFF) 66 | option(WITH_LAYER_eltwise "" ON) 67 | option(WITH_LAYER_elu "" OFF) 68 | option(WITH_LAYER_embed "" OFF) 69 | option(WITH_LAYER_exp "" OFF) 70 | option(WITH_LAYER_flatten "" ON) 71 | option(WITH_LAYER_innerproduct "" ON) 72 | option(WITH_LAYER_input "" ON) 73 | option(WITH_LAYER_log "" OFF) 74 | option(WITH_LAYER_lrn "" OFF) 75 | option(WITH_LAYER_memorydata "" OFF) 76 | option(WITH_LAYER_mvn "" OFF) 77 | option(WITH_LAYER_pooling "" ON) 78 | option(WITH_LAYER_power "" OFF) 79 | option(WITH_LAYER_prelu "" OFF) 80 | option(WITH_LAYER_proposal "" OFF) 81 | option(WITH_LAYER_reduction "" OFF) 82 | option(WITH_LAYER_relu "" ON) 83 | option(WITH_LAYER_reshape "" OFF) 84 | option(WITH_LAYER_roipooling "" OFF) 85 | option(WITH_LAYER_scale "" ON) 86 | option(WITH_LAYER_sigmoid "" OFF) 87 | option(WITH_LAYER_slice "" OFF) 88 | option(WITH_LAYER_softmax "" OFF) 89 | option(WITH_LAYER_split "" ON) 90 | option(WITH_LAYER_spp "" OFF) 91 | option(WITH_LAYER_tanh "" OFF) 92 | 
option(WITH_LAYER_threshold "" OFF) 93 | option(WITH_LAYER_tile "" OFF) 94 | option(WITH_LAYER_rnn "" OFF) 95 | option(WITH_LAYER_lstm "" OFF) 96 | option(WITH_LAYER_binaryop "" OFF) 97 | option(WITH_LAYER_unaryop "" OFF) 98 | option(WITH_LAYER_convolutiondepthwise "" OFF) 99 | option(WITH_LAYER_padding "" ON) 100 | option(WITH_LAYER_squeeze "" OFF) 101 | option(WITH_LAYER_expanddims "" OFF) 102 | option(WITH_LAYER_normalize "" OFF) 103 | option(WITH_LAYER_permute "" OFF) 104 | option(WITH_LAYER_priorbox "" OFF) 105 | option(WITH_LAYER_detectionoutput "" OFF) 106 | option(WITH_LAYER_interp "" ON) 107 | option(WITH_LAYER_deconvolutiondepthwise "" OFF) 108 | option(WITH_LAYER_shufflechannel "" OFF) 109 | option(WITH_LAYER_instancenorm "" OFF) 110 | option(WITH_LAYER_clip "" OFF) 111 | option(WITH_LAYER_reorg "" OFF) 112 | option(WITH_LAYER_yolodetectionoutput "" OFF) 113 | option(WITH_LAYER_quantize "" OFF) 114 | option(WITH_LAYER_dequantize "" OFF) 115 | option(WITH_LAYER_yolov3detectionoutput "" OFF) 116 | option(WITH_LAYER_psroipooling "" OFF) 117 | option(WITH_LAYER_roialign "" OFF) 118 | option(WITH_LAYER_packing "" ON) 119 | option(WITH_LAYER_requantize "" OFF) 120 | option(WITH_LAYER_cast "" ON) 121 | option(WITH_LAYER_hardsigmoid "" OFF) 122 | option(WITH_LAYER_selu "" OFF) 123 | option(WITH_LAYER_hardswish "" OFF) 124 | option(WITH_LAYER_noop "" OFF) 125 | option(WITH_LAYER_pixelshuffle "" OFF) 126 | option(WITH_LAYER_deepcopy "" OFF) 127 | option(WITH_LAYER_mish "" OFF) 128 | option(WITH_LAYER_statisticspooling "" OFF) 129 | option(WITH_LAYER_swish "" OFF) 130 | option(WITH_LAYER_gemm "" OFF) 131 | option(WITH_LAYER_groupnorm "" OFF) 132 | option(WITH_LAYER_layernorm "" OFF) 133 | option(WITH_LAYER_softplus "" OFF) 134 | option(WITH_LAYER_gru "" OFF) 135 | option(WITH_LAYER_multiheadattention "" OFF) 136 | option(WITH_LAYER_gelu "" OFF) 137 | option(WITH_LAYER_convolution1d "" OFF) 138 | option(WITH_LAYER_pooling1d "" OFF) 139 | 
option(WITH_LAYER_convolutiondepthwise1d "" OFF) 140 | option(WITH_LAYER_convolution3d "" OFF) 141 | option(WITH_LAYER_convolutiondepthwise3d "" OFF) 142 | option(WITH_LAYER_pooling3d "" OFF) 143 | option(WITH_LAYER_matmul "" OFF) 144 | option(WITH_LAYER_deconvolution1d "" OFF) 145 | option(WITH_LAYER_deconvolutiondepthwise1d "" OFF) 146 | option(WITH_LAYER_deconvolution3d "" OFF) 147 | option(WITH_LAYER_deconvolutiondepthwise3d "" OFF) 148 | add_subdirectory(deps/ncnn) 149 | 150 | # generate-spirv target 151 | set(SHADER_SPV_HEX_FILES) 152 | compile_shader(src/waifu2x_preproc_fp16.comp) 153 | compile_shader(src/waifu2x_preproc_fp32.comp) 154 | compile_shader(src/waifu2x_preproc_tta_fp16.comp) 155 | compile_shader(src/waifu2x_preproc_tta_fp32.comp) 156 | compile_shader(src/waifu2x_postproc_fp16.comp) 157 | compile_shader(src/waifu2x_postproc_fp32.comp) 158 | compile_shader(src/waifu2x_postproc_tta_fp16.comp) 159 | compile_shader(src/waifu2x_postproc_tta_fp32.comp) 160 | add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES}) 161 | 162 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 163 | 164 | # check VapourSynth headers 165 | set(VAPOURSYNTH_HEADER_DIR "/usr/include/vapoursynth" CACHE PATH "VapourSynth header files") 166 | if(NOT EXISTS "${VAPOURSYNTH_HEADER_DIR}/VSHelper.h") 167 | message(FATAL_ERROR "VapourSynth header files NOT FOUND, specify with -DVAPOURSYNTH_HEADER_DIR") 168 | endif() 169 | 170 | # libvsw2xnvk 171 | add_library(vsw2xnvk SHARED src/vsw2xnvk.cpp src/waifu2x.cpp) 172 | target_include_directories(vsw2xnvk PRIVATE ${VAPOURSYNTH_HEADER_DIR}) 173 | target_link_libraries(vsw2xnvk ncnn ${Vulkan_LIBRARY}) 174 | add_dependencies(vsw2xnvk generate-spirv) 175 | -------------------------------------------------------------------------------- /src/vsw2xnvk.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2018-2019 HolyWu 5 | Copyright (c) 2019-2020 NaLan ZeYu 6 | 
7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #include 27 | #include 28 | #include "gpu.h" 29 | #include "waifu2x.hpp" 30 | #include "VSHelper.h" 31 | 32 | static ncnn::Mutex instanceLock; 33 | static int instanceCounter = 0; 34 | 35 | static int tryCreateGpuInstance() { 36 | ncnn::MutexLockGuard lg(instanceLock); 37 | if (instanceCounter++ == 0) { 38 | return ncnn::create_gpu_instance(); 39 | } else { 40 | return 0; 41 | } 42 | } 43 | 44 | static void tryDestoryGpuInstance() { 45 | ncnn::MutexLockGuard lg(instanceLock); 46 | if (--instanceCounter == 0) { 47 | ncnn::destroy_gpu_instance(); 48 | } 49 | } 50 | 51 | typedef struct { 52 | VSNodeRef *node; 53 | VSVideoInfo vi; 54 | Waifu2x *waifu2x; 55 | } FilterData; 56 | 57 | static int filter(const VSFrameRef *src, VSFrameRef *dst, FilterData * const VS_RESTRICT d, const VSAPI *vsapi) noexcept { 58 | const int srcStride = vsapi->getStride(src, 0) / static_cast(sizeof(float)); 59 | const int dstStride = vsapi->getStride(dst, 0) / static_cast(sizeof(float)); 60 | auto * srcR = reinterpret_cast(vsapi->getReadPtr(src, 0)); 61 | auto * srcG = reinterpret_cast(vsapi->getReadPtr(src, 1)); 62 | auto * srcB = reinterpret_cast(vsapi->getReadPtr(src, 2)); 63 | auto * VS_RESTRICT dstR = reinterpret_cast(vsapi->getWritePtr(dst, 0)); 64 | auto * VS_RESTRICT dstG = reinterpret_cast(vsapi->getWritePtr(dst, 1)); 65 | auto * VS_RESTRICT dstB = reinterpret_cast(vsapi->getWritePtr(dst, 2)); 66 | return d->waifu2x->process(srcR, srcG, srcB, dstR, dstG, dstB, srcStride, dstStride); 67 | } 68 | 69 | static void VS_CC filterInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) { 70 | auto *d = static_cast(*instanceData); 71 | vsapi->setVideoInfo(&d->vi, 1, node); 72 | } 73 | 74 | static const VSFrameRef *VS_CC filterGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) { 75 | auto *d = static_cast(*instanceData); 76 | 77 | if (activationReason 
== arInitial) { 78 | vsapi->requestFrameFilter(n, d->node, frameCtx); 79 | } else if (activationReason == arAllFramesReady) { 80 | auto src = vsapi->getFrameFilter(n, d->node, frameCtx); 81 | auto dst = vsapi->newVideoFrame(d->vi.format, d->vi.width, d->vi.height, src, core); 82 | 83 | int err = filter(src, dst, d, vsapi); 84 | switch (err) { 85 | case Waifu2x::ERROR_OK: 86 | vsapi->freeFrame(src); 87 | return dst; 88 | case Waifu2x::ERROR_EXTRACTOR: 89 | vsapi->setFilterError("Waifu2x-NCNN-Vulkan: Waifu2x extractor error. Try to decrease tile_size or gpu_thread", frameCtx); 90 | vsapi->freeFrame(src); 91 | vsapi->freeFrame(dst); 92 | return nullptr; 93 | case Waifu2x::ERROR_DOWNLOAD: 94 | case Waifu2x::ERROR_UPLOAD: 95 | case Waifu2x::ERROR_SUBMIT: 96 | vsapi->setFilterError("Waifu2x-NCNN-Vulkan: Waifu2x submit error. Try to decrease gpu_thread", frameCtx); 97 | vsapi->freeFrame(src); 98 | vsapi->freeFrame(dst); 99 | return nullptr; 100 | } 101 | } 102 | 103 | return nullptr; 104 | } 105 | 106 | static void VS_CC filterFree(void *instanceData, VSCore *core, const VSAPI *vsapi) { 107 | auto *d = static_cast(instanceData); 108 | vsapi->freeNode(d->node); 109 | delete d->waifu2x; 110 | delete d; 111 | tryDestoryGpuInstance(); 112 | } 113 | 114 | static void VS_CC filterCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) { 115 | FilterData d{}; 116 | d.node = vsapi->propGetNode(in, "clip", 0, nullptr); 117 | d.vi = *vsapi->getVideoInfo(d.node); 118 | 119 | int gpuId, noise, scale, model, tileSizeW, tileSizeH, gpuThread, precision, tta; 120 | std::string paramPath, modelPath; 121 | char const * err_prompt = nullptr; 122 | do { 123 | int err; 124 | 125 | err = tryCreateGpuInstance(); 126 | if (err) { 127 | err_prompt = "create gpu instance failed"; 128 | break; 129 | } 130 | 131 | if (!isConstantFormat(&d.vi) || d.vi.format->colorFamily != cmRGB || d.vi.format->sampleType != stFloat || d.vi.format->bitsPerSample != 32) { 132 | 
err_prompt = "only constant RGB format and 32 bit float input supported"; 133 | break; 134 | } 135 | 136 | gpuId = int64ToIntS(vsapi->propGetInt(in, "gpu_id", 0, &err)); 137 | if (gpuId < 0 || gpuId >= ncnn::get_gpu_count()) { 138 | err_prompt = "invalid 'gpu_id'"; 139 | break; 140 | } 141 | 142 | noise = int64ToIntS(vsapi->propGetInt(in, "noise", 0, &err)); 143 | if (noise < -1 || noise > 3) { 144 | err_prompt = "'noise' must be -1, 0, 1, 2, or 3"; 145 | break; 146 | } 147 | 148 | scale = int64ToIntS(vsapi->propGetInt(in, "scale", 0, &err)); 149 | if (err) 150 | scale = 2; 151 | if (scale != 1 && scale != 2) { 152 | err_prompt = "'scale' must be 1 or 2"; 153 | break; 154 | } 155 | 156 | model = int64ToIntS(vsapi->propGetInt(in, "model", 0, &err)); 157 | if (model < 0 || model > 2) { 158 | err_prompt = "'model' must be 0, 1 or 2"; 159 | break; 160 | } 161 | 162 | precision = int64ToIntS(vsapi->propGetInt(in, "precision", 0, &err)); 163 | if (err) 164 | precision = 16; 165 | if (precision != 16 && precision != 32) { 166 | err_prompt = "'precision' must be 16 or 32"; 167 | break; 168 | } 169 | 170 | tta = int64ToIntS(vsapi->propGetInt(in, "tta", 0, &err)); 171 | if (err) 172 | tta = 0; 173 | if (tta != 0) 174 | tta = 1; 175 | 176 | int customGpuThread = int64ToIntS(vsapi->propGetInt(in, "gpu_thread", 0, &err)); 177 | if (customGpuThread > 0) { 178 | gpuThread = customGpuThread; 179 | } 180 | else { 181 | gpuThread = int64ToIntS(ncnn::get_gpu_info(gpuId).transfer_queue_count()); 182 | } 183 | gpuThread = std::min(gpuThread, int64ToIntS(ncnn::get_gpu_info(gpuId).compute_queue_count())); 184 | 185 | int tileSize = int64ToIntS(vsapi->propGetInt(in, "tile_size", 0, &err)); 186 | if (tileSize == 0) { 187 | double vram = ncnn::get_gpu_device(gpuId)->get_heap_budget(); // in MByte 188 | double factor = (precision == 32 ? 2 : 1) * (model == 2 ? 
1.5 : 1) * gpuThread; 189 | if (vram / factor > 900) 190 | tileSize = 360; 191 | else if (vram / factor > 450) 192 | tileSize = 240; 193 | else 194 | tileSize = 180; 195 | } 196 | if (tileSize < 32) { 197 | err_prompt = "'tile_size' must be greater than or equal to 32"; 198 | break; 199 | } 200 | if (tileSize % 4) { 201 | err_prompt = "'tile_size' must be multiple of 4"; 202 | break; 203 | } 204 | tileSizeW = tileSizeH = tileSize; 205 | 206 | int tw = int64ToIntS(vsapi->propGetInt(in, "tile_size_w", 0, &err)); 207 | if (!err) { 208 | if (tw < 32) { 209 | err_prompt = "'tile_size_w' must be greater than or equal to 32"; 210 | break; 211 | } 212 | if (tw % 4) { 213 | err_prompt = "'tile_size_w' must be multiple of 4"; 214 | break; 215 | } 216 | tileSizeW = tw; 217 | } 218 | 219 | int th = int64ToIntS(vsapi->propGetInt(in, "tile_size_h", 0, &err)); 220 | if (!err) { 221 | if (th < 32) { 222 | err_prompt = "'tile_size_h' must be greater than or equal to 32"; 223 | break; 224 | } 225 | if (th % 4) { 226 | err_prompt = "'tile_size_h' must be multiple of 4"; 227 | break; 228 | } 229 | tileSizeH = th; 230 | } 231 | 232 | if (scale == 1 && noise == -1) { 233 | err_prompt = "use 'noise=-1' and 'scale=1' at same time is useless"; 234 | break; 235 | } 236 | 237 | if (scale == 1 && model != 2) { 238 | err_prompt = "only cunet model support 'scale=1'"; 239 | break; 240 | } 241 | 242 | // set model path 243 | const std::string pluginFilePath{ vsapi->getPluginPath(vsapi->getPluginById("net.nlzy.vsw2xnvk", core)) }; 244 | const std::string pluginDir = pluginFilePath.substr(0, pluginFilePath.find_last_of('/')); 245 | 246 | std::string modelsDir; 247 | if (model == 0) 248 | modelsDir += pluginDir + "/models-upconv_7_anime_style_art_rgb/"; 249 | else if (model == 1) 250 | modelsDir += pluginDir + "/models-upconv_7_photo/"; 251 | else 252 | modelsDir += pluginDir + "/models-cunet/"; 253 | 254 | std::string modelName; 255 | if (noise == -1) 256 | modelName = "scale2.0x_model"; 257 | 
else if (scale == 1) 258 | modelName = "noise" + std::to_string(noise) + "_model"; 259 | else 260 | modelName = "noise" + std::to_string(noise) + "_scale2.0x_model"; 261 | 262 | paramPath = modelsDir + modelName + ".param"; 263 | modelPath = modelsDir + modelName + ".bin"; 264 | 265 | // check model file readable 266 | std::ifstream pf(paramPath); 267 | std::ifstream mf(modelPath); 268 | if (!pf.good() || !mf.good()) { 269 | err_prompt = "can't open model file"; 270 | break; 271 | } 272 | 273 | break; 274 | } while (false); 275 | 276 | if (err_prompt) { 277 | vsapi->setError(out, (std::string{"Waifu2x-NCNN-Vulkan: "} + err_prompt).c_str()); 278 | vsapi->freeNode(d.node); 279 | tryDestoryGpuInstance(); 280 | return; 281 | } 282 | 283 | int prepadding; 284 | if (model == 2 && scale == 1) 285 | prepadding = 28; 286 | else if (model == 2) 287 | prepadding = 18; 288 | else 289 | prepadding = 7; 290 | 291 | d.waifu2x = new Waifu2x(d.vi.width, d.vi.height, scale, tileSizeW, tileSizeH, gpuId, gpuThread, precision, tta, prepadding, paramPath, modelPath); 292 | d.vi.width *= scale; 293 | d.vi.height *= scale; 294 | 295 | auto *data = new FilterData{ d }; 296 | 297 | vsapi->createFilter(in, out, "Waifu2x", filterInit, filterGetFrame, filterFree, fmParallel, 0, data, core); 298 | } 299 | 300 | VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) { 301 | configFunc("net.nlzy.vsw2xnvk", "w2xnvk", "VapourSynth Waifu2x NCNN Vulkan Plugin", VAPOURSYNTH_API_VERSION, 1, plugin); 302 | registerFunc("Waifu2x", "clip:clip;" 303 | "noise:int:opt;" 304 | "scale:int:opt;" 305 | "model:int:opt;" 306 | "tile_size:int:opt;" 307 | "gpu_id:int:opt;" 308 | "gpu_thread:int:opt;" 309 | "precision:int:opt;" 310 | "tile_size_w:int:opt;" 311 | "tile_size_h:int:opt;" 312 | "tta:int:opt;" 313 | , filterCreate, nullptr, plugin); 314 | } 315 | -------------------------------------------------------------------------------- 
/src/waifu2x.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | 4 | Copyright (c) 2019 nihui 5 | Copyright (c) 2019-2020 NaLan ZeYu 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 | SOFTWARE. 
24 | */ 25 | 26 | #include 27 | #include 28 | #include "waifu2x.hpp" 29 | 30 | #define DIV_CEIL(a, b) (((a) + (b) - 1) / (b)) 31 | #define PAD_TO_ALIGN(a, b) ((((a) + (b) - 1) / (b)) * (b) - (a)) 32 | 33 | static const uint32_t waifu2x_preproc_fp32_spv_data[] = { 34 | #include "waifu2x_preproc_fp32.spv.hex.h" 35 | }; 36 | static const uint32_t waifu2x_preproc_fp16_spv_data[] = { 37 | #include "waifu2x_preproc_fp16.spv.hex.h" 38 | }; 39 | static const uint32_t waifu2x_preproc_tta_fp32_spv_data[] = { 40 | #include "waifu2x_preproc_tta_fp32.spv.hex.h" 41 | }; 42 | static const uint32_t waifu2x_preproc_tta_fp16_spv_data[] = { 43 | #include "waifu2x_preproc_tta_fp16.spv.hex.h" 44 | }; 45 | 46 | static const uint32_t waifu2x_postproc_fp32_spv_data[] = { 47 | #include "waifu2x_postproc_fp32.spv.hex.h" 48 | }; 49 | static const uint32_t waifu2x_postproc_fp16_spv_data[] = { 50 | #include "waifu2x_postproc_fp16.spv.hex.h" 51 | }; 52 | static const uint32_t waifu2x_postproc_tta_fp32_spv_data[] = { 53 | #include "waifu2x_postproc_tta_fp32.spv.hex.h" 54 | }; 55 | static const uint32_t waifu2x_postproc_tta_fp16_spv_data[] = { 56 | #include "waifu2x_postproc_tta_fp16.spv.hex.h" 57 | }; 58 | 59 | 60 | Waifu2x::Waifu2x(int width, int height, int scale, int tilesizew, int tilesizeh, int gpuid, int gputhread, 61 | int precision, int tta, int prepadding, const std::string& parampath, const std::string& modelpath) : 62 | width(width), height(height), scale(scale), tilesizew(tilesizew), tilesizeh(tilesizeh), prepadding(prepadding), tta(tta), semaphore(gputhread) 63 | { 64 | net.opt.use_vulkan_compute = true; 65 | net.opt.use_fp16_packed = precision == 16; 66 | net.opt.use_fp16_storage = precision == 16; 67 | net.opt.use_fp16_arithmetic = false; 68 | net.opt.use_int8_storage = false; 69 | net.opt.use_int8_arithmetic = false; 70 | net.set_vulkan_device(gpuid); 71 | net.load_param(parampath.c_str()); 72 | net.load_model(modelpath.c_str()); 73 | 74 | std::vector specializations; 75 | 
waifu2x_preproc = new ncnn::Pipeline(net.vulkan_device()); 76 | waifu2x_preproc->set_optimal_local_size_xyz(8, 8, 3); 77 | if (tta) { 78 | if (net.opt.use_fp16_storage) 79 | waifu2x_preproc->create(waifu2x_preproc_tta_fp16_spv_data, sizeof(waifu2x_preproc_tta_fp16_spv_data), specializations); 80 | else 81 | waifu2x_preproc->create(waifu2x_preproc_tta_fp32_spv_data, sizeof(waifu2x_preproc_tta_fp32_spv_data), specializations); 82 | } else { 83 | if (net.opt.use_fp16_storage) 84 | waifu2x_preproc->create(waifu2x_preproc_fp16_spv_data, sizeof(waifu2x_preproc_fp16_spv_data), specializations); 85 | else 86 | waifu2x_preproc->create(waifu2x_preproc_fp32_spv_data, sizeof(waifu2x_preproc_fp32_spv_data), specializations); 87 | } 88 | 89 | 90 | waifu2x_postproc = new ncnn::Pipeline(net.vulkan_device()); 91 | waifu2x_postproc->set_optimal_local_size_xyz(8, 8, 3); 92 | if (tta) { 93 | if (net.opt.use_fp16_storage) 94 | waifu2x_postproc->create(waifu2x_postproc_tta_fp16_spv_data, sizeof(waifu2x_postproc_tta_fp16_spv_data), specializations); 95 | else 96 | waifu2x_postproc->create(waifu2x_postproc_tta_fp32_spv_data, sizeof(waifu2x_postproc_tta_fp32_spv_data), specializations); 97 | } else { 98 | if (net.opt.use_fp16_storage) 99 | waifu2x_postproc->create(waifu2x_postproc_fp16_spv_data, sizeof(waifu2x_postproc_fp16_spv_data), specializations); 100 | else 101 | waifu2x_postproc->create(waifu2x_postproc_fp32_spv_data, sizeof(waifu2x_postproc_fp32_spv_data), specializations); 102 | } 103 | } 104 | 105 | Waifu2x::~Waifu2x() { 106 | delete waifu2x_preproc; 107 | delete waifu2x_postproc; 108 | } 109 | 110 | int Waifu2x::process(const float *srcR, const float *srcG, const float *srcB, 111 | float *dstR, float *dstG, float *dstB, 112 | const ptrdiff_t srcStride, const ptrdiff_t dstStride) const { 113 | semaphore.wait(); 114 | 115 | ncnn::VkAllocator* blob_vkallocator = net.vulkan_device()->acquire_blob_allocator(); 116 | ncnn::VkAllocator* staging_vkallocator = 
net.vulkan_device()->acquire_staging_allocator(); 117 | ncnn::Option opt = net.opt; 118 | opt.blob_vkallocator = blob_vkallocator; 119 | opt.workspace_vkallocator = blob_vkallocator; 120 | opt.staging_vkallocator = staging_vkallocator; 121 | 122 | const int xtiles = DIV_CEIL(width, tilesizew); 123 | const int ytiles = DIV_CEIL(height, tilesizeh); 124 | 125 | for (int yi = 0; yi < ytiles; yi++) { 126 | ncnn::VkCompute cmd(net.vulkan_device()); 127 | 128 | const int tile_nopad_y0 = yi * tilesizeh; 129 | const int tile_nopad_y1 = std::min(tile_nopad_y0 + tilesizeh, height); 130 | const int tile_nopad_h = tile_nopad_y1 - tile_nopad_y0; 131 | const int prepadding_bottom = prepadding + PAD_TO_ALIGN(tile_nopad_h, 4 / scale); 132 | const int tile_pad_y0 = std::max(tile_nopad_y0 - prepadding, 0); 133 | const int tile_pad_y1 = std::min(tile_nopad_y1 + prepadding_bottom, height); 134 | const int tile_pad_h = tile_pad_y1 - tile_pad_y0; 135 | 136 | 137 | // upload 138 | ncnn::Mat in(width, tile_pad_h, RGB_CHANNELS, sizeof(float)); 139 | for (int y = 0; y < tile_pad_h; y++) { 140 | memcpy((float*)in.channel(0) + y * width, srcR + (y + tile_pad_y0) * srcStride, sizeof(float) * width); 141 | memcpy((float*)in.channel(1) + y * width, srcG + (y + tile_pad_y0) * srcStride, sizeof(float) * width); 142 | memcpy((float*)in.channel(2) + y * width, srcB + (y + tile_pad_y0) * srcStride, sizeof(float) * width); 143 | } 144 | 145 | ncnn::VkMat in_gpu; 146 | cmd.record_clone(in, in_gpu, opt); 147 | if (xtiles > 1) { 148 | if (cmd.submit_and_wait()) { 149 | return ERROR_UPLOAD; 150 | } 151 | cmd.reset(); 152 | } 153 | 154 | 155 | ncnn::VkMat out_gpu; 156 | out_gpu.create(width * scale, tile_nopad_h * scale, RGB_CHANNELS, sizeof(float), blob_vkallocator); 157 | 158 | for (int xi = 0; xi < xtiles; xi++) { 159 | const int tile_nopad_x0 = xi * tilesizew; 160 | const int tile_nopad_x1 = std::min(tile_nopad_x0 + tilesizew, width); 161 | const int tile_nopad_w = tile_nopad_x1 - tile_nopad_x0; 162 | 
const int prepadding_right = prepadding + PAD_TO_ALIGN(tile_nopad_w, 4 / scale); 163 | 164 | const int waifu2x_times = tta ? 8 : 1; 165 | 166 | std::vector in_tile_gpu(waifu2x_times); 167 | if (tta) { 168 | for (int i = 0; i < 4; i++) { 169 | in_tile_gpu[i].create( 170 | tile_nopad_x1 - tile_nopad_x0 + prepadding + prepadding_right, 171 | tile_nopad_y1 - tile_nopad_y0 + prepadding + prepadding_bottom, 172 | RGB_CHANNELS, net.opt.use_fp16_storage ? 2u : 4u, 1, blob_vkallocator); 173 | } 174 | for (int i = 0; i < 4; i++) { 175 | in_tile_gpu[4 + i].create( 176 | tile_nopad_y1 - tile_nopad_y0 + prepadding + prepadding_bottom, 177 | tile_nopad_x1 - tile_nopad_x0 + prepadding + prepadding_right, 178 | RGB_CHANNELS, net.opt.use_fp16_storage ? 2u : 4u, 1, blob_vkallocator); 179 | } 180 | } else { 181 | in_tile_gpu[0].create( 182 | tile_nopad_x1 - tile_nopad_x0 + prepadding + prepadding_right, 183 | tile_nopad_y1 - tile_nopad_y0 + prepadding + prepadding_bottom, 184 | RGB_CHANNELS, net.opt.use_fp16_storage ? 
2u : 4u, 1, blob_vkallocator); 185 | } 186 | 187 | // preproc 188 | { 189 | std::vector bindings(1 + waifu2x_times); 190 | bindings[0] = in_gpu; 191 | for (int i = 0; i < waifu2x_times; ++i) { 192 | bindings[1 + i] = in_tile_gpu[i]; 193 | } 194 | 195 | std::vector constants(10); 196 | constants[0].i = in_gpu.w; 197 | constants[1].i = in_gpu.h; 198 | constants[2].i = in_gpu.cstep; 199 | constants[3].i = in_tile_gpu[0].w; 200 | constants[4].i = in_tile_gpu[0].h; 201 | constants[5].i = in_tile_gpu[0].cstep; 202 | constants[6].i = prepadding; 203 | constants[7].i = prepadding; 204 | constants[8].i = tile_nopad_x0; 205 | constants[9].i = std::min(tile_nopad_y0, prepadding); 206 | 207 | ncnn::VkMat dispatcher; 208 | dispatcher.w = in_tile_gpu[0].w; 209 | dispatcher.h = in_tile_gpu[0].h; 210 | dispatcher.c = RGB_CHANNELS; 211 | 212 | cmd.record_pipeline(waifu2x_preproc, bindings, constants, dispatcher); 213 | } 214 | 215 | 216 | // waifu2x 217 | std::vector out_tile_gpu(waifu2x_times); 218 | 219 | for (int i = 0; i < waifu2x_times; ++i) { 220 | ncnn::Extractor ex = net.create_extractor(); 221 | ex.set_blob_vkallocator(blob_vkallocator); 222 | ex.set_workspace_vkallocator(blob_vkallocator); 223 | ex.set_staging_vkallocator(staging_vkallocator); 224 | 225 | ex.input("Input1", in_tile_gpu[i]); 226 | 227 | if (ex.extract("Eltwise4", out_tile_gpu[i], cmd)) { 228 | return ERROR_EXTRACTOR; 229 | } 230 | } 231 | 232 | 233 | // postproc 234 | { 235 | std::vector bindings(waifu2x_times + 1); 236 | for (int i = 0; i < waifu2x_times; ++i) { 237 | bindings[i] = out_tile_gpu[i]; 238 | } 239 | bindings.back() = out_gpu; 240 | 241 | std::vector constants(8); 242 | constants[0].i = out_tile_gpu[0].w; 243 | constants[1].i = out_tile_gpu[0].h; 244 | constants[2].i = out_tile_gpu[0].cstep; 245 | constants[3].i = out_gpu.w; 246 | constants[4].i = out_gpu.h; 247 | constants[5].i = out_gpu.cstep; 248 | constants[6].i = tile_nopad_x0 * scale; 249 | constants[7].i = std::min(out_gpu.w - 
tile_nopad_x0 * scale, tilesizew * scale); 250 | 251 | ncnn::VkMat dispatcher; 252 | dispatcher.w = std::min(out_gpu.w - tile_nopad_x0 * scale, tilesizew * scale); 253 | dispatcher.h = out_gpu.h; 254 | dispatcher.c = RGB_CHANNELS; 255 | 256 | cmd.record_pipeline(waifu2x_postproc, bindings, constants, dispatcher); 257 | } 258 | 259 | 260 | if (xtiles > 1) { 261 | if (cmd.submit_and_wait()) { 262 | return ERROR_SUBMIT; 263 | } 264 | cmd.reset(); 265 | } 266 | } 267 | 268 | // download 269 | { 270 | ncnn::Mat out; 271 | cmd.record_clone(out_gpu, out, opt); 272 | if (cmd.submit_and_wait()) { 273 | return ERROR_DOWNLOAD; 274 | } 275 | 276 | for (int y = 0; y < out.h; y++) { 277 | memcpy(dstR + tile_nopad_y0 * scale * dstStride + y * dstStride, (float *)out.channel(0) + y * out.w, out.w * sizeof(float)); 278 | memcpy(dstG + tile_nopad_y0 * scale * dstStride + y * dstStride, (float *)out.channel(1) + y * out.w, out.w * sizeof(float)); 279 | memcpy(dstB + tile_nopad_y0 * scale * dstStride + y * dstStride, (float *)out.channel(2) + y * out.w, out.w * sizeof(float)); 280 | } 281 | } 282 | } 283 | 284 | net.vulkan_device()->reclaim_blob_allocator(blob_vkallocator); 285 | net.vulkan_device()->reclaim_staging_allocator(staging_vkallocator); 286 | semaphore.signal(); // release only when successful 287 | return ERROR_OK; 288 | } 289 | --------------------------------------------------------------------------------