├── .gitignore
├── .gitmodules
├── src
    ├── waifu2x_postproc_fp32.comp
    ├── waifu2x_postproc_fp16.comp
    ├── waifu2x_preproc_fp32.comp
    ├── waifu2x_preproc_fp16.comp
    ├── waifu2x.hpp
    ├── waifu2x_postproc_tta_fp32.comp
    ├── waifu2x_preproc_tta_fp32.comp
    ├── waifu2x_postproc_tta_fp16.comp
    ├── waifu2x_preproc_tta_fp16.comp
    ├── vsw2xnvk.cpp
    └── waifu2x.cpp
├── LICENSE
├── README.md
└── CMakeLists.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | build*/
2 | .vs
3 | .idea
4 | .vscode
5 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "deps/ncnn"]
2 | 	path = deps/ncnn
3 | 	url = https://github.com/Tencent/ncnn.git
4 | 


--------------------------------------------------------------------------------
/src/waifu2x_postproc_fp32.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 4 | layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
 5 | 
 6 | layout (push_constant) uniform parameter
 7 | {
 8 |     int w;
 9 |     int h;
10 |     int cstep;
11 | 
12 |     int outw;
13 |     int outh;
14 |     int outcstep;
15 | 
16 |     int offset_x;
17 |     int gx_max;
18 | } p;
19 | 
20 | void main()
21 | {
22 |     // x = column, y = row, z = channel plane of the element handled by this invocation.
23 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
24 |     if (id.x >= p.gx_max || id.y >= p.outh || id.z >= 3)
25 |         return;
26 | 
27 |     // Copy one sample into the output, shifted right by offset_x and limited to [0, 1].
28 |     const int src = id.z * p.cstep + id.y * p.w + id.x;
29 |     const int dst = id.z * p.outcstep + id.y * p.outw + id.x + p.offset_x;
30 |     top_blob_data[dst] = clamp(bottom_blob_data[src], 0.0, 1.0);
31 | }
32 | 


--------------------------------------------------------------------------------
/src/waifu2x_postproc_fp16.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_EXT_shader_16bit_storage: require
 3 | 
 4 | layout (binding = 0) readonly buffer bottom_blob { float16_t bottom_blob_data[]; };
 5 | layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
 6 | 
 7 | layout (push_constant) uniform parameter
 8 | {
 9 |     int w;
10 |     int h;
11 |     int cstep;
12 | 
13 |     int outw;
14 |     int outh;
15 |     int outcstep;
16 | 
17 |     int offset_x;
18 |     int gx_max;
19 | } p;
20 | 
21 | void main()
22 | {
23 |     // x = column, y = row, z = channel plane of the element handled by this invocation.
24 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
25 |     if (id.x >= p.gx_max || id.y >= p.outh || id.z >= 3)
26 |         return;
27 | 
28 |     // Widen the half-precision sample, apply the 1.006 gain, then limit to [0, 1].
29 |     const int src = id.z * p.cstep + id.y * p.w + id.x;
30 |     const int dst = id.z * p.outcstep + id.y * p.outw + id.x + p.offset_x;
31 |     top_blob_data[dst] = clamp(float(bottom_blob_data[src]) * 1.006, 0.0, 1.0);
32 | }
33 | 


--------------------------------------------------------------------------------
/src/waifu2x_preproc_fp32.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 4 | layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
 5 | 
 6 | layout (push_constant) uniform parameter
 7 | {
 8 |     int w;
 9 |     int h;
10 |     int cstep;
11 | 
12 |     int outw;
13 |     int outh;
14 |     int outcstep;
15 | 
16 |     int pad_top;
17 |     int pad_left;
18 | 
19 |     int crop_x;
20 |     int crop_y;
21 | } p;
22 | 
23 | void main()
24 | {
25 |     // x = column, y = row, z = channel plane of the output element.
26 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
27 |     if (id.x >= p.outw || id.y >= p.outh || id.z >= 3)
28 |         return;
29 | 
30 |     // Translate the output coordinate into the input image (crop offset minus
31 |     // padding); coordinates outside the image are clamped to its nearest edge.
32 |     const int sx = clamp(id.x + p.crop_x - p.pad_left, 0, p.w - 1);
33 |     const int sy = clamp(id.y + p.crop_y - p.pad_top, 0, p.h - 1);
34 | 
35 |     const int src = id.z * p.cstep + sy * p.w + sx;
36 |     const int dst = id.z * p.outcstep + id.y * p.outw + id.x;
37 |     // Values are limited to [0, 1] on the way out.
38 |     top_blob_data[dst] = clamp(bottom_blob_data[src], 0.0, 1.0);
39 | }
40 | 


--------------------------------------------------------------------------------
/src/waifu2x_preproc_fp16.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_EXT_shader_16bit_storage: require
 3 | 
 4 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 5 | layout (binding = 1) writeonly buffer top_blob { float16_t top_blob_data[]; };
 6 | 
 7 | layout (push_constant) uniform parameter
 8 | {
 9 |     int w;
10 |     int h;
11 |     int cstep;
12 | 
13 |     int outw;
14 |     int outh;
15 |     int outcstep;
16 | 
17 |     int pad_top;
18 |     int pad_left;
19 | 
20 |     int crop_x;
21 |     int crop_y;
22 | } p;
23 | 
24 | void main()
25 | {
26 |     // x = column, y = row, z = channel plane of the output element.
27 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
28 |     if (id.x >= p.outw || id.y >= p.outh || id.z >= 3)
29 |         return;
30 | 
31 |     // Translate the output coordinate into the input image (crop offset minus
32 |     // padding); coordinates outside the image are clamped to its nearest edge.
33 |     const int sx = clamp(id.x + p.crop_x - p.pad_left, 0, p.w - 1);
34 |     const int sy = clamp(id.y + p.crop_y - p.pad_top, 0, p.h - 1);
35 | 
36 |     const int src = id.z * p.cstep + sy * p.w + sx;
37 |     const int dst = id.z * p.outcstep + id.y * p.outw + id.x;
38 |     // Limit to [0, 1] in fp32 first, then narrow to half precision for storage.
39 |     top_blob_data[dst] = float16_t(clamp(bottom_blob_data[src], 0.0, 1.0));
40 | }
41 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2018-2019 HolyWu
 4 | Copyright (c) 2019 nihui
 5 | Copyright (c) 2019-2020 NaLan ZeYu
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/src/waifu2x.hpp:
--------------------------------------------------------------------------------
 1 | #ifndef WAIFU2X_HPP
 2 | #define WAIFU2X_HPP
 3 | 
 4 | #define RGB_CHANNELS 3
 5 | 
 6 | #include <string>
 7 | #include <mutex>
 8 | #include <condition_variable>
 9 | #include "net.h"
10 | #include "gpu.h"
11 | 
12 | class Waifu2x
13 | {
14 | public:
15 |     Waifu2x(int width, int height, int scale, int tilesizew, int tilesizeh, int gpuid, int gputhread,
16 |             int precision, int tta, int prepadding, const std::string& parampath, const std::string& modelpath);
17 |     ~Waifu2x();
18 |     // Takes three planar float inputs (R, G, B) and three planar float outputs, each set with one stride.
19 |     int process(const float *srcR, const float *srcG, const float *srcB, float *dstR, float *dstG, float *dstB, ptrdiff_t srcStride, ptrdiff_t dstStride) const;
20 |     // Status codes; presumably the values process() returns -- confirm against waifu2x.cpp.
21 |     enum {
22 |         ERROR_OK = 0,
23 |         ERROR_EXTRACTOR = -1,
24 |         ERROR_SUBMIT = -2,
25 |         ERROR_UPLOAD = -3,
26 |         ERROR_DOWNLOAD = -4
27 |     };
28 | 
29 | private:
30 |     int width;
31 |     int height;
32 |     int scale;
33 |     int tilesizew;
34 |     int tilesizeh;
35 |     int prepadding;
36 |     int tta;
37 |     // ncnn objects; the two pipelines are held through raw pointers.
38 |     ncnn::Net net;
39 |     ncnn::Pipeline* waifu2x_preproc;
40 |     ncnn::Pipeline* waifu2x_postproc;
41 |     // Counting semaphore assembled from a mutex and a condition variable.
42 |     class Semaphore {
43 |         int permits;  // members declared before the first access specifier are private
44 |         std::mutex guard;
45 |         std::condition_variable available;
46 | 
47 |     public:
48 |         explicit Semaphore(int init_value) : permits(init_value) {}
49 | 
50 |         // Blocks until the count is positive, then takes one permit.
51 |         void wait() {
52 |             std::unique_lock<std::mutex> hold(guard);
53 |             available.wait(hold, [this] { return permits > 0; });
54 |             --permits;
55 |         }
56 | 
57 |         // Gives one permit back and wakes a single waiter while still holding the lock.
58 |         void signal() {
59 |             std::unique_lock<std::mutex> hold(guard);
60 |             ++permits;
61 |             available.notify_one();
62 |         }
63 |     };
64 |     mutable Semaphore semaphore;  // mutable: may be waited on/signalled from const members
65 | };
66 | 
67 | #endif
68 | 


--------------------------------------------------------------------------------
/src/waifu2x_postproc_tta_fp32.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (binding = 0) readonly buffer bottom_blob0 { float bottom_blob0_data[]; };
 4 | layout (binding = 1) readonly buffer bottom_blob1 { float bottom_blob1_data[]; };
 5 | layout (binding = 2) readonly buffer bottom_blob2 { float bottom_blob2_data[]; };
 6 | layout (binding = 3) readonly buffer bottom_blob3 { float bottom_blob3_data[]; };
 7 | layout (binding = 4) readonly buffer bottom_blob4 { float bottom_blob4_data[]; };
 8 | layout (binding = 5) readonly buffer bottom_blob5 { float bottom_blob5_data[]; };
 9 | layout (binding = 6) readonly buffer bottom_blob6 { float bottom_blob6_data[]; };
10 | layout (binding = 7) readonly buffer bottom_blob7 { float bottom_blob7_data[]; };
11 | layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; };
12 | 
13 | layout (push_constant) uniform parameter
14 | {
15 |     int w;
16 |     int h;
17 |     int cstep;
18 | 
19 |     int outw;
20 |     int outh;
21 |     int outcstep;
22 | 
23 |     int offset_x;
24 |     int gx_max;
25 | } p;
26 | 
27 | void main()
28 | {
29 |     int gx = int(gl_GlobalInvocationID.x);
30 |     int gy = int(gl_GlobalInvocationID.y);
31 |     int gz = int(gl_GlobalInvocationID.z);
32 |     int gzi = gz * p.cstep; // start of this channel plane inside every input blob
33 | 
34 |     if (gx >= p.gx_max || gy >= p.outh || gz >= 3)
35 |         return;
36 |     // Blobs 0-3 are read row-major (row stride p.w) with x and/or y mirrored; blobs 4-7 are read transposed (row stride p.h).
37 |     float v0 = bottom_blob0_data[gzi + gy * p.w + gx];
38 |     float v1 = bottom_blob1_data[gzi + gy * p.w + (p.w - 1 - gx)];
39 |     float v2 = bottom_blob2_data[gzi + (p.h - 1 - gy) * p.w + (p.w - 1 - gx)];
40 |     float v3 = bottom_blob3_data[gzi + (p.h - 1 - gy) * p.w + gx];
41 |     float v4 = bottom_blob4_data[gzi + gx * p.h + gy];
42 |     float v5 = bottom_blob5_data[gzi + gx * p.h + (p.h - 1 - gy)];
43 |     float v6 = bottom_blob6_data[gzi + (p.w - 1 - gx) * p.h + (p.h - 1 - gy)];
44 |     float v7 = bottom_blob7_data[gzi + (p.w - 1 - gx) * p.h + gy];
45 |     // Mean of the eight samples.
46 |     float v = (v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7) * 0.125f;
47 |     // fp32 path: no 1.006 gain, matching waifu2x_postproc_fp32.comp (the gain is only applied by the fp16 shaders).
48 |     top_blob_data[gz * p.outcstep + gy * p.outw + gx + p.offset_x] = clamp(v, 0.0, 1.0);
49 | }
50 | 


--------------------------------------------------------------------------------
/src/waifu2x_preproc_tta_fp32.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | 
 3 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 4 | layout (binding = 1) writeonly buffer top_blob0 { float top_blob0_data[]; };
 5 | layout (binding = 2) writeonly buffer top_blob1 { float top_blob1_data[]; };
 6 | layout (binding = 3) writeonly buffer top_blob2 { float top_blob2_data[]; };
 7 | layout (binding = 4) writeonly buffer top_blob3 { float top_blob3_data[]; };
 8 | layout (binding = 5) writeonly buffer top_blob4 { float top_blob4_data[]; };
 9 | layout (binding = 6) writeonly buffer top_blob5 { float top_blob5_data[]; };
10 | layout (binding = 7) writeonly buffer top_blob6 { float top_blob6_data[]; };
11 | layout (binding = 8) writeonly buffer top_blob7 { float top_blob7_data[]; };
12 | 
13 | layout (push_constant) uniform parameter
14 | {
15 |     int w;
16 |     int h;
17 |     int cstep;
18 | 
19 |     int outw;
20 |     int outh;
21 |     int outcstep;
22 | 
23 |     int pad_top;
24 |     int pad_left;
25 | 
26 |     int crop_x;
27 |     int crop_y;
28 | } p;
29 | 
30 | void main()
31 | {
32 |     // x = column, y = row, z = channel plane of the output element.
33 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
34 |     if (id.x >= p.outw || id.y >= p.outh || id.z >= 3)
35 |         return;
36 | 
37 |     // Source pixel for this element, clamped to the edges of the input image.
38 |     const int sx = clamp(id.x + p.crop_x - p.pad_left, 0, p.w - 1);
39 |     const int sy = clamp(id.y + p.crop_y - p.pad_top, 0, p.h - 1);
40 |     const float v = clamp(bottom_blob_data[id.z * p.cstep + sy * p.w + sx], 0.0, 1.0);
41 | 
42 |     // Mirrored output coordinates and the base offset of the channel plane.
43 |     const int fx = p.outw - 1 - id.x;
44 |     const int fy = p.outh - 1 - id.y;
45 |     const int plane = id.z * p.outcstep;
46 | 
47 |     top_blob0_data[plane + id.y * p.outw + id.x] = v;  // unchanged
48 |     top_blob1_data[plane + id.y * p.outw + fx] = v;    // x mirrored
49 |     top_blob2_data[plane + fy * p.outw + fx] = v;      // x and y mirrored
50 |     top_blob3_data[plane + fy * p.outw + id.x] = v;    // y mirrored
51 |     top_blob4_data[plane + id.x * p.outh + id.y] = v;  // transposed (row stride outh)
52 |     top_blob5_data[plane + id.x * p.outh + fy] = v;    // transposed, y mirrored
53 |     top_blob6_data[plane + fx * p.outh + fy] = v;      // transposed, x and y mirrored
54 |     top_blob7_data[plane + fx * p.outh + id.y] = v;    // transposed, x mirrored
55 | }
56 | 


--------------------------------------------------------------------------------
/src/waifu2x_postproc_tta_fp16.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_EXT_shader_16bit_storage: require
 3 | 
 4 | layout (binding = 0) readonly buffer bottom_blob0 { float16_t bottom_blob0_data[]; };
 5 | layout (binding = 1) readonly buffer bottom_blob1 { float16_t bottom_blob1_data[]; };
 6 | layout (binding = 2) readonly buffer bottom_blob2 { float16_t bottom_blob2_data[]; };
 7 | layout (binding = 3) readonly buffer bottom_blob3 { float16_t bottom_blob3_data[]; };
 8 | layout (binding = 4) readonly buffer bottom_blob4 { float16_t bottom_blob4_data[]; };
 9 | layout (binding = 5) readonly buffer bottom_blob5 { float16_t bottom_blob5_data[]; };
10 | layout (binding = 6) readonly buffer bottom_blob6 { float16_t bottom_blob6_data[]; };
11 | layout (binding = 7) readonly buffer bottom_blob7 { float16_t bottom_blob7_data[]; };
12 | layout (binding = 8) writeonly buffer top_blob { float top_blob_data[]; };
13 | 
14 | layout (push_constant) uniform parameter
15 | {
16 |     int w;
17 |     int h;
18 |     int cstep;
19 | 
20 |     int outw;
21 |     int outh;
22 |     int outcstep;
23 | 
24 |     int offset_x;
25 |     int gx_max;
26 | } p;
27 | 
28 | void main()
29 | {
30 |     // x = column, y = row, z = channel plane of the output element.
31 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
32 |     if (id.x >= p.gx_max || id.y >= p.outh || id.z >= 3)
33 |         return;
34 | 
35 |     // Mirrored input coordinates and the base offset of the channel plane.
36 |     const int fx = p.w - 1 - id.x;
37 |     const int fy = p.h - 1 - id.y;
38 |     const int plane = id.z * p.cstep;
39 | 
40 |     // Sum blob 0 .. blob 7 in that order: blobs 0-3 row-major, blobs 4-7 transposed (row stride p.h).
41 |     float sum = float(bottom_blob0_data[plane + id.y * p.w + id.x]);
42 |     sum += float(bottom_blob1_data[plane + id.y * p.w + fx]);
43 |     sum += float(bottom_blob2_data[plane + fy * p.w + fx]);
44 |     sum += float(bottom_blob3_data[plane + fy * p.w + id.x]);
45 |     sum += float(bottom_blob4_data[plane + id.x * p.h + id.y]);
46 |     sum += float(bottom_blob5_data[plane + id.x * p.h + fy]);
47 |     sum += float(bottom_blob6_data[plane + fx * p.h + fy]);
48 |     sum += float(bottom_blob7_data[plane + fx * p.h + id.y]);
49 |     top_blob_data[id.z * p.outcstep + id.y * p.outw + id.x + p.offset_x] = clamp(sum * 0.125f * 1.006, 0.0, 1.0);
50 | }
51 | 


--------------------------------------------------------------------------------
/src/waifu2x_preproc_tta_fp16.comp:
--------------------------------------------------------------------------------
 1 | #version 450
 2 | #extension GL_EXT_shader_16bit_storage: require
 3 | 
 4 | layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
 5 | layout (binding = 1) writeonly buffer top_blob0 { float16_t top_blob0_data[]; };
 6 | layout (binding = 2) writeonly buffer top_blob1 { float16_t top_blob1_data[]; };
 7 | layout (binding = 3) writeonly buffer top_blob2 { float16_t top_blob2_data[]; };
 8 | layout (binding = 4) writeonly buffer top_blob3 { float16_t top_blob3_data[]; };
 9 | layout (binding = 5) writeonly buffer top_blob4 { float16_t top_blob4_data[]; };
10 | layout (binding = 6) writeonly buffer top_blob5 { float16_t top_blob5_data[]; };
11 | layout (binding = 7) writeonly buffer top_blob6 { float16_t top_blob6_data[]; };
12 | layout (binding = 8) writeonly buffer top_blob7 { float16_t top_blob7_data[]; };
13 | 
14 | layout (push_constant) uniform parameter
15 | {
16 |     int w;
17 |     int h;
18 |     int cstep;
19 | 
20 |     int outw;
21 |     int outh;
22 |     int outcstep;
23 | 
24 |     int pad_top;
25 |     int pad_left;
26 | 
27 |     int crop_x;
28 |     int crop_y;
29 | } p;
30 | 
31 | void main()
32 | {
33 |     // x = column, y = row, z = channel plane of the output element.
34 |     const ivec3 id = ivec3(gl_GlobalInvocationID);
35 |     if (id.x >= p.outw || id.y >= p.outh || id.z >= 3)
36 |         return;
37 | 
38 |     // Source pixel for this element, clamped to the edges of the input image.
39 |     const int sx = clamp(id.x + p.crop_x - p.pad_left, 0, p.w - 1);
40 |     const int sy = clamp(id.y + p.crop_y - p.pad_top, 0, p.h - 1);
41 |     const float v = clamp(bottom_blob_data[id.z * p.cstep + sy * p.w + sx], 0.0, 1.0);
42 | 
43 |     // Mirrored output coordinates and the base offset of the channel plane.
44 |     const int fx = p.outw - 1 - id.x;
45 |     const int fy = p.outh - 1 - id.y;
46 |     const int plane = id.z * p.outcstep;
47 | 
48 |     top_blob0_data[plane + id.y * p.outw + id.x] = float16_t(v);  // unchanged
49 |     top_blob1_data[plane + id.y * p.outw + fx] = float16_t(v);    // x mirrored
50 |     top_blob2_data[plane + fy * p.outw + fx] = float16_t(v);      // x and y mirrored
51 |     top_blob3_data[plane + fy * p.outw + id.x] = float16_t(v);    // y mirrored
52 |     top_blob4_data[plane + id.x * p.outh + id.y] = float16_t(v);  // transposed (row stride outh)
53 |     top_blob5_data[plane + id.x * p.outh + fy] = float16_t(v);    // transposed, y mirrored
54 |     top_blob6_data[plane + fx * p.outh + fy] = float16_t(v);      // transposed, x and y mirrored
55 |     top_blob7_data[plane + fx * p.outh + id.y] = float16_t(v);    // transposed, x mirrored
56 | }
57 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # VapourSynth Waifu2x NCNN Vulkan Plugin
  2 | 
  3 | Waifu2x filter for VapourSynth, based on [waifu2x-ncnn-vulkan](https://github.com/nihui/waifu2x-ncnn-vulkan).
  4 | 
  5 | ## DEPRECATED
  6 | 
  7 | **This plugin has been deprecated.**
  8 | 
  9 | I recommend using [vs-mlrt](https://github.com/AmusementClub/vs-mlrt) instead, which offers broader model support, provides more inference framework options (including NCNN Vulkan), and is actively maintained.
 10 | 
 11 | ## Install
 12 | 
 13 | Download pre-built binaries and model files from [releases](https://github.com/Nlzy/vapoursynth-waifu2x-ncnn-vulkan/releases). Extract the archive and place its contents in the VapourSynth plugin folder.
 14 | 
 15 | ## Usage
 16 | 
 17 | ```
 18 | core.w2xnvk.Waifu2x(clip[, noise, scale, model, tile_size, gpu_id, gpu_thread, precision, tile_size_w, tile_size_h, tta])
 19 | ```
 20 | 
 21 | * clip: Input clip. Only 32-bit float RGB is supported.
 22 | 
 23 | * noise: Denoise level. (int -1/0/1/2/3, default=0)
 24 |   * -1 = none
 25 |   * 0 = low
 26 |   * 1 = medium
 27 |   * 2 = high
 28 |   * 3 = highest
 29 | 
 30 | * scale: Upscale ratio. (int 1/2, default=2)
 31 |   * 1 = no scaling, denoise only. upconv_7 doesn't support this mode.
 32 |   * 2 = upscale 2x.
 33 | 
 34 | * model: Model to use. (int 0/1/2, default=0)
 35 |   * 0 = upconv_7_anime_style_art_rgb
 36 |   * 1 = upconv_7_photo
 37 |   * 2 = cunet (For 2D artwork. Slow, but better quality.)
 38 | 
 39 | * tile_size: Tile size. Must be divisible by 4. Increasing this value may improve performance but uses more VRAM. (int >=32, default=0 to choose automatically)
 40 | 
 41 | * gpu_id: GPU device to use. (int >=0, default=0)
 42 | 
 43 | * gpu_thread: Number of threads that can simultaneously access GPU. (int >=1, default=0 to detect automatically)
 44 | 
 45 | * precision: Floating-point precision. Single-precision (fp32) is slow but more precise in color. Default is half-precision (fp16). (int 16/32, default=16)
 46 | 
 47 | * tile_size_w / tile_size_h: Override width and height of tile_size.
 48 | 
 49 | * tta: TTA (test-time augmentation) mode. (bool True/False, default=False)
 50 | 
 51 | ## Build
 52 | 
 53 | ### Linux
 54 | 
 55 | Install dependencies:
 56 | 
 57 | ```bash
 58 | # Arch Linux
 59 | sudo pacman -S vapoursynth glslang vulkan-icd-loader vulkan-headers
 60 | # Fedora
 61 | sudo dnf install vapoursynth-devel glslang vulkan-loader-devel vulkan-headers
 62 | ```
 63 | 
 64 | Get source code and build:
 65 | 
 66 | ```bash
 67 | # clone repository and submodule
 68 | git clone https://github.com/Nlzy/vapoursynth-waifu2x-ncnn-vulkan.git
 69 | cd vapoursynth-waifu2x-ncnn-vulkan
 70 | git submodule update --init --recursive
 71 | mkdir build
 72 | cd build
 73 | 
 74 | # build
 75 | cmake ..
 76 | cmake --build . -j 4
 77 | ```
 78 | 
 79 | ### Windows
 80 | 
 81 | Install [Vulkan SDK](https://vulkan.lunarg.com/sdk/home).
 82 | 
 83 | Open `Git Bash`, clone repository and submodule:
 84 | 
 85 | ```bash
 86 | git clone https://github.com/Nlzy/vapoursynth-waifu2x-ncnn-vulkan.git
 87 | cd vapoursynth-waifu2x-ncnn-vulkan
 88 | git submodule update --init --recursive
 89 | mkdir build
 90 | ```
 91 | 
 92 | Open `Start Menu` -> `Visual Studio 2019` -> `x64 Native Tools Command Prompt for VS 2019`, then build:
 93 | 
 94 | ```
 95 | cd X:\path_to_vapoursynth-waifu2x-ncnn-vulkan\build
 96 | cmake -G "NMake Makefiles" -DVAPOURSYNTH_HEADER_DIR=X:\path_to_vapoursynth\sdk\include\vapoursynth ..
 97 | cmake --build .
 98 | ```
 99 | 
100 | Note: If you are using VapourSynth "Portable" version, use `-DVAPOURSYNTH_HEADER_DIR=X:\path_to_vapoursynth\sdk\include` instead.
101 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | cmake_minimum_required(VERSION 3.9)  # must come before project() so policies are set before the project is configured
  2 | project(vapoursynth-waifu2x-ncnn-vulkan)
  3 | set(CMAKE_BUILD_TYPE Release)
  4 | 
  5 | find_package(Vulkan REQUIRED)
  6 | find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH)  # shader compiler used by compile_shader()
  7 | if(NOT GLSLANGVALIDATOR_EXECUTABLE)  # fail at configure time instead of running "...-NOTFOUND" during the build
  8 |     message(FATAL_ERROR "glslangValidator NOT FOUND, install glslang or the Vulkan SDK")
  9 | endif()
 10 | message(STATUS "Found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}")
 11 | macro(compile_shader SHADER_SRC)
 12 |     # SHADER_SRC is relative to this directory; the output is <base name>.spv.hex.h in the build tree.
 13 |     get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)
 14 |     set(SHADER_SRC_FULLPATH ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_SRC})
 15 |     set(SHADER_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.hex.h)
 16 |     add_custom_command(
 17 |             OUTPUT ${SHADER_SPV_HEX_FILE}
 18 |             COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} -V -s -x -o ${SHADER_SPV_HEX_FILE} ${SHADER_SRC_FULLPATH}
 19 |             COMMENT "Building SPIR-V module ${SHADER_SRC_NAME_WE}.spv"
 20 |             DEPENDS ${SHADER_SRC_FULLPATH}
 21 |             VERBATIM
 22 |     )
 23 |     list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_FILE})  # collected by the caller for the generate-spirv target
 24 |     set_source_files_properties(${SHADER_SPV_HEX_FILE} PROPERTIES GENERATED TRUE)
 25 | endmacro()
 26 | 
 27 | # Turn on link-time optimisation (IPO) globally when the toolchain supports it.
 28 | cmake_policy(SET CMP0069 NEW)
 29 | set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
 30 | include(CheckIPOSupported)
 31 | check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
 32 | if(NOT ipo_supported)
 33 |     message(WARNING "IPO is not supported: ${ipo_supported_output}")
 34 | else()
 35 |     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 36 | endif()
 37 | 
 38 | # build ncnn library from the deps/ncnn submodule; the options below are declared before add_subdirectory(deps/ncnn)
 39 | if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/deps/ncnn/CMakeLists.txt")
 40 |     message(FATAL_ERROR "The submodules were not downloaded! Please update submodules with \"git submodule update --init --recursive\" and try again.")
 41 | endif()
 42 | option(NCNN_INSTALL_SDK "" OFF)
 43 | option(NCNN_PIXEL_ROTATE "" OFF)
 44 | option(NCNN_PIXEL_AFFINE "" OFF)
 45 | option(NCNN_PIXEL_DRAWING "" OFF)
 46 | option(NCNN_VULKAN "" ON)  # this plugin drives ncnn through its Vulkan backend
 47 | option(NCNN_VULKAN_ONLINE_SPIRV "" ON)
 48 | option(NCNN_BUILD_BENCHMARK "" OFF)
 49 | option(NCNN_BUILD_TESTS "" OFF)
 50 | option(NCNN_BUILD_TOOLS "" OFF)
 51 | option(NCNN_BUILD_EXAMPLES "" OFF)
 52 | option(NCNN_DISABLE_RTTI "" ON)
 53 | option(NCNN_DISABLE_EXCEPTION "" ON)
 54 | option(NCNN_INT8 "" OFF)
 55 | option(NCNN_OPENMP "" OFF)
 56 | option(WITH_LAYER_absval "" OFF)
 57 | option(WITH_LAYER_argmax "" OFF)
 58 | option(WITH_LAYER_batchnorm "" OFF)
 59 | option(WITH_LAYER_bias "" OFF)
 60 | option(WITH_LAYER_bnll "" OFF)
 61 | option(WITH_LAYER_concat "" OFF)
 62 | option(WITH_LAYER_convolution "" ON)
 63 | option(WITH_LAYER_crop "" ON)
 64 | option(WITH_LAYER_deconvolution "" ON)
 65 | option(WITH_LAYER_dropout "" OFF)
 66 | option(WITH_LAYER_eltwise "" ON)
 67 | option(WITH_LAYER_elu "" OFF)
 68 | option(WITH_LAYER_embed "" OFF)
 69 | option(WITH_LAYER_exp "" OFF)
 70 | option(WITH_LAYER_flatten "" ON)
 71 | option(WITH_LAYER_innerproduct "" ON)
 72 | option(WITH_LAYER_input "" ON)
 73 | option(WITH_LAYER_log "" OFF)
 74 | option(WITH_LAYER_lrn "" OFF)
 75 | option(WITH_LAYER_memorydata "" OFF)
 76 | option(WITH_LAYER_mvn "" OFF)
 77 | option(WITH_LAYER_pooling "" ON)
 78 | option(WITH_LAYER_power "" OFF)
 79 | option(WITH_LAYER_prelu "" OFF)
 80 | option(WITH_LAYER_proposal "" OFF)
 81 | option(WITH_LAYER_reduction "" OFF)
 82 | option(WITH_LAYER_relu "" ON)
 83 | option(WITH_LAYER_reshape "" OFF)
 84 | option(WITH_LAYER_roipooling "" OFF)
 85 | option(WITH_LAYER_scale "" ON)
 86 | option(WITH_LAYER_sigmoid "" OFF)
 87 | option(WITH_LAYER_slice "" OFF)
 88 | option(WITH_LAYER_softmax "" OFF)
 89 | option(WITH_LAYER_split "" ON)
 90 | option(WITH_LAYER_spp "" OFF)
 91 | option(WITH_LAYER_tanh "" OFF)
 92 | option(WITH_LAYER_threshold "" OFF)
 93 | option(WITH_LAYER_tile "" OFF)
 94 | option(WITH_LAYER_rnn "" OFF)
 95 | option(WITH_LAYER_lstm "" OFF)
 96 | option(WITH_LAYER_binaryop "" OFF)
 97 | option(WITH_LAYER_unaryop "" OFF)
 98 | option(WITH_LAYER_convolutiondepthwise "" OFF)
 99 | option(WITH_LAYER_padding "" ON)
100 | option(WITH_LAYER_squeeze "" OFF)
101 | option(WITH_LAYER_expanddims "" OFF)
102 | option(WITH_LAYER_normalize "" OFF)
103 | option(WITH_LAYER_permute "" OFF)
104 | option(WITH_LAYER_priorbox "" OFF)
105 | option(WITH_LAYER_detectionoutput "" OFF)
106 | option(WITH_LAYER_interp "" ON)
107 | option(WITH_LAYER_deconvolutiondepthwise "" OFF)
108 | option(WITH_LAYER_shufflechannel "" OFF)
109 | option(WITH_LAYER_instancenorm "" OFF)
110 | option(WITH_LAYER_clip "" OFF)
111 | option(WITH_LAYER_reorg "" OFF)
112 | option(WITH_LAYER_yolodetectionoutput "" OFF)
113 | option(WITH_LAYER_quantize "" OFF)
114 | option(WITH_LAYER_dequantize "" OFF)
115 | option(WITH_LAYER_yolov3detectionoutput "" OFF)
116 | option(WITH_LAYER_psroipooling "" OFF)
117 | option(WITH_LAYER_roialign "" OFF)
118 | option(WITH_LAYER_packing "" ON)
119 | option(WITH_LAYER_requantize "" OFF)
120 | option(WITH_LAYER_cast "" ON)
121 | option(WITH_LAYER_hardsigmoid "" OFF)
122 | option(WITH_LAYER_selu "" OFF)
123 | option(WITH_LAYER_hardswish "" OFF)
124 | option(WITH_LAYER_noop "" OFF)
125 | option(WITH_LAYER_pixelshuffle "" OFF)
126 | option(WITH_LAYER_deepcopy "" OFF)
127 | option(WITH_LAYER_mish "" OFF)
128 | option(WITH_LAYER_statisticspooling "" OFF)
129 | option(WITH_LAYER_swish "" OFF)
130 | option(WITH_LAYER_gemm "" OFF)
131 | option(WITH_LAYER_groupnorm "" OFF)
132 | option(WITH_LAYER_layernorm "" OFF)
133 | option(WITH_LAYER_softplus "" OFF)
134 | option(WITH_LAYER_gru "" OFF)
135 | option(WITH_LAYER_multiheadattention "" OFF)
136 | option(WITH_LAYER_gelu "" OFF)
137 | option(WITH_LAYER_convolution1d "" OFF)
138 | option(WITH_LAYER_pooling1d "" OFF)
139 | option(WITH_LAYER_convolutiondepthwise1d "" OFF)
140 | option(WITH_LAYER_convolution3d "" OFF)
141 | option(WITH_LAYER_convolutiondepthwise3d "" OFF)
142 | option(WITH_LAYER_pooling3d "" OFF)
143 | option(WITH_LAYER_matmul "" OFF)
144 | option(WITH_LAYER_deconvolution1d "" OFF)
145 | option(WITH_LAYER_deconvolutiondepthwise1d "" OFF)
146 | option(WITH_LAYER_deconvolution3d "" OFF)
147 | option(WITH_LAYER_deconvolutiondepthwise3d "" OFF)
148 | add_subdirectory(deps/ncnn)
149 | 
150 | # generate-spirv target: one hex header per compute shader, in the order
151 | # preproc {fp16, fp32, tta_fp16, tta_fp32} followed by postproc {same four}.
152 | set(SHADER_SPV_HEX_FILES)
153 | foreach(SHADER_STAGE preproc postproc)
154 |     foreach(SHADER_VARIANT fp16 fp32 tta_fp16 tta_fp32)
155 |         compile_shader(src/waifu2x_${SHADER_STAGE}_${SHADER_VARIANT}.comp)
156 |     endforeach()
157 | endforeach()
158 | 
159 | # Aggregate target that depends on every generated header.
160 | add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES})
161 | # The generated *.spv.hex.h files are written to the build tree, so put it on the include path.
162 | include_directories(${CMAKE_CURRENT_BINARY_DIR})
163 | 
164 | # VapourSynth headers: the default location can be overridden with -DVAPOURSYNTH_HEADER_DIR=<dir>
165 | set(VAPOURSYNTH_HEADER_DIR "/usr/include/vapoursynth" CACHE PATH "VapourSynth header files")
166 | if(NOT EXISTS "${VAPOURSYNTH_HEADER_DIR}/VSHelper.h")
167 |     message(FATAL_ERROR "VapourSynth header files NOT FOUND, specify with -DVAPOURSYNTH_HEADER_DIR")
168 | endif()
169 | 
170 | # libvsw2xnvk: the plugin library; it depends on generate-spirv so the shader headers exist first
171 | add_library(vsw2xnvk SHARED src/vsw2xnvk.cpp src/waifu2x.cpp)
172 | add_dependencies(vsw2xnvk generate-spirv)
173 | target_link_libraries(vsw2xnvk ncnn ${Vulkan_LIBRARY})
174 | target_include_directories(vsw2xnvk PRIVATE ${VAPOURSYNTH_HEADER_DIR})
175 | 


--------------------------------------------------------------------------------
/src/vsw2xnvk.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   MIT License
  3 | 
  4 |   Copyright (c) 2018-2019 HolyWu
  5 |   Copyright (c) 2019-2020 NaLan ZeYu
  6 | 
  7 |   Permission is hereby granted, free of charge, to any person obtaining a copy
  8 |   of this software and associated documentation files (the "Software"), to deal
  9 |   in the Software without restriction, including without limitation the rights
 10 |   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 |   copies of the Software, and to permit persons to whom the Software is
 12 |   furnished to do so, subject to the following conditions:
 13 | 
 14 |   The above copyright notice and this permission notice shall be included in all
 15 |   copies or substantial portions of the Software.
 16 | 
 17 |   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 |   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 |   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 |   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 |   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 |   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |   SOFTWARE.
 24 | */
 25 | 
 26 | #include <fstream>
 27 | #include <algorithm>
 28 | #include "gpu.h"
 29 | #include "waifu2x.hpp"
 30 | #include "VSHelper.h"
 31 | 
 32 | static ncnn::Mutex instanceLock;  // held while instanceCounter is read or changed
 33 | static int instanceCounter = 0;   // incremented by tryCreateGpuInstance(), decremented by tryDestoryGpuInstance()
 34 | 
 35 | static int tryCreateGpuInstance() {
 36 |     ncnn::MutexLockGuard lg(instanceLock);
 37 |     if (instanceCounter++ == 0) {
 38 |         return ncnn::create_gpu_instance();
 39 |     } else {
 40 |         return 0;
 41 |     }
 42 | }
 43 | 
 44 | static void tryDestoryGpuInstance() {
 45 |     ncnn::MutexLockGuard lg(instanceLock);
 46 |     if (--instanceCounter == 0) {
 47 |         ncnn::destroy_gpu_instance();
 48 |     }
 49 | }
 50 | 
 51 | typedef struct {
 52 |     VSNodeRef *node;
 53 |     VSVideoInfo vi;
 54 |     Waifu2x *waifu2x;
 55 | } FilterData;
 56 | 
 57 | static int filter(const VSFrameRef *src, VSFrameRef *dst, FilterData * const VS_RESTRICT d, const VSAPI *vsapi) noexcept {
 58 |     const int srcStride = vsapi->getStride(src, 0) / static_cast<int>(sizeof(float));
 59 |     const int dstStride = vsapi->getStride(dst, 0) / static_cast<int>(sizeof(float));
 60 |     auto *             srcR = reinterpret_cast<const float *>(vsapi->getReadPtr(src, 0));
 61 |     auto *             srcG = reinterpret_cast<const float *>(vsapi->getReadPtr(src, 1));
 62 |     auto *             srcB = reinterpret_cast<const float *>(vsapi->getReadPtr(src, 2));
 63 |     auto * VS_RESTRICT dstR = reinterpret_cast<float *>(vsapi->getWritePtr(dst, 0));
 64 |     auto * VS_RESTRICT dstG = reinterpret_cast<float *>(vsapi->getWritePtr(dst, 1));
 65 |     auto * VS_RESTRICT dstB = reinterpret_cast<float *>(vsapi->getWritePtr(dst, 2));
 66 |     return d->waifu2x->process(srcR, srcG, srcB, dstR, dstG, dstB, srcStride, dstStride);
 67 | }
 68 | 
 69 | static void VS_CC filterInit(VSMap *in, VSMap *out, void **instanceData, VSNode *node, VSCore *core, const VSAPI *vsapi) {
 70 |     auto *d = static_cast<FilterData *>(*instanceData);
 71 |     vsapi->setVideoInfo(&d->vi, 1, node);
 72 | }
 73 | 
 74 | static const VSFrameRef *VS_CC filterGetFrame(int n, int activationReason, void **instanceData, void **frameData, VSFrameContext *frameCtx, VSCore *core, const VSAPI *vsapi) {
 75 |     auto *d = static_cast<FilterData *>(*instanceData);
 76 | 
 77 |     if (activationReason == arInitial) {
 78 |         vsapi->requestFrameFilter(n, d->node, frameCtx);
 79 |     } else if (activationReason == arAllFramesReady) {
 80 |         auto src = vsapi->getFrameFilter(n, d->node, frameCtx);
 81 |         auto dst = vsapi->newVideoFrame(d->vi.format, d->vi.width, d->vi.height, src, core);
 82 | 
 83 |         int err = filter(src, dst, d, vsapi);
 84 |         switch (err) {
 85 |             case Waifu2x::ERROR_OK:
 86 |                 vsapi->freeFrame(src);
 87 |                 return dst;
 88 |             case Waifu2x::ERROR_EXTRACTOR:
 89 |                 vsapi->setFilterError("Waifu2x-NCNN-Vulkan: Waifu2x extractor error. Try to decrease tile_size or gpu_thread", frameCtx);
 90 |                 vsapi->freeFrame(src);
 91 |                 vsapi->freeFrame(dst);
 92 |                 return nullptr;
 93 |             case Waifu2x::ERROR_DOWNLOAD:
 94 |             case Waifu2x::ERROR_UPLOAD:
 95 |             case Waifu2x::ERROR_SUBMIT:
 96 |                 vsapi->setFilterError("Waifu2x-NCNN-Vulkan: Waifu2x submit error. Try to decrease gpu_thread", frameCtx);
 97 |                 vsapi->freeFrame(src);
 98 |                 vsapi->freeFrame(dst);
 99 |                 return nullptr;
100 |         }
101 |     }
102 | 
103 |     return nullptr;
104 | }
105 | 
106 | static void VS_CC filterFree(void *instanceData, VSCore *core, const VSAPI *vsapi) {
107 |     auto *d = static_cast<FilterData *>(instanceData);
108 |     vsapi->freeNode(d->node);
109 |     delete d->waifu2x;
110 |     delete d;
111 |     tryDestoryGpuInstance();
112 | }
113 | 
114 | static void VS_CC filterCreate(const VSMap *in, VSMap *out, void *userData, VSCore *core, const VSAPI *vsapi) {
115 |     FilterData d{};
116 |     d.node = vsapi->propGetNode(in, "clip", 0, nullptr);
117 |     d.vi = *vsapi->getVideoInfo(d.node);
118 | 
119 |     int gpuId, noise, scale, model, tileSizeW, tileSizeH, gpuThread, precision, tta;
120 |     std::string paramPath, modelPath;
121 |     char const * err_prompt = nullptr;
122 |     do {
123 |         int err;
124 | 
125 |         err = tryCreateGpuInstance();
126 |         if (err) {
127 |             err_prompt = "create gpu instance failed";
128 |             break;
129 |         }
130 | 
131 |         if (!isConstantFormat(&d.vi) || d.vi.format->colorFamily != cmRGB || d.vi.format->sampleType != stFloat || d.vi.format->bitsPerSample != 32) {
132 |             err_prompt = "only constant RGB format and 32 bit float input supported";
133 |             break;
134 |         }
135 | 
136 |         gpuId = int64ToIntS(vsapi->propGetInt(in, "gpu_id", 0, &err));
137 |         if (gpuId < 0 || gpuId >= ncnn::get_gpu_count()) {
138 |             err_prompt = "invalid 'gpu_id'";
139 |             break;
140 |         }
141 | 
142 |         noise = int64ToIntS(vsapi->propGetInt(in, "noise", 0, &err));
143 |         if (noise < -1 || noise > 3) {
144 |             err_prompt = "'noise' must be -1, 0, 1, 2, or 3";
145 |             break;
146 |         }
147 | 
148 |         scale = int64ToIntS(vsapi->propGetInt(in, "scale", 0, &err));
149 |         if (err)
150 |             scale = 2;
151 |         if (scale != 1 && scale != 2) {
152 |             err_prompt = "'scale' must be 1 or 2";
153 |             break;
154 |         }
155 | 
156 |         model = int64ToIntS(vsapi->propGetInt(in, "model", 0, &err));
157 |         if (model < 0 || model > 2) {
158 |             err_prompt = "'model' must be 0, 1 or 2";
159 |             break;
160 |         }
161 | 
162 |         precision = int64ToIntS(vsapi->propGetInt(in, "precision", 0, &err));
163 |         if (err)
164 |             precision = 16;
165 |         if (precision != 16 && precision != 32) {
166 |             err_prompt = "'precision' must be 16 or 32";
167 |             break;
168 |         }
169 | 
170 |         tta = int64ToIntS(vsapi->propGetInt(in, "tta", 0, &err));
171 |         if (err)
172 |             tta = 0;
173 |         if (tta != 0)
174 |             tta = 1;
175 | 
176 |         int customGpuThread = int64ToIntS(vsapi->propGetInt(in, "gpu_thread", 0, &err));
177 |         if (customGpuThread > 0) {
178 |             gpuThread = customGpuThread;
179 |         }
180 |         else {
181 |             gpuThread = int64ToIntS(ncnn::get_gpu_info(gpuId).transfer_queue_count());
182 |         }
183 |         gpuThread = std::min(gpuThread, int64ToIntS(ncnn::get_gpu_info(gpuId).compute_queue_count()));
184 | 
185 |         int tileSize = int64ToIntS(vsapi->propGetInt(in, "tile_size", 0, &err));
186 |         if (tileSize == 0) {
187 |             double vram = ncnn::get_gpu_device(gpuId)->get_heap_budget(); // in MByte
188 |             double factor = (precision == 32 ? 2 : 1) * (model == 2 ? 1.5 : 1) * gpuThread;
189 |             if (vram / factor > 900)
190 |                 tileSize = 360;
191 |             else if (vram / factor > 450)
192 |                 tileSize = 240;
193 |             else
194 |                 tileSize = 180;
195 |         }
196 |         if (tileSize < 32) {
197 |             err_prompt = "'tile_size' must be greater than or equal to 32";
198 |             break;
199 |         }
200 |         if (tileSize % 4) {
201 |             err_prompt = "'tile_size' must be multiple of 4";
202 |             break;
203 |         }
204 |         tileSizeW = tileSizeH = tileSize;
205 | 
206 |         int tw = int64ToIntS(vsapi->propGetInt(in, "tile_size_w", 0, &err));
207 |         if (!err) {
208 |             if (tw < 32) {
209 |                 err_prompt = "'tile_size_w' must be greater than or equal to 32";
210 |                 break;
211 |             }
212 |             if (tw % 4) {
213 |                 err_prompt = "'tile_size_w' must be multiple of 4";
214 |                 break;
215 |             }
216 |             tileSizeW = tw;
217 |         }
218 | 
219 |         int th = int64ToIntS(vsapi->propGetInt(in, "tile_size_h", 0, &err));
220 |         if (!err) {
221 |             if (th < 32) {
222 |                 err_prompt = "'tile_size_h' must be greater than or equal to 32";
223 |                 break;
224 |             }
225 |             if (th % 4) {
226 |                 err_prompt = "'tile_size_h' must be multiple of 4";
227 |                 break;
228 |             }
229 |             tileSizeH = th;
230 |         }
231 | 
232 |         if (scale == 1 && noise == -1) {
233 |             err_prompt = "use 'noise=-1' and 'scale=1' at same time is useless";
234 |             break;
235 |         }
236 | 
237 |         if (scale == 1 && model != 2) {
238 |             err_prompt = "only cunet model support 'scale=1'";
239 |             break;
240 |         }
241 | 
242 |         // set model path
243 |         const std::string pluginFilePath{ vsapi->getPluginPath(vsapi->getPluginById("net.nlzy.vsw2xnvk", core)) };
244 |         const std::string pluginDir = pluginFilePath.substr(0, pluginFilePath.find_last_of('/'));
245 | 
246 |         std::string modelsDir;
247 |         if (model == 0)
248 |             modelsDir += pluginDir + "/models-upconv_7_anime_style_art_rgb/";
249 |         else if (model == 1)
250 |             modelsDir += pluginDir + "/models-upconv_7_photo/";
251 |         else
252 |             modelsDir += pluginDir + "/models-cunet/";
253 | 
254 |         std::string modelName;
255 |         if (noise == -1)
256 |             modelName = "scale2.0x_model";
257 |         else if (scale == 1)
258 |             modelName = "noise" + std::to_string(noise) + "_model";
259 |         else
260 |             modelName = "noise" + std::to_string(noise) + "_scale2.0x_model";
261 | 
262 |         paramPath = modelsDir + modelName + ".param";
263 |         modelPath = modelsDir + modelName + ".bin";
264 | 
265 |         // check model file readable
266 |         std::ifstream pf(paramPath);
267 |         std::ifstream mf(modelPath);
268 |         if (!pf.good() || !mf.good()) {
269 |             err_prompt = "can't open model file";
270 |             break;
271 |         }
272 | 
273 |         break;
274 |     } while (false);
275 | 
276 |     if (err_prompt) {
277 |         vsapi->setError(out, (std::string{"Waifu2x-NCNN-Vulkan: "} + err_prompt).c_str());
278 |         vsapi->freeNode(d.node);
279 |         tryDestoryGpuInstance();
280 |         return;
281 |     }
282 | 
283 |     int prepadding;
284 |     if (model == 2 && scale == 1)
285 |         prepadding = 28;
286 |     else if (model == 2)
287 |         prepadding = 18;
288 |     else
289 |         prepadding = 7;
290 | 
291 |     d.waifu2x = new Waifu2x(d.vi.width, d.vi.height, scale, tileSizeW, tileSizeH, gpuId, gpuThread, precision, tta, prepadding, paramPath, modelPath);
292 |     d.vi.width *= scale;
293 |     d.vi.height *= scale;
294 | 
295 |     auto *data = new FilterData{ d };
296 | 
297 |     vsapi->createFilter(in, out, "Waifu2x", filterInit, filterGetFrame, filterFree, fmParallel, 0, data, core);
298 | }
299 | 
300 | VS_EXTERNAL_API(void) VapourSynthPluginInit(VSConfigPlugin configFunc, VSRegisterFunction registerFunc, VSPlugin *plugin) {
301 |     configFunc("net.nlzy.vsw2xnvk", "w2xnvk", "VapourSynth Waifu2x NCNN Vulkan Plugin", VAPOURSYNTH_API_VERSION, 1, plugin);
302 |     registerFunc("Waifu2x", "clip:clip;"
303 |                             "noise:int:opt;"
304 |                             "scale:int:opt;"
305 |                             "model:int:opt;"
306 |                             "tile_size:int:opt;"
307 |                             "gpu_id:int:opt;"
308 |                             "gpu_thread:int:opt;"
309 |                             "precision:int:opt;"
310 |                             "tile_size_w:int:opt;"
311 |                             "tile_size_h:int:opt;"
312 |                             "tta:int:opt;"
313 |                             , filterCreate, nullptr, plugin);
314 | }
315 | 


--------------------------------------------------------------------------------
/src/waifu2x.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |   MIT License
  3 | 
  4 |   Copyright (c) 2019 nihui
  5 |   Copyright (c) 2019-2020 NaLan ZeYu
  6 | 
  7 |   Permission is hereby granted, free of charge, to any person obtaining a copy
  8 |   of this software and associated documentation files (the "Software"), to deal
  9 |   in the Software without restriction, including without limitation the rights
 10 |   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 |   copies of the Software, and to permit persons to whom the Software is
 12 |   furnished to do so, subject to the following conditions:
 13 | 
 14 |   The above copyright notice and this permission notice shall be included in all
 15 |   copies or substantial portions of the Software.
 16 | 
 17 |   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 |   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 |   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 |   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 |   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 |   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 23 |   SOFTWARE.
 24 | */
 25 | 
 26 | #include <algorithm>
 27 | #include <vector>
 28 | #include "waifu2x.hpp"
 29 | 
// Integer division of a by b rounded up (intended for positive operands).
// Note: both arguments are evaluated more than once.
#define DIV_CEIL(a, b) (((a) + (b) - 1) / (b))
// Amount that has to be added to a to reach the next multiple of b
// (0 when a is already a multiple of b).
#define PAD_TO_ALIGN(a, b) ((((a) + (b) - 1) / (b)) * (b) - (a))
 32 | 
// Embedded compute shader binaries. Each array is filled from a
// *.spv.hex.h header; there is one variant per stage (preproc / postproc),
// per mode (plain / tta) and per storage precision (fp32 / fp16).

// Pre-processing shaders.
static const uint32_t waifu2x_preproc_fp32_spv_data[] = {
    #include "waifu2x_preproc_fp32.spv.hex.h"
};
static const uint32_t waifu2x_preproc_fp16_spv_data[] = {
    #include "waifu2x_preproc_fp16.spv.hex.h"
};
static const uint32_t waifu2x_preproc_tta_fp32_spv_data[] = {
    #include "waifu2x_preproc_tta_fp32.spv.hex.h"
};
static const uint32_t waifu2x_preproc_tta_fp16_spv_data[] = {
    #include "waifu2x_preproc_tta_fp16.spv.hex.h"
};

// Post-processing shaders.
static const uint32_t waifu2x_postproc_fp32_spv_data[] = {
    #include "waifu2x_postproc_fp32.spv.hex.h"
};
static const uint32_t waifu2x_postproc_fp16_spv_data[] = {
    #include "waifu2x_postproc_fp16.spv.hex.h"
};
static const uint32_t waifu2x_postproc_tta_fp32_spv_data[] = {
    #include "waifu2x_postproc_tta_fp32.spv.hex.h"
};
static const uint32_t waifu2x_postproc_tta_fp16_spv_data[] = {
    #include "waifu2x_postproc_tta_fp16.spv.hex.h"
};
 58 | 
 59 | 
 60 | Waifu2x::Waifu2x(int width, int height, int scale, int tilesizew, int tilesizeh, int gpuid, int gputhread,
 61 |     int precision, int tta, int prepadding, const std::string& parampath, const std::string& modelpath) :
 62 |     width(width), height(height), scale(scale), tilesizew(tilesizew), tilesizeh(tilesizeh), prepadding(prepadding), tta(tta), semaphore(gputhread)
 63 | {
 64 |     net.opt.use_vulkan_compute = true;
 65 |     net.opt.use_fp16_packed = precision == 16;
 66 |     net.opt.use_fp16_storage = precision == 16;
 67 |     net.opt.use_fp16_arithmetic = false;
 68 |     net.opt.use_int8_storage = false;
 69 |     net.opt.use_int8_arithmetic = false;
 70 |     net.set_vulkan_device(gpuid);
 71 |     net.load_param(parampath.c_str());
 72 |     net.load_model(modelpath.c_str());
 73 | 
 74 |     std::vector<ncnn::vk_specialization_type> specializations;
 75 |     waifu2x_preproc = new ncnn::Pipeline(net.vulkan_device());
 76 |     waifu2x_preproc->set_optimal_local_size_xyz(8, 8, 3);
 77 |     if (tta) {
 78 |         if (net.opt.use_fp16_storage)
 79 |             waifu2x_preproc->create(waifu2x_preproc_tta_fp16_spv_data, sizeof(waifu2x_preproc_tta_fp16_spv_data), specializations);
 80 |         else
 81 |             waifu2x_preproc->create(waifu2x_preproc_tta_fp32_spv_data, sizeof(waifu2x_preproc_tta_fp32_spv_data), specializations);
 82 |     } else {
 83 |         if (net.opt.use_fp16_storage)
 84 |             waifu2x_preproc->create(waifu2x_preproc_fp16_spv_data, sizeof(waifu2x_preproc_fp16_spv_data), specializations);
 85 |         else
 86 |             waifu2x_preproc->create(waifu2x_preproc_fp32_spv_data, sizeof(waifu2x_preproc_fp32_spv_data), specializations);
 87 |     }
 88 | 
 89 | 
 90 |     waifu2x_postproc = new ncnn::Pipeline(net.vulkan_device());
 91 |     waifu2x_postproc->set_optimal_local_size_xyz(8, 8, 3);
 92 |     if (tta) {
 93 |         if (net.opt.use_fp16_storage)
 94 |             waifu2x_postproc->create(waifu2x_postproc_tta_fp16_spv_data, sizeof(waifu2x_postproc_tta_fp16_spv_data), specializations);
 95 |         else
 96 |             waifu2x_postproc->create(waifu2x_postproc_tta_fp32_spv_data, sizeof(waifu2x_postproc_tta_fp32_spv_data), specializations);
 97 |     } else {
 98 |         if (net.opt.use_fp16_storage)
 99 |             waifu2x_postproc->create(waifu2x_postproc_fp16_spv_data, sizeof(waifu2x_postproc_fp16_spv_data), specializations);
100 |         else
101 |             waifu2x_postproc->create(waifu2x_postproc_fp32_spv_data, sizeof(waifu2x_postproc_fp32_spv_data), specializations);
102 |     }
103 | }
104 | 
// Releases the two compute pipelines allocated with new in the constructor.
Waifu2x::~Waifu2x() {
    delete waifu2x_preproc;
    delete waifu2x_postproc;
}
109 | 
110 | int Waifu2x::process(const float *srcR, const float *srcG, const float *srcB,
111 |                      float *dstR, float *dstG, float *dstB,
112 |                      const ptrdiff_t srcStride, const ptrdiff_t dstStride) const {
113 |     semaphore.wait();
114 | 
115 |     ncnn::VkAllocator* blob_vkallocator = net.vulkan_device()->acquire_blob_allocator();
116 |     ncnn::VkAllocator* staging_vkallocator = net.vulkan_device()->acquire_staging_allocator();
117 |     ncnn::Option opt = net.opt;
118 |     opt.blob_vkallocator = blob_vkallocator;
119 |     opt.workspace_vkallocator = blob_vkallocator;
120 |     opt.staging_vkallocator = staging_vkallocator;
121 | 
122 |     const int xtiles = DIV_CEIL(width, tilesizew);
123 |     const int ytiles = DIV_CEIL(height, tilesizeh);
124 | 
125 |     for (int yi = 0; yi < ytiles; yi++) {
126 |         ncnn::VkCompute cmd(net.vulkan_device());
127 | 
128 |         const int tile_nopad_y0 = yi * tilesizeh;
129 |         const int tile_nopad_y1 = std::min(tile_nopad_y0 + tilesizeh, height);
130 |         const int tile_nopad_h = tile_nopad_y1 - tile_nopad_y0;
131 |         const int prepadding_bottom = prepadding + PAD_TO_ALIGN(tile_nopad_h, 4 / scale);
132 |         const int tile_pad_y0 = std::max(tile_nopad_y0 - prepadding, 0);
133 |         const int tile_pad_y1 = std::min(tile_nopad_y1 + prepadding_bottom, height);
134 |         const int tile_pad_h = tile_pad_y1 - tile_pad_y0;
135 | 
136 | 
137 |         // upload
138 |         ncnn::Mat in(width, tile_pad_h, RGB_CHANNELS, sizeof(float));
139 |         for (int y = 0; y < tile_pad_h; y++) {
140 |             memcpy((float*)in.channel(0) + y * width, srcR + (y + tile_pad_y0) * srcStride, sizeof(float) * width);
141 |             memcpy((float*)in.channel(1) + y * width, srcG + (y + tile_pad_y0) * srcStride, sizeof(float) * width);
142 |             memcpy((float*)in.channel(2) + y * width, srcB + (y + tile_pad_y0) * srcStride, sizeof(float) * width);
143 |         }
144 |         
145 |         ncnn::VkMat in_gpu;
146 |         cmd.record_clone(in, in_gpu, opt);
147 |         if (xtiles > 1) {
148 |             if (cmd.submit_and_wait()) {
149 |                 return ERROR_UPLOAD;
150 |             }
151 |             cmd.reset();
152 |         }
153 | 
154 | 
155 |         ncnn::VkMat out_gpu;
156 |         out_gpu.create(width * scale, tile_nopad_h * scale, RGB_CHANNELS, sizeof(float), blob_vkallocator);
157 | 
158 |         for (int xi = 0; xi < xtiles; xi++) {
159 |             const int tile_nopad_x0 = xi * tilesizew;
160 |             const int tile_nopad_x1 = std::min(tile_nopad_x0 + tilesizew, width);
161 |             const int tile_nopad_w = tile_nopad_x1 - tile_nopad_x0;
162 |             const int prepadding_right = prepadding + PAD_TO_ALIGN(tile_nopad_w, 4 / scale);
163 | 
164 |             const int waifu2x_times = tta ? 8 : 1;
165 | 
166 |             std::vector<ncnn::VkMat> in_tile_gpu(waifu2x_times);
167 |             if (tta) {
168 |                 for (int i = 0; i < 4; i++) {
169 |                     in_tile_gpu[i].create(
170 |                         tile_nopad_x1 - tile_nopad_x0 + prepadding + prepadding_right,
171 |                         tile_nopad_y1 - tile_nopad_y0 + prepadding + prepadding_bottom,
172 |                         RGB_CHANNELS, net.opt.use_fp16_storage ? 2u : 4u, 1, blob_vkallocator);
173 |                 }
174 |                 for (int i = 0; i < 4; i++) {
175 |                     in_tile_gpu[4 + i].create(
176 |                         tile_nopad_y1 - tile_nopad_y0 + prepadding + prepadding_bottom,
177 |                         tile_nopad_x1 - tile_nopad_x0 + prepadding + prepadding_right,
178 |                         RGB_CHANNELS, net.opt.use_fp16_storage ? 2u : 4u, 1, blob_vkallocator);
179 |                 }
180 |             } else {
181 |                 in_tile_gpu[0].create(
182 |                     tile_nopad_x1 - tile_nopad_x0 + prepadding + prepadding_right,
183 |                     tile_nopad_y1 - tile_nopad_y0 + prepadding + prepadding_bottom,
184 |                     RGB_CHANNELS, net.opt.use_fp16_storage ? 2u : 4u, 1, blob_vkallocator);
185 |             }
186 | 
187 |             // preproc
188 |             {
189 |                 std::vector<ncnn::VkMat> bindings(1 + waifu2x_times);
190 |                 bindings[0] = in_gpu;
191 |                 for (int i = 0; i < waifu2x_times; ++i) {
192 |                     bindings[1 + i] = in_tile_gpu[i];
193 |                 }
194 | 
195 |                 std::vector<ncnn::vk_constant_type> constants(10);
196 |                 constants[0].i = in_gpu.w;
197 |                 constants[1].i = in_gpu.h;
198 |                 constants[2].i = in_gpu.cstep;
199 |                 constants[3].i = in_tile_gpu[0].w;
200 |                 constants[4].i = in_tile_gpu[0].h;
201 |                 constants[5].i = in_tile_gpu[0].cstep;
202 |                 constants[6].i = prepadding;
203 |                 constants[7].i = prepadding;
204 |                 constants[8].i = tile_nopad_x0;
205 |                 constants[9].i = std::min(tile_nopad_y0, prepadding);
206 | 
207 |                 ncnn::VkMat dispatcher;
208 |                 dispatcher.w = in_tile_gpu[0].w;
209 |                 dispatcher.h = in_tile_gpu[0].h;
210 |                 dispatcher.c = RGB_CHANNELS;
211 | 
212 |                 cmd.record_pipeline(waifu2x_preproc, bindings, constants, dispatcher);
213 |             }
214 | 
215 | 
216 |             // waifu2x
217 |             std::vector<ncnn::VkMat> out_tile_gpu(waifu2x_times);
218 | 
219 |             for (int i = 0; i < waifu2x_times; ++i) {
220 |                 ncnn::Extractor ex = net.create_extractor();
221 |                 ex.set_blob_vkallocator(blob_vkallocator);
222 |                 ex.set_workspace_vkallocator(blob_vkallocator);
223 |                 ex.set_staging_vkallocator(staging_vkallocator);
224 | 
225 |                 ex.input("Input1", in_tile_gpu[i]);
226 | 
227 |                 if (ex.extract("Eltwise4", out_tile_gpu[i], cmd)) {
228 |                     return ERROR_EXTRACTOR;
229 |                 }
230 |             }
231 | 
232 | 
233 |             // postproc
234 |             {
235 |                 std::vector<ncnn::VkMat> bindings(waifu2x_times + 1);
236 |                 for (int i = 0; i < waifu2x_times; ++i) {
237 |                     bindings[i] = out_tile_gpu[i];
238 |                 }
239 |                 bindings.back() = out_gpu;
240 | 
241 |                 std::vector<ncnn::vk_constant_type> constants(8);
242 |                 constants[0].i = out_tile_gpu[0].w;
243 |                 constants[1].i = out_tile_gpu[0].h;
244 |                 constants[2].i = out_tile_gpu[0].cstep;
245 |                 constants[3].i = out_gpu.w;
246 |                 constants[4].i = out_gpu.h;
247 |                 constants[5].i = out_gpu.cstep;
248 |                 constants[6].i = tile_nopad_x0 * scale;
249 |                 constants[7].i = std::min(out_gpu.w - tile_nopad_x0 * scale, tilesizew * scale);
250 | 
251 |                 ncnn::VkMat dispatcher;
252 |                 dispatcher.w = std::min(out_gpu.w - tile_nopad_x0 * scale, tilesizew * scale);
253 |                 dispatcher.h = out_gpu.h;
254 |                 dispatcher.c = RGB_CHANNELS;
255 | 
256 |                 cmd.record_pipeline(waifu2x_postproc, bindings, constants, dispatcher);
257 |             }
258 | 
259 | 
260 |             if (xtiles > 1) {
261 |                 if (cmd.submit_and_wait()) {
262 |                     return ERROR_SUBMIT;
263 |                 }
264 |                 cmd.reset();
265 |             }
266 |         }
267 | 
268 |         // download
269 |         {
270 |             ncnn::Mat out;
271 |             cmd.record_clone(out_gpu, out, opt);
272 |             if (cmd.submit_and_wait()) {
273 |                 return ERROR_DOWNLOAD;
274 |             }
275 | 
276 |             for (int y = 0; y < out.h; y++) {
277 |                 memcpy(dstR + tile_nopad_y0 * scale * dstStride + y * dstStride, (float *)out.channel(0) + y * out.w, out.w * sizeof(float));
278 |                 memcpy(dstG + tile_nopad_y0 * scale * dstStride + y * dstStride, (float *)out.channel(1) + y * out.w, out.w * sizeof(float));
279 |                 memcpy(dstB + tile_nopad_y0 * scale * dstStride + y * dstStride, (float *)out.channel(2) + y * out.w, out.w * sizeof(float));
280 |             }
281 |         }
282 |     }
283 | 
284 |     net.vulkan_device()->reclaim_blob_allocator(blob_vkallocator);
285 |     net.vulkan_device()->reclaim_staging_allocator(staging_vkallocator);
286 |     semaphore.signal(); // release only when successful
287 |     return ERROR_OK;
288 | }
289 | 


--------------------------------------------------------------------------------