├── source
├── config.h.in
├── internalfilters.h
└── source.cpp
├── .gitmodules
├── README.md
├── .github
└── workflows
│ ├── linux.yml
│ └── windows.yml
├── CMakeLists.txt
└── COPYING.LESSER
/source/config.h.in:
--------------------------------------------------------------------------------
1 | #define VERSION "@VCS_TAG@" /* @VCS_TAG@ is filled in by configure_file() in CMakeLists.txt; source.cpp returns this string from its Version function */
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "thirdparty/vectorclass"]
2 | path = thirdparty/vectorclass
3 | url = https://github.com/vectorclass/version2.git
4 |
--------------------------------------------------------------------------------
/source/internalfilters.h:
--------------------------------------------------------------------------------
1 | VS_EXTERNAL_API(void) vsExtBoxBlurInitialize3( // plugin entry point; source.cpp defines it under this name when INSIDE_VS is defined
2 | VSConfigPlugin configFunc, // invoked once by the definition to set the plugin identifier and namespace
3 | VSRegisterFunction registerFunc, // invoked once per exported filter function
4 | VSPlugin * plugin // handle forwarded unchanged to both callbacks
5 | ) noexcept;
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # vs-boxblur
2 | AVX2-vectorized box filter.
3 |
4 | For integer input, it favors architectures with a fast cross-lane shuffle (e.g. Intel Haswell or later) or slow integer division (e.g. AMD architectures before Zen 3).
5 |
6 | ## Usage
7 | Prototype:
8 |
9 | `box.Blur(vnode clip[, int[] planes, int hradius = 1, int hpasses = 1, int vradius = 1, int vpasses = 1])`
10 |
11 | ## Building
12 | ```bash
13 | cmake -S . -B build -D CMAKE_BUILD_TYPE=Release
14 | cmake --build build
15 | cmake --install build
16 | ```
17 |
--------------------------------------------------------------------------------
/.github/workflows/linux.yml:
--------------------------------------------------------------------------------
1 | name: Build (Linux)
2 |
3 | on:
4 |   push:
5 |     paths:
6 |       - 'source/source.cpp'
7 |       - 'CMakeLists.txt'
8 |       - '.github/workflows/linux.yml'
9 |   workflow_dispatch:
10 |
11 | jobs:
12 |   build-linux:
13 |     runs-on: ubuntu-22.04 # the ubuntu-20.04 hosted image has been retired
14 |     steps:
15 |       - name: Checkout repo
16 |         uses: actions/checkout@v4
17 |         with:
18 |           fetch-depth: 0 # full history so `git describe --tags` in CMakeLists.txt can see tags
19 |           submodules: true # thirdparty/vectorclass
20 |
21 |       - name: Setup GCC and Ninja
22 |         run: |
23 |           sudo apt-get update
24 |           sudo apt-get install -y g++-11 ninja-build
25 |           echo "CC=gcc-11" >> $GITHUB_ENV
26 |           echo "CXX=g++-11" >> $GITHUB_ENV
27 |
28 |       - name: Download VapourSynth headers
29 |         run: |
30 |           wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
31 |           unzip -q vs.zip
32 |           mv vapoursynth*/ vapoursynth
33 |
34 |       # The extra flags go into CMAKE_CXX_FLAGS so that CMake's default
35 |       # Release flags (optimisation level, NDEBUG) stay in effect; overriding
36 |       # CMAKE_CXX_FLAGS_RELEASE would replace them.
37 |       - name: Configure
38 |         run: cmake -S . -B build -G Ninja
39 |           -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include"
40 |           -D CMAKE_BUILD_TYPE=Release
41 |           -D CMAKE_CXX_FLAGS="-ffast-math -march=x86-64-v3 -Wall"
42 |
43 |       - name: Build
44 |         run: cmake --build build --config Release --verbose
45 |
46 |       - name: Install
47 |         run: cmake --install build --prefix artifact
48 |
49 |       - name: Upload
50 |         uses: actions/upload-artifact@v4 # v3 of this action is no longer served
51 |         with:
52 |           name: VapourSynth-BoxBlur-Linux
53 |           path: artifact
51 |
52 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20.0)
2 |
3 | project(BoxBlur VERSION 0.1 LANGUAGES CXX)
4 |
5 | # instrset_detect.cpp and the vectorclass headers live in a git submodule
6 | # (see .gitmodules). Fail early with a hint when it has not been checked out.
7 | if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/vectorclass/instrset_detect.cpp")
8 |     message(FATAL_ERROR
9 |         "thirdparty/vectorclass is missing; run `git submodule update --init`")
10 | endif()
11 |
12 | add_library(boxblur SHARED
13 |     source/source.cpp
14 |     thirdparty/vectorclass/instrset_detect.cpp)
15 |
16 | # The binary dir holds the generated config.h (see configure_file below).
17 | target_include_directories(boxblur PRIVATE
18 |     thirdparty/vectorclass
19 |     "${CMAKE_CURRENT_BINARY_DIR}")
20 |
21 | set_target_properties(boxblur PROPERTIES
22 |     CXX_EXTENSIONS OFF
23 |     CXX_STANDARD 17
24 |     CXX_STANDARD_REQUIRED ON)
25 |
26 | # The plugin is built for AVX2 + FMA unconditionally.
27 | if(MSVC)
28 |     target_compile_options(boxblur PRIVATE /arch:AVX2)
29 | else()
30 |     target_compile_options(boxblur PRIVATE -mavx2 -mfma)
31 | endif()
32 |
33 | # Preferred path: locate VapourSynth through pkg-config and install the
34 | # plugin into <VS libdir>/vapoursynth.
35 | find_package(PkgConfig QUIET MODULE)
36 |
37 | if(PKG_CONFIG_FOUND)
38 |     pkg_search_module(VS vapoursynth)
39 |
40 |     if(VS_FOUND)
41 |         message(STATUS "Found VapourSynth r${VS_VERSION}")
42 |
43 |         cmake_path(APPEND install_dir "${VS_LIBDIR}" vapoursynth)
44 |         target_include_directories(boxblur PRIVATE ${VS_INCLUDE_DIRS})
45 |
46 |         install(TARGETS boxblur LIBRARY DESTINATION "${install_dir}")
47 |     endif()
48 | endif()
49 |
50 | # Fallback: the user points VS_INCLUDE_DIR at the VapourSynth headers and the
51 | # plugin is installed to the default LIBRARY/RUNTIME destinations.
52 | if(NOT VS_FOUND)
53 |     set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers")
54 |
55 |     # STREQUAL, not EQUAL: EQUAL compares numbers and is never true for "".
56 |     if("${VS_INCLUDE_DIR}" STREQUAL "")
57 |         message(WARNING "VapourSynth not found")
58 |     else()
59 |         target_include_directories(boxblur PRIVATE "${VS_INCLUDE_DIR}")
60 |     endif()
61 |
62 |     install(TARGETS boxblur LIBRARY RUNTIME)
63 | endif()
64 |
65 | # Derive the version string from git. PROJECT_SOURCE_DIR keeps this pointed
66 | # at this repository even when the project is added as a subdirectory.
67 | find_package(Git QUIET)
68 | if(GIT_FOUND)
69 |     execute_process(
70 |         COMMAND "${GIT_EXECUTABLE}" describe --tags --long --always
71 |         WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
72 |         OUTPUT_VARIABLE VCS_TAG
73 |         OUTPUT_STRIP_TRAILING_WHITESPACE
74 |         ERROR_QUIET
75 |     )
76 | endif()
77 |
78 | if(VCS_TAG)
79 |     message(STATUS "VapourSynth-BoxBlur ${VCS_TAG}")
80 | else()
81 |     message(WARNING "unknown plugin version")
82 |     set(VCS_TAG "unknown")
83 | endif()
84 |
85 | # Generates config.h, which defines VERSION from @VCS_TAG@.
86 | configure_file(
87 |     "${CMAKE_CURRENT_SOURCE_DIR}/source/config.h.in"
88 |     "${CMAKE_CURRENT_BINARY_DIR}/config.h"
89 |     @ONLY)
71 |
--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
1 | name: Build (Windows)
2 |
3 | on:
4 |   push:
5 |     paths:
6 |       - 'source/source.cpp'
7 |       - 'CMakeLists.txt'
8 |       - '.github/workflows/windows.yml'
9 |   workflow_dispatch:
10 |     inputs:
11 |       tag:
12 |         description: 'which tag to upload to'
13 |         default: ''
14 |
15 | jobs:
16 |   build-windows:
17 |     runs-on: windows-2022
18 |
19 |     defaults:
20 |       run:
21 |         shell: cmd
22 |
23 |     steps:
24 |       - name: Checkout repo
25 |         uses: actions/checkout@v4
26 |         with:
27 |           fetch-depth: 0 # full history so `git describe --tags` in CMakeLists.txt can see tags
28 |           submodules: true # thirdparty/vectorclass
29 |
30 |       - name: Setup MSVC
31 |         uses: ilammy/msvc-dev-cmd@v1
32 |
33 |       - name: Setup Ninja
34 |         run: pip install ninja
35 |
36 |       - name: Download VapourSynth headers
37 |         run: |
38 |           curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
39 |           unzip -q vs.zip
40 |           mv vapoursynth-*/ vapoursynth/
41 |
42 |       - name: Setup LLVM
43 |         shell: bash
44 |         run: |
45 |           curl -s -o llvm-win64.exe -LJO https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.7/LLVM-15.0.7-win64.exe
46 |           7z x -ollvm llvm-win64.exe
47 |
48 |       - name: Configure
49 |         shell: bash
50 |         run: cmake -S . -B build -G Ninja
51 |           -D VS_INCLUDE_DIR="$(pwd)\vapoursynth\include"
52 |           -D CMAKE_BUILD_TYPE=Release
53 |           -D CMAKE_CXX_COMPILER="$(pwd)/llvm/bin/clang++.exe"
54 |           -D CMAKE_CXX_FLAGS="-ffast-math -Wall -Wno-deprecated-declarations"
55 |           -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
56 |
57 |       - name: Build
58 |         run: cmake --build build --verbose
59 |
60 |       - name: Install
61 |         run: cmake --install build --prefix install
62 |
63 |       - name: Prepare for upload
64 |         run: |
65 |           mkdir artifact
66 |           copy install\bin\*.dll artifact
67 |
68 |       - name: Upload
69 |         uses: actions/upload-artifact@v4 # v3 of this action is no longer served
70 |         with:
71 |           name: VapourSynth-BoxBlur-Windows
72 |           path: artifact
73 |
74 |       # Only runs for manual dispatches that name a tag.
75 |       - name: Release
76 |         uses: softprops/action-gh-release@v1
77 |         if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
78 |         with:
79 |           tag_name: ${{ github.event.inputs.tag }}
80 |           files: artifact/*
81 |           fail_on_unmatched_files: true
82 |           generate_release_notes: false
83 |           prerelease: true
83 |
84 |
--------------------------------------------------------------------------------
/COPYING.LESSER:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/source/source.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | * Modified from boxblurfilter.cpp of VapourSynth
3 | *
4 | * Copyright (c) 2017 Fredrik Mellbin
5 | * Copyright (c) 2022 AmusementClub
6 | *
7 | * VapourSynth is free software; you can redistribute it and/or
8 | * modify it under the terms of the GNU Lesser General Public
9 | * License as published by the Free Software Foundation; either
10 | * version 2.1 of the License, or (at your option) any later version.
11 | *
12 | * VapourSynth is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 | * Lesser General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU Lesser General Public
18 | * License along with VapourSynth; if not, write to the Free Software
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 | */
21 |
22 | #include
23 | #include
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 | #include
31 |
32 | #include
33 | #include
34 |
35 | #include
36 |
37 | #ifdef INSIDE_VS
38 | #define VERSION "builtin"
39 | #include "internalfilters.h"
40 | #else
41 | #include
42 | #endif
43 |
44 | // State of one BlurV filter instance.
45 | struct BoxBlurVData {
46 |     // Input clip; the reference is released in BoxBlurVFree.
47 |     VSNodeRef *node;
48 |     // Vertical radius per plane; BoxBlurVGetFrame only blurs planes whose
49 |     // radius is > 0.
50 |     std::array<int, 3> radius;
51 |     // When true, integer sums receive a bias of 2 * radius before division.
52 |     bool rounding;
53 |
54 |     // Guards `buffers`: taken shared for lookups, exclusive for insertion.
55 |     std::shared_mutex buffer_lock;
56 |     // One scratch row per calling thread, allocated on first use in
57 |     // BoxBlurVGetFrame and freed in BoxBlurVFree.
58 |     std::unordered_map<std::thread::id, void *> buffers;
59 | };
52 |
53 | // Filter init callback: publishes the input clip's video info as this
54 | // node's single output.
55 | static void VS_CC BoxBlurVInit(
56 |     VSMap * in,
57 |     VSMap * out,
58 |     void ** instanceData,
59 |     VSNode * node,
60 |     VSCore * core,
61 |     const VSAPI * vsapi
62 | ) noexcept {
63 |
64 |     const auto * d = reinterpret_cast<const BoxBlurVData *>(*instanceData);
65 |     vsapi->setVideoInfo(vsapi->getVideoInfo(d->node), 1, node);
66 | }
65 |
66 | // Widens eight consecutive integer samples starting at `src` into the eight
67 | // 32-bit lanes of a Vec8ui. T must be an integral type.
68 | template <typename T>
69 | static
70 | inline Vec8ui load_vec(const T src[8]) noexcept {
71 |     static_assert(std::is_integral_v<T>);
72 |
73 |     // Go through a uint32_t staging array so any integral T is converted
74 |     // element by element before the vector load.
75 |     uint32_t widened[8];
76 |     for (int i = 0; i < 8; ++i) {
77 |         widened[i] = src[i];
78 |     }
79 |     return Vec8ui().load(widened);
80 | }
77 |
78 | // Narrows the eight 32-bit lanes of `src` to bytes and writes them to dst[0..7].
79 | static inline void store_vec(uint8_t dst[8], Vec8ui src) noexcept {
80 |     // Reinterpret the vector as 32 bytes and gather bytes 0, 4, 8, ... 28,
81 |     // i.e. the first byte of every 32-bit lane, into the lowest 8 positions.
82 |     Vec32uc as_bytes = reinterpret_i(src);
83 |     Vec32uc gathered = permute32<
84 |         0, 4, 8, 12, 16, 20, 24, 28,
85 |         V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC,
86 |         V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC,
87 |         V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC
88 |     >(as_bytes);
89 |     Vec16uc low_half = gathered.get_low();
90 |
91 |     // The 8 packed bytes are the first 64-bit element of the low half; write
92 |     // them with a single 64-bit store.
93 |     uint64_t packed = Vec2uq(reinterpret_i(low_half)).extract(0);
94 |     *((uint64_t *) dst) = packed;
95 | }
92 |
93 | // Narrows the eight 32-bit lanes of `src` to 16 bits and writes them to
94 | // dst[0..7] with an aligned store.
95 | static inline void store_vec(uint16_t dst[8], Vec8ui src) noexcept {
96 |     // Reinterpret as sixteen 16-bit elements and gather the even-indexed
97 |     // ones, i.e. the first half of every 32-bit lane.
98 |     Vec16us as_words = reinterpret_i(src);
99 |     Vec16us gathered = permute16<
100 |         0, 2, 4, 6, 8, 10, 12, 14,
101 |         V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC
102 |     >(as_words);
103 |
104 |     Vec8us narrowed = gathered.get_low();
105 |     narrowed.store_a(dst);
106 | }
103 |
104 | // Vertical box blur of one integer plane, eight columns at a time, using a
105 | // running per-column sum kept in `buffer`.
106 | //
107 | //   dst, src  plane pointers; `stride` is the row pitch in samples.
108 | //   width     columns to process; columns are handled in groups of 8, so the
109 | //             last group may touch up to 7 samples past `width`
110 | //             (presumably covered by row padding - confirm against caller).
111 | //   radius    window half-height; the window holds 2 * radius + 1 rows, with
112 | //             row indices clamped to [0, height - 1] at the edges.
113 | //   buffer    scratch row of 32-bit sums, accessed with aligned loads/stores.
114 | //   round     bias added to each sum before the division.
115 | //
116 | // NOTE(review): sums are 32-bit unsigned and no range check on `radius` is
117 | // visible in this file, so a very large radius could overflow them.
118 | template <typename T>
119 | static void blurV(
120 |     T *VS_RESTRICT dst,
121 |     const T *VS_RESTRICT src,
122 |     const int width,
123 |     const int height,
124 |     const int stride,
125 |     const int radius,
126 |     void * buffer, // [((width + 7) / 8 * 8) * 4]
127 |     unsigned round
128 | ) {
129 |
130 |     // utilities
131 |     Divisor_ui div = radius * 2 + 1;
132 |
133 |     uint32_t * buf = reinterpret_cast<uint32_t *>(buffer);
134 |     // Applies func(sum, x) to every 8-column group and writes the returned
135 |     // sum back to the scratch row.
136 |     auto for_each_vec = [buf, width](auto func) -> void {
137 |         for (int x = 0; x < width; x += Vec8ui().size()) {
138 |             auto vec = Vec8ui().load_a(&buf[x]);
139 |             func(vec, x).store_a(&buf[x]);
140 |         }
141 |     };
142 |
143 |     // process
144 |     // Seed: `radius` copies of row 0 stand in for the rows above the plane.
145 |     for (int x = 0; x < width; x += Vec8ui().size()) {
146 |         Vec8ui vec = load_vec(&src[x]);
147 |         (radius * vec).store_a(&buf[x]);
148 |     }
149 |
150 |     // Add rows 0 .. radius - 1 (clamped to the last row).
151 |     for (int y = 0; y < radius; y++) {
152 |         for_each_vec([=](Vec8ui vec, int x) {
153 |             vec += load_vec(&src[std::min(y, height - 1) * stride + x]);
154 |             return vec;
155 |         });
156 |     }
157 |
158 |     // Top rows: add the incoming row, emit, then drop the outgoing row;
159 |     // both indices need clamping here.
160 |     for (int y = 0; y < std::min(radius, height); y++) {
161 |         for_each_vec([=](Vec8ui vec, int x) {
162 |             vec += load_vec(&src[std::min(y + radius, height - 1) * stride + x]);
163 |             store_vec(&dst[y * stride + x], (vec + round) / div);
164 |             vec -= load_vec(&src[std::max(y - radius, 0) * stride + x]);
165 |             return vec;
166 |         });
167 |     }
168 |     if (height > radius) {
169 |         // Interior rows: the whole window is inside the plane, no clamping.
170 |         for (int y = radius; y < height - radius; y++) {
171 |             for_each_vec([=](Vec8ui vec, int x) {
172 |                 vec += load_vec(&src[(y + radius) * stride + x]);
173 |                 store_vec(&dst[y * stride + x], (vec + round) / div);
174 |                 vec -= load_vec(&src[(y - radius) * stride + x]);
175 |                 return vec;
176 |             });
177 |         }
178 |         // Bottom rows: clamped again.
179 |         for (int y = std::max(height - radius, radius); y < height; y++) {
180 |             for_each_vec([=](Vec8ui vec, int x) {
181 |                 vec += load_vec(&src[std::min(y + radius, height - 1) * stride + x]);
182 |                 store_vec(&dst[y * stride + x], (vec + round) / div);
183 |                 vec -= load_vec(&src[std::max(y - radius, 0) * stride + x]);
184 |                 return vec;
185 |             });
186 |         }
187 |     }
188 | }
167 |
168 | // Vertical box blur of one 32-bit float plane, eight columns at a time,
169 | // using a running per-column sum kept in `buffer`.
170 | //
171 | //   dst, src  plane pointers, read and written with aligned vector accesses;
172 | //             `stride` is the row pitch in samples.
173 | //   width     columns to process, handled in groups of 8.
174 | //   radius    window half-height; the window holds 2 * radius + 1 rows, with
175 | //             row indices clamped to [0, height - 1] at the edges.
176 | //   buffer    scratch row of float sums, accessed with aligned loads/stores.
177 | static void blurVF(
178 |     float * VS_RESTRICT dst,
179 |     const float * VS_RESTRICT src,
180 |     const int width,
181 |     const int height,
182 |     const int stride,
183 |     const int radius,
184 |     void * buffer // [((width + 7) / 8 * 8) * 4]
185 | ) noexcept {
186 |
187 |     // Reciprocal of the window size; sums are multiplied by it on output.
188 |     Vec8f div = static_cast<float>(1) / (radius * 2 + 1);
189 |
190 |     float * buf = reinterpret_cast<float *>(buffer);
191 |
192 |     // Seed: `radius` copies of row 0 stand in for the rows above the plane.
193 |     for (int x = 0; x < width; x += Vec8f().size()) {
194 |         auto vec = Vec8f().load_a(&src[x]);
195 |         (radius * vec).store_a(&buf[x]);
196 |     }
197 |
198 |     // Applies func(sum, x) to every 8-column group and writes the returned
199 |     // sum back to the scratch row.
200 |     auto for_each_vec = [=](auto func) {
201 |         for (int x = 0; x < width; x += Vec8f().size()) {
202 |             auto vec = Vec8f().load_a(&buf[x]);
203 |             func(vec, x).store_a(&buf[x]);
204 |         }
205 |     };
206 |
207 |     // Add rows 0 .. radius - 1 (clamped to the last row).
208 |     for (int y = 0; y < radius; y++) {
209 |         for_each_vec([=](Vec8f vec, int x) {
210 |             vec += Vec8f().load_a(&src[std::min(y, height - 1) * stride + x]);
211 |             return vec;
212 |         });
213 |     }
214 |
215 |     // Top rows: add the incoming row, emit, then drop the outgoing row;
216 |     // both indices need clamping here.
217 |     for (int y = 0; y < std::min(radius, height); y++) {
218 |         for_each_vec([=](Vec8f vec, int x) {
219 |             vec += Vec8f().load_a(&src[std::min(y + radius, height - 1) * stride + x]);
220 |             (vec * div).store_a(&dst[y * stride + x]);
221 |             vec -= Vec8f().load_a(&src[std::max(y - radius, 0) * stride + x]);
222 |             return vec;
223 |         });
224 |     }
225 |
226 |     if (height > radius) {
227 |         // Interior rows: the whole window is inside the plane, no clamping.
228 |         for (int y = radius; y < height - radius; y++) {
229 |             for_each_vec([=](Vec8f vec, int x) {
230 |                 vec += Vec8f().load_a(&src[(y + radius) * stride + x]);
231 |                 (vec * div).store_a(&dst[y * stride + x]);
232 |                 vec -= Vec8f().load_a(&src[(y - radius) * stride + x]);
233 |                 return vec;
234 |             });
235 |         }
236 |
237 |         // Bottom rows: clamped again.
238 |         for (int y = std::max(height - radius, radius); y < height; y++) {
239 |             for_each_vec([=](Vec8f vec, int x) {
240 |                 vec += Vec8f().load_a(&src[std::min(y + radius, height - 1) * stride + x]);
241 |                 (vec * div).store_a(&dst[y * stride + x]);
242 |                 vec -= Vec8f().load_a(&src[std::max(y - radius, 0) * stride + x]);
243 |                 return vec;
244 |             });
245 |         }
246 |     }
247 | }
230 |
231 | // Frame callback of BlurV.
232 | //
233 | // On arInitial it requests frame n of the input clip. On arAllFramesReady it
234 | // returns a new frame in which every plane with radius > 0 is vertically
235 | // blurred and every other plane is taken from the source frame unchanged.
236 | // Returns nullptr for any other activation reason, or after reporting an
237 | // error through setFilterError when the scratch row cannot be allocated.
238 | static const VSFrameRef *VS_CC BoxBlurVGetFrame(
239 |     int n,
240 |     int activationReason,
241 |     void ** instanceData,
242 |     void ** frameData,
243 |     VSFrameContext * frameCtx,
244 |     VSCore * core,
245 |     const VSAPI * vsapi
246 | ) noexcept {
247 |
248 |     auto * d = reinterpret_cast<BoxBlurVData *>(*instanceData);
249 |
250 |     if (activationReason == arInitial) {
251 |         vsapi->requestFrameFilter(n, d->node, frameCtx);
252 |         return nullptr;
253 |     }
254 |     if (activationReason != arAllFramesReady) {
255 |         return nullptr;
256 |     }
257 |
258 |     const VSVideoInfo * vi = vsapi->getVideoInfo(d->node);
259 |
260 |     // Fetch (or lazily create) the scratch row owned by the calling thread.
261 |     void * buffer = nullptr;
262 |     {
263 |         const auto thread_id = std::this_thread::get_id();
264 |
265 |         {
266 |             std::shared_lock<std::shared_mutex> read_lock(d->buffer_lock);
267 |             if (auto it = d->buffers.find(thread_id); it != d->buffers.end()) {
268 |                 buffer = it->second;
269 |             }
270 |         }
271 |
272 |         if (buffer == nullptr) {
273 |             // Allocate outside any lock; only this thread ever inserts its
274 |             // own id, so no other thread can race for this key.
275 |             buffer = vs_aligned_malloc<void>(((vi->width + 7) / 8 * 8) * 4, 32);
276 |             if (buffer == nullptr) {
277 |                 vsapi->setFilterError("BlurV: failed to allocate the row buffer", frameCtx);
278 |                 return nullptr;
279 |             }
280 |
281 |             std::lock_guard<std::shared_mutex> write_lock(d->buffer_lock);
282 |             d->buffers.emplace(thread_id, buffer);
283 |         }
284 |     }
285 |
286 |     const VSFrameRef * src_frame = vsapi->getFrameFilter(n, d->node, frameCtx);
287 |
288 |     // newVideoFrame would leave every plane uninitialised, so planes that
289 |     // are not blurred are taken from the source frame via newVideoFrame2.
290 |     const int num_planes = vi->format->numPlanes;
291 |     const VSFrameRef * plane_src[3] = { nullptr, nullptr, nullptr };
292 |     const int plane_idx[3] = { 0, 1, 2 };
293 |     for (int plane = 0; plane < num_planes; ++plane) {
294 |         plane_src[plane] = (d->radius[plane] > 0) ? nullptr : src_frame;
295 |     }
296 |     VSFrameRef * dst_frame = vsapi->newVideoFrame2(
297 |         vi->format, vi->width, vi->height, plane_src, plane_idx, src_frame, core);
298 |
299 |     for (int plane = 0; plane < num_planes; ++plane) {
300 |         if (d->radius[plane] > 0) {
301 |             int width = vsapi->getFrameWidth(src_frame, plane);
302 |             int height = vsapi->getFrameHeight(src_frame, plane);
303 |             int bytes = vi->format->bytesPerSample;
304 |             // blurV/blurVF index rows in samples, not bytes.
305 |             int stride = vsapi->getStride(src_frame, plane) / bytes;
306 |
307 |             const auto * srcp = vsapi->getReadPtr(src_frame, plane);
308 |             auto * dstp = vsapi->getWritePtr(dst_frame, plane);
309 |
310 |             auto round = (unsigned int) (d->rounding ? d->radius[plane] * 2 : 0);
311 |
312 |             if (bytes == 4) {
313 |                 blurVF((float *) dstp, (const float *) srcp, width, height, stride, d->radius[plane], buffer);
314 |             } else if (bytes == 2) {
315 |                 blurV((uint16_t *) dstp, (const uint16_t *) srcp, width, height, stride, d->radius[plane], buffer, round);
316 |             } else if (bytes == 1) {
317 |                 blurV((uint8_t *) dstp, (const uint8_t *) srcp, width, height, stride, d->radius[plane], buffer, round);
318 |             }
319 |         }
320 |     }
321 |
322 |     vsapi->freeFrame(src_frame);
323 |
324 |     return dst_frame;
325 | }
306 |
307 | // Filter free callback: releases the input node, every per-thread scratch
308 | // row, and finally the instance data itself.
309 | static void VS_CC BoxBlurVFree(
310 |     void * instanceData,
311 |     VSCore * core,
312 |     const VSAPI * vsapi
313 | ) noexcept {
314 |
315 |     auto * d = reinterpret_cast<BoxBlurVData *>(instanceData);
316 |
317 |     vsapi->freeNode(d->node);
318 |
319 |     for (const auto & [_, buffer] : d->buffers) {
320 |         vs_aligned_free(buffer);
321 |     }
322 |
323 |     delete d;
324 | }
323 |
324 | // Creates a BlurV filter (vertical box blur).
325 | //
326 | // Arguments read from `in`: "clip" and the optional per-plane "radius"
327 | // array. A missing first radius defaults to 1; a missing later radius
328 | // repeats the previous plane's value. Sets an error on `out` when the CPU
329 | // lacks AVX2/FMA3 or the clip format is not supported.
330 | static void VS_CC BoxBlurVCreate(
331 |     const VSMap * in,
332 |     VSMap * out,
333 |     void * userData,
334 |     VSCore * core,
335 |     const VSAPI * vsapi
336 | ) noexcept {
337 |
338 |     if (instrset_detect() < 8 || !hasFMA3()) {
339 |         vsapi->setError(out, "AVX2 is required");
340 |         return ;
341 |     }
342 |
343 |     auto d = std::make_unique<BoxBlurVData>();
344 |
345 |     d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
346 |
347 |     const VSVideoInfo * vi = vsapi->getVideoInfo(d->node);
348 |
349 |     // A clip without a fixed format or size is rejected first: `format` is
350 |     // dereferenced below and in the frame callback, and the scratch row is
351 |     // sized from vi->width.
352 |     if (const auto fi = vi->format;
353 |         fi == nullptr || vi->width <= 0 || vi->height <= 0 ||
354 |         (fi->sampleType == stInteger && fi->bitsPerSample > 16) ||
355 |         (fi->sampleType == stFloat && fi->bitsPerSample != 32)
356 |     ) {
357 |         vsapi->setError(out, "not supported format");
358 |         vsapi->freeNode(d->node);
359 |         return ;
360 |     }
361 |
362 |     for (int i = 0; i < vi->format->numPlanes; ++i) {
363 |         int err;
364 |         d->radius[i] = int64ToIntS(vsapi->propGetInt(in, "radius", i, &err));
365 |         if (err) {
366 |             d->radius[i] = (i == 0) ? 1 : d->radius[i - 1];
367 |         }
368 |     }
369 |
370 |     // Ownership of the instance data passes to the filter; BoxBlurVFree
371 |     // deletes it.
372 |     vsapi->createFilter(
373 |         in, out,
374 |         "BlurV",
375 |         BoxBlurVInit, BoxBlurVGetFrame, BoxBlurVFree,
376 |         fmParallelRequests, 0, d.release(), core
377 |     );
378 | }
367 |
368 | // Creates the Blur filter chain.
369 | //
370 | // Arguments read from `in`: "clip", optional "planes", and optional
371 | // "hradius", "hpasses", "vradius", "vpasses" (each defaulting to 1).
372 | //
373 | // When the CPU lacks AVX2/FMA3 or the clip format is not handled here, the
374 | // call is forwarded unchanged to std.BoxBlur. Otherwise `vpasses` BlurV
375 | // filters are chained for the vertical blur, and the horizontal blur is
376 | // done as std.Transpose -> `hpasses` BlurV filters -> std.Transpose.
377 | static void VS_CC BoxBlurCreate(
378 |     const VSMap * in,
379 |     VSMap * out,
380 |     void * userData,
381 |     VSCore * core,
382 |     const VSAPI * vsapi
383 | ) noexcept {
384 |
385 |     auto node = vsapi->propGetNode(in, "clip", 0, nullptr);
386 |     auto vi = vsapi->getVideoInfo(node);
387 |     const auto fi = vi->format;
388 |
389 |     int err;
390 |
391 |     // not supported
392 |     // A clip without a fixed format or size also takes this path, so `fi`
393 |     // is never dereferenced when null and std.BoxBlur decides how to
394 |     // report it.
395 |     if (instrset_detect() < 8 || !hasFMA3() ||
396 |         fi == nullptr || vi->width <= 0 || vi->height <= 0 ||
397 |         (fi->sampleType == stInteger && (fi->bitsPerSample > 16)) ||
398 |         (fi->sampleType == stFloat && fi->bitsPerSample != 32)
399 |     ) {
400 |         vsapi->freeNode(node);
401 |         node = nullptr;
402 |
403 |         if (std::getenv("VS_BOXFILTER_DEBUG")) {
404 |             vsapi->logMessage(mtWarning, "fallback to std.BoxBlur");
405 |         }
406 |
407 |         VSPlugin *stdplugin = vsapi->getPluginById("com.vapoursynth.std", core);
408 |
409 |         VSMap *out_map = vsapi->invoke(stdplugin, "BoxBlur", in);
410 |
411 |         auto fallback_node = vsapi->propGetNode(out_map, "clip", 0, &err);
412 |
413 |         if (err) {
414 |             vsapi->setError(out, vsapi->getError(out_map));
415 |             vsapi->freeMap(out_map);
416 |             return;
417 |         }
418 |
419 |         vsapi->propSetNode(out, "clip", fallback_node, paAppend);
420 |
421 |         vsapi->freeNode(fallback_node);
422 |         vsapi->freeMap(out_map);
423 |
424 |         return ;
425 |     }
426 |
427 |     // process[p] is true for every plane that should be blurred.
428 |     std::array<bool, 3> process {};
429 |     int num_planes_args = vsapi->propNumElements(in, "planes");
430 |     if (num_planes_args == -1) {
431 |         for (int i = 0; i < vi->format->numPlanes; ++i) {
432 |             process[i] = true;
433 |         }
434 |     } else {
435 |         for (int i = 0; i < num_planes_args; ++i) {
436 |             // Saturating conversion: a plain int64 -> int narrowing could
437 |             // wrap an out-of-range index into the valid range.
438 |             int plane = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr));
439 |             if (0 <= plane && plane < vi->format->numPlanes) {
440 |                 if (process[plane]) {
441 |                     vsapi->setError(out, "plane specified twice");
442 |                     vsapi->freeNode(node);
443 |                     return ;
444 |                 }
445 |                 process[plane] = true;
446 |             } else {
447 |                 vsapi->setError(out, "plane index out of range");
448 |                 vsapi->freeNode(node);
449 |                 return ;
450 |             }
451 |         }
452 |     }
453 |
454 |     int hradius = int64ToIntS(vsapi->propGetInt(in, "hradius", 0, &err));
455 |     if (err) {
456 |         hradius = 1;
457 |     }
458 |
459 |     int hpasses = int64ToIntS(vsapi->propGetInt(in, "hpasses", 0, &err));
460 |     if (err) {
461 |         hpasses = 1;
462 |     }
463 |
464 |     int vradius = int64ToIntS(vsapi->propGetInt(in, "vradius", 0, &err));
465 |     if (err) {
466 |         vradius = 1;
467 |     }
468 |
469 |     int vpasses = int64ToIntS(vsapi->propGetInt(in, "vpasses", 0, &err));
470 |     if (err) {
471 |         vpasses = 1;
472 |     }
473 |
474 |     if (vpasses > 0 && vradius > 0) {
475 |         VSMap * in_map = vsapi->createMap();
476 |         VSMap * out_map = vsapi->createMap();
477 |
478 |         // Planes that are not selected keep radius 0.
479 |         std::array<int, 3> radius {};
480 |         for (int plane = 0; plane < vi->format->numPlanes; ++plane) {
481 |             if (process[plane]) {
482 |                 radius[plane] = vradius;
483 |             }
484 |         }
485 |
486 |         for (int pass = 0; pass < vpasses; ++pass) {
487 |             vsapi->propSetNode(in_map, "clip", node, paReplace);
488 |             // The new filter instance takes over this function's reference
489 |             // to `node` (released in BoxBlurVFree); rounding is enabled on
490 |             // every other pass, starting with the first.
491 |             vsapi->createFilter(
492 |                 in_map, out_map, "BlurV", BoxBlurVInit, BoxBlurVGetFrame, BoxBlurVFree,
493 |                 fmParallel, 0, new BoxBlurVData{ node, radius, (pass % 2) == 0 }, core);
494 |
495 |             node = vsapi->propGetNode(out_map, "clip", 0, nullptr);
496 |             vsapi->clearMap(out_map);
497 |             vsapi->clearMap(in_map);
498 |         }
499 |
500 |         vsapi->freeMap(in_map);
501 |         vsapi->freeMap(out_map);
502 |     }
503 |
504 |     if (hpasses > 0 && hradius > 0) {
505 |         VSPlugin *stdplugin = vsapi->getPluginById("com.vapoursynth.std", core);
506 |
507 |         VSMap *vtmp1 = vsapi->createMap();
508 |
509 |         // Transpose so the horizontal blur can reuse the vertical kernel.
510 |         vsapi->propSetNode(vtmp1, "clip", node, paAppend);
511 |         vsapi->freeNode(node);
512 |         VSMap *vtmp2 = vsapi->invoke(stdplugin, "Transpose", vtmp1);
513 |         vsapi->clearMap(vtmp1);
514 |         node = vsapi->propGetNode(vtmp2, "clip", 0, nullptr);
515 |         vsapi->clearMap(vtmp2);
516 |
517 |         std::array<int, 3> radius {};
518 |         for (unsigned plane = 0; plane < radius.size(); ++plane) {
519 |             if (process[plane]) {
520 |                 radius[plane] = hradius;
521 |             }
522 |         }
523 |
524 |         for (int pass = 0; pass < hpasses; ++pass) {
525 |             vsapi->propSetNode(vtmp1, "clip", node, paReplace);
526 |             vsapi->createFilter(
527 |                 vtmp1, vtmp2, "BlurV", BoxBlurVInit, BoxBlurVGetFrame, BoxBlurVFree,
528 |                 fmParallel, 0, new BoxBlurVData{ node, radius, (pass % 2) == 0 }, core);
529 |
530 |             node = vsapi->propGetNode(vtmp2, "clip", 0, &err);
531 |             vsapi->clearMap(vtmp2);
532 |             vsapi->clearMap(vtmp1);
533 |         }
534 |
535 |         // Transpose back to the original orientation.
536 |         vsapi->propSetNode(vtmp2, "clip", node, paReplace);
537 |         vsapi->freeNode(node);
538 |         vsapi->freeMap(vtmp1);
539 |         vtmp1 = vsapi->invoke(stdplugin, "Transpose", vtmp2);
540 |         vsapi->freeMap(vtmp2);
541 |         node = vsapi->propGetNode(vtmp1, "clip", 0, nullptr);
542 |         vsapi->freeMap(vtmp1);
543 |     }
544 |
545 |     vsapi->propSetNode(out, "clip", node, paAppend);
546 |     vsapi->freeNode(node);
547 | }
526 |
527 | // Plugin entry point. Built inside VapourSynth (INSIDE_VS) it is exported as
528 | // vsExtBoxBlurInitialize3, otherwise as VapourSynthPluginInit. It configures
529 | // the "box" namespace and registers BlurV, Blur and Version.
530 | #ifdef INSIDE_VS
531 | VS_EXTERNAL_API(void) vsExtBoxBlurInitialize3(
532 | #else
533 | VS_EXTERNAL_API(void) VapourSynthPluginInit(
534 | #endif
535 |     VSConfigPlugin configFunc,
536 |     VSRegisterFunction registerFunc,
537 |     VSPlugin * plugin
538 | ) noexcept {
539 |
540 |     // Version callback: stores the VERSION string under the key "version".
541 |     auto report_version = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) {
542 |         vsapi->propSetData(out, "version", VERSION, -1, paReplace);
543 |     };
544 |
545 |     configFunc(
546 |         "io.github.amusementclub.boxblur",
547 |         "box",
548 |         "AVX2-optimized boxfilter",
549 |         VAPOURSYNTH_API_VERSION,
550 |         1,
551 |         plugin
552 |     );
553 |
554 |     registerFunc(
555 |         "BlurV",
556 |         "clip:clip;"
557 |         "radius:int[]:opt;",
558 |         BoxBlurVCreate, nullptr, plugin
559 |     );
560 |
561 |     registerFunc(
562 |         "Blur",
563 |         "clip:clip;"
564 |         "planes:int[]:opt;"
565 |         "hradius:int:opt;"
566 |         "hpasses:int:opt;"
567 |         "vradius:int:opt;"
568 |         "vpasses:int:opt;",
569 |         BoxBlurCreate, nullptr, plugin
570 |     );
571 |
572 |     registerFunc("Version", "", report_version, nullptr, plugin);
573 | }
560 |
--------------------------------------------------------------------------------