├── source ├── config.h.in ├── internalfilters.h └── source.cpp ├── .gitmodules ├── README.md ├── .github └── workflows │ ├── linux.yml │ └── windows.yml ├── CMakeLists.txt └── COPYING.LESSER /source/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/vectorclass"] 2 | path = thirdparty/vectorclass 3 | url = https://github.com/vectorclass/version2.git 4 | -------------------------------------------------------------------------------- /source/internalfilters.h: -------------------------------------------------------------------------------- 1 | VS_EXTERNAL_API(void) vsExtBoxBlurInitialize3( 2 | VSConfigPlugin configFunc, 3 | VSRegisterFunction registerFunc, 4 | VSPlugin * plugin 5 | ) noexcept; 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-boxblur 2 | AVX2-vectorized box filter. 3 | 4 | For integer input, it favors architectures with fast cross-lane shuffles (e.g. Intel Haswell or later) or slow integer division (e.g. AMD architectures before Zen 3). 5 | 6 | ## Usage 7 | Prototype: 8 | 9 | `box.Blur(vnode clip[, int[] planes, int hradius = 1, int hpasses = 1, int vradius = 1, int vpasses = 1])` 10 | 11 | ## Building 12 | ```bash 13 | cmake -S . 
-B build -D CMAKE_BUILD_TYPE=Release 14 | cmake --build build 15 | cmake --install build 16 | ``` 17 |
--------------------------------------------------------------------------------
/.github/workflows/linux.yml:
--------------------------------------------------------------------------------
# Builds the plugin on Linux with GCC 11 + Ninja and uploads the installed tree.
name: Build (Linux)

on:
  push:
    paths:
      - 'source/source.cpp'
      - 'CMakeLists.txt'
      - '.github/workflows/linux.yml'
  workflow_dispatch:

jobs:
  build-linux:
    runs-on: ubuntu-20.04
    steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        # Full history so that `git describe --tags` in CMakeLists.txt can see tags.
        fetch-depth: 0
        # thirdparty/vectorclass is a git submodule.
        submodules: true

    - name: Setup GCC and Ninja
      run: |
        sudo apt-get update
        sudo apt-get install -y g++-11 ninja-build
        echo "CC=gcc-11" >> $GITHUB_ENV
        echo "CXX=g++-11" >> $GITHUB_ENV

    - name: Download VapourSynth headers
      run: |
        wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
        unzip -q vs.zip
        mv vapoursynth*/ vapoursynth

    # The command below is one multi-line plain scalar; YAML folds it to a single line.
    - name: Configure
      run: cmake -S . -B build -G Ninja
        -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include"
        -D CMAKE_BUILD_TYPE=Release
        -D CMAKE_CXX_FLAGS_RELEASE="-ffast-math -march=x86-64-v3 -Wall"

    - name: Build
      run: cmake --build build --config Release --verbose

    - name: Install
      run: cmake --install build --prefix artifact

    - name: Upload
      uses: actions/upload-artifact@v3
      with:
        name: VapourSynth-BoxBlur-Linux
        path: artifact

--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.20.0)

project(BoxBlur VERSION 0.1 LANGUAGES CXX)

# The plugin itself: one translation unit plus vectorclass' CPU detection helper.
add_library(boxblur SHARED
    source/source.cpp
    thirdparty/vectorclass/instrset_detect.cpp)

target_include_directories(boxblur PRIVATE thirdparty/vectorclass)

set_target_properties(boxblur PROPERTIES
    CXX_EXTENSIONS OFF
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON)

# The kernels are written against AVX2/FMA, so enable them for this target only.
if(MSVC)
    target_compile_options(boxblur PRIVATE /arch:AVX2)
else()
    target_compile_options(boxblur PRIVATE -mavx2 -mfma)
endif()

# Preferred path: locate VapourSynth through pkg-config and install next to its
# other plugins (<libdir>/vapoursynth).
find_package(PkgConfig QUIET MODULE)

if(PKG_CONFIG_FOUND)
    pkg_search_module(VS vapoursynth)

    if(VS_FOUND)
        message(STATUS "Found VapourSynth r${VS_VERSION}")

        # OUTPUT_VARIABLE keeps VS_LIBDIR intact and does not depend on any
        # pre-existing value of install_dir.
        cmake_path(APPEND VS_LIBDIR vapoursynth OUTPUT_VARIABLE install_dir)
        target_include_directories(boxblur PRIVATE ${VS_INCLUDE_DIRS})

        install(TARGETS boxblur LIBRARY DESTINATION "${install_dir}")
    endif()
endif()

# Fallback path: the user points VS_INCLUDE_DIR at the VapourSynth headers.
if(NOT VS_FOUND)
    set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers")

    # STREQUAL, not EQUAL: EQUAL is a numeric comparison and is never true for
    # an empty string, so the warning below could never be emitted.
    if("${VS_INCLUDE_DIR}" STREQUAL "")
        message(WARNING "VapourSynth not found")
    else()
        target_include_directories(boxblur PRIVATE "${VS_INCLUDE_DIR}")
    endif()

    # Default destinations (lib/ for LIBRARY, bin/ for RUNTIME).
    install(TARGETS boxblur LIBRARY RUNTIME)
endif()

# Embed `git describe` output as the plugin version (see source/config.h.in).
find_package(Git QUIET)

if(GIT_FOUND)
    execute_process(
        COMMAND "${GIT_EXECUTABLE}" describe --tags --long --always
        WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
        OUTPUT_VARIABLE VCS_TAG
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    # Quoted so that an empty or oddly formed result is still a single argument.
    string(STRIP "${VCS_TAG}" VCS_TAG)
endif()

if(NOT "${VCS_TAG}" STREQUAL "")
    message(STATUS "VapourSynth-BoxBlur ${VCS_TAG}")
else()
    message(WARNING "unknown plugin version")
    set(VCS_TAG "unknown")
endif()

# Generate config.h in the build tree; @ONLY restricts substitution to @VAR@.
configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/source/config.h.in"
    "${CMAKE_CURRENT_BINARY_DIR}/config.h"
    @ONLY)

# source.cpp includes <config.h>; scope the search path to this target.
target_include_directories(boxblur PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
# Builds the plugin on Windows with clang++ + Ninja, uploads the DLL and,
# when dispatched manually with a tag, attaches it to a pre-release.
name: Build (Windows)

on:
  push:
    paths:
      - 'source/source.cpp'
      - 'CMakeLists.txt'
      - '.github/workflows/windows.yml'
  workflow_dispatch:
    inputs:
      tag:
        description: 'which tag to upload to'
        default: ''

jobs:
  build-windows:
    runs-on: windows-2022

    defaults:
      run:
        shell: cmd

    steps:
    - name: Checkout repo
      uses: actions/checkout@v3
      with:
        # Full history so that `git describe --tags` in CMakeLists.txt can see tags.
        fetch-depth: 0
        # thirdparty/vectorclass is a git submodule.
        submodules: true

    - name: Setup MSVC
      uses: ilammy/msvc-dev-cmd@v1

    - name: Setup Ninja
      run: pip install ninja

    - name: Download VapourSynth headers
      run: |
        curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
        unzip -q vs.zip
        mv vapoursynth-*/ vapoursynth/

    - name: Setup LLVM
      shell: bash
      run: |
        curl -s -o llvm-win64.exe -LJO https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.7/LLVM-15.0.7-win64.exe
        7z x -ollvm llvm-win64.exe

    # The command below is one multi-line plain scalar; YAML folds it to a single line.
    - name: Configure
      shell: bash
      run: cmake -S . -B build -G Ninja
        -D VS_INCLUDE_DIR="$(pwd)\vapoursynth\include"
        -D CMAKE_BUILD_TYPE=Release
        -D CMAKE_CXX_COMPILER="$(pwd)/llvm/bin/clang++.exe"
        -D CMAKE_CXX_FLAGS="-ffast-math -Wall -Wno-deprecated-declarations"
        -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded

    - name: Build
      run: cmake --build build --verbose

    - name: Install
      run: cmake --install build --prefix install

    - name: Prepare for upload
      run: |
        mkdir artifact
        copy install\bin\*.dll artifact

    - name: Upload
      uses: actions/upload-artifact@v3
      with:
        name: VapourSynth-BoxBlur-Windows
        path: artifact

    # Only runs for manual dispatches that supply a non-empty tag.
    - name: Release
      uses: softprops/action-gh-release@v1
      if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
      with:
        tag_name: ${{ github.event.inputs.tag }}
        files: artifact/*
        fail_on_unmatched_files: true
        generate_release_notes: false
        prerelease: true

--------------------------------------------------------------------------------
/COPYING.LESSER:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License.
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /source/source.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Modified from boxblurfilter.cpp of VapourSynth 3 | * 4 | * Copyright (c) 2017 Fredrik Mellbin 5 | * Copyright (c) 2022 AmusementClub 6 | * 7 | * VapourSynth is free software; you can redistribute it and/or 8 | * modify it under the terms of the GNU Lesser General Public 9 | * License as published by the Free Software Foundation; either 10 | * version 2.1 of the License, or (at your option) any later version. 11 | * 12 | * VapourSynth is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | * Lesser General Public License for more details. 
16 | * 17 | * You should have received a copy of the GNU Lesser General Public 18 | * License along with VapourSynth; if not, write to the Free Software 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 | */ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include 33 | #include 34 | 35 | #include 36 | 37 | #ifdef INSIDE_VS 38 | #define VERSION "builtin" 39 | #include "internalfilters.h" 40 | #else 41 | #include 42 | #endif 43 | 44 | struct BoxBlurVData { 45 | VSNodeRef *node; 46 | std::array radius; 47 | bool rounding; 48 | 49 | std::shared_mutex buffer_lock; 50 | std::unordered_map buffers; 51 | }; 52 | 53 | static void VS_CC BoxBlurVInit( 54 | VSMap * in, 55 | VSMap * out, 56 | void ** instanceData, 57 | VSNode * node, 58 | VSCore * core, 59 | const VSAPI * vsapi 60 | ) noexcept { 61 | 62 | const auto * d = reinterpret_cast(*instanceData); 63 | vsapi->setVideoInfo(vsapi->getVideoInfo(d->node), 1, node); 64 | } 65 | 66 | template 67 | static 68 | inline Vec8ui load_vec(const T src[8]) noexcept { 69 | static_assert(std::is_integral_v); 70 | 71 | uint32_t tmp[8]; 72 | for (int i = 0; i < 8; ++i) { 73 | tmp[i] = src[i]; 74 | } 75 | return Vec8ui().load(tmp); 76 | } 77 | 78 | static inline void store_vec(uint8_t dst[8], Vec8ui src) noexcept { 79 | Vec32uc tmp = reinterpret_i(src); 80 | 81 | Vec16uc dst_tmp = permute32< 82 | 0, 4, 8, 12, 16, 20, 24, 28, 83 | V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, 84 | V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, 85 | V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC 86 | >(tmp).get_low(); 87 | 88 | uint64_t dst_val = Vec2uq(reinterpret_i(dst_tmp)).extract(0); 89 | 90 | *((uint64_t *) dst) = dst_val; 91 | } 92 | 93 | static inline void store_vec(uint16_t dst[8], Vec8ui src) noexcept { 94 | Vec16us tmp = reinterpret_i(src); 95 | 96 | Vec8us dst_vec = permute16< 97 | 0, 2, 4, 6, 8, 10, 12, 14, 98 | 
V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC 99 | >(tmp).get_low(); 100 | 101 | dst_vec.store_a(dst); 102 | } 103 | 104 | template 105 | static void blurV( 106 | T *VS_RESTRICT dst, 107 | const T *VS_RESTRICT src, 108 | const int width, 109 | const int height, 110 | const int stride, 111 | const int radius, 112 | void * buffer, // [((width + 7) / 8 * 8) * 4] 113 | unsigned round 114 | ) { 115 | 116 | // utilities 117 | Divisor_ui div = radius * 2 + 1; 118 | 119 | uint32_t * buf = reinterpret_cast(buffer); 120 | auto for_each_vec = [buf, width](auto func) -> void { 121 | for (int x = 0; x < width; x += Vec8ui().size()) { 122 | auto vec = Vec8ui().load_a(&buf[x]); 123 | func(vec, x).store_a(&buf[x]); 124 | } 125 | }; 126 | 127 | // process 128 | for (int x = 0; x < width; x += Vec8ui().size()) { 129 | Vec8ui vec = load_vec(&src[x]); 130 | (radius * vec).store_a(&buf[x]); 131 | } 132 | 133 | for (int y = 0; y < radius; y++) { 134 | for_each_vec([=](Vec8ui vec, int x) { 135 | vec += load_vec(&src[std::min(y, height - 1) * stride + x]); 136 | return vec; 137 | }); 138 | } 139 | 140 | for (int y = 0; y < std::min(radius, height); y++) { 141 | for_each_vec([=](Vec8ui vec, int x) { 142 | vec += load_vec(&src[std::min(y + radius, height - 1) * stride + x]); 143 | store_vec(&dst[y * stride + x], (vec + round) / div); 144 | vec -= load_vec(&src[std::max(y - radius, 0) * stride + x]); 145 | return vec; 146 | }); 147 | } 148 | if (height > radius) { 149 | for (int y = radius; y < height - radius; y++) { 150 | for_each_vec([=](Vec8ui vec, int x) { 151 | vec += load_vec(&src[(y + radius) * stride + x]); 152 | store_vec(&dst[y * stride + x], (vec + round) / div); 153 | vec -= load_vec(&src[(y - radius) * stride + x]); 154 | return vec; 155 | }); 156 | } 157 | for (int y = std::max(height - radius, radius); y < height; y++) { 158 | for_each_vec([=](Vec8ui vec, int x) { 159 | vec += load_vec(&src[std::min(y + radius, height - 1) * stride + x]); 160 | store_vec(&dst[y * stride + 
x], (vec + round) / div); 161 | vec -= load_vec(&src[std::max(y - radius, 0) * stride + x]); 162 | return vec; 163 | }); 164 | } 165 | } 166 | } 167 | 168 | static void blurVF( 169 | float * VS_RESTRICT dst, 170 | const float * VS_RESTRICT src, 171 | const int width, 172 | const int height, 173 | const int stride, 174 | const int radius, 175 | void * buffer // [((width + 7) / 8 * 8) * 4] 176 | ) noexcept { 177 | 178 | Vec8f div = static_cast(1) / (radius * 2 + 1); 179 | 180 | float * buf = reinterpret_cast(buffer); 181 | 182 | for (int x = 0; x < width; x += Vec8f().size()) { 183 | auto vec = Vec8f().load_a(&src[x]); 184 | (radius * vec).store_a(&buf[x]); 185 | } 186 | 187 | auto for_each_vec = [=](auto func) { 188 | for (int x = 0; x < width; x += Vec8f().size()) { 189 | auto vec = Vec8f().load_a(&buf[x]); 190 | func(vec, x).store_a(&buf[x]); 191 | } 192 | }; 193 | 194 | for (int y = 0; y < radius; y++) { 195 | for_each_vec([=](Vec8f vec, int x) { 196 | vec += Vec8f().load_a(&src[std::min(y, height - 1) * stride + x]); 197 | return vec; 198 | }); 199 | } 200 | 201 | for (int y = 0; y < std::min(radius, height); y++) { 202 | for_each_vec([=](Vec8f vec, int x) { 203 | vec += Vec8f().load_a(&src[std::min(y + radius, height - 1) * stride + x]); 204 | (vec * div).store_a(&dst[y * stride + x]); 205 | vec -= Vec8f().load_a(&src[std::max(y - radius, 0) * stride + x]); 206 | return vec; 207 | }); 208 | } 209 | 210 | if (height > radius) { 211 | for (int y = radius; y < height - radius; y++) { 212 | for_each_vec([=](Vec8f vec, int x) { 213 | vec += Vec8f().load_a(&src[(y + radius) * stride + x]); 214 | (vec * div).store_a(&dst[y * stride + x]); 215 | vec -= Vec8f().load_a(&src[(y - radius) * stride + x]); 216 | return vec; 217 | }); 218 | } 219 | 220 | for (int y = std::max(height - radius, radius); y < height; y++) { 221 | for_each_vec([=](Vec8f vec, int x) { 222 | vec += Vec8f().load_a(&src[std::min(y + radius, height - 1) * stride + x]); 223 | (vec * div).store_a(&dst[y 
* stride + x]); 224 | vec -= Vec8f().load_a(&src[std::max(y - radius, 0) * stride + x]); 225 | return vec; 226 | }); 227 | } 228 | } 229 | } 230 | 231 | static const VSFrameRef *VS_CC BoxBlurVGetFrame( 232 | int n, 233 | int activationReason, 234 | void ** instanceData, 235 | void ** frameData, 236 | VSFrameContext * frameCtx, 237 | VSCore * core, 238 | const VSAPI * vsapi 239 | ) noexcept { 240 | 241 | auto * d = reinterpret_cast(*instanceData); 242 | 243 | if (activationReason == arInitial) { 244 | vsapi->requestFrameFilter(n, d->node, frameCtx); 245 | } else if (activationReason == arAllFramesReady) { 246 | 247 | const VSVideoInfo * vi = vsapi->getVideoInfo(d->node); 248 | 249 | void * buffer; 250 | { 251 | auto thread_id = std::this_thread::get_id(); 252 | 253 | bool init {}; 254 | 255 | d->buffer_lock.lock_shared(); 256 | 257 | try { 258 | buffer = d->buffers.at(thread_id); 259 | init = true; 260 | } catch (const std::out_of_range & e) { 261 | d->buffer_lock.unlock_shared(); 262 | 263 | buffer = vs_aligned_malloc(((vi->width + 7) / 8 * 8) * 4, 32); 264 | 265 | std::lock_guard l(d->buffer_lock); 266 | d->buffers.emplace(thread_id, buffer); 267 | } 268 | 269 | if (init) { 270 | d->buffer_lock.unlock_shared(); 271 | } 272 | } 273 | 274 | const VSFrameRef * src_frame = vsapi->getFrameFilter(n, d->node, frameCtx); 275 | VSFrameRef * dst_frame = vsapi->newVideoFrame(vi->format, vi->width, vi->height, src_frame, core); 276 | 277 | for (int plane = 0; plane < vi->format->numPlanes; ++plane) { 278 | if (d->radius[plane] > 0) { 279 | int width = vsapi->getFrameWidth(src_frame, plane); 280 | int height = vsapi->getFrameHeight(src_frame, plane); 281 | int bytes = vi->format->bytesPerSample; 282 | int stride = vsapi->getStride(src_frame, plane) / bytes; 283 | 284 | const auto * srcp = vsapi->getReadPtr(src_frame, plane); 285 | auto * dstp = vsapi->getWritePtr(dst_frame, plane); 286 | 287 | auto round = (unsigned int) (d->rounding ? 
d->radius[plane] * 2 : 0); 288 | 289 | if (bytes == 4) { 290 | blurVF((float *) dstp, (const float *) srcp, width, height, stride, d->radius[plane], buffer); 291 | } else if (bytes == 2) { 292 | blurV((uint16_t *) dstp, (const uint16_t *) srcp, width, height, stride, d->radius[plane], buffer, round); 293 | } else if (bytes == 1) { 294 | blurV((uint8_t *) dstp, (const uint8_t *) srcp, width, height, stride, d->radius[plane], buffer, round); 295 | } 296 | } 297 | } 298 | 299 | vsapi->freeFrame(src_frame); 300 | 301 | return dst_frame; 302 | } 303 | 304 | return nullptr; 305 | } 306 | 307 | static void VS_CC BoxBlurVFree( 308 | void * instanceData, 309 | VSCore * core, 310 | const VSAPI * vsapi 311 | ) noexcept { 312 | 313 | auto * d = reinterpret_cast(instanceData); 314 | 315 | vsapi->freeNode(d->node); 316 | 317 | for (const auto & [_, buffer] : d->buffers) { 318 | vs_aligned_free(buffer); 319 | } 320 | 321 | delete d; 322 | } 323 | 324 | static void VS_CC BoxBlurVCreate( 325 | const VSMap * in, 326 | VSMap * out, 327 | void * userData, 328 | VSCore * core, 329 | const VSAPI * vsapi 330 | ) noexcept { 331 | 332 | if (instrset_detect() < 8 || !hasFMA3()) { 333 | vsapi->setError(out, "AVX2 is required"); 334 | return ; 335 | } 336 | 337 | auto d = std::make_unique(); 338 | 339 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 340 | 341 | const VSVideoInfo * vi = vsapi->getVideoInfo(d->node); 342 | 343 | if (const auto fi = vi->format; 344 | (fi->sampleType == stInteger && fi->bitsPerSample > 16) || 345 | (fi->sampleType == stFloat && fi->bitsPerSample != 32) 346 | ) { 347 | vsapi->setError(out, "not supported format"); 348 | vsapi->freeNode(d->node); 349 | return ; 350 | } 351 | 352 | for (int i = 0; i < vi->format->numPlanes; ++i) { 353 | int err; 354 | d->radius[i] = int64ToIntS(vsapi->propGetInt(in, "radius", i, &err)); 355 | if (err) { 356 | d->radius[i] = (i == 0) ? 
1 : d->radius[i - 1]; 357 | } 358 | } 359 | 360 | vsapi->createFilter( 361 | in, out, 362 | "BlurV", 363 | BoxBlurVInit, BoxBlurVGetFrame, BoxBlurVFree, 364 | fmParallelRequests, 0, d.release(), core 365 | ); 366 | } 367 | 368 | static void VS_CC BoxBlurCreate( 369 | const VSMap * in, 370 | VSMap * out, 371 | void * userData, 372 | VSCore * core, 373 | const VSAPI * vsapi 374 | ) noexcept { 375 | 376 | auto node = vsapi->propGetNode(in, "clip", 0, nullptr); 377 | auto vi = vsapi->getVideoInfo(node); 378 | const auto fi = vi->format; 379 | 380 | int err; 381 | 382 | // not supported 383 | if (instrset_detect() < 8 || !hasFMA3() || 384 | (fi->sampleType == stInteger && (fi->bitsPerSample > 16)) || 385 | (fi->sampleType == stFloat && fi->bitsPerSample != 32) 386 | ) { 387 | vsapi->freeNode(node); 388 | node = nullptr; 389 | 390 | if (std::getenv("VS_BOXFILTER_DEBUG")) { 391 | vsapi->logMessage(mtWarning, "fallback to std.BoxBlur"); 392 | } 393 | 394 | VSPlugin *stdplugin = vsapi->getPluginById("com.vapoursynth.std", core); 395 | 396 | VSMap *out_map = vsapi->invoke(stdplugin, "BoxBlur", in); 397 | 398 | auto node = vsapi->propGetNode(out_map, "clip", 0, &err); 399 | 400 | if (err) { 401 | vsapi->setError(out, vsapi->getError(out_map)); 402 | vsapi->freeMap(out_map); 403 | return; 404 | } 405 | 406 | vsapi->propSetNode(out, "clip", node, paAppend); 407 | 408 | vsapi->freeNode(node); 409 | vsapi->freeMap(out_map); 410 | 411 | return ; 412 | } 413 | 414 | std::array process {}; 415 | int num_planes_args = vsapi->propNumElements(in, "planes"); 416 | if (num_planes_args == -1) { 417 | for (int i = 0; i < vi->format->numPlanes; ++i) { 418 | process[i] = true; 419 | } 420 | } else { 421 | for (int i = 0; i < num_planes_args; ++i) { 422 | int plane = vsapi->propGetInt(in, "planes", i, nullptr); 423 | if (0 <= plane && plane < vi->format->numPlanes) { 424 | if (process[plane]) { 425 | vsapi->setError(out, "plane specified twice"); 426 | vsapi->freeNode(node); 427 | return ; 
428 | } 429 | process[plane] = true; 430 | } else { 431 | vsapi->setError(out, "plane index out of range"); 432 | vsapi->freeNode(node); 433 | return ; 434 | } 435 | } 436 | } 437 | 438 | int hradius = int64ToIntS(vsapi->propGetInt(in, "hradius", 0, &err)); 439 | if (err) { 440 | hradius = 1; 441 | } 442 | 443 | int hpasses = int64ToIntS(vsapi->propGetInt(in, "hpasses", 0, &err)); 444 | if (err) { 445 | hpasses = 1; 446 | } 447 | 448 | int vradius = int64ToIntS(vsapi->propGetInt(in, "vradius", 0, &err)); 449 | if (err) { 450 | vradius = 1; 451 | } 452 | 453 | int vpasses = int64ToIntS(vsapi->propGetInt(in, "vpasses", 0, &err)); 454 | if (err) { 455 | vpasses = 1; 456 | } 457 | 458 | if (vpasses > 0 && vradius > 0) { 459 | VSMap * in_map = vsapi->createMap(); 460 | VSMap * out_map = vsapi->createMap(); 461 | 462 | std::array radius {}; 463 | for (int plane = 0; plane < vi->format->numPlanes; ++plane) { 464 | if (process[plane]) { 465 | radius[plane] = vradius; 466 | } 467 | } 468 | 469 | for (int pass = 0; pass < vpasses; ++pass) { 470 | vsapi->propSetNode(in_map, "clip", node, paReplace); 471 | vsapi->createFilter( 472 | in_map, out_map, "BlurV", BoxBlurVInit, BoxBlurVGetFrame, BoxBlurVFree, 473 | fmParallel, 0, new BoxBlurVData{ node, radius, (pass % 2) == 0 }, core); 474 | 475 | node = vsapi->propGetNode(out_map, "clip", 0, nullptr); 476 | vsapi->clearMap(out_map); 477 | vsapi->clearMap(in_map); 478 | } 479 | 480 | vsapi->freeMap(in_map); 481 | vsapi->freeMap(out_map); 482 | } 483 | 484 | if (hpasses > 0 && hradius > 0) { 485 | VSPlugin *stdplugin = vsapi->getPluginById("com.vapoursynth.std", core); 486 | 487 | VSMap *vtmp1 = vsapi->createMap(); 488 | 489 | vsapi->propSetNode(vtmp1, "clip", node, paAppend); 490 | vsapi->freeNode(node); 491 | VSMap *vtmp2 = vsapi->invoke(stdplugin, "Transpose", vtmp1); 492 | vsapi->clearMap(vtmp1); 493 | node = vsapi->propGetNode(vtmp2, "clip", 0, nullptr); 494 | vsapi->clearMap(vtmp2); 495 | 496 | std::array radius {}; 497 | for 
(unsigned plane = 0; plane < radius.size(); ++plane) { 498 | if (process[plane]) { 499 | radius[plane] = hradius; 500 | } 501 | } 502 | 503 | for (int pass = 0; pass < hpasses; ++pass) { 504 | vsapi->propSetNode(vtmp1, "clip", node, paReplace); 505 | vsapi->createFilter( 506 | vtmp1, vtmp2, "BlurV", BoxBlurVInit, BoxBlurVGetFrame, BoxBlurVFree, 507 | fmParallel, 0, new BoxBlurVData{ node, radius, (pass % 2) == 0 }, core); 508 | 509 | node = vsapi->propGetNode(vtmp2, "clip", 0, &err); 510 | vsapi->clearMap(vtmp2); 511 | vsapi->clearMap(vtmp1); 512 | } 513 | 514 | vsapi->propSetNode(vtmp2, "clip", node, paReplace); 515 | vsapi->freeNode(node); 516 | vsapi->freeMap(vtmp1); 517 | vtmp1 = vsapi->invoke(stdplugin, "Transpose", vtmp2); 518 | vsapi->freeMap(vtmp2); 519 | node = vsapi->propGetNode(vtmp1, "clip", 0, nullptr); 520 | vsapi->freeMap(vtmp1); 521 | } 522 | 523 | vsapi->propSetNode(out, "clip", node, paAppend); 524 | vsapi->freeNode(node); 525 | } 526 | 527 | #ifdef INSIDE_VS 528 | VS_EXTERNAL_API(void) vsExtBoxBlurInitialize3( 529 | #else 530 | VS_EXTERNAL_API(void) VapourSynthPluginInit( 531 | #endif 532 | VSConfigPlugin configFunc, 533 | VSRegisterFunction registerFunc, 534 | VSPlugin * plugin 535 | ) noexcept { 536 | 537 | configFunc("io.github.amusementclub.boxblur", "box", "AVX2-optimized boxfilter", VAPOURSYNTH_API_VERSION, 1, plugin); 538 | 539 | registerFunc("BlurV", 540 | "clip:clip;" 541 | "radius:int[]:opt;", 542 | BoxBlurVCreate, nullptr, plugin 543 | ); 544 | 545 | registerFunc("Blur", 546 | "clip:clip;" 547 | "planes:int[]:opt;" 548 | "hradius:int:opt;" 549 | "hpasses:int:opt;" 550 | "vradius:int:opt;" 551 | "vpasses:int:opt;", 552 | BoxBlurCreate, nullptr, plugin 553 | ); 554 | 555 | auto getVersion = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 556 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 557 | }; 558 | registerFunc("Version", "", getVersion, nullptr, plugin); 559 | } 560 | 
--------------------------------------------------------------------------------