├── source ├── config.h.in ├── vsnlm.cpp └── nlm.ispc ├── README.md ├── .github └── workflows │ ├── linux-arm64.yml │ ├── linux.yml │ └── windows.yml ├── CMakeLists.txt └── LICENSE /source/config.h.in: -------------------------------------------------------------------------------- 1 | #define VERSION "@VCS_TAG@" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # vs-nlm-ispc 2 | Non-local means denoise filter, drop-in replacement of the venerable [KNLMeansCL](https://github.com/Khanattila/KNLMeansCL), but without the OpenCL dependency (CPU only). 3 | 4 | x86 and arm are supported. 5 | 6 | ## Usage 7 | Prototype: 8 | 9 | `core.nlm_ispc.NLMeans(clip clip[, int d = 1, int a = 2, int s = 4, float h = 1.2, string channels = "AUTO", int wmode = 0, float wref = 1.0, clip rclip = None])` 10 | 11 | ## Compilation 12 | [ISPC](https://github.com/ispc/ispc) is required. 13 | 14 | ### x86 15 | ```bash 16 | cmake -S . -B build -D CMAKE_BUILD_TYPE=Release \ 17 | -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8" \ 18 | -D CMAKE_ISPC_FLAGS="--opt=fast-math" 19 | 20 | cmake --build build 21 | 22 | cmake --install build 23 | ``` 24 | 25 | ### arm 26 | ```bash 27 | cmake -S . 
-B build -D CMAKE_BUILD_TYPE=Release \ 28 | -D CMAKE_ISPC_INSTRUCTION_SETS="neon-i32x4" \ 29 | -D CMAKE_ISPC_FLAGS="--opt=fast-math" 30 | 31 | cmake --build build 32 | 33 | cmake --install build 34 | ``` 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/linux-arm64.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux, ARM64) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'source/nlm.ispc' 7 | - 'source/vsnlm.cpp' 8 | - '.github/workflows/linux-arm64.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-24.04-arm 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Download ISPC 21 | run: | 22 | curl -s -o ispc.tar.gz -LJO https://github.com/ispc/ispc/releases/download/v1.25.3/ispc-v1.25.3-linux.aarch64.tar.gz 23 | tar -xzf ispc.tar.gz 24 | mv ispc-*/ ispc/ 25 | 26 | - name: Download VapourSynth headers 27 | run: | 28 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 29 | unzip -q vs.zip 30 | mv vapoursynth*/ vapoursynth 31 | 32 | - name: Setup Ninja 33 | run: pip install ninja 34 | 35 | - name: Configure 36 | run: cmake -S . 
-B build -G Ninja -LA 37 | -D CMAKE_BUILD_TYPE=Release 38 | -D CMAKE_CXX_FLAGS="-Wall" 39 | -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc" 40 | -D CMAKE_ISPC_INSTRUCTION_SETS="neon-i32x4" 41 | -D CMAKE_ISPC_FLAGS="--opt=fast-math" 42 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 43 | 44 | - name: Build 45 | run: cmake --build build --verbose 46 | 47 | - name: Install 48 | run: cmake --install build --prefix install 49 | 50 | - name: Upload 51 | uses: actions/upload-artifact@v4 52 | if: false 53 | with: 54 | name: Linux-arm64 55 | path: install/lib/*.so 56 | 57 | -------------------------------------------------------------------------------- /.github/workflows/linux.yml: -------------------------------------------------------------------------------- 1 | name: Build (Linux) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'source/nlm.ispc' 7 | - 'source/vsnlm.cpp' 8 | - '.github/workflows/linux.yml' 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build-linux: 13 | runs-on: ubuntu-22.04 14 | steps: 15 | - name: Checkout repo 16 | uses: actions/checkout@v3 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Download ISPC 21 | run: | 22 | curl -s -o ispc.tar.gz -LJO https://github.com/ispc/ispc/releases/download/v1.20.0/ispc-v1.20.0-linux.tar.gz 23 | tar -xzf ispc.tar.gz 24 | mv ispc-*/ ispc/ 25 | 26 | - name: Download VapourSynth headers 27 | run: | 28 | wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 29 | unzip -q vs.zip 30 | mv vapoursynth*/ vapoursynth 31 | 32 | - name: Setup Ninja 33 | run: pip install ninja 34 | 35 | - name: Configure 36 | run: cmake -S . 
-B build -G Ninja -LA 37 | -D CMAKE_BUILD_TYPE=Release 38 | -D CMAKE_CXX_COMPILER=g++-12 39 | -D CMAKE_CXX_FLAGS="-Wall" 40 | -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc" 41 | -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8" 42 | -D CMAKE_ISPC_FLAGS="--opt=fast-math" 43 | -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include" 44 | 45 | - name: Build 46 | run: cmake --build build --verbose 47 | 48 | - name: Install 49 | run: cmake --install build --prefix install 50 | 51 | - name: Upload 52 | uses: actions/upload-artifact@v4 53 | if: false 54 | with: 55 | name: Linux-x64 56 | path: install/lib/*.so 57 | 58 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20.0) 2 | 3 | project(vs-nlm-ispc VERSION 0.1 LANGUAGES CXX ISPC) 4 | 5 | # The plugin is built as a single shared library. 6 | add_library(vsnlm_ispc SHARED source/vsnlm.cpp source/nlm.ispc) 7 | 8 | set_target_properties(vsnlm_ispc PROPERTIES 9 | CXX_EXTENSIONS OFF 10 | CXX_STANDARD 17 11 | CXX_STANDARD_REQUIRED ON 12 | ) 13 | 14 | # Prefer pkg-config for locating VapourSynth; VS_INCLUDE_DIR is the fallback below. 15 | find_package(PkgConfig QUIET MODULE) 16 | 17 | if(PKG_CONFIG_FOUND) 18 | pkg_search_module(VS vapoursynth) 19 | 20 | if(VS_FOUND) 21 | message(STATUS "Found VapourSynth r${VS_VERSION}") 22 | 23 | cmake_path(APPEND install_dir ${VS_LIBDIR} vapoursynth) 24 | target_include_directories(vsnlm_ispc PRIVATE ${VS_INCLUDE_DIRS}) 25 | 26 | # A DLL is a RUNTIME artifact, so give both kinds the same destination. 27 | install(TARGETS vsnlm_ispc 28 | LIBRARY DESTINATION "${install_dir}" 29 | RUNTIME DESTINATION "${install_dir}" 30 | ) 31 | endif() 32 | endif() 33 | 34 | if(NOT VS_FOUND) 35 | set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers") 36 | 37 | if(VS_INCLUDE_DIR STREQUAL "") 38 | message(WARNING "VapourSynth not found") 39 | endif() 40 | 41 | target_include_directories(vsnlm_ispc PRIVATE ${VS_INCLUDE_DIR}) 42 | 43 | install(TARGETS vsnlm_ispc LIBRARY RUNTIME) 44 | endif() 45 | 46 | # Take the plugin version from `git describe`, run in this project's own tree. 47 | find_package(Git QUIET) 48 | 49 | if(GIT_FOUND) 50 | execute_process( 51 | COMMAND ${GIT_EXECUTABLE} describe
--tags --long --always 52 | WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}" 53 | OUTPUT_VARIABLE VCS_TAG 54 | OUTPUT_STRIP_TRAILING_WHITESPACE 55 | ) 56 | endif() 57 | 58 | if(VCS_TAG) 59 | message(STATUS "vs-nlm-ispc ${VCS_TAG}") 60 | else() 61 | message(WARNING "unknown plugin version") 62 | set(VCS_TAG "unknown") 63 | endif() 64 | 65 | # config.h.in only uses @VAR@ references, so restrict substitution to that form. 66 | configure_file(source/config.h.in config.h @ONLY) 67 | 68 | target_include_directories(vsnlm_ispc PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) 69 | -------------------------------------------------------------------------------- /.github/workflows/windows.yml: -------------------------------------------------------------------------------- 1 | name: Build (Windows) 2 | 3 | on: 4 | push: 5 | paths: 6 | - 'source/nlm.ispc' 7 | - 'source/vsnlm.cpp' 8 | - '.github/workflows/windows.yml' 9 | workflow_dispatch: 10 | inputs: 11 | tag: 12 | description: 'which tag to upload to' 13 | default: '' 14 | 15 | jobs: 16 | build-windows: 17 | runs-on: windows-2022 18 | outputs: 19 | runID: ${{ steps.output.outputs.runID }} 20 | 21 | defaults: 22 | run: 23 | shell: cmd 24 | 25 | steps: 26 | - name: Checkout repo 27 | uses: actions/checkout@v3 28 | with: 29 | fetch-depth: 0 30 | 31 | - name: Setup MSVC 32 | uses: ilammy/msvc-dev-cmd@v1 33 | 34 | - name: Download VapourSynth headers 35 | run: | 36 | curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip 37 | unzip -q vs.zip 38 | mv vapoursynth-*/ vapoursynth/ 39 | 40 | - name: Download ISPC 41 | run: | 42 | curl -s -o ispc.zip -LJO https://github.com/ispc/ispc/releases/download/v1.20.0/ispc-v1.20.0-windows.zip 43 | unzip -q ispc.zip 44 | mv ispc-*/ ispc/ 45 | tree ispc 46 | 47 | - name: Configure 48 | shell: bash 49 | run: cmake -S . 
-B build -G Ninja 50 | -D VS_INCLUDE_DIR="$(pwd)\vapoursynth\include" 51 | -D CMAKE_BUILD_TYPE=Release 52 | -D CMAKE_CXX_COMPILER="clang++" 53 | -D CMAKE_CXX_FLAGS="-Wall" 54 | -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc.exe" 55 | -D CMAKE_ISPC_FLAGS="--opt=fast-math" 56 | -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8" 57 | -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded 58 | 59 | - name: Build 60 | run: cmake --build build --verbose 61 | 62 | - name: Install 63 | run: | 64 | cmake --install build --prefix install 65 | mkdir artifact 66 | copy install\bin\vsnlm_ispc.dll artifact\ 67 | 68 | - name: Upload 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: Windows-x64 72 | path: artifact 73 | 74 | - name: Describe 75 | run: git describe --tags --long 76 | 77 | - name: Compress artifact for release 78 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 79 | run: | 80 | cd artifact 81 | 7z a -t7z -mx=7 ../vs-nlm-ispc-windows-x64.${{ github.event.inputs.tag }}.7z . 82 | 83 | - name: Release 84 | uses: softprops/action-gh-release@v1 85 | if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != '' 86 | with: 87 | tag_name: ${{ github.event.inputs.tag }} 88 | files: vs-nlm-ispc-windows-x64.${{ github.event.inputs.tag }}.7z 89 | fail_on_unmatched_files: true 90 | generate_release_notes: false 91 | prerelease: true 92 | 93 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 
12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail.
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | -------------------------------------------------------------------------------- /source/vsnlm.cpp: -------------------------------------------------------------------------------- 1 | // based on KNLMeansCL by Khanattila 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | #include // generated by the ispc compiler 21 | 22 | #include // generated by cmake and git 23 | 24 | namespace { 25 | enum struct ChannelMode { Y, UV, YUV, RGB }; 26 | 27 | struct NLMData { 28 | VSNodeRef * node; // clip 29 | const VSVideoInfo *vi; 30 | int d; 31 | int a; 32 | int s; 33 | float h; 34 | ChannelMode channels; 35 | decltype(&ispc::nlmVerticalWelsch) nlm_vertical; // wmode 36 | float wref; 37 | VSNodeRef * ref_node; // rclip 38 | 39 | // run-time resources 40 | std::shared_mutex workspaces_lock; 41 | std::unordered_map workspaces; 42 | }; 43 | } 44 | 45 | template 46 | static inline auto castVoidPtr(T * p) noexcept { 47 | if constexpr (std::is_const_v) { 48 | return reinterpret_cast(p); 49 | } else { 50 | return reinterpret_cast(p); 51 | } 52 | } 53 | 54 | template 55 | static inline std::array castPtrs(std::array ptrs) { 56 | return { 57 | (T1 *) ptrs[0], 58 | (T1 *) ptrs[1], 59 | (T1 *) ptrs[2], 60 | }; 61 | } 62 | 63 | template 64 | static inline constexpr T square(T x) noexcept { 65 | return x * x; 66 | } 67 | 68 | // T: (const) VSFrameRef 69 | template 70 | static inline auto getPtrs( 71 | T * frame, 72 | ChannelMode channels, 73 | const VSAPI * vsapi 74 | ) noexcept { 75 | 76 | using value_type = std::conditional_t, const void, void>; 77 | 78 | std::array ptrs {}; 79 | 80 | auto get_ptr = [frame, vsapi](int plane) { 81 | if constexpr (std::is_const_v) { 82 | return castVoidPtr(vsapi->getReadPtr(frame, plane)); 83 | } else { 84 | return castVoidPtr(vsapi->getWritePtr(frame, plane)); 85 | } 86 | }; 87 | 88 
| switch (channels) { 89 | case ChannelMode::Y: 90 | ptrs[0] = get_ptr(0); 91 | break; 92 | case ChannelMode::UV: 93 | ptrs[1] = get_ptr(1); 94 | ptrs[2] = get_ptr(2); 95 | break; 96 | case ChannelMode::YUV: 97 | case ChannelMode::RGB: 98 | ptrs[0] = get_ptr(0); 99 | ptrs[1] = get_ptr(1); 100 | ptrs[2] = get_ptr(2); 101 | break; 102 | } 103 | 104 | return ptrs; 105 | } 106 | 107 | static void VS_CC nlmInit( 108 | VSMap * in, 109 | VSMap * out, 110 | void ** instanceData, 111 | VSNode * node, 112 | VSCore * core, 113 | const VSAPI * vsapi 114 | ) noexcept { 115 | 116 | const auto * d = reinterpret_cast(*instanceData); 117 | vsapi->setVideoInfo(vsapi->getVideoInfo(d->node), 1, node); 118 | } 119 | 120 | static inline void nlmDistanceDispatch_f32( 121 | float * temp0, 122 | std::array centerp, 123 | std::array neighborp, 124 | int offset_x, 125 | int offset_y, 126 | int width, 127 | int height, 128 | int stride, 129 | ChannelMode channels 130 | ) noexcept { 131 | 132 | switch (channels) { 133 | case ChannelMode::Y: 134 | ispc::nlmDistanceLuma_f32( 135 | temp0, 136 | centerp[0], 137 | neighborp[0], 138 | offset_x, offset_y, 139 | width, height, stride 140 | ); 141 | break; 142 | case ChannelMode::UV: 143 | ispc::nlmDistanceChroma_f32( 144 | temp0, 145 | centerp[1], centerp[2], 146 | neighborp[1], neighborp[2], 147 | offset_x, offset_y, 148 | width, height, stride 149 | ); 150 | break; 151 | case ChannelMode::YUV: 152 | ispc::nlmDistanceYUV_f32( 153 | temp0, 154 | centerp[0], centerp[1], centerp[2], 155 | neighborp[0], neighborp[1], neighborp[2], 156 | offset_x, offset_y, 157 | width, height, stride 158 | ); 159 | break; 160 | case ChannelMode::RGB: 161 | ispc::nlmDistanceRGB_f32( 162 | temp0, 163 | centerp[0], centerp[1], centerp[2], 164 | neighborp[0], neighborp[1], neighborp[2], 165 | offset_x, offset_y, 166 | width, height, stride 167 | ); 168 | break; 169 | } 170 | } 171 | 172 | static inline void nlmDistanceDispatch_u8( 173 | float * temp0, 174 | std::array 
centerp, 175 | std::array neighborp, 176 | int offset_x, 177 | int offset_y, 178 | int width, 179 | int height, 180 | int stride, 181 | ChannelMode channels, 182 | float inv_divisor 183 | ) noexcept { 184 | 185 | switch (channels) { 186 | case ChannelMode::Y: 187 | ispc::nlmDistanceLuma_u8( 188 | temp0, 189 | centerp[0], 190 | neighborp[0], 191 | offset_x, offset_y, 192 | width, height, stride, 193 | inv_divisor 194 | ); 195 | break; 196 | case ChannelMode::UV: 197 | ispc::nlmDistanceChroma_u8( 198 | temp0, 199 | centerp[1], centerp[2], 200 | neighborp[1], neighborp[2], 201 | offset_x, offset_y, 202 | width, height, stride, 203 | inv_divisor 204 | ); 205 | break; 206 | case ChannelMode::YUV: 207 | ispc::nlmDistanceYUV_u8( 208 | temp0, 209 | centerp[0], centerp[1], centerp[2], 210 | neighborp[0], neighborp[1], neighborp[2], 211 | offset_x, offset_y, 212 | width, height, stride, 213 | inv_divisor 214 | ); 215 | break; 216 | case ChannelMode::RGB: 217 | ispc::nlmDistanceRGB_u8( 218 | temp0, 219 | centerp[0], centerp[1], centerp[2], 220 | neighborp[0], neighborp[1], neighborp[2], 221 | offset_x, offset_y, 222 | width, height, stride, 223 | inv_divisor 224 | ); 225 | break; 226 | } 227 | } 228 | 229 | static inline void nlmDistanceDispatch_u16( 230 | float * temp0, 231 | std::array centerp, 232 | std::array neighborp, 233 | int offset_x, 234 | int offset_y, 235 | int width, 236 | int height, 237 | int stride, 238 | ChannelMode channels, 239 | float inv_divisor 240 | ) noexcept { 241 | 242 | switch (channels) { 243 | case ChannelMode::Y: 244 | ispc::nlmDistanceLuma_u16( 245 | temp0, 246 | centerp[0], 247 | neighborp[0], 248 | offset_x, offset_y, 249 | width, height, stride, 250 | inv_divisor 251 | ); 252 | break; 253 | case ChannelMode::UV: 254 | ispc::nlmDistanceChroma_u16( 255 | temp0, 256 | centerp[1], centerp[2], 257 | neighborp[1], neighborp[2], 258 | offset_x, offset_y, 259 | width, height, stride, 260 | inv_divisor 261 | ); 262 | break; 263 | case 
ChannelMode::YUV: 264 | ispc::nlmDistanceYUV_u16( 265 | temp0, 266 | centerp[0], centerp[1], centerp[2], 267 | neighborp[0], neighborp[1], neighborp[2], 268 | offset_x, offset_y, 269 | width, height, stride, 270 | inv_divisor 271 | ); 272 | break; 273 | case ChannelMode::RGB: 274 | ispc::nlmDistanceRGB_u16( 275 | temp0, 276 | centerp[0], centerp[1], centerp[2], 277 | neighborp[0], neighborp[1], neighborp[2], 278 | offset_x, offset_y, 279 | width, height, stride, 280 | inv_divisor 281 | ); 282 | break; 283 | } 284 | } 285 | 286 | static inline void nlmDistance( 287 | float * temp0, 288 | std::array centerp, 289 | std::array neighborp, 290 | int offset_x, 291 | int offset_y, 292 | int width, 293 | int height, 294 | int stride, 295 | ChannelMode channels, 296 | int bits 297 | ) noexcept { 298 | 299 | if (bits == 32) { 300 | nlmDistanceDispatch_f32( 301 | temp0, 302 | castPtrs(centerp), castPtrs(neighborp), 303 | offset_x, offset_y, 304 | width, height, stride, channels 305 | ); 306 | } else if (bits <= 8) { 307 | float inv_divisor = 1.0f / ((1 << bits) - 1); 308 | nlmDistanceDispatch_u8( 309 | temp0, 310 | castPtrs(centerp), castPtrs(neighborp), 311 | offset_x, offset_y, 312 | width, height, stride, channels, inv_divisor 313 | ); 314 | } else if (bits <= 16) { 315 | float inv_divisor = 1.0f / ((1 << bits) - 1); 316 | nlmDistanceDispatch_u16( 317 | temp0, 318 | castPtrs(centerp), castPtrs(neighborp), 319 | offset_x, offset_y, 320 | width, height, stride, channels, inv_divisor 321 | ); 322 | } else { 323 | assert(false); 324 | } 325 | } 326 | 327 | static inline void nlmAccumulationDispatch_f32( 328 | float * weightp, 329 | std::array wdstp, 330 | float * max_weightp, 331 | std::array srcp_bwd, 332 | std::array srcp_fwd, 333 | const float * temp_bwd, 334 | const float * temp_fwd, 335 | int offset_x, 336 | int offset_y, 337 | int width, 338 | int height, 339 | int stride, 340 | ChannelMode channels 341 | ) noexcept { 342 | 343 | switch (channels) { 344 | case 
ChannelMode::Y: 345 | ispc::nlmAccumulationCh1_f32( 346 | weightp, wdstp[0], max_weightp, 347 | srcp_bwd[0], 348 | srcp_fwd[0], 349 | temp_bwd, temp_fwd, 350 | offset_x, offset_y, 351 | width, height, stride 352 | ); 353 | break; 354 | case ChannelMode::UV: 355 | ispc::nlmAccumulationCh2_f32( 356 | weightp, wdstp[0], wdstp[1], max_weightp, 357 | srcp_bwd[1], srcp_bwd[2], 358 | srcp_fwd[1], srcp_fwd[2], 359 | temp_bwd, temp_fwd, 360 | offset_x, offset_y, 361 | width, height, stride 362 | ); 363 | break; 364 | case ChannelMode::YUV: 365 | case ChannelMode::RGB: 366 | ispc::nlmAccumulationCh3_f32( 367 | weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp, 368 | srcp_bwd[0], srcp_bwd[1], srcp_bwd[2], 369 | srcp_fwd[0], srcp_fwd[1], srcp_fwd[2], 370 | temp_bwd, temp_fwd, 371 | offset_x, offset_y, 372 | width, height, stride 373 | ); 374 | break; 375 | } 376 | } 377 | 378 | static inline void nlmAccumulationDispatch_u8( 379 | float * weightp, 380 | std::array wdstp, 381 | float * max_weightp, 382 | std::array srcp_bwd, 383 | std::array srcp_fwd, 384 | const float * temp_bwd, 385 | const float * temp_fwd, 386 | int offset_x, 387 | int offset_y, 388 | int width, 389 | int height, 390 | int stride, 391 | ChannelMode channels 392 | ) noexcept { 393 | 394 | switch (channels) { 395 | case ChannelMode::Y: 396 | ispc::nlmAccumulationCh1_u8( 397 | weightp, wdstp[0], max_weightp, 398 | srcp_bwd[0], 399 | srcp_fwd[0], 400 | temp_bwd, temp_fwd, 401 | offset_x, offset_y, 402 | width, height, stride 403 | ); 404 | break; 405 | case ChannelMode::UV: 406 | ispc::nlmAccumulationCh2_u8( 407 | weightp, wdstp[0], wdstp[1], max_weightp, 408 | srcp_bwd[1], srcp_bwd[2], 409 | srcp_fwd[1], srcp_fwd[2], 410 | temp_bwd, temp_fwd, 411 | offset_x, offset_y, 412 | width, height, stride 413 | ); 414 | break; 415 | case ChannelMode::YUV: 416 | case ChannelMode::RGB: 417 | ispc::nlmAccumulationCh3_u8( 418 | weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp, 419 | srcp_bwd[0], srcp_bwd[1], srcp_bwd[2], 
420 | srcp_fwd[0], srcp_fwd[1], srcp_fwd[2], 421 | temp_bwd, temp_fwd, 422 | offset_x, offset_y, 423 | width, height, stride 424 | ); 425 | break; 426 | } 427 | } 428 | 429 | static inline void nlmAccumulationDispatch_u16( 430 | float * weightp, 431 | std::array wdstp, 432 | float * max_weightp, 433 | std::array srcp_bwd, 434 | std::array srcp_fwd, 435 | const float * temp_bwd, 436 | const float * temp_fwd, 437 | int offset_x, 438 | int offset_y, 439 | int width, 440 | int height, 441 | int stride, 442 | ChannelMode channels 443 | ) noexcept { 444 | 445 | switch (channels) { 446 | case ChannelMode::Y: 447 | ispc::nlmAccumulationCh1_u16( 448 | weightp, wdstp[0], max_weightp, 449 | srcp_bwd[0], 450 | srcp_fwd[0], 451 | temp_bwd, temp_fwd, 452 | offset_x, offset_y, 453 | width, height, stride 454 | ); 455 | break; 456 | case ChannelMode::UV: 457 | ispc::nlmAccumulationCh2_u16( 458 | weightp, wdstp[0], wdstp[1], max_weightp, 459 | srcp_bwd[1], srcp_bwd[2], 460 | srcp_fwd[1], srcp_fwd[2], 461 | temp_bwd, temp_fwd, 462 | offset_x, offset_y, 463 | width, height, stride 464 | ); 465 | break; 466 | case ChannelMode::YUV: 467 | case ChannelMode::RGB: 468 | ispc::nlmAccumulationCh3_u16( 469 | weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp, 470 | srcp_bwd[0], srcp_bwd[1], srcp_bwd[2], 471 | srcp_fwd[0], srcp_fwd[1], srcp_fwd[2], 472 | temp_bwd, temp_fwd, 473 | offset_x, offset_y, 474 | width, height, stride 475 | ); 476 | break; 477 | } 478 | } 479 | 480 | static inline void nlmAccumulation( 481 | float * weightp, 482 | std::array wdstp, 483 | float * max_weightp, 484 | std::array srcp_bwd, 485 | std::array srcp_fwd, 486 | const float * temp_bwd, 487 | const float * temp_fwd, 488 | int offset_x, 489 | int offset_y, 490 | int width, 491 | int height, 492 | int stride, 493 | ChannelMode channels, 494 | int bits 495 | ) noexcept { 496 | 497 | if (bits == 32) { 498 | nlmAccumulationDispatch_f32( 499 | weightp, wdstp, max_weightp, 500 | castPtrs(srcp_bwd), castPtrs(srcp_fwd), 
temp_bwd, temp_fwd, 501 | offset_x, offset_y, width, height, stride, channels 502 | ); 503 | } else if (bits <= 8) { 504 | nlmAccumulationDispatch_u8( 505 | weightp, wdstp, max_weightp, 506 | castPtrs(srcp_bwd), castPtrs(srcp_fwd), temp_bwd, temp_fwd, 507 | offset_x, offset_y, width, height, stride, channels 508 | ); 509 | } else if (bits <= 16) { 510 | nlmAccumulationDispatch_u16( 511 | weightp, wdstp, max_weightp, 512 | castPtrs(srcp_bwd), castPtrs(srcp_fwd), temp_bwd, temp_fwd, 513 | offset_x, offset_y, width, height, stride, channels 514 | ); 515 | } else { 516 | assert(false); 517 | } 518 | } 519 | 520 | static inline void nlmFinishDispatch_f32( 521 | std::array dstp, 522 | std::array srcp, 523 | const float * weightp, 524 | std::array wdstp, 525 | const float * max_weightp, 526 | float wref, 527 | int width, 528 | int height, 529 | int stride, 530 | ChannelMode channels 531 | ) noexcept { 532 | 533 | switch (channels) { 534 | case ChannelMode::Y: 535 | ispc::nlmFinishCh1_f32( 536 | dstp[0], 537 | srcp[0], 538 | weightp, wdstp[0], 539 | max_weightp, wref, 540 | width, height, stride 541 | ); 542 | break; 543 | case ChannelMode::UV: 544 | ispc::nlmFinishCh2_f32( 545 | dstp[1], dstp[2], 546 | srcp[1], srcp[2], 547 | weightp, wdstp[0], wdstp[1], 548 | max_weightp, wref, 549 | width, height, stride 550 | ); 551 | break; 552 | case ChannelMode::YUV: 553 | case ChannelMode::RGB: 554 | ispc::nlmFinishCh3_f32( 555 | dstp[0], dstp[1], dstp[2], 556 | srcp[0], srcp[1], srcp[2], 557 | weightp, wdstp[0], wdstp[1], wdstp[2], 558 | max_weightp, wref, 559 | width, height, stride 560 | ); 561 | break; 562 | } 563 | } 564 | 565 | static inline void nlmFinishDispatch_u8( 566 | std::array dstp, 567 | std::array srcp, 568 | const float * weightp, 569 | std::array wdstp, 570 | const float * max_weightp, 571 | float wref, 572 | int width, 573 | int height, 574 | int stride, 575 | ChannelMode channels, 576 | int peak 577 | ) noexcept { 578 | 579 | switch (channels) { 580 | case 
ChannelMode::Y: 581 | ispc::nlmFinishCh1_u8( 582 | dstp[0], 583 | srcp[0], 584 | weightp, wdstp[0], 585 | max_weightp, wref, 586 | width, height, stride, 587 | peak 588 | ); 589 | break; 590 | case ChannelMode::UV: 591 | ispc::nlmFinishCh2_u8( 592 | dstp[1], dstp[2], 593 | srcp[1], srcp[2], 594 | weightp, wdstp[0], wdstp[1], 595 | max_weightp, wref, 596 | width, height, stride, 597 | peak 598 | ); 599 | break; 600 | case ChannelMode::YUV: 601 | case ChannelMode::RGB: 602 | ispc::nlmFinishCh3_u8( 603 | dstp[0], dstp[1], dstp[2], 604 | srcp[0], srcp[1], srcp[2], 605 | weightp, wdstp[0], wdstp[1], wdstp[2], 606 | max_weightp, wref, 607 | width, height, stride, 608 | peak 609 | ); 610 | break; 611 | } 612 | } 613 | 614 | static inline void nlmFinishDispatch_u16( 615 | std::array dstp, 616 | std::array srcp, 617 | const float * weightp, 618 | std::array wdstp, 619 | const float * max_weightp, 620 | float wref, 621 | int width, 622 | int height, 623 | int stride, 624 | ChannelMode channels, 625 | int peak 626 | ) noexcept { 627 | 628 | switch (channels) { 629 | case ChannelMode::Y: 630 | ispc::nlmFinishCh1_u16( 631 | dstp[0], 632 | srcp[0], 633 | weightp, wdstp[0], 634 | max_weightp, wref, 635 | width, height, stride, 636 | peak 637 | ); 638 | break; 639 | case ChannelMode::UV: 640 | ispc::nlmFinishCh2_u16( 641 | dstp[1], dstp[2], 642 | srcp[1], srcp[2], 643 | weightp, wdstp[0], wdstp[1], 644 | max_weightp, wref, 645 | width, height, stride, 646 | peak 647 | ); 648 | break; 649 | case ChannelMode::YUV: 650 | case ChannelMode::RGB: 651 | ispc::nlmFinishCh3_u16( 652 | dstp[0], dstp[1], dstp[2], 653 | srcp[0], srcp[1], srcp[2], 654 | weightp, wdstp[0], wdstp[1], wdstp[2], 655 | max_weightp, wref, 656 | width, height, stride, 657 | peak 658 | ); 659 | break; 660 | } 661 | } 662 | 663 | static inline void nlmFinish( 664 | std::array dstp, 665 | std::array srcp, 666 | const float * weightp, 667 | std::array wdstp, 668 | const float * max_weightp, 669 | float wref, 670 | int 
width, 671 | int height, 672 | int stride, 673 | ChannelMode channels, 674 | int bits 675 | ) noexcept { 676 | 677 | if (bits == 32) { 678 | nlmFinishDispatch_f32( 679 | castPtrs(dstp), castPtrs(srcp), 680 | weightp, wdstp, max_weightp, wref, width, height, stride, channels 681 | ); 682 | } else if (bits <= 8) { 683 | int peak = (1 << bits) - 1; 684 | nlmFinishDispatch_u8( 685 | castPtrs(dstp), castPtrs(srcp), 686 | weightp, wdstp, max_weightp, wref, width, height, stride, channels, peak 687 | ); 688 | } else if (bits <= 16) { 689 | int peak = (1 << bits) - 1; 690 | nlmFinishDispatch_u16( 691 | castPtrs(dstp), castPtrs(srcp), 692 | weightp, wdstp, max_weightp, wref, width, height, stride, channels, peak 693 | ); 694 | } else { 695 | assert(false); 696 | } 697 | } 698 | 699 | static const VSFrameRef *VS_CC nlmGetFrame( 700 | int n, 701 | int activationReason, 702 | void ** instanceData, 703 | void ** frameData, 704 | VSFrameContext * frameCtx, 705 | VSCore * core, 706 | const VSAPI * vsapi 707 | ) noexcept { 708 | 709 | auto * d = reinterpret_cast(*instanceData); 710 | 711 | if (activationReason == arInitial) { 712 | int start = std::max(0, n - d->d); 713 | int end = std::min(n + d->d, d->vi->numFrames - 1); 714 | for (int i = start; i <= end; i++) { 715 | vsapi->requestFrameFilter(i, d->node, frameCtx); 716 | if (d->ref_node) { 717 | vsapi->requestFrameFilter(i, d->ref_node, frameCtx); 718 | } 719 | } 720 | return nullptr; 721 | } else if (activationReason != arAllFramesReady) { 722 | return nullptr; 723 | } 724 | 725 | // activationReason == arAllFramesReady 726 | 727 | int nlm_d = d->d; 728 | int nlm_a = d->a; 729 | int nlm_s = d->s; 730 | float nlm_h2_inv_norm = square(255.0f) / (3.0f * square(d->h) * square(2 * nlm_s + 1)); 731 | float nlm_wref = d->wref; 732 | ChannelMode channels = d->channels; 733 | 734 | const auto & ref_node = d->ref_node ? 
d->ref_node : d->node; 735 | auto ref_frame = vsapi->getFrameFilter(n, ref_node, frameCtx); 736 | 737 | int bits = d->vi->format->bitsPerSample; 738 | int width, height, stride; // dimensions of the plane to be processed, not the video dimension 739 | if (channels == ChannelMode::UV) { 740 | width = d->vi->width >> d->vi->format->subSamplingW; 741 | height = d->vi->height >> d->vi->format->subSamplingH; 742 | stride = vsapi->getStride(ref_frame, 1) / d->vi->format->bytesPerSample; 743 | } else { 744 | width = d->vi->width; 745 | height = d->vi->height; 746 | stride = vsapi->getStride(ref_frame, 0) / d->vi->format->bytesPerSample; 747 | } 748 | 749 | int size = height * stride; // size of each plane in quad-bytes 750 | // number of input channels 751 | int num_input_channels = [channels]() { 752 | if (channels == ChannelMode::Y) { 753 | return 1; 754 | } else if (channels == ChannelMode::UV) { 755 | return 2; 756 | } else { 757 | // channels == ChannelMode::YUV || channels == ChannelMode::RGB 758 | return 3; 759 | } 760 | }(); 761 | // size in quad-bytes: size * (4 + num_input_channels + (nlm_d != 0)) + width 762 | float * workspace; 763 | { 764 | auto thread_id = std::this_thread::get_id(); 765 | d->workspaces_lock.lock_shared(); 766 | bool init = true; 767 | try { 768 | const auto & const_workspaces = d->workspaces; 769 | workspace = const_workspaces.at(thread_id); 770 | } catch (const std::out_of_range &) { 771 | init = false; 772 | } 773 | d->workspaces_lock.unlock_shared(); 774 | 775 | if (!init) { 776 | auto workspace_size = size * (4 + num_input_channels + (nlm_d != 0)) + width; 777 | auto workspace_bytes = workspace_size * sizeof(float); 778 | workspace = vs_aligned_malloc(workspace_bytes, 256); 779 | 780 | if (!workspace) { 781 | vsapi->freeFrame(ref_frame); 782 | vsapi->setFilterError("nlm_ispc: malloc() failed", frameCtx); 783 | return nullptr; 784 | } 785 | 786 | std::lock_guard _ { d->workspaces_lock }; 787 | d->workspaces.emplace(thread_id, workspace); 
788 | } 789 | } 790 | 791 | // zero-initialize aggregation buffers 792 | std::memset(workspace, 0, (1 + num_input_channels) * size * sizeof(float)); 793 | // stores the sum of weights of each pixel 794 | float * weightp = workspace; 795 | std::array wdstp { 796 | // stores the weighted sum of pixel values of the first processed plane 797 | workspace + size, 798 | // stores the weighted sum of pixel values of the second processed plane 799 | num_input_channels <= 1 ? nullptr : workspace + 2 * size, 800 | // stores the weighted sum of pixel values of the third processed plane 801 | num_input_channels <= 2 ? nullptr : workspace + 3 * size 802 | }; 803 | 804 | // stores the maximum weight encountered of each pixel 805 | float * max_weightp = workspace + (1 + num_input_channels) * size; 806 | for (int i = 0; i < size; i++) { 807 | max_weightp[i] = std::numeric_limits::epsilon(); 808 | } 809 | 810 | // temporary storage for the calculation of patch distances 811 | float * temp = workspace + (2 + num_input_channels) * size; 812 | float * temp_bwd = workspace + (3 + num_input_channels) * size; 813 | float * temp_fwd = nlm_d == 0 ? 
nullptr : workspace + (4 + num_input_channels) * size; 814 | 815 | // buffer for the vertical box filter during patch distance calculation 816 | // size in quad-bytes: width 817 | float * buffer = workspace + (4 + num_input_channels + (nlm_d != 0)) * size; 818 | 819 | std::array refp { getPtrs(ref_frame, channels, vsapi) }; 820 | 821 | for (int i = -nlm_d; i <= 0; i++) { 822 | auto bwd_n = std::max(n + i, 0); 823 | auto fwd_n = std::min(n - i, d->vi->numFrames - 1); 824 | auto src_frame_bwd = vsapi->getFrameFilter(bwd_n, d->node, frameCtx); 825 | auto src_frame_fwd = vsapi->getFrameFilter(fwd_n, d->node, frameCtx); 826 | auto ref_frame_bwd = vsapi->getFrameFilter(bwd_n, ref_node, frameCtx); 827 | auto ref_frame_fwd = vsapi->getFrameFilter(fwd_n, ref_node, frameCtx); 828 | 829 | std::array srcp_bwd { getPtrs(src_frame_bwd, channels, vsapi) }; 830 | std::array srcp_fwd { getPtrs(src_frame_fwd, channels, vsapi) }; 831 | std::array refp_bwd { getPtrs(ref_frame_bwd, channels, vsapi) }; 832 | std::array refp_fwd { getPtrs(ref_frame_fwd, channels, vsapi) }; 833 | 834 | for (int offset_y = -nlm_a; offset_y <= nlm_a; offset_y++) { 835 | for (int offset_x = -nlm_a; offset_x <= nlm_a; offset_x++) { 836 | if (i * square(2 * nlm_a + 1) + offset_y * (2 * nlm_a + 1) + offset_x >= 0) { 837 | continue; 838 | } 839 | 840 | nlmDistance( 841 | temp_bwd, 842 | refp, refp_bwd, 843 | offset_x, offset_y, width, height, stride, channels, bits 844 | ); 845 | 846 | ispc::nlmHorizontal( 847 | temp, 848 | temp_bwd, 849 | nlm_s, width, height, stride 850 | ); 851 | 852 | d->nlm_vertical( 853 | temp_bwd, 854 | temp, 855 | nlm_s, nlm_h2_inv_norm, width, height, stride, buffer 856 | ); 857 | 858 | // jump at the end of this basic block 859 | if (i == 0) { 860 | // bwd == fwd 861 | nlmAccumulation( 862 | weightp, wdstp, max_weightp, 863 | srcp_bwd, srcp_bwd, temp_bwd, temp_bwd, 864 | offset_x, offset_y, width, height, stride, channels, bits 865 | ); 866 | continue; 867 | } 868 | 869 | // i != 0 870 
| nlmDistance( 871 | temp_fwd, 872 | refp_fwd, refp, 873 | offset_x, offset_y, width, height, stride, channels, bits 874 | ); 875 | 876 | ispc::nlmHorizontal( 877 | temp, 878 | temp_fwd, 879 | nlm_s, width, height, stride 880 | ); 881 | 882 | d->nlm_vertical( 883 | temp_fwd, 884 | temp, 885 | nlm_s, nlm_h2_inv_norm, width, height, stride, buffer 886 | ); 887 | 888 | nlmAccumulation( 889 | weightp, wdstp, max_weightp, 890 | srcp_bwd, srcp_fwd, temp_bwd, temp_fwd, 891 | offset_x, offset_y, width, height, stride, channels, bits 892 | ); 893 | } 894 | } 895 | 896 | vsapi->freeFrame(src_frame_fwd); 897 | vsapi->freeFrame(src_frame_bwd); 898 | vsapi->freeFrame(ref_frame_fwd); 899 | vsapi->freeFrame(ref_frame_bwd); 900 | } 901 | 902 | vsapi->freeFrame(ref_frame); 903 | 904 | auto src_frame = vsapi->getFrameFilter(n, d->node, frameCtx); 905 | std::array srcp { getPtrs(src_frame, channels, vsapi) }; 906 | 907 | VSFrameRef * dst_frame; 908 | if (channels == ChannelMode::Y && d->vi->format->numPlanes > 1) { 909 | const VSFrameRef * fr[3] { nullptr, src_frame, src_frame }; 910 | constexpr int pl[3] { 0, 1, 2 }; 911 | dst_frame = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src_frame, core); 912 | } else if (channels == ChannelMode::UV && d->vi->format->numPlanes > 1) { 913 | const VSFrameRef * fr[3] { src_frame, nullptr, nullptr }; 914 | constexpr int pl[3] { 0, 1, 2 }; 915 | dst_frame = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src_frame, core); 916 | } else { 917 | dst_frame = vsapi->newVideoFrame(d->vi->format, d->vi->width, d->vi->height, src_frame, core); 918 | } 919 | std::array dstp { getPtrs(dst_frame, channels, vsapi) }; 920 | 921 | nlmFinish(dstp, srcp, weightp, wdstp, max_weightp, nlm_wref, width, height, stride, channels, bits); 922 | 923 | vsapi->freeFrame(src_frame); 924 | 925 | return dst_frame; 926 | } 927 | 928 | static void VS_CC nlmFree( 929 | void * instanceData, 930 | VSCore * core, 931 | const 
VSAPI * vsapi 932 | ) noexcept { 933 | 934 | auto * d = reinterpret_cast(instanceData); 935 | 936 | vsapi->freeNode(d->node); 937 | if (d->ref_node) { 938 | vsapi->freeNode(d->ref_node); 939 | } 940 | 941 | for (const auto & [_, ptr] : d->workspaces) { 942 | vs_aligned_free(ptr); 943 | } 944 | 945 | delete d; 946 | } 947 | 948 | static void VS_CC nlmCreate( 949 | const VSMap * in, 950 | VSMap * out, 951 | void * userData, 952 | VSCore * core, 953 | const VSAPI * vsapi 954 | ) noexcept { 955 | 956 | auto d = std::make_unique(); 957 | 958 | d->node = vsapi->propGetNode(in, "clip", 0, nullptr); 959 | d->vi = vsapi->getVideoInfo(d->node); 960 | 961 | auto set_error = [vsapi, out, &d](const char * error_message) -> void { 962 | vsapi->setError(out, error_message); 963 | vsapi->freeNode(d->node); 964 | }; 965 | 966 | if ((d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) || 967 | (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32) 968 | ) { 969 | return set_error("only 1-16 bit integer or 32-bit float supported"); 970 | } 971 | 972 | int err; 973 | 974 | d->d = int64ToIntS(vsapi->propGetInt(in, "d", 0, &err)); 975 | if (err) { 976 | d->d = 1; 977 | } 978 | if (d->d < 0) { 979 | return set_error("\"d\" must be non-negative"); 980 | } 981 | 982 | d->a = int64ToIntS(vsapi->propGetInt(in, "a", 0, &err)); 983 | if (err) { 984 | d->a = 2; 985 | } 986 | if (d->a <= 0) { 987 | return set_error("\"a\" must be positive"); 988 | } 989 | 990 | d->s = int64ToIntS(vsapi->propGetInt(in, "s", 0, &err)); 991 | if (err) { 992 | d->s = 4; 993 | } 994 | if (d->s < 0) { 995 | return set_error("\"s\" must be non-negative"); 996 | } 997 | 998 | d->h = static_cast(vsapi->propGetFloat(in, "h", 0, &err)); 999 | if (err) { 1000 | d->h = 1.2f; 1001 | } 1002 | if (d->h <= 0.0f) { 1003 | return set_error("\"h\" must be positive"); 1004 | } 1005 | 1006 | auto wmode = vsapi->propGetInt(in, "wmode", 0, &err); 1007 | if (err) { 1008 | wmode = 0; 1009 | 
} 1010 | if (wmode < 0 || wmode > 3) { 1011 | return set_error("\"wmode\" must be 0, 1, 2 or 3"); 1012 | } 1013 | decltype(d->nlm_vertical) nlmVerticalKernels[] { 1014 | &ispc::nlmVerticalWelsch, 1015 | &ispc::nlmVerticalBisquareA, 1016 | &ispc::nlmVerticalBisquareB, 1017 | &ispc::nlmVerticalBisquareC 1018 | }; 1019 | d->nlm_vertical = nlmVerticalKernels[wmode]; 1020 | 1021 | auto channels = vsapi->propGetData(in, "channels", 0, &err); 1022 | if (err) { 1023 | channels = "AUTO"; 1024 | } 1025 | auto channels_len = std::strlen(channels); 1026 | if (channels_len == 1 && *channels == 'Y') { 1027 | d->channels = ChannelMode::Y; 1028 | } else if (channels_len == 2 && std::strncmp(channels, "UV", 2) == 0) { 1029 | d->channels = ChannelMode::UV; 1030 | } else if (channels_len == 3 && std::strncmp(channels, "YUV", 3) == 0) { 1031 | d->channels = ChannelMode::YUV; 1032 | } else if (channels_len == 3 && std::strncmp(channels, "RGB", 3) == 0) { 1033 | d->channels = ChannelMode::RGB; 1034 | } else if (channels_len == 4 && std::strncmp(channels, "AUTO", 4) == 0) { 1035 | if (d->vi->format->colorFamily == cmRGB) { 1036 | d->channels = ChannelMode::RGB; 1037 | } else { 1038 | d->channels = ChannelMode::Y; 1039 | } 1040 | } else { 1041 | return set_error("\"channels\" must be \"Y\", \"UV\', \"YUV\", \"RGB\" or \"AUTO\""); 1042 | } 1043 | 1044 | if (d->channels == ChannelMode::Y) { 1045 | if (d->vi->format->colorFamily != cmGray && d->vi->format->colorFamily != cmYUV) { 1046 | return set_error("color family must be Gray or YUV for \"channels\" == \"Y\""); 1047 | } 1048 | } else if (d->channels == ChannelMode::UV) { 1049 | if (d->vi->format->colorFamily != cmYUV) { 1050 | return set_error("color family must be YUV for \"channels\" == \"UV\""); 1051 | } 1052 | } else if (d->channels == ChannelMode::YUV) { 1053 | if (d->vi->format->colorFamily != cmYUV || d->vi->format->subSamplingW || d->vi->format->subSamplingH) { 1054 | return set_error("color family must be YUV444 for \"channels\" 
== \"YUV\""); 1055 | } 1056 | } else if (d->channels == ChannelMode::RGB) { 1057 | if (d->vi->format->colorFamily != cmRGB) { 1058 | return set_error("color family must be RGB for \"channels\" == \"RGB\""); 1059 | } 1060 | } 1061 | 1062 | d->wref = static_cast(vsapi->propGetFloat(in, "wref", 0, &err)); 1063 | if (err) { 1064 | d->wref = 1.0f; 1065 | } 1066 | 1067 | d->ref_node = vsapi->propGetNode(in, "rclip", 0, &err); 1068 | if (err) { 1069 | d->ref_node = nullptr; 1070 | } 1071 | if (d->ref_node) { 1072 | const auto ref_vi = vsapi->getVideoInfo(d->ref_node); 1073 | if (!isSameFormat(d->vi, ref_vi) || d->vi->numFrames != ref_vi->numFrames) { 1074 | vsapi->freeNode(d->ref_node); 1075 | return set_error("\"rclip\" must be of the same format as \"clip\""); 1076 | } 1077 | } 1078 | 1079 | VSCoreInfo core_info; 1080 | vsapi->getCoreInfo2(core, &core_info); 1081 | d->workspaces.reserve(core_info.numThreads); 1082 | 1083 | vsapi->createFilter( 1084 | in, out, 1085 | "NLMeans", nlmInit, nlmGetFrame, nlmFree, 1086 | fmParallel, 0, d.release(), core 1087 | ); 1088 | } 1089 | 1090 | VS_EXTERNAL_API(void) VapourSynthPluginInit( 1091 | VSConfigPlugin configFunc, 1092 | VSRegisterFunction registerFunc, 1093 | VSPlugin * plugin 1094 | ) noexcept { 1095 | 1096 | configFunc( 1097 | "io.github.amusementclub.vs-nlm-ispc", 1098 | "nlm_ispc", 1099 | "Non-local means denoise filter implemented in ISPC", 1100 | VAPOURSYNTH_API_VERSION, 1, plugin 1101 | ); 1102 | 1103 | registerFunc( 1104 | "NLMeans", 1105 | "clip:clip;" 1106 | "d:int:opt;" 1107 | "a:int:opt;" 1108 | "s:int:opt;" 1109 | "h:float:opt;" 1110 | "channels:data:opt;" 1111 | "wmode:int:opt;" 1112 | "wref:float:opt;" 1113 | "rclip:clip:opt;", 1114 | nlmCreate, 1115 | nullptr, plugin 1116 | ); 1117 | 1118 | auto getVersion = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) { 1119 | vsapi->propSetData(out, "version", VERSION, -1, paReplace); 1120 | }; 1121 | registerFunc("Version", "", getVersion, nullptr, 
plugin); 1122 | } 1123 | -------------------------------------------------------------------------------- /source/nlm.ispc: --------------------------------------------------------------------------------
// based on KNLMeansCL by Khanattila and vs-boxblur

// Clamp a column / row index to the valid range of the plane. Both macros
// read `width` / `height` from the scope in which they are expanded.
#define CLAMPX(x) clamp(x, 0, width - 1)
#define CLAMPY(y) clamp(y, 0, height - 1)

// x squared, uniform overload.
static inline uniform float square(uniform float x) {
    return x * x;
}

// x squared, varying overload.
static inline float square(float x) {
    return x * x;
}

// Squared-difference map for a single float plane.
//
// For every pixel (x, y) of the width x height region, stores
//     3 * (centerp[y, x] - neighborp[y + offset_y, x + offset_x])^2
// in temp0[y, x]. Neighbor coordinates outside the plane are clamped to the
// nearest border pixel. All three arrays are addressed as y * stride + x.
export void nlmDistanceLuma_f32(
    uniform float temp0[], // shape: (height, stride)
    uniform const float centerp[], // shape: (height, stride)
    uniform const float neighborp[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Columns in [start_x, end_x) have a shifted neighbor x + offset_x that
    // stays inside [0, width) and therefore needs no clamping. Both bounds
    // are limited to [0, width] so that an |offset_x| larger than the plane
    // width cannot make the scalar border loops index outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        // Row bases are the same for every column of this row.
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // Left border: neighbor column has to be clamped.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform float u1 = centerp[idx];
            uniform float u1_pq = neighborp[neighbor_row + CLAMPX(x + offset_x)];

            temp0[idx] = 3.0f * square(u1 - u1_pq);
        }

        // Interior: vectorized, no horizontal clamping required.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            float u1 = centerp[idx];
            float u1_pq = neighborp[neighbor_row + (x + offset_x)];

            temp0[idx] = 3.0f * square(u1 - u1_pq);
        }

        // Right border: neighbor column has to be clamped.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform float u1 = centerp[idx];
            uniform float u1_pq = neighborp[neighbor_row + CLAMPX(x + offset_x)];

            temp0[idx] = 3.0f * square(u1 - u1_pq);
        }
    }
}

61 | export void nlmDistanceLuma_u8( 62 | uniform float temp0[], // shape: (height, stride) 63 | uniform const unsigned int8 centerp[], // shape: (height, stride) 64 | uniform const unsigned int8 neighborp[], // shape: (height, stride) 65 | uniform int offset_x, 66 | uniform int offset_y, 67 | uniform int width, 68 | uniform int height, 69 | uniform int stride, 70 | uniform float inv_divisor 71 | ) { 72 | 73 | uniform int start_x = abs(offset_x); 74 | uniform int end_x = width - abs(offset_x); 75 | 76 | uniform float sq_inv_divisor = square(inv_divisor); 77 | 78 | for (uniform int y = 0; y < height ;y++) { 79 | for (uniform int x = 0; x < start_x; x++) { 80 | uniform int idx = y * stride + x; 81 | uniform float u1 = centerp[idx]; 82 | 83 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 84 | uniform float u1_pq = neighborp[neighbor_idx]; 85 | 86 | temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor; 87 | } 88 | 89 | foreach (x = start_x ...
end_x) { 90 | int idx = y * stride + x; 91 | float u1 = centerp[idx]; 92 | 93 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 94 | float u1_pq = neighborp[neighbor_idx]; 95 | 96 | temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor; 97 | } 98 | 99 | for (uniform int x = end_x; x < width; x++) { 100 | uniform int idx = y * stride + x; 101 | uniform float u1 = centerp[idx]; 102 | 103 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 104 | uniform float u1_pq = neighborp[neighbor_idx]; 105 | 106 | temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor; 107 | } 108 | } 109 | } 110 |
// Squared-difference map for a single 16-bit integer plane.
//
// For every pixel (x, y) of the width x height region, stores
//     3 * (centerp[y, x] - neighborp[y + offset_y, x + offset_x])^2 * inv_divisor^2
// in temp0[y, x]. Samples are converted to float before subtracting.
// Neighbor coordinates outside the plane are clamped to the nearest border
// pixel. All three arrays are addressed as y * stride + x.
export void nlmDistanceLuma_u16(
    uniform float temp0[], // shape: (height, stride)
    uniform const unsigned int16 centerp[], // shape: (height, stride)
    uniform const unsigned int16 neighborp[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float inv_divisor
) {

    // Columns in [start_x, end_x) have a shifted neighbor x + offset_x that
    // stays inside [0, width) and therefore needs no clamping. Both bounds
    // are limited to [0, width] so that an |offset_x| larger than the plane
    // width cannot make the scalar border loops index outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    uniform float sq_inv_divisor = square(inv_divisor);

    for (uniform int y = 0; y < height; y++) {
        // Row bases are the same for every column of this row.
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // Left border: neighbor column has to be clamped.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform float u1 = centerp[idx];
            uniform float u1_pq = neighborp[neighbor_row + CLAMPX(x + offset_x)];

            temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
        }

        // Interior: vectorized, no horizontal clamping required.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            float u1 = centerp[idx];
            float u1_pq = neighborp[neighbor_row + (x + offset_x)];

            temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
        }

        // Right border: neighbor column has to be clamped.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform float u1 = centerp[idx];
            uniform float u1_pq = neighborp[neighbor_row + CLAMPX(x + offset_x)];

            temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
        }
    }
}

161 | export void nlmDistanceChroma_f32( 162 | uniform float temp0[], // shape: (height, stride) 163 | uniform const float centerp1[], // shape: (height, stride) 164 | uniform const float centerp2[], // shape: (height, stride) 165 | uniform const float neighborp1[], // shape: (height, stride) 166 | uniform const float neighborp2[], // shape: (height, stride) 167 | uniform int offset_x, 168 | uniform int offset_y, 169 | uniform int width, 170 | uniform int height, 171 | uniform int stride 172 | ) { 173 | 174 | uniform int start_x = abs(offset_x); 175 | uniform int end_x = width - abs(offset_x); 176 | 177 | for (uniform int y = 0; y < height; y++) { 178 | for (uniform int x = 0; x < start_x; x++) { 179 | uniform int idx = y * stride + x; 180 | uniform float u1_1 = centerp1[idx]; 181 | uniform float u1_2 = centerp2[idx]; 182 | 183 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 184 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 185 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 186 | 187 | uniform float dst = 1.5f * ( 188 | square(u1_1 - u1_pq_1) + 189 | square(u1_2 - u1_pq_2) 190 | ); 191 | 192 | temp0[idx] = dst; 193 | } 194 | 195 | foreach (x = start_x ...
end_x) { 196 | int idx = y * stride + x; 197 | float u1_1 = centerp1[idx]; 198 | float u1_2 = centerp2[idx]; 199 | 200 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 201 | float u1_pq_1 = neighborp1[neighbor_idx]; 202 | float u1_pq_2 = neighborp2[neighbor_idx]; 203 | 204 | float dst = 1.5f * ( 205 | square(u1_1 - u1_pq_1) + 206 | square(u1_2 - u1_pq_2) 207 | ); 208 | 209 | temp0[idx] = dst; 210 | } 211 | 212 | for (uniform int x = end_x; x < width; x++) { 213 | uniform int idx = y * stride + x; 214 | uniform float u1_1 = centerp1[idx]; 215 | uniform float u1_2 = centerp2[idx]; 216 | 217 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 218 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 219 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 220 | 221 | uniform float dst = 1.5f * ( 222 | square(u1_1 - u1_pq_1) + 223 | square(u1_2 - u1_pq_2) 224 | ); 225 | 226 | temp0[idx] = dst; 227 | } 228 | } 229 | } 230 |
// Squared-difference map over two 8-bit integer planes.
//
// For every pixel (x, y) of the width x height region, stores
//     1.5 * ((c1 - n1)^2 + (c2 - n2)^2) * inv_divisor^2
// in temp0[y, x], where c1/c2 are the samples of centerp1/centerp2 at (x, y)
// and n1/n2 the samples of neighborp1/neighborp2 at
// (x + offset_x, y + offset_y). Samples are converted to float before
// subtracting. Neighbor coordinates outside the plane are clamped to the
// nearest border pixel. All arrays are addressed as y * stride + x.
export void nlmDistanceChroma_u8(
    uniform float temp0[], // shape: (height, stride)
    uniform const unsigned int8 centerp1[], // shape: (height, stride)
    uniform const unsigned int8 centerp2[], // shape: (height, stride)
    uniform const unsigned int8 neighborp1[], // shape: (height, stride)
    uniform const unsigned int8 neighborp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float inv_divisor
) {

    // Columns in [start_x, end_x) have a shifted neighbor x + offset_x that
    // stays inside [0, width) and therefore needs no clamping. Both bounds
    // are limited to [0, width] so that an |offset_x| larger than the plane
    // width cannot make the scalar border loops index outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    uniform float sq_inv_divisor = square(inv_divisor);

    for (uniform int y = 0; y < height; y++) {
        // Row bases are the same for every column of this row.
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // Left border: neighbor column has to be clamped.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform float u1_1 = centerp1[idx];
            uniform float u1_2 = centerp2[idx];

            uniform int neighbor_idx = neighbor_row + CLAMPX(x + offset_x);
            uniform float u1_pq_1 = neighborp1[neighbor_idx];
            uniform float u1_pq_2 = neighborp2[neighbor_idx];

            uniform float dst = 1.5f * (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2)
            );

            temp0[idx] = dst * sq_inv_divisor;
        }

        // Interior: vectorized, no horizontal clamping required.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            float u1_1 = centerp1[idx];
            float u1_2 = centerp2[idx];

            int neighbor_idx = neighbor_row + (x + offset_x);
            float u1_pq_1 = neighborp1[neighbor_idx];
            float u1_pq_2 = neighborp2[neighbor_idx];

            float dst = 1.5f * (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2)
            );

            temp0[idx] = dst * sq_inv_divisor;
        }

        // Right border: neighbor column has to be clamped.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform float u1_1 = centerp1[idx];
            uniform float u1_2 = centerp2[idx];

            uniform int neighbor_idx = neighbor_row + CLAMPX(x + offset_x);
            uniform float u1_pq_1 = neighborp1[neighbor_idx];
            uniform float u1_pq_2 = neighborp2[neighbor_idx];

            uniform float dst = 1.5f * (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2)
            );

            temp0[idx] = dst * sq_inv_divisor;
        }
    }
}

304 | export void nlmDistanceChroma_u16( 305 | uniform float temp0[], // shape: (height, stride) 306 | uniform const unsigned int16 centerp1[], // shape: (height, stride) 307 | uniform const unsigned int16 centerp2[], // shape: (height, stride) 308 | uniform const unsigned int16 neighborp1[], // shape: (height, stride) 309 | uniform const unsigned int16 neighborp2[], // shape: (height, stride) 310 | uniform int offset_x, 311 | uniform int offset_y, 312 | uniform int width, 313 | uniform int height, 314 | uniform int stride, 315 | uniform float inv_divisor 316 | ) { 317 | 318 | uniform int start_x = abs(offset_x); 319 | uniform
int end_x = width - abs(offset_x); 320 | 321 | uniform float sq_inv_divisor = square(inv_divisor); 322 | 323 | for (uniform int y = 0; y < height; y++) { 324 | for (uniform int x = 0; x < start_x; x++) { 325 | uniform int idx = y * stride + x; 326 | uniform float u1_1 = centerp1[idx]; 327 | uniform float u1_2 = centerp2[idx]; 328 | 329 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 330 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 331 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 332 | 333 | uniform float dst = 1.5f * ( 334 | square(u1_1 - u1_pq_1) + 335 | square(u1_2 - u1_pq_2) 336 | ); 337 | 338 | temp0[idx] = dst * sq_inv_divisor; 339 | } 340 | 341 | foreach (x = start_x ... end_x) { 342 | int idx = y * stride + x; 343 | float u1_1 = centerp1[idx]; 344 | float u1_2 = centerp2[idx]; 345 | 346 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 347 | float u1_pq_1 = neighborp1[neighbor_idx]; 348 | float u1_pq_2 = neighborp2[neighbor_idx]; 349 | 350 | float dst = 1.5f * ( 351 | square(u1_1 - u1_pq_1) + 352 | square(u1_2 - u1_pq_2) 353 | ); 354 | 355 | temp0[idx] = dst * sq_inv_divisor; 356 | } 357 | 358 | for (uniform int x = end_x; x < width; x++) { 359 | uniform int idx = y * stride + x; 360 | uniform float u1_1 = centerp1[idx]; 361 | uniform float u1_2 = centerp2[idx]; 362 | 363 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 364 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 365 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 366 | 367 | uniform float dst = 1.5f * ( 368 | square(u1_1 - u1_pq_1) + 369 | square(u1_2 - u1_pq_2) 370 | ); 371 | 372 | temp0[idx] = dst * sq_inv_divisor; 373 | } 374 | } 375 | } 376 |
// Squared-difference map over three float planes.
//
// For every pixel (x, y) of the width x height region, stores
//     (c1 - n1)^2 + (c2 - n2)^2 + (c3 - n3)^2
// in temp0[y, x], where c1..c3 are the samples of centerp1..centerp3 at
// (x, y) and n1..n3 the samples of neighborp1..neighborp3 at
// (x + offset_x, y + offset_y). Neighbor coordinates outside the plane are
// clamped to the nearest border pixel. All arrays are addressed as
// y * stride + x.
export void nlmDistanceYUV_f32(
    uniform float temp0[], // shape: (height, stride)
    uniform const float centerp1[], // shape: (height, stride)
    uniform const float centerp2[], // shape: (height, stride)
    uniform const float centerp3[], // shape: (height, stride)
    uniform const float neighborp1[], // shape: (height, stride)
    uniform const float neighborp2[], // shape: (height, stride)
    uniform const float neighborp3[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Columns in [start_x, end_x) have a shifted neighbor x + offset_x that
    // stays inside [0, width) and therefore needs no clamping. Both bounds
    // are limited to [0, width] so that an |offset_x| larger than the plane
    // width cannot make the scalar border loops index outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        // Row bases are the same for every column of this row.
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // Left border: neighbor column has to be clamped.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform float u1_1 = centerp1[idx];
            uniform float u1_2 = centerp2[idx];
            uniform float u1_3 = centerp3[idx];

            uniform int neighbor_idx = neighbor_row + CLAMPX(x + offset_x);
            uniform float u1_pq_1 = neighborp1[neighbor_idx];
            uniform float u1_pq_2 = neighborp2[neighbor_idx];
            uniform float u1_pq_3 = neighborp3[neighbor_idx];

            uniform float dst = (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2) +
                square(u1_3 - u1_pq_3)
            );

            temp0[idx] = dst;
        }

        // Interior: vectorized, no horizontal clamping required.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            float u1_1 = centerp1[idx];
            float u1_2 = centerp2[idx];
            float u1_3 = centerp3[idx];

            int neighbor_idx = neighbor_row + (x + offset_x);
            float u1_pq_1 = neighborp1[neighbor_idx];
            float u1_pq_2 = neighborp2[neighbor_idx];
            float u1_pq_3 = neighborp3[neighbor_idx];

            float dst = (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2) +
                square(u1_3 - u1_pq_3)
            );

            temp0[idx] = dst;
        }

        // Right border: neighbor column has to be clamped.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform float u1_1 = centerp1[idx];
            uniform float u1_2 = centerp2[idx];
            uniform float u1_3 = centerp3[idx];

            uniform int neighbor_idx = neighbor_row + CLAMPX(x + offset_x);
            uniform float u1_pq_1 = neighborp1[neighbor_idx];
            uniform float u1_pq_2 = neighborp2[neighbor_idx];
            uniform float u1_pq_3 = neighborp3[neighbor_idx];

            uniform float dst = (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2) +
                square(u1_3 - u1_pq_3)
            );

            temp0[idx] = dst;
        }
    }
}

458 | export void nlmDistanceYUV_u8( 459 | uniform float temp0[], // shape: (height, stride) 460 | uniform const unsigned int8 centerp1[], // shape: (height, stride) 461 | uniform const unsigned int8 centerp2[], // shape: (height, stride) 462 | uniform const unsigned int8 centerp3[], // shape: (height, stride) 463 | uniform const unsigned int8 neighborp1[], // shape: (height, stride) 464 | uniform const unsigned int8 neighborp2[], // shape: (height, stride) 465 | uniform const unsigned int8 neighborp3[], // shape: (height, stride) 466 | uniform int offset_x, 467 | uniform int offset_y, 468 | uniform int width, 469 | uniform int height, 470 | uniform int stride, 471 | uniform float inv_divisor 472 | ) { 473 | 474 | uniform int start_x = abs(offset_x); 475 | uniform int end_x =
width - abs(offset_x); 476 | 477 | uniform float sq_inv_divisor = square(inv_divisor); 478 | 479 | for (uniform int y = 0; y < height; y++) { 480 | for (uniform int x = 0; x < start_x; x++) { 481 | uniform int idx = y * stride + x; 482 | uniform float u1_1 = centerp1[idx]; 483 | uniform float u1_2 = centerp2[idx]; 484 | uniform float u1_3 = centerp3[idx]; 485 | 486 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 487 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 488 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 489 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 490 | 491 | uniform float dst = ( 492 | square(u1_1 - u1_pq_1) + 493 | square(u1_2 - u1_pq_2) + 494 | square(u1_3 - u1_pq_3) 495 | ); 496 | 497 | temp0[idx] = dst * sq_inv_divisor; 498 | } 499 | 500 | foreach (x = start_x ... end_x) { 501 | int idx = y * stride + x; 502 | float u1_1 = centerp1[idx]; 503 | float u1_2 = centerp2[idx]; 504 | float u1_3 = centerp3[idx]; 505 | 506 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 507 | float u1_pq_1 = neighborp1[neighbor_idx]; 508 | float u1_pq_2 = neighborp2[neighbor_idx]; 509 | float u1_pq_3 = neighborp3[neighbor_idx]; 510 | 511 | float dst = ( 512 | square(u1_1 - u1_pq_1) + 513 | square(u1_2 - u1_pq_2) + 514 | square(u1_3 - u1_pq_3) 515 | ); 516 | 517 | temp0[idx] = dst * sq_inv_divisor; 518 | } 519 | 520 | for (uniform int x = end_x; x < width; x++) { 521 | uniform int idx = y * stride + x; 522 | uniform float u1_1 = centerp1[idx]; 523 | uniform float u1_2 = centerp2[idx]; 524 | uniform float u1_3 = centerp3[idx]; 525 | 526 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 527 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 528 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 529 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 530 | 531 | uniform float dst = ( 532 | square(u1_1 - u1_pq_1) + 533 | square(u1_2 - u1_pq_2) + 534 | square(u1_3 - 
u1_pq_3) 535 | ); 536 | 537 | temp0[idx] = dst * sq_inv_divisor; 538 | } 539 | } 540 | } 541 |
// Squared-difference map over three 16-bit integer planes.
//
// For every pixel (x, y) of the width x height region, stores
//     ((c1 - n1)^2 + (c2 - n2)^2 + (c3 - n3)^2) * inv_divisor^2
// in temp0[y, x], where c1..c3 are the samples of centerp1..centerp3 at
// (x, y) and n1..n3 the samples of neighborp1..neighborp3 at
// (x + offset_x, y + offset_y). Samples are converted to float before
// subtracting. Neighbor coordinates outside the plane are clamped to the
// nearest border pixel. All arrays are addressed as y * stride + x.
export void nlmDistanceYUV_u16(
    uniform float temp0[], // shape: (height, stride)
    uniform const unsigned int16 centerp1[], // shape: (height, stride)
    uniform const unsigned int16 centerp2[], // shape: (height, stride)
    uniform const unsigned int16 centerp3[], // shape: (height, stride)
    uniform const unsigned int16 neighborp1[], // shape: (height, stride)
    uniform const unsigned int16 neighborp2[], // shape: (height, stride)
    uniform const unsigned int16 neighborp3[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float inv_divisor
) {

    // Columns in [start_x, end_x) have a shifted neighbor x + offset_x that
    // stays inside [0, width) and therefore needs no clamping. Both bounds
    // are limited to [0, width] so that an |offset_x| larger than the plane
    // width cannot make the scalar border loops index outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    uniform float sq_inv_divisor = square(inv_divisor);

    for (uniform int y = 0; y < height; y++) {
        // Row bases are the same for every column of this row.
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // Left border: neighbor column has to be clamped.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform float u1_1 = centerp1[idx];
            uniform float u1_2 = centerp2[idx];
            uniform float u1_3 = centerp3[idx];

            uniform int neighbor_idx = neighbor_row + CLAMPX(x + offset_x);
            uniform float u1_pq_1 = neighborp1[neighbor_idx];
            uniform float u1_pq_2 = neighborp2[neighbor_idx];
            uniform float u1_pq_3 = neighborp3[neighbor_idx];

            uniform float dst = (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2) +
                square(u1_3 - u1_pq_3)
            );

            temp0[idx] = dst * sq_inv_divisor;
        }

        // Interior: vectorized, no horizontal clamping required.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            float u1_1 = centerp1[idx];
            float u1_2 = centerp2[idx];
            float u1_3 = centerp3[idx];

            int neighbor_idx = neighbor_row + (x + offset_x);
            float u1_pq_1 = neighborp1[neighbor_idx];
            float u1_pq_2 = neighborp2[neighbor_idx];
            float u1_pq_3 = neighborp3[neighbor_idx];

            float dst = (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2) +
                square(u1_3 - u1_pq_3)
            );

            temp0[idx] = dst * sq_inv_divisor;
        }

        // Right border: neighbor column has to be clamped.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform float u1_1 = centerp1[idx];
            uniform float u1_2 = centerp2[idx];
            uniform float u1_3 = centerp3[idx];

            uniform int neighbor_idx = neighbor_row + CLAMPX(x + offset_x);
            uniform float u1_pq_1 = neighborp1[neighbor_idx];
            uniform float u1_pq_2 = neighborp2[neighbor_idx];
            uniform float u1_pq_3 = neighborp3[neighbor_idx];

            uniform float dst = (
                square(u1_1 - u1_pq_1) +
                square(u1_2 - u1_pq_2) +
                square(u1_3 - u1_pq_3)
            );

            temp0[idx] = dst * sq_inv_divisor;
        }
    }
}

626 | export void nlmDistanceRGB_f32( 627 | uniform float temp0[], // shape: (height, stride) 628 | uniform const float centerp1[], // shape: (height, stride) 629 | uniform const float centerp2[], // shape: (height, stride) 630 | uniform const float centerp3[], // shape: (height, stride) 631 | uniform const float neighborp1[], // shape: (height, stride) 632 | uniform const float neighborp2[], // shape: (height, stride) 633 | uniform const float neighborp3[], // shape: (height, stride) 634 | uniform int offset_x, 635 | uniform int offset_y, 636 | uniform int width, 637 | uniform int height, 638 | uniform int stride 639 | ) { 640 | 641 | uniform int start_x = abs(offset_x); 642 | uniform int end_x = width - abs(offset_x); 643 | 644 | for (uniform
int y = 0; y < height; y++) { 645 | for (uniform int x = 0; x < start_x; x++) { 646 | uniform int idx = y * stride + x; 647 | uniform float u1_1 = centerp1[idx]; 648 | uniform float u1_2 = centerp2[idx]; 649 | uniform float u1_3 = centerp3[idx]; 650 | 651 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 652 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 653 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 654 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 655 | 656 | uniform float m_red = (u1_1 + u1_pq_1) / 6.0f; 657 | 658 | uniform float dst = ( 659 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 660 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 661 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 662 | ); 663 | 664 | temp0[idx] = dst; 665 | } 666 | 667 | foreach (x = start_x ... end_x) { 668 | int idx = y * stride + x; 669 | float u1_1 = centerp1[idx]; 670 | float u1_2 = centerp2[idx]; 671 | float u1_3 = centerp3[idx]; 672 | 673 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 674 | float u1_pq_1 = neighborp1[neighbor_idx]; 675 | float u1_pq_2 = neighborp2[neighbor_idx]; 676 | float u1_pq_3 = neighborp3[neighbor_idx]; 677 | 678 | float m_red = (u1_1 + u1_pq_1) / 6.0f; 679 | 680 | float dst = ( 681 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 682 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 683 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 684 | ); 685 | 686 | temp0[idx] = dst; 687 | } 688 | 689 | for (uniform int x = end_x; x < width; x++) { 690 | uniform int idx = y * stride + x; 691 | uniform float u1_1 = centerp1[idx]; 692 | uniform float u1_2 = centerp2[idx]; 693 | uniform float u1_3 = centerp3[idx]; 694 | 695 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 696 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 697 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 698 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 699 | 700 | uniform float m_red = (u1_1 + 
u1_pq_1) / 6.0f; 701 | 702 | uniform float dst = ( 703 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 704 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 705 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 706 | ); 707 | 708 | temp0[idx] = dst; 709 | } 710 | } 711 | } 712 | 713 | export void nlmDistanceRGB_u8( 714 | uniform float temp0[], // shape: (height, stride) 715 | uniform const unsigned int8 centerp1[], // shape: (height, stride) 716 | uniform const unsigned int8 centerp2[], // shape: (height, stride) 717 | uniform const unsigned int8 centerp3[], // shape: (height, stride) 718 | uniform const unsigned int8 neighborp1[], // shape: (height, stride) 719 | uniform const unsigned int8 neighborp2[], // shape: (height, stride) 720 | uniform const unsigned int8 neighborp3[], // shape: (height, stride) 721 | uniform int offset_x, 722 | uniform int offset_y, 723 | uniform int width, 724 | uniform int height, 725 | uniform int stride, 726 | uniform float inv_divisor 727 | ) { 728 | 729 | uniform int start_x = abs(offset_x); 730 | uniform int end_x = width - abs(offset_x); 731 | 732 | uniform float sq_inv_divisor = square(inv_divisor); 733 | 734 | for (uniform int y = 0; y < height; y++) { 735 | for (uniform int x = 0; x < start_x; x++) { 736 | uniform int idx = y * stride + x; 737 | uniform float u1_1 = centerp1[idx]; 738 | uniform float u1_2 = centerp2[idx]; 739 | uniform float u1_3 = centerp3[idx]; 740 | 741 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 742 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 743 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 744 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 745 | 746 | uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor; 747 | 748 | uniform float dst = ( 749 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 750 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 751 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 752 | ); 753 | 754 | temp0[idx] = dst * sq_inv_divisor; 755 | } 756 | 
757 | foreach (x = start_x ... end_x) { 758 | int idx = y * stride + x; 759 | float u1_1 = centerp1[idx]; 760 | float u1_2 = centerp2[idx]; 761 | float u1_3 = centerp3[idx]; 762 | 763 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 764 | float u1_pq_1 = neighborp1[neighbor_idx]; 765 | float u1_pq_2 = neighborp2[neighbor_idx]; 766 | float u1_pq_3 = neighborp3[neighbor_idx]; 767 | 768 | float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor; 769 | 770 | float dst = ( 771 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 772 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 773 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 774 | ); 775 | 776 | temp0[idx] = dst * sq_inv_divisor; 777 | } 778 | 779 | for (uniform int x = end_x; x < width; x++) { 780 | uniform int idx = y * stride + x; 781 | uniform float u1_1 = centerp1[idx]; 782 | uniform float u1_2 = centerp2[idx]; 783 | uniform float u1_3 = centerp3[idx]; 784 | 785 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 786 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 787 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 788 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 789 | 790 | uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor; 791 | 792 | uniform float dst = ( 793 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 794 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 795 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 796 | ); 797 | 798 | temp0[idx] = dst * sq_inv_divisor; 799 | } 800 | } 801 | } 802 | 803 | export void nlmDistanceRGB_u16( 804 | uniform float temp0[], // shape: (height, stride) 805 | uniform const unsigned int16 centerp1[], // shape: (height, stride) 806 | uniform const unsigned int16 centerp2[], // shape: (height, stride) 807 | uniform const unsigned int16 centerp3[], // shape: (height, stride) 808 | uniform const unsigned int16 neighborp1[], // shape: (height, stride) 809 | uniform const unsigned int16 neighborp2[], // shape: (height, stride) 810 | 
uniform const unsigned int16 neighborp3[], // shape: (height, stride) 811 | uniform int offset_x, 812 | uniform int offset_y, 813 | uniform int width, 814 | uniform int height, 815 | uniform int stride, 816 | uniform float inv_divisor 817 | ) { 818 | 819 | uniform int start_x = abs(offset_x); 820 | uniform int end_x = width - abs(offset_x); 821 | 822 | uniform float sq_inv_divisor = square(inv_divisor); 823 | 824 | for (uniform int y = 0; y < height; y++) { 825 | for (uniform int x = 0; x < start_x; x++) { 826 | uniform int idx = y * stride + x; 827 | uniform float u1_1 = centerp1[idx]; 828 | uniform float u1_2 = centerp2[idx]; 829 | uniform float u1_3 = centerp3[idx]; 830 | 831 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 832 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 833 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 834 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 835 | 836 | uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor; 837 | 838 | uniform float dst = ( 839 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 840 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 841 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 842 | ); 843 | 844 | temp0[idx] = dst * sq_inv_divisor; 845 | } 846 | 847 | foreach (x = start_x ... 
end_x) { 848 | int idx = y * stride + x; 849 | float u1_1 = centerp1[idx]; 850 | float u1_2 = centerp2[idx]; 851 | float u1_3 = centerp3[idx]; 852 | 853 | int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x); 854 | float u1_pq_1 = neighborp1[neighbor_idx]; 855 | float u1_pq_2 = neighborp2[neighbor_idx]; 856 | float u1_pq_3 = neighborp3[neighbor_idx]; 857 | 858 | float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor; 859 | 860 | float dst = ( 861 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 862 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 863 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 864 | ); 865 | 866 | temp0[idx] = dst * sq_inv_divisor; 867 | } 868 | 869 | for (uniform int x = end_x; x < width; x++) { 870 | uniform int idx = y * stride + x; 871 | uniform float u1_1 = centerp1[idx]; 872 | uniform float u1_2 = centerp2[idx]; 873 | uniform float u1_3 = centerp3[idx]; 874 | 875 | uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x); 876 | uniform float u1_pq_1 = neighborp1[neighbor_idx]; 877 | uniform float u1_pq_2 = neighborp2[neighbor_idx]; 878 | uniform float u1_pq_3 = neighborp3[neighbor_idx]; 879 | 880 | uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor; 881 | 882 | uniform float dst = ( 883 | (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) + 884 | (4.0f / 3.0f) * square(u1_2 - u1_pq_2) + 885 | (1.0f - m_red) * square(u1_3 - u1_pq_3) 886 | ); 887 | 888 | temp0[idx] = dst * sq_inv_divisor; 889 | } 890 | } 891 | } 892 | 893 | // manually unrolled nlmHorizontal() 894 | static void nlmHorizontalS0( 895 | uniform float temp0[], // shape: (height, stride) 896 | uniform const float temp[], // shape: (height, stride) 897 | uniform int width, 898 | uniform int height, 899 | uniform int stride 900 | ) { 901 | const uniform int nlm_s = 0; 902 | uniform int start = nlm_s; 903 | uniform int end = width - nlm_s; 904 | 905 | for (uniform int y = 0; y < height; y++) { 906 | for (uniform int x = 0; x < nlm_s; x++) { 907 | uniform 
float sum = 0.0f; 908 | #pragma unroll 909 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 910 | sum += temp[y * stride + CLAMPX(x + j)]; 911 | } 912 | temp0[y * stride + x] = sum; 913 | } 914 | 915 | foreach (x = start ... end) { 916 | float sum = 0.0f; 917 | #pragma unroll 918 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 919 | sum += temp[y * stride + x + j]; 920 | } 921 | temp0[y * stride + x] = sum; 922 | } 923 | 924 | for (uniform int x = end; x < width; x++) { 925 | uniform float sum = 0.0f; 926 | #pragma unroll 927 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 928 | sum += temp[y * stride + CLAMPX(x + j)]; 929 | } 930 | temp0[y * stride + x] = sum; 931 | } 932 | } 933 | } 934 | 935 | static void nlmHorizontalS1( 936 | uniform float temp0[], // shape: (height, stride) 937 | uniform const float temp[], // shape: (height, stride) 938 | uniform int width, 939 | uniform int height, 940 | uniform int stride 941 | ) { 942 | const uniform int nlm_s = 1; 943 | uniform int start = nlm_s; 944 | uniform int end = width - nlm_s; 945 | 946 | for (uniform int y = 0; y < height; y++) { 947 | for (uniform int x = 0; x < nlm_s; x++) { 948 | uniform float sum = 0.0f; 949 | #pragma unroll 950 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 951 | sum += temp[y * stride + CLAMPX(x + j)]; 952 | } 953 | temp0[y * stride + x] = sum; 954 | } 955 | 956 | foreach (x = start ... 
end) { 957 | float sum = 0.0f; 958 | #pragma unroll 959 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 960 | sum += temp[y * stride + x + j]; 961 | } 962 | temp0[y * stride + x] = sum; 963 | } 964 | 965 | for (uniform int x = end; x < width; x++) { 966 | uniform float sum = 0.0f; 967 | #pragma unroll 968 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 969 | sum += temp[y * stride + CLAMPX(x + j)]; 970 | } 971 | temp0[y * stride + x] = sum; 972 | } 973 | } 974 | } 975 | 976 | static void nlmHorizontalS2( 977 | uniform float temp0[], // shape: (height, stride) 978 | uniform const float temp[], // shape: (height, stride) 979 | uniform int width, 980 | uniform int height, 981 | uniform int stride 982 | ) { 983 | 984 | const uniform int nlm_s = 2; 985 | uniform int start = nlm_s; 986 | uniform int end = width - nlm_s; 987 | 988 | for (uniform int y = 0; y < height; y++) { 989 | for (uniform int x = 0; x < nlm_s; x++) { 990 | uniform float sum = 0.0f; 991 | #pragma unroll 992 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 993 | sum += temp[y * stride + CLAMPX(x + j)]; 994 | } 995 | temp0[y * stride + x] = sum; 996 | } 997 | 998 | foreach (x = start ... 
end) { 999 | float sum = 0.0f; 1000 | #pragma unroll 1001 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1002 | sum += temp[y * stride + x + j]; 1003 | } 1004 | temp0[y * stride + x] = sum; 1005 | } 1006 | 1007 | for (uniform int x = end; x < width; x++) { 1008 | uniform float sum = 0.0f; 1009 | #pragma unroll 1010 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1011 | sum += temp[y * stride + CLAMPX(x + j)]; 1012 | } 1013 | temp0[y * stride + x] = sum; 1014 | } 1015 | } 1016 | } 1017 | 1018 | static void nlmHorizontalS3( 1019 | uniform float temp0[], // shape: (height, stride) 1020 | uniform const float temp[], // shape: (height, stride) 1021 | uniform int width, 1022 | uniform int height, 1023 | uniform int stride 1024 | ) { 1025 | 1026 | const uniform int nlm_s = 3; 1027 | uniform int start = nlm_s; 1028 | uniform int end = width - nlm_s; 1029 | 1030 | for (uniform int y = 0; y < height; y++) { 1031 | for (uniform int x = 0; x < nlm_s; x++) { 1032 | uniform float sum = 0.0f; 1033 | #pragma unroll 1034 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1035 | sum += temp[y * stride + CLAMPX(x + j)]; 1036 | } 1037 | temp0[y * stride + x] = sum; 1038 | } 1039 | 1040 | foreach (x = start ... 
end) { 1041 | float sum = 0.0f; 1042 | #pragma unroll 1043 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1044 | sum += temp[y * stride + x + j]; 1045 | } 1046 | temp0[y * stride + x] = sum; 1047 | } 1048 | 1049 | for (uniform int x = end; x < width; x++) { 1050 | uniform float sum = 0.0f; 1051 | #pragma unroll 1052 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1053 | sum += temp[y * stride + CLAMPX(x + j)]; 1054 | } 1055 | temp0[y * stride + x] = sum; 1056 | } 1057 | } 1058 | } 1059 | 1060 | static void nlmHorizontalS4( 1061 | uniform float temp0[], // shape: (height, stride) 1062 | uniform const float temp[], // shape: (height, stride) 1063 | uniform int width, 1064 | uniform int height, 1065 | uniform int stride 1066 | ) { 1067 | 1068 | const uniform int nlm_s = 4; 1069 | uniform int start = nlm_s; 1070 | uniform int end = width - nlm_s; 1071 | 1072 | for (uniform int y = 0; y < height; y++) { 1073 | for (uniform int x = 0; x < nlm_s; x++) { 1074 | uniform float sum = 0.0f; 1075 | #pragma unroll 1076 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1077 | sum += temp[y * stride + CLAMPX(x + j)]; 1078 | } 1079 | temp0[y * stride + x] = sum; 1080 | } 1081 | 1082 | foreach (x = start ... 
end) { 1083 | float sum = 0.0f; 1084 | #pragma unroll 1085 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1086 | sum += temp[y * stride + x + j]; 1087 | } 1088 | temp0[y * stride + x] = sum; 1089 | } 1090 | 1091 | for (uniform int x = end; x < width; x++) { 1092 | uniform float sum = 0.0f; 1093 | #pragma unroll 1094 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1095 | sum += temp[y * stride + CLAMPX(x + j)]; 1096 | } 1097 | temp0[y * stride + x] = sum; 1098 | } 1099 | } 1100 | } 1101 | 1102 | export void nlmHorizontal( 1103 | uniform float temp0[], // shape: (height, stride) 1104 | uniform const float temp[], // shape: (height, stride) 1105 | uniform int nlm_s, 1106 | uniform int width, 1107 | uniform int height, 1108 | uniform int stride 1109 | ) { 1110 | // dynamic dispatch on nlm_s 1111 | if (nlm_s == 0) { 1112 | nlmHorizontalS0(temp0, temp, width, height, stride); 1113 | return ; 1114 | } else if (nlm_s == 1) { 1115 | nlmHorizontalS1(temp0, temp, width, height, stride); 1116 | return ; 1117 | } else if (nlm_s == 2) { 1118 | nlmHorizontalS2(temp0, temp, width, height, stride); 1119 | return ; 1120 | } else if (nlm_s == 3) { 1121 | nlmHorizontalS3(temp0, temp, width, height, stride); 1122 | return ; 1123 | } else if (nlm_s == 4) { 1124 | nlmHorizontalS4(temp0, temp, width, height, stride); 1125 | return ; 1126 | } 1127 | 1128 | uniform int start = nlm_s; 1129 | uniform int end = width - nlm_s; 1130 | 1131 | for (uniform int y = 0; y < height; y++) { 1132 | for (uniform int x = 0; x < nlm_s; x++) { 1133 | uniform float sum = 0.0f; 1134 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1135 | sum += temp[y * stride + CLAMPX(x + j)]; 1136 | } 1137 | temp0[y * stride + x] = sum; 1138 | } 1139 | 1140 | foreach (x = start ... 
end) { 1141 | float sum = 0.0f; 1142 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1143 | sum += temp[y * stride + x + j]; 1144 | } 1145 | temp0[y * stride + x] = sum; 1146 | } 1147 | 1148 | for (uniform int x = end; x < width; x++) { 1149 | uniform float sum = 0.0f; 1150 | for (uniform int j = -nlm_s; j <= nlm_s; ++j) { 1151 | sum += temp[y * stride + CLAMPX(x + j)]; 1152 | } 1153 | temp0[y * stride + x] = sum; 1154 | } 1155 | } 1156 | } 1157 | 1158 | static inline float welsch(float sum, uniform float h2_inv_norm) { 1159 | return exp(-sum * h2_inv_norm); 1160 | } 1161 | 1162 | export void nlmVerticalWelsch( 1163 | uniform float dstp[], // shape: (height, stride) 1164 | uniform const float srcp[], // shape: (height, stride) 1165 | uniform int radius, 1166 | uniform float h2_inv_norm, 1167 | uniform int width, 1168 | uniform int height, 1169 | uniform int stride, 1170 | uniform float buffer[] // shape: (width,) 1171 | ) { 1172 | 1173 | foreach (x = 0 ... width) { 1174 | buffer[x] = radius * srcp[x]; 1175 | } 1176 | 1177 | for (uniform int y = 0; y < radius; ++y) { 1178 | foreach (x = 0 ... width) { 1179 | buffer[x] += srcp[min(y, height - 1) * stride + x]; 1180 | } 1181 | } 1182 | 1183 | for (uniform int y = 0; y < min(radius, height); ++y) { 1184 | foreach (x = 0 ... width) { 1185 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1186 | dstp[y * stride + x] = welsch(buffer[x], h2_inv_norm); 1187 | buffer[x] -= srcp[x]; 1188 | } 1189 | } 1190 | 1191 | if (height > radius) { 1192 | for (uniform int y = radius; y < height - radius; ++y) { 1193 | foreach (x = 0 ... width) { 1194 | buffer[x] += srcp[(y + radius) * stride + x]; 1195 | dstp[y * stride + x] = welsch(buffer[x], h2_inv_norm); 1196 | buffer[x] -= srcp[(y - radius) * stride + x]; 1197 | } 1198 | } 1199 | 1200 | for (uniform int y = max(height - radius, radius); y < height; ++y) { 1201 | foreach (x = 0 ... 
width) { 1202 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1203 | dstp[y * stride + x] = welsch(buffer[x], h2_inv_norm); 1204 | buffer[x] -= srcp[(y - radius) * stride + x]; 1205 | } 1206 | } 1207 | } 1208 | } 1209 | 1210 | // positive difference 1211 | static inline float fdim(uniform float x, float y) { 1212 | return (x > y) ? x - y : 0.0f; 1213 | } 1214 | 1215 | static inline float bisquareA(float sum, uniform float h2_inv_norm) { 1216 | float tmp = fdim(1.0f, sum * h2_inv_norm); 1217 | return tmp; 1218 | } 1219 | 1220 | export void nlmVerticalBisquareA( 1221 | uniform float dstp[], // shape: (height, stride) 1222 | uniform const float srcp[], // shape: (height, stride) 1223 | uniform int radius, 1224 | uniform float h2_inv_norm, 1225 | uniform int width, 1226 | uniform int height, 1227 | uniform int stride, 1228 | uniform float buffer[] // shape: (width,) 1229 | ) { 1230 | 1231 | foreach (x = 0 ... width) { 1232 | buffer[x] = radius * srcp[x]; 1233 | } 1234 | 1235 | for (uniform int y = 0; y < radius; ++y) { 1236 | foreach (x = 0 ... width) { 1237 | buffer[x] += srcp[min(y, height - 1) * stride + x]; 1238 | } 1239 | } 1240 | 1241 | for (uniform int y = 0; y < min(radius, height); ++y) { 1242 | foreach (x = 0 ... width) { 1243 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1244 | dstp[y * stride + x] = bisquareA(buffer[x], h2_inv_norm); 1245 | buffer[x] -= srcp[x]; 1246 | } 1247 | } 1248 | 1249 | if (height > radius) { 1250 | for (uniform int y = radius; y < height - radius; ++y) { 1251 | foreach (x = 0 ... width) { 1252 | buffer[x] += srcp[(y + radius) * stride + x]; 1253 | dstp[y * stride + x] = bisquareA(buffer[x], h2_inv_norm); 1254 | buffer[x] -= srcp[(y - radius) * stride + x]; 1255 | } 1256 | } 1257 | 1258 | for (uniform int y = max(height - radius, radius); y < height; ++y) { 1259 | foreach (x = 0 ... 
width) { 1260 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1261 | dstp[y * stride + x] = bisquareA(buffer[x], h2_inv_norm); 1262 | buffer[x] -= srcp[(y - radius) * stride + x]; 1263 | } 1264 | } 1265 | } 1266 | } 1267 | 1268 | static inline float bisquareB(float sum, uniform float h2_inv_norm) { 1269 | float tmp = fdim(1.0f, sum * h2_inv_norm); 1270 | tmp *= tmp; 1271 | return tmp; 1272 | } 1273 | 1274 | export void nlmVerticalBisquareB( 1275 | uniform float dstp[], // shape: (height, stride) 1276 | uniform const float srcp[], // shape: (height, stride) 1277 | uniform int radius, 1278 | uniform float h2_inv_norm, 1279 | uniform int width, 1280 | uniform int height, 1281 | uniform int stride, 1282 | uniform float buffer[] // shape: (width,) 1283 | ) { 1284 | 1285 | foreach (x = 0 ... width) { 1286 | buffer[x] = radius * srcp[x]; 1287 | } 1288 | 1289 | for (uniform int y = 0; y < radius; ++y) { 1290 | foreach (x = 0 ... width) { 1291 | buffer[x] += srcp[min(y, height - 1) * stride + x]; 1292 | } 1293 | } 1294 | 1295 | for (uniform int y = 0; y < min(radius, height); ++y) { 1296 | foreach (x = 0 ... width) { 1297 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1298 | dstp[y * stride + x] = bisquareB(buffer[x], h2_inv_norm); 1299 | buffer[x] -= srcp[x]; 1300 | } 1301 | } 1302 | 1303 | if (height > radius) { 1304 | for (uniform int y = radius; y < height - radius; ++y) { 1305 | foreach (x = 0 ... width) { 1306 | buffer[x] += srcp[(y + radius) * stride + x]; 1307 | dstp[y * stride + x] = bisquareB(buffer[x], h2_inv_norm); 1308 | buffer[x] -= srcp[(y - radius) * stride + x]; 1309 | } 1310 | } 1311 | 1312 | for (uniform int y = max(height - radius, radius); y < height; ++y) { 1313 | foreach (x = 0 ... 
width) { 1314 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1315 | dstp[y * stride + x] = bisquareB(buffer[x], h2_inv_norm); 1316 | buffer[x] -= srcp[(y - radius) * stride + x]; 1317 | } 1318 | } 1319 | } 1320 | } 1321 | 1322 | static inline float bisquareC(float sum, uniform float h2_inv_norm) { 1323 | float tmp = fdim(1.0f, sum * h2_inv_norm); 1324 | tmp *= tmp; 1325 | tmp *= tmp; 1326 | tmp *= tmp; 1327 | return tmp; 1328 | } 1329 | 1330 | export void nlmVerticalBisquareC( 1331 | uniform float dstp[], // shape: (height, stride) 1332 | uniform const float srcp[], // shape: (height, stride) 1333 | uniform int radius, 1334 | uniform float h2_inv_norm, 1335 | uniform int width, 1336 | uniform int height, 1337 | uniform int stride, 1338 | uniform float buffer[] // shape: (width,) 1339 | ) { 1340 | 1341 | foreach (x = 0 ... width) { 1342 | buffer[x] = radius * srcp[x]; 1343 | } 1344 | 1345 | for (uniform int y = 0; y < radius; ++y) { 1346 | foreach (x = 0 ... width) { 1347 | buffer[x] += srcp[min(y, height - 1) * stride + x]; 1348 | } 1349 | } 1350 | 1351 | for (uniform int y = 0; y < min(radius, height); ++y) { 1352 | foreach (x = 0 ... width) { 1353 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1354 | dstp[y * stride + x] = bisquareC(buffer[x], h2_inv_norm); 1355 | buffer[x] -= srcp[x]; 1356 | } 1357 | } 1358 | 1359 | if (height > radius) { 1360 | for (uniform int y = radius; y < height - radius; ++y) { 1361 | foreach (x = 0 ... width) { 1362 | buffer[x] += srcp[(y + radius) * stride + x]; 1363 | dstp[y * stride + x] = bisquareC(buffer[x], h2_inv_norm); 1364 | buffer[x] -= srcp[(y - radius) * stride + x]; 1365 | } 1366 | } 1367 | 1368 | for (uniform int y = max(height - radius, radius); y < height; ++y) { 1369 | foreach (x = 0 ... 
width) { 1370 | buffer[x] += srcp[min(y + radius, height - 1) * stride + x]; 1371 | dstp[y * stride + x] = bisquareC(buffer[x], h2_inv_norm); 1372 | buffer[x] -= srcp[(y - radius) * stride + x]; 1373 | } 1374 | } 1375 | } 1376 | } 1377 | 1378 | export void nlmAccumulationCh1_f32( 1379 | uniform float weightp[], // shape: (height, stride) 1380 | uniform float wdstp[], // shape: (height, stride) 1381 | uniform float max_weightp[], // shape: (height, stride) 1382 | uniform const float srcp_bwd[], // shape: (height, stride) 1383 | uniform const float srcp_fwd[], // shape: (height, stride) 1384 | uniform const float temp1[], // shape: (height, stride) 1385 | uniform const float temp2[], // shape: (height, stride) 1386 | uniform int offset_x, 1387 | uniform int offset_y, 1388 | uniform int width, 1389 | uniform int height, 1390 | uniform int stride 1391 | ) { 1392 | 1393 | uniform int start_x = abs(offset_x); 1394 | uniform int end_x = width - abs(offset_x); 1395 | 1396 | for (uniform int y = 0; y < height; y++) { 1397 | for (uniform int x = 0; x < start_x; x++) { 1398 | uniform int idx = y * stride + x; 1399 | 1400 | uniform float u4 = temp1[idx]; 1401 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1402 | 1403 | weightp[idx] += u4 + u4_mq; 1404 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1405 | 1406 | uniform float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1407 | 1408 | uniform float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1409 | 1410 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1411 | } 1412 | 1413 | foreach (x = start_x ... 
end_x) { 1414 | int idx = y * stride + x; 1415 | 1416 | float u4 = temp1[idx]; 1417 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1418 | 1419 | weightp[idx] += u4 + u4_mq; 1420 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1421 | 1422 | float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1423 | 1424 | float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1425 | 1426 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1427 | } 1428 | 1429 | for (uniform int x = end_x; x < width; x++) { 1430 | uniform int idx = y * stride + x; 1431 | 1432 | uniform float u4 = temp1[idx]; 1433 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1434 | 1435 | weightp[idx] += u4 + u4_mq; 1436 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1437 | 1438 | uniform float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1439 | 1440 | uniform float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1441 | 1442 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1443 | } 1444 | } 1445 | } 1446 | 1447 | export void nlmAccumulationCh1_u8( 1448 | uniform float weightp[], // shape: (height, stride) 1449 | uniform float wdstp[], // shape: (height, stride) 1450 | uniform float max_weightp[], // shape: (height, stride) 1451 | uniform const unsigned int8 srcp_bwd[], // shape: (height, stride) 1452 | uniform const unsigned int8 srcp_fwd[], // shape: (height, stride) 1453 | uniform const float temp1[], // shape: (height, stride) 1454 | uniform const float temp2[], // shape: (height, stride) 1455 | uniform int offset_x, 1456 | uniform int offset_y, 1457 | uniform int width, 1458 | uniform int height, 1459 | uniform int stride 1460 | ) { 1461 | 1462 | uniform int start_x = abs(offset_x); 1463 | uniform int end_x = width - abs(offset_x); 1464 | 1465 | for (uniform int y = 0; y < height; y++) { 1466 | for (uniform int x = 0; x < start_x; x++) { 1467 | uniform 
int idx = y * stride + x; 1468 | 1469 | uniform float u4 = temp1[idx]; 1470 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1471 | 1472 | weightp[idx] += u4 + u4_mq; 1473 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1474 | 1475 | uniform float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1476 | 1477 | uniform float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1478 | 1479 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1480 | } 1481 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1482 | foreach (x = start_x ... end_x) { 1483 | int idx = y * stride + x; 1484 | 1485 | float u4 = temp1[idx]; 1486 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1487 | 1488 | weightp[idx] += u4 + u4_mq; 1489 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1490 | 1491 | float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1492 | 1493 | float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1494 | 1495 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1496 | } 1497 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1498 | for (uniform int x = end_x; x < width; x++) { 1499 | uniform int idx = y * stride + x; 1500 | 1501 | uniform float u4 = temp1[idx]; 1502 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1503 | 1504 | weightp[idx] += u4 + u4_mq; 1505 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1506 | 1507 | uniform float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1508 | 1509 | uniform float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1510 | 1511 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1512 | } 1513 | } 1514 | } 1515 | /* nlmAccumulationCh1_u16: accumulation step for one displacement (offset_x, offset_y) over a single plane of unsigned int16 samples. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and adds u4 * srcp_bwd at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwd at (y - offset_y, x - offset_x) to wdstp[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 1516 | export void nlmAccumulationCh1_u16( 1517 | uniform float weightp[], // shape: (height, stride) 1518 | uniform float wdstp[], // shape: (height, stride) 1519 | uniform float max_weightp[], // shape: (height, stride) 1520 | uniform const unsigned int16 srcp_bwd[], // shape: (height, 
stride) 1521 | uniform const unsigned int16 srcp_fwd[], // shape: (height, stride) 1522 | uniform const float temp1[], // shape: (height, stride) 1523 | uniform const float temp2[], // shape: (height, stride) 1524 | uniform int offset_x, 1525 | uniform int offset_y, 1526 | uniform int width, 1527 | uniform int height, 1528 | uniform int stride 1529 | ) { 1530 | 1531 | uniform int start_x = abs(offset_x); 1532 | uniform int end_x = width - abs(offset_x); 1533 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 1534 | for (uniform int y = 0; y < height; y++) { 1535 | for (uniform int x = 0; x < start_x; x++) { 1536 | uniform int idx = y * stride + x; 1537 | 1538 | uniform float u4 = temp1[idx]; 1539 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1540 | 1541 | weightp[idx] += u4 + u4_mq; 1542 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1543 | 1544 | uniform float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1545 | 1546 | uniform float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1547 | 1548 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1549 | } 1550 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1551 | foreach (x = start_x ...
end_x) { 1552 | int idx = y * stride + x; 1553 | 1554 | float u4 = temp1[idx]; 1555 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1556 | 1557 | weightp[idx] += u4 + u4_mq; 1558 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1559 | 1560 | float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1561 | 1562 | float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1563 | 1564 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1565 | } 1566 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1567 | for (uniform int x = end_x; x < width; x++) { 1568 | uniform int idx = y * stride + x; 1569 | 1570 | uniform float u4 = temp1[idx]; 1571 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1572 | 1573 | weightp[idx] += u4 + u4_mq; 1574 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1575 | 1576 | uniform float u1_pq = srcp_bwd[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1577 | 1578 | uniform float u1_mq = srcp_fwd[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1579 | 1580 | wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq; 1581 | } 1582 | } 1583 | } 1584 | /* nlmAccumulationCh2_f32: accumulation step for one displacement (offset_x, offset_y) over two float planes that share weightp and max_weightp. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and for each plane k = 1, 2 adds u4 * srcp_bwdk at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwdk at (y - offset_y, x - offset_x) to wdstpk[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 1585 | export void nlmAccumulationCh2_f32( 1586 | uniform float weightp[], // shape: (height, stride) 1587 | uniform float wdstp1[], // shape: (height, stride) 1588 | uniform float wdstp2[], // shape: (height, stride) 1589 | uniform float max_weightp[], // shape: (height, stride) 1590 | uniform const float srcp_bwd1[], // shape: (height, stride) 1591 | uniform const float srcp_bwd2[], // shape: (height, stride) 1592 | uniform const float srcp_fwd1[], // shape: (height, stride) 1593 | uniform const float srcp_fwd2[], // shape: (height, stride) 1594 | uniform const float temp1[], // shape: (height, stride) 1595 | uniform const float temp2[], // shape: (height, stride) 1596 | uniform int offset_x, 1597 | uniform int offset_y, 1598 | uniform int width, 1599 | uniform int height, 1600 | uniform int stride 1601 | ) { 1602 | 1603 | uniform int start_x = 
abs(offset_x); 1604 | uniform int end_x = width - abs(offset_x); 1605 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 1606 | for (uniform int y = 0; y < height; y++) { 1607 | for (uniform int x = 0; x < start_x; x++) { 1608 | uniform int idx = y * stride + x; 1609 | 1610 | uniform float u4 = temp1[idx]; 1611 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1612 | 1613 | weightp[idx] += u4 + u4_mq; 1614 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1615 | 1616 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1617 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1618 | 1619 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1620 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1621 | 1622 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1623 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1624 | } 1625 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1626 | foreach (x = start_x ...
end_x) { 1627 | int idx = y * stride + x; 1628 | 1629 | float u4 = temp1[idx]; 1630 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1631 | 1632 | weightp[idx] += u4 + u4_mq; 1633 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1634 | 1635 | float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1636 | float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1637 | float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1638 | float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1639 | 1640 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1641 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1642 | } 1643 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1644 | for (uniform int x = end_x; x < width; x++) { 1645 | uniform int idx = y * stride + x; 1646 | 1647 | uniform float u4 = temp1[idx]; 1648 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1649 | 1650 | weightp[idx] += u4 + u4_mq; 1651 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1652 | 1653 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1654 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1655 | 1656 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1657 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1658 | 1659 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1660 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1661 | } 1662 | } 1663 | } 1664 | /* nlmAccumulationCh2_u8: accumulation step for one displacement (offset_x, offset_y) over two planes of unsigned int8 samples that share weightp and max_weightp. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and for each plane k = 1, 2 adds u4 * srcp_bwdk at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwdk at (y - offset_y, x - offset_x) to wdstpk[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 1665 | export void nlmAccumulationCh2_u8( 1666 | uniform float weightp[], // shape: (height, stride) 1667 | uniform float wdstp1[], // shape: (height, stride) 1668 | uniform float wdstp2[], // shape: (height, stride) 1669 | uniform float max_weightp[], // shape: (height, stride) 1670 | uniform const unsigned int8 srcp_bwd1[], // shape: (height, stride) 1671 | uniform const unsigned 
int8 srcp_bwd2[], // shape: (height, stride) 1672 | uniform const unsigned int8 srcp_fwd1[], // shape: (height, stride) 1673 | uniform const unsigned int8 srcp_fwd2[], // shape: (height, stride) 1674 | uniform const float temp1[], // shape: (height, stride) 1675 | uniform const float temp2[], // shape: (height, stride) 1676 | uniform int offset_x, 1677 | uniform int offset_y, 1678 | uniform int width, 1679 | uniform int height, 1680 | uniform int stride 1681 | ) { 1682 | 1683 | uniform int start_x = abs(offset_x); 1684 | uniform int end_x = width - abs(offset_x); 1685 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 1686 | for (uniform int y = 0; y < height; y++) { 1687 | for (uniform int x = 0; x < start_x; x++) { 1688 | uniform int idx = y * stride + x; 1689 | 1690 | uniform float u4 = temp1[idx]; 1691 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1692 | 1693 | weightp[idx] += u4 + u4_mq; 1694 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1695 | 1696 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1697 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1698 | 1699 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1700 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1701 | 1702 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1703 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1704 | } 1705 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1706 | foreach (x = start_x ...
end_x) { 1707 | int idx = y * stride + x; 1708 | 1709 | float u4 = temp1[idx]; 1710 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1711 | 1712 | weightp[idx] += u4 + u4_mq; 1713 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1714 | 1715 | float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1716 | float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1717 | float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1718 | float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1719 | 1720 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1721 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1722 | } 1723 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1724 | for (uniform int x = end_x; x < width; x++) { 1725 | uniform int idx = y * stride + x; 1726 | 1727 | uniform float u4 = temp1[idx]; 1728 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1729 | 1730 | weightp[idx] += u4 + u4_mq; 1731 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1732 | 1733 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1734 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1735 | 1736 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1737 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1738 | 1739 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1740 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1741 | } 1742 | } 1743 | } 1744 | /* nlmAccumulationCh2_u16: accumulation step for one displacement (offset_x, offset_y) over two planes of unsigned int16 samples that share weightp and max_weightp. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and for each plane k = 1, 2 adds u4 * srcp_bwdk at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwdk at (y - offset_y, x - offset_x) to wdstpk[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 1745 | export void nlmAccumulationCh2_u16( 1746 | uniform float weightp[], // shape: (height, stride) 1747 | uniform float wdstp1[], // shape: (height, stride) 1748 | uniform float wdstp2[], // shape: (height, stride) 1749 | uniform float max_weightp[], // shape: (height, stride) 1750 | uniform const unsigned int16 srcp_bwd1[], // shape: (height, stride) 1751 | uniform const unsigned 
int16 srcp_bwd2[], // shape: (height, stride) 1752 | uniform const unsigned int16 srcp_fwd1[], // shape: (height, stride) 1753 | uniform const unsigned int16 srcp_fwd2[], // shape: (height, stride) 1754 | uniform const float temp1[], // shape: (height, stride) 1755 | uniform const float temp2[], // shape: (height, stride) 1756 | uniform int offset_x, 1757 | uniform int offset_y, 1758 | uniform int width, 1759 | uniform int height, 1760 | uniform int stride 1761 | ) { 1762 | 1763 | uniform int start_x = abs(offset_x); 1764 | uniform int end_x = width - abs(offset_x); 1765 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 1766 | for (uniform int y = 0; y < height; y++) { 1767 | for (uniform int x = 0; x < start_x; x++) { 1768 | uniform int idx = y * stride + x; 1769 | 1770 | uniform float u4 = temp1[idx]; 1771 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1772 | 1773 | weightp[idx] += u4 + u4_mq; 1774 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1775 | 1776 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1777 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1778 | 1779 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1780 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1781 | 1782 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1783 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1784 | } 1785 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1786 | foreach (x = start_x ...
end_x) { 1787 | int idx = y * stride + x; 1788 | 1789 | float u4 = temp1[idx]; 1790 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1791 | 1792 | weightp[idx] += u4 + u4_mq; 1793 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1794 | 1795 | float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1796 | float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1797 | float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1798 | float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1799 | 1800 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1801 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1802 | } 1803 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1804 | for (uniform int x = end_x; x < width; x++) { 1805 | uniform int idx = y * stride + x; 1806 | 1807 | uniform float u4 = temp1[idx]; 1808 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1809 | 1810 | weightp[idx] += u4 + u4_mq; 1811 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1812 | 1813 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1814 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1815 | 1816 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1817 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1818 | 1819 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1820 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1821 | } 1822 | } 1823 | } 1824 | /* nlmAccumulationCh3_f32: accumulation step for one displacement (offset_x, offset_y) over three float planes that share weightp and max_weightp. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and for each plane k = 1, 2, 3 adds u4 * srcp_bwdk at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwdk at (y - offset_y, x - offset_x) to wdstpk[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 1825 | export void nlmAccumulationCh3_f32( 1826 | uniform float weightp[], // shape: (height, stride) 1827 | uniform float wdstp1[], // shape: (height, stride) 1828 | uniform float wdstp2[], // shape: (height, stride) 1829 | uniform float wdstp3[], // shape: (height, stride) 1830 | uniform float max_weightp[], // shape: (height, stride) 1831 | uniform const float srcp_bwd1[], // shape: 
(height, stride) 1832 | uniform const float srcp_bwd2[], // shape: (height, stride) 1833 | uniform const float srcp_bwd3[], // shape: (height, stride) 1834 | uniform const float srcp_fwd1[], // shape: (height, stride) 1835 | uniform const float srcp_fwd2[], // shape: (height, stride) 1836 | uniform const float srcp_fwd3[], // shape: (height, stride) 1837 | uniform const float temp1[], // shape: (height, stride) 1838 | uniform const float temp2[], // shape: (height, stride) 1839 | uniform int offset_x, 1840 | uniform int offset_y, 1841 | uniform int width, 1842 | uniform int height, 1843 | uniform int stride 1844 | ) { 1845 | 1846 | uniform int start_x = abs(offset_x); 1847 | uniform int end_x = width - abs(offset_x); 1848 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 1849 | for (uniform int y = 0; y < height; y++) { 1850 | for (uniform int x = 0; x < start_x; x++) { 1851 | uniform int idx = y * stride + x; 1852 | 1853 | uniform float u4 = temp1[idx]; 1854 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1855 | 1856 | weightp[idx] += u4 + u4_mq; 1857 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1858 | 1859 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1860 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1861 | uniform float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1862 | 1863 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1864 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1865 | uniform float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1866 | 1867 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1868 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1869 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 1870 | } 1871 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1872 | foreach (x = start_x ... 
end_x) { 1873 | int idx = y * stride + x; 1874 | 1875 | float u4 = temp1[idx]; 1876 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1877 | 1878 | weightp[idx] += u4 + u4_mq; 1879 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1880 | 1881 | float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1882 | float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1883 | float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1884 | 1885 | float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1886 | float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1887 | float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1888 | 1889 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1890 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1891 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 1892 | } 1893 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1894 | for (uniform int x = end_x; x < width; x++) { 1895 | uniform int idx = y * stride + x; 1896 | 1897 | uniform float u4 = temp1[idx]; 1898 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1899 | 1900 | weightp[idx] += u4 + u4_mq; 1901 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1902 | 1903 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1904 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1905 | uniform float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1906 | 1907 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1908 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1909 | uniform float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1910 | 1911 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1912 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1913
| wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 1914 | } 1915 | } 1916 | } 1917 | /* nlmAccumulationCh3_u8: accumulation step for one displacement (offset_x, offset_y) over three planes of unsigned int8 samples that share weightp and max_weightp. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and for each plane k = 1, 2, 3 adds u4 * srcp_bwdk at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwdk at (y - offset_y, x - offset_x) to wdstpk[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 1918 | export void nlmAccumulationCh3_u8( 1919 | uniform float weightp[], // shape: (height, stride) 1920 | uniform float wdstp1[], // shape: (height, stride) 1921 | uniform float wdstp2[], // shape: (height, stride) 1922 | uniform float wdstp3[], // shape: (height, stride) 1923 | uniform float max_weightp[], // shape: (height, stride) 1924 | uniform const unsigned int8 srcp_bwd1[], // shape: (height, stride) 1925 | uniform const unsigned int8 srcp_bwd2[], // shape: (height, stride) 1926 | uniform const unsigned int8 srcp_bwd3[], // shape: (height, stride) 1927 | uniform const unsigned int8 srcp_fwd1[], // shape: (height, stride) 1928 | uniform const unsigned int8 srcp_fwd2[], // shape: (height, stride) 1929 | uniform const unsigned int8 srcp_fwd3[], // shape: (height, stride) 1930 | uniform const float temp1[], // shape: (height, stride) 1931 | uniform const float temp2[], // shape: (height, stride) 1932 | uniform int offset_x, 1933 | uniform int offset_y, 1934 | uniform int width, 1935 | uniform int height, 1936 | uniform int stride 1937 | ) { 1938 | 1939 | uniform int start_x = abs(offset_x); 1940 | uniform int end_x = width - abs(offset_x); 1941 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 1942 | for (uniform int y = 0; y < height; y++) { 1943 | for (uniform int x = 0; x < start_x; x++) { 1944 | uniform int idx = y * stride + x; 1945 | 1946 | uniform float u4 = temp1[idx]; 1947 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1948 | 1949 | weightp[idx] += u4 + u4_mq; 1950 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1951 | 1952 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1953 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1954 | uniform float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1955 | 1956 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - 
offset_y) * stride + CLAMPX(x - offset_x)]; 1957 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1958 | uniform float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1959 | 1960 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1961 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1962 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 1963 | } 1964 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 1965 | foreach (x = start_x ... end_x) { 1966 | int idx = y * stride + x; 1967 | 1968 | float u4 = temp1[idx]; 1969 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1970 | 1971 | weightp[idx] += u4 + u4_mq; 1972 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1973 | 1974 | float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1975 | float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1976 | float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 1977 | 1978 | float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1979 | float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1980 | float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 1981 | 1982 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 1983 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 1984 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 1985 | } 1986 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 1987 | for (uniform int x = end_x; x < width; x++) { 1988 | uniform int idx = y * stride + x; 1989 | 1990 | uniform float u4 = temp1[idx]; 1991 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 1992 | 1993 | weightp[idx] += u4 + u4_mq; 1994 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 1995 | 1996 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1997 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 1998 | uniform float u1_pq_3 = srcp_bwd3[CLAMPY(y +
offset_y) * stride + CLAMPX(x + offset_x)]; 1999 | 2000 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2001 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2002 | uniform float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2003 | 2004 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 2005 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 2006 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 2007 | } 2008 | } 2009 | } 2010 | /* nlmAccumulationCh3_u16: accumulation step for one displacement (offset_x, offset_y) over three planes of unsigned int16 samples that share weightp and max_weightp. For each pixel idx = y * stride + x it reads u4 = temp1[idx] and u4_mq = temp2 at (y - offset_y, x - offset_x), adds u4 + u4_mq to weightp[idx], raises max_weightp[idx] to max(u4, u4_mq) when that is larger, and for each plane k = 1, 2, 3 adds u4 * srcp_bwdk at (y + offset_y, x + offset_x) plus u4_mq * srcp_fwdk at (y - offset_y, x - offset_x) to wdstpk[idx]. CLAMPX and CLAMPY are defined outside this excerpt; presumably they clamp a coordinate to the plane - confirm. */ 2011 | export void nlmAccumulationCh3_u16( 2012 | uniform float weightp[], // shape: (height, stride) 2013 | uniform float wdstp1[], // shape: (height, stride) 2014 | uniform float wdstp2[], // shape: (height, stride) 2015 | uniform float wdstp3[], // shape: (height, stride) 2016 | uniform float max_weightp[], // shape: (height, stride) 2017 | uniform const unsigned int16 srcp_bwd1[], // shape: (height, stride) 2018 | uniform const unsigned int16 srcp_bwd2[], // shape: (height, stride) 2019 | uniform const unsigned int16 srcp_bwd3[], // shape: (height, stride) 2020 | uniform const unsigned int16 srcp_fwd1[], // shape: (height, stride) 2021 | uniform const unsigned int16 srcp_fwd2[], // shape: (height, stride) 2022 | uniform const unsigned int16 srcp_fwd3[], // shape: (height, stride) 2023 | uniform const float temp1[], // shape: (height, stride) 2024 | uniform const float temp2[], // shape: (height, stride) 2025 | uniform int offset_x, 2026 | uniform int offset_y, 2027 | uniform int width, 2028 | uniform int height, 2029 | uniform int stride 2030 | ) { 2031 | 2032 | uniform int start_x = abs(offset_x); 2033 | uniform int end_x = width - abs(offset_x); 2034 | /* Each row is handled in three x ranges: [0, start_x) and [end_x, width) use scalar loops that pass x - offset_x and x + offset_x through CLAMPX, while [start_x, end_x) uses foreach without CLAMPX. NOTE(review): if width < 2 * abs(offset_x) then end_x < start_x, so the scalar loops overlap or run past the row - confirm callers exclude that case. */ 2035 | for (uniform int y = 0; y < height; y++) { 2036 | for (uniform int x = 0; x < start_x; x++) { 2037 | uniform int idx = y * stride + x; 2038 | 2039 | uniform float u4 = temp1[idx]; 2040 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 
2041 | 2042 | weightp[idx] += u4 + u4_mq; 2043 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 2044 | 2045 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 2046 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 2047 | uniform float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 2048 | 2049 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2050 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2051 | uniform float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2052 | 2053 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 2054 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 2055 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 2056 | } 2057 | /* Interior columns [start_x, end_x): x - offset_x and x + offset_x are used directly, without CLAMPX. */ 2058 | foreach (x = start_x ... end_x) { 2059 | int idx = y * stride + x; 2060 | 2061 | float u4 = temp1[idx]; 2062 | float u4_mq = temp2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 2063 | 2064 | weightp[idx] += u4 + u4_mq; 2065 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 2066 | 2067 | float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 2068 | float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 2069 | float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + (x + offset_x)]; 2070 | 2071 | float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 2072 | float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 2073 | float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + (x - offset_x)]; 2074 | 2075 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 2076 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 2077 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 2078 | } 2079 | /* Right border columns [end_x, width): same per-pixel update, with CLAMPX applied to x - offset_x and x + offset_x. */ 2080 | for (uniform int x = end_x; x < width; x++) { 2081 | uniform int idx = y * stride + x; 2082 | 2083 | uniform float u4 =
temp1[idx]; 2084 | uniform float u4_mq = temp2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2085 | 2086 | weightp[idx] += u4 + u4_mq; 2087 | max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]); 2088 | 2089 | uniform float u1_pq_1 = srcp_bwd1[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 2090 | uniform float u1_pq_2 = srcp_bwd2[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 2091 | uniform float u1_pq_3 = srcp_bwd3[CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x)]; 2092 | 2093 | uniform float u1_mq_1 = srcp_fwd1[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2094 | uniform float u1_mq_2 = srcp_fwd2[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2095 | uniform float u1_mq_3 = srcp_fwd3[CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x)]; 2096 | 2097 | wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1; 2098 | wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2; 2099 | wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3; 2100 | } 2101 | } 2102 | } 2103 | /* nlmFinishCh1_f32: final normalization for a single float plane. For each pixel idx = y * stride + x, multiplier = wref * max_weightp[idx] and denominator = multiplier + weightp[idx]; dstp[idx] receives (multiplier * srcp[idx] + wdstp[idx]) / denominator. NOTE(review): there is no guard for denominator == 0 - confirm the accumulated weights cannot sum to zero. */ 2104 | export void nlmFinishCh1_f32( 2105 | uniform float dstp[], // shape: (height, stride) 2106 | uniform const float srcp[], // shape: (height, stride) 2107 | uniform const float weightp[], // shape: (height, stride) 2108 | uniform const float wdstp[], // shape: (height, stride) 2109 | uniform const float max_weightp[], // shape: (height, stride) // epsilon 2110 | uniform float wref, 2111 | uniform int width, 2112 | uniform int height, 2113 | uniform int stride 2114 | ) { 2115 | 2116 | foreach (y = 0 ... height, x = 0 ...
width) { 2117 | int idx = y * stride + x; 2118 | 2119 | float multiplier = wref * max_weightp[idx]; 2120 | 2121 | float denominator = multiplier + weightp[idx]; 2122 | 2123 | dstp[idx] = (multiplier * srcp[idx] + wdstp[idx]) / denominator; 2124 | } 2125 | } 2126 | /* nlmFinishCh1_u8: final normalization for a single unsigned int8 plane. For each pixel idx = y * stride + x, multiplier = wref * max_weightp[idx] and denominator = multiplier + weightp[idx]; (multiplier * srcp[idx] + wdstp[idx]) / denominator is rounded, cast to int and limited to [0, peak] before it is stored in dstp[idx]. NOTE(review): there is no guard for denominator == 0 - confirm the accumulated weights cannot sum to zero. */ 2127 | export void nlmFinishCh1_u8( 2128 | uniform unsigned int8 dstp[], // shape: (height, stride) 2129 | uniform const unsigned int8 srcp[], // shape: (height, stride) 2130 | uniform const float weightp[], // shape: (height, stride) 2131 | uniform const float wdstp[], // shape: (height, stride) 2132 | uniform const float max_weightp[], // shape: (height, stride) // epsilon 2133 | uniform float wref, 2134 | uniform int width, 2135 | uniform int height, 2136 | uniform int stride, 2137 | uniform int peak 2138 | ) { 2139 | 2140 | foreach (y = 0 ... height, x = 0 ... width) { 2141 | int idx = y * stride + x; 2142 | 2143 | float multiplier = wref * max_weightp[idx]; 2144 | 2145 | float denominator = multiplier + weightp[idx]; 2146 | 2147 | dstp[idx] = max(0, min((int) round((multiplier * srcp[idx] + wdstp[idx]) / denominator), peak)); 2148 | } 2149 | } 2150 | /* nlmFinishCh1_u16: final normalization for a single unsigned int16 plane. For each pixel idx = y * stride + x, multiplier = wref * max_weightp[idx] and denominator = multiplier + weightp[idx]; (multiplier * srcp[idx] + wdstp[idx]) / denominator is rounded, cast to int and limited to [0, peak] before it is stored in dstp[idx]. NOTE(review): there is no guard for denominator == 0 - confirm the accumulated weights cannot sum to zero. */ 2151 | export void nlmFinishCh1_u16( 2152 | uniform unsigned int16 dstp[], // shape: (height, stride) 2153 | uniform const unsigned int16 srcp[], // shape: (height, stride) 2154 | uniform const float weightp[], // shape: (height, stride) 2155 | uniform const float wdstp[], // shape: (height, stride) 2156 | uniform const float max_weightp[], // shape: (height, stride) // epsilon 2157 | uniform float wref, 2158 | uniform int width, 2159 | uniform int height, 2160 | uniform int stride, 2161 | uniform int peak 2162 | ) { 2163 | 2164 | foreach (y = 0 ... height, x = 0 ...
width) { 2165 | int idx = y * stride + x; 2166 | 2167 | float multiplier = wref * max_weightp[idx]; 2168 | 2169 | float denominator = multiplier + weightp[idx]; 2170 | 2171 | dstp[idx] = max(0, min((int) round((multiplier * srcp[idx] + wdstp[idx]) / denominator), peak)); 2172 | } 2173 | } 2174 | /* nlmFinishCh2_f32: final normalization for two float planes that share weightp and max_weightp. For each pixel idx = y * stride + x, multiplier = wref * max_weightp[idx] and denominator = multiplier + weightp[idx]; for k = 1, 2, dstpk[idx] receives (multiplier * srcpk[idx] + wdstpk[idx]) / denominator. NOTE(review): there is no guard for denominator == 0 - confirm the accumulated weights cannot sum to zero. */ 2175 | export void nlmFinishCh2_f32( 2176 | uniform float dstp1[], // shape: (height, stride) 2177 | uniform float dstp2[], // shape: (height, stride) 2178 | uniform const float srcp1[], // shape: (height, stride) 2179 | uniform const float srcp2[], // shape: (height, stride) 2180 | uniform const float weightp[], // shape: (height, stride) 2181 | uniform const float wdstp1[], // shape: (height, stride) 2182 | uniform const float wdstp2[], // shape: (height, stride) 2183 | uniform const float max_weightp[], // shape: (height, stride) 2184 | uniform float wref, 2185 | uniform int width, 2186 | uniform int height, 2187 | uniform int stride 2188 | ) { 2189 | 2190 | foreach (y = 0 ... height, x = 0 ...
width) { 2191 | int idx = y * stride + x; 2192 | 2193 | float multiplier = wref * max_weightp[idx]; 2194 | 2195 | float denominator = multiplier + weightp[idx]; 2196 | 2197 | dstp1[idx] = (multiplier * srcp1[idx] + wdstp1[idx]) / denominator; 2198 | dstp2[idx] = (multiplier * srcp2[idx] + wdstp2[idx]) / denominator; 2199 | } 2200 | } 2201 | /* nlmFinishCh2_u8: final normalization for two unsigned int8 planes that share weightp and max_weightp. For each pixel idx = y * stride + x, multiplier = wref * max_weightp[idx] and denominator = multiplier + weightp[idx]; for k = 1, 2, (multiplier * srcpk[idx] + wdstpk[idx]) / denominator is rounded, cast to int and limited to [0, peak] before it is stored in dstpk[idx]. NOTE(review): there is no guard for denominator == 0 - confirm the accumulated weights cannot sum to zero. */ 2202 | export void nlmFinishCh2_u8( 2203 | uniform unsigned int8 dstp1[], // shape: (height, stride) 2204 | uniform unsigned int8 dstp2[], // shape: (height, stride) 2205 | uniform const unsigned int8 srcp1[], // shape: (height, stride) 2206 | uniform const unsigned int8 srcp2[], // shape: (height, stride) 2207 | uniform const float weightp[], // shape: (height, stride) 2208 | uniform const float wdstp1[], // shape: (height, stride) 2209 | uniform const float wdstp2[], // shape: (height, stride) 2210 | uniform const float max_weightp[], // shape: (height, stride) 2211 | uniform float wref, 2212 | uniform int width, 2213 | uniform int height, 2214 | uniform int stride, 2215 | uniform int peak 2216 | ) { 2217 | 2218 | foreach (y = 0 ... height, x = 0 ...
width) { 2219 | int idx = y * stride + x; 2220 | 2221 | float multiplier = wref * max_weightp[idx]; 2222 | 2223 | float denominator = multiplier + weightp[idx]; 2224 | 2225 | dstp1[idx] = max(0, min((int) round((multiplier * srcp1[idx] + wdstp1[idx]) / denominator), peak)); 2226 | dstp2[idx] = max(0, min((int) round((multiplier * srcp2[idx] + wdstp2[idx]) / denominator), peak)); 2227 | } 2228 | } 2229 | 2230 | export void nlmFinishCh2_u16( 2231 | uniform unsigned int16 dstp1[], // shape: (height, stride) 2232 | uniform unsigned int16 dstp2[], // shape: (height, stride) 2233 | uniform const unsigned int16 srcp1[], // shape: (height, stride) 2234 | uniform const unsigned int16 srcp2[], // shape: (height, stride) 2235 | uniform const float weightp[], // shape: (height, stride) 2236 | uniform const float wdstp1[], // shape: (height, stride) 2237 | uniform const float wdstp2[], // shape: (height, stride) 2238 | uniform const float max_weightp[], // shape: (height, stride) 2239 | uniform float wref, 2240 | uniform int width, 2241 | uniform int height, 2242 | uniform int stride, 2243 | uniform int peak 2244 | ) { 2245 | 2246 | foreach (y = 0 ... height, x = 0 ... 
width) { 2247 | int idx = y * stride + x; 2248 | 2249 | float multiplier = wref * max_weightp[idx]; 2250 | 2251 | float denominator = multiplier + weightp[idx]; 2252 | 2253 | dstp1[idx] = max(0, min((int) round((multiplier * srcp1[idx] + wdstp1[idx]) / denominator), peak)); 2254 | dstp2[idx] = max(0, min((int) round((multiplier * srcp2[idx] + wdstp2[idx]) / denominator), peak)); 2255 | } 2256 | } 2257 | 2258 | export void nlmFinishCh3_f32( 2259 | uniform float dstp1[], // shape: (height, stride) 2260 | uniform float dstp2[], // shape: (height, stride) 2261 | uniform float dstp3[], // shape: (height, stride) 2262 | uniform const float srcp1[], // shape: (height, stride) 2263 | uniform const float srcp2[], // shape: (height, stride) 2264 | uniform const float srcp3[], // shape: (height, stride) 2265 | uniform const float weightp[], // shape: (height, stride) 2266 | uniform const float wdstp1[], // shape: (height, stride) 2267 | uniform const float wdstp2[], // shape: (height, stride) 2268 | uniform const float wdstp3[], // shape: (height, stride) 2269 | uniform const float max_weightp[], // shape: (height, stride) 2270 | uniform float wref, 2271 | uniform int width, 2272 | uniform int height, 2273 | uniform int stride 2274 | ) { 2275 | 2276 | foreach (y = 0 ... height, x = 0 ... 
width) { 2277 | int idx = y * stride + x; 2278 | 2279 | float multiplier = wref * max_weightp[idx]; 2280 | 2281 | float denominator = multiplier + weightp[idx]; 2282 | 2283 | dstp1[idx] = (multiplier * srcp1[idx] + wdstp1[idx]) / denominator; 2284 | dstp2[idx] = (multiplier * srcp2[idx] + wdstp2[idx]) / denominator; 2285 | dstp3[idx] = (multiplier * srcp3[idx] + wdstp3[idx]) / denominator; 2286 | } 2287 | } 2288 | 2289 | export void nlmFinishCh3_u8( 2290 | uniform unsigned int8 dstp1[], // shape: (height, stride) 2291 | uniform unsigned int8 dstp2[], // shape: (height, stride) 2292 | uniform unsigned int8 dstp3[], // shape: (height, stride) 2293 | uniform const unsigned int8 srcp1[], // shape: (height, stride) 2294 | uniform const unsigned int8 srcp2[], // shape: (height, stride) 2295 | uniform const unsigned int8 srcp3[], // shape: (height, stride) 2296 | uniform const float weightp[], // shape: (height, stride) 2297 | uniform const float wdstp1[], // shape: (height, stride) 2298 | uniform const float wdstp2[], // shape: (height, stride) 2299 | uniform const float wdstp3[], // shape: (height, stride) 2300 | uniform const float max_weightp[], // shape: (height, stride) 2301 | uniform float wref, 2302 | uniform int width, 2303 | uniform int height, 2304 | uniform int stride, 2305 | uniform int peak 2306 | ) { 2307 | 2308 | foreach (y = 0 ... height, x = 0 ... 
width) { 2309 | int idx = y * stride + x; 2310 | 2311 | float multiplier = wref * max_weightp[idx]; 2312 | 2313 | float denominator = multiplier + weightp[idx]; 2314 | 2315 | dstp1[idx] = max(0, min((int) round((multiplier * srcp1[idx] + wdstp1[idx]) / denominator), peak)); 2316 | dstp2[idx] = max(0, min((int) round((multiplier * srcp2[idx] + wdstp2[idx]) / denominator), peak)); 2317 | dstp3[idx] = max(0, min((int) round((multiplier * srcp3[idx] + wdstp3[idx]) / denominator), peak)); 2318 | } 2319 | } 2320 | 2321 | export void nlmFinishCh3_u16( 2322 | uniform unsigned int16 dstp1[], // shape: (height, stride) 2323 | uniform unsigned int16 dstp2[], // shape: (height, stride) 2324 | uniform unsigned int16 dstp3[], // shape: (height, stride) 2325 | uniform const unsigned int16 srcp1[], // shape: (height, stride) 2326 | uniform const unsigned int16 srcp2[], // shape: (height, stride) 2327 | uniform const unsigned int16 srcp3[], // shape: (height, stride) 2328 | uniform const float weightp[], // shape: (height, stride) 2329 | uniform const float wdstp1[], // shape: (height, stride) 2330 | uniform const float wdstp2[], // shape: (height, stride) 2331 | uniform const float wdstp3[], // shape: (height, stride) 2332 | uniform const float max_weightp[], // shape: (height, stride) 2333 | uniform float wref, 2334 | uniform int width, 2335 | uniform int height, 2336 | uniform int stride, 2337 | uniform int peak 2338 | ) { 2339 | 2340 | foreach (y = 0 ... height, x = 0 ... 
width) { 2341 | int idx = y * stride + x; 2342 | 2343 | float multiplier = wref * max_weightp[idx]; 2344 | 2345 | float denominator = multiplier + weightp[idx]; 2346 | 2347 | dstp1[idx] = max(0, min((int) round((multiplier * srcp1[idx] + wdstp1[idx]) / denominator), peak)); 2348 | dstp2[idx] = max(0, min((int) round((multiplier * srcp2[idx] + wdstp2[idx]) / denominator), peak)); 2349 | dstp3[idx] = max(0, min((int) round((multiplier * srcp3[idx] + wdstp3[idx]) / denominator), peak)); 2350 | } 2351 | } 2352 | --------------------------------------------------------------------------------