├── source
    ├── config.h.in
    ├── vsnlm.cpp
    └── nlm.ispc
├── README.md
├── .github
    └── workflows
    │   ├── linux-arm64.yml
    │   ├── linux.yml
    │   └── windows.yml
├── CMakeLists.txt
└── LICENSE


/source/config.h.in:
--------------------------------------------------------------------------------
1 | #define VERSION "@VCS_TAG@"  // placeholder is substituted by configure_file() in CMakeLists.txt


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # vs-nlm-ispc
 2 | Non-local means denoise filter, a drop-in replacement for the venerable [KNLMeansCL](https://github.com/Khanattila/KNLMeansCL), but without the OpenCL dependency (CPU only).
 3 | 
 4 | Both x86 and ARM CPUs are supported.
 5 | 
 6 | ## Usage
 7 | Prototype:
 8 | 
 9 | `core.nlm_ispc.NLMeans(clip clip[, int d = 1, int a = 2, int s = 4, float h = 1.2, string channels = "AUTO", int wmode = 0, float wref = 1.0, clip rclip = None])`
10 | 
11 | ## Compilation
12 | [ISPC](https://github.com/ispc/ispc) is required.
13 | 
14 | ### x86
15 | ```bash
16 | cmake -S . -B build -D CMAKE_BUILD_TYPE=Release \
17 | -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8" \
18 | -D CMAKE_ISPC_FLAGS="--opt=fast-math"
19 | 
20 | cmake --build build
21 | 
22 | cmake --install build
23 | ```
24 | 
25 | ### arm
26 | ```bash
27 | cmake -S . -B build -D CMAKE_BUILD_TYPE=Release \
28 | -D CMAKE_ISPC_INSTRUCTION_SETS="neon-i32x4" \
29 | -D CMAKE_ISPC_FLAGS="--opt=fast-math"
30 | 
31 | cmake --build build
32 | 
33 | cmake --install build
34 | ```
35 | 
36 | 


--------------------------------------------------------------------------------
/.github/workflows/linux-arm64.yml:
--------------------------------------------------------------------------------
 1 | name: Build (Linux, ARM64)
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - 'source/nlm.ispc'
 7 |       - 'source/vsnlm.cpp'
 8 |       - 'CMakeLists.txt'
 9 |       - '.github/workflows/linux-arm64.yml'
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   build-linux:
14 |     runs-on: ubuntu-24.04-arm
15 |     steps:
16 |     - name: Checkout repo
17 |       uses: actions/checkout@v4
18 |       with:
19 |         # Full history and tags: CMakeLists.txt runs `git describe --tags`.
20 |         fetch-depth: 0
21 | 
22 |     - name: Download ISPC
23 |       # -f makes curl fail on HTTP errors instead of saving the error page as the archive.
24 |       run: |
25 |         curl -fsSL -o ispc.tar.gz https://github.com/ispc/ispc/releases/download/v1.25.3/ispc-v1.25.3-linux.aarch64.tar.gz
26 |         tar -xzf ispc.tar.gz
27 |         mv ispc-*/ ispc/
28 | 
29 |     - name: Download VapourSynth headers
30 |       run: |
31 |         wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
32 |         unzip -q vs.zip
33 |         mv vapoursynth*/ vapoursynth
34 | 
35 |     - name: Setup Ninja
36 |       run: pip install ninja
37 | 
38 |     - name: Configure
39 |       run: cmake -S . -B build -G Ninja -LA
40 |         -D CMAKE_BUILD_TYPE=Release
41 |         -D CMAKE_CXX_FLAGS="-Wall"
42 |         -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc"
43 |         -D CMAKE_ISPC_INSTRUCTION_SETS="neon-i32x4"
44 |         -D CMAKE_ISPC_FLAGS="--opt=fast-math"
45 |         -D VS_INCLUDE_DIR="$(pwd)/vapoursynth/include"
46 | 
47 |     - name: Build
48 |       run: cmake --build build --verbose
49 | 
50 |     - name: Install
51 |       run: cmake --install build --prefix install
52 | 
53 |     - name: Upload
54 |       uses: actions/upload-artifact@v4
55 |       # Step is currently disabled; remove the condition to publish the built library.
56 |       if: false
57 |       with:
58 |         name: Linux-arm64
59 |         path: install/lib/*.so
60 | 
61 | 


--------------------------------------------------------------------------------
/.github/workflows/linux.yml:
--------------------------------------------------------------------------------
 1 | name: Build (Linux)
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - 'source/nlm.ispc'
 7 |       - 'source/vsnlm.cpp'
 8 |       - 'CMakeLists.txt'
 9 |       - '.github/workflows/linux.yml'
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   build-linux:
14 |     runs-on: ubuntu-22.04
15 |     steps:
16 |     - name: Checkout repo
17 |       uses: actions/checkout@v4
18 |       with:
19 |         # Full history and tags: CMakeLists.txt runs `git describe --tags`.
20 |         fetch-depth: 0
21 | 
22 |     - name: Download ISPC
23 |       # -f makes curl fail on HTTP errors instead of saving the error page as the archive.
24 |       run: |
25 |         curl -fsSL -o ispc.tar.gz https://github.com/ispc/ispc/releases/download/v1.20.0/ispc-v1.20.0-linux.tar.gz
26 |         tar -xzf ispc.tar.gz
27 |         mv ispc-*/ ispc/
28 | 
29 |     - name: Download VapourSynth headers
30 |       run: |
31 |         wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
32 |         unzip -q vs.zip
33 |         mv vapoursynth*/ vapoursynth
34 | 
35 |     - name: Setup Ninja
36 |       run: pip install ninja
37 | 
38 |     - name: Configure
39 |       run: cmake -S . -B build -G Ninja -LA
40 |         -D CMAKE_BUILD_TYPE=Release
41 |         -D CMAKE_CXX_COMPILER=g++-12
42 |         -D CMAKE_CXX_FLAGS="-Wall"
43 |         -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc"
44 |         -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8"
45 |         -D CMAKE_ISPC_FLAGS="--opt=fast-math"
46 |         -D VS_INCLUDE_DIR="$(pwd)/vapoursynth/include"
47 | 
48 |     - name: Build
49 |       run: cmake --build build --verbose
50 | 
51 |     - name: Install
52 |       run: cmake --install build --prefix install
53 | 
54 |     - name: Upload
55 |       uses: actions/upload-artifact@v4
56 |       # Step is currently disabled; remove the condition to publish the built library.
57 |       if: false
58 |       with:
59 |         name: Linux-x64
60 |         path: install/lib/*.so
61 | 
62 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Build script for the vs-nlm-ispc plugin: compiles source/vsnlm.cpp and
 2 | # source/nlm.ispc into the shared library `vsnlm_ispc`.
 3 | cmake_minimum_required(VERSION 3.20.0)
 4 | 
 5 | project(vs-nlm-ispc VERSION 0.1 LANGUAGES CXX ISPC)
 6 | 
 7 | add_library(vsnlm_ispc SHARED source/vsnlm.cpp source/nlm.ispc)
 8 | 
 9 | set_target_properties(vsnlm_ispc PROPERTIES
10 |     CXX_EXTENSIONS OFF
11 |     CXX_STANDARD 17
12 |     CXX_STANDARD_REQUIRED ON
13 | )
14 | 
15 | # Preferred path: locate VapourSynth through pkg-config, which supplies both
16 | # the include directories and the library directory used for installation.
17 | find_package(PkgConfig QUIET MODULE)
18 | 
19 | if(PKG_CONFIG_FOUND)
20 |     pkg_search_module(VS vapoursynth)
21 | 
22 |     if(VS_FOUND)
23 |         message(STATUS "Found VapourSynth r${VS_VERSION}")
24 | 
25 |         # Install into "<VS_LIBDIR>/vapoursynth". OUTPUT_VARIABLE keeps the
26 |         # result independent of any pre-existing value of install_dir.
27 |         cmake_path(APPEND VS_LIBDIR vapoursynth OUTPUT_VARIABLE install_dir)
28 |         target_include_directories(vsnlm_ispc PRIVATE ${VS_INCLUDE_DIRS})
29 | 
30 |         install(TARGETS vsnlm_ispc LIBRARY DESTINATION "${install_dir}")
31 |     endif()
32 | endif()
33 | 
34 | # Fallback: VapourSynth was not found via pkg-config, so the header location
35 | # has to be passed in with -D VS_INCLUDE_DIR=<path>. The target is installed
36 | # to CMake's default LIBRARY/RUNTIME destinations.
37 | if(NOT VS_FOUND)
38 |     set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers")
39 | 
40 |     if("${VS_INCLUDE_DIR}" STREQUAL "")
41 |         message(WARNING "VapourSynth not found")
42 |     else()
43 |         target_include_directories(vsnlm_ispc PRIVATE ${VS_INCLUDE_DIR})
44 |     endif()
45 | 
46 |     install(TARGETS vsnlm_ispc LIBRARY RUNTIME)
47 | endif()
48 | 
49 | # Derive the plugin version string from Git. This runs at configure time only,
50 | # so the value is not refreshed until CMake is re-run.
51 | find_package(Git QUIET)
52 | 
53 | if(GIT_FOUND)
54 |     execute_process(
55 |         COMMAND "${GIT_EXECUTABLE}" describe --tags --long --always
56 |         # Use this project's own directory so the lookup still works when the
57 |         # project is added as a subdirectory of another build.
58 |         WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
59 |         OUTPUT_VARIABLE VCS_TAG
60 |         OUTPUT_STRIP_TRAILING_WHITESPACE
61 |     )
62 | endif()
63 | 
64 | # Compare against the empty string explicitly: a plain if(VCS_TAG) would treat
65 | # tags such as "0" or "OFF" as false.
66 | if(NOT "${VCS_TAG}" STREQUAL "")
67 |     message(STATUS "vs-nlm-ispc ${VCS_TAG}")
68 | else()
69 |     message(WARNING "unknown plugin version")
70 |     set(VCS_TAG "unknown")
71 | endif()
72 | 
73 | # Generate config.h (defines VERSION) in the build tree; @ONLY restricts
74 | # substitution to @VAR@ placeholders.
75 | configure_file(
76 |     "${CMAKE_CURRENT_SOURCE_DIR}/source/config.h.in"
77 |     "${CMAKE_CURRENT_BINARY_DIR}/config.h"
78 |     @ONLY
79 | )
80 | 
81 | target_include_directories(vsnlm_ispc PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
82 | 


--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
 1 | name: Build (Windows)
 2 | 
 3 | on:
 4 |   push:
 5 |     paths:
 6 |       - 'source/nlm.ispc'
 7 |       - 'source/vsnlm.cpp'
 8 |       - 'CMakeLists.txt'
 9 |       - '.github/workflows/windows.yml'
10 |   workflow_dispatch:
11 |     inputs:
12 |       tag:
13 |         description: 'which tag to upload to'
14 |         default: ''
15 | 
16 | jobs:
17 |   build-windows:
18 |     runs-on: windows-2022
19 |     # NOTE(review): no step in this workflow declares `id: output`, so runID
20 |     # appears to evaluate to an empty string - confirm whether it is still needed.
21 |     outputs:
22 |       runID: ${{ steps.output.outputs.runID }}
23 | 
24 |     defaults:
25 |       run:
26 |         shell: cmd
27 | 
28 |     steps:
29 |     - name: Checkout repo
30 |       uses: actions/checkout@v4
31 |       with:
32 |         # Full history and tags: CMakeLists.txt and the Describe step run `git describe --tags`.
33 |         fetch-depth: 0
34 | 
35 |     - name: Setup MSVC
36 |       uses: ilammy/msvc-dev-cmd@v1
37 | 
38 |     - name: Download VapourSynth headers
39 |       # -f makes curl fail on HTTP errors instead of saving the error page as the archive.
40 |       run: |
41 |         curl -fsSL -o vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
42 |         unzip -q vs.zip
43 |         mv vapoursynth-*/ vapoursynth/
44 | 
45 |     - name: Download ISPC
46 |       run: |
47 |         curl -fsSL -o ispc.zip https://github.com/ispc/ispc/releases/download/v1.20.0/ispc-v1.20.0-windows.zip
48 |         unzip -q ispc.zip
49 |         mv ispc-*/ ispc/
50 |         tree ispc
51 | 
52 |     - name: Configure
53 |       shell: bash
54 |       run: cmake -S . -B build -G Ninja
55 |         -D VS_INCLUDE_DIR="$(pwd)\vapoursynth\include"
56 |         -D CMAKE_BUILD_TYPE=Release
57 |         -D CMAKE_CXX_COMPILER="clang++"
58 |         -D CMAKE_CXX_FLAGS="-Wall"
59 |         -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc.exe"
60 |         -D CMAKE_ISPC_FLAGS="--opt=fast-math"
61 |         -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8"
62 |         -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
63 | 
64 |     - name: Build
65 |       run: cmake --build build --verbose
66 | 
67 |     - name: Install
68 |       run: |
69 |         cmake --install build --prefix install
70 |         mkdir artifact
71 |         copy install\bin\vsnlm_ispc.dll artifact\
72 | 
73 |     - name: Upload
74 |       uses: actions/upload-artifact@v4
75 |       with:
76 |         name: Windows-x64
77 |         path: artifact
78 | 
79 |     - name: Describe
80 |       run: git describe --tags --long
81 | 
82 |     - name: Compress artifact for release
83 |       if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
84 |       run: |
85 |         cd artifact
86 |         7z a -t7z -mx=7 ../vs-nlm-ispc-windows-x64.${{ github.event.inputs.tag }}.7z .
87 | 
88 |     - name: Release
89 |       uses: softprops/action-gh-release@v1
90 |       if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
91 |       with:
92 |         tag_name: ${{ github.event.inputs.tag }}
93 |         files: vs-nlm-ispc-windows-x64.${{ github.event.inputs.tag }}.7z
94 |         fail_on_unmatched_files: true
95 |         generate_release_notes: false
96 |         prerelease: true
97 | 
98 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU General Public License is a free, copyleft license for
 11 | software and other kinds of works.
 12 | 
 13 |   The licenses for most software and other practical works are designed
 14 | to take away your freedom to share and change the works.  By contrast,
 15 | the GNU General Public License is intended to guarantee your freedom to
 16 | share and change all versions of a program--to make sure it remains free
 17 | software for all its users.  We, the Free Software Foundation, use the
 18 | GNU General Public License for most of our software; it applies also to
 19 | any other work released this way by its authors.  You can apply it to
 20 | your programs, too.
 21 | 
 22 |   When we speak of free software, we are referring to freedom, not
 23 | price.  Our General Public Licenses are designed to make sure that you
 24 | have the freedom to distribute copies of free software (and charge for
 25 | them if you wish), that you receive source code or can get it if you
 26 | want it, that you can change the software or use pieces of it in new
 27 | free programs, and that you know you can do these things.
 28 | 
 29 |   To protect your rights, we need to prevent others from denying you
 30 | these rights or asking you to surrender the rights.  Therefore, you have
 31 | certain responsibilities if you distribute copies of the software, or if
 32 | you modify it: responsibilities to respect the freedom of others.
 33 | 
 34 |   For example, if you distribute copies of such a program, whether
 35 | gratis or for a fee, you must pass on to the recipients the same
 36 | freedoms that you received.  You must make sure that they, too, receive
 37 | or can get the source code.  And you must show them these terms so they
 38 | know their rights.
 39 | 
 40 |   Developers that use the GNU GPL protect your rights with two steps:
 41 | (1) assert copyright on the software, and (2) offer you this License
 42 | giving you legal permission to copy, distribute and/or modify it.
 43 | 
 44 |   For the developers' and authors' protection, the GPL clearly explains
 45 | that there is no warranty for this free software.  For both users' and
 46 | authors' sake, the GPL requires that modified versions be marked as
 47 | changed, so that their problems will not be attributed erroneously to
 48 | authors of previous versions.
 49 | 
 50 |   Some devices are designed to deny users access to install or run
 51 | modified versions of the software inside them, although the manufacturer
 52 | can do so.  This is fundamentally incompatible with the aim of
 53 | protecting users' freedom to change the software.  The systematic
 54 | pattern of such abuse occurs in the area of products for individuals to
 55 | use, which is precisely where it is most unacceptable.  Therefore, we
 56 | have designed this version of the GPL to prohibit the practice for those
 57 | products.  If such problems arise substantially in other domains, we
 58 | stand ready to extend this provision to those domains in future versions
 59 | of the GPL, as needed to protect the freedom of users.
 60 | 
 61 |   Finally, every program is threatened constantly by software patents.
 62 | States should not allow patents to restrict development and use of
 63 | software on general-purpose computers, but in those that do, we wish to
 64 | avoid the special danger that patents applied to a free program could
 65 | make it effectively proprietary.  To prevent this, the GPL assures that
 66 | patents cannot be used to render the program non-free.
 67 | 
 68 |   The precise terms and conditions for copying, distribution and
 69 | modification follow.
 70 | 
 71 |                        TERMS AND CONDITIONS
 72 | 
 73 |   0. Definitions.
 74 | 
 75 |   "This License" refers to version 3 of the GNU General Public License.
 76 | 
 77 |   "Copyright" also means copyright-like laws that apply to other kinds of
 78 | works, such as semiconductor masks.
 79 | 
 80 |   "The Program" refers to any copyrightable work licensed under this
 81 | License.  Each licensee is addressed as "you".  "Licensees" and
 82 | "recipients" may be individuals or organizations.
 83 | 
 84 |   To "modify" a work means to copy from or adapt all or part of the work
 85 | in a fashion requiring copyright permission, other than the making of an
 86 | exact copy.  The resulting work is called a "modified version" of the
 87 | earlier work or a work "based on" the earlier work.
 88 | 
 89 |   A "covered work" means either the unmodified Program or a work based
 90 | on the Program.
 91 | 
 92 |   To "propagate" a work means to do anything with it that, without
 93 | permission, would make you directly or secondarily liable for
 94 | infringement under applicable copyright law, except executing it on a
 95 | computer or modifying a private copy.  Propagation includes copying,
 96 | distribution (with or without modification), making available to the
 97 | public, and in some countries other activities as well.
 98 | 
 99 |   To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies.  Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 | 
103 |   An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License.  If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 | 
112 |   1. Source Code.
113 | 
114 |   The "source code" for a work means the preferred form of the work
115 | for making modifications to it.  "Object code" means any non-source
116 | form of a work.
117 | 
118 |   A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 | 
123 |   The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form.  A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 | 
134 |   The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities.  However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work.  For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 | 
147 |   The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 | 
151 |   The Corresponding Source for a work in source code form is that
152 | same work.
153 | 
154 |   2. Basic Permissions.
155 | 
156 |   All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met.  This License explicitly affirms your unlimited
159 | permission to run the unmodified Program.  The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work.  This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 | 
164 |   You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force.  You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright.  Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 | 
175 |   Conveying under any other circumstances is permitted solely under
176 | the conditions stated below.  Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 | 
179 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 | 
181 |   No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 | 
187 |   When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 | 
195 |   4. Conveying Verbatim Copies.
196 | 
197 |   You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 | 
205 |   You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 | 
208 |   5. Conveying Modified Source Versions.
209 | 
210 |   You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 | 
214 |     a) The work must carry prominent notices stating that you modified
215 |     it, and giving a relevant date.
216 | 
217 |     b) The work must carry prominent notices stating that it is
218 |     released under this License and any conditions added under section
219 |     7.  This requirement modifies the requirement in section 4 to
220 |     "keep intact all notices".
221 | 
222 |     c) You must license the entire work, as a whole, under this
223 |     License to anyone who comes into possession of a copy.  This
224 |     License will therefore apply, along with any applicable section 7
225 |     additional terms, to the whole of the work, and all its parts,
226 |     regardless of how they are packaged.  This License gives no
227 |     permission to license the work in any other way, but it does not
228 |     invalidate such permission if you have separately received it.
229 | 
230 |     d) If the work has interactive user interfaces, each must display
231 |     Appropriate Legal Notices; however, if the Program has interactive
232 |     interfaces that do not display Appropriate Legal Notices, your
233 |     work need not make them do so.
234 | 
235 |   A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit.  Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 | 
245 |   6. Conveying Non-Source Forms.
246 | 
247 |   You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 | 
252 |     a) Convey the object code in, or embodied in, a physical product
253 |     (including a physical distribution medium), accompanied by the
254 |     Corresponding Source fixed on a durable physical medium
255 |     customarily used for software interchange.
256 | 
257 |     b) Convey the object code in, or embodied in, a physical product
258 |     (including a physical distribution medium), accompanied by a
259 |     written offer, valid for at least three years and valid for as
260 |     long as you offer spare parts or customer support for that product
261 |     model, to give anyone who possesses the object code either (1) a
262 |     copy of the Corresponding Source for all the software in the
263 |     product that is covered by this License, on a durable physical
264 |     medium customarily used for software interchange, for a price no
265 |     more than your reasonable cost of physically performing this
266 |     conveying of source, or (2) access to copy the
267 |     Corresponding Source from a network server at no charge.
268 | 
269 |     c) Convey individual copies of the object code with a copy of the
270 |     written offer to provide the Corresponding Source.  This
271 |     alternative is allowed only occasionally and noncommercially, and
272 |     only if you received the object code with such an offer, in accord
273 |     with subsection 6b.
274 | 
275 |     d) Convey the object code by offering access from a designated
276 |     place (gratis or for a charge), and offer equivalent access to the
277 |     Corresponding Source in the same way through the same place at no
278 |     further charge.  You need not require recipients to copy the
279 |     Corresponding Source along with the object code.  If the place to
280 |     copy the object code is a network server, the Corresponding Source
281 |     may be on a different server (operated by you or a third party)
282 |     that supports equivalent copying facilities, provided you maintain
283 |     clear directions next to the object code saying where to find the
284 |     Corresponding Source.  Regardless of what server hosts the
285 |     Corresponding Source, you remain obligated to ensure that it is
286 |     available for as long as needed to satisfy these requirements.
287 | 
288 |     e) Convey the object code using peer-to-peer transmission, provided
289 |     you inform other peers where the object code and Corresponding
290 |     Source of the work are being offered to the general public at no
291 |     charge under subsection 6d.
292 | 
293 |   A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 | 
297 |   A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling.  In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage.  For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product.  A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 | 
310 |   "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source.  The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 | 
318 |   If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information.  But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 | 
329 |   The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed.  Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 | 
337 |   Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 | 
343 |   7. Additional Terms.
344 | 
345 |   "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law.  If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 | 
354 |   When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it.  (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.)  You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 | 
361 |   Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 | 
365 |     a) Disclaiming warranty or limiting liability differently from the
366 |     terms of sections 15 and 16 of this License; or
367 | 
368 |     b) Requiring preservation of specified reasonable legal notices or
369 |     author attributions in that material or in the Appropriate Legal
370 |     Notices displayed by works containing it; or
371 | 
372 |     c) Prohibiting misrepresentation of the origin of that material, or
373 |     requiring that modified versions of such material be marked in
374 |     reasonable ways as different from the original version; or
375 | 
376 |     d) Limiting the use for publicity purposes of names of licensors or
377 |     authors of the material; or
378 | 
379 |     e) Declining to grant rights under trademark law for use of some
380 |     trade names, trademarks, or service marks; or
381 | 
382 |     f) Requiring indemnification of licensors and authors of that
383 |     material by anyone who conveys the material (or modified versions of
384 |     it) with contractual assumptions of liability to the recipient, for
385 |     any liability that these contractual assumptions directly impose on
386 |     those licensors and authors.
387 | 
388 |   All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10.  If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term.  If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 | 
398 |   If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 | 
403 |   Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 | 
407 |   8. Termination.
408 | 
409 |   You may not propagate or modify a covered work except as expressly
410 | provided under this License.  Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 | 
415 |   However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 | 
422 |   Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 | 
429 |   Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License.  If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 | 
435 |   9. Acceptance Not Required for Having Copies.
436 | 
437 |   You are not required to accept this License in order to receive or
438 | run a copy of the Program.  Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance.  However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work.  These actions infringe copyright if you do
443 | not accept this License.  Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 | 
446 |   10. Automatic Licensing of Downstream Recipients.
447 | 
448 |   Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License.  You are not responsible
451 | for enforcing compliance by third parties with this License.
452 | 
453 |   An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations.  If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 | 
463 |   You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License.  For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 | 
471 |   11. Patents.
472 | 
473 |   A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based.  The
475 | work thus licensed is called the contributor's "contributor version".
476 | 
477 |   A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version.  For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 | 
487 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 | 
492 |   In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement).  To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 | 
499 |   If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients.  "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 | 
513 |   If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 | 
521 |   A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License.  You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 | 
536 |   Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 | 
540 |   12. No Surrender of Others' Freedom.
541 | 
542 |   If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License.  If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all.  For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 | 
552 |   13. Use with the GNU Affero General Public License.
553 | 
554 |   Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work.  The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 | 
563 |   14. Revised Versions of this License.
564 | 
565 |   The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time.  Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 | 
570 |   Each version is given a distinguishing version number.  If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation.  If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 | 
579 |   If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 | 
584 |   Later license versions may give you additional or different
585 | permissions.  However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 |   15. Disclaimer of Warranty.
590 | 
591 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 |   16. Limitation of Liability.
601 | 
602 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 |   17. Interpretation of Sections 15 and 16.
613 | 
614 |   If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 |                      END OF TERMS AND CONDITIONS
622 | 
623 |             How to Apply These Terms to Your New Programs
624 | 
625 |   If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 |   To do so, attach the following notices to the program.  It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 |     <one line to give the program's name and a brief idea of what it does.>
635 |     Copyright (C) <year>  <name of author>
636 | 
637 |     This program is free software: you can redistribute it and/or modify
638 |     it under the terms of the GNU General Public License as published by
639 |     the Free Software Foundation, either version 3 of the License, or
640 |     (at your option) any later version.
641 | 
642 |     This program is distributed in the hope that it will be useful,
643 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
644 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
645 |     GNU General Public License for more details.
646 | 
647 |     You should have received a copy of the GNU General Public License
648 |     along with this program.  If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 |   If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 |     <program>  Copyright (C) <year>  <name of author>
656 |     This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 |     This is free software, and you are welcome to redistribute it
658 |     under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License.  Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 |   You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 |   The GNU General Public License does not permit incorporating your program
670 | into proprietary programs.  If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library.  If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License.  But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 | 


--------------------------------------------------------------------------------
/source/vsnlm.cpp:
--------------------------------------------------------------------------------
   1 | // based on KNLMeansCL by Khanattila
   2 | 
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <mutex>
#include <shared_mutex>
#include <stdexcept>
#include <thread>
#include <type_traits>
#include <unordered_map>
  16 | 
  17 | #include <VapourSynth.h>
  18 | #include <VSHelper.h>
  19 | 
  20 | #include <nlm_ispc.h> // generated by the ispc compiler
  21 | 
  22 | #include <config.h> // generated by cmake and git
  23 | 
  24 | namespace {
  25 | enum struct ChannelMode { Y, UV, YUV, RGB };
  26 | 
  27 | struct NLMData {
  28 |     VSNodeRef * node; // clip
  29 |     const VSVideoInfo *vi;
  30 |     int d;
  31 |     int a;
  32 |     int s;
  33 |     float h;
  34 |     ChannelMode channels;
  35 |     decltype(&ispc::nlmVerticalWelsch) nlm_vertical; // wmode
  36 |     float wref;
  37 |     VSNodeRef * ref_node; // rclip
  38 | 
  39 |     // run-time resources
  40 |     std::shared_mutex workspaces_lock;
  41 |     std::unordered_map<std::thread::id, float *> workspaces;
  42 | };
  43 | }
  44 | 
  45 | template <typename T>
  46 | static inline auto castVoidPtr(T * p) noexcept {
  47 |     if constexpr (std::is_const_v<T>) {
  48 |         return reinterpret_cast<const void *>(p);
  49 |     } else {
  50 |         return reinterpret_cast<void *>(p);
  51 |     }
  52 | }
  53 | 
  54 | template <typename T1, typename T2>
  55 | static inline std::array<T1 *, 3> castPtrs(std::array<T2 *, 3> ptrs) {
  56 |     return {
  57 |         (T1 *) ptrs[0],
  58 |         (T1 *) ptrs[1],
  59 |         (T1 *) ptrs[2],
  60 |     };
  61 | }
  62 | 
  63 | template <typename T>
  64 | static inline constexpr T square(T x) noexcept {
  65 |     return x * x;
  66 | }
  67 | 
  68 | // T: (const) VSFrameRef
  69 | template <typename T>
  70 | static inline auto getPtrs(
  71 |     T * frame,
  72 |     ChannelMode channels,
  73 |     const VSAPI * vsapi
  74 | ) noexcept {
  75 | 
  76 |     using value_type = std::conditional_t<std::is_const_v<T>, const void, void>;
  77 | 
  78 |     std::array<value_type *, 3> ptrs {};
  79 | 
  80 |     auto get_ptr = [frame, vsapi](int plane) {
  81 |         if constexpr (std::is_const_v<T>) {
  82 |             return castVoidPtr(vsapi->getReadPtr(frame, plane));
  83 |         } else {
  84 |             return castVoidPtr(vsapi->getWritePtr(frame, plane));
  85 |         }
  86 |     };
  87 | 
  88 |     switch (channels) {
  89 |         case ChannelMode::Y:
  90 |             ptrs[0] = get_ptr(0);
  91 |             break;
  92 |         case ChannelMode::UV:
  93 |             ptrs[1] = get_ptr(1);
  94 |             ptrs[2] = get_ptr(2);
  95 |             break;
  96 |         case ChannelMode::YUV:
  97 |         case ChannelMode::RGB:
  98 |             ptrs[0] = get_ptr(0);
  99 |             ptrs[1] = get_ptr(1);
 100 |             ptrs[2] = get_ptr(2);
 101 |             break;
 102 |     }
 103 | 
 104 |     return ptrs;
 105 | }
 106 | 
 107 | static void VS_CC nlmInit(
 108 |     VSMap * in,
 109 |     VSMap * out,
 110 |     void ** instanceData,
 111 |     VSNode * node,
 112 |     VSCore * core,
 113 |     const VSAPI * vsapi
 114 | ) noexcept {
 115 | 
 116 |     const auto * d = reinterpret_cast<const NLMData *>(*instanceData);
 117 |     vsapi->setVideoInfo(vsapi->getVideoInfo(d->node), 1, node);
 118 | }
 119 | 
 120 | static inline void nlmDistanceDispatch_f32(
 121 |     float * temp0,
 122 |     std::array<const float *, 3> centerp,
 123 |     std::array<const float *, 3> neighborp,
 124 |     int offset_x,
 125 |     int offset_y,
 126 |     int width,
 127 |     int height,
 128 |     int stride,
 129 |     ChannelMode channels
 130 | ) noexcept {
 131 | 
 132 |     switch (channels) {
 133 |         case ChannelMode::Y:
 134 |             ispc::nlmDistanceLuma_f32(
 135 |                 temp0,
 136 |                 centerp[0],
 137 |                 neighborp[0],
 138 |                 offset_x, offset_y,
 139 |                 width, height, stride
 140 |             );
 141 |             break;
 142 |         case ChannelMode::UV:
 143 |             ispc::nlmDistanceChroma_f32(
 144 |                 temp0,
 145 |                 centerp[1], centerp[2],
 146 |                 neighborp[1], neighborp[2],
 147 |                 offset_x, offset_y,
 148 |                 width, height, stride
 149 |             );
 150 |             break;
 151 |         case ChannelMode::YUV:
 152 |             ispc::nlmDistanceYUV_f32(
 153 |                 temp0,
 154 |                 centerp[0], centerp[1], centerp[2],
 155 |                 neighborp[0], neighborp[1], neighborp[2],
 156 |                 offset_x, offset_y,
 157 |                 width, height, stride
 158 |             );
 159 |             break;
 160 |         case ChannelMode::RGB:
 161 |             ispc::nlmDistanceRGB_f32(
 162 |                 temp0,
 163 |                 centerp[0], centerp[1], centerp[2],
 164 |                 neighborp[0], neighborp[1], neighborp[2],
 165 |                 offset_x, offset_y,
 166 |                 width, height, stride
 167 |             );
 168 |             break;
 169 |     }
 170 | }
 171 | 
 172 | static inline void nlmDistanceDispatch_u8(
 173 |     float * temp0,
 174 |     std::array<const uint8_t *, 3> centerp,
 175 |     std::array<const uint8_t *, 3> neighborp,
 176 |     int offset_x,
 177 |     int offset_y,
 178 |     int width,
 179 |     int height,
 180 |     int stride,
 181 |     ChannelMode channels,
 182 |     float inv_divisor
 183 | ) noexcept {
 184 | 
 185 |     switch (channels) {
 186 |         case ChannelMode::Y:
 187 |             ispc::nlmDistanceLuma_u8(
 188 |                 temp0,
 189 |                 centerp[0],
 190 |                 neighborp[0],
 191 |                 offset_x, offset_y,
 192 |                 width, height, stride,
 193 |                 inv_divisor
 194 |             );
 195 |             break;
 196 |         case ChannelMode::UV:
 197 |             ispc::nlmDistanceChroma_u8(
 198 |                 temp0,
 199 |                 centerp[1], centerp[2],
 200 |                 neighborp[1], neighborp[2],
 201 |                 offset_x, offset_y,
 202 |                 width, height, stride,
 203 |                 inv_divisor
 204 |             );
 205 |             break;
 206 |         case ChannelMode::YUV:
 207 |             ispc::nlmDistanceYUV_u8(
 208 |                 temp0,
 209 |                 centerp[0], centerp[1], centerp[2],
 210 |                 neighborp[0], neighborp[1], neighborp[2],
 211 |                 offset_x, offset_y,
 212 |                 width, height, stride,
 213 |                 inv_divisor
 214 |             );
 215 |             break;
 216 |         case ChannelMode::RGB:
 217 |             ispc::nlmDistanceRGB_u8(
 218 |                 temp0,
 219 |                 centerp[0], centerp[1], centerp[2],
 220 |                 neighborp[0], neighborp[1], neighborp[2],
 221 |                 offset_x, offset_y,
 222 |                 width, height, stride,
 223 |                 inv_divisor
 224 |             );
 225 |             break;
 226 |     }
 227 | }
 228 | 
 229 | static inline void nlmDistanceDispatch_u16(
 230 |     float * temp0,
 231 |     std::array<const uint16_t *, 3> centerp,
 232 |     std::array<const uint16_t *, 3> neighborp,
 233 |     int offset_x,
 234 |     int offset_y,
 235 |     int width,
 236 |     int height,
 237 |     int stride,
 238 |     ChannelMode channels,
 239 |     float inv_divisor
 240 | ) noexcept {
 241 | 
 242 |     switch (channels) {
 243 |         case ChannelMode::Y:
 244 |             ispc::nlmDistanceLuma_u16(
 245 |                 temp0,
 246 |                 centerp[0],
 247 |                 neighborp[0],
 248 |                 offset_x, offset_y,
 249 |                 width, height, stride,
 250 |                 inv_divisor
 251 |             );
 252 |             break;
 253 |         case ChannelMode::UV:
 254 |             ispc::nlmDistanceChroma_u16(
 255 |                 temp0,
 256 |                 centerp[1], centerp[2],
 257 |                 neighborp[1], neighborp[2],
 258 |                 offset_x, offset_y,
 259 |                 width, height, stride,
 260 |                 inv_divisor
 261 |             );
 262 |             break;
 263 |         case ChannelMode::YUV:
 264 |             ispc::nlmDistanceYUV_u16(
 265 |                 temp0,
 266 |                 centerp[0], centerp[1], centerp[2],
 267 |                 neighborp[0], neighborp[1], neighborp[2],
 268 |                 offset_x, offset_y,
 269 |                 width, height, stride,
 270 |                 inv_divisor
 271 |             );
 272 |             break;
 273 |         case ChannelMode::RGB:
 274 |             ispc::nlmDistanceRGB_u16(
 275 |                 temp0,
 276 |                 centerp[0], centerp[1], centerp[2],
 277 |                 neighborp[0], neighborp[1], neighborp[2],
 278 |                 offset_x, offset_y,
 279 |                 width, height, stride,
 280 |                 inv_divisor
 281 |             );
 282 |             break;
 283 |     }
 284 | }
 285 | 
 286 | static inline void nlmDistance(
 287 |     float * temp0,
 288 |     std::array<const void *, 3> centerp,
 289 |     std::array<const void *, 3> neighborp,
 290 |     int offset_x,
 291 |     int offset_y,
 292 |     int width,
 293 |     int height,
 294 |     int stride,
 295 |     ChannelMode channels,
 296 |     int bits
 297 | ) noexcept {
 298 | 
 299 |     if (bits == 32) {
 300 |         nlmDistanceDispatch_f32(
 301 |             temp0,
 302 |             castPtrs<const float>(centerp), castPtrs<const float>(neighborp),
 303 |             offset_x, offset_y,
 304 |             width, height, stride, channels
 305 |         );
 306 |     } else if (bits <= 8) {
 307 |         float inv_divisor = 1.0f / ((1 << bits) - 1);
 308 |         nlmDistanceDispatch_u8(
 309 |             temp0,
 310 |             castPtrs<const uint8_t>(centerp), castPtrs<const uint8_t>(neighborp),
 311 |             offset_x, offset_y,
 312 |             width, height, stride, channels, inv_divisor
 313 |         );
 314 |     } else if (bits <= 16) {
 315 |         float inv_divisor = 1.0f / ((1 << bits) - 1);
 316 |         nlmDistanceDispatch_u16(
 317 |             temp0,
 318 |             castPtrs<const uint16_t>(centerp), castPtrs<const uint16_t>(neighborp),
 319 |             offset_x, offset_y,
 320 |             width, height, stride, channels, inv_divisor
 321 |         );
 322 |     } else {
 323 |         assert(false);
 324 |     }
 325 | }
 326 | 
 327 | static inline void nlmAccumulationDispatch_f32(
 328 |     float * weightp,
 329 |     std::array<float *, 3> wdstp,
 330 |     float * max_weightp,
 331 |     std::array<const float *, 3> srcp_bwd,
 332 |     std::array<const float *, 3> srcp_fwd,
 333 |     const float * temp_bwd,
 334 |     const float * temp_fwd,
 335 |     int offset_x,
 336 |     int offset_y,
 337 |     int width,
 338 |     int height,
 339 |     int stride,
 340 |     ChannelMode channels
 341 | ) noexcept {
 342 | 
 343 |     switch (channels) {
 344 |         case ChannelMode::Y:
 345 |             ispc::nlmAccumulationCh1_f32(
 346 |                 weightp, wdstp[0], max_weightp,
 347 |                 srcp_bwd[0],
 348 |                 srcp_fwd[0],
 349 |                 temp_bwd, temp_fwd,
 350 |                 offset_x, offset_y,
 351 |                 width, height, stride
 352 |             );
 353 |             break;
 354 |         case ChannelMode::UV:
 355 |             ispc::nlmAccumulationCh2_f32(
 356 |                 weightp, wdstp[0], wdstp[1], max_weightp,
 357 |                 srcp_bwd[1], srcp_bwd[2],
 358 |                 srcp_fwd[1], srcp_fwd[2],
 359 |                 temp_bwd, temp_fwd,
 360 |                 offset_x, offset_y,
 361 |                 width, height, stride
 362 |             );
 363 |             break;
 364 |         case ChannelMode::YUV:
 365 |         case ChannelMode::RGB:
 366 |             ispc::nlmAccumulationCh3_f32(
 367 |                 weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp,
 368 |                 srcp_bwd[0], srcp_bwd[1], srcp_bwd[2],
 369 |                 srcp_fwd[0], srcp_fwd[1], srcp_fwd[2],
 370 |                 temp_bwd, temp_fwd,
 371 |                 offset_x, offset_y,
 372 |                 width, height, stride
 373 |             );
 374 |             break;
 375 |     }
 376 | }
 377 | 
 378 | static inline void nlmAccumulationDispatch_u8(
 379 |     float * weightp,
 380 |     std::array<float *, 3> wdstp,
 381 |     float * max_weightp,
 382 |     std::array<const uint8_t *, 3> srcp_bwd,
 383 |     std::array<const uint8_t *, 3> srcp_fwd,
 384 |     const float * temp_bwd,
 385 |     const float * temp_fwd,
 386 |     int offset_x,
 387 |     int offset_y,
 388 |     int width,
 389 |     int height,
 390 |     int stride,
 391 |     ChannelMode channels
 392 | ) noexcept {
 393 | 
 394 |     switch (channels) {
 395 |         case ChannelMode::Y:
 396 |             ispc::nlmAccumulationCh1_u8(
 397 |                 weightp, wdstp[0], max_weightp,
 398 |                 srcp_bwd[0],
 399 |                 srcp_fwd[0],
 400 |                 temp_bwd, temp_fwd,
 401 |                 offset_x, offset_y,
 402 |                 width, height, stride
 403 |             );
 404 |             break;
 405 |         case ChannelMode::UV:
 406 |             ispc::nlmAccumulationCh2_u8(
 407 |                 weightp, wdstp[0], wdstp[1], max_weightp,
 408 |                 srcp_bwd[1], srcp_bwd[2],
 409 |                 srcp_fwd[1], srcp_fwd[2],
 410 |                 temp_bwd, temp_fwd,
 411 |                 offset_x, offset_y,
 412 |                 width, height, stride
 413 |             );
 414 |             break;
 415 |         case ChannelMode::YUV:
 416 |         case ChannelMode::RGB:
 417 |             ispc::nlmAccumulationCh3_u8(
 418 |                 weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp,
 419 |                 srcp_bwd[0], srcp_bwd[1], srcp_bwd[2],
 420 |                 srcp_fwd[0], srcp_fwd[1], srcp_fwd[2],
 421 |                 temp_bwd, temp_fwd,
 422 |                 offset_x, offset_y,
 423 |                 width, height, stride
 424 |             );
 425 |             break;
 426 |     }
 427 | }
 428 | 
 429 | static inline void nlmAccumulationDispatch_u16(
 430 |     float * weightp,
 431 |     std::array<float *, 3> wdstp,
 432 |     float * max_weightp,
 433 |     std::array<const uint16_t *, 3> srcp_bwd,
 434 |     std::array<const uint16_t *, 3> srcp_fwd,
 435 |     const float * temp_bwd,
 436 |     const float * temp_fwd,
 437 |     int offset_x,
 438 |     int offset_y,
 439 |     int width,
 440 |     int height,
 441 |     int stride,
 442 |     ChannelMode channels
 443 | ) noexcept {
 444 | 
 445 |     switch (channels) {
 446 |         case ChannelMode::Y:
 447 |             ispc::nlmAccumulationCh1_u16(
 448 |                 weightp, wdstp[0], max_weightp,
 449 |                 srcp_bwd[0],
 450 |                 srcp_fwd[0],
 451 |                 temp_bwd, temp_fwd,
 452 |                 offset_x, offset_y,
 453 |                 width, height, stride
 454 |             );
 455 |             break;
 456 |         case ChannelMode::UV:
 457 |             ispc::nlmAccumulationCh2_u16(
 458 |                 weightp, wdstp[0], wdstp[1], max_weightp,
 459 |                 srcp_bwd[1], srcp_bwd[2],
 460 |                 srcp_fwd[1], srcp_fwd[2],
 461 |                 temp_bwd, temp_fwd,
 462 |                 offset_x, offset_y,
 463 |                 width, height, stride
 464 |             );
 465 |             break;
 466 |         case ChannelMode::YUV:
 467 |         case ChannelMode::RGB:
 468 |             ispc::nlmAccumulationCh3_u16(
 469 |                 weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp,
 470 |                 srcp_bwd[0], srcp_bwd[1], srcp_bwd[2],
 471 |                 srcp_fwd[0], srcp_fwd[1], srcp_fwd[2],
 472 |                 temp_bwd, temp_fwd,
 473 |                 offset_x, offset_y,
 474 |                 width, height, stride
 475 |             );
 476 |             break;
 477 |     }
 478 | }
 479 | 
 480 | static inline void nlmAccumulation(
 481 |     float * weightp,
 482 |     std::array<float *, 3> wdstp,
 483 |     float * max_weightp,
 484 |     std::array<const void *, 3> srcp_bwd,
 485 |     std::array<const void *, 3> srcp_fwd,
 486 |     const float * temp_bwd,
 487 |     const float * temp_fwd,
 488 |     int offset_x,
 489 |     int offset_y,
 490 |     int width,
 491 |     int height,
 492 |     int stride,
 493 |     ChannelMode channels,
 494 |     int bits
 495 | ) noexcept {
 496 | 
 497 |     if (bits == 32) {
 498 |         nlmAccumulationDispatch_f32(
 499 |             weightp, wdstp, max_weightp,
 500 |             castPtrs<const float>(srcp_bwd), castPtrs<const float>(srcp_fwd), temp_bwd, temp_fwd,
 501 |             offset_x, offset_y, width, height, stride, channels
 502 |         );
 503 |     } else if (bits <= 8) {
 504 |         nlmAccumulationDispatch_u8(
 505 |             weightp, wdstp, max_weightp,
 506 |             castPtrs<const uint8_t>(srcp_bwd), castPtrs<const uint8_t>(srcp_fwd), temp_bwd, temp_fwd,
 507 |             offset_x, offset_y, width, height, stride, channels
 508 |         );
 509 |     } else if (bits <= 16) {
 510 |         nlmAccumulationDispatch_u16(
 511 |             weightp, wdstp, max_weightp,
 512 |             castPtrs<const uint16_t>(srcp_bwd), castPtrs<const uint16_t>(srcp_fwd), temp_bwd, temp_fwd,
 513 |             offset_x, offset_y, width, height, stride, channels
 514 |         );
 515 |     } else {
 516 |         assert(false);
 517 |     }
 518 | }
 519 | 
 520 | static inline void nlmFinishDispatch_f32(
 521 |     std::array<float *, 3> dstp,
 522 |     std::array<const float *, 3> srcp,
 523 |     const float * weightp,
 524 |     std::array<float *, 3> wdstp,
 525 |     const float * max_weightp,
 526 |     float wref,
 527 |     int width,
 528 |     int height,
 529 |     int stride,
 530 |     ChannelMode channels
 531 | ) noexcept {
 532 | 
 533 |     switch (channels) {
 534 |         case ChannelMode::Y:
 535 |             ispc::nlmFinishCh1_f32(
 536 |                 dstp[0],
 537 |                 srcp[0],
 538 |                 weightp, wdstp[0],
 539 |                 max_weightp, wref,
 540 |                 width, height, stride
 541 |             );
 542 |             break;
 543 |         case ChannelMode::UV:
 544 |             ispc::nlmFinishCh2_f32(
 545 |                 dstp[1], dstp[2],
 546 |                 srcp[1], srcp[2],
 547 |                 weightp, wdstp[0], wdstp[1],
 548 |                 max_weightp, wref,
 549 |                 width, height, stride
 550 |             );
 551 |             break;
 552 |         case ChannelMode::YUV:
 553 |         case ChannelMode::RGB:
 554 |             ispc::nlmFinishCh3_f32(
 555 |                 dstp[0], dstp[1], dstp[2],
 556 |                 srcp[0], srcp[1], srcp[2],
 557 |                 weightp, wdstp[0], wdstp[1], wdstp[2],
 558 |                 max_weightp, wref,
 559 |                 width, height, stride
 560 |             );
 561 |             break;
 562 |     }
 563 | }
 564 | 
 565 | static inline void nlmFinishDispatch_u8(
 566 |     std::array<uint8_t *, 3> dstp,
 567 |     std::array<const uint8_t *, 3> srcp,
 568 |     const float * weightp,
 569 |     std::array<float *, 3> wdstp,
 570 |     const float * max_weightp,
 571 |     float wref,
 572 |     int width,
 573 |     int height,
 574 |     int stride,
 575 |     ChannelMode channels,
 576 |     int peak
 577 | ) noexcept {
 578 | 
 579 |     switch (channels) {
 580 |         case ChannelMode::Y:
 581 |             ispc::nlmFinishCh1_u8(
 582 |                 dstp[0],
 583 |                 srcp[0],
 584 |                 weightp, wdstp[0],
 585 |                 max_weightp, wref,
 586 |                 width, height, stride,
 587 |                 peak
 588 |             );
 589 |             break;
 590 |         case ChannelMode::UV:
 591 |             ispc::nlmFinishCh2_u8(
 592 |                 dstp[1], dstp[2],
 593 |                 srcp[1], srcp[2],
 594 |                 weightp, wdstp[0], wdstp[1],
 595 |                 max_weightp, wref,
 596 |                 width, height, stride,
 597 |                 peak
 598 |             );
 599 |             break;
 600 |         case ChannelMode::YUV:
 601 |         case ChannelMode::RGB:
 602 |             ispc::nlmFinishCh3_u8(
 603 |                 dstp[0], dstp[1], dstp[2],
 604 |                 srcp[0], srcp[1], srcp[2],
 605 |                 weightp, wdstp[0], wdstp[1], wdstp[2],
 606 |                 max_weightp, wref,
 607 |                 width, height, stride,
 608 |                 peak
 609 |             );
 610 |             break;
 611 |     }
 612 | }
 613 | 
 614 | static inline void nlmFinishDispatch_u16(
 615 |     std::array<uint16_t *, 3> dstp,
 616 |     std::array<const uint16_t *, 3> srcp,
 617 |     const float * weightp,
 618 |     std::array<float *, 3> wdstp,
 619 |     const float * max_weightp,
 620 |     float wref,
 621 |     int width,
 622 |     int height,
 623 |     int stride,
 624 |     ChannelMode channels,
 625 |     int peak
 626 | ) noexcept {
 627 | 
 628 |     switch (channels) {
 629 |         case ChannelMode::Y:
 630 |             ispc::nlmFinishCh1_u16(
 631 |                 dstp[0],
 632 |                 srcp[0],
 633 |                 weightp, wdstp[0],
 634 |                 max_weightp, wref,
 635 |                 width, height, stride,
 636 |                 peak
 637 |             );
 638 |             break;
 639 |         case ChannelMode::UV:
 640 |             ispc::nlmFinishCh2_u16(
 641 |                 dstp[1], dstp[2],
 642 |                 srcp[1], srcp[2],
 643 |                 weightp, wdstp[0], wdstp[1],
 644 |                 max_weightp, wref,
 645 |                 width, height, stride,
 646 |                 peak
 647 |             );
 648 |             break;
 649 |         case ChannelMode::YUV:
 650 |         case ChannelMode::RGB:
 651 |             ispc::nlmFinishCh3_u16(
 652 |                 dstp[0], dstp[1], dstp[2],
 653 |                 srcp[0], srcp[1], srcp[2],
 654 |                 weightp, wdstp[0], wdstp[1], wdstp[2],
 655 |                 max_weightp, wref,
 656 |                 width, height, stride,
 657 |                 peak
 658 |             );
 659 |             break;
 660 |     }
 661 | }
 662 | 
 663 | static inline void nlmFinish(
 664 |     std::array<void *, 3> dstp,
 665 |     std::array<const void *, 3> srcp,
 666 |     const float * weightp,
 667 |     std::array<float *, 3> wdstp,
 668 |     const float * max_weightp,
 669 |     float wref,
 670 |     int width,
 671 |     int height,
 672 |     int stride,
 673 |     ChannelMode channels,
 674 |     int bits
 675 | ) noexcept {
 676 | 
 677 |     if (bits == 32) {
 678 |         nlmFinishDispatch_f32(
 679 |             castPtrs<float>(dstp), castPtrs<const float>(srcp),
 680 |             weightp, wdstp, max_weightp, wref, width, height, stride, channels
 681 |         );
 682 |     } else if (bits <= 8) {
 683 |         int peak = (1 << bits) - 1;
 684 |         nlmFinishDispatch_u8(
 685 |             castPtrs<uint8_t>(dstp), castPtrs<const uint8_t>(srcp),
 686 |             weightp, wdstp, max_weightp, wref, width, height, stride, channels, peak
 687 |         );
 688 |     } else if (bits <= 16) {
 689 |         int peak = (1 << bits) - 1;
 690 |         nlmFinishDispatch_u16(
 691 |             castPtrs<uint16_t>(dstp), castPtrs<const uint16_t>(srcp),
 692 |             weightp, wdstp, max_weightp, wref, width, height, stride, channels, peak
 693 |         );
 694 |     } else {
 695 |         assert(false);
 696 |     }
 697 | }
 698 | 
 699 | static const VSFrameRef *VS_CC nlmGetFrame(
 700 |     int n,
 701 |     int activationReason,
 702 |     void ** instanceData,
 703 |     void ** frameData,
 704 |     VSFrameContext * frameCtx,
 705 |     VSCore * core,
 706 |     const VSAPI * vsapi
 707 | ) noexcept {
 708 | 
 709 |     auto * d = reinterpret_cast<NLMData *>(*instanceData);
 710 | 
 711 |     if (activationReason == arInitial) {
 712 |         int start = std::max(0, n - d->d);
 713 |         int end = std::min(n + d->d, d->vi->numFrames - 1);
 714 |         for (int i = start; i <= end; i++) {
 715 |             vsapi->requestFrameFilter(i, d->node, frameCtx);
 716 |             if (d->ref_node) {
 717 |                 vsapi->requestFrameFilter(i, d->ref_node, frameCtx);
 718 |             }
 719 |         }
 720 |         return nullptr;
 721 |     } else if (activationReason != arAllFramesReady) {
 722 |         return nullptr;
 723 |     }
 724 | 
 725 |     // activationReason == arAllFramesReady
 726 | 
 727 |     int nlm_d = d->d;
 728 |     int nlm_a = d->a;
 729 |     int nlm_s = d->s;
 730 |     float nlm_h2_inv_norm = square(255.0f) / (3.0f * square(d->h) * square(2 * nlm_s + 1));
 731 |     float nlm_wref = d->wref;
 732 |     ChannelMode channels = d->channels;
 733 | 
 734 |     const auto & ref_node = d->ref_node ? d->ref_node : d->node;
 735 |     auto ref_frame = vsapi->getFrameFilter(n, ref_node, frameCtx);
 736 | 
 737 |     int bits = d->vi->format->bitsPerSample;
 738 |     int width, height, stride; // dimensions of the plane to be processed, not the video dimension
 739 |     if (channels == ChannelMode::UV) {
 740 |         width = d->vi->width >> d->vi->format->subSamplingW;
 741 |         height = d->vi->height >> d->vi->format->subSamplingH;
 742 |         stride = vsapi->getStride(ref_frame, 1) / d->vi->format->bytesPerSample;
 743 |     } else {
 744 |         width = d->vi->width;
 745 |         height = d->vi->height;
 746 |         stride = vsapi->getStride(ref_frame, 0) / d->vi->format->bytesPerSample;
 747 |     }
 748 | 
 749 |     int size = height * stride; // size of each plane in quad-bytes
 750 |     // number of input channels
 751 |     int num_input_channels = [channels]() {
 752 |         if (channels == ChannelMode::Y) {
 753 |             return 1;
 754 |         } else if (channels == ChannelMode::UV) {
 755 |             return 2;
 756 |         } else {
 757 |             // channels == ChannelMode::YUV || channels == ChannelMode::RGB
 758 |             return 3;
 759 |         }
 760 |     }();
 761 |     // size in quad-bytes: size * (4 + num_input_channels + (nlm_d != 0)) + width
 762 |     float * workspace;
 763 |     {
 764 |         auto thread_id = std::this_thread::get_id();
 765 |         d->workspaces_lock.lock_shared();
 766 |         bool init = true;
 767 |         try {
 768 |             const auto & const_workspaces = d->workspaces;
 769 |             workspace = const_workspaces.at(thread_id);
 770 |         } catch (const std::out_of_range &) {
 771 |             init = false;
 772 |         }
 773 |         d->workspaces_lock.unlock_shared();
 774 | 
 775 |         if (!init) {
 776 |             auto workspace_size = size * (4 + num_input_channels + (nlm_d != 0)) + width;
 777 |             auto workspace_bytes = workspace_size * sizeof(float);
 778 |             workspace = vs_aligned_malloc<float>(workspace_bytes, 256);
 779 | 
 780 |             if (!workspace) {
 781 |                 vsapi->freeFrame(ref_frame);
 782 |                 vsapi->setFilterError("nlm_ispc: malloc() failed", frameCtx);
 783 |                 return nullptr;
 784 |             }
 785 | 
 786 |             std::lock_guard _ { d->workspaces_lock };
 787 |             d->workspaces.emplace(thread_id, workspace);
 788 |         }
 789 |     }
 790 | 
 791 |     // zero-initialize aggregation buffers
 792 |     std::memset(workspace, 0, (1 + num_input_channels) * size * sizeof(float));
 793 |     // stores the sum of weights of each pixel
 794 |     float * weightp = workspace;
 795 |     std::array<float *, 3> wdstp {
 796 |         // stores the weighted sum of pixel values of the first processed plane
 797 |         workspace + size,
 798 |         // stores the weighted sum of pixel values of the second processed plane
 799 |         num_input_channels <= 1 ? nullptr : workspace + 2 * size,
 800 |         // stores the weighted sum of pixel values of the third processed plane
 801 |         num_input_channels <= 2 ? nullptr : workspace + 3 * size
 802 |     };
 803 | 
 804 |     // stores the maximum weight encountered of each pixel
 805 |     float * max_weightp = workspace + (1 + num_input_channels) * size;
 806 |     for (int i = 0; i < size; i++) {
 807 |         max_weightp[i] = std::numeric_limits<float>::epsilon();
 808 |     }
 809 | 
 810 |     // temporary storage for the calculation of patch distances
 811 |     float * temp = workspace + (2 + num_input_channels) * size;
 812 |     float * temp_bwd = workspace + (3 + num_input_channels) * size;
 813 |     float * temp_fwd = nlm_d == 0 ? nullptr : workspace + (4 + num_input_channels) * size;
 814 | 
 815 |     // buffer for the vertical box filter during patch distance calculation
 816 |     // size in quad-bytes: width
 817 |     float * buffer = workspace + (4 + num_input_channels + (nlm_d != 0)) * size;
 818 | 
 819 |     std::array refp { getPtrs(ref_frame, channels, vsapi) };
 820 | 
 821 |     for (int i = -nlm_d; i <= 0; i++) {
 822 |         auto bwd_n = std::max(n + i, 0);
 823 |         auto fwd_n = std::min(n - i, d->vi->numFrames - 1);
 824 |         auto src_frame_bwd = vsapi->getFrameFilter(bwd_n, d->node, frameCtx);
 825 |         auto src_frame_fwd = vsapi->getFrameFilter(fwd_n, d->node, frameCtx);
 826 |         auto ref_frame_bwd = vsapi->getFrameFilter(bwd_n, ref_node, frameCtx);
 827 |         auto ref_frame_fwd = vsapi->getFrameFilter(fwd_n, ref_node, frameCtx);
 828 | 
 829 |         std::array srcp_bwd { getPtrs(src_frame_bwd, channels, vsapi) };
 830 |         std::array srcp_fwd { getPtrs(src_frame_fwd, channels, vsapi) };
 831 |         std::array refp_bwd { getPtrs(ref_frame_bwd, channels, vsapi) };
 832 |         std::array refp_fwd { getPtrs(ref_frame_fwd, channels, vsapi) };
 833 | 
 834 |         for (int offset_y = -nlm_a; offset_y <= nlm_a; offset_y++) {
 835 |             for (int offset_x = -nlm_a; offset_x <= nlm_a; offset_x++) {
 836 |                 if (i * square(2 * nlm_a + 1) + offset_y * (2 * nlm_a + 1) + offset_x >= 0) {
 837 |                     continue;
 838 |                 }
 839 | 
 840 |                 nlmDistance(
 841 |                     temp_bwd,
 842 |                     refp, refp_bwd,
 843 |                     offset_x, offset_y, width, height, stride, channels, bits
 844 |                 );
 845 | 
 846 |                 ispc::nlmHorizontal(
 847 |                     temp,
 848 |                     temp_bwd,
 849 |                     nlm_s, width, height, stride
 850 |                 );
 851 | 
 852 |                 d->nlm_vertical(
 853 |                     temp_bwd,
 854 |                     temp,
 855 |                     nlm_s, nlm_h2_inv_norm, width, height, stride, buffer
 856 |                 );
 857 | 
 858 |                 // jump at the end of this basic block
 859 |                 if (i == 0) {
 860 |                     // bwd == fwd
 861 |                     nlmAccumulation(
 862 |                         weightp, wdstp, max_weightp,
 863 |                         srcp_bwd, srcp_bwd, temp_bwd, temp_bwd,
 864 |                         offset_x, offset_y, width, height, stride, channels, bits
 865 |                     );
 866 |                     continue;
 867 |                 }
 868 | 
 869 |                 // i != 0
 870 |                 nlmDistance(
 871 |                     temp_fwd,
 872 |                     refp_fwd, refp,
 873 |                     offset_x, offset_y, width, height, stride, channels, bits
 874 |                 );
 875 | 
 876 |                 ispc::nlmHorizontal(
 877 |                     temp,
 878 |                     temp_fwd,
 879 |                     nlm_s, width, height, stride
 880 |                 );
 881 | 
 882 |                 d->nlm_vertical(
 883 |                     temp_fwd,
 884 |                     temp,
 885 |                     nlm_s, nlm_h2_inv_norm, width, height, stride, buffer
 886 |                 );
 887 | 
 888 |                 nlmAccumulation(
 889 |                     weightp, wdstp, max_weightp,
 890 |                     srcp_bwd, srcp_fwd, temp_bwd, temp_fwd,
 891 |                     offset_x, offset_y, width, height, stride, channels, bits
 892 |                 );
 893 |             }
 894 |         }
 895 | 
 896 |         vsapi->freeFrame(src_frame_fwd);
 897 |         vsapi->freeFrame(src_frame_bwd);
 898 |         vsapi->freeFrame(ref_frame_fwd);
 899 |         vsapi->freeFrame(ref_frame_bwd);
 900 |     }
 901 | 
 902 |     vsapi->freeFrame(ref_frame);
 903 | 
 904 |     auto src_frame = vsapi->getFrameFilter(n, d->node, frameCtx);
 905 |     std::array srcp { getPtrs(src_frame, channels, vsapi) };
 906 | 
 907 |     VSFrameRef * dst_frame;
 908 |     if (channels == ChannelMode::Y && d->vi->format->numPlanes > 1) {
 909 |         const VSFrameRef * fr[3] { nullptr, src_frame, src_frame };
 910 |         constexpr int pl[3] { 0, 1, 2 };
 911 |         dst_frame = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src_frame, core);
 912 |     } else if (channels == ChannelMode::UV && d->vi->format->numPlanes > 1) {
 913 |         const VSFrameRef * fr[3] { src_frame, nullptr, nullptr };
 914 |         constexpr int pl[3] { 0, 1, 2 };
 915 |         dst_frame = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src_frame, core);
 916 |     } else {
 917 |         dst_frame = vsapi->newVideoFrame(d->vi->format, d->vi->width, d->vi->height, src_frame, core);
 918 |     }
 919 |     std::array dstp { getPtrs(dst_frame, channels, vsapi) };
 920 | 
 921 |     nlmFinish(dstp, srcp, weightp, wdstp, max_weightp, nlm_wref, width, height, stride, channels, bits);
 922 | 
 923 |     vsapi->freeFrame(src_frame);
 924 | 
 925 |     return dst_frame;
 926 | }
 927 | 
 928 | static void VS_CC nlmFree(
 929 |     void * instanceData,
 930 |     VSCore * core,
 931 |     const VSAPI * vsapi
 932 | ) noexcept {
 933 | 
 934 |     auto * d = reinterpret_cast<NLMData *>(instanceData);
 935 | 
 936 |     vsapi->freeNode(d->node);
 937 |     if (d->ref_node) {
 938 |         vsapi->freeNode(d->ref_node);
 939 |     }
 940 | 
 941 |     for (const auto & [_, ptr] : d->workspaces) {
 942 |         vs_aligned_free(ptr);
 943 |     }
 944 | 
 945 |     delete d;
 946 | }
 947 | 
 948 | static void VS_CC nlmCreate(
 949 |     const VSMap * in,
 950 |     VSMap * out,
 951 |     void * userData,
 952 |     VSCore * core,
 953 |     const VSAPI * vsapi
 954 | ) noexcept {
 955 | 
 956 |     auto d = std::make_unique<NLMData>();
 957 | 
 958 |     d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
 959 |     d->vi = vsapi->getVideoInfo(d->node);
 960 | 
 961 |     auto set_error = [vsapi, out, &d](const char * error_message) -> void {
 962 |         vsapi->setError(out, error_message);
 963 |         vsapi->freeNode(d->node);
 964 |     };
 965 | 
 966 |     if ((d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
 967 |         (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32)
 968 |      ) {
 969 |         return set_error("only 1-16 bit integer or 32-bit float supported");
 970 |     }
 971 | 
 972 |     int err;
 973 | 
 974 |     d->d = int64ToIntS(vsapi->propGetInt(in, "d", 0, &err));
 975 |     if (err) {
 976 |         d->d = 1;
 977 |     }
 978 |     if (d->d < 0) {
 979 |         return set_error("\"d\" must be non-negative");
 980 |     }
 981 | 
 982 |     d->a = int64ToIntS(vsapi->propGetInt(in, "a", 0, &err));
 983 |     if (err) {
 984 |         d->a = 2;
 985 |     }
 986 |     if (d->a <= 0) {
 987 |         return set_error("\"a\" must be positive");
 988 |     }
 989 | 
 990 |     d->s = int64ToIntS(vsapi->propGetInt(in, "s", 0, &err));
 991 |     if (err) {
 992 |         d->s = 4;
 993 |     }
 994 |     if (d->s < 0) {
 995 |         return set_error("\"s\" must be non-negative");
 996 |     }
 997 | 
 998 |     d->h = static_cast<float>(vsapi->propGetFloat(in, "h", 0, &err));
 999 |     if (err) {
1000 |         d->h = 1.2f;
1001 |     }
1002 |     if (d->h <= 0.0f) {
1003 |         return set_error("\"h\" must be positive");
1004 |     }
1005 | 
1006 |     auto wmode = vsapi->propGetInt(in, "wmode", 0, &err);
1007 |     if (err) {
1008 |         wmode = 0;
1009 |     }
1010 |     if (wmode < 0 || wmode > 3) {
1011 |         return set_error("\"wmode\" must be 0, 1, 2 or 3");
1012 |     }
1013 |     decltype(d->nlm_vertical) nlmVerticalKernels[] {
1014 |         &ispc::nlmVerticalWelsch,
1015 |         &ispc::nlmVerticalBisquareA,
1016 |         &ispc::nlmVerticalBisquareB,
1017 |         &ispc::nlmVerticalBisquareC
1018 |     };
1019 |     d->nlm_vertical = nlmVerticalKernels[wmode];
1020 | 
1021 |     auto channels = vsapi->propGetData(in, "channels", 0, &err);
1022 |     if (err) {
1023 |         channels = "AUTO";
1024 |     }
1025 |     auto channels_len = std::strlen(channels);
1026 |     if (channels_len == 1 && *channels == 'Y') {
1027 |         d->channels = ChannelMode::Y;
1028 |     } else if (channels_len == 2 && std::strncmp(channels, "UV", 2) == 0) {
1029 |         d->channels = ChannelMode::UV;
1030 |     } else if (channels_len == 3 && std::strncmp(channels, "YUV", 3) == 0) {
1031 |         d->channels = ChannelMode::YUV;
1032 |     } else if (channels_len == 3 && std::strncmp(channels, "RGB", 3) == 0) {
1033 |         d->channels = ChannelMode::RGB;
1034 |     } else if (channels_len == 4 && std::strncmp(channels, "AUTO", 4) == 0) {
1035 |         if (d->vi->format->colorFamily == cmRGB) {
1036 |             d->channels = ChannelMode::RGB;
1037 |         } else {
1038 |             d->channels = ChannelMode::Y;
1039 |         }
1040 |     } else {
1041 |         return set_error("\"channels\" must be \"Y\", \"UV\', \"YUV\", \"RGB\" or \"AUTO\"");
1042 |     }
1043 | 
1044 |     if (d->channels == ChannelMode::Y) {
1045 |         if (d->vi->format->colorFamily != cmGray && d->vi->format->colorFamily != cmYUV) {
1046 |             return set_error("color family must be Gray or YUV for \"channels\" == \"Y\"");
1047 |         }
1048 |     } else if (d->channels == ChannelMode::UV) {
1049 |         if (d->vi->format->colorFamily != cmYUV) {
1050 |             return set_error("color family must be YUV for \"channels\" == \"UV\"");
1051 |         }
1052 |     } else if (d->channels == ChannelMode::YUV) {
1053 |         if (d->vi->format->colorFamily != cmYUV || d->vi->format->subSamplingW || d->vi->format->subSamplingH) {
1054 |             return set_error("color family must be YUV444 for \"channels\" == \"YUV\"");
1055 |         }
1056 |     } else if (d->channels == ChannelMode::RGB) {
1057 |         if (d->vi->format->colorFamily != cmRGB) {
1058 |             return set_error("color family must be RGB for \"channels\" == \"RGB\"");
1059 |         }
1060 |     }
1061 | 
1062 |     d->wref = static_cast<float>(vsapi->propGetFloat(in, "wref", 0, &err));
1063 |     if (err) {
1064 |         d->wref = 1.0f;
1065 |     }
1066 | 
1067 |     d->ref_node = vsapi->propGetNode(in, "rclip", 0, &err);
1068 |     if (err) {
1069 |         d->ref_node = nullptr;
1070 |     }
1071 |     if (d->ref_node) {
1072 |         const auto ref_vi = vsapi->getVideoInfo(d->ref_node);
1073 |         if (!isSameFormat(d->vi, ref_vi) || d->vi->numFrames != ref_vi->numFrames) {
1074 |             vsapi->freeNode(d->ref_node);
1075 |             return set_error("\"rclip\" must be of the same format as \"clip\"");
1076 |         }
1077 |     }
1078 | 
1079 |     VSCoreInfo core_info;
1080 |     vsapi->getCoreInfo2(core, &core_info);
1081 |     d->workspaces.reserve(core_info.numThreads);
1082 | 
1083 |     vsapi->createFilter(
1084 |         in, out,
1085 |         "NLMeans", nlmInit, nlmGetFrame, nlmFree,
1086 |         fmParallel, 0, d.release(), core
1087 |     );
1088 | }
1089 | 
1090 | VS_EXTERNAL_API(void) VapourSynthPluginInit(
1091 |     VSConfigPlugin configFunc,
1092 |     VSRegisterFunction registerFunc,
1093 |     VSPlugin * plugin
1094 | ) noexcept {
1095 | 
1096 |     configFunc(
1097 |         "io.github.amusementclub.vs-nlm-ispc",
1098 |         "nlm_ispc",
1099 |         "Non-local means denoise filter implemented in ISPC",
1100 |         VAPOURSYNTH_API_VERSION, 1, plugin
1101 |     );
1102 | 
1103 |     registerFunc(
1104 |         "NLMeans",
1105 |         "clip:clip;"
1106 |         "d:int:opt;"
1107 |         "a:int:opt;"
1108 |         "s:int:opt;"
1109 |         "h:float:opt;"
1110 |         "channels:data:opt;"
1111 |         "wmode:int:opt;"
1112 |         "wref:float:opt;"
1113 |         "rclip:clip:opt;",
1114 |         nlmCreate,
1115 |         nullptr, plugin
1116 |     );
1117 | 
1118 |     auto getVersion = [](const VSMap *, VSMap * out, void *, VSCore *, const VSAPI *vsapi) {
1119 |         vsapi->propSetData(out, "version", VERSION, -1, paReplace);
1120 |     };
1121 |     registerFunc("Version", "", getVersion, nullptr, plugin);
1122 | }
1123 | 


--------------------------------------------------------------------------------
/source/nlm.ispc:
--------------------------------------------------------------------------------
   1 | // based on KNLMeansCL by Khanattila and vs-boxblur
   2 | 
   3 | #define CLAMPX(x) clamp(x, 0, width - 1)
   4 | #define CLAMPY(y) clamp(y, 0, height - 1)
   5 | 
   6 | static inline uniform float square(uniform float x) {
   7 |     return x * x;
   8 | }
   9 | 
  10 | static inline float square(float x) {
  11 |     return x * x;
  12 | }
  13 | 
  14 | export void nlmDistanceLuma_f32(
  15 |     uniform float temp0[], // shape: (height, stride)
  16 |     uniform const float centerp[], // shape: (height, stride)
  17 |     uniform const float neighborp[], // shape: (height, stride)
  18 |     uniform int offset_x,
  19 |     uniform int offset_y,
  20 |     uniform int width,
  21 |     uniform int height,
  22 |     uniform int stride
  23 | ) {
  24 | 
  25 |     uniform int start_x = abs(offset_x);
  26 |     uniform int end_x = width - abs(offset_x);
  27 | 
  28 |     for (uniform int y = 0; y < height ;y++) {
  29 |         for (uniform int x = 0; x < start_x; x++) {
  30 |             uniform int idx = y * stride + x;
  31 |             uniform float u1 = centerp[idx];
  32 | 
  33 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
  34 |             uniform float u1_pq = neighborp[neighbor_idx];
  35 | 
  36 |             temp0[idx] = 3.0f * square(u1 - u1_pq);
  37 |         }
  38 | 
  39 |         foreach (x = start_x ... end_x) {
  40 |             int idx = y * stride + x;
  41 |             float u1 = centerp[idx];
  42 | 
  43 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
  44 |             float u1_pq = neighborp[neighbor_idx];
  45 | 
  46 |             temp0[idx] = 3.0f * square(u1 - u1_pq);
  47 |         }
  48 | 
  49 |         for (uniform int x = end_x; x < width; x++) {
  50 |             uniform int idx = y * stride + x;
  51 |             uniform float u1 = centerp[idx];
  52 | 
  53 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
  54 |             uniform float u1_pq = neighborp[neighbor_idx];
  55 | 
  56 |             temp0[idx] = 3.0f * square(u1 - u1_pq);
  57 |         }
  58 |     }
  59 | }
  60 | 
  61 | export void nlmDistanceLuma_u8(
  62 |     uniform float temp0[], // shape: (height, stride)
  63 |     uniform const unsigned int8 centerp[], // shape: (height, stride)
  64 |     uniform const unsigned int8 neighborp[], // shape: (height, stride)
  65 |     uniform int offset_x,
  66 |     uniform int offset_y,
  67 |     uniform int width,
  68 |     uniform int height,
  69 |     uniform int stride,
  70 |     uniform float inv_divisor
  71 | ) {
  72 | 
  73 |     uniform int start_x = abs(offset_x);
  74 |     uniform int end_x = width - abs(offset_x);
  75 | 
  76 |     uniform float sq_inv_divisor = square(inv_divisor);
  77 | 
  78 |     for (uniform int y = 0; y < height ;y++) {
  79 |         for (uniform int x = 0; x < start_x; x++) {
  80 |             uniform int idx = y * stride + x;
  81 |             uniform float u1 = centerp[idx];
  82 | 
  83 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
  84 |             uniform float u1_pq = neighborp[neighbor_idx];
  85 | 
  86 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
  87 |         }
  88 | 
  89 |         foreach (x = start_x ... end_x) {
  90 |             int idx = y * stride + x;
  91 |             float u1 = centerp[idx];
  92 | 
  93 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
  94 |             float u1_pq = neighborp[neighbor_idx];
  95 | 
  96 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
  97 |         }
  98 | 
  99 |         for (uniform int x = end_x; x < width; x++) {
 100 |             uniform int idx = y * stride + x;
 101 |             uniform float u1 = centerp[idx];
 102 | 
 103 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 104 |             uniform float u1_pq = neighborp[neighbor_idx];
 105 | 
 106 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
 107 |         }
 108 |     }
 109 | }
 110 | 
 111 | export void nlmDistanceLuma_u16(
 112 |     uniform float temp0[], // shape: (height, stride)
 113 |     uniform const unsigned int16 centerp[], // shape: (height, stride)
 114 |     uniform const unsigned int16 neighborp[], // shape: (height, stride)
 115 |     uniform int offset_x,
 116 |     uniform int offset_y,
 117 |     uniform int width,
 118 |     uniform int height,
 119 |     uniform int stride,
 120 |     uniform float inv_divisor
 121 | ) {
 122 | 
 123 |     uniform int start_x = abs(offset_x);
 124 |     uniform int end_x = width - abs(offset_x);
 125 | 
 126 |     uniform float sq_inv_divisor = square(inv_divisor);
 127 | 
 128 |     for (uniform int y = 0; y < height ;y++) {
 129 |         for (uniform int x = 0; x < start_x; x++) {
 130 |             uniform int idx = y * stride + x;
 131 |             uniform float u1 = centerp[idx];
 132 | 
 133 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 134 |             uniform float u1_pq = neighborp[neighbor_idx];
 135 | 
 136 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
 137 |         }
 138 | 
 139 |         foreach (x = start_x ... end_x) {
 140 |             int idx = y * stride + x;
 141 |             float u1 = centerp[idx];
 142 | 
 143 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 144 |             float u1_pq = neighborp[neighbor_idx];
 145 | 
 146 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
 147 |         }
 148 | 
 149 |         for (uniform int x = end_x; x < width; x++) {
 150 |             uniform int idx = y * stride + x;
 151 |             uniform float u1 = centerp[idx];
 152 | 
 153 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 154 |             uniform float u1_pq = neighborp[neighbor_idx];
 155 | 
 156 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
 157 |         }
 158 |     }
 159 | }
 160 | 
 161 | export void nlmDistanceChroma_f32(
 162 |     uniform float temp0[], // shape: (height, stride)
 163 |     uniform const float centerp1[], // shape: (height, stride)
 164 |     uniform const float centerp2[], // shape: (height, stride)
 165 |     uniform const float neighborp1[], // shape: (height, stride)
 166 |     uniform const float neighborp2[], // shape: (height, stride)
 167 |     uniform int offset_x,
 168 |     uniform int offset_y,
 169 |     uniform int width,
 170 |     uniform int height,
 171 |     uniform int stride
 172 | ) {
 173 | 
 174 |     uniform int start_x = abs(offset_x);
 175 |     uniform int end_x = width - abs(offset_x);
 176 | 
 177 |     for (uniform int y = 0; y < height; y++) {
 178 |         for (uniform int x = 0; x < start_x; x++) {
 179 |             uniform int idx = y * stride + x;
 180 |             uniform float u1_1 = centerp1[idx];
 181 |             uniform float u1_2 = centerp2[idx];
 182 | 
 183 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 184 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 185 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 186 | 
 187 |             uniform float dst = 1.5f * (
 188 |                 square(u1_1 - u1_pq_1) +
 189 |                 square(u1_2 - u1_pq_2)
 190 |             );
 191 | 
 192 |             temp0[idx] = dst;
 193 |         }
 194 | 
 195 |         foreach (x = start_x ... end_x) {
 196 |             int idx = y * stride + x;
 197 |             float u1_1 = centerp1[idx];
 198 |             float u1_2 = centerp2[idx];
 199 | 
 200 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 201 |             float u1_pq_1 = neighborp1[neighbor_idx];
 202 |             float u1_pq_2 = neighborp2[neighbor_idx];
 203 | 
 204 |             float dst = 1.5f * (
 205 |                 square(u1_1 - u1_pq_1) +
 206 |                 square(u1_2 - u1_pq_2)
 207 |             );
 208 | 
 209 |             temp0[idx] = dst;
 210 |         }
 211 | 
 212 |         for (uniform int x = end_x; x < width; x++) {
 213 |             uniform int idx = y * stride + x;
 214 |             uniform float u1_1 = centerp1[idx];
 215 |             uniform float u1_2 = centerp2[idx];
 216 | 
 217 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 218 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 219 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 220 | 
 221 |             uniform float dst = 1.5f * (
 222 |                 square(u1_1 - u1_pq_1) +
 223 |                 square(u1_2 - u1_pq_2)
 224 |             );
 225 | 
 226 |             temp0[idx] = dst;
 227 |         }
 228 |     }
 229 | }
 230 | 
 231 | export void nlmDistanceChroma_u8(
 232 |     uniform float temp0[], // shape: (height, stride)
 233 |     uniform const unsigned int8 centerp1[], // shape: (height, stride)
 234 |     uniform const unsigned int8 centerp2[], // shape: (height, stride)
 235 |     uniform const unsigned int8 neighborp1[], // shape: (height, stride)
 236 |     uniform const unsigned int8 neighborp2[], // shape: (height, stride)
 237 |     uniform int offset_x,
 238 |     uniform int offset_y,
 239 |     uniform int width,
 240 |     uniform int height,
 241 |     uniform int stride,
 242 |     uniform float inv_divisor
 243 | ) {
 244 | 
 245 |     uniform int start_x = abs(offset_x);
 246 |     uniform int end_x = width - abs(offset_x);
 247 | 
 248 |     uniform float sq_inv_divisor = square(inv_divisor);
 249 | 
 250 |     for (uniform int y = 0; y < height; y++) {
 251 |         for (uniform int x = 0; x < start_x; x++) {
 252 |             uniform int idx = y * stride + x;
 253 |             uniform float u1_1 = centerp1[idx];
 254 |             uniform float u1_2 = centerp2[idx];
 255 | 
 256 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 257 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 258 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 259 | 
 260 |             uniform float dst = 1.5f * (
 261 |                 square(u1_1 - u1_pq_1) +
 262 |                 square(u1_2 - u1_pq_2)
 263 |             );
 264 | 
 265 |             temp0[idx] = dst * sq_inv_divisor;
 266 |         }
 267 | 
 268 |         foreach (x = start_x ... end_x) {
 269 |             int idx = y * stride + x;
 270 |             float u1_1 = centerp1[idx];
 271 |             float u1_2 = centerp2[idx];
 272 | 
 273 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 274 |             float u1_pq_1 = neighborp1[neighbor_idx];
 275 |             float u1_pq_2 = neighborp2[neighbor_idx];
 276 | 
 277 |             float dst = 1.5f * (
 278 |                 square(u1_1 - u1_pq_1) +
 279 |                 square(u1_2 - u1_pq_2)
 280 |             );
 281 | 
 282 |             temp0[idx] = dst * sq_inv_divisor;
 283 |         }
 284 | 
 285 |         for (uniform int x = end_x; x < width; x++) {
 286 |             uniform int idx = y * stride + x;
 287 |             uniform float u1_1 = centerp1[idx];
 288 |             uniform float u1_2 = centerp2[idx];
 289 | 
 290 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 291 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 292 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 293 | 
 294 |             uniform float dst = 1.5f * (
 295 |                 square(u1_1 - u1_pq_1) +
 296 |                 square(u1_2 - u1_pq_2)
 297 |             );
 298 | 
 299 |             temp0[idx] = dst * sq_inv_divisor;
 300 |         }
 301 |     }
 302 | }
 303 | 
 304 | export void nlmDistanceChroma_u16(
 305 |     uniform float temp0[], // shape: (height, stride)
 306 |     uniform const unsigned int16 centerp1[], // shape: (height, stride)
 307 |     uniform const unsigned int16 centerp2[], // shape: (height, stride)
 308 |     uniform const unsigned int16 neighborp1[], // shape: (height, stride)
 309 |     uniform const unsigned int16 neighborp2[], // shape: (height, stride)
 310 |     uniform int offset_x,
 311 |     uniform int offset_y,
 312 |     uniform int width,
 313 |     uniform int height,
 314 |     uniform int stride,
 315 |     uniform float inv_divisor
 316 | ) {
 317 | 
 318 |     uniform int start_x = abs(offset_x);
 319 |     uniform int end_x = width - abs(offset_x);
 320 | 
 321 |     uniform float sq_inv_divisor = square(inv_divisor);
 322 | 
 323 |     for (uniform int y = 0; y < height; y++) {
 324 |         for (uniform int x = 0; x < start_x; x++) {
 325 |             uniform int idx = y * stride + x;
 326 |             uniform float u1_1 = centerp1[idx];
 327 |             uniform float u1_2 = centerp2[idx];
 328 | 
 329 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 330 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 331 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 332 | 
 333 |             uniform float dst = 1.5f * (
 334 |                 square(u1_1 - u1_pq_1) +
 335 |                 square(u1_2 - u1_pq_2)
 336 |             );
 337 | 
 338 |             temp0[idx] = dst * sq_inv_divisor;
 339 |         }
 340 | 
 341 |         foreach (x = start_x ... end_x) {
 342 |             int idx = y * stride + x;
 343 |             float u1_1 = centerp1[idx];
 344 |             float u1_2 = centerp2[idx];
 345 | 
 346 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 347 |             float u1_pq_1 = neighborp1[neighbor_idx];
 348 |             float u1_pq_2 = neighborp2[neighbor_idx];
 349 | 
 350 |             float dst = 1.5f * (
 351 |                 square(u1_1 - u1_pq_1) +
 352 |                 square(u1_2 - u1_pq_2)
 353 |             );
 354 | 
 355 |             temp0[idx] = dst * sq_inv_divisor;
 356 |         }
 357 | 
 358 |         for (uniform int x = end_x; x < width; x++) {
 359 |             uniform int idx = y * stride + x;
 360 |             uniform float u1_1 = centerp1[idx];
 361 |             uniform float u1_2 = centerp2[idx];
 362 | 
 363 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 364 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 365 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 366 | 
 367 |             uniform float dst = 1.5f * (
 368 |                 square(u1_1 - u1_pq_1) +
 369 |                 square(u1_2 - u1_pq_2)
 370 |             );
 371 | 
 372 |             temp0[idx] = dst * sq_inv_divisor;
 373 |         }
 374 |     }
 375 | }
 376 | 
 377 | export void nlmDistanceYUV_f32(
 378 |     uniform float temp0[], // shape: (height, stride)
 379 |     uniform const float centerp1[], // shape: (height, stride)
 380 |     uniform const float centerp2[], // shape: (height, stride)
 381 |     uniform const float centerp3[], // shape: (height, stride)
 382 |     uniform const float neighborp1[], // shape: (height, stride)
 383 |     uniform const float neighborp2[], // shape: (height, stride)
 384 |     uniform const float neighborp3[], // shape: (height, stride)
 385 |     uniform int offset_x,
 386 |     uniform int offset_y,
 387 |     uniform int width,
 388 |     uniform int height,
 389 |     uniform int stride
 390 | ) {
 391 | 
 392 |     uniform int start_x = abs(offset_x);
 393 |     uniform int end_x = width - abs(offset_x);
 394 | 
 395 |     for (uniform int y = 0; y < height; y++) {
 396 |         for (uniform int x = 0; x < start_x; x++) {
 397 |             uniform int idx = y * stride + x;
 398 |             uniform float u1_1 = centerp1[idx];
 399 |             uniform float u1_2 = centerp2[idx];
 400 |             uniform float u1_3 = centerp3[idx];
 401 | 
 402 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 403 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 404 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 405 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 406 | 
 407 |             uniform float dst = (
 408 |                 square(u1_1 - u1_pq_1) +
 409 |                 square(u1_2 - u1_pq_2) +
 410 |                 square(u1_3 - u1_pq_3)
 411 |             );
 412 | 
 413 |             temp0[idx] = dst;
 414 |         }
 415 | 
 416 |         foreach (x = start_x ... end_x) {
 417 |             int idx = y * stride + x;
 418 |             float u1_1 = centerp1[idx];
 419 |             float u1_2 = centerp2[idx];
 420 |             float u1_3 = centerp3[idx];
 421 | 
 422 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 423 |             float u1_pq_1 = neighborp1[neighbor_idx];
 424 |             float u1_pq_2 = neighborp2[neighbor_idx];
 425 |             float u1_pq_3 = neighborp3[neighbor_idx];
 426 | 
 427 |             float dst = (
 428 |                 square(u1_1 - u1_pq_1) +
 429 |                 square(u1_2 - u1_pq_2) +
 430 |                 square(u1_3 - u1_pq_3)
 431 |             );
 432 | 
 433 |             temp0[idx] = dst;
 434 |         }
 435 | 
 436 |         for (uniform int x = end_x; x < width; x++) {
 437 |             uniform int idx = y * stride + x;
 438 |             uniform float u1_1 = centerp1[idx];
 439 |             uniform float u1_2 = centerp2[idx];
 440 |             uniform float u1_3 = centerp3[idx];
 441 | 
 442 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 443 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 444 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 445 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 446 | 
 447 |             uniform float dst = (
 448 |                 square(u1_1 - u1_pq_1) +
 449 |                 square(u1_2 - u1_pq_2) +
 450 |                 square(u1_3 - u1_pq_3)
 451 |             );
 452 | 
 453 |             temp0[idx] = dst;
 454 |         }
 455 |     }
 456 | }
 457 | 
 458 | export void nlmDistanceYUV_u8(
 459 |     uniform float temp0[], // shape: (height, stride)
 460 |     uniform const unsigned int8 centerp1[], // shape: (height, stride)
 461 |     uniform const unsigned int8 centerp2[], // shape: (height, stride)
 462 |     uniform const unsigned int8 centerp3[], // shape: (height, stride)
 463 |     uniform const unsigned int8 neighborp1[], // shape: (height, stride)
 464 |     uniform const unsigned int8 neighborp2[], // shape: (height, stride)
 465 |     uniform const unsigned int8 neighborp3[], // shape: (height, stride)
 466 |     uniform int offset_x,
 467 |     uniform int offset_y,
 468 |     uniform int width,
 469 |     uniform int height,
 470 |     uniform int stride,
 471 |     uniform float inv_divisor
 472 | ) {
 473 | 
 474 |     uniform int start_x = abs(offset_x);
 475 |     uniform int end_x = width - abs(offset_x);
 476 | 
 477 |     uniform float sq_inv_divisor = square(inv_divisor);
 478 | 
 479 |     for (uniform int y = 0; y < height; y++) {
 480 |         for (uniform int x = 0; x < start_x; x++) {
 481 |             uniform int idx = y * stride + x;
 482 |             uniform float u1_1 = centerp1[idx];
 483 |             uniform float u1_2 = centerp2[idx];
 484 |             uniform float u1_3 = centerp3[idx];
 485 | 
 486 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 487 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 488 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 489 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 490 | 
 491 |             uniform float dst = (
 492 |                 square(u1_1 - u1_pq_1) +
 493 |                 square(u1_2 - u1_pq_2) +
 494 |                 square(u1_3 - u1_pq_3)
 495 |             );
 496 | 
 497 |             temp0[idx] = dst * sq_inv_divisor;
 498 |         }
 499 | 
 500 |         foreach (x = start_x ... end_x) {
 501 |             int idx = y * stride + x;
 502 |             float u1_1 = centerp1[idx];
 503 |             float u1_2 = centerp2[idx];
 504 |             float u1_3 = centerp3[idx];
 505 | 
 506 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 507 |             float u1_pq_1 = neighborp1[neighbor_idx];
 508 |             float u1_pq_2 = neighborp2[neighbor_idx];
 509 |             float u1_pq_3 = neighborp3[neighbor_idx];
 510 | 
 511 |             float dst = (
 512 |                 square(u1_1 - u1_pq_1) +
 513 |                 square(u1_2 - u1_pq_2) +
 514 |                 square(u1_3 - u1_pq_3)
 515 |             );
 516 | 
 517 |             temp0[idx] = dst * sq_inv_divisor;
 518 |         }
 519 | 
 520 |         for (uniform int x = end_x; x < width; x++) {
 521 |             uniform int idx = y * stride + x;
 522 |             uniform float u1_1 = centerp1[idx];
 523 |             uniform float u1_2 = centerp2[idx];
 524 |             uniform float u1_3 = centerp3[idx];
 525 | 
 526 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 527 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 528 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 529 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 530 | 
 531 |             uniform float dst = (
 532 |                 square(u1_1 - u1_pq_1) +
 533 |                 square(u1_2 - u1_pq_2) +
 534 |                 square(u1_3 - u1_pq_3)
 535 |             );
 536 | 
 537 |             temp0[idx] = dst * sq_inv_divisor;
 538 |         }
 539 |     }
 540 | }
 541 | 
 542 | export void nlmDistanceYUV_u16(
 543 |     uniform float temp0[], // shape: (height, stride)
 544 |     uniform const unsigned int16 centerp1[], // shape: (height, stride)
 545 |     uniform const unsigned int16 centerp2[], // shape: (height, stride)
 546 |     uniform const unsigned int16 centerp3[], // shape: (height, stride)
 547 |     uniform const unsigned int16 neighborp1[], // shape: (height, stride)
 548 |     uniform const unsigned int16 neighborp2[], // shape: (height, stride)
 549 |     uniform const unsigned int16 neighborp3[], // shape: (height, stride)
 550 |     uniform int offset_x,
 551 |     uniform int offset_y,
 552 |     uniform int width,
 553 |     uniform int height,
 554 |     uniform int stride,
 555 |     uniform float inv_divisor
 556 | ) {
 557 | 
 558 |     uniform int start_x = abs(offset_x);
 559 |     uniform int end_x = width - abs(offset_x);
 560 | 
 561 |     uniform float sq_inv_divisor = square(inv_divisor);
 562 | 
 563 |     for (uniform int y = 0; y < height; y++) {
 564 |         for (uniform int x = 0; x < start_x; x++) {
 565 |             uniform int idx = y * stride + x;
 566 |             uniform float u1_1 = centerp1[idx];
 567 |             uniform float u1_2 = centerp2[idx];
 568 |             uniform float u1_3 = centerp3[idx];
 569 | 
 570 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 571 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 572 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 573 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 574 | 
 575 |             uniform float dst = (
 576 |                 square(u1_1 - u1_pq_1) +
 577 |                 square(u1_2 - u1_pq_2) +
 578 |                 square(u1_3 - u1_pq_3)
 579 |             );
 580 | 
 581 |             temp0[idx] = dst * sq_inv_divisor;
 582 |         }
 583 | 
 584 |         foreach (x = start_x ... end_x) {
 585 |             int idx = y * stride + x;
 586 |             float u1_1 = centerp1[idx];
 587 |             float u1_2 = centerp2[idx];
 588 |             float u1_3 = centerp3[idx];
 589 | 
 590 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 591 |             float u1_pq_1 = neighborp1[neighbor_idx];
 592 |             float u1_pq_2 = neighborp2[neighbor_idx];
 593 |             float u1_pq_3 = neighborp3[neighbor_idx];
 594 | 
 595 |             float dst = (
 596 |                 square(u1_1 - u1_pq_1) +
 597 |                 square(u1_2 - u1_pq_2) +
 598 |                 square(u1_3 - u1_pq_3)
 599 |             );
 600 | 
 601 |             temp0[idx] = dst * sq_inv_divisor;
 602 |         }
 603 | 
 604 |         for (uniform int x = end_x; x < width; x++) {
 605 |             uniform int idx = y * stride + x;
 606 |             uniform float u1_1 = centerp1[idx];
 607 |             uniform float u1_2 = centerp2[idx];
 608 |             uniform float u1_3 = centerp3[idx];
 609 | 
 610 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 611 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 612 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 613 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 614 | 
 615 |             uniform float dst = (
 616 |                 square(u1_1 - u1_pq_1) +
 617 |                 square(u1_2 - u1_pq_2) +
 618 |                 square(u1_3 - u1_pq_3)
 619 |             );
 620 | 
 621 |             temp0[idx] = dst * sq_inv_divisor;
 622 |         }
 623 |     }
 624 | }
 625 | 
 626 | export void nlmDistanceRGB_f32(
 627 |     uniform float temp0[], // shape: (height, stride)
 628 |     uniform const float centerp1[], // shape: (height, stride)
 629 |     uniform const float centerp2[], // shape: (height, stride)
 630 |     uniform const float centerp3[], // shape: (height, stride)
 631 |     uniform const float neighborp1[], // shape: (height, stride)
 632 |     uniform const float neighborp2[], // shape: (height, stride)
 633 |     uniform const float neighborp3[], // shape: (height, stride)
 634 |     uniform int offset_x,
 635 |     uniform int offset_y,
 636 |     uniform int width,
 637 |     uniform int height,
 638 |     uniform int stride
 639 | ) {
 640 | 
 641 |     uniform int start_x = abs(offset_x);
 642 |     uniform int end_x = width - abs(offset_x);
 643 | 
 644 |     for (uniform int y = 0; y < height; y++) {
 645 |         for (uniform int x = 0; x < start_x; x++) {
 646 |             uniform int idx = y * stride + x;
 647 |             uniform float u1_1 = centerp1[idx];
 648 |             uniform float u1_2 = centerp2[idx];
 649 |             uniform float u1_3 = centerp3[idx];
 650 | 
 651 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 652 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 653 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 654 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 655 | 
 656 |             uniform float m_red = (u1_1 + u1_pq_1) / 6.0f;
 657 | 
 658 |             uniform float dst = (
 659 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 660 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 661 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 662 |             );
 663 | 
 664 |             temp0[idx] = dst;
 665 |         }
 666 | 
 667 |         foreach (x = start_x ... end_x) {
 668 |             int idx = y * stride + x;
 669 |             float u1_1 = centerp1[idx];
 670 |             float u1_2 = centerp2[idx];
 671 |             float u1_3 = centerp3[idx];
 672 | 
 673 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 674 |             float u1_pq_1 = neighborp1[neighbor_idx];
 675 |             float u1_pq_2 = neighborp2[neighbor_idx];
 676 |             float u1_pq_3 = neighborp3[neighbor_idx];
 677 | 
 678 |             float m_red = (u1_1 + u1_pq_1) / 6.0f;
 679 | 
 680 |             float dst = (
 681 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 682 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 683 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 684 |             );
 685 | 
 686 |             temp0[idx] = dst;
 687 |         }
 688 | 
 689 |         for (uniform int x = end_x; x < width; x++) {
 690 |             uniform int idx = y * stride + x;
 691 |             uniform float u1_1 = centerp1[idx];
 692 |             uniform float u1_2 = centerp2[idx];
 693 |             uniform float u1_3 = centerp3[idx];
 694 | 
 695 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 696 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 697 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 698 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 699 | 
 700 |             uniform float m_red = (u1_1 + u1_pq_1) / 6.0f;
 701 | 
 702 |             uniform float dst = (
 703 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 704 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 705 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 706 |             );
 707 | 
 708 |             temp0[idx] = dst;
 709 |         }
 710 |     }
 711 | }
 712 | 
 713 | export void nlmDistanceRGB_u8(
 714 |     uniform float temp0[], // shape: (height, stride)
 715 |     uniform const unsigned int8 centerp1[], // shape: (height, stride)
 716 |     uniform const unsigned int8 centerp2[], // shape: (height, stride)
 717 |     uniform const unsigned int8 centerp3[], // shape: (height, stride)
 718 |     uniform const unsigned int8 neighborp1[], // shape: (height, stride)
 719 |     uniform const unsigned int8 neighborp2[], // shape: (height, stride)
 720 |     uniform const unsigned int8 neighborp3[], // shape: (height, stride)
 721 |     uniform int offset_x,
 722 |     uniform int offset_y,
 723 |     uniform int width,
 724 |     uniform int height,
 725 |     uniform int stride,
 726 |     uniform float inv_divisor
 727 | ) {
 728 | 
 729 |     uniform int start_x = abs(offset_x);
 730 |     uniform int end_x = width - abs(offset_x);
 731 | 
 732 |     uniform float sq_inv_divisor = square(inv_divisor);
 733 | 
 734 |     for (uniform int y = 0; y < height; y++) {
 735 |         for (uniform int x = 0; x < start_x; x++) {
 736 |             uniform int idx = y * stride + x;
 737 |             uniform float u1_1 = centerp1[idx];
 738 |             uniform float u1_2 = centerp2[idx];
 739 |             uniform float u1_3 = centerp3[idx];
 740 | 
 741 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 742 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 743 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 744 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 745 | 
 746 |             uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor;
 747 | 
 748 |             uniform float dst = (
 749 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 750 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 751 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 752 |             );
 753 | 
 754 |             temp0[idx] = dst * sq_inv_divisor;
 755 |         }
 756 | 
 757 |         foreach (x = start_x ... end_x) {
 758 |             int idx = y * stride + x;
 759 |             float u1_1 = centerp1[idx];
 760 |             float u1_2 = centerp2[idx];
 761 |             float u1_3 = centerp3[idx];
 762 | 
 763 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 764 |             float u1_pq_1 = neighborp1[neighbor_idx];
 765 |             float u1_pq_2 = neighborp2[neighbor_idx];
 766 |             float u1_pq_3 = neighborp3[neighbor_idx];
 767 | 
 768 |             float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor;
 769 | 
 770 |             float dst = (
 771 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 772 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 773 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 774 |             );
 775 | 
 776 |             temp0[idx] = dst * sq_inv_divisor;
 777 |         }
 778 | 
 779 |         for (uniform int x = end_x; x < width; x++) {
 780 |             uniform int idx = y * stride + x;
 781 |             uniform float u1_1 = centerp1[idx];
 782 |             uniform float u1_2 = centerp2[idx];
 783 |             uniform float u1_3 = centerp3[idx];
 784 | 
 785 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 786 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 787 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 788 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 789 | 
 790 |             uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor;
 791 | 
 792 |             uniform float dst = (
 793 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 794 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 795 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 796 |             );
 797 | 
 798 |             temp0[idx] = dst * sq_inv_divisor;
 799 |         }
 800 |     }
 801 | }
 802 | 
 803 | export void nlmDistanceRGB_u16(
 804 |     uniform float temp0[], // shape: (height, stride)
 805 |     uniform const unsigned int16 centerp1[], // shape: (height, stride)
 806 |     uniform const unsigned int16 centerp2[], // shape: (height, stride)
 807 |     uniform const unsigned int16 centerp3[], // shape: (height, stride)
 808 |     uniform const unsigned int16 neighborp1[], // shape: (height, stride)
 809 |     uniform const unsigned int16 neighborp2[], // shape: (height, stride)
 810 |     uniform const unsigned int16 neighborp3[], // shape: (height, stride)
 811 |     uniform int offset_x,
 812 |     uniform int offset_y,
 813 |     uniform int width,
 814 |     uniform int height,
 815 |     uniform int stride,
 816 |     uniform float inv_divisor
 817 | ) {
 818 | 
 819 |     uniform int start_x = abs(offset_x);
 820 |     uniform int end_x = width - abs(offset_x);
 821 | 
 822 |     uniform float sq_inv_divisor = square(inv_divisor);
 823 | 
 824 |     for (uniform int y = 0; y < height; y++) {
 825 |         for (uniform int x = 0; x < start_x; x++) {
 826 |             uniform int idx = y * stride + x;
 827 |             uniform float u1_1 = centerp1[idx];
 828 |             uniform float u1_2 = centerp2[idx];
 829 |             uniform float u1_3 = centerp3[idx];
 830 | 
 831 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 832 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 833 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 834 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 835 | 
 836 |             uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor;
 837 | 
 838 |             uniform float dst = (
 839 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 840 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 841 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 842 |             );
 843 | 
 844 |             temp0[idx] = dst * sq_inv_divisor;
 845 |         }
 846 | 
 847 |         foreach (x = start_x ... end_x) {
 848 |             int idx = y * stride + x;
 849 |             float u1_1 = centerp1[idx];
 850 |             float u1_2 = centerp2[idx];
 851 |             float u1_3 = centerp3[idx];
 852 | 
 853 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
 854 |             float u1_pq_1 = neighborp1[neighbor_idx];
 855 |             float u1_pq_2 = neighborp2[neighbor_idx];
 856 |             float u1_pq_3 = neighborp3[neighbor_idx];
 857 | 
 858 |             float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor;
 859 | 
 860 |             float dst = (
 861 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 862 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 863 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 864 |             );
 865 | 
 866 |             temp0[idx] = dst * sq_inv_divisor;
 867 |         }
 868 | 
 869 |         for (uniform int x = end_x; x < width; x++) {
 870 |             uniform int idx = y * stride + x;
 871 |             uniform float u1_1 = centerp1[idx];
 872 |             uniform float u1_2 = centerp2[idx];
 873 |             uniform float u1_3 = centerp3[idx];
 874 | 
 875 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
 876 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
 877 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
 878 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
 879 | 
 880 |             uniform float m_red = (u1_1 + u1_pq_1) / 6.0f * inv_divisor;
 881 | 
 882 |             uniform float dst = (
 883 |                 (2.0f / 3.0f + m_red) * square(u1_1 - u1_pq_1) +
 884 |                 (4.0f / 3.0f) * square(u1_2 - u1_pq_2) +
 885 |                 (1.0f - m_red) * square(u1_3 - u1_pq_3)
 886 |             );
 887 | 
 888 |             temp0[idx] = dst * sq_inv_divisor;
 889 |         }
 890 |     }
 891 | }
 892 | 
 893 | // manually unrolled nlmHorizontal()
 894 | static void nlmHorizontalS0(
 895 |     uniform float temp0[], // shape: (height, stride)
 896 |     uniform const float temp[], // shape: (height, stride)
 897 |     uniform int width,
 898 |     uniform int height,
 899 |     uniform int stride
 900 | ) {
 901 |     const uniform int nlm_s = 0;
 902 |     uniform int start = nlm_s;
 903 |     uniform int end = width - nlm_s;
 904 | 
 905 |     for (uniform int y = 0; y < height; y++) {
 906 |         for (uniform int x = 0; x < nlm_s; x++) {
 907 |             uniform float sum = 0.0f;
 908 |             #pragma unroll
 909 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 910 |                 sum += temp[y * stride + CLAMPX(x + j)];
 911 |             }
 912 |             temp0[y * stride + x] = sum;
 913 |         }
 914 | 
 915 |         foreach (x = start ... end) {
 916 |             float sum = 0.0f;
 917 |             #pragma unroll
 918 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 919 |                 sum += temp[y * stride + x + j];
 920 |             }
 921 |             temp0[y * stride + x] = sum;
 922 |         }
 923 | 
 924 |         for (uniform int x = end; x < width; x++) {
 925 |             uniform float sum = 0.0f;
 926 |             #pragma unroll
 927 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 928 |                 sum += temp[y * stride + CLAMPX(x + j)];
 929 |             }
 930 |             temp0[y * stride + x] = sum;
 931 |         }
 932 |     }
 933 | }
 934 | 
 935 | static void nlmHorizontalS1(
 936 |     uniform float temp0[], // shape: (height, stride)
 937 |     uniform const float temp[], // shape: (height, stride)
 938 |     uniform int width,
 939 |     uniform int height,
 940 |     uniform int stride
 941 | ) {
 942 |     const uniform int nlm_s = 1;
 943 |     uniform int start = nlm_s;
 944 |     uniform int end = width - nlm_s;
 945 | 
 946 |     for (uniform int y = 0; y < height; y++) {
 947 |         for (uniform int x = 0; x < nlm_s; x++) {
 948 |             uniform float sum = 0.0f;
 949 |             #pragma unroll
 950 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 951 |                 sum += temp[y * stride + CLAMPX(x + j)];
 952 |             }
 953 |             temp0[y * stride + x] = sum;
 954 |         }
 955 | 
 956 |         foreach (x = start ... end) {
 957 |             float sum = 0.0f;
 958 |             #pragma unroll
 959 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 960 |                 sum += temp[y * stride + x + j];
 961 |             }
 962 |             temp0[y * stride + x] = sum;
 963 |         }
 964 | 
 965 |         for (uniform int x = end; x < width; x++) {
 966 |             uniform float sum = 0.0f;
 967 |             #pragma unroll
 968 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 969 |                 sum += temp[y * stride + CLAMPX(x + j)];
 970 |             }
 971 |             temp0[y * stride + x] = sum;
 972 |         }
 973 |     }
 974 | }
 975 | 
 976 | static void nlmHorizontalS2(
 977 |     uniform float temp0[], // shape: (height, stride)
 978 |     uniform const float temp[], // shape: (height, stride)
 979 |     uniform int width,
 980 |     uniform int height,
 981 |     uniform int stride
 982 | ) {
 983 | 
 984 |     const uniform int nlm_s = 2;
 985 |     uniform int start = nlm_s;
 986 |     uniform int end = width - nlm_s;
 987 | 
 988 |     for (uniform int y = 0; y < height; y++) {
 989 |         for (uniform int x = 0; x < nlm_s; x++) {
 990 |             uniform float sum = 0.0f;
 991 |             #pragma unroll
 992 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
 993 |                 sum += temp[y * stride + CLAMPX(x + j)];
 994 |             }
 995 |             temp0[y * stride + x] = sum;
 996 |         }
 997 | 
 998 |         foreach (x = start ... end) {
 999 |             float sum = 0.0f;
1000 |             #pragma unroll
1001 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
1002 |                 sum += temp[y * stride + x + j];
1003 |             }
1004 |             temp0[y * stride + x] = sum;
1005 |         }
1006 | 
1007 |         for (uniform int x = end; x < width; x++) {
1008 |             uniform float sum = 0.0f;
1009 |             #pragma unroll
1010 |             for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
1011 |                 sum += temp[y * stride + CLAMPX(x + j)];
1012 |             }
1013 |             temp0[y * stride + x] = sum;
1014 |         }
1015 |     }
1016 | }
1017 | 
// Specialisation of nlmHorizontal() for nlm_s == 3 (7-tap window). The tap
// count is a compile-time constant so the inner loops can be unrolled.
//
// For every row y and column x in [0, width):
//     temp0[y * stride + x] = sum_{j = -3..3} temp[y * stride + CLAMPX(x + j)]
// CLAMPX is defined elsewhere in this file (presumably it clamps a column
// index into [0, width - 1] -- confirm at its definition). Interior columns
// never leave the row, so they skip CLAMPX and run vectorised.
//
// temp0: destination, laid out as (height, stride)
// temp:  source, laid out as (height, stride)
static void nlmHorizontalS3(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 3;

    // Clamp the split points so that rows narrower than the window are still
    // handled: without this, `width < nlm_s` makes the right-edge loop start
    // at a negative column and the left-edge loop write past `width`.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // Left edge: taps may fall left of column 0.
        for (uniform int x = 0; x < start; x++) {
            uniform float sum = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = sum;
        }

        // Interior: every tap is inside the row.
        foreach (x = start ... end) {
            float sum = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + x + j];
            }
            temp0[row + x] = sum;
        }

        // Right edge: taps may fall right of column width - 1.
        for (uniform int x = end; x < width; x++) {
            uniform float sum = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = sum;
        }
    }
}
1059 | 
// Specialisation of nlmHorizontal() for nlm_s == 4 (9-tap window). The tap
// count is a compile-time constant so the inner loops can be unrolled.
//
// For every row y and column x in [0, width):
//     temp0[y * stride + x] = sum_{j = -4..4} temp[y * stride + CLAMPX(x + j)]
// CLAMPX is defined elsewhere in this file (presumably it clamps a column
// index into [0, width - 1] -- confirm at its definition). Interior columns
// never leave the row, so they skip CLAMPX and run vectorised.
//
// temp0: destination, laid out as (height, stride)
// temp:  source, laid out as (height, stride)
static void nlmHorizontalS4(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 4;

    // Clamp the split points so that rows narrower than the window are still
    // handled: without this, `width < nlm_s` makes the right-edge loop start
    // at a negative column and the left-edge loop write past `width`.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // Left edge: taps may fall left of column 0.
        for (uniform int x = 0; x < start; x++) {
            uniform float sum = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = sum;
        }

        // Interior: every tap is inside the row.
        foreach (x = start ... end) {
            float sum = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + x + j];
            }
            temp0[row + x] = sum;
        }

        // Right edge: taps may fall right of column width - 1.
        for (uniform int x = end; x < width; x++) {
            uniform float sum = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = sum;
        }
    }
}
1101 | 
// Horizontal box sum with a (2 * nlm_s + 1)-tap window.
//
// For every row y and column x in [0, width):
//     temp0[y * stride + x] = sum_{j = -nlm_s..nlm_s} temp[y * stride + CLAMPX(x + j)]
// CLAMPX is defined elsewhere in this file (presumably it clamps a column
// index into [0, width - 1] -- confirm at its definition).
//
// Radii 0..4 are forwarded to the nlmHorizontalS<N>() specialisations, whose
// tap loops have a compile-time trip count; any other radius runs the generic
// loops below.
//
// temp0: destination, laid out as (height, stride)
// temp:  source, laid out as (height, stride)
// nlm_s: window radius in columns
export void nlmHorizontal(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int nlm_s,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // dynamic dispatch on nlm_s
    if (nlm_s == 0) {
        nlmHorizontalS0(temp0, temp, width, height, stride);
        return ;
    }
    if (nlm_s == 1) {
        nlmHorizontalS1(temp0, temp, width, height, stride);
        return ;
    }
    if (nlm_s == 2) {
        nlmHorizontalS2(temp0, temp, width, height, stride);
        return ;
    }
    if (nlm_s == 3) {
        nlmHorizontalS3(temp0, temp, width, height, stride);
        return ;
    }
    if (nlm_s == 4) {
        nlmHorizontalS4(temp0, temp, width, height, stride);
        return ;
    }

    // Clamp the split points so that rows narrower than the window are still
    // handled: without this, `width < nlm_s` makes the right-edge loop start
    // at a negative column and the left-edge loop write past `width`.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // Left edge: taps may fall left of column 0.
        for (uniform int x = 0; x < start; x++) {
            uniform float sum = 0.0f;
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = sum;
        }

        // Interior: every tap is inside the row.
        foreach (x = start ... end) {
            float sum = 0.0f;
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + x + j];
            }
            temp0[row + x] = sum;
        }

        // Right edge: taps may fall right of column width - 1.
        for (uniform int x = end; x < width; x++) {
            uniform float sum = 0.0f;
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                sum += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = sum;
        }
    }
}
1157 | 
// Welsch weighting: maps an accumulated value `sum` to
// exp(-sum * h2_inv_norm).
static inline float welsch(float sum, uniform float h2_inv_norm) {
    const float exponent = -sum * h2_inv_norm;
    return exp(exponent);
}
1161 | 
// Vertical sliding-window sum over 2 * radius + 1 rows, followed by the
// Welsch weighting.
//
// For each column, `buffer` carries the running sum of the rows currently in
// the window. Rows above the first row are replaced by row 0 and rows below
// the last row by row height - 1. After the incoming row has been added, the
// sum is passed through welsch() and stored in `dstp`; then the row leaving
// the window is subtracted.
//
// dstp:        output, laid out as (height, stride)
// srcp:        input, laid out as (height, stride)
// radius:      window radius in rows (the loops assume radius >= 0)
// h2_inv_norm: scale forwarded to welsch()
// buffer:      scratch of at least `width` floats, overwritten
export void nlmVerticalWelsch(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the window for row 0 minus its bottom-most row: `radius` copies of
    // row 0 stand in for the rows above the image ...
    foreach (col = 0 ... width) {
        buffer[col] = radius * srcp[col];
    }

    // ... followed by rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int k = 0; k < radius; ++k) {
        uniform int src_base = min(k, last_row) * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[src_base + col];
        }
    }

    // Top rows: the row leaving the window is still the replicated row 0.
    uniform int head_rows = min(radius, height);
    for (uniform int row = 0; row < head_rows; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = welsch(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[col];
        }
    }

    // Remaining rows: the leaving row is a real row; the min() on the
    // incoming row only takes effect for the last `radius` rows.
    for (uniform int row = radius; row < height; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int outgoing = (row - radius) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = welsch(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[outgoing + col];
        }
    }
}
1209 | 
// Positive difference: x - y when x is greater than y, otherwise 0.
static inline float fdim(uniform float x, float y) {
    float diff = 0.0f;
    if (x > y) {
        diff = x - y;
    }
    return diff;
}
1214 | 
// Bisquare-style weighting, first power: max-with-zero of
// (1 - sum * h2_inv_norm), computed through fdim().
static inline float bisquareA(float sum, uniform float h2_inv_norm) {
    return fdim(1.0f, sum * h2_inv_norm);
}
1219 | 
// Vertical sliding-window sum over 2 * radius + 1 rows, followed by the
// bisquareA() weighting.
//
// For each column, `buffer` carries the running sum of the rows currently in
// the window. Rows above the first row are replaced by row 0 and rows below
// the last row by row height - 1. After the incoming row has been added, the
// sum is passed through bisquareA() and stored in `dstp`; then the row
// leaving the window is subtracted.
//
// dstp:        output, laid out as (height, stride)
// srcp:        input, laid out as (height, stride)
// radius:      window radius in rows (the loops assume radius >= 0)
// h2_inv_norm: scale forwarded to bisquareA()
// buffer:      scratch of at least `width` floats, overwritten
export void nlmVerticalBisquareA(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the window for row 0 minus its bottom-most row: `radius` copies of
    // row 0 stand in for the rows above the image ...
    foreach (col = 0 ... width) {
        buffer[col] = radius * srcp[col];
    }

    // ... followed by rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int k = 0; k < radius; ++k) {
        uniform int src_base = min(k, last_row) * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[src_base + col];
        }
    }

    // Top rows: the row leaving the window is still the replicated row 0.
    uniform int head_rows = min(radius, height);
    for (uniform int row = 0; row < head_rows; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = bisquareA(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[col];
        }
    }

    // Remaining rows: the leaving row is a real row; the min() on the
    // incoming row only takes effect for the last `radius` rows.
    for (uniform int row = radius; row < height; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int outgoing = (row - radius) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = bisquareA(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[outgoing + col];
        }
    }
}
1267 | 
// Bisquare-style weighting, second power: the square of the positive part
// of (1 - sum * h2_inv_norm), computed through fdim().
static inline float bisquareB(float sum, uniform float h2_inv_norm) {
    const float w = fdim(1.0f, sum * h2_inv_norm);
    return w * w;
}
1273 | 
// Vertical sliding-window sum over 2 * radius + 1 rows, followed by the
// bisquareB() weighting.
//
// For each column, `buffer` carries the running sum of the rows currently in
// the window. Rows above the first row are replaced by row 0 and rows below
// the last row by row height - 1. After the incoming row has been added, the
// sum is passed through bisquareB() and stored in `dstp`; then the row
// leaving the window is subtracted.
//
// dstp:        output, laid out as (height, stride)
// srcp:        input, laid out as (height, stride)
// radius:      window radius in rows (the loops assume radius >= 0)
// h2_inv_norm: scale forwarded to bisquareB()
// buffer:      scratch of at least `width` floats, overwritten
export void nlmVerticalBisquareB(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the window for row 0 minus its bottom-most row: `radius` copies of
    // row 0 stand in for the rows above the image ...
    foreach (col = 0 ... width) {
        buffer[col] = radius * srcp[col];
    }

    // ... followed by rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int k = 0; k < radius; ++k) {
        uniform int src_base = min(k, last_row) * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[src_base + col];
        }
    }

    // Top rows: the row leaving the window is still the replicated row 0.
    uniform int head_rows = min(radius, height);
    for (uniform int row = 0; row < head_rows; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = bisquareB(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[col];
        }
    }

    // Remaining rows: the leaving row is a real row; the min() on the
    // incoming row only takes effect for the last `radius` rows.
    for (uniform int row = radius; row < height; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int outgoing = (row - radius) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = bisquareB(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[outgoing + col];
        }
    }
}
1321 | 
// Bisquare-style weighting, eighth power: the positive part of
// (1 - sum * h2_inv_norm), computed through fdim(), squared three times.
static inline float bisquareC(float sum, uniform float h2_inv_norm) {
    const float w1 = fdim(1.0f, sum * h2_inv_norm);
    const float w2 = w1 * w1;
    const float w4 = w2 * w2;
    return w4 * w4;
}
1329 | 
// Vertical sliding-window sum over 2 * radius + 1 rows, followed by the
// bisquareC() weighting.
//
// For each column, `buffer` carries the running sum of the rows currently in
// the window. Rows above the first row are replaced by row 0 and rows below
// the last row by row height - 1. After the incoming row has been added, the
// sum is passed through bisquareC() and stored in `dstp`; then the row
// leaving the window is subtracted.
//
// dstp:        output, laid out as (height, stride)
// srcp:        input, laid out as (height, stride)
// radius:      window radius in rows (the loops assume radius >= 0)
// h2_inv_norm: scale forwarded to bisquareC()
// buffer:      scratch of at least `width` floats, overwritten
export void nlmVerticalBisquareC(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the window for row 0 minus its bottom-most row: `radius` copies of
    // row 0 stand in for the rows above the image ...
    foreach (col = 0 ... width) {
        buffer[col] = radius * srcp[col];
    }

    // ... followed by rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int k = 0; k < radius; ++k) {
        uniform int src_base = min(k, last_row) * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[src_base + col];
        }
    }

    // Top rows: the row leaving the window is still the replicated row 0.
    uniform int head_rows = min(radius, height);
    for (uniform int row = 0; row < head_rows; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = bisquareC(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[col];
        }
    }

    // Remaining rows: the leaving row is a real row; the min() on the
    // incoming row only takes effect for the last `radius` rows.
    for (uniform int row = radius; row < height; ++row) {
        uniform int incoming = min(row + radius, last_row) * stride;
        uniform int outgoing = (row - radius) * stride;
        uniform int out_base = row * stride;
        foreach (col = 0 ... width) {
            buffer[col] += srcp[incoming + col];
            dstp[out_base + col] = bisquareC(buffer[col], h2_inv_norm);
            buffer[col] -= srcp[outgoing + col];
        }
    }
}
1377 | 
// Accumulates one (offset_x, offset_y) displacement into the running sums
// for a single float plane.
//
// For every pixel (x, y), with u4 = temp1[y, x] and
// u4_mq = temp2[y - offset_y, x - offset_x]:
//     weightp     += u4 + u4_mq
//     max_weightp  = max(u4, u4_mq, max_weightp)
//     wdstp       += u4    * srcp_bwd[y + offset_y, x + offset_x]
//                  + u4_mq * srcp_fwd[y - offset_y, x - offset_x]
// Displaced coordinates go through CLAMPY / CLAMPX, which are defined
// elsewhere in this file (presumably clamping into the plane -- confirm at
// their definitions). Columns whose displaced x stays inside the row skip
// CLAMPX and run vectorised.
//
// weightp, wdstp, max_weightp: accumulators, updated in place
// srcp_bwd, srcp_fwd:          source planes sampled at +offset / -offset
// temp1, temp2:                weight planes sampled at the pixel / at -offset
export void nlmAccumulationCh1_f32(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const float srcp_bwd[], // shape: (height, stride)
    uniform const float srcp_fwd[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp the split points so that each column is visited exactly once.
    // Unclamped, a plane with width < 2 * |offset_x| makes the two scalar
    // edge loops overlap (accumulating those columns twice), and
    // width < |offset_x| makes the right-edge loop start at a negative column.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        // Row offsets do not depend on x.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left edge: displaced columns need clamping.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Interior: x +/- offset_x stays inside [0, width).
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq = srcp_bwd[idx_pq];
            float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Right edge: displaced columns need clamping.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }
    }
}
1446 | 
// Accumulates one (offset_x, offset_y) displacement into the running sums
// for a single 8-bit plane. Source samples are converted to float on load.
//
// For every pixel (x, y), with u4 = temp1[y, x] and
// u4_mq = temp2[y - offset_y, x - offset_x]:
//     weightp     += u4 + u4_mq
//     max_weightp  = max(u4, u4_mq, max_weightp)
//     wdstp       += u4    * srcp_bwd[y + offset_y, x + offset_x]
//                  + u4_mq * srcp_fwd[y - offset_y, x - offset_x]
// Displaced coordinates go through CLAMPY / CLAMPX, which are defined
// elsewhere in this file (presumably clamping into the plane -- confirm at
// their definitions). Columns whose displaced x stays inside the row skip
// CLAMPX and run vectorised.
//
// weightp, wdstp, max_weightp: accumulators, updated in place
// srcp_bwd, srcp_fwd:          source planes sampled at +offset / -offset
// temp1, temp2:                weight planes sampled at the pixel / at -offset
export void nlmAccumulationCh1_u8(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp the split points so that each column is visited exactly once.
    // Unclamped, a plane with width < 2 * |offset_x| makes the two scalar
    // edge loops overlap (accumulating those columns twice), and
    // width < |offset_x| makes the right-edge loop start at a negative column.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        // Row offsets do not depend on x.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left edge: displaced columns need clamping.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Interior: x +/- offset_x stays inside [0, width).
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq = srcp_bwd[idx_pq];
            float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Right edge: displaced columns need clamping.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }
    }
}
1515 | 
// Accumulates one (offset_x, offset_y) displacement into the running sums
// for a single 16-bit plane. Source samples are converted to float on load.
//
// For every pixel (x, y), with u4 = temp1[y, x] and
// u4_mq = temp2[y - offset_y, x - offset_x]:
//     weightp     += u4 + u4_mq
//     max_weightp  = max(u4, u4_mq, max_weightp)
//     wdstp       += u4    * srcp_bwd[y + offset_y, x + offset_x]
//                  + u4_mq * srcp_fwd[y - offset_y, x - offset_x]
// Displaced coordinates go through CLAMPY / CLAMPX, which are defined
// elsewhere in this file (presumably clamping into the plane -- confirm at
// their definitions). Columns whose displaced x stays inside the row skip
// CLAMPX and run vectorised.
//
// weightp, wdstp, max_weightp: accumulators, updated in place
// srcp_bwd, srcp_fwd:          source planes sampled at +offset / -offset
// temp1, temp2:                weight planes sampled at the pixel / at -offset
export void nlmAccumulationCh1_u16(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp the split points so that each column is visited exactly once.
    // Unclamped, a plane with width < 2 * |offset_x| makes the two scalar
    // edge loops overlap (accumulating those columns twice), and
    // width < |offset_x| makes the right-edge loop start at a negative column.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        // Row offsets do not depend on x.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left edge: displaced columns need clamping.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Interior: x +/- offset_x stays inside [0, width).
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq = srcp_bwd[idx_pq];
            float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Right edge: displaced columns need clamping.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }
    }
}
1584 | 
// Accumulates one (offset_x, offset_y) displacement into the running sums
// for two float planes that share the same weights.
//
// For every pixel (x, y), with u4 = temp1[y, x] and
// u4_mq = temp2[y - offset_y, x - offset_x]:
//     weightp     += u4 + u4_mq
//     max_weightp  = max(u4, u4_mq, max_weightp)
//     wdstpN      += u4    * srcp_bwdN[y + offset_y, x + offset_x]
//                  + u4_mq * srcp_fwdN[y - offset_y, x - offset_x]   (N = 1, 2)
// Displaced coordinates go through CLAMPY / CLAMPX, which are defined
// elsewhere in this file (presumably clamping into the plane -- confirm at
// their definitions). Columns whose displaced x stays inside the row skip
// CLAMPX and run vectorised.
//
// weightp, wdstp1, wdstp2, max_weightp: accumulators, updated in place
// srcp_bwd1/2, srcp_fwd1/2: source planes sampled at +offset / -offset
// temp1, temp2:             weight planes sampled at the pixel / at -offset
export void nlmAccumulationCh2_f32(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const float srcp_bwd1[], // shape: (height, stride)
    uniform const float srcp_bwd2[], // shape: (height, stride)
    uniform const float srcp_fwd1[], // shape: (height, stride)
    uniform const float srcp_fwd2[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp the split points so that each column is visited exactly once.
    // Unclamped, a plane with width < 2 * |offset_x| makes the two scalar
    // edge loops overlap (accumulating those columns twice), and
    // width < |offset_x| makes the right-edge loop start at a negative column.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        // Row offsets do not depend on x.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left edge: displaced columns need clamping.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Interior: x +/- offset_x stays inside [0, width).
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Right edge: displaced columns need clamping.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }
    }
}
1664 | 
// Accumulates one symmetric offset pair (+offset and -offset) into the running
// weighted sums of two unsigned 8-bit planes.
//
// For every pixel p = (x, y):
//   w_pq = temp1[p], w_mq = temp2[p - offset]
//   weightp[p]     += w_pq + w_mq
//   max_weightp[p]  = max(w_pq, w_mq, max_weightp[p])
//   wdstpN[p]      += w_pq * srcp_bwdN[p + offset] + w_mq * srcp_fwdN[p - offset]
// Neighbour coordinates go through CLAMPX / CLAMPY, which are defined
// elsewhere in this file (presumably clamping to the valid column / row range).
//
// Each row is split into a scalar left border, a vectorised interior in which
// x - offset_x and x + offset_x both lie in [0, width) so CLAMPX is skipped,
// and a scalar right border.
export void nlmAccumulationCh2_u8(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd2[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Clamp both split points so the three column ranges below partition
    // [0, width) exactly once. Without the clamp, width < 2 * |offset_x| makes
    // the border loops overlap (pixels accumulated twice) or step outside the
    // row.
    uniform int shift_x = abs(offset_x);
    uniform int start_x = min(shift_x, width);
    uniform int end_x = max(start_x, width - shift_x);

    for (uniform int y = 0; y < height; y++) {
        // Row base offsets do not depend on x, so compute them once per row.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left border: x < |offset_x|, so one horizontal neighbour needs CLAMPX.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
        }

        // Interior: both horizontal neighbours are inside the row.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float w_pq = temp1[idx];
            float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            float s_pq_1 = srcp_bwd1[idx_pq];
            float s_pq_2 = srcp_bwd2[idx_pq];
            float s_mq_1 = srcp_fwd1[idx_mq];
            float s_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
        }

        // Right border: x >= width - |offset_x|, so CLAMPX is needed again.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
        }
    }
}
1744 | 
// Accumulates one symmetric offset pair (+offset and -offset) into the running
// weighted sums of two unsigned 16-bit planes.
//
// For every pixel p = (x, y):
//   w_pq = temp1[p], w_mq = temp2[p - offset]
//   weightp[p]     += w_pq + w_mq
//   max_weightp[p]  = max(w_pq, w_mq, max_weightp[p])
//   wdstpN[p]      += w_pq * srcp_bwdN[p + offset] + w_mq * srcp_fwdN[p - offset]
// Neighbour coordinates go through CLAMPX / CLAMPY, which are defined
// elsewhere in this file (presumably clamping to the valid column / row range).
//
// Each row is split into a scalar left border, a vectorised interior in which
// x - offset_x and x + offset_x both lie in [0, width) so CLAMPX is skipped,
// and a scalar right border.
export void nlmAccumulationCh2_u16(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd2[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Clamp both split points so the three column ranges below partition
    // [0, width) exactly once. Without the clamp, width < 2 * |offset_x| makes
    // the border loops overlap (pixels accumulated twice) or step outside the
    // row.
    uniform int shift_x = abs(offset_x);
    uniform int start_x = min(shift_x, width);
    uniform int end_x = max(start_x, width - shift_x);

    for (uniform int y = 0; y < height; y++) {
        // Row base offsets do not depend on x, so compute them once per row.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left border: x < |offset_x|, so one horizontal neighbour needs CLAMPX.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
        }

        // Interior: both horizontal neighbours are inside the row.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float w_pq = temp1[idx];
            float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            float s_pq_1 = srcp_bwd1[idx_pq];
            float s_pq_2 = srcp_bwd2[idx_pq];
            float s_mq_1 = srcp_fwd1[idx_mq];
            float s_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
        }

        // Right border: x >= width - |offset_x|, so CLAMPX is needed again.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
        }
    }
}
1824 | 
// Accumulates one symmetric offset pair (+offset and -offset) into the running
// weighted sums of three 32-bit float planes.
//
// For every pixel p = (x, y):
//   w_pq = temp1[p], w_mq = temp2[p - offset]
//   weightp[p]     += w_pq + w_mq
//   max_weightp[p]  = max(w_pq, w_mq, max_weightp[p])
//   wdstpN[p]      += w_pq * srcp_bwdN[p + offset] + w_mq * srcp_fwdN[p - offset]
// Neighbour coordinates go through CLAMPX / CLAMPY, which are defined
// elsewhere in this file (presumably clamping to the valid column / row range).
//
// Each row is split into a scalar left border, a vectorised interior in which
// x - offset_x and x + offset_x both lie in [0, width) so CLAMPX is skipped,
// and a scalar right border.
export void nlmAccumulationCh3_f32(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float wdstp3[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const float srcp_bwd1[], // shape: (height, stride)
    uniform const float srcp_bwd2[], // shape: (height, stride)
    uniform const float srcp_bwd3[], // shape: (height, stride)
    uniform const float srcp_fwd1[], // shape: (height, stride)
    uniform const float srcp_fwd2[], // shape: (height, stride)
    uniform const float srcp_fwd3[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Clamp both split points so the three column ranges below partition
    // [0, width) exactly once. Without the clamp, width < 2 * |offset_x| makes
    // the border loops overlap (pixels accumulated twice) or step outside the
    // row.
    uniform int shift_x = abs(offset_x);
    uniform int start_x = min(shift_x, width);
    uniform int end_x = max(start_x, width - shift_x);

    for (uniform int y = 0; y < height; y++) {
        // Row base offsets do not depend on x, so compute them once per row.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left border: x < |offset_x|, so one horizontal neighbour needs CLAMPX.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_pq_3 = srcp_bwd3[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];
            uniform float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }

        // Interior: both horizontal neighbours are inside the row.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float w_pq = temp1[idx];
            float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            float s_pq_1 = srcp_bwd1[idx_pq];
            float s_pq_2 = srcp_bwd2[idx_pq];
            float s_pq_3 = srcp_bwd3[idx_pq];
            float s_mq_1 = srcp_fwd1[idx_mq];
            float s_mq_2 = srcp_fwd2[idx_mq];
            float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }

        // Right border: x >= width - |offset_x|, so CLAMPX is needed again.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_pq_3 = srcp_bwd3[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];
            uniform float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }
    }
}
1917 | 
// Accumulates one symmetric offset pair (+offset and -offset) into the running
// weighted sums of three unsigned 8-bit planes.
//
// For every pixel p = (x, y):
//   w_pq = temp1[p], w_mq = temp2[p - offset]
//   weightp[p]     += w_pq + w_mq
//   max_weightp[p]  = max(w_pq, w_mq, max_weightp[p])
//   wdstpN[p]      += w_pq * srcp_bwdN[p + offset] + w_mq * srcp_fwdN[p - offset]
// Neighbour coordinates go through CLAMPX / CLAMPY, which are defined
// elsewhere in this file (presumably clamping to the valid column / row range).
//
// Each row is split into a scalar left border, a vectorised interior in which
// x - offset_x and x + offset_x both lie in [0, width) so CLAMPX is skipped,
// and a scalar right border.
export void nlmAccumulationCh3_u8(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float wdstp3[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd3[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd2[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd3[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Clamp both split points so the three column ranges below partition
    // [0, width) exactly once. Without the clamp, width < 2 * |offset_x| makes
    // the border loops overlap (pixels accumulated twice) or step outside the
    // row.
    uniform int shift_x = abs(offset_x);
    uniform int start_x = min(shift_x, width);
    uniform int end_x = max(start_x, width - shift_x);

    for (uniform int y = 0; y < height; y++) {
        // Row base offsets do not depend on x, so compute them once per row.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left border: x < |offset_x|, so one horizontal neighbour needs CLAMPX.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_pq_3 = srcp_bwd3[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];
            uniform float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }

        // Interior: both horizontal neighbours are inside the row.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float w_pq = temp1[idx];
            float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            float s_pq_1 = srcp_bwd1[idx_pq];
            float s_pq_2 = srcp_bwd2[idx_pq];
            float s_pq_3 = srcp_bwd3[idx_pq];
            float s_mq_1 = srcp_fwd1[idx_mq];
            float s_mq_2 = srcp_fwd2[idx_mq];
            float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }

        // Right border: x >= width - |offset_x|, so CLAMPX is needed again.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_pq_3 = srcp_bwd3[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];
            uniform float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }
    }
}
2010 | 
// Accumulates one symmetric offset pair (+offset and -offset) into the running
// weighted sums of three unsigned 16-bit planes.
//
// For every pixel p = (x, y):
//   w_pq = temp1[p], w_mq = temp2[p - offset]
//   weightp[p]     += w_pq + w_mq
//   max_weightp[p]  = max(w_pq, w_mq, max_weightp[p])
//   wdstpN[p]      += w_pq * srcp_bwdN[p + offset] + w_mq * srcp_fwdN[p - offset]
// Neighbour coordinates go through CLAMPX / CLAMPY, which are defined
// elsewhere in this file (presumably clamping to the valid column / row range).
//
// Each row is split into a scalar left border, a vectorised interior in which
// x - offset_x and x + offset_x both lie in [0, width) so CLAMPX is skipped,
// and a scalar right border.
export void nlmAccumulationCh3_u16(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float wdstp3[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd3[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd2[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd3[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Clamp both split points so the three column ranges below partition
    // [0, width) exactly once. Without the clamp, width < 2 * |offset_x| makes
    // the border loops overlap (pixels accumulated twice) or step outside the
    // row.
    uniform int shift_x = abs(offset_x);
    uniform int start_x = min(shift_x, width);
    uniform int end_x = max(start_x, width - shift_x);

    for (uniform int y = 0; y < height; y++) {
        // Row base offsets do not depend on x, so compute them once per row.
        uniform int row = y * stride;
        uniform int row_pq = CLAMPY(y + offset_y) * stride;
        uniform int row_mq = CLAMPY(y - offset_y) * stride;

        // Left border: x < |offset_x|, so one horizontal neighbour needs CLAMPX.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_pq_3 = srcp_bwd3[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];
            uniform float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }

        // Interior: both horizontal neighbours are inside the row.
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_pq = row_pq + (x + offset_x);
            int idx_mq = row_mq + (x - offset_x);

            float w_pq = temp1[idx];
            float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            float s_pq_1 = srcp_bwd1[idx_pq];
            float s_pq_2 = srcp_bwd2[idx_pq];
            float s_pq_3 = srcp_bwd3[idx_pq];
            float s_mq_1 = srcp_fwd1[idx_mq];
            float s_mq_2 = srcp_fwd2[idx_mq];
            float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }

        // Right border: x >= width - |offset_x|, so CLAMPX is needed again.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_pq = row_pq + CLAMPX(x + offset_x);
            uniform int idx_mq = row_mq + CLAMPX(x - offset_x);

            uniform float w_pq = temp1[idx];
            uniform float w_mq = temp2[idx_mq];

            weightp[idx] += w_pq + w_mq;
            max_weightp[idx] = max(max(w_pq, w_mq), max_weightp[idx]);

            uniform float s_pq_1 = srcp_bwd1[idx_pq];
            uniform float s_pq_2 = srcp_bwd2[idx_pq];
            uniform float s_pq_3 = srcp_bwd3[idx_pq];
            uniform float s_mq_1 = srcp_fwd1[idx_mq];
            uniform float s_mq_2 = srcp_fwd2[idx_mq];
            uniform float s_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += w_pq * s_pq_1 + w_mq * s_mq_1;
            wdstp2[idx] += w_pq * s_pq_2 + w_mq * s_mq_2;
            wdstp3[idx] += w_pq * s_pq_3 + w_mq * s_mq_3;
        }
    }
}
2103 | 
// Normalises the accumulated sums into the final 32-bit float output plane.
//
// Per pixel, the source sample receives the weight wref * max_weightp and
//   dstp = (wref * max_weightp * srcp + wdstp) / (wref * max_weightp + weightp)
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh1_f32(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride) // epsilon
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];
            float weighted_sum = self_weight * srcp[pos] + wdstp[pos];

            dstp[pos] = weighted_sum / total_weight;
        }
    }
}
2126 | 
// Normalises the accumulated sums into the final unsigned 8-bit output plane.
//
// Per pixel, the source sample receives the weight wref * max_weightp and
//   value = (wref * max_weightp * srcp + wdstp) / (wref * max_weightp + weightp)
// The value is rounded with round(), limited to [0, peak] and stored.
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh1_u8(
    uniform unsigned int8 dstp[], // shape: (height, stride)
    uniform const unsigned int8 srcp[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride) // epsilon
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];

            int rounded = (int) round((self_weight * srcp[pos] + wdstp[pos]) / total_weight);

            dstp[pos] = max(0, min(rounded, peak));
        }
    }
}
2150 | 
// Normalises the accumulated sums into the final unsigned 16-bit output plane.
//
// Per pixel, the source sample receives the weight wref * max_weightp and
//   value = (wref * max_weightp * srcp + wdstp) / (wref * max_weightp + weightp)
// The value is rounded with round(), limited to [0, peak] and stored.
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh1_u16(
    uniform unsigned int16 dstp[], // shape: (height, stride)
    uniform const unsigned int16 srcp[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride) // epsilon
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];

            int rounded = (int) round((self_weight * srcp[pos] + wdstp[pos]) / total_weight);

            dstp[pos] = max(0, min(rounded, peak));
        }
    }
}
2174 | 
// Normalises the accumulated sums into two final 32-bit float output planes.
//
// Both planes share weightp and max_weightp. Per pixel, the source sample
// receives the weight wref * max_weightp and
//   dstpN = (wref * max_weightp * srcpN + wdstpN) / (wref * max_weightp + weightp)
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh2_f32(
    uniform float dstp1[], // shape: (height, stride)
    uniform float dstp2[], // shape: (height, stride)
    uniform const float srcp1[], // shape: (height, stride)
    uniform const float srcp2[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];

            float sum1 = self_weight * srcp1[pos] + wdstp1[pos];
            float sum2 = self_weight * srcp2[pos] + wdstp2[pos];

            dstp1[pos] = sum1 / total_weight;
            dstp2[pos] = sum2 / total_weight;
        }
    }
}
2201 | 
// Normalises the accumulated sums into two final unsigned 8-bit output planes.
//
// Both planes share weightp and max_weightp. Per pixel, the source sample
// receives the weight wref * max_weightp and
//   value = (wref * max_weightp * srcpN + wdstpN) / (wref * max_weightp + weightp)
// Each value is rounded with round(), limited to [0, peak] and stored.
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh2_u8(
    uniform unsigned int8 dstp1[], // shape: (height, stride)
    uniform unsigned int8 dstp2[], // shape: (height, stride)
    uniform const unsigned int8 srcp1[], // shape: (height, stride)
    uniform const unsigned int8 srcp2[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];

            int rounded1 = (int) round((self_weight * srcp1[pos] + wdstp1[pos]) / total_weight);
            int rounded2 = (int) round((self_weight * srcp2[pos] + wdstp2[pos]) / total_weight);

            dstp1[pos] = max(0, min(rounded1, peak));
            dstp2[pos] = max(0, min(rounded2, peak));
        }
    }
}
2229 | 
// Normalises the accumulated sums into two final unsigned 16-bit output planes.
//
// Both planes share weightp and max_weightp. Per pixel, the source sample
// receives the weight wref * max_weightp and
//   value = (wref * max_weightp * srcpN + wdstpN) / (wref * max_weightp + weightp)
// Each value is rounded with round(), limited to [0, peak] and stored.
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh2_u16(
    uniform unsigned int16 dstp1[], // shape: (height, stride)
    uniform unsigned int16 dstp2[], // shape: (height, stride)
    uniform const unsigned int16 srcp1[], // shape: (height, stride)
    uniform const unsigned int16 srcp2[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];

            int rounded1 = (int) round((self_weight * srcp1[pos] + wdstp1[pos]) / total_weight);
            int rounded2 = (int) round((self_weight * srcp2[pos] + wdstp2[pos]) / total_weight);

            dstp1[pos] = max(0, min(rounded1, peak));
            dstp2[pos] = max(0, min(rounded2, peak));
        }
    }
}
2257 | 
// Normalises the accumulated sums into three final 32-bit float output planes.
//
// All planes share weightp and max_weightp. Per pixel, the source sample
// receives the weight wref * max_weightp and
//   dstpN = (wref * max_weightp * srcpN + wdstpN) / (wref * max_weightp + weightp)
// Only the first `width` columns of each `stride`-long row are written.
export void nlmFinishCh3_f32(
    uniform float dstp1[], // shape: (height, stride)
    uniform float dstp2[], // shape: (height, stride)
    uniform float dstp3[], // shape: (height, stride)
    uniform const float srcp1[], // shape: (height, stride)
    uniform const float srcp2[], // shape: (height, stride)
    uniform const float srcp3[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float wdstp3[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    for (uniform int row = 0; row < height; row++) {
        uniform int row_base = row * stride;

        foreach (col = 0 ... width) {
            int pos = row_base + col;

            // Weight given to the pixel's own source sample.
            float self_weight = wref * max_weightp[pos];
            float total_weight = self_weight + weightp[pos];

            float sum1 = self_weight * srcp1[pos] + wdstp1[pos];
            float sum2 = self_weight * srcp2[pos] + wdstp2[pos];
            float sum3 = self_weight * srcp3[pos] + wdstp3[pos];

            dstp1[pos] = sum1 / total_weight;
            dstp2[pos] = sum2 / total_weight;
            dstp3[pos] = sum3 / total_weight;
        }
    }
}
2288 | 
2289 | export void nlmFinishCh3_u8(
2290 |     uniform unsigned int8 dstp1[], // output 1, shape: (height, stride)
2291 |     uniform unsigned int8 dstp2[], // output 2, shape: (height, stride)
2292 |     uniform unsigned int8 dstp3[], // output 3, shape: (height, stride)
2293 |     uniform const unsigned int8 srcp1[], // source 1, shape: (height, stride)
2294 |     uniform const unsigned int8 srcp2[], // source 2, shape: (height, stride)
2295 |     uniform const unsigned int8 srcp3[], // source 3, shape: (height, stride)
2296 |     uniform const float weightp[], // term added to the divisor, shape: (height, stride)
2297 |     uniform const float wdstp1[], // term added to the numerator of output 1, shape: (height, stride)
2298 |     uniform const float wdstp2[], // term added to the numerator of output 2, shape: (height, stride)
2299 |     uniform const float wdstp3[], // term added to the numerator of output 3, shape: (height, stride)
2300 |     uniform const float max_weightp[], // scaled by wref, shape: (height, stride)
2301 |     uniform float wref, // scale applied to max_weightp
2302 |     uniform int width, // columns processed in each row
2303 |     uniform int height, // rows processed
2304 |     uniform int stride, // elements between the starts of consecutive rows
2305 |     uniform int peak // upper limit of the stored values
2306 | ) {
2307 |     // dst = round((wref * max_weight * src + wdst) / (wref * max_weight + weight)), limited to [0, peak]
2308 |     foreach (row = 0 ... height, col = 0 ... width) {
2309 |         const int pos = row * stride + col;
2310 |         const float self_weight = wref * max_weightp[pos];
2311 |         const float weight_sum = self_weight + weightp[pos];
2312 |         const int rounded1 = (int) round((self_weight * srcp1[pos] + wdstp1[pos]) / weight_sum);
2313 |         dstp1[pos] = max(0, min(rounded1, peak));
2314 |         const int rounded2 = (int) round((self_weight * srcp2[pos] + wdstp2[pos]) / weight_sum);
2315 |         dstp2[pos] = max(0, min(rounded2, peak));
2316 |         const int rounded3 = (int) round((self_weight * srcp3[pos] + wdstp3[pos]) / weight_sum);
2317 |         dstp3[pos] = max(0, min(rounded3, peak));
2318 |     }
2319 | }
2320 | 
2321 | export void nlmFinishCh3_u16(
2322 |     uniform unsigned int16 dstp1[], // output 1, shape: (height, stride)
2323 |     uniform unsigned int16 dstp2[], // output 2, shape: (height, stride)
2324 |     uniform unsigned int16 dstp3[], // output 3, shape: (height, stride)
2325 |     uniform const unsigned int16 srcp1[], // source 1, shape: (height, stride)
2326 |     uniform const unsigned int16 srcp2[], // source 2, shape: (height, stride)
2327 |     uniform const unsigned int16 srcp3[], // source 3, shape: (height, stride)
2328 |     uniform const float weightp[], // term added to the divisor, shape: (height, stride)
2329 |     uniform const float wdstp1[], // term added to the numerator of output 1, shape: (height, stride)
2330 |     uniform const float wdstp2[], // term added to the numerator of output 2, shape: (height, stride)
2331 |     uniform const float wdstp3[], // term added to the numerator of output 3, shape: (height, stride)
2332 |     uniform const float max_weightp[], // scaled by wref, shape: (height, stride)
2333 |     uniform float wref, // scale applied to max_weightp
2334 |     uniform int width, // columns processed in each row
2335 |     uniform int height, // rows processed
2336 |     uniform int stride, // elements between the starts of consecutive rows
2337 |     uniform int peak // upper limit of the stored values
2338 | ) {
2339 |     // dst = round((wref * max_weight * src + wdst) / (wref * max_weight + weight)), limited to [0, peak]
2340 |     foreach (row = 0 ... height, col = 0 ... width) {
2341 |         const int pos = row * stride + col;
2342 |         const float self_weight = wref * max_weightp[pos];
2343 |         const float weight_sum = self_weight + weightp[pos];
2344 |         const int rounded1 = (int) round((self_weight * srcp1[pos] + wdstp1[pos]) / weight_sum);
2345 |         dstp1[pos] = max(0, min(rounded1, peak));
2346 |         const int rounded2 = (int) round((self_weight * srcp2[pos] + wdstp2[pos]) / weight_sum);
2347 |         dstp2[pos] = max(0, min(rounded2, peak));
2348 |         const int rounded3 = (int) round((self_weight * srcp3[pos] + wdstp3[pos]) / weight_sum);
2349 |         dstp3[pos] = max(0, min(rounded3, peak));
2350 |     }
2351 | }
2352 | 


--------------------------------------------------------------------------------