├── source
├── config.h.in
├── vsnlm.cpp
└── nlm.ispc
├── README.md
├── .github
└── workflows
│ ├── linux-arm64.yml
│ ├── linux.yml
│ └── windows.yml
├── CMakeLists.txt
└── LICENSE
/source/config.h.in:
--------------------------------------------------------------------------------
1 | #define VERSION "@VCS_TAG@" /* @VCS_TAG@ is substituted by CMake's configure_file(): the `git describe` result, or "unknown" */
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # vs-nlm-ispc
2 | Non-local means denoising filter, a drop-in replacement for the venerable [KNLMeansCL](https://github.com/Khanattila/KNLMeansCL), but without the OpenCL dependency (CPU only).
3 |
4 | Both x86 and ARM are supported.
5 |
6 | ## Usage
7 | Prototype:
8 |
9 | `core.nlm_ispc.NLMeans(clip clip[, int d = 1, int a = 2, int s = 4, float h = 1.2, string channels = "AUTO", int wmode = 0, float wref = 1.0, clip rclip = None])`
10 |
11 | ## Compilation
12 | [ISPC](https://github.com/ispc/ispc) is required.
13 |
14 | ### x86
15 | ```bash
16 | cmake -S . -B build -D CMAKE_BUILD_TYPE=Release \
17 | -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8" \
18 | -D CMAKE_ISPC_FLAGS="--opt=fast-math"
19 |
20 | cmake --build build
21 |
22 | cmake --install build
23 | ```
24 |
25 | ### arm
26 | ```bash
27 | cmake -S . -B build -D CMAKE_BUILD_TYPE=Release \
28 | -D CMAKE_ISPC_INSTRUCTION_SETS="neon-i32x4" \
29 | -D CMAKE_ISPC_FLAGS="--opt=fast-math"
30 |
31 | cmake --build build
32 |
33 | cmake --install build
34 | ```
35 |
36 |
--------------------------------------------------------------------------------
/.github/workflows/linux-arm64.yml:
--------------------------------------------------------------------------------
1 | # CI build of the plugin for Linux on ARM64 (NEON ISPC target).
2 | name: Build (Linux, ARM64)
3 |
4 | on:
5 |   push:
6 |     paths:
7 |       - 'source/nlm.ispc'
8 |       - 'source/vsnlm.cpp'
9 |       - '.github/workflows/linux-arm64.yml'
10 |   workflow_dispatch:
11 |
12 | jobs:
13 |   build-linux:
14 |     runs-on: ubuntu-24.04-arm
15 |     steps:
16 |       - name: Checkout repo
17 |         uses: actions/checkout@v4
18 |         with:
19 |           # Full history, so `git describe --tags` in CMakeLists.txt can see tags.
20 |           fetch-depth: 0
21 |
22 |       - name: Download ISPC
23 |         run: |
24 |           curl -s -o ispc.tar.gz -LJO https://github.com/ispc/ispc/releases/download/v1.25.3/ispc-v1.25.3-linux.aarch64.tar.gz
25 |           tar -xzf ispc.tar.gz
26 |           mv ispc-*/ ispc/
27 |
28 |       - name: Download VapourSynth headers
29 |         run: |
30 |           wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
31 |           unzip -q vs.zip
32 |           mv vapoursynth*/ vapoursynth
33 |
34 |       - name: Setup Ninja
35 |         run: pip install ninja
36 |
37 |       - name: Configure
38 |         run: cmake -S . -B build -G Ninja -LA
39 |           -D CMAKE_BUILD_TYPE=Release
40 |           -D CMAKE_CXX_FLAGS="-Wall"
41 |           -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc"
42 |           -D CMAKE_ISPC_INSTRUCTION_SETS="neon-i32x4"
43 |           -D CMAKE_ISPC_FLAGS="--opt=fast-math"
44 |           -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include"
45 |
46 |       - name: Build
47 |         run: cmake --build build --verbose
48 |
49 |       - name: Install
50 |         run: cmake --install build --prefix install
51 |
52 |       # Upload is currently disabled (`if: false`); the artifact name matches this job's architecture.
53 |       - name: Upload
54 |         uses: actions/upload-artifact@v4
55 |         if: false
56 |         with:
57 |           name: Linux-arm64
58 |           path: install/lib/*.so
59 |
60 |
--------------------------------------------------------------------------------
/.github/workflows/linux.yml:
--------------------------------------------------------------------------------
1 | # CI build of the plugin for Linux on x86 (SSE2/AVX/AVX2 ISPC targets).
2 | name: Build (Linux)
3 |
4 | on:
5 |   push:
6 |     paths:
7 |       - 'source/nlm.ispc'
8 |       - 'source/vsnlm.cpp'
9 |       - '.github/workflows/linux.yml'
10 |   workflow_dispatch:
11 |
12 | jobs:
13 |   build-linux:
14 |     runs-on: ubuntu-22.04
15 |     steps:
16 |       - name: Checkout repo
17 |         uses: actions/checkout@v4
18 |         with:
19 |           # Full history, so `git describe --tags` in CMakeLists.txt can see tags.
20 |           fetch-depth: 0
21 |
22 |       - name: Download ISPC
23 |         run: |
24 |           curl -s -o ispc.tar.gz -LJO https://github.com/ispc/ispc/releases/download/v1.20.0/ispc-v1.20.0-linux.tar.gz
25 |           tar -xzf ispc.tar.gz
26 |           mv ispc-*/ ispc/
27 |
28 |       - name: Download VapourSynth headers
29 |         run: |
30 |           wget -q -O vs.zip https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
31 |           unzip -q vs.zip
32 |           mv vapoursynth*/ vapoursynth
33 |
34 |       - name: Setup Ninja
35 |         run: pip install ninja
36 |
37 |       - name: Configure
38 |         run: cmake -S . -B build -G Ninja -LA
39 |           -D CMAKE_BUILD_TYPE=Release
40 |           -D CMAKE_CXX_COMPILER=g++-12
41 |           -D CMAKE_CXX_FLAGS="-Wall"
42 |           -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc"
43 |           -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8"
44 |           -D CMAKE_ISPC_FLAGS="--opt=fast-math"
45 |           -D VS_INCLUDE_DIR="`pwd`/vapoursynth/include"
46 |
47 |       - name: Build
48 |         run: cmake --build build --verbose
49 |
50 |       - name: Install
51 |         run: cmake --install build --prefix install
52 |
53 |       # Upload is currently disabled (`if: false`).
54 |       - name: Upload
55 |         uses: actions/upload-artifact@v4
56 |         if: false
57 |         with:
58 |           name: Linux-x64
59 |           path: install/lib/*.so
60 |
61 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.20.0)
2 |
3 | project(vs-nlm-ispc VERSION 0.1 LANGUAGES CXX ISPC)
4 |
5 | # The plugin is a single shared library built from one C++ and one ISPC source.
6 | add_library(vsnlm_ispc SHARED source/vsnlm.cpp source/nlm.ispc)
7 |
8 | set_target_properties(vsnlm_ispc PROPERTIES
9 |   CXX_EXTENSIONS OFF
10 |   CXX_STANDARD 17
11 |   CXX_STANDARD_REQUIRED ON
12 | )
13 |
14 | # First choice: ask pkg-config for VapourSynth. When it is found, its include
15 | # directories are used and the plugin installs into <VS_LIBDIR>/vapoursynth.
16 | find_package(PkgConfig QUIET MODULE)
17 |
18 | if(PKG_CONFIG_FOUND)
19 |   pkg_search_module(VS vapoursynth)
20 |
21 |   if(VS_FOUND)
22 |     message(STATUS "Found VapourSynth r${VS_VERSION}")
23 |
24 |     cmake_path(APPEND install_dir ${VS_LIBDIR} vapoursynth)
25 |     target_include_directories(vsnlm_ispc PRIVATE ${VS_INCLUDE_DIRS})
26 |
27 |     install(TARGETS vsnlm_ispc LIBRARY DESTINATION ${install_dir})
28 |   endif()
29 | endif()
30 |
31 | # Fallback: the user points VS_INCLUDE_DIR at the headers (-D VS_INCLUDE_DIR=...),
32 | # and the install rule carries no explicit DESTINATION.
33 | if(NOT VS_FOUND)
34 |   set(VS_INCLUDE_DIR "" CACHE PATH "Path to VapourSynth headers")
35 |
36 |   if(VS_INCLUDE_DIR STREQUAL "")
37 |     message(WARNING "VapourSynth not found")
38 |   endif()
39 |
40 |   target_include_directories(vsnlm_ispc PRIVATE ${VS_INCLUDE_DIR})
41 |
42 |   install(TARGETS vsnlm_ispc LIBRARY RUNTIME)
43 | endif()
44 |
45 | # Derive the plugin version string from `git describe`. git runs in
46 | # PROJECT_SOURCE_DIR so the right repository is queried even when this project
47 | # is pulled into a parent build; trailing whitespace is stripped by CMake itself.
48 | find_package(Git QUIET)
49 |
50 | if(GIT_FOUND)
51 |   execute_process(
52 |     COMMAND ${GIT_EXECUTABLE} describe --tags --long --always
53 |     WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
54 |     OUTPUT_VARIABLE VCS_TAG
55 |     OUTPUT_STRIP_TRAILING_WHITESPACE
56 |   )
57 | endif()
58 |
59 | if(VCS_TAG)
60 |   message(STATUS "vs-nlm-ispc ${VCS_TAG}")
61 | else()
62 |   message(WARNING "unknown plugin version")
63 |   set(VCS_TAG "unknown")
64 | endif()
65 |
66 | # Generate config.h in the build directory; @ONLY restricts substitution to @VAR@ references.
67 | configure_file(source/config.h.in config.h @ONLY)
68 |
69 | target_include_directories(vsnlm_ispc PRIVATE "${CMAKE_CURRENT_BINARY_DIR}")
70 |
--------------------------------------------------------------------------------
/.github/workflows/windows.yml:
--------------------------------------------------------------------------------
1 | # CI build of the plugin for Windows; when dispatched manually with a tag it
2 | # also packs the DLL and publishes it as a pre-release.
3 | name: Build (Windows)
4 |
5 | on:
6 |   push:
7 |     paths:
8 |       - 'source/nlm.ispc'
9 |       - 'source/vsnlm.cpp'
10 |       - '.github/workflows/windows.yml'
11 |   workflow_dispatch:
12 |     inputs:
13 |       tag:
14 |         description: 'which tag to upload to'
15 |         default: ''
16 |
17 | jobs:
18 |   build-windows:
19 |     runs-on: windows-2022
20 |     # NOTE(review): no step in this job declares `id: output`, so this value
21 |     # looks like it is always empty - confirm whether it is still needed.
22 |     outputs:
23 |       runID: ${{ steps.output.outputs.runID }}
24 |
25 |     defaults:
26 |       run:
27 |         shell: cmd
28 |
29 |     steps:
30 |       - name: Checkout repo
31 |         uses: actions/checkout@v4
32 |         with:
33 |           # Full history, so the `git describe --tags` calls can see tags.
34 |           fetch-depth: 0
35 |
36 |       - name: Setup MSVC
37 |         uses: ilammy/msvc-dev-cmd@v1
38 |
39 |       - name: Download VapourSynth headers
40 |         run: |
41 |           curl -s -o vs.zip -L https://github.com/vapoursynth/vapoursynth/archive/refs/tags/R57.zip
42 |           unzip -q vs.zip
43 |           mv vapoursynth-*/ vapoursynth/
44 |
45 |       - name: Download ISPC
46 |         run: |
47 |           curl -s -o ispc.zip -LJO https://github.com/ispc/ispc/releases/download/v1.20.0/ispc-v1.20.0-windows.zip
48 |           unzip -q ispc.zip
49 |           mv ispc-*/ ispc/
50 |           tree ispc
51 |
52 |       - name: Configure
53 |         shell: bash
54 |         run: cmake -S . -B build -G Ninja
55 |           -D VS_INCLUDE_DIR="$(pwd)\vapoursynth\include"
56 |           -D CMAKE_BUILD_TYPE=Release
57 |           -D CMAKE_CXX_COMPILER="clang++"
58 |           -D CMAKE_CXX_FLAGS="-Wall"
59 |           -D CMAKE_ISPC_COMPILER="$(pwd)/ispc/bin/ispc.exe"
60 |           -D CMAKE_ISPC_FLAGS="--opt=fast-math"
61 |           -D CMAKE_ISPC_INSTRUCTION_SETS="sse2-i32x4;avx1-i32x4;avx2-i32x8"
62 |           -D CMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded
63 |
64 |       - name: Build
65 |         run: cmake --build build --verbose
66 |
67 |       - name: Install
68 |         run: |
69 |           cmake --install build --prefix install
70 |           mkdir artifact
71 |           copy install\bin\vsnlm_ispc.dll artifact\
72 |
73 |       - name: Upload
74 |         uses: actions/upload-artifact@v4
75 |         with:
76 |           name: Windows-x64
77 |           path: artifact
78 |
79 |       - name: Describe
80 |         run: git describe --tags --long
81 |
82 |       # The remaining steps run only for a manual dispatch with a non-empty tag.
83 |       - name: Compress artifact for release
84 |         if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
85 |         run: |
86 |           cd artifact
87 |           7z a -t7z -mx=7 ../vs-nlm-ispc-windows-x64.${{ github.event.inputs.tag }}.7z .
88 |
89 |       - name: Release
90 |         uses: softprops/action-gh-release@v1
91 |         if: github.event_name == 'workflow_dispatch' && github.event.inputs.tag != ''
92 |         with:
93 |           tag_name: ${{ github.event.inputs.tag }}
94 |           files: vs-nlm-ispc-windows-x64.${{ github.event.inputs.tag }}.7z
95 |           fail_on_unmatched_files: true
96 |           generate_release_notes: false
97 |           prerelease: true
98 |
99 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/source/vsnlm.cpp:
--------------------------------------------------------------------------------
1 | // based on KNLMeansCL by Khanattila
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 |
17 | #include
18 | #include
19 |
20 | #include // generated by the ispc compiler
21 |
22 | #include // generated by cmake and git
23 |
24 | namespace {
25 | enum struct ChannelMode { Y, UV, YUV, RGB }; // plane selection; getPtrs maps Y -> plane 0, UV -> planes 1 and 2, YUV/RGB -> planes 0..2
26 |
27 | struct NLMData { // per-filter-instance state, reached through *instanceData in nlmInit and nlmGetFrame
28 | VSNodeRef * node; // clip
29 | const VSVideoInfo *vi;
30 | int d; // nlmGetFrame requests frames n-d .. n+d, clamped to the clip's frame range
31 | int a;
32 | int s; // enters the weight normalisation in nlmGetFrame as (2*s+1)^2
33 | float h; // enters the weight normalisation in nlmGetFrame as h^2
34 | ChannelMode channels;
35 | decltype(&ispc::nlmVerticalWelsch) nlm_vertical; // wmode
36 | float wref;
37 | VSNodeRef * ref_node; // rclip; nlmGetFrame uses `node` instead when this is null
38 |
39 | // run-time resources
40 | std::shared_mutex workspaces_lock; // NOTE(review): presumably guards `workspaces` - confirm at the use site
41 | std::unordered_map workspaces;
42 | };
43 | }
44 |
45 | template
46 | static inline auto castVoidPtr(T * p) noexcept { // reinterprets p; which cast is used is decided at compile time by a constness check
47 | if constexpr (std::is_const_v) { // NOTE(review): template arguments and cast target types are missing from this listing - presumably const void * / void *; confirm against the repository
48 | return reinterpret_cast(p);
49 | } else {
50 | return reinterpret_cast(p);
51 | }
52 | }
53 |
54 | template
55 | static inline std::array castPtrs(std::array ptrs) { // returns a new array holding ptrs[0..2], each converted to T1 * with a C-style cast
56 | return {
57 | (T1 *) ptrs[0],
58 | (T1 *) ptrs[1],
59 | (T1 *) ptrs[2],
60 | };
61 | }
62 |
63 | template
64 | static inline constexpr T square(T x) noexcept { // returns x multiplied by itself, in the argument's own type
65 | return x * x;
66 | }
67 |
68 | // T: (const) VSFrameRef
69 | template
70 | static inline auto getPtrs( // collects the plane pointers of `frame` that `channels` selects and returns them as a 3-element array
71 | T * frame,
72 | ChannelMode channels,
73 | const VSAPI * vsapi
74 | ) noexcept {
75 |
76 | using value_type = std::conditional_t, const void, void>; // const void or void, chosen by a compile-time condition
77 |
78 | std::array ptrs {}; // value-initialised; slots for planes that are not selected are never assigned below
79 |
80 | auto get_ptr = [frame, vsapi](int plane) { // one branch fetches the read pointer, the other the write pointer
81 | if constexpr (std::is_const_v) {
82 | return castVoidPtr(vsapi->getReadPtr(frame, plane));
83 | } else {
84 | return castVoidPtr(vsapi->getWritePtr(frame, plane));
85 | }
86 | };
87 |
88 | switch (channels) {
89 | case ChannelMode::Y: // plane 0 only
90 | ptrs[0] = get_ptr(0);
91 | break;
92 | case ChannelMode::UV: // planes 1 and 2, stored at their own indices (slot 0 stays empty)
93 | ptrs[1] = get_ptr(1);
94 | ptrs[2] = get_ptr(2);
95 | break;
96 | case ChannelMode::YUV: // YUV and RGB both take all three planes
97 | case ChannelMode::RGB:
98 | ptrs[0] = get_ptr(0);
99 | ptrs[1] = get_ptr(1);
100 | ptrs[2] = get_ptr(2);
101 | break;
102 | }
103 |
104 | return ptrs;
105 | }
106 |
107 | static void VS_CC nlmInit( // init callback: publishes the video info of the input clip on `node`
108 | VSMap * in,
109 | VSMap * out,
110 | void ** instanceData,
111 | VSNode * node,
112 | VSCore * core,
113 | const VSAPI * vsapi
114 | ) noexcept {
115 |
116 | const auto * d = reinterpret_cast(*instanceData); // instance state; only d->node is read here
117 | vsapi->setVideoInfo(vsapi->getVideoInfo(d->node), 1, node); // the video info of d->node is passed through unchanged
118 | }
119 |
120 | static inline void nlmDistanceDispatch_f32( // calls the ispc *_f32 distance kernel that matches `channels`
121 | float * temp0,
122 | std::array centerp,
123 | std::array neighborp,
124 | int offset_x,
125 | int offset_y,
126 | int width,
127 | int height,
128 | int stride,
129 | ChannelMode channels
130 | ) noexcept {
131 |
132 | switch (channels) {
133 | case ChannelMode::Y: // uses array slot 0 only
134 | ispc::nlmDistanceLuma_f32(
135 | temp0,
136 | centerp[0],
137 | neighborp[0],
138 | offset_x, offset_y,
139 | width, height, stride
140 | );
141 | break;
142 | case ChannelMode::UV: // uses array slots 1 and 2
143 | ispc::nlmDistanceChroma_f32(
144 | temp0,
145 | centerp[1], centerp[2],
146 | neighborp[1], neighborp[2],
147 | offset_x, offset_y,
148 | width, height, stride
149 | );
150 | break;
151 | case ChannelMode::YUV: // all three slots, YUV-specific kernel
152 | ispc::nlmDistanceYUV_f32(
153 | temp0,
154 | centerp[0], centerp[1], centerp[2],
155 | neighborp[0], neighborp[1], neighborp[2],
156 | offset_x, offset_y,
157 | width, height, stride
158 | );
159 | break;
160 | case ChannelMode::RGB: // all three slots, RGB-specific kernel
161 | ispc::nlmDistanceRGB_f32(
162 | temp0,
163 | centerp[0], centerp[1], centerp[2],
164 | neighborp[0], neighborp[1], neighborp[2],
165 | offset_x, offset_y,
166 | width, height, stride
167 | );
168 | break;
169 | }
170 | }
171 |
172 | static inline void nlmDistanceDispatch_u8( // calls the ispc *_u8 distance kernel that matches `channels`; inv_divisor is forwarded unchanged
173 | float * temp0,
174 | std::array centerp,
175 | std::array neighborp,
176 | int offset_x,
177 | int offset_y,
178 | int width,
179 | int height,
180 | int stride,
181 | ChannelMode channels,
182 | float inv_divisor
183 | ) noexcept {
184 |
185 | switch (channels) {
186 | case ChannelMode::Y: // uses array slot 0 only
187 | ispc::nlmDistanceLuma_u8(
188 | temp0,
189 | centerp[0],
190 | neighborp[0],
191 | offset_x, offset_y,
192 | width, height, stride,
193 | inv_divisor
194 | );
195 | break;
196 | case ChannelMode::UV: // uses array slots 1 and 2
197 | ispc::nlmDistanceChroma_u8(
198 | temp0,
199 | centerp[1], centerp[2],
200 | neighborp[1], neighborp[2],
201 | offset_x, offset_y,
202 | width, height, stride,
203 | inv_divisor
204 | );
205 | break;
206 | case ChannelMode::YUV: // all three slots, YUV-specific kernel
207 | ispc::nlmDistanceYUV_u8(
208 | temp0,
209 | centerp[0], centerp[1], centerp[2],
210 | neighborp[0], neighborp[1], neighborp[2],
211 | offset_x, offset_y,
212 | width, height, stride,
213 | inv_divisor
214 | );
215 | break;
216 | case ChannelMode::RGB: // all three slots, RGB-specific kernel
217 | ispc::nlmDistanceRGB_u8(
218 | temp0,
219 | centerp[0], centerp[1], centerp[2],
220 | neighborp[0], neighborp[1], neighborp[2],
221 | offset_x, offset_y,
222 | width, height, stride,
223 | inv_divisor
224 | );
225 | break;
226 | }
227 | }
228 |
229 | static inline void nlmDistanceDispatch_u16( // calls the ispc *_u16 distance kernel that matches `channels`; inv_divisor is forwarded unchanged
230 | float * temp0,
231 | std::array centerp,
232 | std::array neighborp,
233 | int offset_x,
234 | int offset_y,
235 | int width,
236 | int height,
237 | int stride,
238 | ChannelMode channels,
239 | float inv_divisor
240 | ) noexcept {
241 |
242 | switch (channels) {
243 | case ChannelMode::Y: // uses array slot 0 only
244 | ispc::nlmDistanceLuma_u16(
245 | temp0,
246 | centerp[0],
247 | neighborp[0],
248 | offset_x, offset_y,
249 | width, height, stride,
250 | inv_divisor
251 | );
252 | break;
253 | case ChannelMode::UV: // uses array slots 1 and 2
254 | ispc::nlmDistanceChroma_u16(
255 | temp0,
256 | centerp[1], centerp[2],
257 | neighborp[1], neighborp[2],
258 | offset_x, offset_y,
259 | width, height, stride,
260 | inv_divisor
261 | );
262 | break;
263 | case ChannelMode::YUV: // all three slots, YUV-specific kernel
264 | ispc::nlmDistanceYUV_u16(
265 | temp0,
266 | centerp[0], centerp[1], centerp[2],
267 | neighborp[0], neighborp[1], neighborp[2],
268 | offset_x, offset_y,
269 | width, height, stride,
270 | inv_divisor
271 | );
272 | break;
273 | case ChannelMode::RGB: // all three slots, RGB-specific kernel
274 | ispc::nlmDistanceRGB_u16(
275 | temp0,
276 | centerp[0], centerp[1], centerp[2],
277 | neighborp[0], neighborp[1], neighborp[2],
278 | offset_x, offset_y,
279 | width, height, stride,
280 | inv_divisor
281 | );
282 | break;
283 | }
284 | }
285 |
286 | static inline void nlmDistance( // routes to the f32 / u8 / u16 distance dispatcher according to `bits`, casting the plane pointers on the way
287 | float * temp0,
288 | std::array centerp,
289 | std::array neighborp,
290 | int offset_x,
291 | int offset_y,
292 | int width,
293 | int height,
294 | int stride,
295 | ChannelMode channels,
296 | int bits
297 | ) noexcept {
298 |
299 | if (bits == 32) { // tested first, so 32 never reaches the integer branches
300 | nlmDistanceDispatch_f32(
301 | temp0,
302 | castPtrs(centerp), castPtrs(neighborp),
303 | offset_x, offset_y,
304 | width, height, stride, channels
305 | );
306 | } else if (bits <= 8) {
307 | float inv_divisor = 1.0f / ((1 << bits) - 1); // reciprocal of the largest value representable in `bits` bits
308 | nlmDistanceDispatch_u8(
309 | temp0,
310 | castPtrs(centerp), castPtrs(neighborp),
311 | offset_x, offset_y,
312 | width, height, stride, channels, inv_divisor
313 | );
314 | } else if (bits <= 16) { // 9..16 bits
315 | float inv_divisor = 1.0f / ((1 << bits) - 1); // reciprocal of the largest value representable in `bits` bits
316 | nlmDistanceDispatch_u16(
317 | temp0,
318 | castPtrs(centerp), castPtrs(neighborp),
319 | offset_x, offset_y,
320 | width, height, stride, channels, inv_divisor
321 | );
322 | } else {
323 | assert(false); // 17..31 and anything above 32 is not handled; with NDEBUG this branch does nothing
324 | }
325 | }
326 |
327 | static inline void nlmAccumulationDispatch_f32( // calls the ispc *_f32 accumulation kernel for 1, 2 or 3 channels depending on `channels`
328 | float * weightp,
329 | std::array wdstp,
330 | float * max_weightp,
331 | std::array srcp_bwd,
332 | std::array srcp_fwd,
333 | const float * temp_bwd,
334 | const float * temp_fwd,
335 | int offset_x,
336 | int offset_y,
337 | int width,
338 | int height,
339 | int stride,
340 | ChannelMode channels
341 | ) noexcept {
342 |
343 | switch (channels) {
344 | case ChannelMode::Y: // source slot 0 with wdstp[0]
345 | ispc::nlmAccumulationCh1_f32(
346 | weightp, wdstp[0], max_weightp,
347 | srcp_bwd[0],
348 | srcp_fwd[0],
349 | temp_bwd, temp_fwd,
350 | offset_x, offset_y,
351 | width, height, stride
352 | );
353 | break;
354 | case ChannelMode::UV: // source slots 1 and 2 are paired with wdstp[0] and wdstp[1] (the indices differ)
355 | ispc::nlmAccumulationCh2_f32(
356 | weightp, wdstp[0], wdstp[1], max_weightp,
357 | srcp_bwd[1], srcp_bwd[2],
358 | srcp_fwd[1], srcp_fwd[2],
359 | temp_bwd, temp_fwd,
360 | offset_x, offset_y,
361 | width, height, stride
362 | );
363 | break;
364 | case ChannelMode::YUV: // YUV and RGB share the 3-channel kernel
365 | case ChannelMode::RGB:
366 | ispc::nlmAccumulationCh3_f32(
367 | weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp,
368 | srcp_bwd[0], srcp_bwd[1], srcp_bwd[2],
369 | srcp_fwd[0], srcp_fwd[1], srcp_fwd[2],
370 | temp_bwd, temp_fwd,
371 | offset_x, offset_y,
372 | width, height, stride
373 | );
374 | break;
375 | }
376 | }
377 |
378 | static inline void nlmAccumulationDispatch_u8( // calls the ispc *_u8 accumulation kernel for 1, 2 or 3 channels depending on `channels`
379 | float * weightp,
380 | std::array wdstp,
381 | float * max_weightp,
382 | std::array srcp_bwd,
383 | std::array srcp_fwd,
384 | const float * temp_bwd,
385 | const float * temp_fwd,
386 | int offset_x,
387 | int offset_y,
388 | int width,
389 | int height,
390 | int stride,
391 | ChannelMode channels
392 | ) noexcept {
393 |
394 | switch (channels) {
395 | case ChannelMode::Y: // source slot 0 with wdstp[0]
396 | ispc::nlmAccumulationCh1_u8(
397 | weightp, wdstp[0], max_weightp,
398 | srcp_bwd[0],
399 | srcp_fwd[0],
400 | temp_bwd, temp_fwd,
401 | offset_x, offset_y,
402 | width, height, stride
403 | );
404 | break;
405 | case ChannelMode::UV: // source slots 1 and 2 are paired with wdstp[0] and wdstp[1] (the indices differ)
406 | ispc::nlmAccumulationCh2_u8(
407 | weightp, wdstp[0], wdstp[1], max_weightp,
408 | srcp_bwd[1], srcp_bwd[2],
409 | srcp_fwd[1], srcp_fwd[2],
410 | temp_bwd, temp_fwd,
411 | offset_x, offset_y,
412 | width, height, stride
413 | );
414 | break;
415 | case ChannelMode::YUV: // YUV and RGB share the 3-channel kernel
416 | case ChannelMode::RGB:
417 | ispc::nlmAccumulationCh3_u8(
418 | weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp,
419 | srcp_bwd[0], srcp_bwd[1], srcp_bwd[2],
420 | srcp_fwd[0], srcp_fwd[1], srcp_fwd[2],
421 | temp_bwd, temp_fwd,
422 | offset_x, offset_y,
423 | width, height, stride
424 | );
425 | break;
426 | }
427 | }
428 |
429 | static inline void nlmAccumulationDispatch_u16( // calls the ispc *_u16 accumulation kernel for 1, 2 or 3 channels depending on `channels`
430 | float * weightp,
431 | std::array wdstp,
432 | float * max_weightp,
433 | std::array srcp_bwd,
434 | std::array srcp_fwd,
435 | const float * temp_bwd,
436 | const float * temp_fwd,
437 | int offset_x,
438 | int offset_y,
439 | int width,
440 | int height,
441 | int stride,
442 | ChannelMode channels
443 | ) noexcept {
444 |
445 | switch (channels) {
446 | case ChannelMode::Y: // source slot 0 with wdstp[0]
447 | ispc::nlmAccumulationCh1_u16(
448 | weightp, wdstp[0], max_weightp,
449 | srcp_bwd[0],
450 | srcp_fwd[0],
451 | temp_bwd, temp_fwd,
452 | offset_x, offset_y,
453 | width, height, stride
454 | );
455 | break;
456 | case ChannelMode::UV: // source slots 1 and 2 are paired with wdstp[0] and wdstp[1] (the indices differ)
457 | ispc::nlmAccumulationCh2_u16(
458 | weightp, wdstp[0], wdstp[1], max_weightp,
459 | srcp_bwd[1], srcp_bwd[2],
460 | srcp_fwd[1], srcp_fwd[2],
461 | temp_bwd, temp_fwd,
462 | offset_x, offset_y,
463 | width, height, stride
464 | );
465 | break;
466 | case ChannelMode::YUV: // YUV and RGB share the 3-channel kernel
467 | case ChannelMode::RGB:
468 | ispc::nlmAccumulationCh3_u16(
469 | weightp, wdstp[0], wdstp[1], wdstp[2], max_weightp,
470 | srcp_bwd[0], srcp_bwd[1], srcp_bwd[2],
471 | srcp_fwd[0], srcp_fwd[1], srcp_fwd[2],
472 | temp_bwd, temp_fwd,
473 | offset_x, offset_y,
474 | width, height, stride
475 | );
476 | break;
477 | }
478 | }
479 |
480 | static inline void nlmAccumulation( // routes to the f32 / u8 / u16 accumulation dispatcher according to `bits`; only the source pointers are cast
481 | float * weightp,
482 | std::array wdstp,
483 | float * max_weightp,
484 | std::array srcp_bwd,
485 | std::array srcp_fwd,
486 | const float * temp_bwd,
487 | const float * temp_fwd,
488 | int offset_x,
489 | int offset_y,
490 | int width,
491 | int height,
492 | int stride,
493 | ChannelMode channels,
494 | int bits
495 | ) noexcept {
496 |
497 | if (bits == 32) { // tested first, so 32 never reaches the integer branches
498 | nlmAccumulationDispatch_f32(
499 | weightp, wdstp, max_weightp,
500 | castPtrs(srcp_bwd), castPtrs(srcp_fwd), temp_bwd, temp_fwd,
501 | offset_x, offset_y, width, height, stride, channels
502 | );
503 | } else if (bits <= 8) {
504 | nlmAccumulationDispatch_u8(
505 | weightp, wdstp, max_weightp,
506 | castPtrs(srcp_bwd), castPtrs(srcp_fwd), temp_bwd, temp_fwd,
507 | offset_x, offset_y, width, height, stride, channels
508 | );
509 | } else if (bits <= 16) { // 9..16 bits
510 | nlmAccumulationDispatch_u16(
511 | weightp, wdstp, max_weightp,
512 | castPtrs(srcp_bwd), castPtrs(srcp_fwd), temp_bwd, temp_fwd,
513 | offset_x, offset_y, width, height, stride, channels
514 | );
515 | } else {
516 | assert(false); // 17..31 and anything above 32 is not handled; with NDEBUG this branch does nothing
517 | }
518 | }
519 |
520 | static inline void nlmFinishDispatch_f32( // calls the ispc *_f32 finish kernel for 1, 2 or 3 channels depending on `channels`
521 | std::array dstp,
522 | std::array srcp,
523 | const float * weightp,
524 | std::array wdstp,
525 | const float * max_weightp,
526 | float wref,
527 | int width,
528 | int height,
529 | int stride,
530 | ChannelMode channels
531 | ) noexcept {
532 |
533 | switch (channels) {
534 | case ChannelMode::Y: // dst/src slot 0 with wdstp[0]
535 | ispc::nlmFinishCh1_f32(
536 | dstp[0],
537 | srcp[0],
538 | weightp, wdstp[0],
539 | max_weightp, wref,
540 | width, height, stride
541 | );
542 | break;
543 | case ChannelMode::UV: // dst/src slots 1 and 2 are paired with wdstp[0] and wdstp[1] (the indices differ)
544 | ispc::nlmFinishCh2_f32(
545 | dstp[1], dstp[2],
546 | srcp[1], srcp[2],
547 | weightp, wdstp[0], wdstp[1],
548 | max_weightp, wref,
549 | width, height, stride
550 | );
551 | break;
552 | case ChannelMode::YUV: // YUV and RGB share the 3-channel kernel
553 | case ChannelMode::RGB:
554 | ispc::nlmFinishCh3_f32(
555 | dstp[0], dstp[1], dstp[2],
556 | srcp[0], srcp[1], srcp[2],
557 | weightp, wdstp[0], wdstp[1], wdstp[2],
558 | max_weightp, wref,
559 | width, height, stride
560 | );
561 | break;
562 | }
563 | }
564 |
565 | static inline void nlmFinishDispatch_u8( // calls the ispc *_u8 finish kernel for 1, 2 or 3 channels depending on `channels`; `peak` is forwarded unchanged
566 | std::array dstp,
567 | std::array srcp,
568 | const float * weightp,
569 | std::array wdstp,
570 | const float * max_weightp,
571 | float wref,
572 | int width,
573 | int height,
574 | int stride,
575 | ChannelMode channels,
576 | int peak
577 | ) noexcept {
578 |
579 | switch (channels) {
580 | case ChannelMode::Y: // dst/src slot 0 with wdstp[0]
581 | ispc::nlmFinishCh1_u8(
582 | dstp[0],
583 | srcp[0],
584 | weightp, wdstp[0],
585 | max_weightp, wref,
586 | width, height, stride,
587 | peak
588 | );
589 | break;
590 | case ChannelMode::UV: // dst/src slots 1 and 2 are paired with wdstp[0] and wdstp[1] (the indices differ)
591 | ispc::nlmFinishCh2_u8(
592 | dstp[1], dstp[2],
593 | srcp[1], srcp[2],
594 | weightp, wdstp[0], wdstp[1],
595 | max_weightp, wref,
596 | width, height, stride,
597 | peak
598 | );
599 | break;
600 | case ChannelMode::YUV: // YUV and RGB share the 3-channel kernel
601 | case ChannelMode::RGB:
602 | ispc::nlmFinishCh3_u8(
603 | dstp[0], dstp[1], dstp[2],
604 | srcp[0], srcp[1], srcp[2],
605 | weightp, wdstp[0], wdstp[1], wdstp[2],
606 | max_weightp, wref,
607 | width, height, stride,
608 | peak
609 | );
610 | break;
611 | }
612 | }
613 |
614 | static inline void nlmFinishDispatch_u16( // calls the ispc *_u16 finish kernel for 1, 2 or 3 channels depending on `channels`; `peak` is forwarded unchanged
615 | std::array dstp,
616 | std::array srcp,
617 | const float * weightp,
618 | std::array wdstp,
619 | const float * max_weightp,
620 | float wref,
621 | int width,
622 | int height,
623 | int stride,
624 | ChannelMode channels,
625 | int peak
626 | ) noexcept {
627 |
628 | switch (channels) {
629 | case ChannelMode::Y: // dst/src slot 0 with wdstp[0]
630 | ispc::nlmFinishCh1_u16(
631 | dstp[0],
632 | srcp[0],
633 | weightp, wdstp[0],
634 | max_weightp, wref,
635 | width, height, stride,
636 | peak
637 | );
638 | break;
639 | case ChannelMode::UV: // dst/src slots 1 and 2 are paired with wdstp[0] and wdstp[1] (the indices differ)
640 | ispc::nlmFinishCh2_u16(
641 | dstp[1], dstp[2],
642 | srcp[1], srcp[2],
643 | weightp, wdstp[0], wdstp[1],
644 | max_weightp, wref,
645 | width, height, stride,
646 | peak
647 | );
648 | break;
649 | case ChannelMode::YUV: // YUV and RGB share the 3-channel kernel
650 | case ChannelMode::RGB:
651 | ispc::nlmFinishCh3_u16(
652 | dstp[0], dstp[1], dstp[2],
653 | srcp[0], srcp[1], srcp[2],
654 | weightp, wdstp[0], wdstp[1], wdstp[2],
655 | max_weightp, wref,
656 | width, height, stride,
657 | peak
658 | );
659 | break;
660 | }
661 | }
662 |
663 | static inline void nlmFinish( // routes to the f32 / u8 / u16 finish dispatcher according to `bits`, casting the dst/src plane pointers on the way
664 | std::array dstp,
665 | std::array srcp,
666 | const float * weightp,
667 | std::array wdstp,
668 | const float * max_weightp,
669 | float wref,
670 | int width,
671 | int height,
672 | int stride,
673 | ChannelMode channels,
674 | int bits
675 | ) noexcept {
676 |
677 | if (bits == 32) { // tested first, so 32 never reaches the integer branches
678 | nlmFinishDispatch_f32(
679 | castPtrs(dstp), castPtrs(srcp),
680 | weightp, wdstp, max_weightp, wref, width, height, stride, channels
681 | );
682 | } else if (bits <= 8) {
683 | int peak = (1 << bits) - 1; // largest value representable in `bits` bits
684 | nlmFinishDispatch_u8(
685 | castPtrs(dstp), castPtrs(srcp),
686 | weightp, wdstp, max_weightp, wref, width, height, stride, channels, peak
687 | );
688 | } else if (bits <= 16) { // 9..16 bits
689 | int peak = (1 << bits) - 1; // largest value representable in `bits` bits
690 | nlmFinishDispatch_u16(
691 | castPtrs(dstp), castPtrs(srcp),
692 | weightp, wdstp, max_weightp, wref, width, height, stride, channels, peak
693 | );
694 | } else {
695 | assert(false); // 17..31 and anything above 32 is not handled; with NDEBUG this branch does nothing
696 | }
697 | }
698 |
699 | static const VSFrameRef *VS_CC nlmGetFrame(
700 | int n,
701 | int activationReason,
702 | void ** instanceData,
703 | void ** frameData,
704 | VSFrameContext * frameCtx,
705 | VSCore * core,
706 | const VSAPI * vsapi
707 | ) noexcept {
708 | // Frame callback. On arInitial it requests frames [n - d, n + d] (clamped to the clip) and returns nullptr; on arAllFramesReady it builds and returns output frame n.
709 | auto * d = reinterpret_cast(*instanceData);
710 |
711 | if (activationReason == arInitial) {
712 | int start = std::max(0, n - d->d);
713 | int end = std::min(n + d->d, d->vi->numFrames - 1);
714 | for (int i = start; i <= end; i++) {
715 | vsapi->requestFrameFilter(i, d->node, frameCtx);
716 | if (d->ref_node) {
717 | vsapi->requestFrameFilter(i, d->ref_node, frameCtx);
718 | }
719 | }
720 | return nullptr;
721 | } else if (activationReason != arAllFramesReady) {
722 | return nullptr;
723 | }
724 |
725 | // activationReason == arAllFramesReady
726 |
727 | int nlm_d = d->d;
728 | int nlm_a = d->a;
729 | int nlm_s = d->s;
730 | float nlm_h2_inv_norm = square(255.0f) / (3.0f * square(d->h) * square(2 * nlm_s + 1));
731 | float nlm_wref = d->wref;
732 | ChannelMode channels = d->channels;
733 | // without a reference clip, the source clip is also used as the reference
734 | const auto & ref_node = d->ref_node ? d->ref_node : d->node;
735 | auto ref_frame = vsapi->getFrameFilter(n, ref_node, frameCtx);
736 |
737 | int bits = d->vi->format->bitsPerSample;
738 | int width, height, stride; // dimensions of the plane to be processed, not the video dimension
739 | if (channels == ChannelMode::UV) {
740 | width = d->vi->width >> d->vi->format->subSamplingW;
741 | height = d->vi->height >> d->vi->format->subSamplingH;
742 | stride = vsapi->getStride(ref_frame, 1) / d->vi->format->bytesPerSample;
743 | } else {
744 | width = d->vi->width;
745 | height = d->vi->height;
746 | stride = vsapi->getStride(ref_frame, 0) / d->vi->format->bytesPerSample;
747 | }
748 | // stride is expressed in samples (byte stride divided by bytesPerSample), not in bytes
749 | int size = height * stride; // size of each plane in quad-bytes
750 | // number of input channels
751 | int num_input_channels = [channels]() {
752 | if (channels == ChannelMode::Y) {
753 | return 1;
754 | } else if (channels == ChannelMode::UV) {
755 | return 2;
756 | } else {
757 | // channels == ChannelMode::YUV || channels == ChannelMode::RGB
758 | return 3;
759 | }
760 | }();
761 | // size in quad-bytes: size * (4 + num_input_channels + (nlm_d != 0)) + width
762 | float * workspace; // scratch memory cached per calling thread in d->workspaces (keyed by thread id)
763 | {
764 | auto thread_id = std::this_thread::get_id();
765 | d->workspaces_lock.lock_shared();
766 | bool init = true;
767 | try {
768 | const auto & const_workspaces = d->workspaces;
769 | workspace = const_workspaces.at(thread_id);
770 | } catch (const std::out_of_range &) {
771 | init = false;
772 | }
773 | d->workspaces_lock.unlock_shared();
774 | // no entry for this thread yet: allocate a workspace and register it
775 | if (!init) {
776 | auto workspace_size = size * (4 + num_input_channels + (nlm_d != 0)) + width;
777 | auto workspace_bytes = workspace_size * sizeof(float);
778 | workspace = vs_aligned_malloc(workspace_bytes, 256);
779 |
780 | if (!workspace) {
781 | vsapi->freeFrame(ref_frame);
782 | vsapi->setFilterError("nlm_ispc: malloc() failed", frameCtx);
783 | return nullptr;
784 | }
785 | // the insertion takes the lock through std::lock_guard; the lookup above used lock_shared()
786 | std::lock_guard _ { d->workspaces_lock };
787 | d->workspaces.emplace(thread_id, workspace);
788 | }
789 | }
790 | // workspace layout, in units of `size` floats: [weight][wdst x num_input_channels][max_weight][temp][temp_bwd][temp_fwd if nlm_d != 0], followed by `width` floats of `buffer`
791 | // zero-initialize aggregation buffers
792 | std::memset(workspace, 0, (1 + num_input_channels) * size * sizeof(float));
793 | // stores the sum of weights of each pixel
794 | float * weightp = workspace;
795 | std::array wdstp {
796 | // stores the weighted sum of pixel values of the first processed plane
797 | workspace + size,
798 | // stores the weighted sum of pixel values of the second processed plane
799 | num_input_channels <= 1 ? nullptr : workspace + 2 * size,
800 | // stores the weighted sum of pixel values of the third processed plane
801 | num_input_channels <= 2 ? nullptr : workspace + 3 * size
802 | };
803 |
804 | // stores the maximum weight encountered of each pixel
805 | float * max_weightp = workspace + (1 + num_input_channels) * size;
806 | for (int i = 0; i < size; i++) {
807 | max_weightp[i] = std::numeric_limits::epsilon();
808 | }
809 |
810 | // temporary storage for the calculation of patch distances
811 | float * temp = workspace + (2 + num_input_channels) * size;
812 | float * temp_bwd = workspace + (3 + num_input_channels) * size;
813 | float * temp_fwd = nlm_d == 0 ? nullptr : workspace + (4 + num_input_channels) * size;
814 |
815 | // buffer for the vertical box filter during patch distance calculation
816 | // size in quad-bytes: width
817 | float * buffer = workspace + (4 + num_input_channels + (nlm_d != 0)) * size;
818 |
819 | std::array refp { getPtrs(ref_frame, channels, vsapi) };
820 | // frames are visited in pairs: bwd_n = n + i and fwd_n = n - i for i in [-nlm_d, 0], both clamped to the clip's frame range
821 | for (int i = -nlm_d; i <= 0; i++) {
822 | auto bwd_n = std::max(n + i, 0);
823 | auto fwd_n = std::min(n - i, d->vi->numFrames - 1);
824 | auto src_frame_bwd = vsapi->getFrameFilter(bwd_n, d->node, frameCtx);
825 | auto src_frame_fwd = vsapi->getFrameFilter(fwd_n, d->node, frameCtx);
826 | auto ref_frame_bwd = vsapi->getFrameFilter(bwd_n, ref_node, frameCtx);
827 | auto ref_frame_fwd = vsapi->getFrameFilter(fwd_n, ref_node, frameCtx);
828 |
829 | std::array srcp_bwd { getPtrs(src_frame_bwd, channels, vsapi) };
830 | std::array srcp_fwd { getPtrs(src_frame_fwd, channels, vsapi) };
831 | std::array refp_bwd { getPtrs(ref_frame_bwd, channels, vsapi) };
832 | std::array refp_fwd { getPtrs(ref_frame_fwd, channels, vsapi) };
833 |
834 | for (int offset_y = -nlm_a; offset_y <= nlm_a; offset_y++) {
835 | for (int offset_x = -nlm_a; offset_x <= nlm_a; offset_x++) {
836 | if (i * square(2 * nlm_a + 1) + offset_y * (2 * nlm_a + 1) + offset_x >= 0) { // only (i, offset_y, offset_x) triples ordered before (0, 0, 0) are processed
837 | continue;
838 | }
839 | // NOTE(review): the skipped half is presumably covered by the mirrored "fwd" terms handed to nlmAccumulation -- confirm in the kernel
840 | nlmDistance(
841 | temp_bwd,
842 | refp, refp_bwd,
843 | offset_x, offset_y, width, height, stride, channels, bits
844 | );
845 |
846 | ispc::nlmHorizontal(
847 | temp,
848 | temp_bwd,
849 | nlm_s, width, height, stride
850 | );
851 |
852 | d->nlm_vertical(
853 | temp_bwd,
854 | temp,
855 | nlm_s, nlm_h2_inv_norm, width, height, stride, buffer
856 | );
857 |
858 | // jump at the end of this basic block
859 | if (i == 0) {
860 | // bwd == fwd
861 | nlmAccumulation(
862 | weightp, wdstp, max_weightp,
863 | srcp_bwd, srcp_bwd, temp_bwd, temp_bwd,
864 | offset_x, offset_y, width, height, stride, channels, bits
865 | );
866 | continue;
867 | }
868 |
869 | // i != 0
870 | nlmDistance(
871 | temp_fwd,
872 | refp_fwd, refp,
873 | offset_x, offset_y, width, height, stride, channels, bits
874 | );
875 |
876 | ispc::nlmHorizontal(
877 | temp,
878 | temp_fwd,
879 | nlm_s, width, height, stride
880 | );
881 |
882 | d->nlm_vertical(
883 | temp_fwd,
884 | temp,
885 | nlm_s, nlm_h2_inv_norm, width, height, stride, buffer
886 | );
887 |
888 | nlmAccumulation(
889 | weightp, wdstp, max_weightp,
890 | srcp_bwd, srcp_fwd, temp_bwd, temp_fwd,
891 | offset_x, offset_y, width, height, stride, channels, bits
892 | );
893 | }
894 | }
895 | // the four frame references fetched for this value of i are released before the next iteration
896 | vsapi->freeFrame(src_frame_fwd);
897 | vsapi->freeFrame(src_frame_bwd);
898 | vsapi->freeFrame(ref_frame_fwd);
899 | vsapi->freeFrame(ref_frame_bwd);
900 | }
901 |
902 | vsapi->freeFrame(ref_frame);
903 |
904 | auto src_frame = vsapi->getFrameFilter(n, d->node, frameCtx);
905 | std::array srcp { getPtrs(src_frame, channels, vsapi) };
906 | // NOTE(review): for Y-only / UV-only processing of a multi-plane format, the planes whose `fr` entry is src_frame are presumably taken from the source by newVideoFrame2 -- confirm against the VapourSynth API
907 | VSFrameRef * dst_frame;
908 | if (channels == ChannelMode::Y && d->vi->format->numPlanes > 1) {
909 | const VSFrameRef * fr[3] { nullptr, src_frame, src_frame };
910 | constexpr int pl[3] { 0, 1, 2 };
911 | dst_frame = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src_frame, core);
912 | } else if (channels == ChannelMode::UV && d->vi->format->numPlanes > 1) {
913 | const VSFrameRef * fr[3] { src_frame, nullptr, nullptr };
914 | constexpr int pl[3] { 0, 1, 2 };
915 | dst_frame = vsapi->newVideoFrame2(d->vi->format, d->vi->width, d->vi->height, fr, pl, src_frame, core);
916 | } else {
917 | dst_frame = vsapi->newVideoFrame(d->vi->format, d->vi->width, d->vi->height, src_frame, core);
918 | }
919 | std::array dstp { getPtrs(dst_frame, channels, vsapi) };
920 | // final pass: writes the processed planes of dst_frame from the aggregation buffers and the source frame
921 | nlmFinish(dstp, srcp, weightp, wdstp, max_weightp, nlm_wref, width, height, stride, channels, bits);
922 |
923 | vsapi->freeFrame(src_frame);
924 |
925 | return dst_frame;
926 | }
927 |
928 | static void VS_CC nlmFree(
929 |     void * instanceData,
930 |     VSCore * core,
931 |     const VSAPI * vsapi
932 | ) noexcept {
933 |     // Filter destructor: releases the clip references, every cached workspace and the instance data itself.
934 |     auto * data = reinterpret_cast(instanceData);
935 |
936 |     // each workspace is released with vs_aligned_free
937 |     for (const auto & entry : data->workspaces) {
938 |         vs_aligned_free(entry.second);
939 |     }
940 |
941 |     vsapi->freeNode(data->node);
942 |     if (data->ref_node) {
943 |         vsapi->freeNode(data->ref_node);
944 |     }
945 |     delete data;
946 | }
947 |
948 | static void VS_CC nlmCreate(
949 |     const VSMap * in,
950 |     VSMap * out,
951 |     void * userData,
952 |     VSCore * core,
953 |     const VSAPI * vsapi
954 | ) noexcept {
955 |     // Filter constructor: validates the arguments in `in`, fills the instance data and creates the "NLMeans" filter; every failure is reported through `out`.
956 |     auto d = std::make_unique();
957 |
958 |     d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
959 |     d->vi = vsapi->getVideoInfo(d->node);
960 |
961 |     auto set_error = [vsapi, out, &d](const char * error_message) -> void {
962 |         vsapi->setError(out, error_message);
963 |         vsapi->freeNode(d->node);
964 |     };
965 |     if (!isConstantFormat(d->vi)) { return set_error("only constant format input supported"); } // d->vi->format and the frame dimensions are used unconditionally below
966 |     if ((d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
967 |         (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32)
968 |     ) {
969 |         return set_error("only 1-16 bit integer or 32-bit float supported");
970 |     }
971 |
972 |     int err;
973 |     // optional arguments: when the lookup reports an error (err != 0) the default is used instead
974 |     d->d = int64ToIntS(vsapi->propGetInt(in, "d", 0, &err));
975 |     if (err) {
976 |         d->d = 1;
977 |     }
978 |     if (d->d < 0) {
979 |         return set_error("\"d\" must be non-negative");
980 |     }
981 |
982 |     d->a = int64ToIntS(vsapi->propGetInt(in, "a", 0, &err));
983 |     if (err) {
984 |         d->a = 2;
985 |     }
986 |     if (d->a <= 0) {
987 |         return set_error("\"a\" must be positive");
988 |     }
989 |
990 |     d->s = int64ToIntS(vsapi->propGetInt(in, "s", 0, &err));
991 |     if (err) {
992 |         d->s = 4;
993 |     }
994 |     if (d->s < 0) {
995 |         return set_error("\"s\" must be non-negative");
996 |     }
997 |
998 |     d->h = static_cast(vsapi->propGetFloat(in, "h", 0, &err));
999 |     if (err) {
1000 |         d->h = 1.2f;
1001 |     }
1002 |     if (d->h <= 0.0f) {
1003 |         return set_error("\"h\" must be positive");
1004 |     }
1005 |
1006 |     auto wmode = vsapi->propGetInt(in, "wmode", 0, &err);
1007 |     if (err) {
1008 |         wmode = 0;
1009 |     }
1010 |     if (wmode < 0 || wmode > 3) {
1011 |         return set_error("\"wmode\" must be 0, 1, 2 or 3");
1012 |     }
1013 |     decltype(d->nlm_vertical) nlmVerticalKernels[] {
1014 |         &ispc::nlmVerticalWelsch,
1015 |         &ispc::nlmVerticalBisquareA,
1016 |         &ispc::nlmVerticalBisquareB,
1017 |         &ispc::nlmVerticalBisquareC
1018 |     };
1019 |     d->nlm_vertical = nlmVerticalKernels[wmode]; // wmode was range-checked above, so the index is within the table
1020 |
1021 |     auto channels = vsapi->propGetData(in, "channels", 0, &err);
1022 |     if (err) {
1023 |         channels = "AUTO";
1024 |     }
1025 |     auto channels_len = std::strlen(channels);
1026 |     if (channels_len == 1 && *channels == 'Y') {
1027 |         d->channels = ChannelMode::Y;
1028 |     } else if (channels_len == 2 && std::strncmp(channels, "UV", 2) == 0) {
1029 |         d->channels = ChannelMode::UV;
1030 |     } else if (channels_len == 3 && std::strncmp(channels, "YUV", 3) == 0) {
1031 |         d->channels = ChannelMode::YUV;
1032 |     } else if (channels_len == 3 && std::strncmp(channels, "RGB", 3) == 0) {
1033 |         d->channels = ChannelMode::RGB;
1034 |     } else if (channels_len == 4 && std::strncmp(channels, "AUTO", 4) == 0) {
1035 |         if (d->vi->format->colorFamily == cmRGB) {
1036 |             d->channels = ChannelMode::RGB;
1037 |         } else {
1038 |             d->channels = ChannelMode::Y;
1039 |         }
1040 |     } else {
1041 |         return set_error("\"channels\" must be \"Y\", \"UV\", \"YUV\", \"RGB\" or \"AUTO\"");
1042 |     }
1043 |     // the selected channel mode must be compatible with the clip's color family
1044 |     if (d->channels == ChannelMode::Y) {
1045 |         if (d->vi->format->colorFamily != cmGray && d->vi->format->colorFamily != cmYUV) {
1046 |             return set_error("color family must be Gray or YUV for \"channels\" == \"Y\"");
1047 |         }
1048 |     } else if (d->channels == ChannelMode::UV) {
1049 |         if (d->vi->format->colorFamily != cmYUV) {
1050 |             return set_error("color family must be YUV for \"channels\" == \"UV\"");
1051 |         }
1052 |     } else if (d->channels == ChannelMode::YUV) {
1053 |         if (d->vi->format->colorFamily != cmYUV || d->vi->format->subSamplingW || d->vi->format->subSamplingH) {
1054 |             return set_error("color family must be YUV444 for \"channels\" == \"YUV\"");
1055 |         }
1056 |     } else if (d->channels == ChannelMode::RGB) {
1057 |         if (d->vi->format->colorFamily != cmRGB) {
1058 |             return set_error("color family must be RGB for \"channels\" == \"RGB\"");
1059 |         }
1060 |     }
1061 |
1062 |     d->wref = static_cast(vsapi->propGetFloat(in, "wref", 0, &err));
1063 |     if (err) {
1064 |         d->wref = 1.0f;
1065 |     }
1066 |     // optional reference clip; it must match "clip" in format and in number of frames
1067 |     d->ref_node = vsapi->propGetNode(in, "rclip", 0, &err);
1068 |     if (err) {
1069 |         d->ref_node = nullptr;
1070 |     }
1071 |     if (d->ref_node) {
1072 |         const auto ref_vi = vsapi->getVideoInfo(d->ref_node);
1073 |         if (!isSameFormat(d->vi, ref_vi) || d->vi->numFrames != ref_vi->numFrames) {
1074 |             vsapi->freeNode(d->ref_node); // set_error only releases d->node
1075 |             return set_error("\"rclip\" must be of the same format as \"clip\"");
1076 |         }
1077 |     }
1078 |     // reserve one workspace slot per core thread up front
1079 |     VSCoreInfo core_info;
1080 |     vsapi->getCoreInfo2(core, &core_info);
1081 |     d->workspaces.reserve(core_info.numThreads);
1082 |     // d.release(): the instance data is handed to the filter as a raw pointer from here on
1083 |     vsapi->createFilter(
1084 |         in, out,
1085 |         "NLMeans", nlmInit, nlmGetFrame, nlmFree,
1086 |         fmParallel, 0, d.release(), core
1087 |     );
1088 | }
1089 |
1090 | VS_EXTERNAL_API(void) VapourSynthPluginInit(
1091 |     VSConfigPlugin configFunc,
1092 |     VSRegisterFunction registerFunc,
1093 |     VSPlugin * plugin
1094 | ) noexcept {
1095 |     // Plugin entry point: declares the plugin identity, then registers "NLMeans" and "Version".
1096 |     constexpr const char * plugin_id = "io.github.amusementclub.vs-nlm-ispc";
1097 |     constexpr const char * plugin_namespace = "nlm_ispc";
1098 |     constexpr const char * plugin_description = "Non-local means denoise filter implemented in ISPC";
1099 |     configFunc(plugin_id, plugin_namespace, plugin_description, VAPOURSYNTH_API_VERSION, 1, plugin);
1100 |
1101 |     // argument specification of NLMeans: every argument after "clip" is marked optional
1102 |     constexpr const char * nlmeans_args =
1103 |         "clip:clip;"
1104 |         "d:int:opt;"
1105 |         "a:int:opt;"
1106 |         "s:int:opt;"
1107 |         "h:float:opt;"
1108 |         "channels:data:opt;"
1109 |         "wmode:int:opt;"
1110 |         "wref:float:opt;"
1111 |         "rclip:clip:opt;";
1112 |     registerFunc("NLMeans", nlmeans_args, nlmCreate, nullptr, plugin);
1113 |
1114 |     // "Version" takes no arguments and stores the VERSION string under the "version" key of its output map
1115 |     auto report_version = [](
1116 |         const VSMap *, VSMap * out, void *, VSCore *, const VSAPI * vsapi
1117 |     ) {
1118 |         vsapi->propSetData(out, "version", VERSION, -1, paReplace);
1119 |     };
1120 |
1121 |     registerFunc("Version", "", report_version, nullptr, plugin);
1122 | }
1123 |
--------------------------------------------------------------------------------
/source/nlm.ispc:
--------------------------------------------------------------------------------
1 | // based on KNLMeansCL by Khanattila and vs-boxblur
2 |
3 | #define CLAMPX(col) clamp((col), 0, width - 1) // limits a column index to [0, width - 1]; uses the `width` visible at the expansion site
4 | #define CLAMPY(row) clamp((row), 0, height - 1) // limits a row index to [0, height - 1]; uses the `height` visible at the expansion site
5 |
6 | static inline uniform float square(uniform float x) { // uniform overload: returns x * x
7 |     return (x * x);
8 | }
9 |
10 | static inline float square(float x) { // varying overload: returns x * x for each program instance
11 |     return (x * x);
12 | }
13 |
14 | export void nlmDistanceLuma_f32(
15 |     uniform float temp0[], // shape: (height, stride)
16 |     uniform const float centerp[], // shape: (height, stride)
17 |     uniform const float neighborp[], // shape: (height, stride)
18 |     uniform int offset_x,
19 |     uniform int offset_y,
20 |     uniform int width,
21 |     uniform int height,
22 |     uniform int stride
23 | ) {
24 |     // For every pixel (x, y) stores 3 * (centerp[y][x] - neighborp[y + offset_y][x + offset_x])^2 in temp0, with the neighbor coordinates clamped to the plane.
25 |     uniform int start_x = min(abs(offset_x), width);
26 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
27 |
28 |     for (uniform int y = 0; y < height; y++) {
29 |         for (uniform int x = 0; x < start_x; x++) {
30 |             uniform int idx = y * stride + x;
31 |             uniform float u1 = centerp[idx];
32 |
33 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
34 |             uniform float u1_pq = neighborp[neighbor_idx];
35 |
36 |             temp0[idx] = 3.0f * square(u1 - u1_pq);
37 |         }
38 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
39 |         foreach (x = start_x ... end_x) {
40 |             int idx = y * stride + x;
41 |             float u1 = centerp[idx];
42 |
43 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
44 |             float u1_pq = neighborp[neighbor_idx];
45 |
46 |             temp0[idx] = 3.0f * square(u1 - u1_pq);
47 |         }
48 |         // remaining border columns, scalar path with both coordinates clamped
49 |         for (uniform int x = end_x; x < width; x++) {
50 |             uniform int idx = y * stride + x;
51 |             uniform float u1 = centerp[idx];
52 |
53 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
54 |             uniform float u1_pq = neighborp[neighbor_idx];
55 |
56 |             temp0[idx] = 3.0f * square(u1 - u1_pq);
57 |         }
58 |     }
59 | }
60 |
61 | export void nlmDistanceLuma_u8(
62 |     uniform float temp0[], // shape: (height, stride)
63 |     uniform const unsigned int8 centerp[], // shape: (height, stride)
64 |     uniform const unsigned int8 neighborp[], // shape: (height, stride)
65 |     uniform int offset_x,
66 |     uniform int offset_y,
67 |     uniform int width,
68 |     uniform int height,
69 |     uniform int stride,
70 |     uniform float inv_divisor
71 | ) {
72 |     // For every pixel stores 3 * (center - shifted neighbor)^2 * inv_divisor^2 in temp0; samples are 8-bit unsigned and neighbor coordinates are clamped to the plane.
73 |     uniform int start_x = min(abs(offset_x), width);
74 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
75 |
76 |     uniform float sq_inv_divisor = square(inv_divisor);
77 |
78 |     for (uniform int y = 0; y < height; y++) {
79 |         for (uniform int x = 0; x < start_x; x++) {
80 |             uniform int idx = y * stride + x;
81 |             uniform float u1 = centerp[idx];
82 |
83 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
84 |             uniform float u1_pq = neighborp[neighbor_idx];
85 |
86 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
87 |         }
88 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
89 |         foreach (x = start_x ... end_x) {
90 |             int idx = y * stride + x;
91 |             float u1 = centerp[idx];
92 |
93 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
94 |             float u1_pq = neighborp[neighbor_idx];
95 |
96 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
97 |         }
98 |         // remaining border columns, scalar path with both coordinates clamped
99 |         for (uniform int x = end_x; x < width; x++) {
100 |             uniform int idx = y * stride + x;
101 |             uniform float u1 = centerp[idx];
102 |
103 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
104 |             uniform float u1_pq = neighborp[neighbor_idx];
105 |
106 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
107 |         }
108 |     }
109 | }
110 |
111 | export void nlmDistanceLuma_u16(
112 |     uniform float temp0[], // shape: (height, stride)
113 |     uniform const unsigned int16 centerp[], // shape: (height, stride)
114 |     uniform const unsigned int16 neighborp[], // shape: (height, stride)
115 |     uniform int offset_x,
116 |     uniform int offset_y,
117 |     uniform int width,
118 |     uniform int height,
119 |     uniform int stride,
120 |     uniform float inv_divisor
121 | ) {
122 |     // For every pixel stores 3 * (center - shifted neighbor)^2 * inv_divisor^2 in temp0; samples are 16-bit unsigned and neighbor coordinates are clamped to the plane.
123 |     uniform int start_x = min(abs(offset_x), width);
124 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
125 |
126 |     uniform float sq_inv_divisor = square(inv_divisor);
127 |
128 |     for (uniform int y = 0; y < height; y++) {
129 |         for (uniform int x = 0; x < start_x; x++) {
130 |             uniform int idx = y * stride + x;
131 |             uniform float u1 = centerp[idx];
132 |
133 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
134 |             uniform float u1_pq = neighborp[neighbor_idx];
135 |
136 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
137 |         }
138 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
139 |         foreach (x = start_x ... end_x) {
140 |             int idx = y * stride + x;
141 |             float u1 = centerp[idx];
142 |
143 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
144 |             float u1_pq = neighborp[neighbor_idx];
145 |
146 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
147 |         }
148 |         // remaining border columns, scalar path with both coordinates clamped
149 |         for (uniform int x = end_x; x < width; x++) {
150 |             uniform int idx = y * stride + x;
151 |             uniform float u1 = centerp[idx];
152 |
153 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
154 |             uniform float u1_pq = neighborp[neighbor_idx];
155 |
156 |             temp0[idx] = 3.0f * square(u1 - u1_pq) * sq_inv_divisor;
157 |         }
158 |     }
159 | }
160 |
161 | export void nlmDistanceChroma_f32(
162 |     uniform float temp0[], // shape: (height, stride)
163 |     uniform const float centerp1[], // shape: (height, stride)
164 |     uniform const float centerp2[], // shape: (height, stride)
165 |     uniform const float neighborp1[], // shape: (height, stride)
166 |     uniform const float neighborp2[], // shape: (height, stride)
167 |     uniform int offset_x,
168 |     uniform int offset_y,
169 |     uniform int width,
170 |     uniform int height,
171 |     uniform int stride
172 | ) {
173 |     // For every pixel stores 1.5 * (sum over the two planes of (center - shifted neighbor)^2) in temp0; neighbor coordinates are clamped to the plane.
174 |     uniform int start_x = min(abs(offset_x), width);
175 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
176 |
177 |     for (uniform int y = 0; y < height; y++) {
178 |         for (uniform int x = 0; x < start_x; x++) {
179 |             uniform int idx = y * stride + x;
180 |             uniform float u1_1 = centerp1[idx];
181 |             uniform float u1_2 = centerp2[idx];
182 |
183 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
184 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
185 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
186 |
187 |             uniform float dst = 1.5f * (
188 |                 square(u1_1 - u1_pq_1) +
189 |                 square(u1_2 - u1_pq_2)
190 |             );
191 |
192 |             temp0[idx] = dst;
193 |         }
194 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
195 |         foreach (x = start_x ... end_x) {
196 |             int idx = y * stride + x;
197 |             float u1_1 = centerp1[idx];
198 |             float u1_2 = centerp2[idx];
199 |
200 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
201 |             float u1_pq_1 = neighborp1[neighbor_idx];
202 |             float u1_pq_2 = neighborp2[neighbor_idx];
203 |
204 |             float dst = 1.5f * (
205 |                 square(u1_1 - u1_pq_1) +
206 |                 square(u1_2 - u1_pq_2)
207 |             );
208 |
209 |             temp0[idx] = dst;
210 |         }
211 |         // remaining border columns, scalar path with both coordinates clamped
212 |         for (uniform int x = end_x; x < width; x++) {
213 |             uniform int idx = y * stride + x;
214 |             uniform float u1_1 = centerp1[idx];
215 |             uniform float u1_2 = centerp2[idx];
216 |
217 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
218 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
219 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
220 |
221 |             uniform float dst = 1.5f * (
222 |                 square(u1_1 - u1_pq_1) +
223 |                 square(u1_2 - u1_pq_2)
224 |             );
225 |
226 |             temp0[idx] = dst;
227 |         }
228 |     }
229 | }
230 |
231 | export void nlmDistanceChroma_u8(
232 |     uniform float temp0[], // shape: (height, stride)
233 |     uniform const unsigned int8 centerp1[], // shape: (height, stride)
234 |     uniform const unsigned int8 centerp2[], // shape: (height, stride)
235 |     uniform const unsigned int8 neighborp1[], // shape: (height, stride)
236 |     uniform const unsigned int8 neighborp2[], // shape: (height, stride)
237 |     uniform int offset_x,
238 |     uniform int offset_y,
239 |     uniform int width,
240 |     uniform int height,
241 |     uniform int stride,
242 |     uniform float inv_divisor
243 | ) {
244 |     // For every pixel stores 1.5 * (sum over the two 8-bit planes of (center - shifted neighbor)^2) * inv_divisor^2 in temp0; neighbor coordinates are clamped to the plane.
245 |     uniform int start_x = min(abs(offset_x), width);
246 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
247 |
248 |     uniform float sq_inv_divisor = square(inv_divisor);
249 |
250 |     for (uniform int y = 0; y < height; y++) {
251 |         for (uniform int x = 0; x < start_x; x++) {
252 |             uniform int idx = y * stride + x;
253 |             uniform float u1_1 = centerp1[idx];
254 |             uniform float u1_2 = centerp2[idx];
255 |
256 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
257 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
258 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
259 |
260 |             uniform float dst = 1.5f * (
261 |                 square(u1_1 - u1_pq_1) +
262 |                 square(u1_2 - u1_pq_2)
263 |             );
264 |
265 |             temp0[idx] = dst * sq_inv_divisor;
266 |         }
267 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
268 |         foreach (x = start_x ... end_x) {
269 |             int idx = y * stride + x;
270 |             float u1_1 = centerp1[idx];
271 |             float u1_2 = centerp2[idx];
272 |
273 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
274 |             float u1_pq_1 = neighborp1[neighbor_idx];
275 |             float u1_pq_2 = neighborp2[neighbor_idx];
276 |
277 |             float dst = 1.5f * (
278 |                 square(u1_1 - u1_pq_1) +
279 |                 square(u1_2 - u1_pq_2)
280 |             );
281 |
282 |             temp0[idx] = dst * sq_inv_divisor;
283 |         }
284 |         // remaining border columns, scalar path with both coordinates clamped
285 |         for (uniform int x = end_x; x < width; x++) {
286 |             uniform int idx = y * stride + x;
287 |             uniform float u1_1 = centerp1[idx];
288 |             uniform float u1_2 = centerp2[idx];
289 |
290 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
291 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
292 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
293 |
294 |             uniform float dst = 1.5f * (
295 |                 square(u1_1 - u1_pq_1) +
296 |                 square(u1_2 - u1_pq_2)
297 |             );
298 |
299 |             temp0[idx] = dst * sq_inv_divisor;
300 |         }
301 |     }
302 | }
303 |
304 | export void nlmDistanceChroma_u16(
305 |     uniform float temp0[], // shape: (height, stride)
306 |     uniform const unsigned int16 centerp1[], // shape: (height, stride)
307 |     uniform const unsigned int16 centerp2[], // shape: (height, stride)
308 |     uniform const unsigned int16 neighborp1[], // shape: (height, stride)
309 |     uniform const unsigned int16 neighborp2[], // shape: (height, stride)
310 |     uniform int offset_x,
311 |     uniform int offset_y,
312 |     uniform int width,
313 |     uniform int height,
314 |     uniform int stride,
315 |     uniform float inv_divisor
316 | ) {
317 |     // For every pixel stores 1.5 * (sum over the two 16-bit planes of (center - shifted neighbor)^2) * inv_divisor^2 in temp0; neighbor coordinates are clamped to the plane.
318 |     uniform int start_x = min(abs(offset_x), width);
319 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
320 |
321 |     uniform float sq_inv_divisor = square(inv_divisor);
322 |
323 |     for (uniform int y = 0; y < height; y++) {
324 |         for (uniform int x = 0; x < start_x; x++) {
325 |             uniform int idx = y * stride + x;
326 |             uniform float u1_1 = centerp1[idx];
327 |             uniform float u1_2 = centerp2[idx];
328 |
329 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
330 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
331 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
332 |
333 |             uniform float dst = 1.5f * (
334 |                 square(u1_1 - u1_pq_1) +
335 |                 square(u1_2 - u1_pq_2)
336 |             );
337 |
338 |             temp0[idx] = dst * sq_inv_divisor;
339 |         }
340 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
341 |         foreach (x = start_x ... end_x) {
342 |             int idx = y * stride + x;
343 |             float u1_1 = centerp1[idx];
344 |             float u1_2 = centerp2[idx];
345 |
346 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
347 |             float u1_pq_1 = neighborp1[neighbor_idx];
348 |             float u1_pq_2 = neighborp2[neighbor_idx];
349 |
350 |             float dst = 1.5f * (
351 |                 square(u1_1 - u1_pq_1) +
352 |                 square(u1_2 - u1_pq_2)
353 |             );
354 |
355 |             temp0[idx] = dst * sq_inv_divisor;
356 |         }
357 |         // remaining border columns, scalar path with both coordinates clamped
358 |         for (uniform int x = end_x; x < width; x++) {
359 |             uniform int idx = y * stride + x;
360 |             uniform float u1_1 = centerp1[idx];
361 |             uniform float u1_2 = centerp2[idx];
362 |
363 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
364 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
365 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
366 |
367 |             uniform float dst = 1.5f * (
368 |                 square(u1_1 - u1_pq_1) +
369 |                 square(u1_2 - u1_pq_2)
370 |             );
371 |
372 |             temp0[idx] = dst * sq_inv_divisor;
373 |         }
374 |     }
375 | }
376 |
377 | export void nlmDistanceYUV_f32(
378 |     uniform float temp0[], // shape: (height, stride)
379 |     uniform const float centerp1[], // shape: (height, stride)
380 |     uniform const float centerp2[], // shape: (height, stride)
381 |     uniform const float centerp3[], // shape: (height, stride)
382 |     uniform const float neighborp1[], // shape: (height, stride)
383 |     uniform const float neighborp2[], // shape: (height, stride)
384 |     uniform const float neighborp3[], // shape: (height, stride)
385 |     uniform int offset_x,
386 |     uniform int offset_y,
387 |     uniform int width,
388 |     uniform int height,
389 |     uniform int stride
390 | ) {
391 |     // For every pixel stores the sum over the three planes of (center - shifted neighbor)^2 in temp0; neighbor coordinates are clamped to the plane.
392 |     uniform int start_x = min(abs(offset_x), width);
393 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
394 |
395 |     for (uniform int y = 0; y < height; y++) {
396 |         for (uniform int x = 0; x < start_x; x++) {
397 |             uniform int idx = y * stride + x;
398 |             uniform float u1_1 = centerp1[idx];
399 |             uniform float u1_2 = centerp2[idx];
400 |             uniform float u1_3 = centerp3[idx];
401 |
402 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
403 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
404 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
405 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
406 |
407 |             uniform float dst = (
408 |                 square(u1_1 - u1_pq_1) +
409 |                 square(u1_2 - u1_pq_2) +
410 |                 square(u1_3 - u1_pq_3)
411 |             );
412 |
413 |             temp0[idx] = dst;
414 |         }
415 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
416 |         foreach (x = start_x ... end_x) {
417 |             int idx = y * stride + x;
418 |             float u1_1 = centerp1[idx];
419 |             float u1_2 = centerp2[idx];
420 |             float u1_3 = centerp3[idx];
421 |
422 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
423 |             float u1_pq_1 = neighborp1[neighbor_idx];
424 |             float u1_pq_2 = neighborp2[neighbor_idx];
425 |             float u1_pq_3 = neighborp3[neighbor_idx];
426 |
427 |             float dst = (
428 |                 square(u1_1 - u1_pq_1) +
429 |                 square(u1_2 - u1_pq_2) +
430 |                 square(u1_3 - u1_pq_3)
431 |             );
432 |
433 |             temp0[idx] = dst;
434 |         }
435 |         // remaining border columns, scalar path with both coordinates clamped
436 |         for (uniform int x = end_x; x < width; x++) {
437 |             uniform int idx = y * stride + x;
438 |             uniform float u1_1 = centerp1[idx];
439 |             uniform float u1_2 = centerp2[idx];
440 |             uniform float u1_3 = centerp3[idx];
441 |
442 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
443 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
444 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
445 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
446 |
447 |             uniform float dst = (
448 |                 square(u1_1 - u1_pq_1) +
449 |                 square(u1_2 - u1_pq_2) +
450 |                 square(u1_3 - u1_pq_3)
451 |             );
452 |
453 |             temp0[idx] = dst;
454 |         }
455 |     }
456 | }
457 |
458 | export void nlmDistanceYUV_u8(
459 |     uniform float temp0[], // shape: (height, stride)
460 |     uniform const unsigned int8 centerp1[], // shape: (height, stride)
461 |     uniform const unsigned int8 centerp2[], // shape: (height, stride)
462 |     uniform const unsigned int8 centerp3[], // shape: (height, stride)
463 |     uniform const unsigned int8 neighborp1[], // shape: (height, stride)
464 |     uniform const unsigned int8 neighborp2[], // shape: (height, stride)
465 |     uniform const unsigned int8 neighborp3[], // shape: (height, stride)
466 |     uniform int offset_x,
467 |     uniform int offset_y,
468 |     uniform int width,
469 |     uniform int height,
470 |     uniform int stride,
471 |     uniform float inv_divisor
472 | ) {
473 |     // For every pixel stores (sum over the three 8-bit planes of (center - shifted neighbor)^2) * inv_divisor^2 in temp0; neighbor coordinates are clamped to the plane.
474 |     uniform int start_x = min(abs(offset_x), width);
475 |     uniform int end_x = max(width - abs(offset_x), start_x); // 0 <= start_x <= end_x <= width holds even when width < |offset_x|, so no loop below leaves the row
476 |
477 |     uniform float sq_inv_divisor = square(inv_divisor);
478 |
479 |     for (uniform int y = 0; y < height; y++) {
480 |         for (uniform int x = 0; x < start_x; x++) {
481 |             uniform int idx = y * stride + x;
482 |             uniform float u1_1 = centerp1[idx];
483 |             uniform float u1_2 = centerp2[idx];
484 |             uniform float u1_3 = centerp3[idx];
485 |
486 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
487 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
488 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
489 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
490 |
491 |             uniform float dst = (
492 |                 square(u1_1 - u1_pq_1) +
493 |                 square(u1_2 - u1_pq_2) +
494 |                 square(u1_3 - u1_pq_3)
495 |             );
496 |
497 |             temp0[idx] = dst * sq_inv_divisor;
498 |         }
499 |         // interior columns: x + offset_x always lies in [0, width), so only the row index is clamped
500 |         foreach (x = start_x ... end_x) {
501 |             int idx = y * stride + x;
502 |             float u1_1 = centerp1[idx];
503 |             float u1_2 = centerp2[idx];
504 |             float u1_3 = centerp3[idx];
505 |
506 |             int neighbor_idx = CLAMPY(y + offset_y) * stride + (x + offset_x);
507 |             float u1_pq_1 = neighborp1[neighbor_idx];
508 |             float u1_pq_2 = neighborp2[neighbor_idx];
509 |             float u1_pq_3 = neighborp3[neighbor_idx];
510 |
511 |             float dst = (
512 |                 square(u1_1 - u1_pq_1) +
513 |                 square(u1_2 - u1_pq_2) +
514 |                 square(u1_3 - u1_pq_3)
515 |             );
516 |
517 |             temp0[idx] = dst * sq_inv_divisor;
518 |         }
519 |         // remaining border columns, scalar path with both coordinates clamped
520 |         for (uniform int x = end_x; x < width; x++) {
521 |             uniform int idx = y * stride + x;
522 |             uniform float u1_1 = centerp1[idx];
523 |             uniform float u1_2 = centerp2[idx];
524 |             uniform float u1_3 = centerp3[idx];
525 |
526 |             uniform int neighbor_idx = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);
527 |             uniform float u1_pq_1 = neighborp1[neighbor_idx];
528 |             uniform float u1_pq_2 = neighborp2[neighbor_idx];
529 |             uniform float u1_pq_3 = neighborp3[neighbor_idx];
530 |
531 |             uniform float dst = (
532 |                 square(u1_1 - u1_pq_1) +
533 |                 square(u1_2 - u1_pq_2) +
534 |                 square(u1_3 - u1_pq_3)
535 |             );
536 |
537 |             temp0[idx] = dst * sq_inv_divisor;
538 |         }
539 |     }
540 | }
541 |
// Squared three-plane (YUV) distance for 16-bit integer input.
//
// For every pixel (x, y) of the width x height region, the squared differences
// between the center sample and the neighbor sample displaced by
// (offset_x, offset_y) are summed over the three planes, multiplied by
// inv_divisor^2 and stored in temp0[y * stride + x].
//
// Neighbor rows always go through CLAMPY; neighbor columns go through CLAMPX
// in the two border loops, while the interior columns are addressed directly
// and processed with foreach.
export void nlmDistanceYUV_u16(
    uniform float temp0[], // shape: (height, stride)
    uniform const unsigned int16 centerp1[], // shape: (height, stride)
    uniform const unsigned int16 centerp2[], // shape: (height, stride)
    uniform const unsigned int16 centerp3[], // shape: (height, stride)
    uniform const unsigned int16 neighborp1[], // shape: (height, stride)
    uniform const unsigned int16 neighborp2[], // shape: (height, stride)
    uniform const unsigned int16 neighborp3[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float inv_divisor
) {
    // Each row is split into left border / interior / right border. Both
    // bounds are clamped so the three ranges tile [0, width) exactly once even
    // when |offset_x| exceeds half (or all) of the width; unclamped, the border
    // loops would overlap or index columns outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    uniform float sq_inv_divisor = square(inv_divisor);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // left border: neighbor column needs CLAMPX
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            // convert to float before subtracting (inputs are unsigned)
            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            uniform float dist = square(c1 - n1) + square(c2 - n2) + square(c3 - n3);
            temp0[idx] = dist * sq_inv_divisor;
        }

        // interior: x + offset_x is used as-is
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int nidx = neighbor_row + (x + offset_x);

            float c1 = centerp1[idx];
            float c2 = centerp2[idx];
            float c3 = centerp3[idx];
            float n1 = neighborp1[nidx];
            float n2 = neighborp2[nidx];
            float n3 = neighborp3[nidx];

            float dist = square(c1 - n1) + square(c2 - n2) + square(c3 - n3);
            temp0[idx] = dist * sq_inv_divisor;
        }

        // right border: neighbor column needs CLAMPX
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            uniform float dist = square(c1 - n1) + square(c2 - n2) + square(c3 - n3);
            temp0[idx] = dist * sq_inv_divisor;
        }
    }
}
625 |
// Weighted squared RGB distance for 32-bit float input.
//
// For every pixel (x, y) of the width x height region, compares the center
// sample with the neighbor sample displaced by (offset_x, offset_y) and stores
//     (2/3 + m) * d1^2 + (4/3) * d2^2 + (1 - m) * d3^2
// in temp0[y * stride + x], where d1..d3 are the per-plane differences and
// m = (center1 + neighbor1) / 6.
//
// Neighbor rows always go through CLAMPY; neighbor columns go through CLAMPX
// in the two border loops, while the interior columns are addressed directly
// and processed with foreach.
export void nlmDistanceRGB_f32(
    uniform float temp0[], // shape: (height, stride)
    uniform const float centerp1[], // shape: (height, stride)
    uniform const float centerp2[], // shape: (height, stride)
    uniform const float centerp3[], // shape: (height, stride)
    uniform const float neighborp1[], // shape: (height, stride)
    uniform const float neighborp2[], // shape: (height, stride)
    uniform const float neighborp3[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Each row is split into left border / interior / right border. Both
    // bounds are clamped so the three ranges tile [0, width) exactly once even
    // when |offset_x| exceeds half (or all) of the width; unclamped, the border
    // loops would overlap or index columns outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // left border: neighbor column needs CLAMPX
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            // weight of plane 1 / plane 3 depends on the mean of plane 1
            uniform float m_red = (c1 + n1) / 6.0f;

            temp0[idx] = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
        }

        // interior: x + offset_x is used as-is
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int nidx = neighbor_row + (x + offset_x);

            float c1 = centerp1[idx];
            float c2 = centerp2[idx];
            float c3 = centerp3[idx];
            float n1 = neighborp1[nidx];
            float n2 = neighborp2[nidx];
            float n3 = neighborp3[nidx];

            float m_red = (c1 + n1) / 6.0f;

            temp0[idx] = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
        }

        // right border: neighbor column needs CLAMPX
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            uniform float m_red = (c1 + n1) / 6.0f;

            temp0[idx] = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
        }
    }
}
712 |
// Weighted squared RGB distance for 8-bit integer input.
//
// For every pixel (x, y) of the width x height region, compares the center
// sample with the neighbor sample displaced by (offset_x, offset_y) and stores
//     ((2/3 + m) * d1^2 + (4/3) * d2^2 + (1 - m) * d3^2) * inv_divisor^2
// in temp0[y * stride + x], where d1..d3 are the per-plane differences and
// m = (center1 + neighbor1) / 6 * inv_divisor.
//
// Neighbor rows always go through CLAMPY; neighbor columns go through CLAMPX
// in the two border loops, while the interior columns are addressed directly
// and processed with foreach.
export void nlmDistanceRGB_u8(
    uniform float temp0[], // shape: (height, stride)
    uniform const unsigned int8 centerp1[], // shape: (height, stride)
    uniform const unsigned int8 centerp2[], // shape: (height, stride)
    uniform const unsigned int8 centerp3[], // shape: (height, stride)
    uniform const unsigned int8 neighborp1[], // shape: (height, stride)
    uniform const unsigned int8 neighborp2[], // shape: (height, stride)
    uniform const unsigned int8 neighborp3[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float inv_divisor
) {
    // Each row is split into left border / interior / right border. Both
    // bounds are clamped so the three ranges tile [0, width) exactly once even
    // when |offset_x| exceeds half (or all) of the width; unclamped, the border
    // loops would overlap or index columns outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    uniform float sq_inv_divisor = square(inv_divisor);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // left border: neighbor column needs CLAMPX
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            // convert to float before subtracting (inputs are unsigned)
            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            // weight of plane 1 / plane 3 depends on the scaled mean of plane 1
            uniform float m_red = (c1 + n1) / 6.0f * inv_divisor;

            uniform float dist = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
            temp0[idx] = dist * sq_inv_divisor;
        }

        // interior: x + offset_x is used as-is
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int nidx = neighbor_row + (x + offset_x);

            float c1 = centerp1[idx];
            float c2 = centerp2[idx];
            float c3 = centerp3[idx];
            float n1 = neighborp1[nidx];
            float n2 = neighborp2[nidx];
            float n3 = neighborp3[nidx];

            float m_red = (c1 + n1) / 6.0f * inv_divisor;

            float dist = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
            temp0[idx] = dist * sq_inv_divisor;
        }

        // right border: neighbor column needs CLAMPX
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            uniform float m_red = (c1 + n1) / 6.0f * inv_divisor;

            uniform float dist = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
            temp0[idx] = dist * sq_inv_divisor;
        }
    }
}
802 |
// Weighted squared RGB distance for 16-bit integer input.
//
// For every pixel (x, y) of the width x height region, compares the center
// sample with the neighbor sample displaced by (offset_x, offset_y) and stores
//     ((2/3 + m) * d1^2 + (4/3) * d2^2 + (1 - m) * d3^2) * inv_divisor^2
// in temp0[y * stride + x], where d1..d3 are the per-plane differences and
// m = (center1 + neighbor1) / 6 * inv_divisor.
//
// Neighbor rows always go through CLAMPY; neighbor columns go through CLAMPX
// in the two border loops, while the interior columns are addressed directly
// and processed with foreach.
export void nlmDistanceRGB_u16(
    uniform float temp0[], // shape: (height, stride)
    uniform const unsigned int16 centerp1[], // shape: (height, stride)
    uniform const unsigned int16 centerp2[], // shape: (height, stride)
    uniform const unsigned int16 centerp3[], // shape: (height, stride)
    uniform const unsigned int16 neighborp1[], // shape: (height, stride)
    uniform const unsigned int16 neighborp2[], // shape: (height, stride)
    uniform const unsigned int16 neighborp3[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float inv_divisor
) {
    // Each row is split into left border / interior / right border. Both
    // bounds are clamped so the three ranges tile [0, width) exactly once even
    // when |offset_x| exceeds half (or all) of the width; unclamped, the border
    // loops would overlap or index columns outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    uniform float sq_inv_divisor = square(inv_divisor);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;
        uniform int neighbor_row = CLAMPY(y + offset_y) * stride;

        // left border: neighbor column needs CLAMPX
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            // convert to float before subtracting (inputs are unsigned)
            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            // weight of plane 1 / plane 3 depends on the scaled mean of plane 1
            uniform float m_red = (c1 + n1) / 6.0f * inv_divisor;

            uniform float dist = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
            temp0[idx] = dist * sq_inv_divisor;
        }

        // interior: x + offset_x is used as-is
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int nidx = neighbor_row + (x + offset_x);

            float c1 = centerp1[idx];
            float c2 = centerp2[idx];
            float c3 = centerp3[idx];
            float n1 = neighborp1[nidx];
            float n2 = neighborp2[nidx];
            float n3 = neighborp3[nidx];

            float m_red = (c1 + n1) / 6.0f * inv_divisor;

            float dist = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
            temp0[idx] = dist * sq_inv_divisor;
        }

        // right border: neighbor column needs CLAMPX
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int nidx = neighbor_row + CLAMPX(x + offset_x);

            uniform float c1 = centerp1[idx];
            uniform float c2 = centerp2[idx];
            uniform float c3 = centerp3[idx];
            uniform float n1 = neighborp1[nidx];
            uniform float n2 = neighborp2[nidx];
            uniform float n3 = neighborp3[nidx];

            uniform float m_red = (c1 + n1) / 6.0f * inv_divisor;

            uniform float dist = (
                (2.0f / 3.0f + m_red) * square(c1 - n1) +
                (4.0f / 3.0f) * square(c2 - n2) +
                (1.0f - m_red) * square(c3 - n3)
            );
            temp0[idx] = dist * sq_inv_divisor;
        }
    }
}
892 |
// manually unrolled nlmHorizontal()

// Radius-0 specialization of nlmHorizontal(): the window is the pixel itself,
// so every output is the running sum 0.0f + temp[y * stride + x].
//
// With a radius of 0 there are no border columns, so the separate left/right
// border loops of the other specializations can never execute here and are
// omitted; the whole row is handled by a single foreach.
static void nlmHorizontalS0(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 0;

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        foreach (x = 0 ... width) {
            float acc = 0.0f;
            // single iteration (j == 0); kept as a loop to mirror the siblings
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + x + j];
            }
            temp0[row + x] = acc;
        }
    }
}
934 |
// Radius-1 specialization of nlmHorizontal(): for every pixel, sums the
// 3 horizontally adjacent samples temp[y * stride + (x - 1 .. x + 1)] into
// temp0[y * stride + x]. Columns near the row ends take their out-of-range
// taps through CLAMPX; interior columns are processed with foreach.
static void nlmHorizontalS1(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 1;

    // Clamp the split points so left border / interior / right border always
    // tile [0, width) exactly, even for rows narrower than the window; without
    // the clamp the border loops could run outside the row.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // left border
        for (uniform int x = 0; x < start; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }

        // interior: every tap is inside the row
        foreach (x = start ... end) {
            float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + x + j];
            }
            temp0[row + x] = acc;
        }

        // right border
        for (uniform int x = end; x < width; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }
    }
}
975 |
// Radius-2 specialization of nlmHorizontal(): for every pixel, sums the
// 5 horizontally adjacent samples temp[y * stride + (x - 2 .. x + 2)] into
// temp0[y * stride + x]. Columns near the row ends take their out-of-range
// taps through CLAMPX; interior columns are processed with foreach.
static void nlmHorizontalS2(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 2;

    // Clamp the split points so left border / interior / right border always
    // tile [0, width) exactly, even for rows narrower than the window; without
    // the clamp the border loops could overlap or run outside the row.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // left border
        for (uniform int x = 0; x < start; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }

        // interior: every tap is inside the row
        foreach (x = start ... end) {
            float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + x + j];
            }
            temp0[row + x] = acc;
        }

        // right border
        for (uniform int x = end; x < width; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }
    }
}
1017 |
// Radius-3 specialization of nlmHorizontal(): for every pixel, sums the
// 7 horizontally adjacent samples temp[y * stride + (x - 3 .. x + 3)] into
// temp0[y * stride + x]. Columns near the row ends take their out-of-range
// taps through CLAMPX; interior columns are processed with foreach.
static void nlmHorizontalS3(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 3;

    // Clamp the split points so left border / interior / right border always
    // tile [0, width) exactly, even for rows narrower than the window; without
    // the clamp the border loops could overlap or run outside the row.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // left border
        for (uniform int x = 0; x < start; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }

        // interior: every tap is inside the row
        foreach (x = start ... end) {
            float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + x + j];
            }
            temp0[row + x] = acc;
        }

        // right border
        for (uniform int x = end; x < width; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }
    }
}
1059 |
// Radius-4 specialization of nlmHorizontal(): for every pixel, sums the
// 9 horizontally adjacent samples temp[y * stride + (x - 4 .. x + 4)] into
// temp0[y * stride + x]. Columns near the row ends take their out-of-range
// taps through CLAMPX; interior columns are processed with foreach.
static void nlmHorizontalS4(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int width,
    uniform int height,
    uniform int stride
) {
    const uniform int nlm_s = 4;

    // Clamp the split points so left border / interior / right border always
    // tile [0, width) exactly, even for rows narrower than the window; without
    // the clamp the border loops could overlap or run outside the row.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // left border
        for (uniform int x = 0; x < start; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }

        // interior: every tap is inside the row
        foreach (x = start ... end) {
            float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + x + j];
            }
            temp0[row + x] = acc;
        }

        // right border
        for (uniform int x = end; x < width; x++) {
            uniform float acc = 0.0f;
            #pragma unroll
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }
    }
}
1101 |
// Horizontal box sum of radius nlm_s.
//
// For every pixel, temp0[y * stride + x] receives the sum of
// temp[y * stride + (x - nlm_s .. x + nlm_s)]; taps of columns near the row
// ends go through CLAMPX. Radii 0..4 are forwarded to the specialized
// nlmHorizontalS0..S4 variants, larger radii use the generic loops below.
export void nlmHorizontal(
    uniform float temp0[], // shape: (height, stride)
    uniform const float temp[], // shape: (height, stride)
    uniform int nlm_s,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // dynamic dispatch on nlm_s
    if (nlm_s == 0) {
        nlmHorizontalS0(temp0, temp, width, height, stride);
        return;
    }
    if (nlm_s == 1) {
        nlmHorizontalS1(temp0, temp, width, height, stride);
        return;
    }
    if (nlm_s == 2) {
        nlmHorizontalS2(temp0, temp, width, height, stride);
        return;
    }
    if (nlm_s == 3) {
        nlmHorizontalS3(temp0, temp, width, height, stride);
        return;
    }
    if (nlm_s == 4) {
        nlmHorizontalS4(temp0, temp, width, height, stride);
        return;
    }

    // Clamp the split points so left border / interior / right border always
    // tile [0, width) exactly, even for rows narrower than the window; without
    // the clamp the border loops could overlap or run outside the row.
    uniform int start = min(nlm_s, width);
    uniform int end = max(width - nlm_s, start);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;

        // left border
        for (uniform int x = 0; x < start; x++) {
            uniform float acc = 0.0f;
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }

        // interior: every tap is inside the row
        foreach (x = start ... end) {
            float acc = 0.0f;
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + x + j];
            }
            temp0[row + x] = acc;
        }

        // right border
        for (uniform int x = end; x < width; x++) {
            uniform float acc = 0.0f;
            for (uniform int j = -nlm_s; j <= nlm_s; ++j) {
                acc += temp[row + CLAMPX(x + j)];
            }
            temp0[row + x] = acc;
        }
    }
}
1157 |
// Welsch weight: exp(-sum * h2_inv_norm).
static inline float welsch(float sum, uniform float h2_inv_norm) {
    float exponent = -sum * h2_inv_norm;
    return exp(exponent);
}
1161 |
// Vertical box sum of radius `radius` followed by the Welsch weight.
//
// For radius >= 0, dstp[y * stride + x] receives welsch(S, h2_inv_norm) where
// S is the sum of srcp[r * stride + x] over r = y - radius .. y + radius, with
// r clamped to [0, height - 1]. The sum is maintained incrementally in
// `buffer` (one running value per column): entering each row the bottom row of
// the window is added, after the output the top row is removed.
export void nlmVerticalWelsch(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the running sum with the window of "row -1" minus its bottom row:
    // `radius` copies of row 0 (the clamped rows above the frame) ...
    foreach (x = 0 ... width) {
        buffer[x] = radius * srcp[x];
    }

    // ... plus rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int y = 0; y < radius; ++y) {
        uniform int src_row = min(y, last_row) * stride;
        foreach (x = 0 ... width) {
            buffer[x] += srcp[src_row + x];
        }
    }

    // Top rows: the row leaving the window is always the clamped row 0.
    uniform int top_rows = min(radius, height);
    for (uniform int y = 0; y < top_rows; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = welsch(window, h2_inv_norm);
            buffer[x] = window - srcp[x];
        }
    }

    // Everything was a "top row" already.
    if (height <= radius) {
        return;
    }

    // Middle rows: both window ends are inside the frame.
    for (uniform int y = radius; y < height - radius; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = (y + radius) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = welsch(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }

    // Bottom rows: the entering row is clamped to the last row.
    for (uniform int y = max(height - radius, radius); y < height; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = welsch(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }
}
1209 |
// Positive difference: x - y when x is greater than y, otherwise 0.
static inline float fdim(uniform float x, float y) {
    float diff = x - y;
    return (x > y) ? diff : 0.0f;
}
1214 |
// Bisquare weight, variant A: max(1 - sum * h2_inv_norm, 0) via fdim().
static inline float bisquareA(float sum, uniform float h2_inv_norm) {
    return fdim(1.0f, sum * h2_inv_norm);
}
1219 |
// Vertical box sum of radius `radius` followed by the bisquareA weight.
//
// For radius >= 0, dstp[y * stride + x] receives bisquareA(S, h2_inv_norm)
// where S is the sum of srcp[r * stride + x] over r = y - radius .. y + radius,
// with r clamped to [0, height - 1]. The sum is maintained incrementally in
// `buffer` (one running value per column): entering each row the bottom row of
// the window is added, after the output the top row is removed.
export void nlmVerticalBisquareA(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the running sum with the window of "row -1" minus its bottom row:
    // `radius` copies of row 0 (the clamped rows above the frame) ...
    foreach (x = 0 ... width) {
        buffer[x] = radius * srcp[x];
    }

    // ... plus rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int y = 0; y < radius; ++y) {
        uniform int src_row = min(y, last_row) * stride;
        foreach (x = 0 ... width) {
            buffer[x] += srcp[src_row + x];
        }
    }

    // Top rows: the row leaving the window is always the clamped row 0.
    uniform int top_rows = min(radius, height);
    for (uniform int y = 0; y < top_rows; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareA(window, h2_inv_norm);
            buffer[x] = window - srcp[x];
        }
    }

    // Everything was a "top row" already.
    if (height <= radius) {
        return;
    }

    // Middle rows: both window ends are inside the frame.
    for (uniform int y = radius; y < height - radius; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = (y + radius) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareA(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }

    // Bottom rows: the entering row is clamped to the last row.
    for (uniform int y = max(height - radius, radius); y < height; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareA(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }
}
1267 |
// Bisquare weight, variant B: fdim(1, sum * h2_inv_norm) raised to the 2nd power.
static inline float bisquareB(float sum, uniform float h2_inv_norm) {
    float base = fdim(1.0f, sum * h2_inv_norm);
    return base * base;
}
1273 |
// Vertical box sum of radius `radius` followed by the bisquareB weight.
//
// For radius >= 0, dstp[y * stride + x] receives bisquareB(S, h2_inv_norm)
// where S is the sum of srcp[r * stride + x] over r = y - radius .. y + radius,
// with r clamped to [0, height - 1]. The sum is maintained incrementally in
// `buffer` (one running value per column): entering each row the bottom row of
// the window is added, after the output the top row is removed.
export void nlmVerticalBisquareB(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the running sum with the window of "row -1" minus its bottom row:
    // `radius` copies of row 0 (the clamped rows above the frame) ...
    foreach (x = 0 ... width) {
        buffer[x] = radius * srcp[x];
    }

    // ... plus rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int y = 0; y < radius; ++y) {
        uniform int src_row = min(y, last_row) * stride;
        foreach (x = 0 ... width) {
            buffer[x] += srcp[src_row + x];
        }
    }

    // Top rows: the row leaving the window is always the clamped row 0.
    uniform int top_rows = min(radius, height);
    for (uniform int y = 0; y < top_rows; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareB(window, h2_inv_norm);
            buffer[x] = window - srcp[x];
        }
    }

    // Everything was a "top row" already.
    if (height <= radius) {
        return;
    }

    // Middle rows: both window ends are inside the frame.
    for (uniform int y = radius; y < height - radius; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = (y + radius) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareB(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }

    // Bottom rows: the entering row is clamped to the last row.
    for (uniform int y = max(height - radius, radius); y < height; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareB(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }
}
1321 |
// Bisquare weight, variant C: fdim(1, sum * h2_inv_norm) raised to the 8th
// power by three successive squarings.
static inline float bisquareC(float sum, uniform float h2_inv_norm) {
    float base = fdim(1.0f, sum * h2_inv_norm);
    float pow2 = base * base;
    float pow4 = pow2 * pow2;
    return pow4 * pow4;
}
1329 |
// Vertical box sum of radius `radius` followed by the bisquareC weight.
//
// For radius >= 0, dstp[y * stride + x] receives bisquareC(S, h2_inv_norm)
// where S is the sum of srcp[r * stride + x] over r = y - radius .. y + radius,
// with r clamped to [0, height - 1]. The sum is maintained incrementally in
// `buffer` (one running value per column): entering each row the bottom row of
// the window is added, after the output the top row is removed.
export void nlmVerticalBisquareC(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform int radius,
    uniform float h2_inv_norm,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform float buffer[] // shape: (width,)
) {
    uniform int last_row = height - 1;

    // Seed the running sum with the window of "row -1" minus its bottom row:
    // `radius` copies of row 0 (the clamped rows above the frame) ...
    foreach (x = 0 ... width) {
        buffer[x] = radius * srcp[x];
    }

    // ... plus rows 0 .. radius - 1 (clamped to the last row).
    for (uniform int y = 0; y < radius; ++y) {
        uniform int src_row = min(y, last_row) * stride;
        foreach (x = 0 ... width) {
            buffer[x] += srcp[src_row + x];
        }
    }

    // Top rows: the row leaving the window is always the clamped row 0.
    uniform int top_rows = min(radius, height);
    for (uniform int y = 0; y < top_rows; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareC(window, h2_inv_norm);
            buffer[x] = window - srcp[x];
        }
    }

    // Everything was a "top row" already.
    if (height <= radius) {
        return;
    }

    // Middle rows: both window ends are inside the frame.
    for (uniform int y = radius; y < height - radius; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = (y + radius) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareC(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }

    // Bottom rows: the entering row is clamped to the last row.
    for (uniform int y = max(height - radius, radius); y < height; ++y) {
        uniform int out_row = y * stride;
        uniform int add_row = min(y + radius, last_row) * stride;
        uniform int sub_row = (y - radius) * stride;
        foreach (x = 0 ... width) {
            float window = buffer[x] + srcp[add_row + x];
            dstp[out_row + x] = bisquareC(window, h2_inv_norm);
            buffer[x] = window - srcp[sub_row + x];
        }
    }
}
1377 |
// Accumulates the weighted contribution of one offset pair into the running
// single-plane (float) NLM sums.
//
// For every pixel (x, y), with p = (x + offset_x, y + offset_y) and
// m = (x - offset_x, y - offset_y) (rows through CLAMPY, border columns
// through CLAMPX):
//     w   = temp1[x, y]
//     w_m = temp2[m]
//     weightp     += w + w_m
//     max_weightp  = max(max_weightp, w, w_m)
//     wdstp       += w * srcp_bwd[p] + w_m * srcp_fwd[m]
export void nlmAccumulationCh1_f32(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const float srcp_bwd[], // shape: (height, stride)
    uniform const float srcp_fwd[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Each row is split into left border / interior / right border. Both
    // bounds are clamped so the three ranges tile [0, width) exactly once.
    // Because the sums are updated with "+=", an overlap of the two border
    // loops (width < 2 * |offset_x|) would accumulate a pixel twice, and
    // |offset_x| > width would index columns outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;
        uniform int row_plus = CLAMPY(y + offset_y) * stride;
        uniform int row_minus = CLAMPY(y - offset_y) * stride;

        // left border: neighbor columns need CLAMPX
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_plus = row_plus + CLAMPX(x + offset_x);
            uniform int idx_minus = row_minus + CLAMPX(x - offset_x);

            uniform float w = temp1[idx];
            uniform float w_m = temp2[idx_minus];

            weightp[idx] += w + w_m;
            max_weightp[idx] = max(max(w, w_m), max_weightp[idx]);

            uniform float px_plus = srcp_bwd[idx_plus];
            uniform float px_minus = srcp_fwd[idx_minus];

            wdstp[idx] += w * px_plus + w_m * px_minus;
        }

        // interior: x +/- offset_x are used as-is
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_plus = row_plus + (x + offset_x);
            int idx_minus = row_minus + (x - offset_x);

            float w = temp1[idx];
            float w_m = temp2[idx_minus];

            weightp[idx] += w + w_m;
            max_weightp[idx] = max(max(w, w_m), max_weightp[idx]);

            float px_plus = srcp_bwd[idx_plus];
            float px_minus = srcp_fwd[idx_minus];

            wdstp[idx] += w * px_plus + w_m * px_minus;
        }

        // right border: neighbor columns need CLAMPX
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_plus = row_plus + CLAMPX(x + offset_x);
            uniform int idx_minus = row_minus + CLAMPX(x - offset_x);

            uniform float w = temp1[idx];
            uniform float w_m = temp2[idx_minus];

            weightp[idx] += w + w_m;
            max_weightp[idx] = max(max(w, w_m), max_weightp[idx]);

            uniform float px_plus = srcp_bwd[idx_plus];
            uniform float px_minus = srcp_fwd[idx_minus];

            wdstp[idx] += w * px_plus + w_m * px_minus;
        }
    }
}
1446 |
// Accumulates the weighted contribution of one offset pair into the running
// single-plane NLM sums, for 8-bit integer source planes.
//
// For every pixel (x, y), with p = (x + offset_x, y + offset_y) and
// m = (x - offset_x, y - offset_y) (rows through CLAMPY, border columns
// through CLAMPX):
//     w   = temp1[x, y]
//     w_m = temp2[m]
//     weightp     += w + w_m
//     max_weightp  = max(max_weightp, w, w_m)
//     wdstp       += w * srcp_bwd[p] + w_m * srcp_fwd[m]
// Source samples are converted to float before being weighted.
export void nlmAccumulationCh1_u8(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {
    // Each row is split into left border / interior / right border. Both
    // bounds are clamped so the three ranges tile [0, width) exactly once.
    // Because the sums are updated with "+=", an overlap of the two border
    // loops (width < 2 * |offset_x|) would accumulate a pixel twice, and
    // |offset_x| > width would index columns outside the row.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(width - abs(offset_x), start_x);

    for (uniform int y = 0; y < height; y++) {
        uniform int row = y * stride;
        uniform int row_plus = CLAMPY(y + offset_y) * stride;
        uniform int row_minus = CLAMPY(y - offset_y) * stride;

        // left border: neighbor columns need CLAMPX
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = row + x;
            uniform int idx_plus = row_plus + CLAMPX(x + offset_x);
            uniform int idx_minus = row_minus + CLAMPX(x - offset_x);

            uniform float w = temp1[idx];
            uniform float w_m = temp2[idx_minus];

            weightp[idx] += w + w_m;
            max_weightp[idx] = max(max(w, w_m), max_weightp[idx]);

            uniform float px_plus = srcp_bwd[idx_plus];
            uniform float px_minus = srcp_fwd[idx_minus];

            wdstp[idx] += w * px_plus + w_m * px_minus;
        }

        // interior: x +/- offset_x are used as-is
        foreach (x = start_x ... end_x) {
            int idx = row + x;
            int idx_plus = row_plus + (x + offset_x);
            int idx_minus = row_minus + (x - offset_x);

            float w = temp1[idx];
            float w_m = temp2[idx_minus];

            weightp[idx] += w + w_m;
            max_weightp[idx] = max(max(w, w_m), max_weightp[idx]);

            float px_plus = srcp_bwd[idx_plus];
            float px_minus = srcp_fwd[idx_minus];

            wdstp[idx] += w * px_plus + w_m * px_minus;
        }

        // right border: neighbor columns need CLAMPX
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = row + x;
            uniform int idx_plus = row_plus + CLAMPX(x + offset_x);
            uniform int idx_minus = row_minus + CLAMPX(x - offset_x);

            uniform float w = temp1[idx];
            uniform float w_m = temp2[idx_minus];

            weightp[idx] += w + w_m;
            max_weightp[idx] = max(max(w, w_m), max_weightp[idx]);

            uniform float px_plus = srcp_bwd[idx_plus];
            uniform float px_minus = srcp_fwd[idx_minus];

            wdstp[idx] += w * px_plus + w_m * px_minus;
        }
    }
}
1515 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of a single 16-bit plane.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstp[p]       += u4 * srcp_bwd[r] + u4_mq * srcp_fwd[q]
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh1_u16(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq = srcp_bwd[idx_pq];
            float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq = srcp_bwd[idx_pq];
            uniform float u1_mq = srcp_fwd[idx_mq];

            wdstp[idx] += u4 * u1_pq + u4_mq * u1_mq;
        }
    }
}
1584 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of two float planes that share one weight map.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstpN[p]      += u4 * srcp_bwdN[r] + u4_mq * srcp_fwdN[q]   (N = 1, 2)
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh2_f32(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const float srcp_bwd1[], // shape: (height, stride)
    uniform const float srcp_bwd2[], // shape: (height, stride)
    uniform const float srcp_fwd1[], // shape: (height, stride)
    uniform const float srcp_fwd2[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }
    }
}
1664 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of two 8-bit planes that share one weight map.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstpN[p]      += u4 * srcp_bwdN[r] + u4_mq * srcp_fwdN[q]   (N = 1, 2)
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh2_u8(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd2[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }
    }
}
1744 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of two 16-bit planes that share one weight map.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstpN[p]      += u4 * srcp_bwdN[r] + u4_mq * srcp_fwdN[q]   (N = 1, 2)
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh2_u16(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd2[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
        }
    }
}
1824 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of three float planes that share one weight map.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstpN[p]      += u4 * srcp_bwdN[r] + u4_mq * srcp_fwdN[q]   (N = 1, 2, 3)
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh3_f32(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float wdstp3[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const float srcp_bwd1[], // shape: (height, stride)
    uniform const float srcp_bwd2[], // shape: (height, stride)
    uniform const float srcp_bwd3[], // shape: (height, stride)
    uniform const float srcp_fwd1[], // shape: (height, stride)
    uniform const float srcp_fwd2[], // shape: (height, stride)
    uniform const float srcp_fwd3[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_pq_3 = srcp_bwd3[idx_pq];

            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];
            uniform float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_pq_3 = srcp_bwd3[idx_pq];

            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];
            float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_pq_3 = srcp_bwd3[idx_pq];

            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];
            uniform float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }
    }
}
1917 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of three 8-bit planes that share one weight map.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstpN[p]      += u4 * srcp_bwdN[r] + u4_mq * srcp_fwdN[q]   (N = 1, 2, 3)
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh3_u8(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float wdstp3[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int8 srcp_bwd3[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd2[], // shape: (height, stride)
    uniform const unsigned int8 srcp_fwd3[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_pq_3 = srcp_bwd3[idx_pq];

            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];
            uniform float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_pq_3 = srcp_bwd3[idx_pq];

            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];
            float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_pq_3 = srcp_bwd3[idx_pq];

            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];
            uniform float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }
    }
}
2010 |
// Adds the contribution of one search offset (offset_x, offset_y) to the
// running accumulators of three 16-bit planes that share one weight map.
//
// For every pixel p = (x, y), with q = p - offset and r = p + offset:
//   u4    = temp1[p]
//   u4_mq = temp2[q]
//   weightp[p]     += u4 + u4_mq
//   max_weightp[p]  = max(u4, u4_mq, max_weightp[p])
//   wdstpN[p]      += u4 * srcp_bwdN[r] + u4_mq * srcp_fwdN[q]   (N = 1, 2, 3)
//
// Rows of q and r always go through CLAMPY. Columns go through CLAMPX only in
// the left/right border strips; the foreach interior is chosen so that
// x - offset_x and x + offset_x stay inside [0, width) without clamping.
export void nlmAccumulationCh3_u16(
    uniform float weightp[], // shape: (height, stride)
    uniform float wdstp1[], // shape: (height, stride)
    uniform float wdstp2[], // shape: (height, stride)
    uniform float wdstp3[], // shape: (height, stride)
    uniform float max_weightp[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd2[], // shape: (height, stride)
    uniform const unsigned int16 srcp_bwd3[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd1[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd2[], // shape: (height, stride)
    uniform const unsigned int16 srcp_fwd3[], // shape: (height, stride)
    uniform const float temp1[], // shape: (height, stride)
    uniform const float temp2[], // shape: (height, stride)
    uniform int offset_x,
    uniform int offset_y,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    // Clamp both split points so the three column ranges below tile [0, width)
    // exactly once even when width < 2 * |offset_x|. Unclamped, the border
    // loops would overlap (or run past width) and accumulate columns twice.
    uniform int start_x = min(abs(offset_x), width);
    uniform int end_x = max(start_x, width - abs(offset_x));

    for (uniform int y = 0; y < height; y++) {
        // Left border: neighbour columns may leave the row, so clamp them.
        for (uniform int x = 0; x < start_x; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_pq_3 = srcp_bwd3[idx_pq];

            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];
            uniform float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }

        // Interior: vectorised, no column clamping required.
        foreach (x = start_x ... end_x) {
            int idx = y * stride + x;
            int idx_mq = CLAMPY(y - offset_y) * stride + (x - offset_x);
            int idx_pq = CLAMPY(y + offset_y) * stride + (x + offset_x);

            float u4 = temp1[idx];
            float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            float u1_pq_1 = srcp_bwd1[idx_pq];
            float u1_pq_2 = srcp_bwd2[idx_pq];
            float u1_pq_3 = srcp_bwd3[idx_pq];

            float u1_mq_1 = srcp_fwd1[idx_mq];
            float u1_mq_2 = srcp_fwd2[idx_mq];
            float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }

        // Right border: same as the left border, for the trailing columns.
        for (uniform int x = end_x; x < width; x++) {
            uniform int idx = y * stride + x;
            uniform int idx_mq = CLAMPY(y - offset_y) * stride + CLAMPX(x - offset_x);
            uniform int idx_pq = CLAMPY(y + offset_y) * stride + CLAMPX(x + offset_x);

            uniform float u4 = temp1[idx];
            uniform float u4_mq = temp2[idx_mq];

            weightp[idx] += u4 + u4_mq;
            max_weightp[idx] = max(max(u4, u4_mq), max_weightp[idx]);

            uniform float u1_pq_1 = srcp_bwd1[idx_pq];
            uniform float u1_pq_2 = srcp_bwd2[idx_pq];
            uniform float u1_pq_3 = srcp_bwd3[idx_pq];

            uniform float u1_mq_1 = srcp_fwd1[idx_mq];
            uniform float u1_mq_2 = srcp_fwd2[idx_mq];
            uniform float u1_mq_3 = srcp_fwd3[idx_mq];

            wdstp1[idx] += u4 * u1_pq_1 + u4_mq * u1_mq_1;
            wdstp2[idx] += u4 * u1_pq_2 + u4_mq * u1_mq_2;
            wdstp3[idx] += u4 * u1_pq_3 + u4_mq * u1_mq_3;
        }
    }
}
2103 |
// Turns the accumulators of one float plane into the output plane:
//   dstp[p] = (wref * max_weightp[p] * srcp[p] + wdstp[p])
//             / (wref * max_weightp[p] + weightp[p])
// Only the first `width` columns of each of the `height` rows are written.
export void nlmFinishCh1_f32(
    uniform float dstp[], // shape: (height, stride)
    uniform const float srcp[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride) // epsilon
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    foreach (y = 0 ... height, x = 0 ... width) {
        int pos = y * stride + x;

        // Weight given to the source pixel itself.
        float self_weight = wref * max_weightp[pos];
        float total_weight = self_weight + weightp[pos];

        float weighted_sum = self_weight * srcp[pos] + wdstp[pos];
        dstp[pos] = weighted_sum / total_weight;
    }
}
2126 |
// Turns the accumulators of one 8-bit plane into the output plane:
//   value   = (wref * max_weightp[p] * srcp[p] + wdstp[p])
//             / (wref * max_weightp[p] + weightp[p])
//   dstp[p] = round(value), limited to [0, peak]
// Only the first `width` columns of each of the `height` rows are written.
export void nlmFinishCh1_u8(
    uniform unsigned int8 dstp[], // shape: (height, stride)
    uniform const unsigned int8 srcp[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride) // epsilon
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {

    foreach (y = 0 ... height, x = 0 ... width) {
        int pos = y * stride + x;

        // Weight given to the source pixel itself.
        float self_weight = wref * max_weightp[pos];
        float total_weight = self_weight + weightp[pos];

        float blended = (self_weight * srcp[pos] + wdstp[pos]) / total_weight;
        int rounded = (int) round(blended);

        dstp[pos] = max(0, min(rounded, peak));
    }
}
2150 |
// Turns the accumulators of one 16-bit plane into the output plane:
//   value   = (wref * max_weightp[p] * srcp[p] + wdstp[p])
//             / (wref * max_weightp[p] + weightp[p])
//   dstp[p] = round(value), limited to [0, peak]
// Only the first `width` columns of each of the `height` rows are written.
export void nlmFinishCh1_u16(
    uniform unsigned int16 dstp[], // shape: (height, stride)
    uniform const unsigned int16 srcp[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride) // epsilon
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {

    foreach (y = 0 ... height, x = 0 ... width) {
        int pos = y * stride + x;

        // Weight given to the source pixel itself.
        float self_weight = wref * max_weightp[pos];
        float total_weight = self_weight + weightp[pos];

        float blended = (self_weight * srcp[pos] + wdstp[pos]) / total_weight;
        int rounded = (int) round(blended);

        dstp[pos] = max(0, min(rounded, peak));
    }
}
2174 |
// Turns the accumulators of two float planes (sharing one weight map) into
// the output planes. For N = 1, 2:
//   dstpN[p] = (wref * max_weightp[p] * srcpN[p] + wdstpN[p])
//              / (wref * max_weightp[p] + weightp[p])
// Only the first `width` columns of each of the `height` rows are written.
export void nlmFinishCh2_f32(
    uniform float dstp1[], // shape: (height, stride)
    uniform float dstp2[], // shape: (height, stride)
    uniform const float srcp1[], // shape: (height, stride)
    uniform const float srcp2[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride
) {

    foreach (y = 0 ... height, x = 0 ... width) {
        int pos = y * stride + x;

        // Weight given to the source pixel itself; shared by both planes.
        float self_weight = wref * max_weightp[pos];
        float total_weight = self_weight + weightp[pos];

        float weighted_sum1 = self_weight * srcp1[pos] + wdstp1[pos];
        dstp1[pos] = weighted_sum1 / total_weight;

        float weighted_sum2 = self_weight * srcp2[pos] + wdstp2[pos];
        dstp2[pos] = weighted_sum2 / total_weight;
    }
}
2201 |
// Turns the accumulators of two 8-bit planes (sharing one weight map) into
// the output planes. For N = 1, 2:
//   value    = (wref * max_weightp[p] * srcpN[p] + wdstpN[p])
//              / (wref * max_weightp[p] + weightp[p])
//   dstpN[p] = round(value), limited to [0, peak]
// Only the first `width` columns of each of the `height` rows are written.
export void nlmFinishCh2_u8(
    uniform unsigned int8 dstp1[], // shape: (height, stride)
    uniform unsigned int8 dstp2[], // shape: (height, stride)
    uniform const unsigned int8 srcp1[], // shape: (height, stride)
    uniform const unsigned int8 srcp2[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {

    foreach (y = 0 ... height, x = 0 ... width) {
        int pos = y * stride + x;

        // Weight given to the source pixel itself; shared by both planes.
        float self_weight = wref * max_weightp[pos];
        float total_weight = self_weight + weightp[pos];

        float blended1 = (self_weight * srcp1[pos] + wdstp1[pos]) / total_weight;
        int rounded1 = (int) round(blended1);
        dstp1[pos] = max(0, min(rounded1, peak));

        float blended2 = (self_weight * srcp2[pos] + wdstp2[pos]) / total_weight;
        int rounded2 = (int) round(blended2);
        dstp2[pos] = max(0, min(rounded2, peak));
    }
}
2229 |
// Turns the accumulators of two 16-bit planes (sharing one weight map) into
// the output planes. For N = 1, 2:
//   value    = (wref * max_weightp[p] * srcpN[p] + wdstpN[p])
//              / (wref * max_weightp[p] + weightp[p])
//   dstpN[p] = round(value), limited to [0, peak]
// Only the first `width` columns of each of the `height` rows are written.
export void nlmFinishCh2_u16(
    uniform unsigned int16 dstp1[], // shape: (height, stride)
    uniform unsigned int16 dstp2[], // shape: (height, stride)
    uniform const unsigned int16 srcp1[], // shape: (height, stride)
    uniform const unsigned int16 srcp2[], // shape: (height, stride)
    uniform const float weightp[], // shape: (height, stride)
    uniform const float wdstp1[], // shape: (height, stride)
    uniform const float wdstp2[], // shape: (height, stride)
    uniform const float max_weightp[], // shape: (height, stride)
    uniform float wref,
    uniform int width,
    uniform int height,
    uniform int stride,
    uniform int peak
) {

    foreach (y = 0 ... height, x = 0 ... width) {
        int pos = y * stride + x;

        // Weight given to the source pixel itself; shared by both planes.
        float self_weight = wref * max_weightp[pos];
        float total_weight = self_weight + weightp[pos];

        float blended1 = (self_weight * srcp1[pos] + wdstp1[pos]) / total_weight;
        int rounded1 = (int) round(blended1);
        dstp1[pos] = max(0, min(rounded1, peak));

        float blended2 = (self_weight * srcp2[pos] + wdstp2[pos]) / total_weight;
        int rounded2 = (int) round(blended2);
        dstp2[pos] = max(0, min(rounded2, peak));
    }
}
2257 |
2258 | export void nlmFinishCh3_f32(
2259 |     uniform float dstp1[], // output 1, shape: (height, stride)
2260 |     uniform float dstp2[], // output 2, shape: (height, stride)
2261 |     uniform float dstp3[], // output 3, shape: (height, stride)
2262 |     uniform const float srcp1[], // source 1, shape: (height, stride)
2263 |     uniform const float srcp2[], // source 2, shape: (height, stride)
2264 |     uniform const float srcp3[], // source 3, shape: (height, stride)
2265 |     uniform const float weightp[], // weight term of the divisor, shape: (height, stride)
2266 |     uniform const float wdstp1[], // weighted sum mixed into output 1, shape: (height, stride)
2267 |     uniform const float wdstp2[], // weighted sum mixed into output 2, shape: (height, stride)
2268 |     uniform const float wdstp3[], // weighted sum mixed into output 3, shape: (height, stride)
2269 |     uniform const float max_weightp[], // scaled by wref to weight the source, shape: (height, stride)
2270 |     uniform float wref, // scale applied to max_weightp
2271 |     uniform int width, // elements processed per row
2272 |     uniform int height, // number of rows processed
2273 |     uniform int stride // element offset between consecutive rows
2274 | ) {
2275 |     // Finishing step for three float buffers: each element becomes a weight-normalized mix of source and weighted sum.
2276 |     foreach (row = 0 ... height, col = 0 ... width) {
2277 |         int pos = row * stride + col;
2278 |         float src_w = wref * max_weightp[pos]; // weight given to the source value
2279 |         float total_w = src_w + weightp[pos]; // divisor shared by all three outputs
2280 |         float sum1 = src_w * srcp1[pos] + wdstp1[pos];
2281 |         float sum2 = src_w * srcp2[pos] + wdstp2[pos];
2282 |         float sum3 = src_w * srcp3[pos] + wdstp3[pos];
2283 |         dstp1[pos] = sum1 / total_w; // stored as-is: no rounding or range limit in the float variant
2284 |         dstp2[pos] = sum2 / total_w;
2285 |         dstp3[pos] = sum3 / total_w;
2286 |     }
2287 | }
2288 |
2289 | export void nlmFinishCh3_u8(
2290 |     uniform unsigned int8 dstp1[], // output 1, shape: (height, stride)
2291 |     uniform unsigned int8 dstp2[], // output 2, shape: (height, stride)
2292 |     uniform unsigned int8 dstp3[], // output 3, shape: (height, stride)
2293 |     uniform const unsigned int8 srcp1[], // source 1, shape: (height, stride)
2294 |     uniform const unsigned int8 srcp2[], // source 2, shape: (height, stride)
2295 |     uniform const unsigned int8 srcp3[], // source 3, shape: (height, stride)
2296 |     uniform const float weightp[], // weight term of the divisor, shape: (height, stride)
2297 |     uniform const float wdstp1[], // weighted sum mixed into output 1, shape: (height, stride)
2298 |     uniform const float wdstp2[], // weighted sum mixed into output 2, shape: (height, stride)
2299 |     uniform const float wdstp3[], // weighted sum mixed into output 3, shape: (height, stride)
2300 |     uniform const float max_weightp[], // scaled by wref to weight the source, shape: (height, stride)
2301 |     uniform float wref, // scale applied to max_weightp
2302 |     uniform int width, // elements processed per row
2303 |     uniform int height, // number of rows processed
2304 |     uniform int stride, // element offset between consecutive rows
2305 |     uniform int peak // largest value that may be stored
2306 | ) {
2307 |     // Finishing step for three unsigned 8-bit buffers: normalized mix of source and weighted sum, rounded and limited.
2308 |     foreach (row = 0 ... height, col = 0 ... width) {
2309 |         int pos = row * stride + col;
2310 |         float src_w = wref * max_weightp[pos]; // weight given to the source value
2311 |         float total_w = src_w + weightp[pos]; // divisor shared by all three outputs
2312 |         float mix1 = (src_w * srcp1[pos] + wdstp1[pos]) / total_w;
2313 |         float mix2 = (src_w * srcp2[pos] + wdstp2[pos]) / total_w;
2314 |         float mix3 = (src_w * srcp3[pos] + wdstp3[pos]) / total_w;
2315 |         dstp1[pos] = max(0, min((int) round(mix1), peak)); // round, cap at peak, floor at 0
2316 |         dstp2[pos] = max(0, min((int) round(mix2), peak));
2317 |         dstp3[pos] = max(0, min((int) round(mix3), peak));
2318 |     }
2319 | }
2320 |
2321 | export void nlmFinishCh3_u16(
2322 |     uniform unsigned int16 dstp1[], // output 1, shape: (height, stride)
2323 |     uniform unsigned int16 dstp2[], // output 2, shape: (height, stride)
2324 |     uniform unsigned int16 dstp3[], // output 3, shape: (height, stride)
2325 |     uniform const unsigned int16 srcp1[], // source 1, shape: (height, stride)
2326 |     uniform const unsigned int16 srcp2[], // source 2, shape: (height, stride)
2327 |     uniform const unsigned int16 srcp3[], // source 3, shape: (height, stride)
2328 |     uniform const float weightp[], // weight term of the divisor, shape: (height, stride)
2329 |     uniform const float wdstp1[], // weighted sum mixed into output 1, shape: (height, stride)
2330 |     uniform const float wdstp2[], // weighted sum mixed into output 2, shape: (height, stride)
2331 |     uniform const float wdstp3[], // weighted sum mixed into output 3, shape: (height, stride)
2332 |     uniform const float max_weightp[], // scaled by wref to weight the source, shape: (height, stride)
2333 |     uniform float wref, // scale applied to max_weightp
2334 |     uniform int width, // elements processed per row
2335 |     uniform int height, // number of rows processed
2336 |     uniform int stride, // element offset between consecutive rows
2337 |     uniform int peak // largest value that may be stored
2338 | ) {
2339 |     // Finishing step for three unsigned 16-bit buffers: normalized mix of source and weighted sum, rounded and limited.
2340 |     foreach (row = 0 ... height, col = 0 ... width) {
2341 |         int pos = row * stride + col;
2342 |         float src_w = wref * max_weightp[pos]; // weight given to the source value
2343 |         float total_w = src_w + weightp[pos]; // divisor shared by all three outputs
2344 |         float mix1 = (src_w * srcp1[pos] + wdstp1[pos]) / total_w;
2345 |         float mix2 = (src_w * srcp2[pos] + wdstp2[pos]) / total_w;
2346 |         float mix3 = (src_w * srcp3[pos] + wdstp3[pos]) / total_w;
2347 |         dstp1[pos] = max(0, min((int) round(mix1), peak)); // round, cap at peak, floor at 0
2348 |         dstp2[pos] = max(0, min((int) round(mix2), peak));
2349 |         dstp3[pos] = max(0, min((int) round(mix3), peak));
2350 |     }
2351 | }
2352 |
--------------------------------------------------------------------------------