├── FluxSmooth ├── FluxSmoothTest.cpp ├── FluxSmooth.rc ├── avs │ ├── minmax.h │ ├── types.h │ ├── win.h │ ├── cpuid.h │ ├── capi.h │ ├── alignment.h │ ├── posix.h │ └── config.h ├── CMakeLists.txt ├── FluxSmooth.h ├── documentation │ └── readme.html ├── FilterDef.cpp ├── FluxSmooth.vcxproj ├── FluxSmooth_avx2.cpp ├── FluxSmooth_avx512.cpp └── FluxSmooth.cpp ├── CMakeLists.txt ├── cmake_uninstall.cmake.in ├── FluxSmooth.sln ├── README.md ├── .gitattributes └── .gitignore /FluxSmooth/FluxSmoothTest.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.rc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinterf/FluxSmooth/HEAD/FluxSmooth/FluxSmooth.rc -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8.2) 2 | project(FluxSmooth-pfmod LANGUAGES CXX) 3 | 4 | add_subdirectory(FluxSmooth) 5 | 6 | # uninstall target 7 | configure_file( 8 | "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" 9 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" 10 | IMMEDIATE @ONLY) 11 | 12 | add_custom_target(uninstall 13 | COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) 14 | -------------------------------------------------------------------------------- /cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 2 | message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 3 | endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 4 | 5 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 6 | 
string(REGEX REPLACE "\n" ";" files "${files}") 7 | foreach(file ${files}) 8 | message(STATUS "Uninstalling $ENV{DESTDIR}${file}") 9 | if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 10 | execute_process( 11 | COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}" 12 | OUTPUT_VARIABLE rm_out 13 | RESULT_VARIABLE rm_retval 14 | ) 15 | if(NOT "${rm_retval}" STREQUAL 0) 16 | message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") 17 | endif(NOT "${rm_retval}" STREQUAL 0) 18 | else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 19 | message(STATUS "File $ENV{DESTDIR}${file} does not exist.") 20 | endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 21 | endforeach(file) 22 | -------------------------------------------------------------------------------- /FluxSmooth/avs/minmax.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination.
19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 31 | 32 | #ifndef AVSCORE_MINMAX_H 33 | #define AVSCORE_MINMAX_H 34 | 35 | template 36 | T min(T v1, T v2) 37 | { 38 | return v1 < v2 ? v1 : v2; 39 | } 40 | 41 | template 42 | T max(T v1, T v2) 43 | { 44 | return v1 > v2 ? v1 : v2; 45 | } 46 | 47 | template 48 | T clamp(T n, T min, T max) 49 | { 50 | n = n > max ? max : n; 51 | return n < min ? min : n; 52 | } 53 | 54 | #endif // AVSCORE_MINMAX_H 55 | -------------------------------------------------------------------------------- /FluxSmooth/avs/types.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 
32 | 33 | #ifndef AVS_TYPES_H 34 | #define AVS_TYPES_H 35 | 36 | // Define all types necessary for interfacing with avisynth.dll 37 | #include 38 | #include 39 | #ifdef __cplusplus 40 | #include 41 | #include 42 | #else 43 | #include 44 | #include 45 | #endif 46 | 47 | // Raster types used by VirtualDub & Avisynth 48 | typedef uint32_t Pixel32; 49 | typedef uint8_t BYTE; 50 | 51 | // Audio Sample information 52 | typedef float SFLOAT; 53 | 54 | #endif //AVS_TYPES_H 55 | -------------------------------------------------------------------------------- /FluxSmooth.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.28729.10 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FluxSmooth", "FluxSmooth\FluxSmooth.vcxproj", "{588984EE-FDBE-4901-894A-32781B765F07}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release LLVM|x64 = Release LLVM|x64 13 | Release LLVM|x86 = Release LLVM|x86 14 | Release XP|x64 = Release XP|x64 15 | Release XP|x86 = Release XP|x86 16 | Release|x64 = Release|x64 17 | Release|x86 = Release|x86 18 | EndGlobalSection 19 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 20 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x64.ActiveCfg = Debug|x64 21 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x64.Build.0 = Debug|x64 22 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x86.ActiveCfg = Debug|Win32 23 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x86.Build.0 = Debug|Win32 24 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x64.ActiveCfg = Release LLVM|x64 25 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x64.Build.0 = Release LLVM|x64 26 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x86.ActiveCfg = Release 
LLVM|Win32 27 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x86.Build.0 = Release LLVM|Win32 28 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x64.ActiveCfg = Release XP|x64 29 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x64.Build.0 = Release XP|x64 30 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x86.ActiveCfg = Release XP|Win32 31 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x86.Build.0 = Release XP|Win32 32 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x64.ActiveCfg = Release|x64 33 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x64.Build.0 = Release|x64 34 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x86.ActiveCfg = Release|Win32 35 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x86.Build.0 = Release|Win32 36 | EndGlobalSection 37 | GlobalSection(SolutionProperties) = preSolution 38 | HideSolutionNode = FALSE 39 | EndGlobalSection 40 | GlobalSection(ExtensibilityGlobals) = postSolution 41 | SolutionGuid = {483D6367-0542-4995-B683-A9CE97059A76} 42 | EndGlobalSection 43 | EndGlobal 44 | -------------------------------------------------------------------------------- /FluxSmooth/avs/win.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 
10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination. 19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 31 | 32 | #ifndef AVSCORE_WIN_H 33 | #define AVSCORE_WIN_H 34 | 35 | // Whenever you need windows headers, start by including this file, then the rest. 36 | 37 | // If the includer has not chosen a Windows target version, default the minimum to 0x0502 (Windows XP SP2 / Server 2003).
38 | #if !defined(NTDDI_VERSION) && !defined(_WIN32_WINNT) 39 | #define NTDDI_VERSION 0x05020000 40 | #define _WIN32_WINNT 0x0502 41 | #endif 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define STRICT 45 | #if !defined(NOMINMAX) 46 | #define NOMINMAX 47 | #endif 48 | 49 | #include 50 | 51 | // Provision for UTF-8 max 4 bytes per code point 52 | #define AVS_MAX_PATH MAX_PATH*4 53 | 54 | #endif // AVSCORE_WIN_H 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FluxSmooth - pfmod 2 | Avisynth filter for spatio-temporal smoothing of fluctuations 3 | 4 | By Ross Thomas 5 | 6 | There is no copyright on this code, and there are no conditions 7 | on its distribution or use. Do with it what you will. 8 | 9 | ## Changelog 10 | - (20190426) v1.4 11 | - AVX512 support 12 | when both AVX512F and AVX512BW extensions are available (e.g. Skylake X and Cannon Lake). 13 | Available processor flags can be shown through the .Info() filter in Avisynth+. 14 | New value for 'opt': opt=4 means forced AVX512. Error message if system does not support those AVX512 flags. 15 | - Moved to Visual Studio 2019 16 | - (xp builds: Microsoft C++, main builds: LLVM clang) 17 | 18 | - (20190402) v1.3, rewrite by pinterf 19 | - project moved to github: https://github.com/pinterf/FluxSmooth 20 | - Built using Visual Studio 2017, additional LLVM 8.0 clang support 21 | - Changed to AVS 2.6 plugin interface 22 | - x64 build for Avisynth+ 23 | - Added version resource to DLL 24 | - Removed MMX support, requires SSE2. (Though pure C is still available in the source) 25 | - Drop all inline assembly, SIMD intrinsics based on C code, SSE2, SSE4.1 and AVX2 optimizations 26 | - Single DLL, optimizations for different CPU instruction sets are chosen automatically. 
27 | - Reports MT Modes for Avisynth+: MT_NICE_FILTER 28 | - Added Y, YV411, YV16 and YV24, 10-16 bits 4:2:0, 4:2:2, 4:4:4, planar RGB(A) 8-16 bits support besides existing YV12 29 | - (YUY2 support with workaround: internally converted to YV16, process and convert back 30 | conversion is lossless, but slower than using native YV16) 31 | - New parameters: bool "luma", bool "chroma" (default true) to disable processing of luma/chroma planes 32 | - (20101130) x64 inline assembler optimized version by Devin Gardner 33 | - (2002-2004) FluxSmooth v1.1b 34 | Original version by Ross Thomas 35 | http://web.archive.org/web/20070225212908/http://bengal.missouri.edu/~kes25c/FluxSmooth-1.1b.zip 36 | https://forum.doom9.org/showthread.php?t=38296 37 | 38 | ## Notes 39 | Previous DLL versions named differently (FluxSmoothSSE2.DLL, FluxSmoothSSSE3) should be deleted from your plugin folder. 40 | From version 1.3 a single DLL exists, which automatically chooses the CPU optimization (SSE2, SSE4.1, AVX2) 41 | 42 | ## Links 43 | - Project: https://github.com/pinterf/FluxSmooth 44 | - Forum: https://forum.doom9.org/showthread.php?t=176246 45 | - Additional info: http://avisynth.nl/index.php/FluxSmooth -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is needed for earlier builds of msysgit that do not have it on by 10 | # default for csharp files.
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /FluxSmooth/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(PluginName "FluxSmooth") 2 | 3 | if (NOT WIN32) 4 | string(TOLOWER "${PluginName}" PluginName) 5 | endif() 6 | 7 | set(ProjectName "${PluginName}") 8 | project(${ProjectName} LANGUAGES CXX) 9 | 10 | file(GLOB FluxSmoothSources RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *.cpp *.h) 11 | add_library(${PluginName} SHARED ${FluxSmoothSources}) 12 | 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINTEL_INTRINSICS -msse4.1") 14 | 15 | if (MSVC_IDE) 16 | IF(CLANG_IN_VS STREQUAL "1") 17 | # special AVX option for source files with *_avx.cpp pattern 18 | file(GLOB_RECURSE SRCS_AVX "*_avx.cpp") 19 | set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " -mavx ") 20 | 21 | # special AVX2 option for source files with *_avx2.cpp pattern 22 | file(GLOB_RECURSE SRCS_AVX2 "*_avx2.cpp") 23 | set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " -mavx2 -mfma ") 24 | 25 | # special AVX512 option for source files with *_avx512.cpp pattern 26 | file(GLOB_RECURSE SRCS_AVX512 "*_avx512.cpp") 27 | set_source_files_properties(${SRCS_AVX512} PROPERTIES COMPILE_FLAGS " -mavx512f -mavx512bw ") 28 | ELSE() 29 | # special AVX option for source files with *_avx.cpp pattern 30 | file(GLOB_RECURSE SRCS_AVX "*_avx.cpp") 31 | set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " /arch:AVX ") 32 | 33 | # special AVX2 option for source files with *_avx2.cpp pattern 34 | file(GLOB_RECURSE SRCS_AVX2 
"*_avx2.cpp") 35 | set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " /arch:AVX2 ") 36 | 37 | # special AVX512 option for source files with *_avx512.cpp pattern 38 | file(GLOB_RECURSE SRCS_AVX512 "*_avx512.cpp") 39 | set_source_files_properties(${SRCS_AVX512} PROPERTIES COMPILE_FLAGS " /arch:AVX512 ") 40 | ENDIF() 41 | else() 42 | # special AVX option for source files with *_avx.cpp pattern 43 | file(GLOB_RECURSE SRCS_AVX "*_avx.cpp") 44 | set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " -mavx ") 45 | 46 | # special AVX2 option for source files with *_avx2.cpp pattern 47 | file(GLOB_RECURSE SRCS_AVX2 "*_avx2.cpp") 48 | set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " -mavx2 -mfma ") 49 | 50 | # special AVX512 option for source files with *_avx512.cpp pattern 51 | file(GLOB_RECURSE SRCS_AVX512 "*_avx512.cpp") 52 | set_source_files_properties(${SRCS_AVX512} PROPERTIES COMPILE_FLAGS " -mavx512f -mavx512bw ") 53 | endif() 54 | 55 | target_link_libraries(${ProjectName}) 56 | target_include_directories(${ProjectName} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 57 | 58 | include(GNUInstallDirs) 59 | 60 | INSTALL(TARGETS ${ProjectName} 61 | LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/avisynth") 62 | -------------------------------------------------------------------------------- /FluxSmooth/avs/cpuid.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 
10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination. 19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 31 | 32 | #ifndef AVSCORE_CPUID_H 33 | #define AVSCORE_CPUID_H 34 | 35 | // For GetCPUFlags. These are backwards-compatible with those in VirtualDub. 
36 | // ending with SSE4_2 37 | // For emulation see https://software.intel.com/en-us/articles/intel-software-development-emulator 38 | enum { 39 | /* oldest CPU to support extension */ 40 | CPUF_FORCE = 0x01, // N/A 41 | CPUF_FPU = 0x02, // 386/486DX 42 | CPUF_MMX = 0x04, // P55C, K6, PII 43 | CPUF_INTEGER_SSE = 0x08, // PIII, Athlon 44 | CPUF_SSE = 0x10, // PIII, Athlon XP/MP 45 | CPUF_SSE2 = 0x20, // PIV, K8 46 | CPUF_3DNOW = 0x40, // K6-2 47 | CPUF_3DNOW_EXT = 0x80, // Athlon 48 | CPUF_X86_64 = 0xA0, // Hammer (note: equiv. to 3DNow + SSE2, which 49 | // only Hammer will have anyway) 50 | CPUF_SSE3 = 0x100, // PIV+, K8 Venice 51 | CPUF_SSSE3 = 0x200, // Core 2 52 | CPUF_SSE4 = 0x400, 53 | CPUF_SSE4_1 = 0x400, // Penryn, Wolfdale, Yorkfield 54 | CPUF_AVX = 0x800, // Sandy Bridge, Bulldozer 55 | CPUF_SSE4_2 = 0x1000, // Nehalem 56 | // AVS+ 57 | CPUF_AVX2 = 0x2000, // Haswell 58 | CPUF_FMA3 = 0x4000, 59 | CPUF_F16C = 0x8000, 60 | CPUF_MOVBE = 0x10000, // Big Endian move 61 | CPUF_POPCNT = 0x20000, 62 | CPUF_AES = 0x40000, 63 | CPUF_FMA4 = 0x80000, 64 | 65 | CPUF_AVX512F = 0x100000, // AVX-512 Foundation. 
66 | CPUF_AVX512DQ = 0x200000, // AVX-512 DQ (Double/Quad granular) Instructions 67 | CPUF_AVX512PF = 0x400000, // AVX-512 Prefetch 68 | CPUF_AVX512ER = 0x800000, // AVX-512 Exponential and Reciprocal 69 | CPUF_AVX512CD = 0x1000000, // AVX-512 Conflict Detection 70 | CPUF_AVX512BW = 0x2000000, // AVX-512 BW (Byte/Word granular) Instructions 71 | CPUF_AVX512VL = 0x4000000, // AVX-512 VL (128/256 Vector Length) Extensions 72 | CPUF_AVX512IFMA = 0x8000000, // AVX-512 IFMA integer 52 bit 73 | CPUF_AVX512VBMI = 0x10000000,// AVX-512 VBMI 74 | }; 75 | 76 | #ifdef BUILDING_AVSCORE 77 | int GetCPUFlags(); 78 | void SetMaxCPU(int new_flags); 79 | #endif 80 | 81 | #endif // AVSCORE_CPUID_H 82 | -------------------------------------------------------------------------------- /FluxSmooth/avs/capi.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 
18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 32 | 33 | #ifndef AVS_CAPI_H 34 | #define AVS_CAPI_H 35 | 36 | #include "config.h" 37 | 38 | #ifdef AVS_POSIX 39 | // this is also defined in avs/posix.h 40 | #define __declspec(x) 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | # define EXTERN_C extern "C" 45 | #else 46 | # define EXTERN_C 47 | #endif 48 | 49 | #ifdef AVS_WINDOWS 50 | #ifdef BUILDING_AVSCORE 51 | # if defined(GCC) && defined(X86_32) 52 | # define AVSC_CC 53 | # else // MSVC builds and 64-bit GCC 54 | # ifndef AVSC_USE_STDCALL 55 | # define AVSC_CC __cdecl 56 | # else 57 | # define AVSC_CC __stdcall 58 | # endif 59 | # endif 60 | #else // needed for programs that talk to AviSynth+ 61 | # ifndef AVSC_WIN32_GCC32 // see comment below 62 | # ifndef AVSC_USE_STDCALL 63 | # define AVSC_CC __cdecl 64 | # else 65 | # define AVSC_CC __stdcall 66 | # endif 67 | # else 68 | # define AVSC_CC 69 | # endif 70 | #endif 71 | # else 72 | # define AVSC_CC 73 | #endif 74 | 75 | // On 64-bit Windows, there's only one calling convention, 76 | // so there is no difference between MSVC and GCC. 
On 32-bit, 77 | // this isn't true. The convention that GCC needs to use to 78 | // even build AviSynth+ as 32-bit makes anything that uses 79 | // it incompatible with 32-bit MSVC builds of AviSynth+. 80 | // The AVSC_WIN32_GCC32 define is meant to provide a user 81 | // switchable way to make builds of FFmpeg to test 32-bit 82 | // GCC builds of AviSynth+ without having to screw around 83 | // with alternate headers, while still default to the usual 84 | // situation of using 32-bit MSVC builds of AviSynth+. 85 | 86 | // Hopefully, this situation will eventually be resolved 87 | // and a broadly compatible solution will arise so the 88 | // same 32-bit FFmpeg build can handle either MSVC or GCC 89 | // builds of AviSynth+. 90 | 91 | #define AVSC_INLINE static __inline 92 | 93 | #ifdef BUILDING_AVSCORE 94 | #ifdef AVS_WINDOWS 95 | # define AVSC_EXPORT __declspec(dllexport) 96 | # define AVSC_API(ret, name) EXTERN_C AVSC_EXPORT ret AVSC_CC name 97 | #else 98 | # define AVSC_EXPORT EXTERN_C 99 | # define AVSC_API(ret, name) EXTERN_C ret AVSC_CC name 100 | #endif 101 | #else 102 | # define AVSC_EXPORT EXTERN_C __declspec(dllexport) 103 | # ifndef AVSC_NO_DECLSPEC 104 | # define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name 105 | # else 106 | # define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) 107 | # endif 108 | #endif 109 | 110 | #endif //AVS_CAPI_H 111 | -------------------------------------------------------------------------------- /FluxSmooth/avs/alignment.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 32 | 33 | #ifndef AVS_ALIGNMENT_H 34 | #define AVS_ALIGNMENT_H 35 | 36 | // Functions and macros to help work with alignment requirements. 37 | 38 | // Tells if a number is a power of two. 39 | #define IS_POWER2(n) ((n) && !((n) & ((n) - 1))) 40 | 41 | // Tells if the pointer "ptr" is aligned to "align" bytes. 
42 | #define IS_PTR_ALIGNED(ptr, align) (((uintptr_t)ptr & ((uintptr_t)(align-1))) == 0) 43 | 44 | // Rounds up the number "n" to the next greater multiple of "align" 45 | #define ALIGN_NUMBER(n, align) (((n) + (align)-1) & (~((align)-1))) 46 | 47 | // Rounds up the pointer address "ptr" to the next greater multiple of "align" 48 | #define ALIGN_POINTER(ptr, align) (((uintptr_t)(ptr) + (align)-1) & (~(uintptr_t)((align)-1))) 49 | 50 | #ifdef __cplusplus 51 | 52 | #include 53 | #include 54 | #include 55 | #include "config.h" 56 | 57 | #if defined(MSVC) && _MSC_VER<1400 58 | // needed for VS2013, otherwise C++11 'alignas' works 59 | #define avs_alignas(x) __declspec(align(x)) 60 | #else 61 | // assumes C++11 support 62 | #define avs_alignas(x) alignas(x) 63 | #endif 64 | 65 | template 66 | static bool IsPtrAligned(T* ptr, size_t align) 67 | { 68 | assert(IS_POWER2(align)); 69 | return (bool)IS_PTR_ALIGNED(ptr, align); 70 | } 71 | 72 | template 73 | static T AlignNumber(T n, T align) 74 | { 75 | assert(IS_POWER2(align)); 76 | return ALIGN_NUMBER(n, align); 77 | } 78 | 79 | template 80 | static T* AlignPointer(T* ptr, size_t align) 81 | { 82 | assert(IS_POWER2(align)); 83 | return (T*)ALIGN_POINTER(ptr, align); 84 | } 85 | 86 | extern "C" 87 | { 88 | #else 89 | #include 90 | #endif // __cplusplus 91 | 92 | // Returns a new buffer that is at least the size "nbytes". 93 | // The buffer will be aligned to "align" bytes. 94 | // Returns NULL on error. On successful allocation, 95 | // the returned buffer must be freed using "avs_free". 
96 | inline void* avs_malloc(size_t nbytes, size_t align) 97 | { 98 | if (!IS_POWER2(align)) 99 | return NULL; 100 | 101 | size_t offset = sizeof(void*) + align - 1; 102 | 103 | void *orig = malloc(nbytes + offset); 104 | if (orig == NULL) 105 | return NULL; 106 | 107 | void **aligned = (void**)(((uintptr_t)orig + (uintptr_t)offset) & (~(uintptr_t)(align-1))); 108 | aligned[-1] = orig; 109 | return aligned; 110 | } 111 | 112 | // Buffers allocated using "avs_malloc" must be freed 113 | // using "avs_free" instead of "free". 114 | inline void avs_free(void *ptr) 115 | { 116 | // Mirroring free()'s semantic requires us to accept NULLs 117 | if (ptr == NULL) 118 | return; 119 | 120 | free(((void**)ptr)[-1]); 121 | } 122 | 123 | #ifdef __cplusplus 124 | } // extern "C" 125 | 126 | // The point of these undef's is to force using the template functions 127 | // if we are in C++ mode. For C, the user can rely only on the macros. 128 | #undef IS_PTR_ALIGNED 129 | #undef ALIGN_NUMBER 130 | #undef ALIGN_POINTER 131 | 132 | #endif // __cplusplus 133 | 134 | #endif //AVS_ALIGNMENT_H 135 | -------------------------------------------------------------------------------- /FluxSmooth/avs/posix.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 
10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination. 19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 
31 | 32 | #ifdef AVS_POSIX 33 | #ifndef AVSCORE_POSIX_H 34 | #define AVSCORE_POSIX_H 35 | 36 | #ifdef __cplusplus 37 | #include 38 | #endif 39 | #include 40 | #include 41 | 42 | // Define these MSVC-extension used in Avisynth 43 | #define __single_inheritance 44 | 45 | // These things don't exist in Linux 46 | #define __declspec(x) 47 | #define lstrlen strlen 48 | #define lstrcmp strcmp 49 | #define lstrcmpi strcasecmp 50 | #define _stricmp strcasecmp 51 | #define _strnicmp strncasecmp 52 | #define _strdup strdup 53 | #define SetCurrentDirectory(x) chdir(x) 54 | #define SetCurrentDirectoryW(x) chdir(x) 55 | #define GetCurrentDirectoryW(x) getcwd(x) 56 | #define _putenv putenv 57 | #define _alloca alloca 58 | 59 | // Borrowing some compatibility macros from AvxSynth, slightly modified 60 | #define UInt32x32To64(a, b) ((uint64_t)(((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)))) 61 | #define Int64ShrlMod32(a, b) ((uint64_t)((uint64_t)(a) >> (b))) 62 | #define Int32x32To64(a, b) ((int64_t)(((int64_t)((long)(a))) * ((long)(b)))) 63 | 64 | #define InterlockedIncrement(x) __sync_add_and_fetch((x), 1) 65 | #define InterlockedDecrement(x) __sync_sub_and_fetch((x), 1) 66 | #define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) 67 | 68 | #ifndef TRUE 69 | #define TRUE true 70 | #endif 71 | 72 | #ifndef FALSE 73 | #define FALSE false 74 | #endif 75 | 76 | #define S_FALSE (0x00000001) 77 | #define E_FAIL (0x80004005) 78 | #define FAILED(hr) ((hr) & 0x80000000) 79 | #define SUCCEEDED(hr) (!FAILED(hr)) 80 | 81 | // Statuses copied from comments in exception.cpp 82 | #define STATUS_GUARD_PAGE_VIOLATION 0x80000001 83 | #define STATUS_DATATYPE_MISALIGNMENT 0x80000002 84 | #define STATUS_BREAKPOINT 0x80000003 85 | #define STATUS_SINGLE_STEP 0x80000004 86 | #define STATUS_ACCESS_VIOLATION 0xc0000005 87 | #define STATUS_IN_PAGE_ERROR 0xc0000006 88 | #define 
STATUS_INVALID_HANDLE 0xc0000008 89 | #define STATUS_NO_MEMORY 0xc0000017 90 | #define STATUS_ILLEGAL_INSTRUCTION 0xc000001d 91 | #define STATUS_NONCONTINUABLE_EXCEPTION 0xc0000025 92 | #define STATUS_INVALID_DISPOSITION 0xc0000026 93 | #define STATUS_ARRAY_BOUNDS_EXCEEDED 0xc000008c 94 | #define STATUS_FLOAT_DENORMAL_OPERAND 0xc000008d 95 | #define STATUS_FLOAT_DIVIDE_BY_ZERO 0xc000008e 96 | #define STATUS_FLOAT_INEXACT_RESULT 0xc000008f 97 | #define STATUS_FLOAT_INVALID_OPERATION 0xc0000090 98 | #define STATUS_FLOAT_OVERFLOW 0xc0000091 99 | #define STATUS_FLOAT_STACK_CHECK 0xc0000092 100 | #define STATUS_FLOAT_UNDERFLOW 0xc0000093 101 | #define STATUS_INTEGER_DIVIDE_BY_ZERO 0xc0000094 102 | #define STATUS_INTEGER_OVERFLOW 0xc0000095 103 | #define STATUS_PRIVILEGED_INSTRUCTION 0xc0000096 104 | #define STATUS_STACK_OVERFLOW 0xc00000fd 105 | 106 | // Calling convension 107 | #define __stdcall 108 | #define __cdecl 109 | 110 | #endif // AVSCORE_POSIX_H 111 | #endif // AVS_POSIX 112 | -------------------------------------------------------------------------------- /FluxSmooth/avs/config.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 
13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 32 | 33 | #ifndef AVS_CONFIG_H 34 | #define AVS_CONFIG_H 35 | 36 | // Undefine this to get cdecl calling convention 37 | #define AVSC_USE_STDCALL 1 38 | 39 | // NOTE TO PLUGIN AUTHORS: 40 | // Because FRAME_ALIGN can be substantially higher than the alignment 41 | // a plugin actually needs, plugins should not use FRAME_ALIGN to check for 42 | // alignment. They should always request the exact alignment value they need. 43 | // This is to make sure that plugins work over the widest range of AviSynth 44 | // builds possible. 
45 | #define FRAME_ALIGN 64 46 | 47 | #if defined(_M_AMD64) || defined(__x86_64) 48 | # define X86_64 49 | #elif defined(_M_IX86) || defined(__i386__) 50 | # define X86_32 51 | // VS2017 introduced _M_ARM64 52 | #elif defined(_M_ARM64) || defined(__aarch64__) 53 | # define ARM64 54 | #elif defined(_M_ARM) || defined(__arm__) 55 | # define ARM32 56 | #else 57 | # error Unsupported CPU architecture. 58 | #endif 59 | 60 | // VC++ LLVM-Clang-cl MinGW-Gnu 61 | // MSVC x x 62 | // MSVC_PURE x 63 | // CLANG x 64 | // GCC x 65 | 66 | #if defined(__clang__) 67 | // Check clang first. clang-cl also defines __MSC_VER 68 | // We set MSVC because they are mostly compatible 69 | # define CLANG 70 | #if defined(_MSC_VER) 71 | # define MSVC 72 | # define AVS_FORCEINLINE __attribute__((always_inline)) 73 | #else 74 | # define AVS_FORCEINLINE __attribute__((always_inline)) inline 75 | #endif 76 | #elif defined(_MSC_VER) 77 | # define MSVC 78 | # define MSVC_PURE 79 | # define AVS_FORCEINLINE __forceinline 80 | #elif defined(__GNUC__) 81 | # define GCC 82 | # define AVS_FORCEINLINE __attribute__((always_inline)) inline 83 | #else 84 | # error Unsupported compiler. 85 | # define AVS_FORCEINLINE inline 86 | # undef __forceinline 87 | # define __forceinline inline 88 | #endif 89 | 90 | #if defined(_WIN32) 91 | # define AVS_WINDOWS 92 | #elif defined(__linux__) 93 | # define AVS_LINUX 94 | # define AVS_POSIX 95 | #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) 96 | # define AVS_BSD 97 | # define AVS_POSIX 98 | #elif defined(__APPLE__) 99 | # define AVS_MACOS 100 | # define AVS_POSIX 101 | #else 102 | # error Operating system unsupported. 
103 | #endif 104 | 105 | // useful warnings disabler macros for supported compilers 106 | 107 | #if defined(_MSC_VER) 108 | #define DISABLE_WARNING_PUSH __pragma(warning( push )) 109 | #define DISABLE_WARNING_POP __pragma(warning( pop )) 110 | #define DISABLE_WARNING(warningNumber) __pragma(warning( disable : warningNumber )) 111 | 112 | #define DISABLE_WARNING_UNREFERENCED_LOCAL_VARIABLE DISABLE_WARNING(4101) 113 | #define DISABLE_WARNING_UNREFERENCED_FUNCTION DISABLE_WARNING(4505) 114 | // other warnings you want to deactivate... 115 | 116 | #elif defined(__GNUC__) || defined(__clang__) 117 | #define DO_PRAGMA(X) _Pragma(#X) 118 | #define DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push) 119 | #define DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop) 120 | #define DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName) 121 | 122 | #define DISABLE_WARNING_UNREFERENCED_LOCAL_VARIABLE DISABLE_WARNING(-Wunused-variable) 123 | #define DISABLE_WARNING_UNREFERENCED_FUNCTION DISABLE_WARNING(-Wunused-function) 124 | // other warnings you want to deactivate... 125 | 126 | #else 127 | #define DISABLE_WARNING_PUSH 128 | #define DISABLE_WARNING_POP 129 | #define DISABLE_WARNING_UNREFERENCED_LOCAL_VARIABLE 130 | #define DISABLE_WARNING_UNREFERENCED_FUNCTION 131 | // other warnings you want to deactivate... 132 | 133 | #endif 134 | 135 | #if defined(AVS_POSIX) 136 | #define NEW_AVSVALUE 137 | #else 138 | #define NEW_AVSVALUE 139 | #endif 140 | 141 | #endif //AVS_CONFIG_H 142 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Rr]elease LLVM/ 19 | [Rr]eleases LLVM/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # MSTest test Results 33 | [Tt]est[Rr]esult*/ 34 | [Bb]uild[Ll]og.* 35 | 36 | # NUNIT 37 | *.VisualState.xml 38 | TestResult.xml 39 | 40 | # Build Results of an ATL Project 41 | [Dd]ebugPS/ 42 | [Rr]eleasePS/ 43 | dlldata.c 44 | 45 | # DNX 46 | project.lock.json 47 | project.fragment.lock.json 48 | artifacts/ 49 | 50 | *_i.c 51 | *_p.c 52 | *_i.h 53 | *.ilk 54 | *.meta 55 | *.obj 56 | *.pch 57 | *.pdb 58 | *.pgc 59 | *.pgd 60 | *.rsp 61 | *.sbr 62 | *.tlb 63 | *.tli 64 | *.tlh 65 | *.tmp 66 | *.tmp_proj 67 | *.log 68 | *.vspscc 69 | *.vssscc 70 | .builds 71 | *.pidb 72 | *.svclog 73 | *.scc 74 | 75 | # Chutzpah Test files 76 | _Chutzpah* 77 | 78 | # Visual C++ cache files 79 | ipch/ 80 | *.aps 81 | *.ncb 82 | *.opendb 83 | *.opensdf 84 | *.sdf 85 | *.cachefile 86 | *.VC.db 87 | *.VC.VC.opendb 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | *.sap 94 | 95 | # TFS 2012 Local Workspace 96 | $tf/ 97 | 98 | # Guidance Automation Toolkit 99 | *.gpState 100 | 101 | # ReSharper is a .NET coding add-in 102 | _ReSharper*/ 103 | *.[Rr]e[Ss]harper 104 | *.DotSettings.user 105 | 106 | # JustCode is a .NET coding add-in 107 | .JustCode 108 | 109 | # TeamCity is a build add-in 110 | _TeamCity* 111 | 112 | # DotCover is a Code Coverage Tool 113 | *.dotCover 114 | 115 | # NCrunch 116 | _NCrunch_* 117 | .*crunch*.local.xml 118 | nCrunchTemp_* 119 | 120 | # MightyMoose 121 | *.mm.* 122 | AutoTest.Net/ 123 | 124 | # 
Web workbench (sass) 125 | .sass-cache/ 126 | 127 | # Installshield output folder 128 | [Ee]xpress/ 129 | 130 | # DocProject is a documentation generator add-in 131 | DocProject/buildhelp/ 132 | DocProject/Help/*.HxT 133 | DocProject/Help/*.HxC 134 | DocProject/Help/*.hhc 135 | DocProject/Help/*.hhk 136 | DocProject/Help/*.hhp 137 | DocProject/Help/Html2 138 | DocProject/Help/html 139 | 140 | # Click-Once directory 141 | publish/ 142 | 143 | # Publish Web Output 144 | *.[Pp]ublish.xml 145 | *.azurePubxml 146 | # TODO: Comment the next line if you want to checkin your web deploy settings 147 | # but database connection strings (with potential passwords) will be unencrypted 148 | #*.pubxml 149 | *.publishproj 150 | 151 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 152 | # checkin your Azure Web App publish settings, but sensitive information contained 153 | # in these scripts will be unencrypted 154 | PublishScripts/ 155 | 156 | # NuGet Packages 157 | *.nupkg 158 | # The packages folder can be ignored because of Package Restore 159 | **/packages/* 160 | # except build/, which is used as an MSBuild target. 
161 | !**/packages/build/ 162 | # Uncomment if necessary however generally it will be regenerated when needed 163 | #!**/packages/repositories.config 164 | # NuGet v3's project.json files produces more ignoreable files 165 | *.nuget.props 166 | *.nuget.targets 167 | 168 | # Microsoft Azure Build Output 169 | csx/ 170 | *.build.csdef 171 | 172 | # Microsoft Azure Emulator 173 | ecf/ 174 | rcf/ 175 | 176 | # Windows Store app package directories and files 177 | AppPackages/ 178 | BundleArtifacts/ 179 | Package.StoreAssociation.xml 180 | _pkginfo.txt 181 | 182 | # Visual Studio cache files 183 | # files ending in .cache can be ignored 184 | *.[Cc]ache 185 | # but keep track of directories ending in .cache 186 | !*.[Cc]ache/ 187 | 188 | # Others 189 | ClientBin/ 190 | ~$* 191 | *~ 192 | *.dbmdl 193 | *.dbproj.schemaview 194 | *.jfm 195 | *.pfx 196 | *.publishsettings 197 | node_modules/ 198 | orleans.codegen.cs 199 | 200 | # Since there are multiple workflows, uncomment next line to ignore bower_components 201 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 202 | #bower_components/ 203 | 204 | # RIA/Silverlight projects 205 | Generated_Code/ 206 | 207 | # Backup & report files from converting an old project file 208 | # to a newer Visual Studio version. 
Backup files are not needed, 209 | # because we have git ;-) 210 | _UpgradeReport_Files/ 211 | Backup*/ 212 | UpgradeLog*.XML 213 | UpgradeLog*.htm 214 | 215 | # SQL Server files 216 | *.mdf 217 | *.ldf 218 | 219 | # Business Intelligence projects 220 | *.rdl.data 221 | *.bim.layout 222 | *.bim_*.settings 223 | 224 | # Microsoft Fakes 225 | FakesAssemblies/ 226 | 227 | # GhostDoc plugin setting file 228 | *.GhostDoc.xml 229 | 230 | # Node.js Tools for Visual Studio 231 | .ntvs_analysis.dat 232 | 233 | # Visual Studio 6 build log 234 | *.plg 235 | 236 | # Visual Studio 6 workspace options file 237 | *.opt 238 | 239 | # Visual Studio LightSwitch build output 240 | **/*.HTMLClient/GeneratedArtifacts 241 | **/*.DesktopClient/GeneratedArtifacts 242 | **/*.DesktopClient/ModelManifest.xml 243 | **/*.Server/GeneratedArtifacts 244 | **/*.Server/ModelManifest.xml 245 | _Pvt_Extensions 246 | 247 | # Paket dependency manager 248 | .paket/paket.exe 249 | paket-files/ 250 | 251 | # FAKE - F# Make 252 | .fake/ 253 | 254 | # JetBrains Rider 255 | .idea/ 256 | *.sln.iml 257 | 258 | # CodeRush 259 | .cr/ 260 | 261 | # Python Tools for Visual Studio (PTVS) 262 | __pycache__/ 263 | *.pyc -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.h: -------------------------------------------------------------------------------- 1 | #ifndef __FLUXSMOOTH_H__ 2 | #define __FLUXSMOOTH_H__ 3 | 4 | #include "avisynth.h" 5 | #include "stdint.h" 6 | #include "emmintrin.h" 7 | #include 8 | #include 9 | #include 10 | 11 | /************************************ 12 | // AVX512 enabler switch!!! 
13 | ************************************/ 14 | #define FLUXSMOOTH_AVX512_ENABLED 15 | 16 | #if defined(_MSC_VER) && !defined(__clang__) 17 | // Some missing avx512 mask intrinsics are handmade for Microsoft (for 19.20) 18 | // As of April 2019, MS version of ??intrin.h does not support AVX512BW _k*_mask* functions 19 | // https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html 20 | // uncomment if AVX512 is really not needed 21 | // #undef FLUXSMOOTH_AVX512_ENABLED 22 | #endif 23 | 24 | /************************************ 25 | // Helpers, missing intrinsics 26 | ************************************/ 27 | 28 | // SSE4.1 simulation for SSE2 29 | static AVS_FORCEINLINE __m128i _MM_BLENDV_EPI8(__m128i const &a, __m128i const &b, __m128i const &selector) { 30 | return _mm_or_si128(_mm_and_si128(selector, b), _mm_andnot_si128(selector, a)); 31 | } 32 | 33 | // non-existant simd 34 | static AVS_FORCEINLINE __m128i _MM_CMPLE_EPU16(__m128i x, __m128i y) 35 | { 36 | // Returns 0xFFFF where x <= y: 37 | return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128()); 38 | } 39 | 40 | #define _mm_cmpge_epu8(a, b) \ 41 | _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) 42 | 43 | #define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) 44 | 45 | // non-existant simd 46 | static AVS_FORCEINLINE __m128i _mm_cmpgt_epu8(__m128i x, __m128i y) 47 | { 48 | // Returns 0xFF where x > y: 49 | return _mm_andnot_si128( 50 | _mm_cmpeq_epi8(x, y), 51 | _mm_cmpeq_epi8(_mm_max_epu8(x, y), x) 52 | ); 53 | } 54 | 55 | #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) 56 | 57 | static AVS_FORCEINLINE __m128i _mm_cmpge_epi16(__m128i x, __m128i y) 58 | { 59 | // Returns 0xFFFF where x >= y: 60 | return _mm_or_si128(_mm_cmpeq_epi16(x, y), _mm_cmpgt_epi16(x, y)); 61 | } 62 | 63 | #define _mm_cmple_epi16(a, b) _mm_cmpge_epi16(b, a) 64 | 65 | /************************************ 66 | // other constants 67 | ************************************/ 68 | 69 | // 
Optimizations by 'opt' parameter 70 | enum { USE_OPT_C = 0, USE_OPT_SSE2 = 1, USE_OPT_SSE41 = 2, USE_OPT_AVX2 = 3, USE_OPT_AVX512 = 4}; 71 | 72 | constexpr int planes_y[4] = { PLANAR_Y, PLANAR_U, PLANAR_V, PLANAR_A }; 73 | constexpr int planes_r[4] = { PLANAR_G, PLANAR_B, PLANAR_R, PLANAR_A }; 74 | 75 | /************************************ 76 | // Prototypes, Temporal 77 | ************************************/ 78 | #ifdef FLUXSMOOTH_AVX512_ENABLED 79 | void fluxT_avx512_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 80 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 81 | 82 | void fluxT_avx512(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 83 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 84 | #endif 85 | 86 | void fluxT_avx2_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 87 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 88 | 89 | void fluxT_avx2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 90 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 91 | 92 | void fluxT_sse41(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 93 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 94 | 95 | void fluxT_sse41_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 96 | uint8_t* destp, const int dst_pitch, const int width, const int 
height, int temporal_threshold, short *scaletab); 97 | 98 | void fluxT_sse2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 99 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 100 | 101 | template 102 | void fluxT_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 103 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 104 | 105 | /************************************ 106 | // Prototypes, Spatial - Temporal 107 | ************************************/ 108 | #ifdef FLUXSMOOTH_AVX512_ENABLED 109 | void fluxST_avx512_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 110 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 111 | 112 | void fluxST_avx512(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 113 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 114 | #endif 115 | 116 | void fluxST_avx2_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 117 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 118 | 119 | void fluxST_avx2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 120 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 121 | 122 | void fluxST_sse41(const uint8_t*, const int, const 
uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 123 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 124 | 125 | void fluxST_sse41_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 126 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 127 | 128 | void fluxST_sse2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 129 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 130 | 131 | template 132 | void fluxST_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 133 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 134 | 135 | /************************************ 136 | // Filter classes 137 | ************************************/ 138 | 139 | class FluxSmoothST: public GenericVideoFilter 140 | { 141 | using proc_ST_t = void(*)(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 142 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 143 | 144 | protected: 145 | PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment * env) override; 146 | 147 | public: 148 | FluxSmoothST(PClip _child, int _temporal_threshold, int _spatial_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env); 149 | 150 | // Auto register AVS+ mode: NICE filter 151 | int __stdcall SetCacheHints(int cachehints, int frame_range) override { 152 | 
return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0; 153 | } 154 | 155 | private: 156 | int spatial_threshold; 157 | int temporal_threshold; 158 | bool processPlane[3]; 159 | int opt; 160 | short scaletab[16]; 161 | proc_ST_t proc_ST[3]; // for all planes 162 | }; 163 | 164 | class FluxSmoothT : public GenericVideoFilter 165 | { 166 | using proc_T_t = void(*)(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 167 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 168 | 169 | private: 170 | int temporal_threshold; 171 | bool processPlane[3]; 172 | int opt; 173 | short scaletab[16]; // for C 174 | proc_T_t proc_T[3]; // for all planes 175 | 176 | protected: 177 | PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) override; 178 | 179 | public: 180 | FluxSmoothT(PClip _child, int _temporal_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env); 181 | 182 | // Auto register AVS+ mode: NICE filter 183 | int __stdcall SetCacheHints(int cachehints, int frame_range) override { 184 | return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0; 185 | } 186 | 187 | }; 188 | 189 | #endif // #define __FLUXSMOOTH_H__ 190 | 191 | -------------------------------------------------------------------------------- /FluxSmooth/documentation/readme.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | FluxSmooth 4 | 5 | 6 |

FluxSmooth

7 |

An Avisynth filter for smoothing of fluctuations

8 |

By Ross Thomas <ross@grinfinity.com>

9 |

Rewrite by Ferenc Pinter

10 |

There is no copyright on this code, and there are no conditions on its 11 | distribution or use. Do with it what you will.

12 |

Latest version:

13 |

FluxSmooth v1.4 (20190426) https://github.com/pinterf/FluxSmooth

14 |

Description

15 |

One of the fundamental properties of noise is that it's random. One of the 16 | fundamental properties of motion is that it's not. This is the premise behind 17 | FluxSmooth, which examines each pixel and compares it to the corresponding 18 | pixel in the previous and next frames. Smoothing occurs if both the previous 19 | frame's value and the next frame's value are greater, or if both are less, than 20 | the value in the current frame.

21 |

I like to call this a "fluctuating" pixel, and then I like to wipe that pixel from 22 | existence by averaging it with its neighbours. For FluxSmoothST, this 23 | is (by default) done in a spatio-temporal manner, in that for each 24 | fluctuating pixel its 8 immediate spatial neighbours as well as its 2 temporal 25 | neighbours (the above-mentioned corresponding pixels from the previous and next 26 | frames) are considered for inclusion in the average. If the value of each pixel 27 | is within the specified threshold, it is included. If not, it isn't. 28 | FluxSmoothT performs only temporal averaging.

29 |

This filter seems to remove almost all noise from low-noise sources (such as 30 | DVD) and a lot of noise from high-noise sources (such as cable TV captures), 31 | while maintaining a good amount of detail.

32 |

Using FluxSmoothT instead of FluxSmoothST for temporal-only smoothing is faster.

33 |

Usage

34 |
35 |

FluxSmoothT([clip], int temporal_threshold, bool luma, bool chroma, int opt)

36 |

FluxSmoothST([clip], int temporal_threshold, int spatial_threshold, bool luma, bool chroma, int opt)

37 |
38 | 39 | 40 | 42 | 44 | 46 | 47 | 48 | 49 | 56 | 57 | 58 | 59 | 60 | 66 | 67 | 68 | 69 | 70 | 74 | 75 | 76 | 77 | 78 | 82 | 83 | 84 | 85 | 86 | 92 | 93 | 94 |
41 | Parameter 43 | Meaning 45 | Default
temporal_threshold 50 |

Temporal neighbour pixels within this threshold from the current pixel are 51 | included in the average.

52 |

The threshold is normalized to match the old 8 bit clips; you can keep the same value for 10-16 bit clips to have the same effect

53 |

If set to -1, no temporal smoothing occurs. (Cannot be set to -1 in 54 | FluxSmoothT.)

55 |
7
spatial_threshold 61 |

Spatial neighbour pixels within this threshold from the current pixel are 62 | included in the average.

63 |

The threshold is normalized to match with the old 8 bit clips, you can keep the same value for 10-16 bit clips to have the same effect

64 |

If set to -1, no spatial smoothing occurs.

65 |
7
luma 71 |

Enables luma channel processing

72 |

If set to false, the luma (Y) channel is simply copied. Has no effect for RGB clips.

73 |
true
chroma 79 |

Enables chroma channel processing

80 |

If set to false, the chroma (U/V) channels are simply copied. Has no effect for RGB clips.

81 |
true
opt 87 |

Debug parameter for directly choosing optimization

88 |

0=C, 1=SSE2, 2=SSE4.1, 3=AVX2, 4=AVX512, -1: automatic

89 |

If set to -1, the fastest one is chosen automatically

90 |

Note for 10-16 bits: due to the different averaging methods (precision of 1/x, rounding), results may not be bit-identical for different CPU targets

91 |
-1
95 |

Known Issues

96 |
    97 |
  • 98 | The very edges of the frame are unprocessed. 99 |
  • The very first and very last frames of a clip are unprocessed.
102 |

Author

103 |

Ross Thomas <ross@grinfinity.com>

104 |

Ferenc Pinter https://github.com/pinterf

105 |

History

106 | 107 | 108 | 110 | 112 | 113 | 114 | 115 | 118 | 119 | 120 | 121 | 133 | 134 | 135 | 136 | 138 | 139 | 140 | 141 | 146 | 147 | 148 | 149 | 156 | 157 | 158 | 159 | 166 | 167 | 168 | 173 | 174 | 175 | 176 | 179 | 180 | 181 | 182 | 186 | 187 | 188 | 189 | 194 | 195 | 196 | 197 | 198 | 199 |
109 | Version 111 | Description
1.4 (pinterf)
20190426
AVX512 support: when both AVX512F and AVX512BW extension are available (e.g. Skylake X and Cannon Lake).
116 | Available processor flags can be shown through the .Info() filter in Avisynth+.
117 | New value for 'opt': opt=4 means forced AVX512. Error message if system does not support those AVX512 flags.
1.3 (pinterf)
20190402
project moved to github: https://github.com/pinterf/FluxSmooth
122 | Built using Visual Studio 2017, additional LLVM 8.0 clang support
123 | Changed to AVS 2.6 plugin interface
124 | x64 build for Avisynth+
125 | Added version resource to DLL
126 | Removed MMX support, requires SSE2. (Though pure C is still available in the source)
127 | Dropped all inline assembly, new SIMD intrinsics based on C code, SSE2, SSE4.1 and AVX2 optimizations
128 | Single DLL, optimizations for different CPU instruction sets are chosen automatically.
129 | Reports MT Modes for Avisynth+: MT_NICE_FILTER
130 | Added Y, YV411, YV16 and YV24, 10-16 bits 4:2:0, 4:2:2, 4:4:4, planar RGB(A) 8-16 bits support besides existing YV12
131 | (YUY2 support was kept by a behind-the-scene YV16 to-from conversion. Conversion is lossless but slower than using native YV16)
132 | New parameters: bool "luma", bool "chroma" (default true) to disable processing of luma/chroma planes
1.1bFixed assuming previous and next frame pitches were the same as 137 | the current frame pitch.
1.1aYet another "oops" release. Current pixel is once again considered in the 142 | averaging code -- I found the lack of it too aggressive, especially during fast 143 | motion. Also fixed stupid "3am bug" involving a couple of variables I'd 144 | declared static that shouldn't've been. Thanks to krieger2005 for spotting that 145 | one, and ARDA for diagnosing it.
1.1Changed the averaging code so that the current pixel is excluded, which 150 | produces better noise reduction. Also split the code into two different 151 | filters, FluxSmoothT and FluxSmoothST. The former does temporal-only smoothing 152 | (equivalent to setting "spatial_threshold=-1" in FluxSmoothST) and is 153 | about 50% faster. Removed Avisynth 2.0x version to tidy up the code base. 154 | Does anyone actually use it any more? My thanks to fabrice and sh0dan for the 155 | 1.01 release during my extended absence :).
1.01Added by sh0dan:
160 | - Removed leak in AviSynth 2.5 YV12 mode (code by fabrice)
161 | - Aligned tables and variables.
162 | - Use AviSynth BitBlt for copying chroma.
163 | - Don't use streaming store. (movntq)
164 | All in all an approximate 15% speedup compared to previous version. All changes 165 | are marked with "sh0:".
1.0First "stable" release. I think it's been tested enough, but wait for a bunch 169 | of bugs to emerge and make me a liar... Fixed a bug that, in conjunction with a 170 | bug in the built-in resizers, caused an access violation under certain 171 | circumstances. Thanks to sh0dan for spotting that one :). Added "SetCacheHints" 172 | and upgraded to "AvisynthPluginInit2" in 2.5 version.
0.4Implemented iSSE-optimized version, which runs roughly double the speed of the 177 | C++ version. Some small optimizations to C++ version. Now smooths chroma as 178 | well as luma.
0.3Fixed bad bug that caused incorrect smoothing: no more in-place filtering. 183 | Changed defaults back to what they were, now that the algorithm works 184 | correctly. Spent some time benchmarking and tweaking various pieces of 185 | code, so should now be significantly faster.
0.2 190 |

Fixed non-fatal bug that caused a request for one frame beyond the end of the 191 | clip. Changed to in-place filtering so could squeeze a few optimizations here 192 | and there. Changed too-high defaults. First Avisynth 2.5/YV12 release.

193 |
0.1First release. Alpha code.
200 | 201 | 202 | -------------------------------------------------------------------------------- /FluxSmooth/FilterDef.cpp: -------------------------------------------------------------------------------- 1 | // FluxSmooth 2 | // Avisynth filter for spatio-temporal smoothing of fluctuations 3 | // 4 | // By Ross Thomas 5 | // 6 | // There is no copyright on this code, and there are no conditions 7 | // on its distribution or use. Do with it what you will. 8 | 9 | #ifdef AVS_WINDOWS 10 | #include 11 | #else 12 | #include "avs/posix.h" 13 | #endif 14 | #include 15 | #include "avisynth.h" 16 | #include "FluxSmooth.h" 17 | #include 18 | 19 | FluxSmoothST::FluxSmoothST(PClip _child, int _temporal_threshold, int _spatial_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env) 20 | : GenericVideoFilter(_child), 21 | spatial_threshold(_spatial_threshold), 22 | temporal_threshold(_temporal_threshold), 23 | opt(_opt) 24 | { 25 | assert(temporal_threshold >= -1); 26 | assert(spatial_threshold >= -1); 27 | assert(!((-1 == temporal_threshold) && (-1 == spatial_threshold))); 28 | assert(env); 29 | 30 | // division table 1/1, 1/2, ... 
1/11 31 | // only 1..11 is valid 32 | scaletab[0] = 0; 33 | scaletab[1] = 32767; 34 | for (int i = 2; i < 16; ++i) 35 | scaletab[i] = (int)(32768.0 / i + 0.5); 36 | 37 | const bool goodAVX512 = ((env->GetCPUFlags() & CPUF_AVX512F) == CPUF_AVX512F) && (env->GetCPUFlags() & CPUF_AVX512BW) == CPUF_AVX512BW; 38 | 39 | #ifndef FLUXSMOOTH_AVX512_ENABLED 40 | if (opt == USE_OPT_AVX512) 41 | env->ThrowError("FluxSmoothST: cannot apply opt: this DLL version does not support AVX512"); 42 | #endif 43 | 44 | if (opt == USE_OPT_AVX512 && !goodAVX512) 45 | env->ThrowError("FluxSmoothST: cannot apply opt: AVX512F and AVX512BW is needed"); 46 | if (opt == USE_OPT_AVX2 && !(env->GetCPUFlags() & CPUF_AVX2)) 47 | env->ThrowError("FluxSmoothST: cannot apply opt: AVX2 is not supported"); 48 | if (opt == USE_OPT_SSE41 && !(env->GetCPUFlags() & CPUF_SSE4_1)) 49 | env->ThrowError("FluxSmoothST: cannot apply opt: SSE4.1 is not supported"); 50 | if (opt == USE_OPT_SSE2 && !(env->GetCPUFlags() & CPUF_SSE2)) 51 | env->ThrowError("FluxSmoothST: cannot apply opt: SSE2 is not supported"); 52 | 53 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? 
planes_y : planes_r; 54 | int planecount = std::min(vi.NumComponents(), 3); 55 | int bits_per_pixel = vi.BitsPerComponent(); 56 | 57 | for (int i = 0; i < planecount; i++) { 58 | if (vi.IsRGB()) 59 | processPlane[i] = true; 60 | else if (i == 0) // Y 61 | processPlane[i] = _luma; 62 | else 63 | processPlane[i] = _chroma; 64 | 65 | const int actual_width = vi.width >> vi.GetPlaneWidthSubsampling(current_planes[i]); 66 | if (bits_per_pixel == 8) { 67 | #ifdef FLUXSMOOTH_AVX512_ENABLED 68 | if ((actual_width >= 1 + 64 + 1) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 69 | proc_ST[i] = fluxST_avx512; 70 | else 71 | #endif 72 | if ((actual_width >= 1 + 32 + 1) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 73 | proc_ST[i] = fluxST_avx2; 74 | else if ((actual_width >= 1 + 16 + 1) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 75 | proc_ST[i] = fluxST_sse41; 76 | else if ((actual_width >= 1 + 16 + 1) && (((env->GetCPUFlags() & CPUF_SSE2) == CPUF_SSE2 && opt < 0) || opt >= USE_OPT_SSE2)) 77 | proc_ST[i] = fluxST_sse2; 78 | else 79 | proc_ST[i] = fluxST_C; 80 | } 81 | else { 82 | #ifdef FLUXSMOOTH_AVX512_ENABLED 83 | if ((actual_width >= 1 + 32 + 1) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 84 | proc_ST[i] = fluxST_avx512_uint16; 85 | else 86 | #endif 87 | if ((actual_width >= 1 + 16 + 1) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 88 | proc_ST[i] = fluxST_avx2_uint16; 89 | else if ((actual_width >= 1 + 8 + 1) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 90 | proc_ST[i] = fluxST_sse41_uint16; 91 | else 92 | proc_ST[i] = fluxST_C; 93 | } 94 | } 95 | } 96 | 97 | static void copy_plane(PVideoFrame &destf, PVideoFrame &currf, int plane, IScriptEnvironment *env) { 98 | const uint8_t* srcp = currf->GetReadPtr(plane); 99 | int src_pitch = currf->GetPitch(plane); 100 | int height = 
currf->GetHeight(plane); 101 | int row_size = currf->GetRowSize(plane); 102 | uint8_t* destp = destf->GetWritePtr(plane); 103 | int dst_pitch = destf->GetPitch(plane); 104 | env->BitBlt(destp, dst_pitch, srcp, src_pitch, row_size, height); 105 | } 106 | 107 | PVideoFrame __stdcall FluxSmoothST::GetFrame(int n, IScriptEnvironment * env) 108 | { 109 | const uint8_t* srcp; 110 | const uint8_t* prevp; 111 | const uint8_t* nextp; 112 | uint8_t* destp; 113 | int src_pitch, dst_pitch, prv_pitch, nxt_pitch, row_size, height; 114 | 115 | PVideoFrame currf = child->GetFrame(n, env); 116 | PVideoFrame destf = env->NewVideoFrame(vi); 117 | 118 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? planes_y : planes_r; 119 | 120 | if (n == 0 || n == vi.num_frames - 1) 121 | { 122 | // 1st or last: not temporal 123 | for (int i = 0; i < vi.NumComponents(); i++) 124 | { 125 | const int plane = current_planes[i]; 126 | copy_plane(destf, currf, plane, env); 127 | } 128 | return destf; 129 | } 130 | 131 | PVideoFrame prevf = child->GetFrame(n - 1, env); 132 | PVideoFrame nextf = child->GetFrame(n + 1, env); 133 | 134 | int planecount = std::min(vi.NumComponents(), 3); 135 | 136 | for (int i = 0; i < planecount; i++) 137 | { 138 | const int plane = current_planes[i]; 139 | if (processPlane[i]) { 140 | dst_pitch = destf->GetPitch(plane); 141 | src_pitch = currf->GetPitch(plane); 142 | prv_pitch = prevf->GetPitch(plane); 143 | nxt_pitch = nextf->GetPitch(plane); 144 | row_size = currf->GetRowSize(plane); 145 | const int width = row_size / vi.ComponentSize(); 146 | height = currf->GetHeight(plane); 147 | srcp = currf->GetReadPtr(plane); 148 | prevp = prevf->GetReadPtr(plane); 149 | nextp = nextf->GetReadPtr(plane); 150 | destp = destf->GetWritePtr(plane); 151 | 152 | // copy top and bottom lines 153 | memcpy(destp + dst_pitch * (height - 1), srcp + src_pitch * (height - 1), row_size); 154 | memcpy(destp, srcp, row_size); 155 | // skip to 2nd line 156 | srcp += src_pitch; 157 | 
prevp += prv_pitch; 158 | nextp += nxt_pitch; 159 | destp += dst_pitch; 160 | height -= 2; // two lines less 161 | 162 | const int bits_per_pixel = vi.BitsPerComponent(); 163 | 164 | proc_ST[i](srcp, src_pitch, prevp, prv_pitch, nextp, nxt_pitch, destp, dst_pitch, width, height, temporal_threshold << (bits_per_pixel - 8), spatial_threshold << (bits_per_pixel - 8), scaletab); 165 | } 166 | else { 167 | copy_plane(destf, currf, plane, env); 168 | } 169 | } 170 | // copy alpha 171 | if (vi.NumComponents() == 4) { 172 | const int plane = PLANAR_A; 173 | copy_plane(destf, currf, plane, env); 174 | } 175 | 176 | return destf; 177 | } 178 | 179 | FluxSmoothT::FluxSmoothT(PClip _child, int _temporal_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env) 180 | : GenericVideoFilter(_child), temporal_threshold(_temporal_threshold), 181 | opt(_opt) 182 | { 183 | assert(temporal_threshold >= -1); 184 | assert(!((-1 == temporal_threshold))); 185 | assert(env); 186 | 187 | // division table 1/1, 1/2, ... 
1/11 188 | // only 1..11 is valid 189 | scaletab[0] = 0; 190 | scaletab[1] = 32767; 191 | for (int i = 2; i < 16; ++i) 192 | scaletab[i] = (int)(32768.0 / i + 0.5); 193 | 194 | const bool goodAVX512 = ((env->GetCPUFlags() & CPUF_AVX512F) == CPUF_AVX512F) && (env->GetCPUFlags() & CPUF_AVX512BW) == CPUF_AVX512BW; 195 | 196 | #ifndef FLUXSMOOTH_AVX512_ENABLED 197 | if (opt == USE_OPT_AVX512) 198 | env->ThrowError("FluxSmoothT: cannot apply opt: this DLL version does not support AVX512"); 199 | #endif 200 | 201 | if (opt == USE_OPT_AVX512 && !goodAVX512) 202 | env->ThrowError("FluxSmoothT: cannot apply opt: AVX512F and AVX512BW is needed"); 203 | if (opt == USE_OPT_AVX2 && !(env->GetCPUFlags() & CPUF_AVX2)) 204 | env->ThrowError("FluxSmoothT: cannot apply opt: AVX2 is not supported"); 205 | if (opt == USE_OPT_SSE41 && !(env->GetCPUFlags() & CPUF_SSE4_1)) 206 | env->ThrowError("FluxSmoothT: cannot apply opt: SSE4.1 is not supported"); 207 | if (opt == USE_OPT_SSE2 && !(env->GetCPUFlags() & CPUF_SSE2)) 208 | env->ThrowError("FluxSmoothT: cannot apply opt: SSE2 is not supported"); 209 | 210 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? 
planes_y : planes_r; 211 | int planecount = std::min(vi.NumComponents(), 3); 212 | int bits_per_pixel = vi.BitsPerComponent(); 213 | 214 | for (int i = 0; i < planecount; i++) { 215 | if (vi.IsRGB()) 216 | processPlane[i] = true; 217 | else if (i == 0) // Y 218 | processPlane[i] = _luma; 219 | else 220 | processPlane[i] = _chroma; 221 | 222 | const int actual_width = vi.width >> vi.GetPlaneWidthSubsampling(current_planes[i]); 223 | 224 | if (bits_per_pixel == 8) { 225 | #ifdef FLUXSMOOTH_AVX512_ENABLED 226 | if ((actual_width >= 64) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 227 | proc_T[i] = fluxT_avx512; 228 | else 229 | #endif 230 | if ((actual_width >= 32) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 231 | proc_T[i] = fluxT_avx2; 232 | else if ((actual_width >= 16) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 233 | proc_T[i] = fluxT_sse41; 234 | else if ((actual_width >= 16) && (((env->GetCPUFlags() & CPUF_SSE2) == CPUF_SSE2 && opt < 0) || opt >= USE_OPT_SSE2)) 235 | proc_T[i] = fluxT_sse2; 236 | else 237 | proc_T[i] = fluxT_C; 238 | } 239 | else { 240 | #ifdef FLUXSMOOTH_AVX512_ENABLED 241 | if ((actual_width >= 32) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 242 | proc_T[i] = fluxT_avx512_uint16; 243 | else 244 | #endif 245 | if ((actual_width >= 16) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 246 | proc_T[i] = fluxT_avx2_uint16; 247 | else if ((actual_width >= 8) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 248 | proc_T[i] = fluxT_sse41_uint16; 249 | else 250 | proc_T[i] = fluxT_C; 251 | } 252 | } 253 | } 254 | 255 | PVideoFrame __stdcall FluxSmoothT::GetFrame(int n, IScriptEnvironment * env) 256 | { 257 | const uint8_t* srcp; 258 | const uint8_t* prevp; 259 | const uint8_t* nextp; 260 | uint8_t* destp; 261 | int src_pitch, dst_pitch, prv_pitch, nxt_pitch, 
row_size, height; 262 | 263 | PVideoFrame currf = child->GetFrame(n, env); 264 | PVideoFrame destf = env->NewVideoFrame(vi); 265 | 266 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? planes_y : planes_r; 267 | 268 | if (n == 0 || n == vi.num_frames - 1) 269 | { 270 | // 1st or last: simple copy 271 | for (int i = 0; i < vi.NumComponents(); i++) 272 | { 273 | const int plane = current_planes[i]; 274 | copy_plane(destf, currf, plane, env); 275 | } 276 | return destf; 277 | } 278 | 279 | PVideoFrame prevf = child->GetFrame(n - 1, env); 280 | PVideoFrame nextf = child->GetFrame(n + 1, env); 281 | 282 | int planecount = std::min(vi.NumComponents(), 3); 283 | 284 | for (int i = 0; i < planecount; i++) 285 | { 286 | const int plane = current_planes[i]; 287 | if (processPlane[i]) { 288 | dst_pitch = destf->GetPitch(plane); 289 | src_pitch = currf->GetPitch(plane); 290 | prv_pitch = prevf->GetPitch(plane); 291 | nxt_pitch = nextf->GetPitch(plane); 292 | row_size = currf->GetRowSize(plane); 293 | const int width = row_size / vi.ComponentSize(); 294 | height = currf->GetHeight(plane); 295 | srcp = currf->GetReadPtr(plane); 296 | prevp = prevf->GetReadPtr(plane); 297 | nextp = nextf->GetReadPtr(plane); 298 | destp = destf->GetWritePtr(plane); 299 | 300 | const int bits_per_pixel = vi.BitsPerComponent(); 301 | 302 | proc_T[i](srcp, src_pitch, prevp, prv_pitch, nextp, nxt_pitch, destp, dst_pitch, width, height, temporal_threshold << (bits_per_pixel - 8), scaletab); 303 | } 304 | else { 305 | copy_plane(destf, currf, plane, env); 306 | } 307 | } 308 | // copy alpha 309 | if (vi.NumComponents() == 4) { 310 | const int plane = PLANAR_A; 311 | copy_plane(destf, currf, plane, env); 312 | } 313 | 314 | return destf; 315 | } 316 | 317 | AVSValue __cdecl Create_FluxSmoothT(AVSValue args, void * user_data, IScriptEnvironment * env) 318 | { 319 | enum ARGS { CLIP, TEMPORAL_THRESHOLD, LUMA, CHROMA, OPT }; 320 | 321 | PClip clip = args[CLIP].AsClip(); 322 | int 
temporal_threshold = args[TEMPORAL_THRESHOLD].AsInt(7); 323 | bool luma = args[LUMA].AsBool(true); 324 | bool chroma = args[CHROMA].AsBool(true); 325 | int opt = args[OPT].AsInt(-1); 326 | 327 | if (temporal_threshold < 0) 328 | env->ThrowError("FluxSmoothT: temporal_threshold must be >= 0"); 329 | 330 | const VideoInfo & vi = clip->GetVideoInfo(); 331 | 332 | // YUY2 support only through YV16 autoconversion 333 | if (vi.IsYUY2()) { 334 | AVSValue new_args[1] = { clip }; 335 | clip = env->Invoke("ConvertToYV16", AVSValue(new_args, 1)).AsClip(); 336 | clip = new FluxSmoothT(clip, temporal_threshold, luma, chroma, opt, env); 337 | AVSValue new_args2[1] = { clip }; 338 | clip = env->Invoke("ConvertToYUY2", AVSValue(new_args2, 1)).AsClip(); 339 | return clip; 340 | } 341 | 342 | if (vi.BitsPerComponent() == 32) 343 | env->ThrowError("FluxSmoothT: 32 bit float formats not supported"); 344 | 345 | if (vi.IsY() || vi.IsYV411() || vi.Is420() || vi.Is422() || vi.Is444() || vi.IsPlanarRGB() || vi.IsPlanarRGBA()) 346 | return new FluxSmoothT(clip, temporal_threshold, luma, chroma, opt, env); 347 | else 348 | env->ThrowError("FluxSmoothT: Clip must be in Y or planar YUV(A), RGB(A) or YUY2 format (8-16 bits)"); 349 | 350 | return 0; // Unreached 351 | } 352 | 353 | AVSValue __cdecl Create_FluxSmoothST(AVSValue args, void * user_data, IScriptEnvironment * env) 354 | { 355 | enum ARGS { CLIP, TEMPORAL_THRESHOLD, SPATIAL_THRESHOLD, LUMA, CHROMA, OPT }; 356 | 357 | PClip clip = args[CLIP].AsClip(); 358 | int temporal_threshold = args[TEMPORAL_THRESHOLD].AsInt(7); 359 | int spatial_threshold = args[SPATIAL_THRESHOLD].AsInt(7); 360 | bool luma = args[LUMA].AsBool(true); 361 | bool chroma = args[CHROMA].AsBool(true); 362 | int opt = args[OPT].AsInt(-1); 363 | 364 | if (temporal_threshold < -1) 365 | env->ThrowError("FluxSmoothST: temporal_threshold must be >= -1"); 366 | if (spatial_threshold < -1) 367 | env->ThrowError("FluxSmoothST: spatial_threshold must be >= -1"); 368 | if (-1 == 
temporal_threshold && -1 == spatial_threshold) 369 | env->ThrowError("FluxSmoothST: Both thresholds cannot be -1"); 370 | 371 | const VideoInfo & vi = clip->GetVideoInfo(); 372 | 373 | // YUY2 support only through YV16 autoconversion 374 | if (vi.IsYUY2()) { 375 | AVSValue new_args[1] = { clip }; 376 | clip = env->Invoke("ConvertToYV16", AVSValue(new_args, 1)).AsClip(); 377 | clip = new FluxSmoothST(clip, temporal_threshold, spatial_threshold, luma, chroma, opt, env); 378 | AVSValue new_args2[1] = { clip }; 379 | clip = env->Invoke("ConvertToYUY2", AVSValue(new_args2, 1)).AsClip(); 380 | return clip; 381 | } 382 | 383 | if (vi.BitsPerComponent() == 32) 384 | env->ThrowError("FluxSmoothST: 32 bit float formats not supported"); 385 | 386 | if (vi.IsY() || vi.IsYV411() || vi.Is420() || vi.Is422() || vi.Is444() || vi.IsPlanarRGB() || vi.IsPlanarRGBA()) 387 | return new FluxSmoothST(clip, temporal_threshold, spatial_threshold, luma, chroma, opt, env); 388 | else 389 | env->ThrowError("FluxSmoothST: Clip must be in Y or planar YUV(A), RGB(A) or YUY2 format (8-16 bits)"); 390 | 391 | return 0; // Unreached 392 | } 393 | 394 | /* New 2.6 requirement!!! */ 395 | // Declare and initialise server pointers static storage. 396 | const AVS_Linkage *AVS_linkage = 0; 397 | 398 | /* New 2.6 requirement!!! */ 399 | // DLL entry point called from LoadPlugin() to setup a user plugin. 400 | extern "C" __declspec(dllexport) const char* __stdcall 401 | AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors) { 402 | /* New 2.6 requirement!!! */ 403 | // Save the server pointers. 
404 | AVS_linkage = vectors; 405 | env->AddFunction("FluxSmoothT", "c[temporal_threshold]i[luma]b[chroma]b[opt]i", Create_FluxSmoothT, 0); 406 | env->AddFunction("FluxSmoothST", "c[temporal_threshold]i[spatial_threshold]i[luma]b[chroma]b[opt]i", Create_FluxSmoothST, 0); 407 | return "FluxSmooth"; 408 | } 409 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release LLVM 10 | Win32 11 | 12 | 13 | Release LLVM 14 | x64 15 | 16 | 17 | Release XP 18 | Win32 19 | 20 | 21 | Release XP 22 | x64 23 | 24 | 25 | Release 26 | Win32 27 | 28 | 29 | Debug 30 | x64 31 | 32 | 33 | Release 34 | x64 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | AdvancedVectorExtensions2 54 | AdvancedVectorExtensions2 55 | AdvancedVectorExtensions2 56 | AdvancedVectorExtensions2 57 | AdvancedVectorExtensions2 58 | AdvancedVectorExtensions2 59 | AdvancedVectorExtensions2 60 | AdvancedVectorExtensions2 61 | 62 | 63 | -mavx512bw -mavx512f %(AdditionalOptions) 64 | -mavx512bw -mavx512f %(AdditionalOptions) 65 | NotSet 66 | NotSet 67 | NotSet 68 | NotSet 69 | NotSet 70 | /arch:AVX512 %(AdditionalOptions) 71 | /arch:AVX512 %(AdditionalOptions) 72 | /arch:AVX512 %(AdditionalOptions) 73 | /arch:AVX512 %(AdditionalOptions) 74 | /arch:AVX512 %(AdditionalOptions) 75 | /arch:AVX512 %(AdditionalOptions) 76 | 77 | 78 | 79 | 80 | 81 | 82 | 15.0 83 | {588984EE-FDBE-4901-894A-32781B765F07} 84 | Win32Proj 85 | FluxSmooth 86 | 10.0 87 | 88 | 89 | 90 | DynamicLibrary 91 | true 92 | v142 93 | MultiByte 94 | 95 | 96 | DynamicLibrary 97 | false 98 | v142 99 | true 100 | MultiByte 101 | 102 | 103 | DynamicLibrary 104 | false 105 | v141_xp 106 | true 107 | MultiByte 108 | 109 | 110 | DynamicLibrary 111 | false 112 | llvm 113 | true 114 | MultiByte 115 | 116 | 117 | DynamicLibrary 118 | 
true 119 | v142 120 | MultiByte 121 | 122 | 123 | DynamicLibrary 124 | false 125 | v142 126 | true 127 | MultiByte 128 | 129 | 130 | DynamicLibrary 131 | false 132 | v141_xp 133 | true 134 | MultiByte 135 | 136 | 137 | DynamicLibrary 138 | false 139 | llvm 140 | true 141 | MultiByte 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | true 175 | $(SolutionDir)$(Platform)\$(Configuration)\ 176 | $(Platform)\$(Configuration)\ 177 | 178 | 179 | true 180 | 181 | 182 | false 183 | $(SolutionDir)$(Platform)\$(Configuration)\ 184 | $(Platform)\$(Configuration)\ 185 | 186 | 187 | false 188 | $(SolutionDir)$(Platform)\$(Configuration)\ 189 | $(Platform)\$(Configuration)\ 190 | 191 | 192 | false 193 | $(SolutionDir)$(Platform)\$(Configuration)\ 194 | $(Platform)\$(Configuration)\ 195 | 196 | 197 | false 198 | 199 | 200 | false 201 | 202 | 203 | false 204 | $(SolutionDir)$(Platform)\$(Configuration)\ 205 | $(Platform)\$(Configuration)\ 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | NotUsing 218 | Level3 219 | Disabled 220 | true 221 | WIN32;_DEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 222 | true 223 | StreamingSIMDExtensions2 224 | stdcpp17 225 | NoListing 226 | 227 | 228 | Windows 229 | true 230 | 231 | 232 | 233 | 234 | NotUsing 235 | Level3 236 | Disabled 237 | true 238 | _DEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 239 | true 240 | stdcpp17 241 | NoListing 242 | 243 | 244 | Windows 245 | true 246 | 247 | 248 | 249 | 250 | NotUsing 251 | Level3 252 | MaxSpeed 253 | true 254 | true 255 | true 256 | WIN32;NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 257 | true 258 | Speed 259 | StreamingSIMDExtensions2 260 | stdcpp17 261 | AssemblyAndSourceCode 262 | false 263 | true 264 | MultiThreaded 265 | 266 | 267 | Windows 268 | true 269 | true 
270 | true 271 | 272 | 273 | 274 | 275 | NotUsing 276 | Level3 277 | MaxSpeed 278 | true 279 | true 280 | true 281 | WIN32;NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 282 | true 283 | Speed 284 | StreamingSIMDExtensions2 285 | /Zc:threadSafeInit- %(AdditionalOptions) 286 | stdcpp17 287 | AssemblyAndSourceCode 288 | false 289 | true 290 | MultiThreaded 291 | 292 | 293 | Windows 294 | true 295 | true 296 | true 297 | 298 | 299 | 300 | 301 | NotUsing 302 | Level3 303 | MaxSpeed 304 | true 305 | true 306 | WIN32;NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 307 | true 308 | Speed 309 | StreamingSIMDExtensions2 310 | stdcpp17 311 | NoListing 312 | false 313 | true 314 | MultiThreaded 315 | AnySuitable 316 | -Wno-gcc-compat %(AdditionalOptions) 317 | $(IntDir) 318 | false 319 | 320 | 321 | Windows 322 | true 323 | true 324 | false 325 | Default 326 | 327 | 328 | 329 | 330 | NotUsing 331 | Level3 332 | MaxSpeed 333 | true 334 | true 335 | true 336 | NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 337 | true 338 | Speed 339 | stdcpp17 340 | AssemblyAndSourceCode 341 | false 342 | true 343 | MultiThreaded 344 | 345 | 346 | Windows 347 | true 348 | true 349 | true 350 | 351 | 352 | 353 | 354 | NotUsing 355 | Level3 356 | MaxSpeed 357 | true 358 | true 359 | true 360 | NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 361 | true 362 | Speed 363 | /Zc:threadSafeInit- %(AdditionalOptions) 364 | stdcpp17 365 | AssemblyAndSourceCode 366 | false 367 | true 368 | MultiThreaded 369 | 370 | 371 | Windows 372 | true 373 | true 374 | true 375 | 376 | 377 | 378 | 379 | NotUsing 380 | Level3 381 | MaxSpeed 382 | true 383 | true 384 | NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 385 | true 386 | Speed 387 | stdcpp17 388 | AssemblyAndSourceCode 389 | false 390 | true 391 | MultiThreaded 392 | AnySuitable 393 | -Wno-gcc-compat %(AdditionalOptions) 394 | 
StreamingSIMDExtensions2 395 | 396 | 397 | Windows 398 | true 399 | true 400 | false 401 | Default 402 | 403 | 404 | 405 | 406 | 407 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth_avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "FluxSmooth.h" 2 | #include 3 | #include "stdint.h" 4 | #include "immintrin.h" // AVX 5 | 6 | #if !defined(__AVX2__) 7 | #error "This source file will only work properly when compiled with AVX2 option" 8 | #endif 9 | 10 | /************************************ 11 | // Helpers, missing intrinsics 12 | ************************************/ 13 | 14 | #define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a) 15 | 16 | #define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a) 17 | 18 | // does not exist 19 | static AVS_FORCEINLINE __m256i _mm256_cmpgt_epu8(__m256i x, __m256i y) 20 | { 21 | // Returns 0xFF where x > y: 22 | return _mm256_andnot_si256( 23 | _mm256_cmpeq_epi8(x, y), 24 | _mm256_cmpeq_epi8(_mm256_max_epu8(x, y), x) 25 | ); 26 | } 27 | 28 | AVS_FORCEINLINE __m256i _mm256_cmpge_epi16(__m256i x, __m256i y) 29 | { 30 | // Returns 0xFFFF where x >= y: 31 | return _mm256_or_si256(_mm256_cmpeq_epi16(x, y), _mm256_cmpgt_epi16(x, y)); 32 | } 33 | 34 | #define _mm256_cmple_epi16(a, b) _mm256_cmpge_epi16(b, a) 35 | 36 | /************************************ 37 | // Helpers 38 | ************************************/ 39 | 40 | static AVS_FORCEINLINE void check_neighbour_simd(__m256i &neighbour, __m256i ¢er, __m256i &threshold, 41 | __m256i &sum_lo, __m256i &sum_hi, __m256i &cnt) 42 | { 43 | auto n_minus_c = _mm256_subs_epu8(neighbour, center); 44 | auto c_minus_n = _mm256_subs_epu8(center, neighbour); 45 | auto absdiff = _mm256_or_si256(n_minus_c, c_minus_n); 46 | auto abs_is_lessthanoreq_thresh = _mm256_cmple_epu8(absdiff, threshold); 47 | // count. increment when true. 
We simply sub the mask value 00 (0) or FF (-1) 48 | cnt = _mm256_sub_epi8(cnt, abs_is_lessthanoreq_thresh); 49 | // increase sum elements by neighbour where true, that is mask is FF 50 | // sum is 16 bits 51 | auto masked_neighbour = _mm256_and_si256(abs_is_lessthanoreq_thresh, neighbour); 52 | auto zero = _mm256_setzero_si256(); 53 | auto masked_neighbour_lo = _mm256_unpacklo_epi8(masked_neighbour, zero); 54 | auto masked_neighbour_hi = _mm256_unpackhi_epi8(masked_neighbour, zero); 55 | sum_lo = _mm256_add_epi16(sum_lo, masked_neighbour_lo); 56 | sum_hi = _mm256_add_epi16(sum_hi, masked_neighbour_hi); 57 | 58 | /* 59 | if (std::abs(neighbour - center) <= threshold) 60 | { 61 | sum += neighbour; 62 | ++cnt; 63 | } 64 | */ 65 | } 66 | 67 | static AVS_FORCEINLINE void check_neighbour_simd_uint16(__m256i &neighbour, __m256i ¢er, __m256i &threshold, 68 | __m256i &sum_lo, __m256i &sum_hi, __m256i &cnt, const __m256i &make_signed_word) 69 | { 70 | // threshold is shifted to the "signed" int16 domain 71 | auto n_minus_c = _mm256_subs_epu16(neighbour, center); 72 | auto c_minus_n = _mm256_subs_epu16(center, neighbour); 73 | auto absdiff = _mm256_or_si256(n_minus_c, c_minus_n); 74 | // absdiff <= threshold ==> !(absdiff > threshold) 75 | // FIXME make it a bit faster: cmpgt and later: andnot, and count in a reverse way (instead of increase-when-match use decrease-by-non-match) 76 | auto abs_is_lessthanoreq_thresh = _mm256_cmple_epi16(_mm256_add_epi16(absdiff, make_signed_word), threshold); 77 | // count. increment when true. 
We simply sub the mask value 0000 (0) or FFFF (-1) 78 | cnt = _mm256_sub_epi16(cnt, abs_is_lessthanoreq_thresh); 79 | // increase sum elements by neighbour where true, that is mask is FF 80 | // sum is 16 bits 81 | auto masked_neighbour = _mm256_and_si256(abs_is_lessthanoreq_thresh, neighbour); 82 | auto zero = _mm256_setzero_si256(); 83 | auto masked_neighbour_lo = _mm256_unpacklo_epi16(masked_neighbour, zero); 84 | auto masked_neighbour_hi = _mm256_unpackhi_epi16(masked_neighbour, zero); 85 | sum_lo = _mm256_add_epi32(sum_lo, masked_neighbour_lo); 86 | sum_hi = _mm256_add_epi32(sum_hi, masked_neighbour_hi); 87 | 88 | /* 89 | if (std::abs(neighbour - center) <= threshold) 90 | { 91 | sum += neighbour; 92 | ++cnt; 93 | } 94 | */ 95 | } 96 | 97 | /************************************ 98 | // Temporal only AVX2, 8 bit 99 | ************************************/ 100 | 101 | static AVS_FORCEINLINE void fluxT_core_avx2(const BYTE * currp, 102 | const BYTE * prevp, const BYTE * nextp, 103 | BYTE * destp, int x, 104 | __m256i &temporal_threshold_vector, 105 | __m256i &scaletab_lut_lsbs, 106 | __m256i &scaletab_lut_msbs 107 | ) 108 | { 109 | auto b = _mm256_loadu_si256(reinterpret_cast(currp + x)); 110 | auto pbt = _mm256_loadu_si256(reinterpret_cast(prevp + x)); 111 | auto nbt = _mm256_loadu_si256(reinterpret_cast(nextp + x)); 112 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 113 | // int pdiff = pbt - b, ndiff = nbt - b; 114 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 115 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 116 | auto pbt_lessthan_b = _mm256_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 117 | auto nbt_lessthan_b = _mm256_cmpgt_epu8(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 118 | auto pbt_greaterthan_b = _mm256_cmpgt_epu8(pbt, b); // FF where pbt > b 119 | auto nbt_greaterthan_b = _mm256_cmpgt_epu8(nbt, b); // FF where nbt > b 120 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 121 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 122 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 123 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 124 | 125 | // int sum = b, cnt = 1; 126 | auto zero = _mm256_setzero_si256(); 127 | auto sum_lo = _mm256_unpacklo_epi8(b, zero); 128 | auto sum_hi = _mm256_unpackhi_epi8(b, zero); 129 | auto cnt = _mm256_set1_epi8(1); 130 | 131 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 132 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 133 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 134 | 135 | // factor1 = sum*2 + cnt, sum elements are 16 bits 136 | auto cnt_lo = _mm256_unpacklo_epi8(cnt, zero); 137 | auto cnt_hi = _mm256_unpackhi_epi8(cnt, zero); 138 | auto factor1_lo = _mm256_add_epi16(_mm256_add_epi16(sum_lo, sum_lo), cnt_lo); 139 | auto factor1_hi = _mm256_add_epi16(_mm256_add_epi16(sum_hi, sum_hi), cnt_hi); 140 | // factor2 = scaletab[cnt] 141 | auto factor2_lsb = _mm256_shuffle_epi8(scaletab_lut_lsbs, cnt); 142 | auto factor2_msb = _mm256_shuffle_epi8(scaletab_lut_msbs, cnt); 143 | auto factor2_lo = _mm256_unpacklo_epi8(factor2_lsb, factor2_msb); 144 | auto factor2_hi = _mm256_unpackhi_epi8(factor2_lsb, factor2_msb); 145 | // finally mul and shift 146 | auto mulres_lo = _mm256_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 147 | auto mulres_hi = _mm256_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 148 | // move back to 16x8 bits 149 | auto result = _mm256_packus_epi16(mulres_lo, 
mulres_hi); 150 | 151 | // decide if original pixel is kept 152 | auto finalres = _mm256_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 153 | 154 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x), finalres); 155 | } 156 | 157 | 158 | // Temporal only 159 | void fluxT_avx2(const uint8_t* currp, const int src_pitch, 160 | const uint8_t * prevp, const int prv_pitch, 161 | const uint8_t * nextp, const int nxt_pitch, 162 | uint8_t* destp, const int dst_pitch, 163 | const int width, int height, 164 | int temporal_threshold, 165 | short *scaletab) 166 | { 167 | __m256i scaletab_lut_lsbs; 168 | __m256i scaletab_lut_msbs; 169 | for (int i = 0; i < 16; i++) { 170 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 171 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 172 | // same for hi 128 173 | ((uint8_t*)&scaletab_lut_lsbs)[i+16] = scaletab[i] & 0xFF; 174 | ((uint8_t*)&scaletab_lut_msbs)[i+16] = (scaletab[i] >> 8) & 0xFF; 175 | } 176 | 177 | const int xcnt = width; 178 | 179 | __m256i temporal_threshold_vector = _mm256_set1_epi8(temporal_threshold); 180 | 181 | const int wmod32 = xcnt / 32 * 32; 182 | const int rest = xcnt - wmod32; 183 | 184 | for (int y = 0; y < height; y++) 185 | { 186 | for (int x = 0; x < wmod32; x += 32) 187 | fluxT_core_avx2(currp, prevp, nextp, destp, x, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 188 | // do rest 189 | if (rest > 0) 190 | fluxT_core_avx2(currp, prevp, nextp, destp, xcnt - 32, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 191 | 192 | currp += src_pitch; 193 | prevp += prv_pitch; 194 | nextp += nxt_pitch; 195 | destp += dst_pitch; 196 | } // for y 197 | _mm256_zeroupper(); 198 | } 199 | 200 | /************************************ 201 | // Temporal only AVX2, 16 bit 202 | ************************************/ 203 | 204 | AVS_FORCEINLINE void fluxT_core_avx2_uint16(const uint8_t * currp, const uint8_t* prevp, const uint8_t 
*nextp, uint8_t *destp, int x, 205 | __m256i &temporal_threshold_vector // already shifted to "signed" domain 206 | ) 207 | { 208 | const auto make_signed_word = _mm256_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 209 | 210 | auto b_orig = _mm256_loadu_si256(reinterpret_cast(currp + x)); 211 | auto pbt_orig = _mm256_loadu_si256(reinterpret_cast(prevp + x)); 212 | auto nbt_orig = _mm256_loadu_si256(reinterpret_cast(nextp + x)); 213 | 214 | auto b = _mm256_add_epi16(b_orig, make_signed_word); 215 | auto pbt = _mm256_add_epi16(pbt_orig, make_signed_word); 216 | auto nbt = _mm256_add_epi16(nbt_orig, make_signed_word); 217 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 218 | // int pdiff = pbt - b, ndiff = nbt - b; 219 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 220 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 221 | auto pbt_lessthan_b = _mm256_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 222 | auto nbt_lessthan_b = _mm256_cmpgt_epi16(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 223 | auto pbt_greaterthan_b = _mm256_cmpgt_epi16(pbt, b); // FF where pbt > b 224 | auto nbt_greaterthan_b = _mm256_cmpgt_epi16(nbt, b); // FF where nbt > b 225 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 226 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 227 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 228 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 229 | 230 | // int sum = b, cnt = 1; 231 | auto zero = _mm256_setzero_si256(); 232 | auto sum_lo = _mm256_unpacklo_epi16(b_orig, zero); 233 | auto sum_hi = _mm256_unpackhi_epi16(b_orig, zero); 234 | auto cnt = _mm256_set1_epi16(1); 235 | 236 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 237 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 238 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 239 | 240 | auto cnt_lo = _mm256_unpacklo_epi16(cnt, zero); 241 | auto cnt_hi = _mm256_unpackhi_epi16(cnt, zero); 242 | // Difference from SSE4.1 and C: floating point division 243 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 244 | const auto rounder_half = _mm256_set1_ps(0.5f); 245 | // lower 8 pixels 246 | auto fcnt_lo = _mm256_cvtepi32_ps(cnt_lo); 247 | auto fsum_lo = _mm256_cvtepi32_ps(sum_lo); 248 | // difference from AVX512: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 249 | auto mulres_lo = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_lo, _mm256_rcp_ps(fcnt_lo), rounder_half)); 250 | // upper 8 pixels 251 | auto fcnt_hi = _mm256_cvtepi32_ps(cnt_hi); 252 | auto fsum_hi = _mm256_cvtepi32_ps(sum_hi); 253 | auto mulres_hi = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_hi, _mm256_rcp_ps(fcnt_hi), rounder_half)); 254 | 255 | // move back to 16x16 bits 256 | auto result = _mm256_packus_epi32(mulres_lo, mulres_hi); 257 | 258 | // decide if original pixel is kept 259 | auto finalres = _mm256_blendv_epi8(b_orig, result, mask_either_is_true); // true: second param, false: 1st param 260 | 261 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x), finalres); 262 | } 263 | 264 | // Temporal only 265 | void fluxT_avx2_uint16(const uint8_t* currp, const int src_pitch, 266 | const uint8_t * prevp, const int prv_pitch, 267 | const uint8_t * nextp, const int nxt_pitch, 268 | 
uint8_t* destp, const int dst_pitch, 269 | const int width, int height, 270 | int temporal_threshold, 271 | short *scaletab) 272 | { 273 | const int xcnt = width; 274 | 275 | __m256i temporal_threshold_vector = _mm256_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain 276 | 277 | const int wmod16 = xcnt / 16 * 16; 278 | const int rest = xcnt - wmod16; 279 | 280 | for (int y = 0; y < height; y++) 281 | { 282 | for (int x = 0; x < wmod16; x += 16) 283 | fluxT_core_avx2_uint16(currp, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector); 284 | // do rest 285 | if (rest > 0) 286 | fluxT_core_avx2_uint16(currp, prevp, nextp, destp, (xcnt - 16) * sizeof(uint16_t), temporal_threshold_vector); 287 | 288 | currp += src_pitch; 289 | prevp += prv_pitch; 290 | nextp += nxt_pitch; 291 | destp += dst_pitch; 292 | } // for y 293 | _mm256_zeroupper(); 294 | } 295 | 296 | /************************************ 297 | // Spatial Temporal AVX2, 8 bit 298 | ************************************/ 299 | 300 | AVS_FORCEINLINE void fluxST_core_avx2(const BYTE * currp, const int src_pitch, 301 | const BYTE * prevp, const BYTE * nextp, 302 | BYTE * destp, int x, 303 | __m256i &temporal_threshold_vector, 304 | __m256i &spatial_threshold_vector, 305 | __m256i &scaletab_lut_lsbs, 306 | __m256i &scaletab_lut_msbs 307 | ) 308 | { 309 | // +1: center of 3x3 pixels [+0,+1,+2] 310 | auto b = _mm256_loadu_si256(reinterpret_cast(currp + x + 1)); 311 | auto pbt = _mm256_loadu_si256(reinterpret_cast(prevp + x + 1)); 312 | auto nbt = _mm256_loadu_si256(reinterpret_cast(nextp + x + 1)); 313 | 314 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 315 | // int pdiff = pbt - b, ndiff = nbt - b; 316 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 317 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 318 | auto pbt_lessthan_b = _mm256_cmpgt_epu8(b, pbt); // FF where b > pbt. 
No lt --> gt with exchanged parameters 319 | auto nbt_lessthan_b = _mm256_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 320 | auto pbt_greaterthan_b = _mm256_cmpgt_epu8(pbt, b); // FF where pbt > b 321 | auto nbt_greaterthan_b = _mm256_cmpgt_epu8(nbt, b); // FF where nbt > b 322 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 323 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 324 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 325 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 326 | 327 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 328 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 329 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 330 | 331 | auto pb1 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 0)); 332 | auto pb2 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 1)); 333 | auto pb3 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 2)); 334 | 335 | auto b1 = _mm256_loadu_si256(reinterpret_cast(currp + x + 0)); 336 | auto b2 = _mm256_loadu_si256(reinterpret_cast(currp + x + 2)); 337 | 338 | auto nb1 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 0)); 339 | auto nb2 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 1)); 340 | auto nb3 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 2)); 341 | 342 | // int sum = b, cnt = 1; 343 | auto zero = _mm256_setzero_si256(); 344 | auto sum_lo = _mm256_unpacklo_epi8(b, zero); 345 | auto sum_hi = _mm256_unpackhi_epi8(b, zero); 346 | auto cnt = _mm256_set1_epi8(1); 347 | 348 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 349 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 350 | check_neighbour_simd(pb1, b, 
spatial_threshold_vector, sum_lo, sum_hi, cnt); 351 | check_neighbour_simd(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 352 | check_neighbour_simd(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 353 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 354 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 355 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 356 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 357 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 358 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 359 | 360 | // factor1 = sum*2 + cnt, sum elements are 16 bits 361 | auto cnt_lo = _mm256_unpacklo_epi8(cnt, zero); 362 | auto cnt_hi = _mm256_unpackhi_epi8(cnt, zero); 363 | auto factor1_lo = _mm256_add_epi16(_mm256_add_epi16(sum_lo, sum_lo), cnt_lo); 364 | auto factor1_hi = _mm256_add_epi16(_mm256_add_epi16(sum_hi, sum_hi), cnt_hi); 365 | // factor2 = scaletab[cnt] 366 | auto factor2_lsb = _mm256_shuffle_epi8(scaletab_lut_lsbs, cnt); 367 | auto factor2_msb = _mm256_shuffle_epi8(scaletab_lut_msbs, cnt); 368 | auto factor2_lo = _mm256_unpacklo_epi8(factor2_lsb, factor2_msb); 369 | auto factor2_hi = _mm256_unpackhi_epi8(factor2_lsb, factor2_msb); 370 | // finally mul and shift 371 | auto mulres_lo = _mm256_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 372 | auto mulres_hi = _mm256_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 373 | // move back to 16x8 bits 374 | auto result = _mm256_packus_epi16(mulres_lo, mulres_hi); 375 | 376 | // decide if original pixel is kept 377 | auto finalres = _mm256_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 378 | 379 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x + 1), finalres); 380 | } 381 | 382 | // Spatial Temporal 383 | void 
fluxST_avx2(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 384 | uint8_t* destp, const int dst_pitch, const int width, int height, int temporal_threshold, int spatial_threshold, short *scaletab) 385 | { 386 | __m256i scaletab_lut_lsbs; 387 | __m256i scaletab_lut_msbs; 388 | for (int i = 0; i < 16; i++) { 389 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 390 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 391 | // same for upper 128 392 | ((uint8_t*)&scaletab_lut_lsbs)[i+16] = scaletab[i] & 0xFF; 393 | ((uint8_t*)&scaletab_lut_msbs)[i+16] = (scaletab[i] >> 8) & 0xFF; 394 | } 395 | 396 | // spatial: because of previous and next line involved, function is called 397 | // starting with the 2nd line and with height = (real_height - 2) 398 | const int xcnt = width - 2; // leftmost/rightmost column safety 399 | 400 | __m256i temporal_threshold_vector = _mm256_set1_epi8(temporal_threshold); 401 | __m256i spatial_threshold_vector = _mm256_set1_epi8(spatial_threshold); 402 | 403 | const int wmod32 = xcnt / 32 * 32; 404 | const int rest = xcnt - wmod32; 405 | 406 | for (int y = 0; y < height; y++) 407 | { 408 | destp[0] = currp[0]; // Copy left edge 409 | 410 | for (int x = 0; x < wmod32; x += 32) 411 | fluxST_core_avx2(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 412 | // do rest 413 | if (rest > 0) 414 | fluxST_core_avx2(currp, src_pitch, prevp, nextp, destp, xcnt - 32, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 415 | 416 | destp[width - 1] = currp[width - 1]; // Copy right edge 417 | 418 | currp += src_pitch; 419 | prevp += prv_pitch; 420 | nextp += nxt_pitch; 421 | destp += dst_pitch; 422 | } // for y 423 | _mm256_zeroupper(); 424 | } 425 | 426 | /************************************ 427 | // Spatial Temporal AVX2, 16 bit 428 
| ************************************/ 429 | AVS_FORCEINLINE void fluxST_core_avx2_uint16(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 430 | __m256i &temporal_threshold_vector, // already shifted to "signed" domain 431 | __m256i &spatial_threshold_vector // already shifted to "signed" domain 432 | ) 433 | { 434 | const auto make_signed_word = _mm256_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 435 | // +1: center of 3x3 pixels [+0,+1,+2] 436 | auto b_orig = _mm256_loadu_si256(reinterpret_cast(currp + x + 1 * sizeof(uint16_t))); 437 | auto pbt_orig = _mm256_loadu_si256(reinterpret_cast(prevp + x + 1 * sizeof(uint16_t))); 438 | auto nbt_orig = _mm256_loadu_si256(reinterpret_cast(nextp + x + 1 * sizeof(uint16_t))); 439 | 440 | auto b = _mm256_add_epi16(b_orig, make_signed_word); 441 | auto pbt = _mm256_add_epi16(pbt_orig, make_signed_word); 442 | auto nbt = _mm256_add_epi16(nbt_orig, make_signed_word); 443 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 444 | // int pdiff = pbt - b, ndiff = nbt - b; 445 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 446 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 447 | auto pbt_lessthan_b = _mm256_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 448 | auto nbt_lessthan_b = _mm256_cmpgt_epi16(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 449 | auto pbt_greaterthan_b = _mm256_cmpgt_epi16(pbt, b); // FF where pbt > b 450 | auto nbt_greaterthan_b = _mm256_cmpgt_epi16(nbt, b); // FF where nbt > b 451 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 452 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 453 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 454 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 455 | 456 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 457 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 458 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 459 | 460 | auto pb1 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 0 * sizeof(uint16_t))); 461 | auto pb2 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 1 * sizeof(uint16_t))); 462 | auto pb3 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 2 * sizeof(uint16_t))); 463 | 464 | auto b1 = _mm256_loadu_si256(reinterpret_cast(currp + x + 0 * sizeof(uint16_t))); 465 | auto b2 = _mm256_loadu_si256(reinterpret_cast(currp + x + 2 * sizeof(uint16_t))); 466 | 467 | auto nb1 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 0 * sizeof(uint16_t))); 468 | auto nb2 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 1 * sizeof(uint16_t))); 469 | auto nb3 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 2 * sizeof(uint16_t))); 470 | 471 | // int sum = b, cnt = 1; 472 | auto zero = _mm256_setzero_si256(); 473 | auto sum_lo = _mm256_unpacklo_epi16(b_orig, zero); 474 | auto sum_hi = _mm256_unpackhi_epi16(b_orig, zero); 475 | auto cnt = _mm256_set1_epi16(1); 476 | 477 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 478 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 479 | check_neighbour_simd_uint16(pb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 480 | check_neighbour_simd_uint16(pb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 481 | check_neighbour_simd_uint16(pb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 482 | check_neighbour_simd_uint16(b1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, 
make_signed_word); 483 | check_neighbour_simd_uint16(b2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 484 | check_neighbour_simd_uint16(nb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 485 | check_neighbour_simd_uint16(nb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 486 | check_neighbour_simd_uint16(nb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 487 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 488 | 489 | auto cnt_lo = _mm256_unpacklo_epi16(cnt, zero); 490 | auto cnt_hi = _mm256_unpackhi_epi16(cnt, zero); 491 | // Difference from SSE4.1 and C: floating point division 492 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 493 | const auto rounder_half = _mm256_set1_ps(0.5f); 494 | // lower 8 pixels 495 | auto fcnt_lo = _mm256_cvtepi32_ps(cnt_lo); 496 | auto fsum_lo = _mm256_cvtepi32_ps(sum_lo); 497 | // difference from AVX512: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 498 | auto mulres_lo = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_lo, _mm256_rcp_ps(fcnt_lo), rounder_half)); 499 | // upper 8 pixels 500 | auto fcnt_hi = _mm256_cvtepi32_ps(cnt_hi); 501 | auto fsum_hi = _mm256_cvtepi32_ps(sum_hi); 502 | auto mulres_hi = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_hi, _mm256_rcp_ps(fcnt_hi), rounder_half)); 503 | 504 | // move back to 16x16 bits 505 | auto result = _mm256_packus_epi32(mulres_lo, mulres_hi); 506 | 507 | // decide if original pixel is kept 508 | auto finalres = _mm256_blendv_epi8(b_orig, result, mask_either_is_true); // true: second param, false: 1st param 509 | 510 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x + 1 * sizeof(uint16_t)), finalres); 511 | } 512 | 513 | // Spatial Temporal 514 | void fluxST_avx2_uint16(const uint8_t* currp, const int src_pitch, 515 | const uint8_t * prevp, const int prv_pitch, 516 | const uint8_t * nextp, const int nxt_pitch, 517 | uint8_t* 
destp, const int dst_pitch, 518 | const int width, int height, 519 | int temporal_threshold, 520 | int spatial_threshold, 521 | short *scaletab) 522 | { 523 | 524 | // spatial: because of previous and next line involved, function is called 525 | // starting with the 2nd line and with height = (real_height - 2) 526 | const int xcnt = width - 2; // leftmost/rightmost column safety 527 | 528 | __m256i temporal_threshold_vector = _mm256_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain; 529 | __m256i spatial_threshold_vector = _mm256_set1_epi16(spatial_threshold - 0x8000); // move to signed int16 domain; 530 | 531 | const int wmod16 = xcnt / 16 * 16; 532 | const int rest = xcnt - wmod16; 533 | 534 | for (int y = 0; y < height; y++) 535 | { 536 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 537 | 538 | for (int x = 0; x < wmod16; x += 16) 539 | fluxST_core_avx2_uint16(currp, src_pitch, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 540 | // do rest 541 | if (rest > 0) 542 | fluxST_core_avx2_uint16(currp, src_pitch, prevp, nextp, destp, (xcnt - 16) * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 543 | 544 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 545 | 546 | currp += src_pitch; 547 | prevp += prv_pitch; 548 | nextp += nxt_pitch; 549 | destp += dst_pitch; 550 | } // for y 551 | _mm256_zeroupper(); 552 | } 553 | 554 | 555 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth_avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "FluxSmooth.h" 2 | #include 3 | #include "stdint.h" 4 | #include "immintrin.h" // also includes "zmmintrin.h" for AVX512 and "avx512bwintrin.h" 5 | 6 | #ifdef FLUXSMOOTH_AVX512_ENABLED 7 | 8 | // BW: starting with Skylake X and Cannon Lake. 
9 | #if defined(CLANG) 10 | #if !defined(__AVX512F__) || !defined(__AVX512BW__) 11 | #error "This source file will only work properly when compiled with AVX512F and AVX512BW option. Set -mavx512f -mavx512bw command line options for this file." 12 | #endif 13 | #else 14 | #if defined(GCC) 15 | #if !defined(__AVX512F__) || !defined(__AVX512BW__) 16 | #error "This source file will only work properly when compiled with AVX512F and AVX512BW option. Set -mavx512f -mavx512bw command line options for this file." 17 | #endif 18 | #else 19 | #if !defined(__AVX512BW__) // MSVC may not define __AVX512F__ 20 | #error "This source file will only work properly when compiled with AVX512 option. Set /arch=AVX512 to command line options for this file." 21 | #endif 22 | #endif 23 | #endif 24 | 25 | #if defined(_MSC_VER) && !defined(__clang__) 26 | // As of April 2019, MS version of immintrin.h does not support AVX512BW _k*_mask* functions 27 | // https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html 28 | // Fixed in July 2019, available from VS 2019 16.2 29 | // Note: Avaliable only for v142 platform toolset from 14.22 30 | // v141 and v141_xp using 14.16; _MSC_VER is 1916 (e.g. 
Visual Studio 2017 version 15.9.11) 31 | // v142 using 14.22 is already implementing those mask operations --> 2019 16.2 32 | 33 | #if _MSC_VER < 1922 34 | 35 | AVS_FORCEINLINE __mmask64 _kand_mask64(__mmask64 A, __mmask64 B) // AVX512BW 36 | { 37 | return (__mmask64)(A & B); 38 | } 39 | 40 | AVS_FORCEINLINE __mmask64 _kor_mask64(__mmask64 A, __mmask64 B) // AVX512BW 41 | { 42 | return (__mmask64)(A | B); 43 | } 44 | 45 | AVS_FORCEINLINE __mmask32 _kand_mask32(__mmask32 A, __mmask32 B) // AVX512BW 46 | { 47 | return (__mmask32)(A & B); 48 | } 49 | 50 | AVS_FORCEINLINE __mmask32 _kor_mask32(__mmask32 A, __mmask32 B) // AVX512BW 51 | { 52 | return (__mmask32)(A | B); 53 | } 54 | #endif 55 | #endif 56 | 57 | 58 | /************************************ 59 | // Helpers 60 | ************************************/ 61 | 62 | static AVS_FORCEINLINE void check_neighbour_simd(__m512i &neighbour, __m512i ¢er, __m512i &threshold, 63 | __m512i &sum_lo, __m512i &sum_hi, __m512i &cnt) 64 | { 65 | auto n_minus_c = _mm512_subs_epu8(neighbour, center); // AVX512BW 66 | auto c_minus_n = _mm512_subs_epu8(center, neighbour); // AVX512BW 67 | auto absdiff = _mm512_or_si512(n_minus_c, c_minus_n); // AVX512F 68 | auto abs_is_lessthanoreq_thresh = _mm512_cmple_epu8_mask(absdiff, threshold); // AVX512BW 69 | // count. add 1 (increment) when true. 
70 | cnt = _mm512_add_epi8(cnt, _mm512_maskz_set1_epi8(abs_is_lessthanoreq_thresh, 1)); // AVX512BW 71 | // increase sum elements by neighbour where true 72 | // sum is 16 bits 73 | auto masked_neighbour = _mm512_maskz_mov_epi8(abs_is_lessthanoreq_thresh, neighbour); // AVX512BW 74 | auto zero = _mm512_setzero_si512(); // AVX512F 75 | auto masked_neighbour_lo = _mm512_unpacklo_epi8(masked_neighbour, zero); 76 | auto masked_neighbour_hi = _mm512_unpackhi_epi8(masked_neighbour, zero); 77 | sum_lo = _mm512_add_epi16(sum_lo, masked_neighbour_lo); 78 | sum_hi = _mm512_add_epi16(sum_hi, masked_neighbour_hi); 79 | 80 | /* 81 | if (std::abs(neighbour - center) <= threshold) 82 | { 83 | sum += neighbour; 84 | ++cnt; 85 | } 86 | */ 87 | } 88 | 89 | static AVS_FORCEINLINE void check_neighbour_simd_uint16(__m512i &neighbour, __m512i ¢er, __m512i &threshold, 90 | __m512i &sum_lo, __m512i &sum_hi, __m512i &cnt) 91 | { 92 | auto n_minus_c = _mm512_subs_epu16(neighbour, center); 93 | auto c_minus_n = _mm512_subs_epu16(center, neighbour); 94 | auto absdiff = _mm512_or_si512(n_minus_c, c_minus_n); 95 | // absdiff <= threshold 96 | auto abs_is_lessthanoreq_thresh = _mm512_cmple_epu16_mask(absdiff, threshold); 97 | // count. add 1 (increment) when true. 
98 | cnt = _mm512_add_epi16(cnt, _mm512_maskz_set1_epi16(abs_is_lessthanoreq_thresh, 1)); // AVX512BW 99 | // increase sum elements by neighbour where true, that is mask is FF 100 | // sum is 16 bits 101 | auto masked_neighbour = _mm512_maskz_mov_epi16(abs_is_lessthanoreq_thresh, neighbour); // AVX512BW 102 | auto zero = _mm512_setzero_si512(); 103 | auto masked_neighbour_lo = _mm512_unpacklo_epi16(masked_neighbour, zero); 104 | auto masked_neighbour_hi = _mm512_unpackhi_epi16(masked_neighbour, zero); 105 | sum_lo = _mm512_add_epi32(sum_lo, masked_neighbour_lo); 106 | sum_hi = _mm512_add_epi32(sum_hi, masked_neighbour_hi); 107 | 108 | /* 109 | if (std::abs(neighbour - center) <= threshold) 110 | { 111 | sum += neighbour; 112 | ++cnt; 113 | } 114 | */ 115 | } 116 | 117 | /************************************ 118 | // Temporal only AVX512, 8 bit 119 | ************************************/ 120 | 121 | static AVS_FORCEINLINE void fluxT_core_avx512(const BYTE * currp, 122 | const BYTE * prevp, const BYTE * nextp, 123 | BYTE * destp, int x, 124 | __m512i &temporal_threshold_vector, 125 | __m512i &scaletab_lut_lsbs, 126 | __m512i &scaletab_lut_msbs 127 | ) 128 | { 129 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x)); 130 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x)); 131 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x)); 132 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 133 | // int pdiff = pbt - b, ndiff = nbt - b; 134 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 135 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 136 | auto pbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 137 | auto nbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 138 | auto pbt_greaterthan_b = _mm512_cmpgt_epu8_mask(pbt, b); // FF where pbt > b 139 | auto nbt_greaterthan_b = _mm512_cmpgt_epu8_mask(nbt, b); // FF where nbt > b 140 | __mmask64 both_less = _kand_mask64(pbt_lessthan_b, nbt_lessthan_b); // AVX512BW 141 | __mmask64 both_greater = _kand_mask64(pbt_greaterthan_b, nbt_greaterthan_b); 142 | __mmask64 mask_either_is_true = _kor_mask64(both_less, both_greater); 143 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 144 | 145 | // int sum = b, cnt = 1; 146 | auto zero = _mm512_setzero_si512(); 147 | auto sum_lo = _mm512_unpacklo_epi8(b, zero); 148 | auto sum_hi = _mm512_unpackhi_epi8(b, zero); 149 | auto cnt = _mm512_set1_epi8(1); 150 | 151 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 152 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 153 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 154 | 155 | // factor1 = sum*2 + cnt, sum elements are 16 bits 156 | auto cnt_lo = _mm512_unpacklo_epi8(cnt, zero); 157 | auto cnt_hi = _mm512_unpackhi_epi8(cnt, zero); 158 | auto factor1_lo = _mm512_add_epi16(_mm512_add_epi16(sum_lo, sum_lo), cnt_lo); 159 | auto factor1_hi = _mm512_add_epi16(_mm512_add_epi16(sum_hi, sum_hi), cnt_hi); 160 | // factor2 = scaletab[cnt] 161 | auto factor2_lsb = _mm512_shuffle_epi8(scaletab_lut_lsbs, cnt); 162 | auto factor2_msb = _mm512_shuffle_epi8(scaletab_lut_msbs, cnt); 163 | auto factor2_lo = _mm512_unpacklo_epi8(factor2_lsb, factor2_msb); 164 | auto factor2_hi = _mm512_unpackhi_epi8(factor2_lsb, factor2_msb); 165 | // finally mul and shift 166 | auto mulres_lo = _mm512_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 167 | auto mulres_hi = _mm512_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 168 | // move back to 16x8 bits 169 | auto result = 
_mm512_packus_epi16(mulres_lo, mulres_hi); 170 | 171 | // decide if original pixel is kept 172 | auto finalres = _mm512_mask_mov_epi8(b, mask_either_is_true, result); // true: second param, false: 1st param 173 | 174 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x), finalres); 175 | } 176 | 177 | 178 | // Temporal only 179 | void fluxT_avx512(const uint8_t* currp, const int src_pitch, 180 | const uint8_t * prevp, const int prv_pitch, 181 | const uint8_t * nextp, const int nxt_pitch, 182 | uint8_t* destp, const int dst_pitch, 183 | const int width, int height, 184 | int temporal_threshold, 185 | short *scaletab) 186 | { 187 | __m512i scaletab_lut_lsbs; 188 | __m512i scaletab_lut_msbs; 189 | for (int i = 0; i < 16; i++) { 190 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 191 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 192 | // same for hi 128 193 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16] = scaletab[i] & 0xFF; 194 | ((uint8_t*)&scaletab_lut_msbs)[i + 16] = (scaletab[i] >> 8) & 0xFF; 195 | // same for hilo 128 196 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16*2] = scaletab[i] & 0xFF; 197 | ((uint8_t*)&scaletab_lut_msbs)[i + 16*2] = (scaletab[i] >> 8) & 0xFF; 198 | // same for hihi 128 199 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16*3] = scaletab[i] & 0xFF; 200 | ((uint8_t*)&scaletab_lut_msbs)[i + 16*3] = (scaletab[i] >> 8) & 0xFF; 201 | } 202 | 203 | const int xcnt = width; 204 | 205 | __m512i temporal_threshold_vector = _mm512_set1_epi8(temporal_threshold); 206 | 207 | const int wmod64 = xcnt / 64 * 64; 208 | const int rest = xcnt - wmod64; 209 | 210 | for (int y = 0; y < height; y++) 211 | { 212 | for (int x = 0; x < wmod64; x += 64) 213 | fluxT_core_avx512(currp, prevp, nextp, destp, x, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 214 | // do rest 215 | if (rest > 0) 216 | fluxT_core_avx512(currp, prevp, nextp, destp, xcnt - 64, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 217 | 218 | currp 
+= src_pitch; 219 | prevp += prv_pitch; 220 | nextp += nxt_pitch; 221 | destp += dst_pitch; 222 | } // for y 223 | //_mm512_zeroupper(); 224 | } 225 | 226 | /************************************ 227 | // Temporal only AVX512, 16 bit 228 | ************************************/ 229 | 230 | AVS_FORCEINLINE void fluxT_core_avx512_uint16(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 231 | __m512i &temporal_threshold_vector 232 | ) 233 | { 234 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x)); 235 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x)); 236 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x)); 237 | 238 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 239 | // int pdiff = pbt - b, ndiff = nbt - b; 240 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 241 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 242 | auto pbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, pbt); // 1 where b > pbt. No lt --> gt with exchanged parameters 243 | auto nbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, nbt); // 1 where b > nbt. No lt --> gt with exchanged parameters 244 | auto pbt_greaterthan_b = _mm512_cmpgt_epu16_mask(pbt, b); // 1 where pbt > b 245 | auto nbt_greaterthan_b = _mm512_cmpgt_epu16_mask(nbt, b); // 1 where nbt > b 246 | __mmask32 both_less = _kand_mask32(pbt_lessthan_b, nbt_lessthan_b); 247 | __mmask32 both_greater = _kand_mask32(pbt_greaterthan_b, nbt_greaterthan_b); 248 | __mmask32 mask_either_is_true = _kor_mask32(both_less, both_greater); 249 | // mask will be used at the final decision. Where 1: keep computed result. 
0: keep original pixel (dst=curr) 250 | 251 | // int sum = b, cnt = 1; 252 | auto zero = _mm512_setzero_si512(); 253 | auto sum_lo = _mm512_unpacklo_epi16(b, zero); 254 | auto sum_hi = _mm512_unpackhi_epi16(b, zero); 255 | auto cnt = _mm512_set1_epi16(1); 256 | 257 | check_neighbour_simd_uint16(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 258 | check_neighbour_simd_uint16(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 259 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 260 | 261 | auto cnt_lo = _mm512_unpacklo_epi16(cnt, zero); 262 | auto cnt_hi = _mm512_unpackhi_epi16(cnt, zero); 263 | // Difference from SSE4.1 and C: floating point division 264 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 265 | const auto rounder_half = _mm512_set1_ps(0.5f); 266 | // lower 16 pixels 267 | auto fcnt_lo = _mm512_cvtepi32_ps(cnt_lo); 268 | auto fsum_lo = _mm512_cvtepi32_ps(sum_lo); 269 | // difference from AVX2 or less: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 270 | auto mulres_lo = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_lo, _mm512_rcp14_ps(fcnt_lo), rounder_half)); 271 | // upper 16 pixels 272 | auto fcnt_hi = _mm512_cvtepi32_ps(cnt_hi); 273 | auto fsum_hi = _mm512_cvtepi32_ps(sum_hi); 274 | auto mulres_hi = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_hi, _mm512_rcp14_ps(fcnt_hi), rounder_half)); 275 | 276 | // move back to 32x16 bits 277 | auto result = _mm512_packus_epi32(mulres_lo, mulres_hi); 278 | 279 | // decide if original pixel is kept 280 | auto finalres = _mm512_mask_mov_epi16(b, mask_either_is_true, result); // true: second param, false: 1st param 281 | 282 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x), finalres); 283 | } 284 | 285 | // Temporal only 286 | void fluxT_avx512_uint16(const uint8_t* currp, const int src_pitch, 287 | const uint8_t * prevp, const int prv_pitch, 288 | const uint8_t * nextp, const int nxt_pitch, 289 | uint8_t* destp, const int dst_pitch, 290 | const int 
width, int height, 291 | int temporal_threshold, 292 | short *scaletab) 293 | { 294 | const int xcnt = width; 295 | 296 | __m512i temporal_threshold_vector = _mm512_set1_epi16(temporal_threshold); 297 | 298 | const int wmod32 = xcnt / 32 * 32; 299 | const int rest = xcnt - wmod32; 300 | 301 | for (int y = 0; y < height; y++) 302 | { 303 | for (int x = 0; x < wmod32; x += 32) 304 | fluxT_core_avx512_uint16(currp, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector); 305 | // do rest 306 | if (rest > 0) 307 | fluxT_core_avx512_uint16(currp, prevp, nextp, destp, (xcnt - 32) * sizeof(uint16_t), temporal_threshold_vector); 308 | 309 | currp += src_pitch; 310 | prevp += prv_pitch; 311 | nextp += nxt_pitch; 312 | destp += dst_pitch; 313 | } // for y 314 | //_mm512_zeroupper(); 315 | } 316 | 317 | /************************************ 318 | // Spatial Temporal AVX2, 8 bit 319 | ************************************/ 320 | 321 | AVS_FORCEINLINE void fluxST_core_avx512(const BYTE * currp, const int src_pitch, 322 | const BYTE * prevp, const BYTE * nextp, 323 | BYTE * destp, int x, 324 | __m512i &temporal_threshold_vector, 325 | __m512i &spatial_threshold_vector, 326 | __m512i &scaletab_lut_lsbs, 327 | __m512i &scaletab_lut_msbs 328 | ) 329 | { 330 | // +1: center of 3x3 pixels [+0,+1,+2] 331 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x + 1)); 332 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x + 1)); 333 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x + 1)); 334 | 335 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 336 | // int pdiff = pbt - b, ndiff = nbt - b; 337 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 338 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 339 | auto pbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, pbt); // 1 where b > pbt. No lt --> gt with exchanged parameters 340 | auto nbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, nbt); // 1 where b > nbt. 
No lt --> gt with exchanged parameters 341 | auto pbt_greaterthan_b = _mm512_cmpgt_epu8_mask(pbt, b); // 1 where pbt > b 342 | auto nbt_greaterthan_b = _mm512_cmpgt_epu8_mask(nbt, b); // 1 where nbt > b 343 | __mmask64 both_less = _kand_mask64(pbt_lessthan_b, nbt_lessthan_b); 344 | __mmask64 both_greater = _kand_mask64(pbt_greaterthan_b, nbt_greaterthan_b); 345 | __mmask64 mask_either_is_true = _kor_mask64(both_less, both_greater); 346 | // mask will be used at the final decision. Where 1: keep computed result. 0: keep original pixel (dst=curr) 347 | 348 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 349 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 350 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 351 | 352 | auto pb1 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 0)); 353 | auto pb2 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 1)); 354 | auto pb3 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 2)); 355 | 356 | auto b1 = _mm512_loadu_si512(reinterpret_cast(currp + x + 0)); 357 | auto b2 = _mm512_loadu_si512(reinterpret_cast(currp + x + 2)); 358 | 359 | auto nb1 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 0)); 360 | auto nb2 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 1)); 361 | auto nb3 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 2)); 362 | 363 | // int sum = b, cnt = 1; 364 | auto zero = _mm512_setzero_si512(); 365 | auto sum_lo = _mm512_unpacklo_epi8(b, zero); 366 | auto sum_hi = _mm512_unpackhi_epi8(b, zero); 367 | auto cnt = _mm512_set1_epi8(1); 368 | 369 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 370 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 371 | check_neighbour_simd(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 372 | check_neighbour_simd(pb2, b, spatial_threshold_vector, 
sum_lo, sum_hi, cnt); 373 | check_neighbour_simd(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 374 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 375 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 376 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 377 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 378 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 379 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 380 | 381 | // factor1 = sum*2 + cnt, sum elements are 16 bits 382 | auto cnt_lo = _mm512_unpacklo_epi8(cnt, zero); 383 | auto cnt_hi = _mm512_unpackhi_epi8(cnt, zero); 384 | auto factor1_lo = _mm512_add_epi16(_mm512_add_epi16(sum_lo, sum_lo), cnt_lo); 385 | auto factor1_hi = _mm512_add_epi16(_mm512_add_epi16(sum_hi, sum_hi), cnt_hi); 386 | // factor2 = scaletab[cnt] 387 | auto factor2_lsb = _mm512_shuffle_epi8(scaletab_lut_lsbs, cnt); 388 | auto factor2_msb = _mm512_shuffle_epi8(scaletab_lut_msbs, cnt); 389 | auto factor2_lo = _mm512_unpacklo_epi8(factor2_lsb, factor2_msb); 390 | auto factor2_hi = _mm512_unpackhi_epi8(factor2_lsb, factor2_msb); 391 | // finally mul and shift 392 | auto mulres_lo = _mm512_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 393 | auto mulres_hi = _mm512_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 394 | // move back to 16x8 bits 395 | auto result = _mm512_packus_epi16(mulres_lo, mulres_hi); 396 | 397 | // decide if original pixel is kept 398 | auto finalres = _mm512_mask_mov_epi8(b, mask_either_is_true, result); // true: second param, false: 1st param 399 | 400 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x + 1), finalres); 401 | } 402 | 403 | // Spatial Temporal 404 | void fluxST_avx512(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const 
uint8_t * nextp, const int nxt_pitch, 405 | uint8_t* destp, const int dst_pitch, const int width, int height, int temporal_threshold, int spatial_threshold, short *scaletab) 406 | { 407 | __m512i scaletab_lut_lsbs; 408 | __m512i scaletab_lut_msbs; 409 | for (int i = 0; i < 16; i++) { 410 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 411 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 412 | // same for upper 128 413 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16] = scaletab[i] & 0xFF; 414 | ((uint8_t*)&scaletab_lut_msbs)[i + 16] = (scaletab[i] >> 8) & 0xFF; 415 | // same for upper 2*128 416 | ((uint8_t*)&scaletab_lut_lsbs)[i + 2 * 16] = scaletab[i] & 0xFF; 417 | ((uint8_t*)&scaletab_lut_msbs)[i + 2 * 16] = (scaletab[i] >> 8) & 0xFF; 418 | // same for upper 3*128 419 | ((uint8_t*)&scaletab_lut_lsbs)[i + 3 * 16] = scaletab[i] & 0xFF; 420 | ((uint8_t*)&scaletab_lut_msbs)[i + 3 * 16] = (scaletab[i] >> 8) & 0xFF; 421 | } 422 | 423 | // spatial: because of previous and next line involved, function is called 424 | // starting with the 2nd line and with height = (real_height - 2) 425 | const int xcnt = width - 2; // leftmost/rightmost column safety 426 | 427 | __m512i temporal_threshold_vector = _mm512_set1_epi8(temporal_threshold); 428 | __m512i spatial_threshold_vector = _mm512_set1_epi8(spatial_threshold); 429 | 430 | const int wmod64 = xcnt / 64 * 64; 431 | const int rest = xcnt - wmod64; 432 | 433 | for (int y = 0; y < height; y++) 434 | { 435 | destp[0] = currp[0]; // Copy left edge 436 | 437 | for (int x = 0; x < wmod64; x += 64) 438 | fluxST_core_avx512(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 439 | // do rest 440 | if (rest > 0) 441 | fluxST_core_avx512(currp, src_pitch, prevp, nextp, destp, xcnt - 64, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 442 | 443 | destp[width - 1] = currp[width - 1]; // Copy 
right edge 444 | 445 | currp += src_pitch; 446 | prevp += prv_pitch; 447 | nextp += nxt_pitch; 448 | destp += dst_pitch; 449 | } // for y 450 | //_mm512_zeroupper(); 451 | } 452 | 453 | /************************************ 454 | // Spatial Temporal AVX2, 16 bit 455 | ************************************/ 456 | AVS_FORCEINLINE void fluxST_core_avx512_uint16(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 457 | __m512i &temporal_threshold_vector, 458 | __m512i &spatial_threshold_vector 459 | ) 460 | { 461 | // +1: center of 3x3 pixels [+0,+1,+2] 462 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x + 1 * sizeof(uint16_t))); 463 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x + 1 * sizeof(uint16_t))); 464 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x + 1 * sizeof(uint16_t))); 465 | 466 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 467 | // int pdiff = pbt - b, ndiff = nbt - b; 468 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 469 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 470 | auto pbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, pbt); // 1 where b > pbt. No lt --> gt with exchanged parameters 471 | auto nbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, nbt); // 1 where b > nbt. No lt --> gt with exchanged parameters 472 | auto pbt_greaterthan_b = _mm512_cmpgt_epu16_mask(pbt, b); // 1 where pbt > b 473 | auto nbt_greaterthan_b = _mm512_cmpgt_epu16_mask(nbt, b); // 1 where nbt > b 474 | __mmask32 both_less = _kand_mask32(pbt_lessthan_b, nbt_lessthan_b); 475 | __mmask32 both_greater = _kand_mask32(pbt_greaterthan_b, nbt_greaterthan_b); 476 | __mmask32 mask_either_is_true = _kor_mask32(both_less, both_greater); 477 | // mask will be used at the final decision. Where 1: keep computed result. 
00: keep original pixel (dst=curr) 478 | 479 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 480 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 481 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 482 | 483 | auto pb1 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 0 * sizeof(uint16_t))); 484 | auto pb2 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 1 * sizeof(uint16_t))); 485 | auto pb3 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 2 * sizeof(uint16_t))); 486 | 487 | auto b1 = _mm512_loadu_si512(reinterpret_cast(currp + x + 0 * sizeof(uint16_t))); 488 | auto b2 = _mm512_loadu_si512(reinterpret_cast(currp + x + 2 * sizeof(uint16_t))); 489 | 490 | auto nb1 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 0 * sizeof(uint16_t))); 491 | auto nb2 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 1 * sizeof(uint16_t))); 492 | auto nb3 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 2 * sizeof(uint16_t))); 493 | 494 | // int sum = b, cnt = 1; 495 | auto zero = _mm512_setzero_si512(); 496 | auto sum_lo = _mm512_unpacklo_epi16(b, zero); 497 | auto sum_hi = _mm512_unpackhi_epi16(b, zero); 498 | auto cnt = _mm512_set1_epi16(1); 499 | 500 | check_neighbour_simd_uint16(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 501 | check_neighbour_simd_uint16(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 502 | check_neighbour_simd_uint16(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 503 | check_neighbour_simd_uint16(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 504 | check_neighbour_simd_uint16(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 505 | check_neighbour_simd_uint16(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 506 | check_neighbour_simd_uint16(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 507 | check_neighbour_simd_uint16(nb1, b, 
spatial_threshold_vector, sum_lo, sum_hi, cnt); 508 | check_neighbour_simd_uint16(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 509 | check_neighbour_simd_uint16(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 510 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 511 | 512 | auto cnt_lo = _mm512_unpacklo_epi16(cnt, zero); 513 | auto cnt_hi = _mm512_unpackhi_epi16(cnt, zero); 514 | // Difference from SSE4.1 and C: floating point division 515 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 516 | const auto rounder_half = _mm512_set1_ps(0.5f); 517 | // lower 16 pixels 518 | auto fcnt_lo = _mm512_cvtepi32_ps(cnt_lo); 519 | auto fsum_lo = _mm512_cvtepi32_ps(sum_lo); 520 | // difference from AVX2 or less: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 521 | auto mulres_lo = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_lo, _mm512_rcp14_ps(fcnt_lo), rounder_half)); 522 | // upper 16 pixels 523 | auto fcnt_hi = _mm512_cvtepi32_ps(cnt_hi); 524 | auto fsum_hi = _mm512_cvtepi32_ps(sum_hi); 525 | auto mulres_hi = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_hi, _mm512_rcp14_ps(fcnt_hi), rounder_half)); 526 | 527 | // move back to 32x16 bits 528 | auto result = _mm512_packus_epi32(mulres_lo, mulres_hi); 529 | 530 | // decide if original pixel is kept 531 | auto finalres = _mm512_mask_mov_epi16(b, mask_either_is_true, result); // true: second param, false: 1st param 532 | 533 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x + 1 * sizeof(uint16_t)), finalres); 534 | } 535 | 536 | // Spatial Temporal 537 | void fluxST_avx512_uint16(const uint8_t* currp, const int src_pitch, 538 | const uint8_t * prevp, const int prv_pitch, 539 | const uint8_t * nextp, const int nxt_pitch, 540 | uint8_t* destp, const int dst_pitch, 541 | const int width, int height, 542 | int temporal_threshold, 543 | int spatial_threshold, 544 | short *scaletab) 545 | { 546 | 547 | // spatial: because of previous and next line involved, function is 
called 548 | // starting with the 2nd line and with height = (real_height - 2) 549 | const int xcnt = width - 2; // leftmost/rightmost column safety 550 | 551 | __m512i temporal_threshold_vector = _mm512_set1_epi16(temporal_threshold); 552 | __m512i spatial_threshold_vector = _mm512_set1_epi16(spatial_threshold); 553 | 554 | const int wmod32 = xcnt / 32 * 32; 555 | const int rest = xcnt - wmod32; 556 | 557 | for (int y = 0; y < height; y++) 558 | { 559 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 560 | 561 | for (int x = 0; x < wmod32; x += 32) 562 | fluxST_core_avx512_uint16(currp, src_pitch, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 563 | // do rest 564 | if (rest > 0) 565 | fluxST_core_avx512_uint16(currp, src_pitch, prevp, nextp, destp, (xcnt - 32) * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 566 | 567 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 568 | 569 | currp += src_pitch; 570 | prevp += prv_pitch; 571 | nextp += nxt_pitch; 572 | destp += dst_pitch; 573 | } // for y 574 | //_mm512_zeroupper(); 575 | } 576 | 577 | #endif // FLUXSMOOTH_AVX512_ENABLED 578 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.cpp: -------------------------------------------------------------------------------- 1 | // FluxSmooth 2 | // Avisynth filter for spatio-temporal smoothing of fluctuations 3 | // 4 | // By Ross Thomas 5 | // 6 | // There is no copyright on this code, and there are no conditions 7 | // on its distribution or use. Do with it what you will. 
8 | 9 | #include "FluxSmooth.h" 10 | #include 11 | #include "stdint.h" 12 | #include "emmintrin.h" // SSE2 13 | #include "immintrin.h" // SSSE3 14 | #include "smmintrin.h" // SSE4.1 15 | 16 | /************************************ 17 | // Helpers 18 | ************************************/ 19 | 20 | #ifdef INTEL_INTRINSICS 21 | #if defined(CLANG) || defined(GCC) 22 | __attribute__((__target__("sse4.1"))) 23 | #endif 24 | AVS_FORCEINLINE void check_neighbour_simd(__m128i &neighbour, __m128i ¢er, __m128i &threshold, 25 | __m128i &sum_lo, __m128i &sum_hi, __m128i &cnt) 26 | { 27 | auto n_minus_c = _mm_subs_epu8(neighbour, center); 28 | auto c_minus_n = _mm_subs_epu8(center, neighbour); 29 | auto absdiff = _mm_or_si128(n_minus_c, c_minus_n); 30 | auto abs_is_lessthanoreq_thresh = _mm_cmple_epu8(absdiff, threshold); 31 | // count. increment when true. We simply sub the mask value 00 (0) or FF (-1) 32 | cnt = _mm_sub_epi8(cnt, abs_is_lessthanoreq_thresh); 33 | // increase sum elements by neighbour where true, that is mask is FF 34 | // sum is 16 bits 35 | auto masked_neighbour = _mm_and_si128(abs_is_lessthanoreq_thresh, neighbour); 36 | auto zero = _mm_setzero_si128(); 37 | auto masked_neighbour_lo = _mm_unpacklo_epi8(masked_neighbour, zero); 38 | auto masked_neighbour_hi = _mm_unpackhi_epi8(masked_neighbour, zero); 39 | sum_lo = _mm_add_epi16(sum_lo, masked_neighbour_lo); 40 | sum_hi = _mm_add_epi16(sum_hi, masked_neighbour_hi); 41 | 42 | /* 43 | if (std::abs(neighbour - center) <= threshold) 44 | { 45 | sum += neighbour; 46 | ++cnt; 47 | } 48 | */ 49 | } 50 | 51 | #if defined(CLANG) || defined(GCC) 52 | __attribute__((__target__("sse4.1"))) 53 | #endif 54 | AVS_FORCEINLINE void check_neighbour_simd_uint16(__m128i &neighbour, __m128i ¢er, __m128i &threshold, 55 | __m128i &sum_lo, __m128i &sum_hi, __m128i &cnt, const __m128i &make_signed_word) 56 | { 57 | // threshold is shifted to the "signed" int16 domain 58 | auto n_minus_c = _mm_subs_epu16(neighbour, center); 59 | auto 
c_minus_n = _mm_subs_epu16(center, neighbour); 60 | auto absdiff = _mm_or_si128(n_minus_c, c_minus_n); 61 | // absdiff <= threshold ==> !(absdiff > threshold) 62 | // FIXME make it a bit faster: cmpgt and later: andnot, and count in a reverse way (instead of increase-when-match use decrease-by-non-match) 63 | auto abs_is_lessthanoreq_thresh = _mm_cmple_epi16(_mm_add_epi16(absdiff, make_signed_word), threshold); 64 | // count. increment when true. We simply sub the mask value 0000 (0) or FFFF (-1) 65 | cnt = _mm_sub_epi16(cnt, abs_is_lessthanoreq_thresh); 66 | // increase sum elements by neighbour where true, that is mask is FF 67 | // sum is 16 bits 68 | auto masked_neighbour = _mm_and_si128(abs_is_lessthanoreq_thresh, neighbour); 69 | auto zero = _mm_setzero_si128(); 70 | auto masked_neighbour_lo = _mm_unpacklo_epi16(masked_neighbour, zero); 71 | auto masked_neighbour_hi = _mm_unpackhi_epi16(masked_neighbour, zero); 72 | sum_lo = _mm_add_epi32(sum_lo, masked_neighbour_lo); 73 | sum_hi = _mm_add_epi32(sum_hi, masked_neighbour_hi); 74 | 75 | /* 76 | if (std::abs(neighbour - center) <= threshold) 77 | { 78 | sum += neighbour; 79 | ++cnt; 80 | } 81 | */ 82 | } 83 | 84 | /************************************ 85 | // Temporal only SSE2, 8 bit 86 | ************************************/ 87 | AVS_FORCEINLINE void fluxT_core_sse2(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 88 | __m128i &temporal_threshold_vector, 89 | __m128i &scaletab_lut_lsbs, 90 | __m128i &scaletab_lut_msbs 91 | ) 92 | { 93 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x)); 94 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x)); 95 | auto nbt = _mm_loadu_si128(reinterpret_cast(nextp + x)); 96 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 97 | // int pdiff = pbt - b, ndiff = nbt - b; 98 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 99 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 100 | auto pbt_lessthan_b = 
_mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 101 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 102 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 103 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 104 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 105 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 106 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 107 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 108 | 109 | // int sum = b, cnt = 1; 110 | auto zero = _mm_setzero_si128(); 111 | auto sum_lo = _mm_unpacklo_epi8(b, zero); 112 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 113 | auto cnt = _mm_set1_epi8(1); 114 | 115 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 116 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 117 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 118 | 119 | // factor1 = sum*2 + cnt, sum elements are 16 bits 120 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 121 | auto cnt_hi = _mm_unpackhi_epi8(cnt, zero); 122 | 123 | // Difference from SSE4.1 and C: floating point division 124 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 125 | const auto rounder_half = _mm_set1_ps(0.5f); 126 | // lower 8 pixels 127 | auto fcnt_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_lo, zero)); 128 | auto fcnt_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_lo, zero)); 129 | auto fsum_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_lo, zero)); 130 | auto fsum_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_lo, zero)); 131 | 132 | auto mul_lo_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_lo, _mm_rcp_ps(fcnt_lo_lo)), rounder_half)); 133 | auto mul_lo_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_hi, 
_mm_rcp_ps(fcnt_lo_hi)), rounder_half)); 134 | auto mulres_lo = _mm_packs_epi32(mul_lo_lo, mul_lo_hi); 135 | // upper 8 pixels 136 | auto fcnt_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_hi, zero)); 137 | auto fcnt_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_hi, zero)); 138 | auto fsum_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_hi, zero)); 139 | auto fsum_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_hi, zero)); 140 | 141 | auto mul_hi_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_lo, _mm_rcp_ps(fcnt_hi_lo)), rounder_half)); 142 | auto mul_hi_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_hi, _mm_rcp_ps(fcnt_hi_hi)), rounder_half)); 143 | auto mulres_hi = _mm_packs_epi32(mul_hi_lo, mul_hi_hi); 144 | 145 | // move back to 16x8 bits 146 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 147 | 148 | // decide if original pixel is kept 149 | auto finalres = _MM_BLENDV_EPI8(b, result, mask_either_is_true); // true: second param, false: 1st param 150 | 151 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x), finalres); 152 | } 153 | 154 | 155 | // Temporal only 156 | void fluxT_sse2(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 157 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 158 | { 159 | __m128i scaletab_lut_lsbs; 160 | __m128i scaletab_lut_msbs; 161 | for (int i = 0; i < 16; i++) { 162 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 163 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 164 | } 165 | 166 | const int xcnt = width; 167 | 168 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 169 | 170 | const int wmod16 = xcnt / 16 * 16; 171 | const int rest = xcnt - wmod16; 172 | 173 | for (int y = 0; y < height; y++) 174 | { 175 | for (int x = 0; x < wmod16; x += 16) 176 | fluxT_core_sse2(currp, prevp, nextp, destp, x, 
temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 177 | // do rest 178 | if (rest > 0) 179 | fluxT_core_sse2(currp, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 180 | 181 | currp += src_pitch; 182 | prevp += prv_pitch; 183 | nextp += nxt_pitch; 184 | destp += dst_pitch; 185 | } // for y 186 | } 187 | 188 | /************************************ 189 | // Temporal only SSE4.1, 8 bit 190 | ************************************/ 191 | #if defined(CLANG) || defined(GCC) 192 | __attribute__((__target__("sse4.1"))) 193 | #endif 194 | AVS_FORCEINLINE void fluxT_core_sse41(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 195 | __m128i &temporal_threshold_vector, 196 | __m128i &scaletab_lut_lsbs, 197 | __m128i &scaletab_lut_msbs 198 | ) 199 | { 200 | 201 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x)); 202 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x)); 203 | auto nbt = _mm_loadu_si128(reinterpret_cast(nextp + x)); 204 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 205 | // int pdiff = pbt - b, ndiff = nbt - b; 206 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 207 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 208 | auto pbt_lessthan_b = _mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 209 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 210 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 211 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 212 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 213 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 214 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 215 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 216 | 217 | // int sum = b, cnt = 1; 218 | auto zero = _mm_setzero_si128(); 219 | auto sum_lo = _mm_unpacklo_epi8(b, zero); 220 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 221 | auto cnt = _mm_set1_epi8(1); 222 | 223 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 224 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 225 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 226 | 227 | #if 0 228 | // Experiment with MADD and rounding: a bit slower, the same result 229 | // (sum * scaletab) + 230 | // (rounding * 1) in one step 231 | // >> 15: 2nd step 232 | auto one = _mm_set1_epi16(1); 233 | constexpr int FACTOR_BITS = 15; 234 | auto rounding = _mm_set1_epi16(1 << (FACTOR_BITS - 1)); 235 | 236 | // factor2 = scaletab[cnt] 237 | auto factor2_lsb = _mm_shuffle_epi8(scaletab_lut_lsbs, cnt); 238 | auto factor2_msb = _mm_shuffle_epi8(scaletab_lut_msbs, cnt); 239 | auto factor2_lo = _mm_unpacklo_epi8(factor2_lsb, factor2_msb); 240 | auto factor2_hi = _mm_unpackhi_epi8(factor2_lsb, factor2_msb); 241 | 242 | auto mulres_lo_lo = _mm_srai_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(sum_lo, rounding), _mm_unpacklo_epi16(factor2_lo, one)), FACTOR_BITS); 243 | auto mulres_lo_hi = _mm_srai_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(sum_lo, rounding), _mm_unpackhi_epi16(factor2_lo, one)), FACTOR_BITS); 244 | auto mulres_lo = _mm_packs_epi32(mulres_lo_lo, mulres_lo_hi); 245 | 246 | auto mulres_hi_lo = _mm_srai_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(sum_hi, rounding), _mm_unpacklo_epi16(factor2_hi, one)), FACTOR_BITS); 247 | auto mulres_hi_hi = _mm_srai_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(sum_hi, rounding), _mm_unpackhi_epi16(factor2_hi, one)), FACTOR_BITS); 248 | auto mulres_hi = _mm_packs_epi32(mulres_hi_lo, mulres_hi_hi); 249 | #else 250 | // factor1 = sum*2 + cnt, sum elements are 16 bits 251 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 252 | auto cnt_hi = _mm_unpackhi_epi8(cnt, 
zero); 253 | auto factor1_lo = _mm_add_epi16(_mm_add_epi16(sum_lo, sum_lo), cnt_lo); 254 | auto factor1_hi = _mm_add_epi16(_mm_add_epi16(sum_hi, sum_hi), cnt_hi); 255 | // factor2 = scaletab[cnt] 256 | auto factor2_lsb = _mm_shuffle_epi8(scaletab_lut_lsbs, cnt); 257 | auto factor2_msb = _mm_shuffle_epi8(scaletab_lut_msbs, cnt); 258 | auto factor2_lo = _mm_unpacklo_epi8(factor2_lsb, factor2_msb); 259 | auto factor2_hi = _mm_unpackhi_epi8(factor2_lsb, factor2_msb); 260 | // finally mul and shift 261 | auto mulres_lo = _mm_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 262 | auto mulres_hi = _mm_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 263 | #endif 264 | // move back to 16x8 bits 265 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 266 | // decide if original pixel is kept 267 | auto finalres = _mm_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 268 | 269 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x), finalres); 270 | } 271 | 272 | // Temporal only 273 | #if defined(CLANG) || defined(GCC) 274 | __attribute__((__target__("sse4.1"))) 275 | #endif 276 | void fluxT_sse41(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 277 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 278 | { 279 | __m128i scaletab_lut_lsbs; 280 | __m128i scaletab_lut_msbs; 281 | for (int i = 0; i < 16; i++) { 282 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 283 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 284 | } 285 | 286 | const int xcnt = width; 287 | 288 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 289 | 290 | const int wmod16 = xcnt / 16 * 16; 291 | const int rest = xcnt - wmod16; 292 | 293 | for (int y = 0; y < height; y++) 294 | { 295 | for (int x = 0; 
x < wmod16; x += 16) 296 | fluxT_core_sse41(currp, prevp, nextp, destp, x, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 297 | // do rest 298 | if (rest > 0) 299 | fluxT_core_sse41(currp, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 300 | 301 | currp += src_pitch; 302 | prevp += prv_pitch; 303 | nextp += nxt_pitch; 304 | destp += dst_pitch; 305 | } // for y 306 | } 307 | 308 | /************************************ 309 | // Temporal only SSE4.1, 16 bit 310 | ************************************/ 311 | #if defined(CLANG) || defined(GCC) 312 | __attribute__((__target__("sse4.1"))) 313 | #endif 314 | AVS_FORCEINLINE void fluxT_core_sse41_uint16(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 315 | __m128i &temporal_threshold_vector // already shifted to "signed" domain 316 | ) 317 | { 318 | const auto make_signed_word = _mm_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 319 | 320 | auto b_orig = _mm_loadu_si128(reinterpret_cast(currp + x)); 321 | auto pbt_orig = _mm_loadu_si128(reinterpret_cast(prevp + x)); 322 | auto nbt_orig = _mm_loadu_si128(reinterpret_cast(nextp + x)); 323 | 324 | auto b = _mm_add_epi16(b_orig, make_signed_word); 325 | auto pbt = _mm_add_epi16(pbt_orig, make_signed_word); 326 | auto nbt = _mm_add_epi16(nbt_orig, make_signed_word); 327 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 328 | // int pdiff = pbt - b, ndiff = nbt - b; 329 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 330 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 331 | auto pbt_lessthan_b = _mm_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 332 | auto nbt_lessthan_b = _mm_cmpgt_epi16(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 333 | auto pbt_greaterthan_b = _mm_cmpgt_epi16(pbt, b); // FF where pbt > b 334 | auto nbt_greaterthan_b = _mm_cmpgt_epi16(nbt, b); // FF where nbt > b 335 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 336 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 337 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 338 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 339 | 340 | // int sum = b, cnt = 1; 341 | auto zero = _mm_setzero_si128(); 342 | auto sum_lo = _mm_unpacklo_epi16(b_orig, zero); 343 | auto sum_hi = _mm_unpackhi_epi16(b_orig, zero); 344 | auto cnt = _mm_set1_epi16(1); 345 | 346 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 347 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 348 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 349 | 350 | auto cnt_lo = _mm_unpacklo_epi16(cnt, zero); 351 | auto cnt_hi = _mm_unpackhi_epi16(cnt, zero); 352 | // Difference from SSE4.1 and C: floating point division 353 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 354 | const auto rounder_half = _mm_set1_ps(0.5f); 355 | // lower 4 pixels 356 | auto fcnt_lo = _mm_cvtepi32_ps(cnt_lo); 357 | auto fsum_lo = _mm_cvtepi32_ps(sum_lo); 358 | auto mulres_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo, _mm_rcp_ps(fcnt_lo)), rounder_half)); 359 | // upper 4 pixels 360 | auto fcnt_hi = _mm_cvtepi32_ps(cnt_hi); 361 | auto fsum_hi = _mm_cvtepi32_ps(sum_hi); 362 | auto mulres_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi, _mm_rcp_ps(fcnt_hi)), rounder_half)); 363 | 364 | // move back to 8x16 bits 365 | auto result = _mm_packus_epi32(mulres_lo, mulres_hi); 366 | 367 | // decide if original pixel is kept 368 | auto finalres = _mm_blendv_epi8(b_orig, result, 
mask_either_is_true); // true: second param, false: 1st param 369 | 370 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x), finalres); 371 | } 372 | 373 | // Temporal only 374 | #if defined(CLANG) || defined(GCC) 375 | __attribute__((__target__("sse4.1"))) 376 | #endif 377 | void fluxT_sse41_uint16(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 378 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 379 | { 380 | const int xcnt = width; 381 | 382 | __m128i temporal_threshold_vector = _mm_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain 383 | 384 | // uint16_t: 8 pixels per cycle 385 | const int wmod8 = xcnt / 8 * 8; 386 | const int rest = xcnt - wmod8; 387 | 388 | for (int y = 0; y < height; y++) 389 | { 390 | for (int x = 0; x < wmod8; x += 8) 391 | fluxT_core_sse41_uint16(currp, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector); 392 | // do rest 393 | if (rest > 0) 394 | fluxT_core_sse41_uint16(currp, prevp, nextp, destp, (xcnt - 8) * sizeof(uint16_t), temporal_threshold_vector); 395 | 396 | currp += src_pitch; 397 | prevp += prv_pitch; 398 | nextp += nxt_pitch; 399 | destp += dst_pitch; 400 | } // for y 401 | } 402 | 403 | /************************************ 404 | // Spatial Temporal SSE2, 8 bit 405 | ************************************/ 406 | AVS_FORCEINLINE void fluxST_core_sse2(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 407 | __m128i &temporal_threshold_vector, 408 | __m128i &spatial_threshold_vector, 409 | __m128i &scaletab_lut_lsbs, 410 | __m128i &scaletab_lut_msbs 411 | ) 412 | { 413 | // +1: center of 3x3 pixels [+0,+1,+2] 414 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x + 1)); 415 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x + 1)); 416 | auto nbt = 
_mm_loadu_si128(reinterpret_cast(nextp + x + 1)); 417 | 418 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 419 | // int pdiff = pbt - b, ndiff = nbt - b; 420 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 421 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 422 | auto pbt_lessthan_b = _mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 423 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 424 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 425 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 426 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 427 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 428 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 429 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 430 | 431 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 432 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 433 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 434 | 435 | auto pb1 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 0)); 436 | auto pb2 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 1)); 437 | auto pb3 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 2)); 438 | 439 | auto b1 = _mm_loadu_si128(reinterpret_cast(currp + x + 0)); 440 | auto b2 = _mm_loadu_si128(reinterpret_cast(currp + x + 2)); 441 | 442 | auto nb1 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 0)); 443 | auto nb2 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 1)); 444 | auto nb3 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 2)); 445 | 446 | // int sum = b, cnt = 1; 447 | auto zero = _mm_setzero_si128(); 448 | auto sum_lo = _mm_unpacklo_epi8(b, 
zero); 449 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 450 | auto cnt = _mm_set1_epi8(1); 451 | 452 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 453 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 454 | check_neighbour_simd(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 455 | check_neighbour_simd(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 456 | check_neighbour_simd(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 457 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 458 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 459 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 460 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 461 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 462 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 463 | 464 | // factor1 = sum*2 + cnt, sum elements are 16 bits 465 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 466 | auto cnt_hi = _mm_unpackhi_epi8(cnt, zero); 467 | 468 | // Difference from SSE4.1 and C: floating point division 469 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 470 | const auto rounder_half = _mm_set1_ps(0.5f); 471 | // lower 8 pixels 472 | auto fcnt_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_lo, zero)); 473 | auto fcnt_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_lo, zero)); 474 | auto fsum_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_lo, zero)); 475 | auto fsum_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_lo, zero)); 476 | 477 | auto mul_lo_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_lo, _mm_rcp_ps(fcnt_lo_lo)), rounder_half)); 478 | auto mul_lo_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_hi, _mm_rcp_ps(fcnt_lo_hi)), rounder_half)); 479 | auto mulres_lo = _mm_packs_epi32(mul_lo_lo, mul_lo_hi); 480 | // upper 8 pixels 481 | auto 
fcnt_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_hi, zero)); 482 | auto fcnt_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_hi, zero)); 483 | auto fsum_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_hi, zero)); 484 | auto fsum_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_hi, zero)); 485 | 486 | auto mul_hi_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_lo, _mm_rcp_ps(fcnt_hi_lo)), rounder_half)); 487 | auto mul_hi_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_hi, _mm_rcp_ps(fcnt_hi_hi)), rounder_half)); 488 | auto mulres_hi = _mm_packs_epi32(mul_hi_lo, mul_hi_hi); 489 | 490 | // move back to 16x8 bits 491 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 492 | 493 | // decide if original pixel is kept 494 | auto finalres = _MM_BLENDV_EPI8(b, result, mask_either_is_true); // true: second param, false: 1st param 495 | 496 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x + 1), finalres); 497 | } 498 | 499 | // Spatial Temporal 500 | void fluxST_sse2(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 501 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab) 502 | { 503 | __m128i scaletab_lut_lsbs; 504 | __m128i scaletab_lut_msbs; 505 | for (int i = 0; i < 16; i++) { 506 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 507 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 508 | } 509 | 510 | // spatial: because of previous and next line involved, function is called 511 | // starting with the 2nd line and with height = (real_height - 2) 512 | const int xcnt = width - 2; // leftmost/rightmost column safety 513 | 514 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 515 | __m128i spatial_threshold_vector = _mm_set1_epi8(spatial_threshold); 516 | 517 | const int wmod16 = xcnt / 16 * 16; 518 | const int rest = xcnt - wmod16; 519 | 520 | 
for (int y = 0; y < height; y++) 521 | { 522 | destp[0] = currp[0]; // Copy left edge 523 | 524 | for (int x = 0; x < wmod16; x += 16) 525 | fluxST_core_sse2(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 526 | // do rest 527 | if (rest > 0) 528 | fluxST_core_sse2(currp, src_pitch, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 529 | 530 | destp[width - 1] = currp[width - 1]; // Copy right edge 531 | 532 | currp += src_pitch; 533 | prevp += prv_pitch; 534 | nextp += nxt_pitch; 535 | destp += dst_pitch; 536 | } // for y 537 | } 538 | 539 | /************************************ 540 | // Spatial Temporal SSE4.1, 8 bit 541 | ************************************/ 542 | #if defined(CLANG) || defined(GCC) 543 | __attribute__((__target__("sse4.1"))) 544 | #endif 545 | AVS_FORCEINLINE void fluxST_core_sse41(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 546 | __m128i &temporal_threshold_vector, 547 | __m128i &spatial_threshold_vector, 548 | __m128i &scaletab_lut_lsbs, 549 | __m128i &scaletab_lut_msbs 550 | ) 551 | { 552 | // +1: center of 3x3 pixels [+0,+1,+2] 553 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x + 1)); 554 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x + 1)); 555 | auto nbt = _mm_loadu_si128(reinterpret_cast(nextp + x + 1)); 556 | 557 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 558 | // int pdiff = pbt - b, ndiff = nbt - b; 559 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 560 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 561 | auto pbt_lessthan_b = _mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 562 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 563 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 564 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 565 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 566 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 567 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 568 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 569 | 570 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 571 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 572 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 573 | 574 | auto pb1 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 0)); 575 | auto pb2 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 1)); 576 | auto pb3 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 2)); 577 | 578 | auto b1 = _mm_loadu_si128(reinterpret_cast(currp + x + 0)); 579 | auto b2 = _mm_loadu_si128(reinterpret_cast(currp + x + 2)); 580 | 581 | auto nb1 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 0)); 582 | auto nb2 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 1)); 583 | auto nb3 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 2)); 584 | 585 | // int sum = b, cnt = 1; 586 | auto zero = _mm_setzero_si128(); 587 | auto sum_lo = _mm_unpacklo_epi8(b, zero); 588 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 589 | auto cnt = _mm_set1_epi8(1); 590 | 591 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 592 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 593 | check_neighbour_simd(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 594 | check_neighbour_simd(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 595 | check_neighbour_simd(pb3, b, 
spatial_threshold_vector, sum_lo, sum_hi, cnt); 596 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 597 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 598 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 599 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 600 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 601 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 602 | 603 | // factor1 = sum*2 + cnt, sum elements are 16 bits 604 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 605 | auto cnt_hi = _mm_unpackhi_epi8(cnt, zero); 606 | auto factor1_lo = _mm_add_epi16(_mm_add_epi16(sum_lo, sum_lo), cnt_lo); 607 | auto factor1_hi = _mm_add_epi16(_mm_add_epi16(sum_hi, sum_hi), cnt_hi); 608 | // factor2 = scaletab[cnt] 609 | auto factor2_lsb = _mm_shuffle_epi8(scaletab_lut_lsbs, cnt); 610 | auto factor2_msb = _mm_shuffle_epi8(scaletab_lut_msbs, cnt); 611 | auto factor2_lo = _mm_unpacklo_epi8(factor2_lsb, factor2_msb); 612 | auto factor2_hi = _mm_unpackhi_epi8(factor2_lsb, factor2_msb); 613 | // finally mul and shift 614 | auto mulres_lo = _mm_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 615 | auto mulres_hi = _mm_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 616 | // move back to 16x8 bits 617 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 618 | 619 | // decide if original pixel is kept 620 | auto finalres = _mm_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 621 | 622 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x + 1), finalres); 623 | } 624 | 625 | // Spatial Temporal 626 | #if defined(CLANG) || defined(GCC) 627 | __attribute__((__target__("sse4.1"))) 628 | #endif 629 | void fluxST_sse41(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * 
nextp, const int nxt_pitch, 630 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab) 631 | { 632 | __m128i scaletab_lut_lsbs; 633 | __m128i scaletab_lut_msbs; 634 | for (int i = 0; i < 16; i++) { 635 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 636 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 637 | } 638 | 639 | // spatial: because of previous and next line involved, function is called 640 | // starting with the 2nd line and with height = (real_height - 2) 641 | const int xcnt = width - 2; // leftmost/rightmost column safety 642 | 643 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 644 | __m128i spatial_threshold_vector = _mm_set1_epi8(spatial_threshold); 645 | 646 | const int wmod16 = xcnt / 16 * 16; 647 | const int rest = xcnt - wmod16; 648 | 649 | for (int y = 0; y < height; y++) 650 | { 651 | destp[0] = currp[0]; // Copy left edge 652 | 653 | for (int x = 0; x < wmod16; x += 16) 654 | fluxST_core_sse41(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 655 | // do rest 656 | if (rest > 0) 657 | fluxST_core_sse41(currp, src_pitch, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 658 | 659 | destp[width - 1] = currp[width - 1]; // Copy right edge 660 | 661 | currp += src_pitch; 662 | prevp += prv_pitch; 663 | nextp += nxt_pitch; 664 | destp += dst_pitch; 665 | } // for y 666 | } 667 | 668 | /************************************ 669 | // Spatial Temporal SSE4.1, 16 bit 670 | ************************************/ 671 | #if defined(CLANG) || defined(GCC) 672 | __attribute__((__target__("sse4.1"))) 673 | #endif 674 | AVS_FORCEINLINE void fluxST_core_sse41_uint16(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 
675 | __m128i &temporal_threshold_vector, // already shifted to "signed" domain 676 | __m128i &spatial_threshold_vector // already shifted to "signed" domain 677 | ) 678 | { 679 | const auto make_signed_word = _mm_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 680 | // +1: center of 3x3 pixels [+0,+1,+2] 681 | auto b_orig = _mm_loadu_si128(reinterpret_cast(currp + x + 1 * sizeof(uint16_t))); 682 | auto pbt_orig = _mm_loadu_si128(reinterpret_cast(prevp + x + 1 * sizeof(uint16_t))); 683 | auto nbt_orig = _mm_loadu_si128(reinterpret_cast(nextp + x + 1 * sizeof(uint16_t))); 684 | 685 | auto b = _mm_add_epi16(b_orig, make_signed_word); 686 | auto pbt = _mm_add_epi16(pbt_orig, make_signed_word); 687 | auto nbt = _mm_add_epi16(nbt_orig, make_signed_word); 688 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 689 | // int pdiff = pbt - b, ndiff = nbt - b; 690 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 691 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 692 | auto pbt_lessthan_b = _mm_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 693 | auto nbt_lessthan_b = _mm_cmpgt_epi16(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 694 | auto pbt_greaterthan_b = _mm_cmpgt_epi16(pbt, b); // FF where pbt > b 695 | auto nbt_greaterthan_b = _mm_cmpgt_epi16(nbt, b); // FF where nbt > b 696 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 697 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 698 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 699 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 700 | 701 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 702 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 703 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 704 | 705 | auto pb1 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 0 * sizeof(uint16_t))); 706 | auto pb2 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 1 * sizeof(uint16_t))); 707 | auto pb3 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 2 * sizeof(uint16_t))); 708 | 709 | auto b1 = _mm_loadu_si128(reinterpret_cast(currp + x + 0 * sizeof(uint16_t))); 710 | auto b2 = _mm_loadu_si128(reinterpret_cast(currp + x + 2 * sizeof(uint16_t))); 711 | 712 | auto nb1 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 0 * sizeof(uint16_t))); 713 | auto nb2 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 1 * sizeof(uint16_t))); 714 | auto nb3 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 2 * sizeof(uint16_t))); 715 | 716 | // int sum = b, cnt = 1; 717 | auto zero = _mm_setzero_si128(); 718 | auto sum_lo = _mm_unpacklo_epi16(b_orig, zero); 719 | auto sum_hi = _mm_unpackhi_epi16(b_orig, zero); 720 | auto cnt = _mm_set1_epi16(1); 721 | 722 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 723 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 724 | check_neighbour_simd_uint16(pb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 725 | check_neighbour_simd_uint16(pb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 726 | check_neighbour_simd_uint16(pb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 727 | check_neighbour_simd_uint16(b1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 728 | 
check_neighbour_simd_uint16(b2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 729 | check_neighbour_simd_uint16(nb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 730 | check_neighbour_simd_uint16(nb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 731 | check_neighbour_simd_uint16(nb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 732 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 733 | 734 | auto cnt_lo = _mm_unpacklo_epi16(cnt, zero); 735 | auto cnt_hi = _mm_unpackhi_epi16(cnt, zero); 736 | // Difference from SSE4.1 and C: floating point division 737 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 738 | const auto rounder_half = _mm_set1_ps(0.5f); 739 | // lower 4 pixels 740 | auto fcnt_lo = _mm_cvtepi32_ps(cnt_lo); 741 | auto fsum_lo = _mm_cvtepi32_ps(sum_lo); 742 | auto mulres_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo, _mm_rcp_ps(fcnt_lo)), rounder_half)); 743 | // upper 4 pixels 744 | auto fcnt_hi = _mm_cvtepi32_ps(cnt_hi); 745 | auto fsum_hi = _mm_cvtepi32_ps(sum_hi); 746 | auto mulres_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi, _mm_rcp_ps(fcnt_hi)), rounder_half)); 747 | 748 | // move back to 8x16 bits 749 | auto result = _mm_packus_epi32(mulres_lo, mulres_hi); 750 | 751 | // decide if original pixel is kept 752 | auto finalres = _mm_blendv_epi8(b_orig, result, mask_either_is_true); // true: second param, false: 1st param 753 | 754 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x + 1 * sizeof(uint16_t)), finalres); 755 | } 756 | 757 | // Spatial Temporal 758 | #if defined(CLANG) || defined(GCC) 759 | __attribute__((__target__("sse4.1"))) 760 | #endif 761 | void fluxST_sse41_uint16(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 762 | uint8_t* destp, const int dst_pitch, const int width, const int height, int 
temporal_threshold, int spatial_threshold, short *scaletab) 763 | { 764 | // spatial: because of previous and next line involved, function is called 765 | // starting with the 2nd line and with height = (real_height - 2) 766 | const int xcnt = width - 2; // leftmost/rightmost column safety 767 | 768 | __m128i temporal_threshold_vector = _mm_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain 769 | __m128i spatial_threshold_vector = _mm_set1_epi16(spatial_threshold - 0x8000); // move to signed int16 domain); 770 | 771 | const int wmod8 = xcnt / 8 * 8; 772 | const int rest = xcnt - wmod8; 773 | 774 | for (int y = 0; y < height; y++) 775 | { 776 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 777 | 778 | for (int x = 0; x < wmod8; x += 8) 779 | fluxST_core_sse41_uint16(currp, src_pitch, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 780 | // do rest 781 | if (rest > 0) 782 | fluxST_core_sse41_uint16(currp, src_pitch, prevp, nextp, destp, (xcnt - 8) * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 783 | 784 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 785 | 786 | currp += src_pitch; 787 | prevp += prv_pitch; 788 | nextp += nxt_pitch; 789 | destp += dst_pitch; 790 | } // for y 791 | } 792 | #endif // INTEL_INTRINSICS 793 | 794 | /************************************ 795 | // Helper 796 | ************************************/ 797 | 798 | static AVS_FORCEINLINE void check_neighbour_C(int neighbour, int center, int threshold, int& sum, int& cnt) 799 | { 800 | if (std::abs(neighbour - center) <= threshold) 801 | { 802 | sum += neighbour; 803 | ++cnt; 804 | } 805 | } 806 | 807 | /************************************ 808 | // Spatial Temporal C, 8-16 bit 809 | ************************************/ 810 | 811 | template 812 | void fluxST_C(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, 
const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 813 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab) 814 | { 815 | // spatial: because of previous and next line involved, function is called 816 | // starting with the 2nd line and with height = (real_height - 2) 817 | for (int y = 0; y < height; y++) 818 | { 819 | // leftmost column safety 820 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 821 | 822 | for (int x = 1; x < width-1; x++) 823 | { 824 | 825 | int b = reinterpret_cast(currp)[x]; 826 | int pbt = reinterpret_cast(prevp)[x]; 827 | int nbt = reinterpret_cast(nextp)[x]; 828 | int pdiff = pbt - b, ndiff = nbt - b; 829 | if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 830 | { 831 | const pixel_t* currp0 = reinterpret_cast(currp); 832 | const int src_pitch0 = src_pitch / sizeof(pixel_t); 833 | int pb1 = currp0[x - src_pitch0 - 1]; 834 | int pb2 = currp0[x - src_pitch0]; 835 | int pb3 = currp0[x - src_pitch0 + 1]; 836 | int b1 = currp0[x - 1]; 837 | /*b = currp[0]; */ 838 | int b2 = currp0[x + 1]; 839 | int nb1 = currp0[x + src_pitch0 - 1]; 840 | int nb2 = currp0[x + src_pitch0]; 841 | int nb3 = currp0[x + src_pitch0 + 1]; 842 | 843 | int sum = b, cnt = 1; 844 | check_neighbour_C(pbt, b, temporal_threshold, sum, cnt); 845 | check_neighbour_C(nbt, b, temporal_threshold, sum, cnt); 846 | 847 | check_neighbour_C(pb1, b, spatial_threshold, sum, cnt); 848 | check_neighbour_C(pb2, b, spatial_threshold, sum, cnt); 849 | check_neighbour_C(pb3, b, spatial_threshold, sum, cnt); 850 | 851 | check_neighbour_C(b1, b, spatial_threshold, sum, cnt); 852 | check_neighbour_C(b2, b, spatial_threshold, sum, cnt); 853 | 854 | check_neighbour_C(nb1, b, spatial_threshold, sum, cnt); 855 | check_neighbour_C(nb2, b, spatial_threshold, sum, cnt); 856 | check_neighbour_C(nb3, b, spatial_threshold, sum, cnt); 857 | 858 | using safe_int_t = typename 
std::conditional::type; // 16 bit pixels: int32 overflow 859 | 860 | reinterpret_cast(destp)[x] = (pixel_t)(((safe_int_t)(sum * 2 + cnt) * scaletab[cnt]) >> 16); 861 | } 862 | else 863 | { 864 | reinterpret_cast(destp)[x] = b; 865 | } 866 | } // for x 867 | 868 | // rightmost column safety 869 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 870 | 871 | currp += src_pitch; 872 | prevp += prv_pitch; 873 | nextp += nxt_pitch; 874 | destp += dst_pitch; 875 | 876 | } // for y 877 | 878 | } 879 | 880 | // instantiate 881 | template void fluxST_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 882 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 883 | template void fluxST_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 884 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 885 | 886 | /************************************ 887 | // Termporal only C, 8-16 bit 888 | ************************************/ 889 | 890 | template 891 | void fluxT_C(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 892 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 893 | { 894 | for (int y = 0; y < height; y++) 895 | { 896 | for (int x = 0; x < width; x++) 897 | { 898 | int b = reinterpret_cast(currp)[x]; 899 | int pbt = reinterpret_cast(prevp)[x]; 900 | int nbt = reinterpret_cast(nextp)[x]; 901 | int pdiff = pbt - b, ndiff = nbt - b; 902 | if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 903 | { 904 | int sum = b, cnt = 1; 905 | 906 | check_neighbour_C(pbt, b, temporal_threshold, sum, 
cnt); 907 | check_neighbour_C(nbt, b, temporal_threshold, sum, cnt); 908 | using safe_int_t = typename std::conditional::type; // 16 bit pixels: int32 overflow 909 | // cnt: 1,2,3 910 | reinterpret_cast(destp)[x] = (pixel_t)(((safe_int_t)(sum * 2 + cnt) * scaletab[cnt]) >> 16); 911 | } 912 | else 913 | { 914 | reinterpret_cast(destp)[x] = (pixel_t)b; 915 | } 916 | } // for x 917 | 918 | currp += src_pitch; 919 | prevp += prv_pitch; 920 | nextp += nxt_pitch; 921 | destp += dst_pitch; 922 | 923 | } // for y 924 | } 925 | 926 | // instantiate 927 | template void fluxT_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 928 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 929 | template void fluxT_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 930 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 931 | 932 | --------------------------------------------------------------------------------