├── FluxSmooth ├── FluxSmoothTest.cpp ├── FluxSmooth.rc ├── avs │ ├── minmax.h │ ├── types.h │ ├── win.h │ ├── cpuid.h │ ├── capi.h │ ├── alignment.h │ ├── posix.h │ └── config.h ├── CMakeLists.txt ├── FluxSmooth.h ├── documentation │ └── readme.html ├── FilterDef.cpp ├── FluxSmooth.vcxproj ├── FluxSmooth_avx2.cpp ├── FluxSmooth_avx512.cpp └── FluxSmooth.cpp ├── CMakeLists.txt ├── cmake_uninstall.cmake.in ├── FluxSmooth.sln ├── README.md ├── .gitattributes └── .gitignore /FluxSmooth/FluxSmoothTest.cpp: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.rc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pinterf/FluxSmooth/HEAD/FluxSmooth/FluxSmooth.rc -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8.2) 2 | project(FluxSmooth-pfmod LANGUAGES CXX) 3 | 4 | add_subdirectory(FluxSmooth) 5 | 6 | # uninstall target 7 | configure_file( 8 | "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" 9 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" 10 | IMMEDIATE @ONLY) 11 | 12 | add_custom_target(uninstall 13 | COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) 14 | -------------------------------------------------------------------------------- /cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 2 | message(FATAL_ERROR "Cannot find install manifest: @CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 3 | endif(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 4 | 5 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 6 | 
string(REGEX REPLACE "\n" ";" files "${files}") 7 | foreach(file ${files}) 8 | message(STATUS "Uninstalling $ENV{DESTDIR}${file}") 9 | if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 10 | execute_process( 11 | COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}" 12 | OUTPUT_VARIABLE rm_out 13 | RESULT_VARIABLE rm_retval 14 | ) 15 | if(NOT "${rm_retval}" STREQUAL 0) 16 | message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") 17 | endif(NOT "${rm_retval}" STREQUAL 0) 18 | else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 19 | message(STATUS "File $ENV{DESTDIR}${file} does not exist.") 20 | endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 21 | endforeach(file) 22 | -------------------------------------------------------------------------------- /FluxSmooth/avs/minmax.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination.
19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 31 | 32 | #ifndef AVSCORE_MINMAX_H 33 | #define AVSCORE_MINMAX_H 34 | 35 | template 36 | T min(T v1, T v2) 37 | { 38 | return v1 < v2 ? v1 : v2; 39 | } 40 | 41 | template 42 | T max(T v1, T v2) 43 | { 44 | return v1 > v2 ? v1 : v2; 45 | } 46 | 47 | template 48 | T clamp(T n, T min, T max) 49 | { 50 | n = n > max ? max : n; 51 | return n < min ? min : n; 52 | } 53 | 54 | #endif // AVSCORE_MINMAX_H 55 | -------------------------------------------------------------------------------- /FluxSmooth/avs/types.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 
32 | 33 | #ifndef AVS_TYPES_H 34 | #define AVS_TYPES_H 35 | 36 | // Define all types necessary for interfacing with avisynth.dll 37 | #include 38 | #include 39 | #ifdef __cplusplus 40 | #include 41 | #include 42 | #else 43 | #include 44 | #include 45 | #endif 46 | 47 | // Raster types used by VirtualDub & Avisynth 48 | typedef uint32_t Pixel32; 49 | typedef uint8_t BYTE; 50 | 51 | // Audio Sample information 52 | typedef float SFLOAT; 53 | 54 | #endif //AVS_TYPES_H 55 | -------------------------------------------------------------------------------- /FluxSmooth.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.28729.10 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FluxSmooth", "FluxSmooth\FluxSmooth.vcxproj", "{588984EE-FDBE-4901-894A-32781B765F07}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|x64 = Debug|x64 11 | Debug|x86 = Debug|x86 12 | Release LLVM|x64 = Release LLVM|x64 13 | Release LLVM|x86 = Release LLVM|x86 14 | Release XP|x64 = Release XP|x64 15 | Release XP|x86 = Release XP|x86 16 | Release|x64 = Release|x64 17 | Release|x86 = Release|x86 18 | EndGlobalSection 19 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 20 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x64.ActiveCfg = Debug|x64 21 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x64.Build.0 = Debug|x64 22 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x86.ActiveCfg = Debug|Win32 23 | {588984EE-FDBE-4901-894A-32781B765F07}.Debug|x86.Build.0 = Debug|Win32 24 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x64.ActiveCfg = Release LLVM|x64 25 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x64.Build.0 = Release LLVM|x64 26 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x86.ActiveCfg = Release 
LLVM|Win32 27 | {588984EE-FDBE-4901-894A-32781B765F07}.Release LLVM|x86.Build.0 = Release LLVM|Win32 28 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x64.ActiveCfg = Release XP|x64 29 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x64.Build.0 = Release XP|x64 30 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x86.ActiveCfg = Release XP|Win32 31 | {588984EE-FDBE-4901-894A-32781B765F07}.Release XP|x86.Build.0 = Release XP|Win32 32 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x64.ActiveCfg = Release|x64 33 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x64.Build.0 = Release|x64 34 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x86.ActiveCfg = Release|Win32 35 | {588984EE-FDBE-4901-894A-32781B765F07}.Release|x86.Build.0 = Release|Win32 36 | EndGlobalSection 37 | GlobalSection(SolutionProperties) = preSolution 38 | HideSolutionNode = FALSE 39 | EndGlobalSection 40 | GlobalSection(ExtensibilityGlobals) = postSolution 41 | SolutionGuid = {483D6367-0542-4995-B683-A9CE97059A76} 42 | EndGlobalSection 43 | EndGlobal 44 | -------------------------------------------------------------------------------- /FluxSmooth/avs/win.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 
10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination. 19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 31 | 32 | #ifndef AVSCORE_WIN_H 33 | #define AVSCORE_WIN_H 34 | 35 | // Whenever you need windows headers, start by including this file, then the rest. 36 | 37 | // If the includer has not chosen a Windows target version, default the minimum to 0x0502 (Windows XP SP2 / Server 2003).
38 | #if !defined(NTDDI_VERSION) && !defined(_WIN32_WINNT) 39 | #define NTDDI_VERSION 0x05020000 40 | #define _WIN32_WINNT 0x0502 41 | #endif 42 | 43 | #define WIN32_LEAN_AND_MEAN 44 | #define STRICT 45 | #if !defined(NOMINMAX) 46 | #define NOMINMAX 47 | #endif 48 | 49 | #include 50 | 51 | // Provision for UTF-8 max 4 bytes per code point 52 | #define AVS_MAX_PATH MAX_PATH*4 53 | 54 | #endif // AVSCORE_WIN_H 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FluxSmooth - pfmod 2 | Avisynth filter for spatio-temporal smoothing of fluctuations 3 | 4 | By Ross Thomas 5 | 6 | There is no copyright on this code, and there are no conditions 7 | on its distribution or use. Do with it what you will. 8 | 9 | ## Changelog 10 | - (20190426) v1.4 11 | - AVX512 support 12 | when both AVX512F and AVX512BW extensions are available (e.g. Skylake X and Cannon Lake). 13 | Available processor flags can be shown through the .Info() filter in Avisynth+. 14 | New value for 'opt': opt=4 means forced AVX512. Error message if system does not support those AVX512 flags. 15 | - Moved to Visual Studio 2019 16 | - (xp builds: Microsoft C++, main builds: LLVM clang) 17 | 18 | - (20190402) v1.3, rewrite by pinterf 19 | - project moved to github: https://github.com/pinterf/FluxSmooth 20 | - Built using Visual Studio 2017, additional LLVM 8.0 clang support 21 | - Changed to AVS 2.6 plugin interface 22 | - x64 build for Avisynth+ 23 | - Added version resource to DLL 24 | - Removed MMX support, requires SSE2. (Though pure C is still available in the source) 25 | - Drop all inline assembly, SIMD intrinsics based on C code, SSE2, SSE4.1 and AVX2 optimizations 26 | - Single DLL, optimizations for different CPU instruction sets are chosen automatically. 
27 | - Reports MT Modes for Avisynth+: MT_NICE_FILTER 28 | - Added Y, YV411, YV16 and YV24, 10-16 bits 4:2:0, 4:2:2, 4:4:4, planar RGB(A) 8-16 bits support besides existing YV12 29 | - (YUY2 support with workaround: internally converted to YV16, process and convert back 30 | conversion is lossless, but slower than using native YV16) 31 | - New parameters: bool "luma", bool "chroma" (default true) to disable processing of luma/chroma planes 32 | - (20101130) x64 inline assembler optimized version by Devin Gardner 33 | - (2002-2004) FluxSmooth v1.1b 34 | Original version by Ross Thomas 35 | http://web.archive.org/web/20070225212908/http://bengal.missouri.edu/~kes25c/FluxSmooth-1.1b.zip 36 | https://forum.doom9.org/showthread.php?t=38296 37 | 38 | ## Notes 39 | Previous DLL versions named differently (FluxSmoothSSE2.DLL, FluxSmoothSSSE3) should be deleted from your plugin folder. 40 | From version 1.3 a single DLL exists, which automatically chooses the CPU optimization (SSE2, SSE4.1, AVX2) 41 | 42 | ## Links 43 | - Project: https://github.com/pinterf/FluxSmooth 44 | - Forum: https://forum.doom9.org/showthread.php?t=176246 45 | - Additional info: http://avisynth.nl/index.php/FluxSmooth -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is needed for earlier builds of msysgit that do not have it on by 10 | # default for csharp files.
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /FluxSmooth/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(PluginName "FluxSmooth") 2 | 3 | if (NOT WIN32) 4 | string(TOLOWER "${PluginName}" PluginName) 5 | endif() 6 | 7 | set(ProjectName "${PluginName}") 8 | project(${ProjectName} LANGUAGES CXX) 9 | 10 | file(GLOB FluxSmoothSources RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" *.cpp *.h) 11 | add_library(${PluginName} SHARED ${FluxSmoothSources}) 12 | 13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DINTEL_INTRINSICS -msse4.1") 14 | 15 | if (MSVC_IDE) 16 | IF(CLANG_IN_VS STREQUAL "1") 17 | # special AVX option for source files with *_avx.cpp pattern 18 | file(GLOB_RECURSE SRCS_AVX "*_avx.cpp") 19 | set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " -mavx ") 20 | 21 | # special AVX2 option for source files with *_avx2.cpp pattern 22 | file(GLOB_RECURSE SRCS_AVX2 "*_avx2.cpp") 23 | set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " -mavx2 -mfma ") 24 | 25 | # special AVX512 option for source files with *_avx512.cpp pattern 26 | file(GLOB_RECURSE SRCS_AVX512 "*_avx512.cpp") 27 | set_source_files_properties(${SRCS_AVX512} PROPERTIES COMPILE_FLAGS " -mavx512f -mavx512bw ") 28 | ELSE() 29 | # special AVX option for source files with *_avx.cpp pattern 30 | file(GLOB_RECURSE SRCS_AVX "*_avx.cpp") 31 | set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " /arch:AVX ") 32 | 33 | # special AVX2 option for source files with *_avx2.cpp pattern 34 | file(GLOB_RECURSE SRCS_AVX2 
"*_avx2.cpp") 35 | set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " /arch:AVX2 ") 36 | 37 | # special AVX512 option for source files with *_avx512.cpp pattern 38 | file(GLOB_RECURSE SRCS_AVX512 "*_avx512.cpp") 39 | set_source_files_properties(${SRCS_AVX512} PROPERTIES COMPILE_FLAGS " /arch:AVX512 ") 40 | ENDIF() 41 | else() 42 | # special AVX option for source files with *_avx.cpp pattern 43 | file(GLOB_RECURSE SRCS_AVX "*_avx.cpp") 44 | set_source_files_properties(${SRCS_AVX} PROPERTIES COMPILE_FLAGS " -mavx ") 45 | 46 | # special AVX2 option for source files with *_avx2.cpp pattern 47 | file(GLOB_RECURSE SRCS_AVX2 "*_avx2.cpp") 48 | set_source_files_properties(${SRCS_AVX2} PROPERTIES COMPILE_FLAGS " -mavx2 -mfma ") 49 | 50 | # special AVX512 option for source files with *_avx512.cpp pattern 51 | file(GLOB_RECURSE SRCS_AVX512 "*_avx512.cpp") 52 | set_source_files_properties(${SRCS_AVX512} PROPERTIES COMPILE_FLAGS " -mavx512f -mavx512bw ") 53 | endif() 54 | 55 | target_link_libraries(${ProjectName}) 56 | target_include_directories(${ProjectName} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 57 | 58 | include(GNUInstallDirs) 59 | 60 | INSTALL(TARGETS ${ProjectName} 61 | LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}/avisynth") 62 | -------------------------------------------------------------------------------- /FluxSmooth/avs/cpuid.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 
10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination. 19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 31 | 32 | #ifndef AVSCORE_CPUID_H 33 | #define AVSCORE_CPUID_H 34 | 35 | // For GetCPUFlags. These are backwards-compatible with those in VirtualDub. 
36 | // ending with SSE4_2 37 | // For emulation see https://software.intel.com/en-us/articles/intel-software-development-emulator 38 | enum { 39 | /* oldest CPU to support extension */ 40 | CPUF_FORCE = 0x01, // N/A 41 | CPUF_FPU = 0x02, // 386/486DX 42 | CPUF_MMX = 0x04, // P55C, K6, PII 43 | CPUF_INTEGER_SSE = 0x08, // PIII, Athlon 44 | CPUF_SSE = 0x10, // PIII, Athlon XP/MP 45 | CPUF_SSE2 = 0x20, // PIV, K8 46 | CPUF_3DNOW = 0x40, // K6-2 47 | CPUF_3DNOW_EXT = 0x80, // Athlon 48 | CPUF_X86_64 = 0xA0, // Hammer (note: equiv. to 3DNow + SSE2, which 49 | // only Hammer will have anyway) 50 | CPUF_SSE3 = 0x100, // PIV+, K8 Venice 51 | CPUF_SSSE3 = 0x200, // Core 2 52 | CPUF_SSE4 = 0x400, 53 | CPUF_SSE4_1 = 0x400, // Penryn, Wolfdale, Yorkfield 54 | CPUF_AVX = 0x800, // Sandy Bridge, Bulldozer 55 | CPUF_SSE4_2 = 0x1000, // Nehalem 56 | // AVS+ 57 | CPUF_AVX2 = 0x2000, // Haswell 58 | CPUF_FMA3 = 0x4000, 59 | CPUF_F16C = 0x8000, 60 | CPUF_MOVBE = 0x10000, // Big Endian move 61 | CPUF_POPCNT = 0x20000, 62 | CPUF_AES = 0x40000, 63 | CPUF_FMA4 = 0x80000, 64 | 65 | CPUF_AVX512F = 0x100000, // AVX-512 Foundation. 
66 | CPUF_AVX512DQ = 0x200000, // AVX-512 DQ (Double/Quad granular) Instructions 67 | CPUF_AVX512PF = 0x400000, // AVX-512 Prefetch 68 | CPUF_AVX512ER = 0x800000, // AVX-512 Exponential and Reciprocal 69 | CPUF_AVX512CD = 0x1000000, // AVX-512 Conflict Detection 70 | CPUF_AVX512BW = 0x2000000, // AVX-512 BW (Byte/Word granular) Instructions 71 | CPUF_AVX512VL = 0x4000000, // AVX-512 VL (128/256 Vector Length) Extensions 72 | CPUF_AVX512IFMA = 0x8000000, // AVX-512 IFMA integer 52 bit 73 | CPUF_AVX512VBMI = 0x10000000,// AVX-512 VBMI 74 | }; 75 | 76 | #ifdef BUILDING_AVSCORE 77 | int GetCPUFlags(); 78 | void SetMaxCPU(int new_flags); 79 | #endif 80 | 81 | #endif // AVSCORE_CPUID_H 82 | -------------------------------------------------------------------------------- /FluxSmooth/avs/capi.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 
18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 32 | 33 | #ifndef AVS_CAPI_H 34 | #define AVS_CAPI_H 35 | 36 | #include "config.h" 37 | 38 | #ifdef AVS_POSIX 39 | // this is also defined in avs/posix.h 40 | #define __declspec(x) 41 | #endif 42 | 43 | #ifdef __cplusplus 44 | # define EXTERN_C extern "C" 45 | #else 46 | # define EXTERN_C 47 | #endif 48 | 49 | #ifdef AVS_WINDOWS 50 | #ifdef BUILDING_AVSCORE 51 | # if defined(GCC) && defined(X86_32) 52 | # define AVSC_CC 53 | # else // MSVC builds and 64-bit GCC 54 | # ifndef AVSC_USE_STDCALL 55 | # define AVSC_CC __cdecl 56 | # else 57 | # define AVSC_CC __stdcall 58 | # endif 59 | # endif 60 | #else // needed for programs that talk to AviSynth+ 61 | # ifndef AVSC_WIN32_GCC32 // see comment below 62 | # ifndef AVSC_USE_STDCALL 63 | # define AVSC_CC __cdecl 64 | # else 65 | # define AVSC_CC __stdcall 66 | # endif 67 | # else 68 | # define AVSC_CC 69 | # endif 70 | #endif 71 | # else 72 | # define AVSC_CC 73 | #endif 74 | 75 | // On 64-bit Windows, there's only one calling convention, 76 | // so there is no difference between MSVC and GCC. 
On 32-bit, 77 | // this isn't true. The convention that GCC needs to use to 78 | // even build AviSynth+ as 32-bit makes anything that uses 79 | // it incompatible with 32-bit MSVC builds of AviSynth+. 80 | // The AVSC_WIN32_GCC32 define is meant to provide a user 81 | // switchable way to make builds of FFmpeg to test 32-bit 82 | // GCC builds of AviSynth+ without having to screw around 83 | // with alternate headers, while still default to the usual 84 | // situation of using 32-bit MSVC builds of AviSynth+. 85 | 86 | // Hopefully, this situation will eventually be resolved 87 | // and a broadly compatible solution will arise so the 88 | // same 32-bit FFmpeg build can handle either MSVC or GCC 89 | // builds of AviSynth+. 90 | 91 | #define AVSC_INLINE static __inline 92 | 93 | #ifdef BUILDING_AVSCORE 94 | #ifdef AVS_WINDOWS 95 | # define AVSC_EXPORT __declspec(dllexport) 96 | # define AVSC_API(ret, name) EXTERN_C AVSC_EXPORT ret AVSC_CC name 97 | #else 98 | # define AVSC_EXPORT EXTERN_C 99 | # define AVSC_API(ret, name) EXTERN_C ret AVSC_CC name 100 | #endif 101 | #else 102 | # define AVSC_EXPORT EXTERN_C __declspec(dllexport) 103 | # ifndef AVSC_NO_DECLSPEC 104 | # define AVSC_API(ret, name) EXTERN_C __declspec(dllimport) ret AVSC_CC name 105 | # else 106 | # define AVSC_API(ret, name) typedef ret (AVSC_CC *name##_func) 107 | # endif 108 | #endif 109 | 110 | #endif //AVS_CAPI_H 111 | -------------------------------------------------------------------------------- /FluxSmooth/avs/alignment.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 
8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 32 | 33 | #ifndef AVS_ALIGNMENT_H 34 | #define AVS_ALIGNMENT_H 35 | 36 | // Functions and macros to help work with alignment requirements. 37 | 38 | // Tells if a number is a power of two. 39 | #define IS_POWER2(n) ((n) && !((n) & ((n) - 1))) 40 | 41 | // Tells if the pointer "ptr" is aligned to "align" bytes. 
42 | #define IS_PTR_ALIGNED(ptr, align) (((uintptr_t)ptr & ((uintptr_t)(align-1))) == 0) 43 | 44 | // Rounds up the number "n" to the next greater multiple of "align" 45 | #define ALIGN_NUMBER(n, align) (((n) + (align)-1) & (~((align)-1))) 46 | 47 | // Rounds up the pointer address "ptr" to the next greater multiple of "align" 48 | #define ALIGN_POINTER(ptr, align) (((uintptr_t)(ptr) + (align)-1) & (~(uintptr_t)((align)-1))) 49 | 50 | #ifdef __cplusplus 51 | 52 | #include 53 | #include 54 | #include 55 | #include "config.h" 56 | 57 | #if defined(MSVC) && _MSC_VER<1400 58 | // needed for VS2013, otherwise C++11 'alignas' works 59 | #define avs_alignas(x) __declspec(align(x)) 60 | #else 61 | // assumes C++11 support 62 | #define avs_alignas(x) alignas(x) 63 | #endif 64 | 65 | template 66 | static bool IsPtrAligned(T* ptr, size_t align) 67 | { 68 | assert(IS_POWER2(align)); 69 | return (bool)IS_PTR_ALIGNED(ptr, align); 70 | } 71 | 72 | template 73 | static T AlignNumber(T n, T align) 74 | { 75 | assert(IS_POWER2(align)); 76 | return ALIGN_NUMBER(n, align); 77 | } 78 | 79 | template 80 | static T* AlignPointer(T* ptr, size_t align) 81 | { 82 | assert(IS_POWER2(align)); 83 | return (T*)ALIGN_POINTER(ptr, align); 84 | } 85 | 86 | extern "C" 87 | { 88 | #else 89 | #include 90 | #endif // __cplusplus 91 | 92 | // Returns a new buffer that is at least the size "nbytes". 93 | // The buffer will be aligned to "align" bytes. 94 | // Returns NULL on error. On successful allocation, 95 | // the returned buffer must be freed using "avs_free". 
96 | inline void* avs_malloc(size_t nbytes, size_t align) 97 | { 98 | if (!IS_POWER2(align)) 99 | return NULL; 100 | 101 | size_t offset = sizeof(void*) + align - 1; 102 | 103 | void *orig = malloc(nbytes + offset); 104 | if (orig == NULL) 105 | return NULL; 106 | 107 | void **aligned = (void**)(((uintptr_t)orig + (uintptr_t)offset) & (~(uintptr_t)(align-1))); 108 | aligned[-1] = orig; 109 | return aligned; 110 | } 111 | 112 | // Buffers allocated using "avs_malloc" must be freed 113 | // using "avs_free" instead of "free". 114 | inline void avs_free(void *ptr) 115 | { 116 | // Mirroring free()'s semantic requires us to accept NULLs 117 | if (ptr == NULL) 118 | return; 119 | 120 | free(((void**)ptr)[-1]); 121 | } 122 | 123 | #ifdef __cplusplus 124 | } // extern "C" 125 | 126 | // The point of these undef's is to force using the template functions 127 | // if we are in C++ mode. For C, the user can rely only on the macros. 128 | #undef IS_PTR_ALIGNED 129 | #undef ALIGN_NUMBER 130 | #undef ALIGN_POINTER 131 | 132 | #endif // __cplusplus 133 | 134 | #endif //AVS_ALIGNMENT_H 135 | -------------------------------------------------------------------------------- /FluxSmooth/avs/posix.h: -------------------------------------------------------------------------------- 1 | // This program is free software; you can redistribute it and/or modify 2 | // it under the terms of the GNU General Public License as published by 3 | // the Free Software Foundation; either version 2 of the License, or 4 | // (at your option) any later version. 5 | // 6 | // This program is distributed in the hope that it will be useful, 7 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 8 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 | // GNU General Public License for more details. 
10 | // 11 | // You should have received a copy of the GNU General Public License 12 | // along with this program; if not, write to the Free Software 13 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 14 | // http://www.gnu.org/copyleft/gpl.html . 15 | // 16 | // Linking Avisynth statically or dynamically with other modules is making a 17 | // combined work based on Avisynth. Thus, the terms and conditions of the GNU 18 | // General Public License cover the whole combination. 19 | // 20 | // As a special exception, the copyright holders of Avisynth give you 21 | // permission to link Avisynth with independent modules that communicate with 22 | // Avisynth solely through the interfaces defined in avisynth.h, regardless of the license 23 | // terms of these independent modules, and to copy and distribute the 24 | // resulting combined work under terms of your choice, provided that 25 | // every copy of the combined work is accompanied by a complete copy of 26 | // the source code of Avisynth (the version of Avisynth used to produce the 27 | // combined work), being distributed under the terms of the GNU General 28 | // Public License plus this exception. An independent module is a module 29 | // which is not derived from or based on Avisynth, such as 3rd-party filters, 30 | // import and export plugins, or graphical user interfaces. 
31 | 32 | #ifdef AVS_POSIX 33 | #ifndef AVSCORE_POSIX_H 34 | #define AVSCORE_POSIX_H 35 | 36 | #ifdef __cplusplus 37 | #include 38 | #endif 39 | #include 40 | #include 41 | 42 | // Define these MSVC-extension used in Avisynth 43 | #define __single_inheritance 44 | 45 | // These things don't exist in Linux 46 | #define __declspec(x) 47 | #define lstrlen strlen 48 | #define lstrcmp strcmp 49 | #define lstrcmpi strcasecmp 50 | #define _stricmp strcasecmp 51 | #define _strnicmp strncasecmp 52 | #define _strdup strdup 53 | #define SetCurrentDirectory(x) chdir(x) 54 | #define SetCurrentDirectoryW(x) chdir(x) 55 | #define GetCurrentDirectoryW(x) getcwd(x) 56 | #define _putenv putenv 57 | #define _alloca alloca 58 | 59 | // Borrowing some compatibility macros from AvxSynth, slightly modified 60 | #define UInt32x32To64(a, b) ((uint64_t)(((uint64_t)((uint32_t)(a))) * ((uint32_t)(b)))) 61 | #define Int64ShrlMod32(a, b) ((uint64_t)((uint64_t)(a) >> (b))) 62 | #define Int32x32To64(a, b) ((int64_t)(((int64_t)((long)(a))) * ((long)(b)))) 63 | 64 | #define InterlockedIncrement(x) __sync_add_and_fetch((x), 1) 65 | #define InterlockedDecrement(x) __sync_sub_and_fetch((x), 1) 66 | #define MulDiv(nNumber, nNumerator, nDenominator) (int32_t) (((int64_t) (nNumber) * (int64_t) (nNumerator) + (int64_t) ((nDenominator)/2)) / (int64_t) (nDenominator)) 67 | 68 | #ifndef TRUE 69 | #define TRUE true 70 | #endif 71 | 72 | #ifndef FALSE 73 | #define FALSE false 74 | #endif 75 | 76 | #define S_FALSE (0x00000001) 77 | #define E_FAIL (0x80004005) 78 | #define FAILED(hr) ((hr) & 0x80000000) 79 | #define SUCCEEDED(hr) (!FAILED(hr)) 80 | 81 | // Statuses copied from comments in exception.cpp 82 | #define STATUS_GUARD_PAGE_VIOLATION 0x80000001 83 | #define STATUS_DATATYPE_MISALIGNMENT 0x80000002 84 | #define STATUS_BREAKPOINT 0x80000003 85 | #define STATUS_SINGLE_STEP 0x80000004 86 | #define STATUS_ACCESS_VIOLATION 0xc0000005 87 | #define STATUS_IN_PAGE_ERROR 0xc0000006 88 | #define 
STATUS_INVALID_HANDLE 0xc0000008 89 | #define STATUS_NO_MEMORY 0xc0000017 90 | #define STATUS_ILLEGAL_INSTRUCTION 0xc000001d 91 | #define STATUS_NONCONTINUABLE_EXCEPTION 0xc0000025 92 | #define STATUS_INVALID_DISPOSITION 0xc0000026 93 | #define STATUS_ARRAY_BOUNDS_EXCEEDED 0xc000008c 94 | #define STATUS_FLOAT_DENORMAL_OPERAND 0xc000008d 95 | #define STATUS_FLOAT_DIVIDE_BY_ZERO 0xc000008e 96 | #define STATUS_FLOAT_INEXACT_RESULT 0xc000008f 97 | #define STATUS_FLOAT_INVALID_OPERATION 0xc0000090 98 | #define STATUS_FLOAT_OVERFLOW 0xc0000091 99 | #define STATUS_FLOAT_STACK_CHECK 0xc0000092 100 | #define STATUS_FLOAT_UNDERFLOW 0xc0000093 101 | #define STATUS_INTEGER_DIVIDE_BY_ZERO 0xc0000094 102 | #define STATUS_INTEGER_OVERFLOW 0xc0000095 103 | #define STATUS_PRIVILEGED_INSTRUCTION 0xc0000096 104 | #define STATUS_STACK_OVERFLOW 0xc00000fd 105 | 106 | // Calling convension 107 | #define __stdcall 108 | #define __cdecl 109 | 110 | #endif // AVSCORE_POSIX_H 111 | #endif // AVS_POSIX 112 | -------------------------------------------------------------------------------- /FluxSmooth/avs/config.h: -------------------------------------------------------------------------------- 1 | // Avisynth C Interface Version 0.20 2 | // Copyright 2003 Kevin Atkinson 3 | 4 | // This program is free software; you can redistribute it and/or modify 5 | // it under the terms of the GNU General Public License as published by 6 | // the Free Software Foundation; either version 2 of the License, or 7 | // (at your option) any later version. 8 | // 9 | // This program is distributed in the hope that it will be useful, 10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | // GNU General Public License for more details. 
13 | // 14 | // You should have received a copy of the GNU General Public License 15 | // along with this program; if not, write to the Free Software 16 | // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit 17 | // http://www.gnu.org/copyleft/gpl.html . 18 | // 19 | // As a special exception, I give you permission to link to the 20 | // Avisynth C interface with independent modules that communicate with 21 | // the Avisynth C interface solely through the interfaces defined in 22 | // avisynth_c.h, regardless of the license terms of these independent 23 | // modules, and to copy and distribute the resulting combined work 24 | // under terms of your choice, provided that every copy of the 25 | // combined work is accompanied by a complete copy of the source code 26 | // of the Avisynth C interface and Avisynth itself (with the version 27 | // used to produce the combined work), being distributed under the 28 | // terms of the GNU General Public License plus this exception. An 29 | // independent module is a module which is not derived from or based 30 | // on Avisynth C Interface, such as 3rd-party filters, import and 31 | // export plugins, or graphical user interfaces. 32 | 33 | #ifndef AVS_CONFIG_H 34 | #define AVS_CONFIG_H 35 | 36 | // Undefine this to get cdecl calling convention 37 | #define AVSC_USE_STDCALL 1 38 | 39 | // NOTE TO PLUGIN AUTHORS: 40 | // Because FRAME_ALIGN can be substantially higher than the alignment 41 | // a plugin actually needs, plugins should not use FRAME_ALIGN to check for 42 | // alignment. They should always request the exact alignment value they need. 43 | // This is to make sure that plugins work over the widest range of AviSynth 44 | // builds possible. 
45 | #define FRAME_ALIGN 64 46 | 47 | #if defined(_M_AMD64) || defined(__x86_64) 48 | # define X86_64 49 | #elif defined(_M_IX86) || defined(__i386__) 50 | # define X86_32 51 | // VS2017 introduced _M_ARM64 52 | #elif defined(_M_ARM64) || defined(__aarch64__) 53 | # define ARM64 54 | #elif defined(_M_ARM) || defined(__arm__) 55 | # define ARM32 56 | #else 57 | # error Unsupported CPU architecture. 58 | #endif 59 | 60 | // VC++ LLVM-Clang-cl MinGW-Gnu 61 | // MSVC x x 62 | // MSVC_PURE x 63 | // CLANG x 64 | // GCC x 65 | 66 | #if defined(__clang__) 67 | // Check clang first. clang-cl also defines __MSC_VER 68 | // We set MSVC because they are mostly compatible 69 | # define CLANG 70 | #if defined(_MSC_VER) 71 | # define MSVC 72 | # define AVS_FORCEINLINE __attribute__((always_inline)) 73 | #else 74 | # define AVS_FORCEINLINE __attribute__((always_inline)) inline 75 | #endif 76 | #elif defined(_MSC_VER) 77 | # define MSVC 78 | # define MSVC_PURE 79 | # define AVS_FORCEINLINE __forceinline 80 | #elif defined(__GNUC__) 81 | # define GCC 82 | # define AVS_FORCEINLINE __attribute__((always_inline)) inline 83 | #else 84 | # error Unsupported compiler. 85 | # define AVS_FORCEINLINE inline 86 | # undef __forceinline 87 | # define __forceinline inline 88 | #endif 89 | 90 | #if defined(_WIN32) 91 | # define AVS_WINDOWS 92 | #elif defined(__linux__) 93 | # define AVS_LINUX 94 | # define AVS_POSIX 95 | #elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) 96 | # define AVS_BSD 97 | # define AVS_POSIX 98 | #elif defined(__APPLE__) 99 | # define AVS_MACOS 100 | # define AVS_POSIX 101 | #else 102 | # error Operating system unsupported. 
103 | #endif 104 | 105 | // useful warnings disabler macros for supported compilers 106 | 107 | #if defined(_MSC_VER) 108 | #define DISABLE_WARNING_PUSH __pragma(warning( push )) 109 | #define DISABLE_WARNING_POP __pragma(warning( pop )) 110 | #define DISABLE_WARNING(warningNumber) __pragma(warning( disable : warningNumber )) 111 | 112 | #define DISABLE_WARNING_UNREFERENCED_LOCAL_VARIABLE DISABLE_WARNING(4101) 113 | #define DISABLE_WARNING_UNREFERENCED_FUNCTION DISABLE_WARNING(4505) 114 | // other warnings you want to deactivate... 115 | 116 | #elif defined(__GNUC__) || defined(__clang__) 117 | #define DO_PRAGMA(X) _Pragma(#X) 118 | #define DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push) 119 | #define DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop) 120 | #define DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored #warningName) 121 | 122 | #define DISABLE_WARNING_UNREFERENCED_LOCAL_VARIABLE DISABLE_WARNING(-Wunused-variable) 123 | #define DISABLE_WARNING_UNREFERENCED_FUNCTION DISABLE_WARNING(-Wunused-function) 124 | // other warnings you want to deactivate... 125 | 126 | #else 127 | #define DISABLE_WARNING_PUSH 128 | #define DISABLE_WARNING_POP 129 | #define DISABLE_WARNING_UNREFERENCED_LOCAL_VARIABLE 130 | #define DISABLE_WARNING_UNREFERENCED_FUNCTION 131 | // other warnings you want to deactivate... 132 | 133 | #endif 134 | 135 | #if defined(AVS_POSIX) 136 | #define NEW_AVSVALUE 137 | #else 138 | #define NEW_AVSVALUE 139 | #endif 140 | 141 | #endif //AVS_CONFIG_H 142 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | [Rr]elease LLVM/ 19 | [Rr]eleases LLVM/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # MSTest test Results 33 | [Tt]est[Rr]esult*/ 34 | [Bb]uild[Ll]og.* 35 | 36 | # NUNIT 37 | *.VisualState.xml 38 | TestResult.xml 39 | 40 | # Build Results of an ATL Project 41 | [Dd]ebugPS/ 42 | [Rr]eleasePS/ 43 | dlldata.c 44 | 45 | # DNX 46 | project.lock.json 47 | project.fragment.lock.json 48 | artifacts/ 49 | 50 | *_i.c 51 | *_p.c 52 | *_i.h 53 | *.ilk 54 | *.meta 55 | *.obj 56 | *.pch 57 | *.pdb 58 | *.pgc 59 | *.pgd 60 | *.rsp 61 | *.sbr 62 | *.tlb 63 | *.tli 64 | *.tlh 65 | *.tmp 66 | *.tmp_proj 67 | *.log 68 | *.vspscc 69 | *.vssscc 70 | .builds 71 | *.pidb 72 | *.svclog 73 | *.scc 74 | 75 | # Chutzpah Test files 76 | _Chutzpah* 77 | 78 | # Visual C++ cache files 79 | ipch/ 80 | *.aps 81 | *.ncb 82 | *.opendb 83 | *.opensdf 84 | *.sdf 85 | *.cachefile 86 | *.VC.db 87 | *.VC.VC.opendb 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | *.sap 94 | 95 | # TFS 2012 Local Workspace 96 | $tf/ 97 | 98 | # Guidance Automation Toolkit 99 | *.gpState 100 | 101 | # ReSharper is a .NET coding add-in 102 | _ReSharper*/ 103 | *.[Rr]e[Ss]harper 104 | *.DotSettings.user 105 | 106 | # JustCode is a .NET coding add-in 107 | .JustCode 108 | 109 | # TeamCity is a build add-in 110 | _TeamCity* 111 | 112 | # DotCover is a Code Coverage Tool 113 | *.dotCover 114 | 115 | # NCrunch 116 | _NCrunch_* 117 | .*crunch*.local.xml 118 | nCrunchTemp_* 119 | 120 | # MightyMoose 121 | *.mm.* 122 | AutoTest.Net/ 123 | 124 | # 
Web workbench (sass) 125 | .sass-cache/ 126 | 127 | # Installshield output folder 128 | [Ee]xpress/ 129 | 130 | # DocProject is a documentation generator add-in 131 | DocProject/buildhelp/ 132 | DocProject/Help/*.HxT 133 | DocProject/Help/*.HxC 134 | DocProject/Help/*.hhc 135 | DocProject/Help/*.hhk 136 | DocProject/Help/*.hhp 137 | DocProject/Help/Html2 138 | DocProject/Help/html 139 | 140 | # Click-Once directory 141 | publish/ 142 | 143 | # Publish Web Output 144 | *.[Pp]ublish.xml 145 | *.azurePubxml 146 | # TODO: Comment the next line if you want to checkin your web deploy settings 147 | # but database connection strings (with potential passwords) will be unencrypted 148 | #*.pubxml 149 | *.publishproj 150 | 151 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 152 | # checkin your Azure Web App publish settings, but sensitive information contained 153 | # in these scripts will be unencrypted 154 | PublishScripts/ 155 | 156 | # NuGet Packages 157 | *.nupkg 158 | # The packages folder can be ignored because of Package Restore 159 | **/packages/* 160 | # except build/, which is used as an MSBuild target. 
161 | !**/packages/build/ 162 | # Uncomment if necessary however generally it will be regenerated when needed 163 | #!**/packages/repositories.config 164 | # NuGet v3's project.json files produces more ignoreable files 165 | *.nuget.props 166 | *.nuget.targets 167 | 168 | # Microsoft Azure Build Output 169 | csx/ 170 | *.build.csdef 171 | 172 | # Microsoft Azure Emulator 173 | ecf/ 174 | rcf/ 175 | 176 | # Windows Store app package directories and files 177 | AppPackages/ 178 | BundleArtifacts/ 179 | Package.StoreAssociation.xml 180 | _pkginfo.txt 181 | 182 | # Visual Studio cache files 183 | # files ending in .cache can be ignored 184 | *.[Cc]ache 185 | # but keep track of directories ending in .cache 186 | !*.[Cc]ache/ 187 | 188 | # Others 189 | ClientBin/ 190 | ~$* 191 | *~ 192 | *.dbmdl 193 | *.dbproj.schemaview 194 | *.jfm 195 | *.pfx 196 | *.publishsettings 197 | node_modules/ 198 | orleans.codegen.cs 199 | 200 | # Since there are multiple workflows, uncomment next line to ignore bower_components 201 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 202 | #bower_components/ 203 | 204 | # RIA/Silverlight projects 205 | Generated_Code/ 206 | 207 | # Backup & report files from converting an old project file 208 | # to a newer Visual Studio version. 
Backup files are not needed, 209 | # because we have git ;-) 210 | _UpgradeReport_Files/ 211 | Backup*/ 212 | UpgradeLog*.XML 213 | UpgradeLog*.htm 214 | 215 | # SQL Server files 216 | *.mdf 217 | *.ldf 218 | 219 | # Business Intelligence projects 220 | *.rdl.data 221 | *.bim.layout 222 | *.bim_*.settings 223 | 224 | # Microsoft Fakes 225 | FakesAssemblies/ 226 | 227 | # GhostDoc plugin setting file 228 | *.GhostDoc.xml 229 | 230 | # Node.js Tools for Visual Studio 231 | .ntvs_analysis.dat 232 | 233 | # Visual Studio 6 build log 234 | *.plg 235 | 236 | # Visual Studio 6 workspace options file 237 | *.opt 238 | 239 | # Visual Studio LightSwitch build output 240 | **/*.HTMLClient/GeneratedArtifacts 241 | **/*.DesktopClient/GeneratedArtifacts 242 | **/*.DesktopClient/ModelManifest.xml 243 | **/*.Server/GeneratedArtifacts 244 | **/*.Server/ModelManifest.xml 245 | _Pvt_Extensions 246 | 247 | # Paket dependency manager 248 | .paket/paket.exe 249 | paket-files/ 250 | 251 | # FAKE - F# Make 252 | .fake/ 253 | 254 | # JetBrains Rider 255 | .idea/ 256 | *.sln.iml 257 | 258 | # CodeRush 259 | .cr/ 260 | 261 | # Python Tools for Visual Studio (PTVS) 262 | __pycache__/ 263 | *.pyc -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.h: -------------------------------------------------------------------------------- 1 | #ifndef __FLUXSMOOTH_H__ 2 | #define __FLUXSMOOTH_H__ 3 | 4 | #include "avisynth.h" 5 | #include "stdint.h" 6 | #include "emmintrin.h" 7 | #include 8 | #include 9 | #include 10 | 11 | /************************************ 12 | // AVX512 enabler switch!!! 
13 | ************************************/ 14 | #define FLUXSMOOTH_AVX512_ENABLED 15 | 16 | #if defined(_MSC_VER) && !defined(__clang__) 17 | // Some missing avx512 mask intrinsics are handmade for Microsoft (for 19.20) 18 | // As of April 2019, MS version of ??intrin.h does not support AVX512BW _k*_mask* functions 19 | // https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html 20 | // uncomment if AVX512 is really not needed 21 | // #undef FLUXSMOOTH_AVX512_ENABLED 22 | #endif 23 | 24 | /************************************ 25 | // Helpers, missing intrinsics 26 | ************************************/ 27 | 28 | // SSE4.1 simulation for SSE2 29 | static AVS_FORCEINLINE __m128i _MM_BLENDV_EPI8(__m128i const &a, __m128i const &b, __m128i const &selector) { 30 | return _mm_or_si128(_mm_and_si128(selector, b), _mm_andnot_si128(selector, a)); 31 | } 32 | 33 | // non-existant simd 34 | static AVS_FORCEINLINE __m128i _MM_CMPLE_EPU16(__m128i x, __m128i y) 35 | { 36 | // Returns 0xFFFF where x <= y: 37 | return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128()); 38 | } 39 | 40 | #define _mm_cmpge_epu8(a, b) \ 41 | _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) 42 | 43 | #define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) 44 | 45 | // non-existant simd 46 | static AVS_FORCEINLINE __m128i _mm_cmpgt_epu8(__m128i x, __m128i y) 47 | { 48 | // Returns 0xFF where x > y: 49 | return _mm_andnot_si128( 50 | _mm_cmpeq_epi8(x, y), 51 | _mm_cmpeq_epi8(_mm_max_epu8(x, y), x) 52 | ); 53 | } 54 | 55 | #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) 56 | 57 | static AVS_FORCEINLINE __m128i _mm_cmpge_epi16(__m128i x, __m128i y) 58 | { 59 | // Returns 0xFFFF where x >= y: 60 | return _mm_or_si128(_mm_cmpeq_epi16(x, y), _mm_cmpgt_epi16(x, y)); 61 | } 62 | 63 | #define _mm_cmple_epi16(a, b) _mm_cmpge_epi16(b, a) 64 | 65 | /************************************ 66 | // other constants 67 | ************************************/ 68 | 69 | // 
Optimizations by 'opt' parameter 70 | enum { USE_OPT_C = 0, USE_OPT_SSE2 = 1, USE_OPT_SSE41 = 2, USE_OPT_AVX2 = 3, USE_OPT_AVX512 = 4}; 71 | 72 | constexpr int planes_y[4] = { PLANAR_Y, PLANAR_U, PLANAR_V, PLANAR_A }; 73 | constexpr int planes_r[4] = { PLANAR_G, PLANAR_B, PLANAR_R, PLANAR_A }; 74 | 75 | /************************************ 76 | // Prototypes, Temporal 77 | ************************************/ 78 | #ifdef FLUXSMOOTH_AVX512_ENABLED 79 | void fluxT_avx512_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 80 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 81 | 82 | void fluxT_avx512(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 83 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 84 | #endif 85 | 86 | void fluxT_avx2_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 87 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 88 | 89 | void fluxT_avx2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 90 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 91 | 92 | void fluxT_sse41(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 93 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 94 | 95 | void fluxT_sse41_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 96 | uint8_t* destp, const int dst_pitch, const int width, const int 
height, int temporal_threshold, short *scaletab); 97 | 98 | void fluxT_sse2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 99 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 100 | 101 | template 102 | void fluxT_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 103 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 104 | 105 | /************************************ 106 | // Prototypes, Spatial - Temporal 107 | ************************************/ 108 | #ifdef FLUXSMOOTH_AVX512_ENABLED 109 | void fluxST_avx512_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 110 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 111 | 112 | void fluxST_avx512(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 113 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 114 | #endif 115 | 116 | void fluxST_avx2_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 117 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 118 | 119 | void fluxST_avx2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 120 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 121 | 122 | void fluxST_sse41(const uint8_t*, const int, const 
uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 123 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 124 | 125 | void fluxST_sse41_uint16(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 126 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 127 | 128 | void fluxST_sse2(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 129 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 130 | 131 | template 132 | void fluxST_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 133 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 134 | 135 | /************************************ 136 | // Filter classes 137 | ************************************/ 138 | 139 | class FluxSmoothST: public GenericVideoFilter 140 | { 141 | using proc_ST_t = void(*)(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 142 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 143 | 144 | protected: 145 | PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment * env) override; 146 | 147 | public: 148 | FluxSmoothST(PClip _child, int _temporal_threshold, int _spatial_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env); 149 | 150 | // Auto register AVS+ mode: NICE filter 151 | int __stdcall SetCacheHints(int cachehints, int frame_range) override { 152 | 
return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0; 153 | } 154 | 155 | private: 156 | int spatial_threshold; 157 | int temporal_threshold; 158 | bool processPlane[3]; 159 | int opt; 160 | short scaletab[16]; 161 | proc_ST_t proc_ST[3]; // for all planes 162 | }; 163 | 164 | class FluxSmoothT : public GenericVideoFilter 165 | { 166 | using proc_T_t = void(*)(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 167 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 168 | 169 | private: 170 | int temporal_threshold; 171 | bool processPlane[3]; 172 | int opt; 173 | short scaletab[16]; // for C 174 | proc_T_t proc_T[3]; // for all planes 175 | 176 | protected: 177 | PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) override; 178 | 179 | public: 180 | FluxSmoothT(PClip _child, int _temporal_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env); 181 | 182 | // Auto register AVS+ mode: NICE filter 183 | int __stdcall SetCacheHints(int cachehints, int frame_range) override { 184 | return cachehints == CACHE_GET_MTMODE ? MT_NICE_FILTER : 0; 185 | } 186 | 187 | }; 188 | 189 | #endif // #define __FLUXSMOOTH_H__ 190 | 191 | -------------------------------------------------------------------------------- /FluxSmooth/documentation/readme.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | FluxSmooth 4 | 5 | 6 |

FluxSmooth

7 |

An Avisynth filter for smoothing of fluctuations

8 |

By Ross Thomas <ross@grinfinity.com>

9 |

Rewrite by Ferenc Pinter

10 |

There is no copyright on this code, and there are no conditions on its 11 | distribution or use. Do with it what you will.

12 |

Latest version:

13 |

FluxSmooth v1.4 (20190426) https://github.com/pinterf/FluxSmooth

14 |

Description

15 |

One of the fundamental properties of noise is that it's random. One of the 16 | fundamental properties of motion is that it's not. This is the premise behind 17 | FluxSmooth, which examines each pixel and compares it to the corresponding 18 | pixel in the previous and next frames. Smoothing occurs if both the previous 19 | frame's value and the next frame's value are greater, or if both are less, than 20 | the value in the current frame.

21 |

I like to call this a "fluctuating" pixel, and then I like to wipe that pixel from 22 | existence by averaging it with its neighbours. For FluxSmoothST, this 23 | is (by default) done in a spatio-temporal manner, in that for each 24 | fluctuating pixel its 8 immediate spatial neighbours as well as its 2 temporal 25 | neighbours (the above-mentioned corresponding pixels from the previous and next 26 | frames) are considered for inclusion in the average. If the value of each pixel 27 | is within the specified threshold, it is included. If not, it isn't. 28 | FluxSmoothT performs only temporal averaging.

29 |

This filter seems to remove almost all noise from low-noise sources (such as 30 | DVD) and a lot of noise from high-noise sources (such as cable TV captures), 31 | while maintaining a good amount of detail.

32 |

Using FluxSmoothT instead of FluxSmoothST for temporal-only smoothing is faster.

33 |

Usage

34 |
35 |

FluxSmoothT([clip], int temporal_threshold, bool luma, bool chroma, int opt)

36 |

FluxSmoothST([clip], int temporal_threshold, int spatial_threshold, bool luma, bool chroma, int opt)

37 |
38 | 39 | 40 | 42 | 44 | 46 | 47 | 48 | 49 | 56 | 57 | 58 | 59 | 60 | 66 | 67 | 68 | 69 | 70 | 74 | 75 | 76 | 77 | 78 | 82 | 83 | 84 | 85 | 86 | 92 | 93 | 94 |
41 | Parameter 43 | Meaning 45 | Default
temporal_threshold 50 |

Temporal neighbour pixels within this threshold from the current pixel are 51 | included in the average.

52 |

The threshold is normalized to match the old 8 bit clips; you can keep the same value for 10-16 bit clips to have the same effect

53 |

If set to -1, no temporal smoothing occurs. (Cannot be set to -1 in 54 | FluxSmoothT.)

55 |
7
spatial_threshold 61 |

Spatial neighbour pixels within this threshold from the current pixel are 62 | included in the average.

63 |

The threshold is normalized to match with the old 8 bit clips, you can keep the same value for 10-16 bit clips to have the same effect

64 |

If set to -1, no spatial smoothing occurs.

65 |
7
luma 71 |

Enables luma channel processing

72 |

If set to false, the luma (Y) channel is simply copied. Has no effect for RGB clips.

73 |
true
chroma 79 |

Enables chroma channel processing

80 |

If set to false, the chroma (U/V) channels are simply copied. Has no effect for RGB clips.

81 |
true
opt 87 |

Debug parameter for directly choosing optimization

88 |

0=C, 1=SSE2, 2=SSE4.1, 3=AVX2, 4=AVX512, -1: automatic

89 |

If set to -1, the fastest one is chosen automatically

90 |

Note for 10-16 bits: due to the different averaging methods (precision of 1/x, rounding), results may not be bit-identical for different CPU targets

91 |
-1
95 |

Known Issues

96 |
    97 |
  • 98 | The very edges of the frame are unprocessed. 99 |
  • The very first and very last frames of a clip are unprocessed.
102 |

Author

103 |

Ross Thomas <ross@grinfinity.com>

104 |

Ferenc Pinter https://github.com/pinterf

105 |

History

106 | 107 | 108 | 110 | 112 | 113 | 114 | 115 | 118 | 119 | 120 | 121 | 133 | 134 | 135 | 136 | 138 | 139 | 140 | 141 | 146 | 147 | 148 | 149 | 156 | 157 | 158 | 159 | 166 | 167 | 168 | 173 | 174 | 175 | 176 | 179 | 180 | 181 | 182 | 186 | 187 | 188 | 189 | 194 | 195 | 196 | 197 | 198 | 199 |
109 | Version 111 | Description
1.4 (pinterf)
20190426
AVX512 support: when both AVX512F and AVX512BW extension are available (e.g. Skylake X and Cannon Lake).
116 | Available processor flags can be shown through the .Info() filter in Avisynth+.
117 | New value for 'opt': opt=4 means forced AVX512. Error message if system does not support those AVX512 flags.
1.3 (pinterf)
20190402
project moved to github: https://github.com/pinterf/FluxSmooth
122 | Built using Visual Studio 2017, additional LLVM 8.0 clang support
123 | Changed to AVS 2.6 plugin interface
124 | x64 build for Avisynth+
125 | Added version resource to DLL
126 | Removed MMX support, requires SSE2. (Though pure C is still available in the source)
127 | Dropped all inline assembly, new SIMD intrinsics based on C code, SSE2, SSE4.1 and AVX2 optimizations
128 | Single DLL, optimizations for different CPU instruction sets are chosen automatically.
129 | Reports MT Modes for Avisynth+: MT_NICE_FILTER
130 | Added Y, YV411, YV16 and YV24, 10-16 bits 4:2:0, 4:2:2, 4:4:4, planar RGB(A) 8-16 bits support besides existing YV12
131 | (YUY2 support was kept by a behind-the-scene YV16 to-from conversion. Conversion is lossless but slower than using native YV16)
132 | New parameters: bool "luma", bool "chroma" (default true) to disable processing of luma/chroma planes
1.1bFixed assuming previous and next frame pitches were the same as 137 | the current frame pitch.
1.1aYet another "oops" release. Current pixel is once again considered in the 142 | averaging code -- I found the lack of it too aggressive, especially during fast 143 | motion. Also fixed stupid "3am bug" involving a couple of variables I'd 144 | declared static that shouldn't've been. Thanks to krieger2005 for spotting that 145 | one, and ARDA for diagnosing it.
1.1Changed the averaging code so that the current pixel is excluded, which 150 | produces better noise reduction. Also split the code into two different 151 | filters, FluxSmoothT and FluxSmoothST. The former does temporal-only smoothing 152 | (equivalent to setting "spatial_threshold=-1" in FluxSmoothST) and is 153 | about 50% faster. Removed Avisynth 2.0x version to tidy up the code base. 154 | Does anyone actually use it any more? My thanks to fabrice and sh0dan for the 155 | 1.01 release during my extended absence :).
1.01Added by sh0dan:
160 | - Removed leak in AviSynth 2.5 YV12 mode (code by fabrice)
161 | - Aligned tables and variables.
162 | - Use AviSynth BitBlt for copying chroma.
163 | - Don't use streaming store. (movntq)
164 | All in all an approximate 15% speedup compared to previous version. All changes 165 | are marked with "sh0:".
1.0First "stable" release. I think it's been tested enough, but wait for a bunch 169 | of bugs to emerge and make me a liar... Fixed a bug that, in conjunction with a 170 | bug in the built-in resizers, caused an access violation under certain 171 | circumstances. Thanks to sh0dan for spotting that one :). Added "SetCacheHints" 172 | and upgraded to "AvisynthPluginInit2" in 2.5 version.
0.4Implemented iSSE-optimized version, which runs roughly double the speed of the 177 | C++ version. Some small optimizations to C++ version. Now smooths chroma as 178 | well as luma.
0.3Fixed bad bug that caused incorrect smoothing: no more in-place filtering. 183 | Changed defaults back to what they were, now that the algorithm works 184 | correctly. Spent some time benchmarking and tweaking various pieces of 185 | code, so should now be significantly faster.
0.2 190 |

Fixed non-fatal bug that caused a request for one frame beyond the end of the 191 | clip. Changed to in-place filtering so could squeeze a few optimizations here 192 | and there. Changed too-high defaults. First Avisynth 2.5/YV12 release.

193 |
0.1First release. Alpha code.
200 | 201 | 202 | -------------------------------------------------------------------------------- /FluxSmooth/FilterDef.cpp: -------------------------------------------------------------------------------- 1 | // FluxSmooth 2 | // Avisynth filter for spatio-temporal smoothing of fluctuations 3 | // 4 | // By Ross Thomas 5 | // 6 | // There is no copyright on this code, and there are no conditions 7 | // on its distribution or use. Do with it what you will. 8 | 9 | #ifdef AVS_WINDOWS 10 | #include 11 | #else 12 | #include "avs/posix.h" 13 | #endif 14 | #include 15 | #include "avisynth.h" 16 | #include "FluxSmooth.h" 17 | #include 18 | 19 | FluxSmoothST::FluxSmoothST(PClip _child, int _temporal_threshold, int _spatial_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env) 20 | : GenericVideoFilter(_child), 21 | spatial_threshold(_spatial_threshold), 22 | temporal_threshold(_temporal_threshold), 23 | opt(_opt) 24 | { 25 | assert(temporal_threshold >= -1); 26 | assert(spatial_threshold >= -1); 27 | assert(!((-1 == temporal_threshold) && (-1 == spatial_threshold))); 28 | assert(env); 29 | 30 | // division table 1/1, 1/2, ... 
1/11 31 | // only 1..11 is valid 32 | scaletab[0] = 0; 33 | scaletab[1] = 32767; 34 | for (int i = 2; i < 16; ++i) 35 | scaletab[i] = (int)(32768.0 / i + 0.5); 36 | 37 | const bool goodAVX512 = ((env->GetCPUFlags() & CPUF_AVX512F) == CPUF_AVX512F) && (env->GetCPUFlags() & CPUF_AVX512BW) == CPUF_AVX512BW; 38 | 39 | #ifndef FLUXSMOOTH_AVX512_ENABLED 40 | if (opt == USE_OPT_AVX512) 41 | env->ThrowError("FluxSmoothST: cannot apply opt: this DLL version does not support AVX512"); 42 | #endif 43 | 44 | if (opt == USE_OPT_AVX512 && !goodAVX512) 45 | env->ThrowError("FluxSmoothST: cannot apply opt: AVX512F and AVX512BW is needed"); 46 | if (opt == USE_OPT_AVX2 && !(env->GetCPUFlags() & CPUF_AVX2)) 47 | env->ThrowError("FluxSmoothST: cannot apply opt: AVX2 is not supported"); 48 | if (opt == USE_OPT_SSE41 && !(env->GetCPUFlags() & CPUF_SSE4_1)) 49 | env->ThrowError("FluxSmoothST: cannot apply opt: SSE4.1 is not supported"); 50 | if (opt == USE_OPT_SSE2 && !(env->GetCPUFlags() & CPUF_SSE2)) 51 | env->ThrowError("FluxSmoothST: cannot apply opt: SSE2 is not supported"); 52 | 53 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? 
planes_y : planes_r; 54 | int planecount = std::min(vi.NumComponents(), 3); 55 | int bits_per_pixel = vi.BitsPerComponent(); 56 | 57 | for (int i = 0; i < planecount; i++) { 58 | if (vi.IsRGB()) 59 | processPlane[i] = true; 60 | else if (i == 0) // Y 61 | processPlane[i] = _luma; 62 | else 63 | processPlane[i] = _chroma; 64 | 65 | const int actual_width = vi.width >> vi.GetPlaneWidthSubsampling(current_planes[i]); 66 | if (bits_per_pixel == 8) { 67 | #ifdef FLUXSMOOTH_AVX512_ENABLED 68 | if ((actual_width >= 1 + 64 + 1) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 69 | proc_ST[i] = fluxST_avx512; 70 | else 71 | #endif 72 | if ((actual_width >= 1 + 32 + 1) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 73 | proc_ST[i] = fluxST_avx2; 74 | else if ((actual_width >= 1 + 16 + 1) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 75 | proc_ST[i] = fluxST_sse41; 76 | else if ((actual_width >= 1 + 16 + 1) && (((env->GetCPUFlags() & CPUF_SSE2) == CPUF_SSE2 && opt < 0) || opt >= USE_OPT_SSE2)) 77 | proc_ST[i] = fluxST_sse2; 78 | else 79 | proc_ST[i] = fluxST_C; 80 | } 81 | else { 82 | #ifdef FLUXSMOOTH_AVX512_ENABLED 83 | if ((actual_width >= 1 + 32 + 1) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 84 | proc_ST[i] = fluxST_avx512_uint16; 85 | else 86 | #endif 87 | if ((actual_width >= 1 + 16 + 1) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 88 | proc_ST[i] = fluxST_avx2_uint16; 89 | else if ((actual_width >= 1 + 8 + 1) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 90 | proc_ST[i] = fluxST_sse41_uint16; 91 | else 92 | proc_ST[i] = fluxST_C; 93 | } 94 | } 95 | } 96 | 97 | static void copy_plane(PVideoFrame &destf, PVideoFrame &currf, int plane, IScriptEnvironment *env) { 98 | const uint8_t* srcp = currf->GetReadPtr(plane); 99 | int src_pitch = currf->GetPitch(plane); 100 | int height = 
currf->GetHeight(plane); 101 | int row_size = currf->GetRowSize(plane); 102 | uint8_t* destp = destf->GetWritePtr(plane); 103 | int dst_pitch = destf->GetPitch(plane); 104 | env->BitBlt(destp, dst_pitch, srcp, src_pitch, row_size, height); 105 | } 106 | 107 | PVideoFrame __stdcall FluxSmoothST::GetFrame(int n, IScriptEnvironment * env) 108 | { 109 | const uint8_t* srcp; 110 | const uint8_t* prevp; 111 | const uint8_t* nextp; 112 | uint8_t* destp; 113 | int src_pitch, dst_pitch, prv_pitch, nxt_pitch, row_size, height; 114 | 115 | PVideoFrame currf = child->GetFrame(n, env); 116 | PVideoFrame destf = env->NewVideoFrame(vi); 117 | 118 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? planes_y : planes_r; 119 | 120 | if (n == 0 || n == vi.num_frames - 1) 121 | { 122 | // 1st or last: not temporal 123 | for (int i = 0; i < vi.NumComponents(); i++) 124 | { 125 | const int plane = current_planes[i]; 126 | copy_plane(destf, currf, plane, env); 127 | } 128 | return destf; 129 | } 130 | 131 | PVideoFrame prevf = child->GetFrame(n - 1, env); 132 | PVideoFrame nextf = child->GetFrame(n + 1, env); 133 | 134 | int planecount = std::min(vi.NumComponents(), 3); 135 | 136 | for (int i = 0; i < planecount; i++) 137 | { 138 | const int plane = current_planes[i]; 139 | if (processPlane[i]) { 140 | dst_pitch = destf->GetPitch(plane); 141 | src_pitch = currf->GetPitch(plane); 142 | prv_pitch = prevf->GetPitch(plane); 143 | nxt_pitch = nextf->GetPitch(plane); 144 | row_size = currf->GetRowSize(plane); 145 | const int width = row_size / vi.ComponentSize(); 146 | height = currf->GetHeight(plane); 147 | srcp = currf->GetReadPtr(plane); 148 | prevp = prevf->GetReadPtr(plane); 149 | nextp = nextf->GetReadPtr(plane); 150 | destp = destf->GetWritePtr(plane); 151 | 152 | // copy top and bottom lines 153 | memcpy(destp + dst_pitch * (height - 1), srcp + src_pitch * (height - 1), row_size); 154 | memcpy(destp, srcp, row_size); 155 | // skip to 2nd line 156 | srcp += src_pitch; 157 | 
prevp += prv_pitch; 158 | nextp += nxt_pitch; 159 | destp += dst_pitch; 160 | height -= 2; // two lines less 161 | 162 | const int bits_per_pixel = vi.BitsPerComponent(); 163 | 164 | proc_ST[i](srcp, src_pitch, prevp, prv_pitch, nextp, nxt_pitch, destp, dst_pitch, width, height, temporal_threshold << (bits_per_pixel - 8), spatial_threshold << (bits_per_pixel - 8), scaletab); 165 | } 166 | else { 167 | copy_plane(destf, currf, plane, env); 168 | } 169 | } 170 | // copy alpha 171 | if (vi.NumComponents() == 4) { 172 | const int plane = PLANAR_A; 173 | copy_plane(destf, currf, plane, env); 174 | } 175 | 176 | return destf; 177 | } 178 | 179 | FluxSmoothT::FluxSmoothT(PClip _child, int _temporal_threshold, bool _luma, bool _chroma, int _opt, IScriptEnvironment * env) 180 | : GenericVideoFilter(_child), temporal_threshold(_temporal_threshold), 181 | opt(_opt) 182 | { 183 | assert(temporal_threshold >= -1); 184 | assert(!((-1 == temporal_threshold))); 185 | assert(env); 186 | 187 | // division table 1/1, 1/2, ... 
1/11 188 | // only 1..11 is valid 189 | scaletab[0] = 0; 190 | scaletab[1] = 32767; 191 | for (int i = 2; i < 16; ++i) 192 | scaletab[i] = (int)(32768.0 / i + 0.5); 193 | 194 | const bool goodAVX512 = ((env->GetCPUFlags() & CPUF_AVX512F) == CPUF_AVX512F) && (env->GetCPUFlags() & CPUF_AVX512BW) == CPUF_AVX512BW; 195 | 196 | #ifndef FLUXSMOOTH_AVX512_ENABLED 197 | if (opt == USE_OPT_AVX512) 198 | env->ThrowError("FluxSmoothT: cannot apply opt: this DLL version does not support AVX512"); 199 | #endif 200 | 201 | if (opt == USE_OPT_AVX512 && !goodAVX512) 202 | env->ThrowError("FluxSmoothT: cannot apply opt: AVX512F and AVX512BW is needed"); 203 | if (opt == USE_OPT_AVX2 && !(env->GetCPUFlags() & CPUF_AVX2)) 204 | env->ThrowError("FluxSmoothT: cannot apply opt: AVX2 is not supported"); 205 | if (opt == USE_OPT_SSE41 && !(env->GetCPUFlags() & CPUF_SSE4_1)) 206 | env->ThrowError("FluxSmoothT: cannot apply opt: SSE4.1 is not supported"); 207 | if (opt == USE_OPT_SSE2 && !(env->GetCPUFlags() & CPUF_SSE2)) 208 | env->ThrowError("FluxSmoothT: cannot apply opt: SSE2 is not supported"); 209 | 210 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? 
planes_y : planes_r; 211 | int planecount = std::min(vi.NumComponents(), 3); 212 | int bits_per_pixel = vi.BitsPerComponent(); 213 | 214 | for (int i = 0; i < planecount; i++) { 215 | if (vi.IsRGB()) 216 | processPlane[i] = true; 217 | else if (i == 0) // Y 218 | processPlane[i] = _luma; 219 | else 220 | processPlane[i] = _chroma; 221 | 222 | const int actual_width = vi.width >> vi.GetPlaneWidthSubsampling(current_planes[i]); 223 | 224 | if (bits_per_pixel == 8) { 225 | #ifdef FLUXSMOOTH_AVX512_ENABLED 226 | if ((actual_width >= 64) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 227 | proc_T[i] = fluxT_avx512; 228 | else 229 | #endif 230 | if ((actual_width >= 32) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 231 | proc_T[i] = fluxT_avx2; 232 | else if ((actual_width >= 16) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 233 | proc_T[i] = fluxT_sse41; 234 | else if ((actual_width >= 16) && (((env->GetCPUFlags() & CPUF_SSE2) == CPUF_SSE2 && opt < 0) || opt >= USE_OPT_SSE2)) 235 | proc_T[i] = fluxT_sse2; 236 | else 237 | proc_T[i] = fluxT_C; 238 | } 239 | else { 240 | #ifdef FLUXSMOOTH_AVX512_ENABLED 241 | if ((actual_width >= 32) && ((goodAVX512 && opt < 0) || opt >= USE_OPT_AVX512)) 242 | proc_T[i] = fluxT_avx512_uint16; 243 | else 244 | #endif 245 | if ((actual_width >= 16) && (((env->GetCPUFlags() & CPUF_AVX2) == CPUF_AVX2 && opt < 0) || opt >= USE_OPT_AVX2)) 246 | proc_T[i] = fluxT_avx2_uint16; 247 | else if ((actual_width >= 8) && (((env->GetCPUFlags() & CPUF_SSE4_1) == CPUF_SSE4_1 && opt < 0) || opt >= USE_OPT_SSE41)) 248 | proc_T[i] = fluxT_sse41_uint16; 249 | else 250 | proc_T[i] = fluxT_C; 251 | } 252 | } 253 | } 254 | 255 | PVideoFrame __stdcall FluxSmoothT::GetFrame(int n, IScriptEnvironment * env) 256 | { 257 | const uint8_t* srcp; 258 | const uint8_t* prevp; 259 | const uint8_t* nextp; 260 | uint8_t* destp; 261 | int src_pitch, dst_pitch, prv_pitch, nxt_pitch, 
row_size, height; 262 | 263 | PVideoFrame currf = child->GetFrame(n, env); 264 | PVideoFrame destf = env->NewVideoFrame(vi); 265 | 266 | const int *current_planes = (vi.IsYUV() || vi.IsYUVA()) ? planes_y : planes_r; 267 | 268 | if (n == 0 || n == vi.num_frames - 1) 269 | { 270 | // 1st or last: simple copy 271 | for (int i = 0; i < vi.NumComponents(); i++) 272 | { 273 | const int plane = current_planes[i]; 274 | copy_plane(destf, currf, plane, env); 275 | } 276 | return destf; 277 | } 278 | 279 | PVideoFrame prevf = child->GetFrame(n - 1, env); 280 | PVideoFrame nextf = child->GetFrame(n + 1, env); 281 | 282 | int planecount = std::min(vi.NumComponents(), 3); 283 | 284 | for (int i = 0; i < planecount; i++) 285 | { 286 | const int plane = current_planes[i]; 287 | if (processPlane[i]) { 288 | dst_pitch = destf->GetPitch(plane); 289 | src_pitch = currf->GetPitch(plane); 290 | prv_pitch = prevf->GetPitch(plane); 291 | nxt_pitch = nextf->GetPitch(plane); 292 | row_size = currf->GetRowSize(plane); 293 | const int width = row_size / vi.ComponentSize(); 294 | height = currf->GetHeight(plane); 295 | srcp = currf->GetReadPtr(plane); 296 | prevp = prevf->GetReadPtr(plane); 297 | nextp = nextf->GetReadPtr(plane); 298 | destp = destf->GetWritePtr(plane); 299 | 300 | const int bits_per_pixel = vi.BitsPerComponent(); 301 | 302 | proc_T[i](srcp, src_pitch, prevp, prv_pitch, nextp, nxt_pitch, destp, dst_pitch, width, height, temporal_threshold << (bits_per_pixel - 8), scaletab); 303 | } 304 | else { 305 | copy_plane(destf, currf, plane, env); 306 | } 307 | } 308 | // copy alpha 309 | if (vi.NumComponents() == 4) { 310 | const int plane = PLANAR_A; 311 | copy_plane(destf, currf, plane, env); 312 | } 313 | 314 | return destf; 315 | } 316 | 317 | AVSValue __cdecl Create_FluxSmoothT(AVSValue args, void * user_data, IScriptEnvironment * env) 318 | { 319 | enum ARGS { CLIP, TEMPORAL_THRESHOLD, LUMA, CHROMA, OPT }; 320 | 321 | PClip clip = args[CLIP].AsClip(); 322 | int 
temporal_threshold = args[TEMPORAL_THRESHOLD].AsInt(7); 323 | bool luma = args[LUMA].AsBool(true); 324 | bool chroma = args[CHROMA].AsBool(true); 325 | int opt = args[OPT].AsInt(-1); 326 | 327 | if (temporal_threshold < 0) 328 | env->ThrowError("FluxSmoothT: temporal_threshold must be >= 0"); 329 | 330 | const VideoInfo & vi = clip->GetVideoInfo(); 331 | 332 | // YUY2 support only through YV16 autoconversion 333 | if (vi.IsYUY2()) { 334 | AVSValue new_args[1] = { clip }; 335 | clip = env->Invoke("ConvertToYV16", AVSValue(new_args, 1)).AsClip(); 336 | clip = new FluxSmoothT(clip, temporal_threshold, luma, chroma, opt, env); 337 | AVSValue new_args2[1] = { clip }; 338 | clip = env->Invoke("ConvertToYUY2", AVSValue(new_args2, 1)).AsClip(); 339 | return clip; 340 | } 341 | 342 | if (vi.BitsPerComponent() == 32) 343 | env->ThrowError("FluxSmoothT: 32 bit float formats not supported"); 344 | 345 | if (vi.IsY() || vi.IsYV411() || vi.Is420() || vi.Is422() || vi.Is444() || vi.IsPlanarRGB() || vi.IsPlanarRGBA()) 346 | return new FluxSmoothT(clip, temporal_threshold, luma, chroma, opt, env); 347 | else 348 | env->ThrowError("FluxSmoothT: Clip must be in Y or planar YUV(A), RGB(A) or YUY2 format (8-16 bits)"); 349 | 350 | return 0; // Unreached 351 | } 352 | 353 | AVSValue __cdecl Create_FluxSmoothST(AVSValue args, void * user_data, IScriptEnvironment * env) 354 | { 355 | enum ARGS { CLIP, TEMPORAL_THRESHOLD, SPATIAL_THRESHOLD, LUMA, CHROMA, OPT }; 356 | 357 | PClip clip = args[CLIP].AsClip(); 358 | int temporal_threshold = args[TEMPORAL_THRESHOLD].AsInt(7); 359 | int spatial_threshold = args[SPATIAL_THRESHOLD].AsInt(7); 360 | bool luma = args[LUMA].AsBool(true); 361 | bool chroma = args[CHROMA].AsBool(true); 362 | int opt = args[OPT].AsInt(-1); 363 | 364 | if (temporal_threshold < -1) 365 | env->ThrowError("FluxSmoothST: temporal_threshold must be >= -1"); 366 | if (spatial_threshold < -1) 367 | env->ThrowError("FluxSmoothST: spatial_threshold must be >= -1"); 368 | if (-1 == 
temporal_threshold && -1 == spatial_threshold) 369 | env->ThrowError("FluxSmoothST: Both thresholds cannot be -1"); 370 | 371 | const VideoInfo & vi = clip->GetVideoInfo(); 372 | 373 | // YUY2 support only through YV16 autoconversion 374 | if (vi.IsYUY2()) { 375 | AVSValue new_args[1] = { clip }; 376 | clip = env->Invoke("ConvertToYV16", AVSValue(new_args, 1)).AsClip(); 377 | clip = new FluxSmoothST(clip, temporal_threshold, spatial_threshold, luma, chroma, opt, env); 378 | AVSValue new_args2[1] = { clip }; 379 | clip = env->Invoke("ConvertToYUY2", AVSValue(new_args2, 1)).AsClip(); 380 | return clip; 381 | } 382 | 383 | if (vi.BitsPerComponent() == 32) 384 | env->ThrowError("FluxSmoothST: 32 bit float formats not supported"); 385 | 386 | if (vi.IsY() || vi.IsYV411() || vi.Is420() || vi.Is422() || vi.Is444() || vi.IsPlanarRGB() || vi.IsPlanarRGBA()) 387 | return new FluxSmoothST(clip, temporal_threshold, spatial_threshold, luma, chroma, opt, env); 388 | else 389 | env->ThrowError("FluxSmoothST: Clip must be in Y or planar YUV(A), RGB(A) or YUY2 format (8-16 bits)"); 390 | 391 | return 0; // Unreached 392 | } 393 | 394 | /* New 2.6 requirement!!! */ 395 | // Declare and initialise server pointers static storage. 396 | const AVS_Linkage *AVS_linkage = 0; 397 | 398 | /* New 2.6 requirement!!! */ 399 | // DLL entry point called from LoadPlugin() to setup a user plugin. 400 | extern "C" __declspec(dllexport) const char* __stdcall 401 | AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors) { 402 | /* New 2.6 requirement!!! */ 403 | // Save the server pointers. 
404 | AVS_linkage = vectors; 405 | env->AddFunction("FluxSmoothT", "c[temporal_threshold]i[luma]b[chroma]b[opt]i", Create_FluxSmoothT, 0); 406 | env->AddFunction("FluxSmoothST", "c[temporal_threshold]i[spatial_threshold]i[luma]b[chroma]b[opt]i", Create_FluxSmoothST, 0); 407 | return "FluxSmooth"; 408 | } 409 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release LLVM 10 | Win32 11 | 12 | 13 | Release LLVM 14 | x64 15 | 16 | 17 | Release XP 18 | Win32 19 | 20 | 21 | Release XP 22 | x64 23 | 24 | 25 | Release 26 | Win32 27 | 28 | 29 | Debug 30 | x64 31 | 32 | 33 | Release 34 | x64 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | AdvancedVectorExtensions2 54 | AdvancedVectorExtensions2 55 | AdvancedVectorExtensions2 56 | AdvancedVectorExtensions2 57 | AdvancedVectorExtensions2 58 | AdvancedVectorExtensions2 59 | AdvancedVectorExtensions2 60 | AdvancedVectorExtensions2 61 | 62 | 63 | -mavx512bw -mavx512f %(AdditionalOptions) 64 | -mavx512bw -mavx512f %(AdditionalOptions) 65 | NotSet 66 | NotSet 67 | NotSet 68 | NotSet 69 | NotSet 70 | /arch:AVX512 %(AdditionalOptions) 71 | /arch:AVX512 %(AdditionalOptions) 72 | /arch:AVX512 %(AdditionalOptions) 73 | /arch:AVX512 %(AdditionalOptions) 74 | /arch:AVX512 %(AdditionalOptions) 75 | /arch:AVX512 %(AdditionalOptions) 76 | 77 | 78 | 79 | 80 | 81 | 82 | 15.0 83 | {588984EE-FDBE-4901-894A-32781B765F07} 84 | Win32Proj 85 | FluxSmooth 86 | 10.0 87 | 88 | 89 | 90 | DynamicLibrary 91 | true 92 | v142 93 | MultiByte 94 | 95 | 96 | DynamicLibrary 97 | false 98 | v142 99 | true 100 | MultiByte 101 | 102 | 103 | DynamicLibrary 104 | false 105 | v141_xp 106 | true 107 | MultiByte 108 | 109 | 110 | DynamicLibrary 111 | false 112 | llvm 113 | true 114 | MultiByte 115 | 116 | 117 | DynamicLibrary 118 | 
true 119 | v142 120 | MultiByte 121 | 122 | 123 | DynamicLibrary 124 | false 125 | v142 126 | true 127 | MultiByte 128 | 129 | 130 | DynamicLibrary 131 | false 132 | v141_xp 133 | true 134 | MultiByte 135 | 136 | 137 | DynamicLibrary 138 | false 139 | llvm 140 | true 141 | MultiByte 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | true 175 | $(SolutionDir)$(Platform)\$(Configuration)\ 176 | $(Platform)\$(Configuration)\ 177 | 178 | 179 | true 180 | 181 | 182 | false 183 | $(SolutionDir)$(Platform)\$(Configuration)\ 184 | $(Platform)\$(Configuration)\ 185 | 186 | 187 | false 188 | $(SolutionDir)$(Platform)\$(Configuration)\ 189 | $(Platform)\$(Configuration)\ 190 | 191 | 192 | false 193 | $(SolutionDir)$(Platform)\$(Configuration)\ 194 | $(Platform)\$(Configuration)\ 195 | 196 | 197 | false 198 | 199 | 200 | false 201 | 202 | 203 | false 204 | $(SolutionDir)$(Platform)\$(Configuration)\ 205 | $(Platform)\$(Configuration)\ 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | NotUsing 218 | Level3 219 | Disabled 220 | true 221 | WIN32;_DEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 222 | true 223 | StreamingSIMDExtensions2 224 | stdcpp17 225 | NoListing 226 | 227 | 228 | Windows 229 | true 230 | 231 | 232 | 233 | 234 | NotUsing 235 | Level3 236 | Disabled 237 | true 238 | _DEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 239 | true 240 | stdcpp17 241 | NoListing 242 | 243 | 244 | Windows 245 | true 246 | 247 | 248 | 249 | 250 | NotUsing 251 | Level3 252 | MaxSpeed 253 | true 254 | true 255 | true 256 | WIN32;NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 257 | true 258 | Speed 259 | StreamingSIMDExtensions2 260 | stdcpp17 261 | AssemblyAndSourceCode 262 | false 263 | true 264 | MultiThreaded 265 | 266 | 267 | Windows 268 | true 269 | true 
270 | true 271 | 272 | 273 | 274 | 275 | NotUsing 276 | Level3 277 | MaxSpeed 278 | true 279 | true 280 | true 281 | WIN32;NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 282 | true 283 | Speed 284 | StreamingSIMDExtensions2 285 | /Zc:threadSafeInit- %(AdditionalOptions) 286 | stdcpp17 287 | AssemblyAndSourceCode 288 | false 289 | true 290 | MultiThreaded 291 | 292 | 293 | Windows 294 | true 295 | true 296 | true 297 | 298 | 299 | 300 | 301 | NotUsing 302 | Level3 303 | MaxSpeed 304 | true 305 | true 306 | WIN32;NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 307 | true 308 | Speed 309 | StreamingSIMDExtensions2 310 | stdcpp17 311 | NoListing 312 | false 313 | true 314 | MultiThreaded 315 | AnySuitable 316 | -Wno-gcc-compat %(AdditionalOptions) 317 | $(IntDir) 318 | false 319 | 320 | 321 | Windows 322 | true 323 | true 324 | false 325 | Default 326 | 327 | 328 | 329 | 330 | NotUsing 331 | Level3 332 | MaxSpeed 333 | true 334 | true 335 | true 336 | NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 337 | true 338 | Speed 339 | stdcpp17 340 | AssemblyAndSourceCode 341 | false 342 | true 343 | MultiThreaded 344 | 345 | 346 | Windows 347 | true 348 | true 349 | true 350 | 351 | 352 | 353 | 354 | NotUsing 355 | Level3 356 | MaxSpeed 357 | true 358 | true 359 | true 360 | NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 361 | true 362 | Speed 363 | /Zc:threadSafeInit- %(AdditionalOptions) 364 | stdcpp17 365 | AssemblyAndSourceCode 366 | false 367 | true 368 | MultiThreaded 369 | 370 | 371 | Windows 372 | true 373 | true 374 | true 375 | 376 | 377 | 378 | 379 | NotUsing 380 | Level3 381 | MaxSpeed 382 | true 383 | true 384 | NDEBUG;FLUXSMOOTH_EXPORTS;_WINDOWS;_USRDLL;%(PreprocessorDefinitions) 385 | true 386 | Speed 387 | stdcpp17 388 | AssemblyAndSourceCode 389 | false 390 | true 391 | MultiThreaded 392 | AnySuitable 393 | -Wno-gcc-compat %(AdditionalOptions) 394 | 
StreamingSIMDExtensions2 395 | 396 | 397 | Windows 398 | true 399 | true 400 | false 401 | Default 402 | 403 | 404 | 405 | 406 | 407 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth_avx2.cpp: -------------------------------------------------------------------------------- 1 | #include "FluxSmooth.h" 2 | #include 3 | #include "stdint.h" 4 | #include "immintrin.h" // AVX 5 | 6 | #if !defined(__AVX2__) 7 | #error "This source file will only work properly when compiled with AVX2 option" 8 | #endif 9 | 10 | /************************************ 11 | // Helpers, missing intrinsics 12 | ************************************/ 13 | 14 | #define _mm256_cmpge_epu8(a, b) _mm256_cmpeq_epi8(_mm256_max_epu8(a, b), a) 15 | 16 | #define _mm256_cmple_epu8(a, b) _mm256_cmpge_epu8(b, a) 17 | 18 | // does not exist 19 | static AVS_FORCEINLINE __m256i _mm256_cmpgt_epu8(__m256i x, __m256i y) 20 | { 21 | // Returns 0xFF where x > y: 22 | return _mm256_andnot_si256( 23 | _mm256_cmpeq_epi8(x, y), 24 | _mm256_cmpeq_epi8(_mm256_max_epu8(x, y), x) 25 | ); 26 | } 27 | 28 | AVS_FORCEINLINE __m256i _mm256_cmpge_epi16(__m256i x, __m256i y) 29 | { 30 | // Returns 0xFFFF where x >= y: 31 | return _mm256_or_si256(_mm256_cmpeq_epi16(x, y), _mm256_cmpgt_epi16(x, y)); 32 | } 33 | 34 | #define _mm256_cmple_epi16(a, b) _mm256_cmpge_epi16(b, a) 35 | 36 | /************************************ 37 | // Helpers 38 | ************************************/ 39 | 40 | static AVS_FORCEINLINE void check_neighbour_simd(__m256i &neighbour, __m256i ¢er, __m256i &threshold, 41 | __m256i &sum_lo, __m256i &sum_hi, __m256i &cnt) 42 | { 43 | auto n_minus_c = _mm256_subs_epu8(neighbour, center); 44 | auto c_minus_n = _mm256_subs_epu8(center, neighbour); 45 | auto absdiff = _mm256_or_si256(n_minus_c, c_minus_n); 46 | auto abs_is_lessthanoreq_thresh = _mm256_cmple_epu8(absdiff, threshold); 47 | // count. increment when true. 
We simply sub the mask value 00 (0) or FF (-1) 48 | cnt = _mm256_sub_epi8(cnt, abs_is_lessthanoreq_thresh); 49 | // increase sum elements by neighbour where true, that is mask is FF 50 | // sum is 16 bits 51 | auto masked_neighbour = _mm256_and_si256(abs_is_lessthanoreq_thresh, neighbour); 52 | auto zero = _mm256_setzero_si256(); 53 | auto masked_neighbour_lo = _mm256_unpacklo_epi8(masked_neighbour, zero); 54 | auto masked_neighbour_hi = _mm256_unpackhi_epi8(masked_neighbour, zero); 55 | sum_lo = _mm256_add_epi16(sum_lo, masked_neighbour_lo); 56 | sum_hi = _mm256_add_epi16(sum_hi, masked_neighbour_hi); 57 | 58 | /* 59 | if (std::abs(neighbour - center) <= threshold) 60 | { 61 | sum += neighbour; 62 | ++cnt; 63 | } 64 | */ 65 | } 66 | 67 | static AVS_FORCEINLINE void check_neighbour_simd_uint16(__m256i &neighbour, __m256i ¢er, __m256i &threshold, 68 | __m256i &sum_lo, __m256i &sum_hi, __m256i &cnt, const __m256i &make_signed_word) 69 | { 70 | // threshold is shifted to the "signed" int16 domain 71 | auto n_minus_c = _mm256_subs_epu16(neighbour, center); 72 | auto c_minus_n = _mm256_subs_epu16(center, neighbour); 73 | auto absdiff = _mm256_or_si256(n_minus_c, c_minus_n); 74 | // absdiff <= threshold ==> !(absdiff > threshold) 75 | // FIXME make it a bit faster: cmpgt and later: andnot, and count in a reverse way (instead of increase-when-match use decrease-by-non-match) 76 | auto abs_is_lessthanoreq_thresh = _mm256_cmple_epi16(_mm256_add_epi16(absdiff, make_signed_word), threshold); 77 | // count. increment when true. 
We simply sub the mask value 0000 (0) or FFFF (-1) 78 | cnt = _mm256_sub_epi16(cnt, abs_is_lessthanoreq_thresh); 79 | // increase sum elements by neighbour where true, that is mask is FF 80 | // sum is 16 bits 81 | auto masked_neighbour = _mm256_and_si256(abs_is_lessthanoreq_thresh, neighbour); 82 | auto zero = _mm256_setzero_si256(); 83 | auto masked_neighbour_lo = _mm256_unpacklo_epi16(masked_neighbour, zero); 84 | auto masked_neighbour_hi = _mm256_unpackhi_epi16(masked_neighbour, zero); 85 | sum_lo = _mm256_add_epi32(sum_lo, masked_neighbour_lo); 86 | sum_hi = _mm256_add_epi32(sum_hi, masked_neighbour_hi); 87 | 88 | /* 89 | if (std::abs(neighbour - center) <= threshold) 90 | { 91 | sum += neighbour; 92 | ++cnt; 93 | } 94 | */ 95 | } 96 | 97 | /************************************ 98 | // Temporal only AVX2, 8 bit 99 | ************************************/ 100 | 101 | static AVS_FORCEINLINE void fluxT_core_avx2(const BYTE * currp, 102 | const BYTE * prevp, const BYTE * nextp, 103 | BYTE * destp, int x, 104 | __m256i &temporal_threshold_vector, 105 | __m256i &scaletab_lut_lsbs, 106 | __m256i &scaletab_lut_msbs 107 | ) 108 | { 109 | auto b = _mm256_loadu_si256(reinterpret_cast(currp + x)); 110 | auto pbt = _mm256_loadu_si256(reinterpret_cast(prevp + x)); 111 | auto nbt = _mm256_loadu_si256(reinterpret_cast(nextp + x)); 112 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 113 | // int pdiff = pbt - b, ndiff = nbt - b; 114 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 115 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 116 | auto pbt_lessthan_b = _mm256_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 117 | auto nbt_lessthan_b = _mm256_cmpgt_epu8(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 118 | auto pbt_greaterthan_b = _mm256_cmpgt_epu8(pbt, b); // FF where pbt > b 119 | auto nbt_greaterthan_b = _mm256_cmpgt_epu8(nbt, b); // FF where nbt > b 120 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 121 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 122 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 123 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 124 | 125 | // int sum = b, cnt = 1; 126 | auto zero = _mm256_setzero_si256(); 127 | auto sum_lo = _mm256_unpacklo_epi8(b, zero); 128 | auto sum_hi = _mm256_unpackhi_epi8(b, zero); 129 | auto cnt = _mm256_set1_epi8(1); 130 | 131 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 132 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 133 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 134 | 135 | // factor1 = sum*2 + cnt, sum elements are 16 bits 136 | auto cnt_lo = _mm256_unpacklo_epi8(cnt, zero); 137 | auto cnt_hi = _mm256_unpackhi_epi8(cnt, zero); 138 | auto factor1_lo = _mm256_add_epi16(_mm256_add_epi16(sum_lo, sum_lo), cnt_lo); 139 | auto factor1_hi = _mm256_add_epi16(_mm256_add_epi16(sum_hi, sum_hi), cnt_hi); 140 | // factor2 = scaletab[cnt] 141 | auto factor2_lsb = _mm256_shuffle_epi8(scaletab_lut_lsbs, cnt); 142 | auto factor2_msb = _mm256_shuffle_epi8(scaletab_lut_msbs, cnt); 143 | auto factor2_lo = _mm256_unpacklo_epi8(factor2_lsb, factor2_msb); 144 | auto factor2_hi = _mm256_unpackhi_epi8(factor2_lsb, factor2_msb); 145 | // finally mul and shift 146 | auto mulres_lo = _mm256_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 147 | auto mulres_hi = _mm256_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 148 | // move back to 16x8 bits 149 | auto result = _mm256_packus_epi16(mulres_lo, 
mulres_hi); 150 | 151 | // decide if original pixel is kept 152 | auto finalres = _mm256_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 153 | 154 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x), finalres); 155 | } 156 | 157 | 158 | // Temporal only 159 | void fluxT_avx2(const uint8_t* currp, const int src_pitch, 160 | const uint8_t * prevp, const int prv_pitch, 161 | const uint8_t * nextp, const int nxt_pitch, 162 | uint8_t* destp, const int dst_pitch, 163 | const int width, int height, 164 | int temporal_threshold, 165 | short *scaletab) 166 | { 167 | __m256i scaletab_lut_lsbs; 168 | __m256i scaletab_lut_msbs; 169 | for (int i = 0; i < 16; i++) { 170 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 171 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 172 | // same for hi 128 173 | ((uint8_t*)&scaletab_lut_lsbs)[i+16] = scaletab[i] & 0xFF; 174 | ((uint8_t*)&scaletab_lut_msbs)[i+16] = (scaletab[i] >> 8) & 0xFF; 175 | } 176 | 177 | const int xcnt = width; 178 | 179 | __m256i temporal_threshold_vector = _mm256_set1_epi8(temporal_threshold); 180 | 181 | const int wmod32 = xcnt / 32 * 32; 182 | const int rest = xcnt - wmod32; 183 | 184 | for (int y = 0; y < height; y++) 185 | { 186 | for (int x = 0; x < wmod32; x += 32) 187 | fluxT_core_avx2(currp, prevp, nextp, destp, x, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 188 | // do rest 189 | if (rest > 0) 190 | fluxT_core_avx2(currp, prevp, nextp, destp, xcnt - 32, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 191 | 192 | currp += src_pitch; 193 | prevp += prv_pitch; 194 | nextp += nxt_pitch; 195 | destp += dst_pitch; 196 | } // for y 197 | _mm256_zeroupper(); 198 | } 199 | 200 | /************************************ 201 | // Temporal only AVX2, 16 bit 202 | ************************************/ 203 | 204 | AVS_FORCEINLINE void fluxT_core_avx2_uint16(const uint8_t * currp, const uint8_t* prevp, const uint8_t 
*nextp, uint8_t *destp, int x, 205 | __m256i &temporal_threshold_vector // already shifted to "signed" domain 206 | ) 207 | { 208 | const auto make_signed_word = _mm256_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 209 | 210 | auto b_orig = _mm256_loadu_si256(reinterpret_cast(currp + x)); 211 | auto pbt_orig = _mm256_loadu_si256(reinterpret_cast(prevp + x)); 212 | auto nbt_orig = _mm256_loadu_si256(reinterpret_cast(nextp + x)); 213 | 214 | auto b = _mm256_add_epi16(b_orig, make_signed_word); 215 | auto pbt = _mm256_add_epi16(pbt_orig, make_signed_word); 216 | auto nbt = _mm256_add_epi16(nbt_orig, make_signed_word); 217 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 218 | // int pdiff = pbt - b, ndiff = nbt - b; 219 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 220 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 221 | auto pbt_lessthan_b = _mm256_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 222 | auto nbt_lessthan_b = _mm256_cmpgt_epi16(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 223 | auto pbt_greaterthan_b = _mm256_cmpgt_epi16(pbt, b); // FF where pbt > b 224 | auto nbt_greaterthan_b = _mm256_cmpgt_epi16(nbt, b); // FF where nbt > b 225 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 226 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 227 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 228 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 229 | 230 | // int sum = b, cnt = 1; 231 | auto zero = _mm256_setzero_si256(); 232 | auto sum_lo = _mm256_unpacklo_epi16(b_orig, zero); 233 | auto sum_hi = _mm256_unpackhi_epi16(b_orig, zero); 234 | auto cnt = _mm256_set1_epi16(1); 235 | 236 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 237 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 238 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 239 | 240 | auto cnt_lo = _mm256_unpacklo_epi16(cnt, zero); 241 | auto cnt_hi = _mm256_unpackhi_epi16(cnt, zero); 242 | // Difference from SSE4.1 and C: floating point division 243 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 244 | const auto rounder_half = _mm256_set1_ps(0.5f); 245 | // lower 8 pixels 246 | auto fcnt_lo = _mm256_cvtepi32_ps(cnt_lo); 247 | auto fsum_lo = _mm256_cvtepi32_ps(sum_lo); 248 | // difference from AVX512: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 249 | auto mulres_lo = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_lo, _mm256_rcp_ps(fcnt_lo), rounder_half)); 250 | // upper 8 pixels 251 | auto fcnt_hi = _mm256_cvtepi32_ps(cnt_hi); 252 | auto fsum_hi = _mm256_cvtepi32_ps(sum_hi); 253 | auto mulres_hi = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_hi, _mm256_rcp_ps(fcnt_hi), rounder_half)); 254 | 255 | // move back to 16x16 bits 256 | auto result = _mm256_packus_epi32(mulres_lo, mulres_hi); 257 | 258 | // decide if original pixel is kept 259 | auto finalres = _mm256_blendv_epi8(b_orig, result, mask_either_is_true); // true: second param, false: 1st param 260 | 261 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x), finalres); 262 | } 263 | 264 | // Temporal only 265 | void fluxT_avx2_uint16(const uint8_t* currp, const int src_pitch, 266 | const uint8_t * prevp, const int prv_pitch, 267 | const uint8_t * nextp, const int nxt_pitch, 268 | 
uint8_t* destp, const int dst_pitch, 269 | const int width, int height, 270 | int temporal_threshold, 271 | short *scaletab) 272 | { 273 | const int xcnt = width; 274 | 275 | __m256i temporal_threshold_vector = _mm256_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain 276 | 277 | const int wmod16 = xcnt / 16 * 16; 278 | const int rest = xcnt - wmod16; 279 | 280 | for (int y = 0; y < height; y++) 281 | { 282 | for (int x = 0; x < wmod16; x += 16) 283 | fluxT_core_avx2_uint16(currp, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector); 284 | // do rest 285 | if (rest > 0) 286 | fluxT_core_avx2_uint16(currp, prevp, nextp, destp, (xcnt - 16) * sizeof(uint16_t), temporal_threshold_vector); 287 | 288 | currp += src_pitch; 289 | prevp += prv_pitch; 290 | nextp += nxt_pitch; 291 | destp += dst_pitch; 292 | } // for y 293 | _mm256_zeroupper(); 294 | } 295 | 296 | /************************************ 297 | // Spatial Temporal AVX2, 8 bit 298 | ************************************/ 299 | 300 | AVS_FORCEINLINE void fluxST_core_avx2(const BYTE * currp, const int src_pitch, 301 | const BYTE * prevp, const BYTE * nextp, 302 | BYTE * destp, int x, 303 | __m256i &temporal_threshold_vector, 304 | __m256i &spatial_threshold_vector, 305 | __m256i &scaletab_lut_lsbs, 306 | __m256i &scaletab_lut_msbs 307 | ) 308 | { 309 | // +1: center of 3x3 pixels [+0,+1,+2] 310 | auto b = _mm256_loadu_si256(reinterpret_cast(currp + x + 1)); 311 | auto pbt = _mm256_loadu_si256(reinterpret_cast(prevp + x + 1)); 312 | auto nbt = _mm256_loadu_si256(reinterpret_cast(nextp + x + 1)); 313 | 314 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 315 | // int pdiff = pbt - b, ndiff = nbt - b; 316 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 317 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 318 | auto pbt_lessthan_b = _mm256_cmpgt_epu8(b, pbt); // FF where b > pbt. 
No lt --> gt with exchanged parameters 319 | auto nbt_lessthan_b = _mm256_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 320 | auto pbt_greaterthan_b = _mm256_cmpgt_epu8(pbt, b); // FF where pbt > b 321 | auto nbt_greaterthan_b = _mm256_cmpgt_epu8(nbt, b); // FF where nbt > b 322 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 323 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 324 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 325 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 326 | 327 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 328 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 329 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 330 | 331 | auto pb1 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 0)); 332 | auto pb2 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 1)); 333 | auto pb3 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 2)); 334 | 335 | auto b1 = _mm256_loadu_si256(reinterpret_cast(currp + x + 0)); 336 | auto b2 = _mm256_loadu_si256(reinterpret_cast(currp + x + 2)); 337 | 338 | auto nb1 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 0)); 339 | auto nb2 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 1)); 340 | auto nb3 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 2)); 341 | 342 | // int sum = b, cnt = 1; 343 | auto zero = _mm256_setzero_si256(); 344 | auto sum_lo = _mm256_unpacklo_epi8(b, zero); 345 | auto sum_hi = _mm256_unpackhi_epi8(b, zero); 346 | auto cnt = _mm256_set1_epi8(1); 347 | 348 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 349 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 350 | check_neighbour_simd(pb1, b, 
spatial_threshold_vector, sum_lo, sum_hi, cnt); 351 | check_neighbour_simd(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 352 | check_neighbour_simd(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 353 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 354 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 355 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 356 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 357 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 358 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 359 | 360 | // factor1 = sum*2 + cnt, sum elements are 16 bits 361 | auto cnt_lo = _mm256_unpacklo_epi8(cnt, zero); 362 | auto cnt_hi = _mm256_unpackhi_epi8(cnt, zero); 363 | auto factor1_lo = _mm256_add_epi16(_mm256_add_epi16(sum_lo, sum_lo), cnt_lo); 364 | auto factor1_hi = _mm256_add_epi16(_mm256_add_epi16(sum_hi, sum_hi), cnt_hi); 365 | // factor2 = scaletab[cnt] 366 | auto factor2_lsb = _mm256_shuffle_epi8(scaletab_lut_lsbs, cnt); 367 | auto factor2_msb = _mm256_shuffle_epi8(scaletab_lut_msbs, cnt); 368 | auto factor2_lo = _mm256_unpacklo_epi8(factor2_lsb, factor2_msb); 369 | auto factor2_hi = _mm256_unpackhi_epi8(factor2_lsb, factor2_msb); 370 | // finally mul and shift 371 | auto mulres_lo = _mm256_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 372 | auto mulres_hi = _mm256_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 373 | // move back to 16x8 bits 374 | auto result = _mm256_packus_epi16(mulres_lo, mulres_hi); 375 | 376 | // decide if original pixel is kept 377 | auto finalres = _mm256_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 378 | 379 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x + 1), finalres); 380 | } 381 | 382 | // Spatial Temporal 383 | void 
fluxST_avx2(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 384 | uint8_t* destp, const int dst_pitch, const int width, int height, int temporal_threshold, int spatial_threshold, short *scaletab) 385 | { 386 | __m256i scaletab_lut_lsbs; 387 | __m256i scaletab_lut_msbs; 388 | for (int i = 0; i < 16; i++) { 389 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 390 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 391 | // same for upper 128 392 | ((uint8_t*)&scaletab_lut_lsbs)[i+16] = scaletab[i] & 0xFF; 393 | ((uint8_t*)&scaletab_lut_msbs)[i+16] = (scaletab[i] >> 8) & 0xFF; 394 | } 395 | 396 | // spatial: because of previous and next line involved, function is called 397 | // starting with the 2nd line and with height = (real_height - 2) 398 | const int xcnt = width - 2; // leftmost/rightmost column safety 399 | 400 | __m256i temporal_threshold_vector = _mm256_set1_epi8(temporal_threshold); 401 | __m256i spatial_threshold_vector = _mm256_set1_epi8(spatial_threshold); 402 | 403 | const int wmod32 = xcnt / 32 * 32; 404 | const int rest = xcnt - wmod32; 405 | 406 | for (int y = 0; y < height; y++) 407 | { 408 | destp[0] = currp[0]; // Copy left edge 409 | 410 | for (int x = 0; x < wmod32; x += 32) 411 | fluxST_core_avx2(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 412 | // do rest 413 | if (rest > 0) 414 | fluxST_core_avx2(currp, src_pitch, prevp, nextp, destp, xcnt - 32, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 415 | 416 | destp[width - 1] = currp[width - 1]; // Copy right edge 417 | 418 | currp += src_pitch; 419 | prevp += prv_pitch; 420 | nextp += nxt_pitch; 421 | destp += dst_pitch; 422 | } // for y 423 | _mm256_zeroupper(); 424 | } 425 | 426 | /************************************ 427 | // Spatial Temporal AVX2, 16 bit 428 
| ************************************/ 429 | AVS_FORCEINLINE void fluxST_core_avx2_uint16(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 430 | __m256i &temporal_threshold_vector, // already shifted to "signed" domain 431 | __m256i &spatial_threshold_vector // already shifted to "signed" domain 432 | ) 433 | { 434 | const auto make_signed_word = _mm256_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 435 | // +1: center of 3x3 pixels [+0,+1,+2] 436 | auto b_orig = _mm256_loadu_si256(reinterpret_cast(currp + x + 1 * sizeof(uint16_t))); 437 | auto pbt_orig = _mm256_loadu_si256(reinterpret_cast(prevp + x + 1 * sizeof(uint16_t))); 438 | auto nbt_orig = _mm256_loadu_si256(reinterpret_cast(nextp + x + 1 * sizeof(uint16_t))); 439 | 440 | auto b = _mm256_add_epi16(b_orig, make_signed_word); 441 | auto pbt = _mm256_add_epi16(pbt_orig, make_signed_word); 442 | auto nbt = _mm256_add_epi16(nbt_orig, make_signed_word); 443 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 444 | // int pdiff = pbt - b, ndiff = nbt - b; 445 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 446 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 447 | auto pbt_lessthan_b = _mm256_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 448 | auto nbt_lessthan_b = _mm256_cmpgt_epi16(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 449 | auto pbt_greaterthan_b = _mm256_cmpgt_epi16(pbt, b); // FF where pbt > b 450 | auto nbt_greaterthan_b = _mm256_cmpgt_epi16(nbt, b); // FF where nbt > b 451 | auto both_less = _mm256_and_si256(pbt_lessthan_b, nbt_lessthan_b); 452 | auto both_greater = _mm256_and_si256(pbt_greaterthan_b, nbt_greaterthan_b); 453 | auto mask_either_is_true = _mm256_or_si256(both_less, both_greater); 454 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 455 | 456 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 457 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 458 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 459 | 460 | auto pb1 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 0 * sizeof(uint16_t))); 461 | auto pb2 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 1 * sizeof(uint16_t))); 462 | auto pb3 = _mm256_loadu_si256(reinterpret_cast(currp + x - src_pitch + 2 * sizeof(uint16_t))); 463 | 464 | auto b1 = _mm256_loadu_si256(reinterpret_cast(currp + x + 0 * sizeof(uint16_t))); 465 | auto b2 = _mm256_loadu_si256(reinterpret_cast(currp + x + 2 * sizeof(uint16_t))); 466 | 467 | auto nb1 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 0 * sizeof(uint16_t))); 468 | auto nb2 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 1 * sizeof(uint16_t))); 469 | auto nb3 = _mm256_loadu_si256(reinterpret_cast(currp + x + src_pitch + 2 * sizeof(uint16_t))); 470 | 471 | // int sum = b, cnt = 1; 472 | auto zero = _mm256_setzero_si256(); 473 | auto sum_lo = _mm256_unpacklo_epi16(b_orig, zero); 474 | auto sum_hi = _mm256_unpackhi_epi16(b_orig, zero); 475 | auto cnt = _mm256_set1_epi16(1); 476 | 477 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 478 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 479 | check_neighbour_simd_uint16(pb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 480 | check_neighbour_simd_uint16(pb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 481 | check_neighbour_simd_uint16(pb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 482 | check_neighbour_simd_uint16(b1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, 
make_signed_word); 483 | check_neighbour_simd_uint16(b2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 484 | check_neighbour_simd_uint16(nb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 485 | check_neighbour_simd_uint16(nb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 486 | check_neighbour_simd_uint16(nb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 487 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 488 | 489 | auto cnt_lo = _mm256_unpacklo_epi16(cnt, zero); 490 | auto cnt_hi = _mm256_unpackhi_epi16(cnt, zero); 491 | // Difference from SSE4.1 and C: floating point division 492 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 493 | const auto rounder_half = _mm256_set1_ps(0.5f); 494 | // lower 8 pixels 495 | auto fcnt_lo = _mm256_cvtepi32_ps(cnt_lo); 496 | auto fsum_lo = _mm256_cvtepi32_ps(sum_lo); 497 | // difference from AVX512: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 498 | auto mulres_lo = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_lo, _mm256_rcp_ps(fcnt_lo), rounder_half)); 499 | // upper 8 pixels 500 | auto fcnt_hi = _mm256_cvtepi32_ps(cnt_hi); 501 | auto fsum_hi = _mm256_cvtepi32_ps(sum_hi); 502 | auto mulres_hi = _mm256_cvttps_epi32(_mm256_fmadd_ps(fsum_hi, _mm256_rcp_ps(fcnt_hi), rounder_half)); 503 | 504 | // move back to 16x16 bits 505 | auto result = _mm256_packus_epi32(mulres_lo, mulres_hi); 506 | 507 | // decide if original pixel is kept 508 | auto finalres = _mm256_blendv_epi8(b_orig, result, mask_either_is_true); // true: second param, false: 1st param 509 | 510 | _mm256_storeu_si256(reinterpret_cast<__m256i *>(destp + x + 1 * sizeof(uint16_t)), finalres); 511 | } 512 | 513 | // Spatial Temporal 514 | void fluxST_avx2_uint16(const uint8_t* currp, const int src_pitch, 515 | const uint8_t * prevp, const int prv_pitch, 516 | const uint8_t * nextp, const int nxt_pitch, 517 | uint8_t* 
destp, const int dst_pitch, 518 | const int width, int height, 519 | int temporal_threshold, 520 | int spatial_threshold, 521 | short *scaletab) 522 | { 523 | 524 | // spatial: because of previous and next line involved, function is called 525 | // starting with the 2nd line and with height = (real_height - 2) 526 | const int xcnt = width - 2; // leftmost/rightmost column safety 527 | 528 | __m256i temporal_threshold_vector = _mm256_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain; 529 | __m256i spatial_threshold_vector = _mm256_set1_epi16(spatial_threshold - 0x8000); // move to signed int16 domain; 530 | 531 | const int wmod16 = xcnt / 16 * 16; 532 | const int rest = xcnt - wmod16; 533 | 534 | for (int y = 0; y < height; y++) 535 | { 536 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 537 | 538 | for (int x = 0; x < wmod16; x += 16) 539 | fluxST_core_avx2_uint16(currp, src_pitch, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 540 | // do rest 541 | if (rest > 0) 542 | fluxST_core_avx2_uint16(currp, src_pitch, prevp, nextp, destp, (xcnt - 16) * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 543 | 544 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 545 | 546 | currp += src_pitch; 547 | prevp += prv_pitch; 548 | nextp += nxt_pitch; 549 | destp += dst_pitch; 550 | } // for y 551 | _mm256_zeroupper(); 552 | } 553 | 554 | 555 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth_avx512.cpp: -------------------------------------------------------------------------------- 1 | #include "FluxSmooth.h" 2 | #include 3 | #include "stdint.h" 4 | #include "immintrin.h" // also includes "zmmintrin.h" for AVX512 and "avx512bwintrin.h" 5 | 6 | #ifdef FLUXSMOOTH_AVX512_ENABLED 7 | 8 | // BW: starting with Skylake X and Cannon Lake. 
9 | #if defined(CLANG) 10 | #if !defined(__AVX512F__) || !defined(__AVX512BW__) 11 | #error "This source file will only work properly when compiled with AVX512F and AVX512BW option. Set -mavx512f -mavx512bw command line options for this file." 12 | #endif 13 | #else 14 | #if defined(GCC) 15 | #if !defined(__AVX512F__) || !defined(__AVX512BW__) 16 | #error "This source file will only work properly when compiled with AVX512F and AVX512BW option. Set -mavx512f -mavx512bw command line options for this file." 17 | #endif 18 | #else 19 | #if !defined(__AVX512BW__) // MSVC may not define __AVX512F__ 20 | #error "This source file will only work properly when compiled with AVX512 option. Set /arch=AVX512 to command line options for this file." 21 | #endif 22 | #endif 23 | #endif 24 | 25 | #if defined(_MSC_VER) && !defined(__clang__) 26 | // As of April 2019, MS version of immintrin.h does not support AVX512BW _k*_mask* functions 27 | // https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html 28 | // Fixed in July 2019, available from VS 2019 16.2 29 | // Note: Avaliable only for v142 platform toolset from 14.22 30 | // v141 and v141_xp using 14.16; _MSC_VER is 1916 (e.g. 
Visual Studio 2017 version 15.9.11) 31 | // v142 using 14.22 is already implementing those mask operations --> 2019 16.2 32 | 33 | #if _MSC_VER < 1922 34 | 35 | AVS_FORCEINLINE __mmask64 _kand_mask64(__mmask64 A, __mmask64 B) // AVX512BW 36 | { 37 | return (__mmask64)(A & B); 38 | } 39 | 40 | AVS_FORCEINLINE __mmask64 _kor_mask64(__mmask64 A, __mmask64 B) // AVX512BW 41 | { 42 | return (__mmask64)(A | B); 43 | } 44 | 45 | AVS_FORCEINLINE __mmask32 _kand_mask32(__mmask32 A, __mmask32 B) // AVX512BW 46 | { 47 | return (__mmask32)(A & B); 48 | } 49 | 50 | AVS_FORCEINLINE __mmask32 _kor_mask32(__mmask32 A, __mmask32 B) // AVX512BW 51 | { 52 | return (__mmask32)(A | B); 53 | } 54 | #endif 55 | #endif 56 | 57 | 58 | /************************************ 59 | // Helpers 60 | ************************************/ 61 | 62 | static AVS_FORCEINLINE void check_neighbour_simd(__m512i &neighbour, __m512i ¢er, __m512i &threshold, 63 | __m512i &sum_lo, __m512i &sum_hi, __m512i &cnt) 64 | { 65 | auto n_minus_c = _mm512_subs_epu8(neighbour, center); // AVX512BW 66 | auto c_minus_n = _mm512_subs_epu8(center, neighbour); // AVX512BW 67 | auto absdiff = _mm512_or_si512(n_minus_c, c_minus_n); // AVX512F 68 | auto abs_is_lessthanoreq_thresh = _mm512_cmple_epu8_mask(absdiff, threshold); // AVX512BW 69 | // count. add 1 (increment) when true. 
70 | cnt = _mm512_add_epi8(cnt, _mm512_maskz_set1_epi8(abs_is_lessthanoreq_thresh, 1)); // AVX512BW 71 | // increase sum elements by neighbour where true 72 | // sum is 16 bits 73 | auto masked_neighbour = _mm512_maskz_mov_epi8(abs_is_lessthanoreq_thresh, neighbour); // AVX512BW 74 | auto zero = _mm512_setzero_si512(); // AVX512F 75 | auto masked_neighbour_lo = _mm512_unpacklo_epi8(masked_neighbour, zero); 76 | auto masked_neighbour_hi = _mm512_unpackhi_epi8(masked_neighbour, zero); 77 | sum_lo = _mm512_add_epi16(sum_lo, masked_neighbour_lo); 78 | sum_hi = _mm512_add_epi16(sum_hi, masked_neighbour_hi); 79 | 80 | /* 81 | if (std::abs(neighbour - center) <= threshold) 82 | { 83 | sum += neighbour; 84 | ++cnt; 85 | } 86 | */ 87 | } 88 | 89 | static AVS_FORCEINLINE void check_neighbour_simd_uint16(__m512i &neighbour, __m512i ¢er, __m512i &threshold, 90 | __m512i &sum_lo, __m512i &sum_hi, __m512i &cnt) 91 | { 92 | auto n_minus_c = _mm512_subs_epu16(neighbour, center); 93 | auto c_minus_n = _mm512_subs_epu16(center, neighbour); 94 | auto absdiff = _mm512_or_si512(n_minus_c, c_minus_n); 95 | // absdiff <= threshold 96 | auto abs_is_lessthanoreq_thresh = _mm512_cmple_epu16_mask(absdiff, threshold); 97 | // count. add 1 (increment) when true. 
98 | cnt = _mm512_add_epi16(cnt, _mm512_maskz_set1_epi16(abs_is_lessthanoreq_thresh, 1)); // AVX512BW 99 | // increase sum elements by neighbour where true, that is mask is FF 100 | // sum is 16 bits 101 | auto masked_neighbour = _mm512_maskz_mov_epi16(abs_is_lessthanoreq_thresh, neighbour); // AVX512BW 102 | auto zero = _mm512_setzero_si512(); 103 | auto masked_neighbour_lo = _mm512_unpacklo_epi16(masked_neighbour, zero); 104 | auto masked_neighbour_hi = _mm512_unpackhi_epi16(masked_neighbour, zero); 105 | sum_lo = _mm512_add_epi32(sum_lo, masked_neighbour_lo); 106 | sum_hi = _mm512_add_epi32(sum_hi, masked_neighbour_hi); 107 | 108 | /* 109 | if (std::abs(neighbour - center) <= threshold) 110 | { 111 | sum += neighbour; 112 | ++cnt; 113 | } 114 | */ 115 | } 116 | 117 | /************************************ 118 | // Temporal only AVX512, 8 bit 119 | ************************************/ 120 | 121 | static AVS_FORCEINLINE void fluxT_core_avx512(const BYTE * currp, 122 | const BYTE * prevp, const BYTE * nextp, 123 | BYTE * destp, int x, 124 | __m512i &temporal_threshold_vector, 125 | __m512i &scaletab_lut_lsbs, 126 | __m512i &scaletab_lut_msbs 127 | ) 128 | { 129 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x)); 130 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x)); 131 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x)); 132 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 133 | // int pdiff = pbt - b, ndiff = nbt - b; 134 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 135 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 136 | auto pbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 137 | auto nbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 138 | auto pbt_greaterthan_b = _mm512_cmpgt_epu8_mask(pbt, b); // FF where pbt > b 139 | auto nbt_greaterthan_b = _mm512_cmpgt_epu8_mask(nbt, b); // FF where nbt > b 140 | __mmask64 both_less = _kand_mask64(pbt_lessthan_b, nbt_lessthan_b); // AVX512BW 141 | __mmask64 both_greater = _kand_mask64(pbt_greaterthan_b, nbt_greaterthan_b); 142 | __mmask64 mask_either_is_true = _kor_mask64(both_less, both_greater); 143 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 144 | 145 | // int sum = b, cnt = 1; 146 | auto zero = _mm512_setzero_si512(); 147 | auto sum_lo = _mm512_unpacklo_epi8(b, zero); 148 | auto sum_hi = _mm512_unpackhi_epi8(b, zero); 149 | auto cnt = _mm512_set1_epi8(1); 150 | 151 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 152 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 153 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 154 | 155 | // factor1 = sum*2 + cnt, sum elements are 16 bits 156 | auto cnt_lo = _mm512_unpacklo_epi8(cnt, zero); 157 | auto cnt_hi = _mm512_unpackhi_epi8(cnt, zero); 158 | auto factor1_lo = _mm512_add_epi16(_mm512_add_epi16(sum_lo, sum_lo), cnt_lo); 159 | auto factor1_hi = _mm512_add_epi16(_mm512_add_epi16(sum_hi, sum_hi), cnt_hi); 160 | // factor2 = scaletab[cnt] 161 | auto factor2_lsb = _mm512_shuffle_epi8(scaletab_lut_lsbs, cnt); 162 | auto factor2_msb = _mm512_shuffle_epi8(scaletab_lut_msbs, cnt); 163 | auto factor2_lo = _mm512_unpacklo_epi8(factor2_lsb, factor2_msb); 164 | auto factor2_hi = _mm512_unpackhi_epi8(factor2_lsb, factor2_msb); 165 | // finally mul and shift 166 | auto mulres_lo = _mm512_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 167 | auto mulres_hi = _mm512_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 168 | // move back to 16x8 bits 169 | auto result = 
_mm512_packus_epi16(mulres_lo, mulres_hi); 170 | 171 | // decide if original pixel is kept 172 | auto finalres = _mm512_mask_mov_epi8(b, mask_either_is_true, result); // true: second param, false: 1st param 173 | 174 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x), finalres); 175 | } 176 | 177 | 178 | // Temporal only 179 | void fluxT_avx512(const uint8_t* currp, const int src_pitch, 180 | const uint8_t * prevp, const int prv_pitch, 181 | const uint8_t * nextp, const int nxt_pitch, 182 | uint8_t* destp, const int dst_pitch, 183 | const int width, int height, 184 | int temporal_threshold, 185 | short *scaletab) 186 | { 187 | __m512i scaletab_lut_lsbs; 188 | __m512i scaletab_lut_msbs; 189 | for (int i = 0; i < 16; i++) { 190 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 191 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 192 | // same for hi 128 193 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16] = scaletab[i] & 0xFF; 194 | ((uint8_t*)&scaletab_lut_msbs)[i + 16] = (scaletab[i] >> 8) & 0xFF; 195 | // same for hilo 128 196 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16*2] = scaletab[i] & 0xFF; 197 | ((uint8_t*)&scaletab_lut_msbs)[i + 16*2] = (scaletab[i] >> 8) & 0xFF; 198 | // same for hihi 128 199 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16*3] = scaletab[i] & 0xFF; 200 | ((uint8_t*)&scaletab_lut_msbs)[i + 16*3] = (scaletab[i] >> 8) & 0xFF; 201 | } 202 | 203 | const int xcnt = width; 204 | 205 | __m512i temporal_threshold_vector = _mm512_set1_epi8(temporal_threshold); 206 | 207 | const int wmod64 = xcnt / 64 * 64; 208 | const int rest = xcnt - wmod64; 209 | 210 | for (int y = 0; y < height; y++) 211 | { 212 | for (int x = 0; x < wmod64; x += 64) 213 | fluxT_core_avx512(currp, prevp, nextp, destp, x, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 214 | // do rest 215 | if (rest > 0) 216 | fluxT_core_avx512(currp, prevp, nextp, destp, xcnt - 64, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 217 | 218 | currp 
+= src_pitch; 219 | prevp += prv_pitch; 220 | nextp += nxt_pitch; 221 | destp += dst_pitch; 222 | } // for y 223 | //_mm512_zeroupper(); 224 | } 225 | 226 | /************************************ 227 | // Temporal only AVX512, 16 bit 228 | ************************************/ 229 | 230 | AVS_FORCEINLINE void fluxT_core_avx512_uint16(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 231 | __m512i &temporal_threshold_vector 232 | ) 233 | { 234 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x)); 235 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x)); 236 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x)); 237 | 238 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 239 | // int pdiff = pbt - b, ndiff = nbt - b; 240 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 241 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 242 | auto pbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, pbt); // 1 where b > pbt. No lt --> gt with exchanged parameters 243 | auto nbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, nbt); // 1 where b > nbt. No lt --> gt with exchanged parameters 244 | auto pbt_greaterthan_b = _mm512_cmpgt_epu16_mask(pbt, b); // 1 where pbt > b 245 | auto nbt_greaterthan_b = _mm512_cmpgt_epu16_mask(nbt, b); // 1 where nbt > b 246 | __mmask32 both_less = _kand_mask32(pbt_lessthan_b, nbt_lessthan_b); 247 | __mmask32 both_greater = _kand_mask32(pbt_greaterthan_b, nbt_greaterthan_b); 248 | __mmask32 mask_either_is_true = _kor_mask32(both_less, both_greater); 249 | // mask will be used at the final decision. Where 1: keep computed result. 
0: keep original pixel (dst=curr) 250 | 251 | // int sum = b, cnt = 1; 252 | auto zero = _mm512_setzero_si512(); 253 | auto sum_lo = _mm512_unpacklo_epi16(b, zero); 254 | auto sum_hi = _mm512_unpackhi_epi16(b, zero); 255 | auto cnt = _mm512_set1_epi16(1); 256 | 257 | check_neighbour_simd_uint16(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 258 | check_neighbour_simd_uint16(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 259 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 260 | 261 | auto cnt_lo = _mm512_unpacklo_epi16(cnt, zero); 262 | auto cnt_hi = _mm512_unpackhi_epi16(cnt, zero); 263 | // Difference from SSE4.1 and C: floating point division 264 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 265 | const auto rounder_half = _mm512_set1_ps(0.5f); 266 | // lower 16 pixels 267 | auto fcnt_lo = _mm512_cvtepi32_ps(cnt_lo); 268 | auto fsum_lo = _mm512_cvtepi32_ps(sum_lo); 269 | // difference from AVX2 or less: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 270 | auto mulres_lo = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_lo, _mm512_rcp14_ps(fcnt_lo), rounder_half)); 271 | // upper 16 pixels 272 | auto fcnt_hi = _mm512_cvtepi32_ps(cnt_hi); 273 | auto fsum_hi = _mm512_cvtepi32_ps(sum_hi); 274 | auto mulres_hi = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_hi, _mm512_rcp14_ps(fcnt_hi), rounder_half)); 275 | 276 | // move back to 32x16 bits 277 | auto result = _mm512_packus_epi32(mulres_lo, mulres_hi); 278 | 279 | // decide if original pixel is kept 280 | auto finalres = _mm512_mask_mov_epi16(b, mask_either_is_true, result); // true: second param, false: 1st param 281 | 282 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x), finalres); 283 | } 284 | 285 | // Temporal only 286 | void fluxT_avx512_uint16(const uint8_t* currp, const int src_pitch, 287 | const uint8_t * prevp, const int prv_pitch, 288 | const uint8_t * nextp, const int nxt_pitch, 289 | uint8_t* destp, const int dst_pitch, 290 | const int 
width, int height, 291 | int temporal_threshold, 292 | short *scaletab) 293 | { 294 | const int xcnt = width; 295 | 296 | __m512i temporal_threshold_vector = _mm512_set1_epi16(temporal_threshold); 297 | 298 | const int wmod32 = xcnt / 32 * 32; 299 | const int rest = xcnt - wmod32; 300 | 301 | for (int y = 0; y < height; y++) 302 | { 303 | for (int x = 0; x < wmod32; x += 32) 304 | fluxT_core_avx512_uint16(currp, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector); 305 | // do rest 306 | if (rest > 0) 307 | fluxT_core_avx512_uint16(currp, prevp, nextp, destp, (xcnt - 32) * sizeof(uint16_t), temporal_threshold_vector); 308 | 309 | currp += src_pitch; 310 | prevp += prv_pitch; 311 | nextp += nxt_pitch; 312 | destp += dst_pitch; 313 | } // for y 314 | //_mm512_zeroupper(); 315 | } 316 | 317 | /************************************ 318 | // Spatial Temporal AVX2, 8 bit 319 | ************************************/ 320 | 321 | AVS_FORCEINLINE void fluxST_core_avx512(const BYTE * currp, const int src_pitch, 322 | const BYTE * prevp, const BYTE * nextp, 323 | BYTE * destp, int x, 324 | __m512i &temporal_threshold_vector, 325 | __m512i &spatial_threshold_vector, 326 | __m512i &scaletab_lut_lsbs, 327 | __m512i &scaletab_lut_msbs 328 | ) 329 | { 330 | // +1: center of 3x3 pixels [+0,+1,+2] 331 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x + 1)); 332 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x + 1)); 333 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x + 1)); 334 | 335 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 336 | // int pdiff = pbt - b, ndiff = nbt - b; 337 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 338 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 339 | auto pbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, pbt); // 1 where b > pbt. No lt --> gt with exchanged parameters 340 | auto nbt_lessthan_b = _mm512_cmpgt_epu8_mask(b, nbt); // 1 where b > nbt. 
No lt --> gt with exchanged parameters 341 | auto pbt_greaterthan_b = _mm512_cmpgt_epu8_mask(pbt, b); // 1 where pbt > b 342 | auto nbt_greaterthan_b = _mm512_cmpgt_epu8_mask(nbt, b); // 1 where nbt > b 343 | __mmask64 both_less = _kand_mask64(pbt_lessthan_b, nbt_lessthan_b); 344 | __mmask64 both_greater = _kand_mask64(pbt_greaterthan_b, nbt_greaterthan_b); 345 | __mmask64 mask_either_is_true = _kor_mask64(both_less, both_greater); 346 | // mask will be used at the final decision. Where 1: keep computed result. 0: keep original pixel (dst=curr) 347 | 348 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 349 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 350 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 351 | 352 | auto pb1 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 0)); 353 | auto pb2 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 1)); 354 | auto pb3 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 2)); 355 | 356 | auto b1 = _mm512_loadu_si512(reinterpret_cast(currp + x + 0)); 357 | auto b2 = _mm512_loadu_si512(reinterpret_cast(currp + x + 2)); 358 | 359 | auto nb1 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 0)); 360 | auto nb2 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 1)); 361 | auto nb3 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 2)); 362 | 363 | // int sum = b, cnt = 1; 364 | auto zero = _mm512_setzero_si512(); 365 | auto sum_lo = _mm512_unpacklo_epi8(b, zero); 366 | auto sum_hi = _mm512_unpackhi_epi8(b, zero); 367 | auto cnt = _mm512_set1_epi8(1); 368 | 369 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 370 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 371 | check_neighbour_simd(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 372 | check_neighbour_simd(pb2, b, spatial_threshold_vector, 
sum_lo, sum_hi, cnt); 373 | check_neighbour_simd(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 374 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 375 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 376 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 377 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 378 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 379 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 380 | 381 | // factor1 = sum*2 + cnt, sum elements are 16 bits 382 | auto cnt_lo = _mm512_unpacklo_epi8(cnt, zero); 383 | auto cnt_hi = _mm512_unpackhi_epi8(cnt, zero); 384 | auto factor1_lo = _mm512_add_epi16(_mm512_add_epi16(sum_lo, sum_lo), cnt_lo); 385 | auto factor1_hi = _mm512_add_epi16(_mm512_add_epi16(sum_hi, sum_hi), cnt_hi); 386 | // factor2 = scaletab[cnt] 387 | auto factor2_lsb = _mm512_shuffle_epi8(scaletab_lut_lsbs, cnt); 388 | auto factor2_msb = _mm512_shuffle_epi8(scaletab_lut_msbs, cnt); 389 | auto factor2_lo = _mm512_unpacklo_epi8(factor2_lsb, factor2_msb); 390 | auto factor2_hi = _mm512_unpackhi_epi8(factor2_lsb, factor2_msb); 391 | // finally mul and shift 392 | auto mulres_lo = _mm512_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 393 | auto mulres_hi = _mm512_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 394 | // move back to 16x8 bits 395 | auto result = _mm512_packus_epi16(mulres_lo, mulres_hi); 396 | 397 | // decide if original pixel is kept 398 | auto finalres = _mm512_mask_mov_epi8(b, mask_either_is_true, result); // true: second param, false: 1st param 399 | 400 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x + 1), finalres); 401 | } 402 | 403 | // Spatial Temporal 404 | void fluxST_avx512(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const 
uint8_t * nextp, const int nxt_pitch, 405 | uint8_t* destp, const int dst_pitch, const int width, int height, int temporal_threshold, int spatial_threshold, short *scaletab) 406 | { 407 | __m512i scaletab_lut_lsbs; 408 | __m512i scaletab_lut_msbs; 409 | for (int i = 0; i < 16; i++) { 410 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 411 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 412 | // same for upper 128 413 | ((uint8_t*)&scaletab_lut_lsbs)[i + 16] = scaletab[i] & 0xFF; 414 | ((uint8_t*)&scaletab_lut_msbs)[i + 16] = (scaletab[i] >> 8) & 0xFF; 415 | // same for upper 2*128 416 | ((uint8_t*)&scaletab_lut_lsbs)[i + 2 * 16] = scaletab[i] & 0xFF; 417 | ((uint8_t*)&scaletab_lut_msbs)[i + 2 * 16] = (scaletab[i] >> 8) & 0xFF; 418 | // same for upper 3*128 419 | ((uint8_t*)&scaletab_lut_lsbs)[i + 3 * 16] = scaletab[i] & 0xFF; 420 | ((uint8_t*)&scaletab_lut_msbs)[i + 3 * 16] = (scaletab[i] >> 8) & 0xFF; 421 | } 422 | 423 | // spatial: because of previous and next line involved, function is called 424 | // starting with the 2nd line and with height = (real_height - 2) 425 | const int xcnt = width - 2; // leftmost/rightmost column safety 426 | 427 | __m512i temporal_threshold_vector = _mm512_set1_epi8(temporal_threshold); 428 | __m512i spatial_threshold_vector = _mm512_set1_epi8(spatial_threshold); 429 | 430 | const int wmod64 = xcnt / 64 * 64; 431 | const int rest = xcnt - wmod64; 432 | 433 | for (int y = 0; y < height; y++) 434 | { 435 | destp[0] = currp[0]; // Copy left edge 436 | 437 | for (int x = 0; x < wmod64; x += 64) 438 | fluxST_core_avx512(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 439 | // do rest 440 | if (rest > 0) 441 | fluxST_core_avx512(currp, src_pitch, prevp, nextp, destp, xcnt - 64, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 442 | 443 | destp[width - 1] = currp[width - 1]; // Copy 
right edge 444 | 445 | currp += src_pitch; 446 | prevp += prv_pitch; 447 | nextp += nxt_pitch; 448 | destp += dst_pitch; 449 | } // for y 450 | //_mm512_zeroupper(); 451 | } 452 | 453 | /************************************ 454 | // Spatial Temporal AVX2, 16 bit 455 | ************************************/ 456 | AVS_FORCEINLINE void fluxST_core_avx512_uint16(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 457 | __m512i &temporal_threshold_vector, 458 | __m512i &spatial_threshold_vector 459 | ) 460 | { 461 | // +1: center of 3x3 pixels [+0,+1,+2] 462 | auto b = _mm512_loadu_si512(reinterpret_cast(currp + x + 1 * sizeof(uint16_t))); 463 | auto pbt = _mm512_loadu_si512(reinterpret_cast(prevp + x + 1 * sizeof(uint16_t))); 464 | auto nbt = _mm512_loadu_si512(reinterpret_cast(nextp + x + 1 * sizeof(uint16_t))); 465 | 466 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 467 | // int pdiff = pbt - b, ndiff = nbt - b; 468 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 469 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 470 | auto pbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, pbt); // 1 where b > pbt. No lt --> gt with exchanged parameters 471 | auto nbt_lessthan_b = _mm512_cmpgt_epu16_mask(b, nbt); // 1 where b > nbt. No lt --> gt with exchanged parameters 472 | auto pbt_greaterthan_b = _mm512_cmpgt_epu16_mask(pbt, b); // 1 where pbt > b 473 | auto nbt_greaterthan_b = _mm512_cmpgt_epu16_mask(nbt, b); // 1 where nbt > b 474 | __mmask32 both_less = _kand_mask32(pbt_lessthan_b, nbt_lessthan_b); 475 | __mmask32 both_greater = _kand_mask32(pbt_greaterthan_b, nbt_greaterthan_b); 476 | __mmask32 mask_either_is_true = _kor_mask32(both_less, both_greater); 477 | // mask will be used at the final decision. Where 1: keep computed result. 
00: keep original pixel (dst=curr) 478 | 479 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 480 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 481 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 482 | 483 | auto pb1 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 0 * sizeof(uint16_t))); 484 | auto pb2 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 1 * sizeof(uint16_t))); 485 | auto pb3 = _mm512_loadu_si512(reinterpret_cast(currp + x - src_pitch + 2 * sizeof(uint16_t))); 486 | 487 | auto b1 = _mm512_loadu_si512(reinterpret_cast(currp + x + 0 * sizeof(uint16_t))); 488 | auto b2 = _mm512_loadu_si512(reinterpret_cast(currp + x + 2 * sizeof(uint16_t))); 489 | 490 | auto nb1 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 0 * sizeof(uint16_t))); 491 | auto nb2 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 1 * sizeof(uint16_t))); 492 | auto nb3 = _mm512_loadu_si512(reinterpret_cast(currp + x + src_pitch + 2 * sizeof(uint16_t))); 493 | 494 | // int sum = b, cnt = 1; 495 | auto zero = _mm512_setzero_si512(); 496 | auto sum_lo = _mm512_unpacklo_epi16(b, zero); 497 | auto sum_hi = _mm512_unpackhi_epi16(b, zero); 498 | auto cnt = _mm512_set1_epi16(1); 499 | 500 | check_neighbour_simd_uint16(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 501 | check_neighbour_simd_uint16(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 502 | check_neighbour_simd_uint16(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 503 | check_neighbour_simd_uint16(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 504 | check_neighbour_simd_uint16(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 505 | check_neighbour_simd_uint16(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 506 | check_neighbour_simd_uint16(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 507 | check_neighbour_simd_uint16(nb1, b, 
spatial_threshold_vector, sum_lo, sum_hi, cnt); 508 | check_neighbour_simd_uint16(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 509 | check_neighbour_simd_uint16(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 510 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 511 | 512 | auto cnt_lo = _mm512_unpacklo_epi16(cnt, zero); 513 | auto cnt_hi = _mm512_unpackhi_epi16(cnt, zero); 514 | // Difference from SSE4.1 and C: floating point division 515 | // sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 516 | const auto rounder_half = _mm512_set1_ps(0.5f); 517 | // lower 16 pixels 518 | auto fcnt_lo = _mm512_cvtepi32_ps(cnt_lo); 519 | auto fsum_lo = _mm512_cvtepi32_ps(sum_lo); 520 | // difference from AVX2 or less: rcp14_ps has error less than 2^-14, while rcp_ps error is < 1.5*2^-12 521 | auto mulres_lo = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_lo, _mm512_rcp14_ps(fcnt_lo), rounder_half)); 522 | // upper 16 pixels 523 | auto fcnt_hi = _mm512_cvtepi32_ps(cnt_hi); 524 | auto fsum_hi = _mm512_cvtepi32_ps(sum_hi); 525 | auto mulres_hi = _mm512_cvttps_epi32(_mm512_fmadd_ps(fsum_hi, _mm512_rcp14_ps(fcnt_hi), rounder_half)); 526 | 527 | // move back to 32x16 bits 528 | auto result = _mm512_packus_epi32(mulres_lo, mulres_hi); 529 | 530 | // decide if original pixel is kept 531 | auto finalres = _mm512_mask_mov_epi16(b, mask_either_is_true, result); // true: second param, false: 1st param 532 | 533 | _mm512_storeu_si512(reinterpret_cast<__m512i *>(destp + x + 1 * sizeof(uint16_t)), finalres); 534 | } 535 | 536 | // Spatial Temporal 537 | void fluxST_avx512_uint16(const uint8_t* currp, const int src_pitch, 538 | const uint8_t * prevp, const int prv_pitch, 539 | const uint8_t * nextp, const int nxt_pitch, 540 | uint8_t* destp, const int dst_pitch, 541 | const int width, int height, 542 | int temporal_threshold, 543 | int spatial_threshold, 544 | short *scaletab) 545 | { 546 | 547 | // spatial: because of previous and next line involved, function is 
called 548 | // starting with the 2nd line and with height = (real_height - 2) 549 | const int xcnt = width - 2; // leftmost/rightmost column safety 550 | 551 | __m512i temporal_threshold_vector = _mm512_set1_epi16(temporal_threshold); 552 | __m512i spatial_threshold_vector = _mm512_set1_epi16(spatial_threshold); 553 | 554 | const int wmod32 = xcnt / 32 * 32; 555 | const int rest = xcnt - wmod32; 556 | 557 | for (int y = 0; y < height; y++) 558 | { 559 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 560 | 561 | for (int x = 0; x < wmod32; x += 32) 562 | fluxST_core_avx512_uint16(currp, src_pitch, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 563 | // do rest 564 | if (rest > 0) 565 | fluxST_core_avx512_uint16(currp, src_pitch, prevp, nextp, destp, (xcnt - 32) * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 566 | 567 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 568 | 569 | currp += src_pitch; 570 | prevp += prv_pitch; 571 | nextp += nxt_pitch; 572 | destp += dst_pitch; 573 | } // for y 574 | //_mm512_zeroupper(); 575 | } 576 | 577 | #endif // FLUXSMOOTH_AVX512_ENABLED 578 | -------------------------------------------------------------------------------- /FluxSmooth/FluxSmooth.cpp: -------------------------------------------------------------------------------- 1 | // FluxSmooth 2 | // Avisynth filter for spatio-temporal smoothing of fluctuations 3 | // 4 | // By Ross Thomas 5 | // 6 | // There is no copyright on this code, and there are no conditions 7 | // on its distribution or use. Do with it what you will. 
8 | 9 | #include "FluxSmooth.h" 10 | #include 11 | #include "stdint.h" 12 | #include "emmintrin.h" // SSE2 13 | #include "immintrin.h" // SSSE3 14 | #include "smmintrin.h" // SSE4.1 15 | 16 | /************************************ 17 | // Helpers 18 | ************************************/ 19 | 20 | #ifdef INTEL_INTRINSICS 21 | #if defined(CLANG) || defined(GCC) 22 | __attribute__((__target__("sse4.1"))) 23 | #endif 24 | AVS_FORCEINLINE void check_neighbour_simd(__m128i &neighbour, __m128i ¢er, __m128i &threshold, 25 | __m128i &sum_lo, __m128i &sum_hi, __m128i &cnt) 26 | { 27 | auto n_minus_c = _mm_subs_epu8(neighbour, center); 28 | auto c_minus_n = _mm_subs_epu8(center, neighbour); 29 | auto absdiff = _mm_or_si128(n_minus_c, c_minus_n); 30 | auto abs_is_lessthanoreq_thresh = _mm_cmple_epu8(absdiff, threshold); 31 | // count. increment when true. We simply sub the mask value 00 (0) or FF (-1) 32 | cnt = _mm_sub_epi8(cnt, abs_is_lessthanoreq_thresh); 33 | // increase sum elements by neighbour where true, that is mask is FF 34 | // sum is 16 bits 35 | auto masked_neighbour = _mm_and_si128(abs_is_lessthanoreq_thresh, neighbour); 36 | auto zero = _mm_setzero_si128(); 37 | auto masked_neighbour_lo = _mm_unpacklo_epi8(masked_neighbour, zero); 38 | auto masked_neighbour_hi = _mm_unpackhi_epi8(masked_neighbour, zero); 39 | sum_lo = _mm_add_epi16(sum_lo, masked_neighbour_lo); 40 | sum_hi = _mm_add_epi16(sum_hi, masked_neighbour_hi); 41 | 42 | /* 43 | if (std::abs(neighbour - center) <= threshold) 44 | { 45 | sum += neighbour; 46 | ++cnt; 47 | } 48 | */ 49 | } 50 | 51 | #if defined(CLANG) || defined(GCC) 52 | __attribute__((__target__("sse4.1"))) 53 | #endif 54 | AVS_FORCEINLINE void check_neighbour_simd_uint16(__m128i &neighbour, __m128i ¢er, __m128i &threshold, 55 | __m128i &sum_lo, __m128i &sum_hi, __m128i &cnt, const __m128i &make_signed_word) 56 | { 57 | // threshold is shifted to the "signed" int16 domain 58 | auto n_minus_c = _mm_subs_epu16(neighbour, center); 59 | auto 
c_minus_n = _mm_subs_epu16(center, neighbour); 60 | auto absdiff = _mm_or_si128(n_minus_c, c_minus_n); 61 | // absdiff <= threshold ==> !(absdiff > threshold) 62 | // FIXME make it a bit faster: cmpgt and later: andnot, and count in a reverse way (instead of increase-when-match use decrease-by-non-match) 63 | auto abs_is_lessthanoreq_thresh = _mm_cmple_epi16(_mm_add_epi16(absdiff, make_signed_word), threshold); 64 | // count. increment when true. We simply sub the mask value 0000 (0) or FFFF (-1) 65 | cnt = _mm_sub_epi16(cnt, abs_is_lessthanoreq_thresh); 66 | // increase sum elements by neighbour where true, that is mask is FF 67 | // sum is 16 bits 68 | auto masked_neighbour = _mm_and_si128(abs_is_lessthanoreq_thresh, neighbour); 69 | auto zero = _mm_setzero_si128(); 70 | auto masked_neighbour_lo = _mm_unpacklo_epi16(masked_neighbour, zero); 71 | auto masked_neighbour_hi = _mm_unpackhi_epi16(masked_neighbour, zero); 72 | sum_lo = _mm_add_epi32(sum_lo, masked_neighbour_lo); 73 | sum_hi = _mm_add_epi32(sum_hi, masked_neighbour_hi); 74 | 75 | /* 76 | if (std::abs(neighbour - center) <= threshold) 77 | { 78 | sum += neighbour; 79 | ++cnt; 80 | } 81 | */ 82 | } 83 | 84 | /************************************ 85 | // Temporal only SSE2, 8 bit 86 | ************************************/ 87 | AVS_FORCEINLINE void fluxT_core_sse2(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 88 | __m128i &temporal_threshold_vector, 89 | __m128i &scaletab_lut_lsbs, 90 | __m128i &scaletab_lut_msbs 91 | ) 92 | { 93 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x)); 94 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x)); 95 | auto nbt = _mm_loadu_si128(reinterpret_cast(nextp + x)); 96 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 97 | // int pdiff = pbt - b, ndiff = nbt - b; 98 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 99 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 100 | auto pbt_lessthan_b = 
_mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 101 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 102 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 103 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 104 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 105 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 106 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 107 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 108 | 109 | // int sum = b, cnt = 1; 110 | auto zero = _mm_setzero_si128(); 111 | auto sum_lo = _mm_unpacklo_epi8(b, zero); 112 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 113 | auto cnt = _mm_set1_epi8(1); 114 | 115 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 116 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 117 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 118 | 119 | // factor1 = sum*2 + cnt, sum elements are 16 bits 120 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 121 | auto cnt_hi = _mm_unpackhi_epi8(cnt, zero); 122 | 123 | // Difference from SSE4.1 and C: floating point division 124 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 125 | const auto rounder_half = _mm_set1_ps(0.5f); 126 | // lower 8 pixels 127 | auto fcnt_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_lo, zero)); 128 | auto fcnt_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_lo, zero)); 129 | auto fsum_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_lo, zero)); 130 | auto fsum_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_lo, zero)); 131 | 132 | auto mul_lo_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_lo, _mm_rcp_ps(fcnt_lo_lo)), rounder_half)); 133 | auto mul_lo_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_hi, 
_mm_rcp_ps(fcnt_lo_hi)), rounder_half)); 134 | auto mulres_lo = _mm_packs_epi32(mul_lo_lo, mul_lo_hi); 135 | // upper 8 pixels 136 | auto fcnt_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_hi, zero)); 137 | auto fcnt_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_hi, zero)); 138 | auto fsum_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_hi, zero)); 139 | auto fsum_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_hi, zero)); 140 | 141 | auto mul_hi_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_lo, _mm_rcp_ps(fcnt_hi_lo)), rounder_half)); 142 | auto mul_hi_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_hi, _mm_rcp_ps(fcnt_hi_hi)), rounder_half)); 143 | auto mulres_hi = _mm_packs_epi32(mul_hi_lo, mul_hi_hi); 144 | 145 | // move back to 16x8 bits 146 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 147 | 148 | // decide if original pixel is kept 149 | auto finalres = _MM_BLENDV_EPI8(b, result, mask_either_is_true); // true: second param, false: 1st param 150 | 151 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x), finalres); 152 | } 153 | 154 | 155 | // Temporal only 156 | void fluxT_sse2(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 157 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 158 | { 159 | __m128i scaletab_lut_lsbs; 160 | __m128i scaletab_lut_msbs; 161 | for (int i = 0; i < 16; i++) { 162 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 163 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 164 | } 165 | 166 | const int xcnt = width; 167 | 168 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 169 | 170 | const int wmod16 = xcnt / 16 * 16; 171 | const int rest = xcnt - wmod16; 172 | 173 | for (int y = 0; y < height; y++) 174 | { 175 | for (int x = 0; x < wmod16; x += 16) 176 | fluxT_core_sse2(currp, prevp, nextp, destp, x, 
temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 177 | // do rest 178 | if (rest > 0) 179 | fluxT_core_sse2(currp, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 180 | 181 | currp += src_pitch; 182 | prevp += prv_pitch; 183 | nextp += nxt_pitch; 184 | destp += dst_pitch; 185 | } // for y 186 | } 187 | 188 | /************************************ 189 | // Temporal only SSE4.1, 8 bit 190 | ************************************/ 191 | #if defined(CLANG) || defined(GCC) 192 | __attribute__((__target__("sse4.1"))) 193 | #endif 194 | AVS_FORCEINLINE void fluxT_core_sse41(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 195 | __m128i &temporal_threshold_vector, 196 | __m128i &scaletab_lut_lsbs, 197 | __m128i &scaletab_lut_msbs 198 | ) 199 | { 200 | 201 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x)); 202 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x)); 203 | auto nbt = _mm_loadu_si128(reinterpret_cast(nextp + x)); 204 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 205 | // int pdiff = pbt - b, ndiff = nbt - b; 206 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 207 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 208 | auto pbt_lessthan_b = _mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 209 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 210 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 211 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 212 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 213 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 214 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 215 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 216 | 217 | // int sum = b, cnt = 1; 218 | auto zero = _mm_setzero_si128(); 219 | auto sum_lo = _mm_unpacklo_epi8(b, zero); 220 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 221 | auto cnt = _mm_set1_epi8(1); 222 | 223 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 224 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 225 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 226 | 227 | #if 0 228 | // Experiment with MADD and rounding: a bit slower, the same result 229 | // (sum * scaletab) + 230 | // (rounding * 1) in one step 231 | // >> 15: 2nd step 232 | auto one = _mm_set1_epi16(1); 233 | constexpr int FACTOR_BITS = 15; 234 | auto rounding = _mm_set1_epi16(1 << (FACTOR_BITS - 1)); 235 | 236 | // factor2 = scaletab[cnt] 237 | auto factor2_lsb = _mm_shuffle_epi8(scaletab_lut_lsbs, cnt); 238 | auto factor2_msb = _mm_shuffle_epi8(scaletab_lut_msbs, cnt); 239 | auto factor2_lo = _mm_unpacklo_epi8(factor2_lsb, factor2_msb); 240 | auto factor2_hi = _mm_unpackhi_epi8(factor2_lsb, factor2_msb); 241 | 242 | auto mulres_lo_lo = _mm_srai_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(sum_lo, rounding), _mm_unpacklo_epi16(factor2_lo, one)), FACTOR_BITS); 243 | auto mulres_lo_hi = _mm_srai_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(sum_lo, rounding), _mm_unpackhi_epi16(factor2_lo, one)), FACTOR_BITS); 244 | auto mulres_lo = _mm_packs_epi32(mulres_lo_lo, mulres_lo_hi); 245 | 246 | auto mulres_hi_lo = _mm_srai_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(sum_hi, rounding), _mm_unpacklo_epi16(factor2_hi, one)), FACTOR_BITS); 247 | auto mulres_hi_hi = _mm_srai_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(sum_hi, rounding), _mm_unpackhi_epi16(factor2_hi, one)), FACTOR_BITS); 248 | auto mulres_hi = _mm_packs_epi32(mulres_hi_lo, mulres_hi_hi); 249 | #else 250 | // factor1 = sum*2 + cnt, sum elements are 16 bits 251 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 252 | auto cnt_hi = _mm_unpackhi_epi8(cnt, 
zero); 253 | auto factor1_lo = _mm_add_epi16(_mm_add_epi16(sum_lo, sum_lo), cnt_lo); 254 | auto factor1_hi = _mm_add_epi16(_mm_add_epi16(sum_hi, sum_hi), cnt_hi); 255 | // factor2 = scaletab[cnt] 256 | auto factor2_lsb = _mm_shuffle_epi8(scaletab_lut_lsbs, cnt); 257 | auto factor2_msb = _mm_shuffle_epi8(scaletab_lut_msbs, cnt); 258 | auto factor2_lo = _mm_unpacklo_epi8(factor2_lsb, factor2_msb); 259 | auto factor2_hi = _mm_unpackhi_epi8(factor2_lsb, factor2_msb); 260 | // finally mul and shift 261 | auto mulres_lo = _mm_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 262 | auto mulres_hi = _mm_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 263 | #endif 264 | // move back to 16x8 bits 265 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 266 | // decide if original pixel is kept 267 | auto finalres = _mm_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 268 | 269 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x), finalres); 270 | } 271 | 272 | // Temporal only 273 | #if defined(CLANG) || defined(GCC) 274 | __attribute__((__target__("sse4.1"))) 275 | #endif 276 | void fluxT_sse41(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 277 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 278 | { 279 | __m128i scaletab_lut_lsbs; 280 | __m128i scaletab_lut_msbs; 281 | for (int i = 0; i < 16; i++) { 282 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 283 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 284 | } 285 | 286 | const int xcnt = width; 287 | 288 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 289 | 290 | const int wmod16 = xcnt / 16 * 16; 291 | const int rest = xcnt - wmod16; 292 | 293 | for (int y = 0; y < height; y++) 294 | { 295 | for (int x = 0; 
x < wmod16; x += 16) 296 | fluxT_core_sse41(currp, prevp, nextp, destp, x, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 297 | // do rest 298 | if (rest > 0) 299 | fluxT_core_sse41(currp, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 300 | 301 | currp += src_pitch; 302 | prevp += prv_pitch; 303 | nextp += nxt_pitch; 304 | destp += dst_pitch; 305 | } // for y 306 | } 307 | 308 | /************************************ 309 | // Temporal only SSE4.1, 16 bit 310 | ************************************/ 311 | #if defined(CLANG) || defined(GCC) 312 | __attribute__((__target__("sse4.1"))) 313 | #endif 314 | AVS_FORCEINLINE void fluxT_core_sse41_uint16(const uint8_t * currp, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 315 | __m128i &temporal_threshold_vector // already shifted to "signed" domain 316 | ) 317 | { 318 | const auto make_signed_word = _mm_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 319 | 320 | auto b_orig = _mm_loadu_si128(reinterpret_cast(currp + x)); 321 | auto pbt_orig = _mm_loadu_si128(reinterpret_cast(prevp + x)); 322 | auto nbt_orig = _mm_loadu_si128(reinterpret_cast(nextp + x)); 323 | 324 | auto b = _mm_add_epi16(b_orig, make_signed_word); 325 | auto pbt = _mm_add_epi16(pbt_orig, make_signed_word); 326 | auto nbt = _mm_add_epi16(nbt_orig, make_signed_word); 327 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 328 | // int pdiff = pbt - b, ndiff = nbt - b; 329 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 330 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 331 | auto pbt_lessthan_b = _mm_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 332 | auto nbt_lessthan_b = _mm_cmpgt_epi16(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 333 | auto pbt_greaterthan_b = _mm_cmpgt_epi16(pbt, b); // FF where pbt > b 334 | auto nbt_greaterthan_b = _mm_cmpgt_epi16(nbt, b); // FF where nbt > b 335 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 336 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 337 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 338 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 339 | 340 | // int sum = b, cnt = 1; 341 | auto zero = _mm_setzero_si128(); 342 | auto sum_lo = _mm_unpacklo_epi16(b_orig, zero); 343 | auto sum_hi = _mm_unpackhi_epi16(b_orig, zero); 344 | auto cnt = _mm_set1_epi16(1); 345 | 346 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 347 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 348 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 349 | 350 | auto cnt_lo = _mm_unpacklo_epi16(cnt, zero); 351 | auto cnt_hi = _mm_unpackhi_epi16(cnt, zero); 352 | // Difference from SSE4.1 and C: floating point division 353 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 354 | const auto rounder_half = _mm_set1_ps(0.5f); 355 | // lower 4 pixels 356 | auto fcnt_lo = _mm_cvtepi32_ps(cnt_lo); 357 | auto fsum_lo = _mm_cvtepi32_ps(sum_lo); 358 | auto mulres_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo, _mm_rcp_ps(fcnt_lo)), rounder_half)); 359 | // upper 4 pixels 360 | auto fcnt_hi = _mm_cvtepi32_ps(cnt_hi); 361 | auto fsum_hi = _mm_cvtepi32_ps(sum_hi); 362 | auto mulres_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi, _mm_rcp_ps(fcnt_hi)), rounder_half)); 363 | 364 | // move back to 8x16 bits 365 | auto result = _mm_packus_epi32(mulres_lo, mulres_hi); 366 | 367 | // decide if original pixel is kept 368 | auto finalres = _mm_blendv_epi8(b_orig, result, 
mask_either_is_true); // true: second param, false: 1st param 369 | 370 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x), finalres); 371 | } 372 | 373 | // Temporal only 374 | #if defined(CLANG) || defined(GCC) 375 | __attribute__((__target__("sse4.1"))) 376 | #endif 377 | void fluxT_sse41_uint16(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 378 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 379 | { 380 | const int xcnt = width; 381 | 382 | __m128i temporal_threshold_vector = _mm_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain 383 | 384 | // uint16_t: 8 pixels per cycle 385 | const int wmod8 = xcnt / 8 * 8; 386 | const int rest = xcnt - wmod8; 387 | 388 | for (int y = 0; y < height; y++) 389 | { 390 | for (int x = 0; x < wmod8; x += 8) 391 | fluxT_core_sse41_uint16(currp, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector); 392 | // do rest 393 | if (rest > 0) 394 | fluxT_core_sse41_uint16(currp, prevp, nextp, destp, (xcnt - 8) * sizeof(uint16_t), temporal_threshold_vector); 395 | 396 | currp += src_pitch; 397 | prevp += prv_pitch; 398 | nextp += nxt_pitch; 399 | destp += dst_pitch; 400 | } // for y 401 | } 402 | 403 | /************************************ 404 | // Spatial Temporal SSE2, 8 bit 405 | ************************************/ 406 | AVS_FORCEINLINE void fluxST_core_sse2(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 407 | __m128i &temporal_threshold_vector, 408 | __m128i &spatial_threshold_vector, 409 | __m128i &scaletab_lut_lsbs, 410 | __m128i &scaletab_lut_msbs 411 | ) 412 | { 413 | // +1: center of 3x3 pixels [+0,+1,+2] 414 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x + 1)); 415 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x + 1)); 416 | auto nbt = 
_mm_loadu_si128(reinterpret_cast(nextp + x + 1)); 417 | 418 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 419 | // int pdiff = pbt - b, ndiff = nbt - b; 420 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 421 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 422 | auto pbt_lessthan_b = _mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 423 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 424 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 425 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 426 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 427 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 428 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 429 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 430 | 431 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 432 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 433 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 434 | 435 | auto pb1 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 0)); 436 | auto pb2 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 1)); 437 | auto pb3 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 2)); 438 | 439 | auto b1 = _mm_loadu_si128(reinterpret_cast(currp + x + 0)); 440 | auto b2 = _mm_loadu_si128(reinterpret_cast(currp + x + 2)); 441 | 442 | auto nb1 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 0)); 443 | auto nb2 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 1)); 444 | auto nb3 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 2)); 445 | 446 | // int sum = b, cnt = 1; 447 | auto zero = _mm_setzero_si128(); 448 | auto sum_lo = _mm_unpacklo_epi8(b, 
zero); 449 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 450 | auto cnt = _mm_set1_epi8(1); 451 | 452 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 453 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 454 | check_neighbour_simd(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 455 | check_neighbour_simd(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 456 | check_neighbour_simd(pb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 457 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 458 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 459 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 460 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 461 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 462 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 463 | 464 | // factor1 = sum*2 + cnt, sum elements are 16 bits 465 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 466 | auto cnt_hi = _mm_unpackhi_epi8(cnt, zero); 467 | 468 | // Difference from SSE4.1 and C: floating point division 469 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 470 | const auto rounder_half = _mm_set1_ps(0.5f); 471 | // lower 8 pixels 472 | auto fcnt_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_lo, zero)); 473 | auto fcnt_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_lo, zero)); 474 | auto fsum_lo_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_lo, zero)); 475 | auto fsum_lo_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_lo, zero)); 476 | 477 | auto mul_lo_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_lo, _mm_rcp_ps(fcnt_lo_lo)), rounder_half)); 478 | auto mul_lo_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo_hi, _mm_rcp_ps(fcnt_lo_hi)), rounder_half)); 479 | auto mulres_lo = _mm_packs_epi32(mul_lo_lo, mul_lo_hi); 480 | // upper 8 pixels 481 | auto 
fcnt_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(cnt_hi, zero)); 482 | auto fcnt_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(cnt_hi, zero)); 483 | auto fsum_hi_lo = _mm_cvtepi32_ps(_mm_unpacklo_epi16(sum_hi, zero)); 484 | auto fsum_hi_hi = _mm_cvtepi32_ps(_mm_unpackhi_epi16(sum_hi, zero)); 485 | 486 | auto mul_hi_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_lo, _mm_rcp_ps(fcnt_hi_lo)), rounder_half)); 487 | auto mul_hi_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi_hi, _mm_rcp_ps(fcnt_hi_hi)), rounder_half)); 488 | auto mulres_hi = _mm_packs_epi32(mul_hi_lo, mul_hi_hi); 489 | 490 | // move back to 16x8 bits 491 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 492 | 493 | // decide if original pixel is kept 494 | auto finalres = _MM_BLENDV_EPI8(b, result, mask_either_is_true); // true: second param, false: 1st param 495 | 496 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x + 1), finalres); 497 | } 498 | 499 | // Spatial Temporal 500 | void fluxST_sse2(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 501 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab) 502 | { 503 | __m128i scaletab_lut_lsbs; 504 | __m128i scaletab_lut_msbs; 505 | for (int i = 0; i < 16; i++) { 506 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 507 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 508 | } 509 | 510 | // spatial: because of previous and next line involved, function is called 511 | // starting with the 2nd line and with height = (real_height - 2) 512 | const int xcnt = width - 2; // leftmost/rightmost column safety 513 | 514 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 515 | __m128i spatial_threshold_vector = _mm_set1_epi8(spatial_threshold); 516 | 517 | const int wmod16 = xcnt / 16 * 16; 518 | const int rest = xcnt - wmod16; 519 | 520 | 
for (int y = 0; y < height; y++) 521 | { 522 | destp[0] = currp[0]; // Copy left edge 523 | 524 | for (int x = 0; x < wmod16; x += 16) 525 | fluxST_core_sse2(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 526 | // do rest 527 | if (rest > 0) 528 | fluxST_core_sse2(currp, src_pitch, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 529 | 530 | destp[width - 1] = currp[width - 1]; // Copy right edge 531 | 532 | currp += src_pitch; 533 | prevp += prv_pitch; 534 | nextp += nxt_pitch; 535 | destp += dst_pitch; 536 | } // for y 537 | } 538 | 539 | /************************************ 540 | // Spatial Temporal SSE4.1, 8 bit 541 | ************************************/ 542 | #if defined(CLANG) || defined(GCC) 543 | __attribute__((__target__("sse4.1"))) 544 | #endif 545 | AVS_FORCEINLINE void fluxST_core_sse41(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 546 | __m128i &temporal_threshold_vector, 547 | __m128i &spatial_threshold_vector, 548 | __m128i &scaletab_lut_lsbs, 549 | __m128i &scaletab_lut_msbs 550 | ) 551 | { 552 | // +1: center of 3x3 pixels [+0,+1,+2] 553 | auto b = _mm_loadu_si128(reinterpret_cast(currp + x + 1)); 554 | auto pbt = _mm_loadu_si128(reinterpret_cast(prevp + x + 1)); 555 | auto nbt = _mm_loadu_si128(reinterpret_cast(nextp + x + 1)); 556 | 557 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 558 | // int pdiff = pbt - b, ndiff = nbt - b; 559 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 560 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 561 | auto pbt_lessthan_b = _mm_cmpgt_epu8(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 562 | auto nbt_lessthan_b = _mm_cmpgt_epu8(b, nbt); // FF where b > nbt. 
No lt --> gt with exchanged parameters 563 | auto pbt_greaterthan_b = _mm_cmpgt_epu8(pbt, b); // FF where pbt > b 564 | auto nbt_greaterthan_b = _mm_cmpgt_epu8(nbt, b); // FF where nbt > b 565 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 566 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 567 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 568 | // mask will be used at the final decision. Where FF: keep computed result. 00: keep original pixel (dst=curr) 569 | 570 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 571 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 572 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 573 | 574 | auto pb1 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 0)); 575 | auto pb2 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 1)); 576 | auto pb3 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 2)); 577 | 578 | auto b1 = _mm_loadu_si128(reinterpret_cast(currp + x + 0)); 579 | auto b2 = _mm_loadu_si128(reinterpret_cast(currp + x + 2)); 580 | 581 | auto nb1 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 0)); 582 | auto nb2 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 1)); 583 | auto nb3 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 2)); 584 | 585 | // int sum = b, cnt = 1; 586 | auto zero = _mm_setzero_si128(); 587 | auto sum_lo = _mm_unpacklo_epi8(b, zero); 588 | auto sum_hi = _mm_unpackhi_epi8(b, zero); 589 | auto cnt = _mm_set1_epi8(1); 590 | 591 | check_neighbour_simd(pbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 592 | check_neighbour_simd(nbt, b, temporal_threshold_vector, sum_lo, sum_hi, cnt); 593 | check_neighbour_simd(pb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 594 | check_neighbour_simd(pb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 595 | check_neighbour_simd(pb3, b, 
spatial_threshold_vector, sum_lo, sum_hi, cnt); 596 | check_neighbour_simd(b1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 597 | check_neighbour_simd(b2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 598 | check_neighbour_simd(nb1, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 599 | check_neighbour_simd(nb2, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 600 | check_neighbour_simd(nb3, b, spatial_threshold_vector, sum_lo, sum_hi, cnt); 601 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 602 | 603 | // factor1 = sum*2 + cnt, sum elements are 16 bits 604 | auto cnt_lo = _mm_unpacklo_epi8(cnt, zero); 605 | auto cnt_hi = _mm_unpackhi_epi8(cnt, zero); 606 | auto factor1_lo = _mm_add_epi16(_mm_add_epi16(sum_lo, sum_lo), cnt_lo); 607 | auto factor1_hi = _mm_add_epi16(_mm_add_epi16(sum_hi, sum_hi), cnt_hi); 608 | // factor2 = scaletab[cnt] 609 | auto factor2_lsb = _mm_shuffle_epi8(scaletab_lut_lsbs, cnt); 610 | auto factor2_msb = _mm_shuffle_epi8(scaletab_lut_msbs, cnt); 611 | auto factor2_lo = _mm_unpacklo_epi8(factor2_lsb, factor2_msb); 612 | auto factor2_hi = _mm_unpackhi_epi8(factor2_lsb, factor2_msb); 613 | // finally mul and shift 614 | auto mulres_lo = _mm_mulhi_epi16(factor1_lo, factor2_lo); // upper 16 bit of mul result, no need for >> 16 615 | auto mulres_hi = _mm_mulhi_epi16(factor1_hi, factor2_hi); // upper 16 bit of mul result, no need for >> 16 616 | // move back to 16x8 bits 617 | auto result = _mm_packus_epi16(mulres_lo, mulres_hi); 618 | 619 | // decide if original pixel is kept 620 | auto finalres = _mm_blendv_epi8(b, result, mask_either_is_true); // true: second param, false: 1st param 621 | 622 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x + 1), finalres); 623 | } 624 | 625 | // Spatial Temporal 626 | #if defined(CLANG) || defined(GCC) 627 | __attribute__((__target__("sse4.1"))) 628 | #endif 629 | void fluxST_sse41(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * 
nextp, const int nxt_pitch, 630 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab) 631 | { 632 | __m128i scaletab_lut_lsbs; 633 | __m128i scaletab_lut_msbs; 634 | for (int i = 0; i < 16; i++) { 635 | ((uint8_t*)&scaletab_lut_lsbs)[i] = scaletab[i] & 0xFF; 636 | ((uint8_t*)&scaletab_lut_msbs)[i] = (scaletab[i] >> 8) & 0xFF; 637 | } 638 | 639 | // spatial: because of previous and next line involved, function is called 640 | // starting with the 2nd line and with height = (real_height - 2) 641 | const int xcnt = width - 2; // leftmost/rightmost column safety 642 | 643 | __m128i temporal_threshold_vector = _mm_set1_epi8(temporal_threshold); 644 | __m128i spatial_threshold_vector = _mm_set1_epi8(spatial_threshold); 645 | 646 | const int wmod16 = xcnt / 16 * 16; 647 | const int rest = xcnt - wmod16; 648 | 649 | for (int y = 0; y < height; y++) 650 | { 651 | destp[0] = currp[0]; // Copy left edge 652 | 653 | for (int x = 0; x < wmod16; x += 16) 654 | fluxST_core_sse41(currp, src_pitch, prevp, nextp, destp, x, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 655 | // do rest 656 | if (rest > 0) 657 | fluxST_core_sse41(currp, src_pitch, prevp, nextp, destp, xcnt - 16, temporal_threshold_vector, spatial_threshold_vector, scaletab_lut_lsbs, scaletab_lut_msbs); 658 | 659 | destp[width - 1] = currp[width - 1]; // Copy right edge 660 | 661 | currp += src_pitch; 662 | prevp += prv_pitch; 663 | nextp += nxt_pitch; 664 | destp += dst_pitch; 665 | } // for y 666 | } 667 | 668 | /************************************ 669 | // Spatial Temporal SSE4.1, 16 bit 670 | ************************************/ 671 | #if defined(CLANG) || defined(GCC) 672 | __attribute__((__target__("sse4.1"))) 673 | #endif 674 | AVS_FORCEINLINE void fluxST_core_sse41_uint16(const uint8_t * currp, const int src_pitch, const uint8_t* prevp, const uint8_t *nextp, uint8_t *destp, int x, 
675 | __m128i &temporal_threshold_vector, // already shifted to "signed" domain 676 | __m128i &spatial_threshold_vector // already shifted to "signed" domain 677 | ) 678 | { 679 | const auto make_signed_word = _mm_set1_epi16(0x8000); // int16 support is better than of uint16 (cmp, etc...) 680 | // +1: center of 3x3 pixels [+0,+1,+2] 681 | auto b_orig = _mm_loadu_si128(reinterpret_cast(currp + x + 1 * sizeof(uint16_t))); 682 | auto pbt_orig = _mm_loadu_si128(reinterpret_cast(prevp + x + 1 * sizeof(uint16_t))); 683 | auto nbt_orig = _mm_loadu_si128(reinterpret_cast(nextp + x + 1 * sizeof(uint16_t))); 684 | 685 | auto b = _mm_add_epi16(b_orig, make_signed_word); 686 | auto pbt = _mm_add_epi16(pbt_orig, make_signed_word); 687 | auto nbt = _mm_add_epi16(nbt_orig, make_signed_word); 688 | // int b = *currp, pbt = *prevp++, nbt = *nextp++; 689 | // int pdiff = pbt - b, ndiff = nbt - b; 690 | // if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 691 | // --> if ((pbt < b && nbt < b) || (pbt > b && nbt > b)) 692 | auto pbt_lessthan_b = _mm_cmpgt_epi16(b, pbt); // FF where b > pbt. No lt --> gt with exchanged parameters 693 | auto nbt_lessthan_b = _mm_cmpgt_epi16(b, nbt); // FF where b > nbt. No lt --> gt with exchanged parameters 694 | auto pbt_greaterthan_b = _mm_cmpgt_epi16(pbt, b); // FF where pbt > b 695 | auto nbt_greaterthan_b = _mm_cmpgt_epi16(nbt, b); // FF where nbt > b 696 | auto both_less = _mm_and_si128(pbt_lessthan_b, nbt_lessthan_b); 697 | auto both_greater = _mm_and_si128(pbt_greaterthan_b, nbt_greaterthan_b); 698 | auto mask_either_is_true = _mm_or_si128(both_less, both_greater); 699 | // mask will be used at the final decision. Where FF: keep computed result. 
00: keep original pixel (dst=curr) 700 | 701 | // int pb1 = currp[-src_pitch - 1], pb2 = currp[-src_pitch], pb3 = currp[-src_pitch + 1]; 702 | // int b1 = currp[-1], /*b = currp[0], */b2 = currp[1]; 703 | // int nb1 = currp[src_pitch - 1], nb2 = currp[src_pitch], nb3 = currp[src_pitch + 1]; 704 | 705 | auto pb1 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 0 * sizeof(uint16_t))); 706 | auto pb2 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 1 * sizeof(uint16_t))); 707 | auto pb3 = _mm_loadu_si128(reinterpret_cast(currp + x - src_pitch + 2 * sizeof(uint16_t))); 708 | 709 | auto b1 = _mm_loadu_si128(reinterpret_cast(currp + x + 0 * sizeof(uint16_t))); 710 | auto b2 = _mm_loadu_si128(reinterpret_cast(currp + x + 2 * sizeof(uint16_t))); 711 | 712 | auto nb1 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 0 * sizeof(uint16_t))); 713 | auto nb2 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 1 * sizeof(uint16_t))); 714 | auto nb3 = _mm_loadu_si128(reinterpret_cast(currp + x + src_pitch + 2 * sizeof(uint16_t))); 715 | 716 | // int sum = b, cnt = 1; 717 | auto zero = _mm_setzero_si128(); 718 | auto sum_lo = _mm_unpacklo_epi16(b_orig, zero); 719 | auto sum_hi = _mm_unpackhi_epi16(b_orig, zero); 720 | auto cnt = _mm_set1_epi16(1); 721 | 722 | check_neighbour_simd_uint16(pbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 723 | check_neighbour_simd_uint16(nbt_orig, b_orig, temporal_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 724 | check_neighbour_simd_uint16(pb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 725 | check_neighbour_simd_uint16(pb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 726 | check_neighbour_simd_uint16(pb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 727 | check_neighbour_simd_uint16(b1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 728 | 
check_neighbour_simd_uint16(b2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 729 | check_neighbour_simd_uint16(nb1, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 730 | check_neighbour_simd_uint16(nb2, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 731 | check_neighbour_simd_uint16(nb3, b_orig, spatial_threshold_vector, sum_lo, sum_hi, cnt, make_signed_word); 732 | // (BYTE)(((sum * 2 + cnt) * scaletab[cnt]) >> 16); 733 | 734 | auto cnt_lo = _mm_unpacklo_epi16(cnt, zero); 735 | auto cnt_hi = _mm_unpackhi_epi16(cnt, zero); 736 | // Difference from SSE4.1 and C: floating point division 737 | // SSE2: sum / count -> (int)((float)sum * 1.0f/(float)count + 0.5f) 738 | const auto rounder_half = _mm_set1_ps(0.5f); 739 | // lower 4 pixels 740 | auto fcnt_lo = _mm_cvtepi32_ps(cnt_lo); 741 | auto fsum_lo = _mm_cvtepi32_ps(sum_lo); 742 | auto mulres_lo = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_lo, _mm_rcp_ps(fcnt_lo)), rounder_half)); 743 | // upper 4 pixels 744 | auto fcnt_hi = _mm_cvtepi32_ps(cnt_hi); 745 | auto fsum_hi = _mm_cvtepi32_ps(sum_hi); 746 | auto mulres_hi = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(fsum_hi, _mm_rcp_ps(fcnt_hi)), rounder_half)); 747 | 748 | // move back to 8x16 bits 749 | auto result = _mm_packus_epi32(mulres_lo, mulres_hi); 750 | 751 | // decide if original pixel is kept 752 | auto finalres = _mm_blendv_epi8(b_orig, result, mask_either_is_true); // true: second param, false: 1st param 753 | 754 | _mm_storeu_si128(reinterpret_cast<__m128i *>(destp + x + 1 * sizeof(uint16_t)), finalres); 755 | } 756 | 757 | // Spatial Temporal 758 | #if defined(CLANG) || defined(GCC) 759 | __attribute__((__target__("sse4.1"))) 760 | #endif 761 | void fluxST_sse41_uint16(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 762 | uint8_t* destp, const int dst_pitch, const int width, const int height, int 
temporal_threshold, int spatial_threshold, short *scaletab) 763 | { 764 | // spatial: because of previous and next line involved, function is called 765 | // starting with the 2nd line and with height = (real_height - 2) 766 | const int xcnt = width - 2; // leftmost/rightmost column safety 767 | 768 | __m128i temporal_threshold_vector = _mm_set1_epi16(temporal_threshold - 0x8000); // move to signed int16 domain 769 | __m128i spatial_threshold_vector = _mm_set1_epi16(spatial_threshold - 0x8000); // move to signed int16 domain); 770 | 771 | const int wmod8 = xcnt / 8 * 8; 772 | const int rest = xcnt - wmod8; 773 | 774 | for (int y = 0; y < height; y++) 775 | { 776 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 777 | 778 | for (int x = 0; x < wmod8; x += 8) 779 | fluxST_core_sse41_uint16(currp, src_pitch, prevp, nextp, destp, x * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 780 | // do rest 781 | if (rest > 0) 782 | fluxST_core_sse41_uint16(currp, src_pitch, prevp, nextp, destp, (xcnt - 8) * sizeof(uint16_t), temporal_threshold_vector, spatial_threshold_vector); 783 | 784 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 785 | 786 | currp += src_pitch; 787 | prevp += prv_pitch; 788 | nextp += nxt_pitch; 789 | destp += dst_pitch; 790 | } // for y 791 | } 792 | #endif // INTEL_INTRINSICS 793 | 794 | /************************************ 795 | // Helper 796 | ************************************/ 797 | 798 | static AVS_FORCEINLINE void check_neighbour_C(int neighbour, int center, int threshold, int& sum, int& cnt) 799 | { 800 | if (std::abs(neighbour - center) <= threshold) 801 | { 802 | sum += neighbour; 803 | ++cnt; 804 | } 805 | } 806 | 807 | /************************************ 808 | // Spatial Temporal C, 8-16 bit 809 | ************************************/ 810 | 811 | template 812 | void fluxST_C(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, 
const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 813 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab) 814 | { 815 | // spatial: because of previous and next line involved, function is called 816 | // starting with the 2nd line and with height = (real_height - 2) 817 | for (int y = 0; y < height; y++) 818 | { 819 | // leftmost column safety 820 | reinterpret_cast(destp)[0] = reinterpret_cast(currp)[0]; // Copy left edge 821 | 822 | for (int x = 1; x < width-1; x++) 823 | { 824 | 825 | int b = reinterpret_cast(currp)[x]; 826 | int pbt = reinterpret_cast(prevp)[x]; 827 | int nbt = reinterpret_cast(nextp)[x]; 828 | int pdiff = pbt - b, ndiff = nbt - b; 829 | if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 830 | { 831 | const pixel_t* currp0 = reinterpret_cast(currp); 832 | const int src_pitch0 = src_pitch / sizeof(pixel_t); 833 | int pb1 = currp0[x - src_pitch0 - 1]; 834 | int pb2 = currp0[x - src_pitch0]; 835 | int pb3 = currp0[x - src_pitch0 + 1]; 836 | int b1 = currp0[x - 1]; 837 | /*b = currp[0]; */ 838 | int b2 = currp0[x + 1]; 839 | int nb1 = currp0[x + src_pitch0 - 1]; 840 | int nb2 = currp0[x + src_pitch0]; 841 | int nb3 = currp0[x + src_pitch0 + 1]; 842 | 843 | int sum = b, cnt = 1; 844 | check_neighbour_C(pbt, b, temporal_threshold, sum, cnt); 845 | check_neighbour_C(nbt, b, temporal_threshold, sum, cnt); 846 | 847 | check_neighbour_C(pb1, b, spatial_threshold, sum, cnt); 848 | check_neighbour_C(pb2, b, spatial_threshold, sum, cnt); 849 | check_neighbour_C(pb3, b, spatial_threshold, sum, cnt); 850 | 851 | check_neighbour_C(b1, b, spatial_threshold, sum, cnt); 852 | check_neighbour_C(b2, b, spatial_threshold, sum, cnt); 853 | 854 | check_neighbour_C(nb1, b, spatial_threshold, sum, cnt); 855 | check_neighbour_C(nb2, b, spatial_threshold, sum, cnt); 856 | check_neighbour_C(nb3, b, spatial_threshold, sum, cnt); 857 | 858 | using safe_int_t = typename 
std::conditional::type; // 16 bit pixels: int32 overflow 859 | 860 | reinterpret_cast(destp)[x] = (pixel_t)(((safe_int_t)(sum * 2 + cnt) * scaletab[cnt]) >> 16); 861 | } 862 | else 863 | { 864 | reinterpret_cast(destp)[x] = b; 865 | } 866 | } // for x 867 | 868 | // rightmost column safety 869 | reinterpret_cast(destp)[width - 1] = reinterpret_cast(currp)[width - 1]; // Copy right edge 870 | 871 | currp += src_pitch; 872 | prevp += prv_pitch; 873 | nextp += nxt_pitch; 874 | destp += dst_pitch; 875 | 876 | } // for y 877 | 878 | } 879 | 880 | // instantiate 881 | template void fluxST_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 882 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 883 | template void fluxST_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 884 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, int spatial_threshold, short *scaletab); 885 | 886 | /************************************ 887 | // Termporal only C, 8-16 bit 888 | ************************************/ 889 | 890 | template 891 | void fluxT_C(const uint8_t* currp, const int src_pitch, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 892 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab) 893 | { 894 | for (int y = 0; y < height; y++) 895 | { 896 | for (int x = 0; x < width; x++) 897 | { 898 | int b = reinterpret_cast(currp)[x]; 899 | int pbt = reinterpret_cast(prevp)[x]; 900 | int nbt = reinterpret_cast(nextp)[x]; 901 | int pdiff = pbt - b, ndiff = nbt - b; 902 | if ((pdiff < 0 && ndiff < 0) || (pdiff > 0 && ndiff > 0)) 903 | { 904 | int sum = b, cnt = 1; 905 | 906 | check_neighbour_C(pbt, b, temporal_threshold, sum, 
cnt); 907 | check_neighbour_C(nbt, b, temporal_threshold, sum, cnt); 908 | using safe_int_t = typename std::conditional::type; // 16 bit pixels: int32 overflow 909 | // cnt: 1,2,3 910 | reinterpret_cast(destp)[x] = (pixel_t)(((safe_int_t)(sum * 2 + cnt) * scaletab[cnt]) >> 16); 911 | } 912 | else 913 | { 914 | reinterpret_cast(destp)[x] = (pixel_t)b; 915 | } 916 | } // for x 917 | 918 | currp += src_pitch; 919 | prevp += prv_pitch; 920 | nextp += nxt_pitch; 921 | destp += dst_pitch; 922 | 923 | } // for y 924 | } 925 | 926 | // instantiate 927 | template void fluxT_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 928 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 929 | template void fluxT_C(const uint8_t*, const int, const uint8_t * prevp, const int prv_pitch, const uint8_t * nextp, const int nxt_pitch, 930 | uint8_t* destp, const int dst_pitch, const int width, const int height, int temporal_threshold, short *scaletab); 931 | 932 | --------------------------------------------------------------------------------